import argparse
import re

# third-party dependencies: beautifulsoup4 (imported as bs4), httpx and user_agent;
# lxml must also be installed because BeautifulSoup is used with the 'lxml' parser below
import bs4
import httpx
import user_agent


def main():
    base_url = 'https://spys.one/en/http-proxy-list/'
    # HTTP client with browser-like headers and a generated User-Agent,
    # so the request looks like a normal page visit
    client = httpx.Client(
        headers={
            'Host': 'spys.one',
            'User-Agent': user_agent.generate_user_agent(),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5'
        },
        timeout=30
    )
    protocol, number, ssl, anonymity, date = return_args()
    result = scrape(client, base_url, protocol, number, ssl, anonymity, date)
    for proxy in result:
        print(proxy)


def return_args():
    parser = argparse.ArgumentParser(description='Proxy-scraper for spys.one')
    parser.add_argument('-p', '--protocol', choices=['all', 'http', 'socks'], default='all', help='select protocol (default is all)')
    parser.add_argument('-n', '--number', choices=['30', '50', '100', '200', '300', '500'], default='30', help='select number of proxies to show (default is 30)')
    parser.add_argument('-s', '--ssl', choices=['all', '1', '0'], default='all', help='turn ssl on/off (default is all)')
    parser.add_argument('-a', '--anonymity', choices=['all', 'a+h', 'noa', 'anm', 'hia'], default='all', help='select level of anonymity (default is all)')
    parser.add_argument('-d', '--date', action='store_true', help='show when each proxy was last checked (default is off)')
    args = parser.parse_args()
    # map the CLI choices onto the index values the spys.one filter form expects
    protocol_form_values = {'all': 0, 'http': 1, 'socks': 2}
    number_form_values = {'30': 0, '50': 1, '100': 2, '200': 3, '300': 4, '500': 5}
    ssl_form_values = {'all': 0, '1': 1, '0': 2}
    anonymity_form_values = {'all': 0, 'a+h': 1, 'noa': 2, 'anm': 3, 'hia': 4}
    return (protocol_form_values[args.protocol], number_form_values[args.number],
            ssl_form_values[args.ssl], anonymity_form_values[args.anonymity], args.date)
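
# Example of the mapping above (derived directly from the dicts; illustrative only):
#   running with "-p http -n 100 -s 1 -a hia" makes return_args() return (1, 2, 1, 4, False)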


def scrape(client, base_url, protocol, number, ssl, anonymity, date=False):
    # POST without form data first, as if visiting the page for the first time,
    # so the filter form (including its hidden fields) can be scraped
    response = client.post(base_url)
    soup = bs4.BeautifulSoup(response.content, 'lxml')
    form_data = return_form_data(soup, protocol, number, ssl, anonymity)
    # re-submit the form with the requested filters applied
    response = client.post(base_url, data=form_data).text
    yield from parse_proxy_results(response, date)
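
# scrape() can also be used on its own (illustrative sketch, mirroring what main() does):
#   client = httpx.Client(headers={'User-Agent': user_agent.generate_user_agent()}, timeout=30)
#   for proxy in scrape(client, 'https://spys.one/en/http-proxy-list/', 0, 0, 0, 0):
#       print(proxy)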


def return_form_data(soup, protocol=0, number=0, ssl=0, anonymity=0):
    form_data = {}
    form = soup.select_one('form')
    # carry over every existing form field (including any hidden session fields)
    for input_tag in form.find_all('input'):
        form_data[input_tag['name']] = input_tag.get('value', '')
    for select_tag in form.find_all('select'):
        form_data[select_tag['name']] = select_tag.option['value']
    # then override the filter fields with the requested values
    form_data['xpp'] = number      # proxies per page
    form_data['xf1'] = anonymity   # anonymity level
    form_data['xf2'] = ssl         # ssl filter
    form_data['xf5'] = protocol    # protocol filter
    return form_data
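
# Illustrative result (the hidden field names/values come from the live page and
# will differ per session; only the four overridden filter keys are fixed):
#   return_form_data(soup, protocol=1, number=2, ssl=1, anonymity=4)
#   -> {'<hidden fields from the page>': '...', 'xpp': 2, 'xf1': 4, 'xf2': 1, 'xf5': 1}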


def parse_proxy_results(response, date):
    SCRIPT_PATTERN = r'javascript">(.*?);<'
    PROXY_PATTERN = r'spy14>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}).*?"\+(.*?)\)<'
    DATE_TIME_PATTERN = r'spy14>(\d{2}-\w{3}-\d{4})</font> (\d{2}:\d{2}) <font class=spy5>([(]\d{1,2} \w{4,5} \w{3}[)])'
    # build the cypher table from the page's inline script: each entry is either
    # a plain number or a number XOR'd with an earlier entry
    cypher = {}
    script = re.search(SCRIPT_PATTERN, response)
    key_value_list = script.group(1).split(';')
    for key_value in key_value_list:
        key, value = key_value.split('=')
        if value.isdigit():
            cypher[key] = int(value)
        else:
            part1, part2 = value.split('^')
            cypher[key] = int(part1) ^ cypher[part2]
    proxy_matches = re.findall(PROXY_PATTERN, response)
    date_time_matches = re.findall(DATE_TIME_PATTERN, response)
    for proxy_data, date_time_data in zip(proxy_matches, date_time_matches):
        proxy, port_script = proxy_data
        last_check_date, last_check_time, since = date_time_data
        # decode the port: each "(x^y)" group XORs two cypher entries
        # and yields one digit of the port number
        port_list = []
        for crypted_digit in port_script.split('+'):
            xor1, xor2 = crypted_digit[1:-1].split('^')   # strip the parentheses
            port_list.append(str(cypher[xor1] ^ cypher[xor2]))
        port = ''.join(port_list)
        if date:
            yield f'{proxy}:{port} {last_check_date} {last_check_time} {since}'
        else:
            yield f'{proxy}:{port}'
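
# Illustrative sketch of the obfuscation this parser undoes (variable names and
# values are invented, not copied from a real spys.one response):
#   inline script:    <script type="text/javascript">a1b2=3;c3d4=5^a1b2;</script>
#     -> cypher == {'a1b2': 3, 'c3d4': 5 ^ 3}
#   port expression:  ..."+(c3d4^a1b2)+(a1b2^c3d4))</script>
#     -> digits 6^3 and 3^6, concatenated into the real port number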


if __name__ == '__main__':
    main()
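
# Usage sketch (filename spys_scraper.py is assumed here):
#   python spys_scraper.py                      -> first 30 proxies of any type, printed as "ip:port"
#   python spys_scraper.py -p http -n 100 -s 1  -> 100 HTTP proxies with the SSL filter set to 1
#   python spys_scraper.py -a hia -d            -> only the 'hia' anonymity class, with last-check info,
#                                                  e.g. "1.2.3.4:8080 01-Jan-2024 12:34 (3 mins ago)"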