summaryrefslogtreecommitdiff
path: root/spysone_proxy_scraper.py
blob: 0fcf8441b31b437b1f2bf1e6a61f65a86de1fea1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import argparse
import re

# dependencies
import bs4
import httpx
import user_agent

def main():
    """Entry point: parse CLI options, scrape spys.one and print each proxy.

    Builds an HTTP client with browser-like headers (spys.one blocks
    obviously non-browser requests), runs the scrape, and prints one
    proxy per line.
    """
    base_url = 'https://spys.one/en/http-proxy-list/'
    protocol, number, ssl, anonymity, date = return_args()
    # Context manager ensures the connection pool is closed even if the
    # scrape raises; the original leaked the client.
    with httpx.Client(
        headers={
            'Host': 'spys.one',
            'User-Agent': user_agent.generate_user_agent(),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5'
        },
        timeout=30
    ) as client:
        for proxy_line in scrape(client, base_url, protocol, number, ssl, anonymity, date):
            print(proxy_line)

def return_args(argv=None):
    """Parse command-line options and map them to spys.one form indices.

    Parameters:
        argv: optional list of argument strings to parse; when None,
            argparse falls back to sys.argv[1:] (the original behavior).
            Passing an explicit list makes the function testable.

    Returns:
        tuple (protocol, number, ssl, anonymity, date): the first four are
        integer indices matching the positions of the corresponding
        <select> options on the spys.one form; date is a bool flag.
    """
    parser = argparse.ArgumentParser(description='Proxy-scraper for spys.one')

    parser.add_argument('-p', '--protocol', choices=['all', 'http', 'socks'], default='all', help='select protocol (default is all)')
    parser.add_argument('-n', '--number', choices=['30', '50', '100', '200', '300', '500'], default='30', help='select number of proxys to show (default is 30)')
    parser.add_argument('-s', '--ssl', choices=['all', '1', '0'], default='all', help='turn ssl on/off (default is all)')
    parser.add_argument('-a', '--anonymity', choices=['all', 'a+h', 'noa', 'anm', 'hia'], default='all', help='select level of anonymity (default is all)')
    parser.add_argument('-d', '--date', action='store_true', help='shows when proxy was last checked (default is off)')

    args = parser.parse_args(argv)

    # Each choice string maps to the 0-based position of the matching
    # option in the site's <select> widgets.
    protocol_form_values = {'all': 0, 'http': 1, 'socks': 2}
    number_form_values = {'30': 0, '50': 1, '100': 2, '200': 3, '300': 4, '500': 5}
    ssl_form_values = {'all': 0, '1': 1, '0': 2}
    anonymity_form_values = {'all': 0, 'a+h': 1, 'noa': 2, 'anm': 3, 'hia': 4}

    return protocol_form_values[args.protocol], number_form_values[args.number], ssl_form_values[args.ssl], anonymity_form_values[args.anonymity], args.date

def scrape(client, base_url, protocol, number, ssl, anonymity, date=False):
    """Fetch the proxy list from spys.one and yield formatted proxy lines.

    The first request is a POST with no body: it simulates an initial
    visit so the server returns the search form (including its hidden
    session fields). The second POST submits that form with the caller's
    filter values, and the resulting page is parsed lazily.
    """
    first_page = bs4.BeautifulSoup(client.post(base_url).content, 'lxml')
    payload = return_form_data(first_page, protocol, number, ssl, anonymity)
    results_html = client.post(base_url, data=payload).text
    yield from parse_proxy_results(results_html, date)

def return_form_data(soup, protocol=0, number=0, ssl=0, anonymity=0):
    """Collect the search form's fields and override the filter values.

    Harvests every <input> and <select> from the first <form> on the page
    (this preserves hidden session fields the server expects back), then
    forces the four filter values onto the spys.one-specific field names.

    Parameters:
        soup: parsed BeautifulSoup document of the proxy-list page.
        protocol, number, ssl, anonymity: integer option indices for the
            form's select widgets (see return_args).

    Returns:
        dict mapping form field name -> value, ready to POST back.
    """
    form_data = {}
    form = soup.select_one('form')
    for input_tag in form.find_all('input'):
        # Nameless inputs (e.g. bare submit buttons) are never submitted
        # by browsers; skip them instead of raising KeyError.
        name = input_tag.get('name')
        if name is not None:
            form_data[name] = input_tag.get('value', '')
    for select_tag in form.find_all('select'):
        # Default each select to its first <option> value.
        form_data[select_tag['name']] = select_tag.option['value']
    # spys.one filter fields: xpp = results per page, xf1 = anonymity,
    # xf2 = SSL, xf5 = protocol.
    form_data['xpp'] = number
    form_data['xf1'] = anonymity
    form_data['xf2'] = ssl
    form_data['xf5'] = protocol
    return form_data

def parse_proxy_results(response, date):
    """Extract proxies (and optionally their check timestamps) from HTML.

    spys.one obfuscates port numbers: an inline <script> defines integer
    variables (some XOR-ed against earlier ones), and each port digit is
    rendered by document.write as a ``(var1^var2)`` term. This rebuilds
    the variable table and decodes every port.

    Parameters:
        response: raw HTML text of the results page.
        date: when True, append last-check date, time and relative age
            to each yielded line.

    Yields:
        str: 'ip:port', optionally followed by ' date time (age)'.
    """
    SCRIPT_PATTERN = r'javascript">(.*?);<'
    PROXY_PATTERN = r'spy14>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}).*?"\+(.*?)\)<'
    DATE_TIME_PATTERN = r'spy14>(\d{2}-\w{3}-\d{4})</font> (\d{2}:\d{2}) <font class=spy5>([(]\d{1,2} \w{4,5} \w{3}[)])'

    # Build the variable table used to decode port digits. Assignments are
    # either plain integers ('a=1') or an XOR against an already-defined
    # variable ('c=3^a'), so order of evaluation matters.
    cypher = {}
    script = re.search(SCRIPT_PATTERN, response)
    for assignment in script.group(1).split(';'):
        key, value = assignment.split('=')
        if value.isdigit():
            cypher[key] = int(value)
        else:
            operand, prior_key = value.split('^')
            # cypher values are always ints, so no extra casts are needed.
            cypher[key] = int(operand) ^ cypher[prior_key]

    proxy_matches = re.findall(PROXY_PATTERN, response)
    date_time_matches = re.findall(DATE_TIME_PATTERN, response)
    for (proxy, port_script), (last_date, last_time, since) in zip(proxy_matches, date_time_matches):
        # Each '+'-separated term looks like '(a^b)': strip the parens and
        # XOR the two referenced cypher values to recover one port digit.
        port = ''.join(
            str(cypher[left] ^ cypher[right])
            for left, right in (term[1:-1].split('^') for term in port_script.split('+'))
        )
        if date:
            yield f'{proxy}:{port} {last_date} {last_time} {since}'
        else:
            yield f'{proxy}:{port}'

# Run the scraper only when executed as a script, so the module can be
# imported without triggering network requests.
if __name__ == '__main__':
    main()