"""Proxy scraper for spys.one.

Posts the site's filter form and prints the resulting proxies as
``ip:port`` lines, optionally followed by the time of the last check.
"""
# Provenance: recovered from the patch "initial commit"
# (17c3dc56bcfce95d11a648820c4aac2701258a61, Wed, 9 Jul 2025 16:11:17 +0200),
# which creates spysone_proxy_scraper.py.

import argparse
import re

# Third-party dependencies (bs4, httpx, user_agent) are imported inside the
# functions that use them, so the pure parsing helpers below can be imported
# and exercised without the network stack being installed.

# First inline script of the page: ';'-separated assignments of the port cipher.
_SCRIPT_RE = re.compile(r'javascript">(.*?);<')
# IP address followed by the '+'-joined XOR expressions that encode the port.
_PROXY_RE = re.compile(r'spy14>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}).*?"\+(.*?)\)<')
# Date, time and "(N unit ago)" text of the last check.
_DATE_TIME_RE = re.compile(
    r'spy14>(\d{2}-\w{3}-\d{4}) (\d{2}:\d{2}) ([(]\d{1,2} \w{4,5} \w{3}[)])'
)


def main():
    """Parse the command line, scrape spys.one and print one proxy per line."""
    import httpx
    import user_agent

    # Validate the command line before any network object is created.
    protocol, number, ssl, anonymity, date = return_args()

    base_url = 'https://spys.one/en/http-proxy-list/'
    headers = {
        'Host': 'spys.one',
        'User-Agent': user_agent.generate_user_agent(),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
    }
    # The context manager closes the connection pool even if scraping fails.
    # scrape() is lazy, so it has to be consumed inside the ``with`` block.
    with httpx.Client(headers=headers, timeout=30) as client:
        for line in scrape(client, base_url, protocol, number, ssl, anonymity, date):
            print(line)


def return_args():
    """Parse command-line options and translate them to form values.

    Returns:
        A tuple ``(protocol, number, ssl, anonymity, date)``. The first four
        are the integer codes looked up for the chosen option; ``date`` is
        the boolean ``--date`` flag.
    """
    parser = argparse.ArgumentParser(description='Proxy-scraper for spys.one')

    parser.add_argument('-p', '--protocol', choices=['all', 'http', 'socks'], default='all', help='select protocol (default is all)')
    parser.add_argument('-n', '--number', choices=['30', '50', '100', '200', '300', '500'], default='30', help='select number of proxys to show (default is 30)')
    parser.add_argument('-s', '--ssl', choices=['all', '1', '0'], default='all', help='turn ssl on/off (default is all)')
    parser.add_argument('-a', '--anonymity', choices=['all', 'a+h', 'noa', 'anm', 'hia'], default='all', help='select level of anonymity (default is all)')
    parser.add_argument('-d', '--date', action='store_true', help='shows when proxy was last checked (default is off)')

    args = parser.parse_args()

    # Each CLI choice maps to the integer submitted in the filter form.
    protocol_form_values = {'all': 0, 'http': 1, 'socks': 2}
    number_form_values = {'30': 0, '50': 1, '100': 2, '200': 3, '300': 4, '500': 5}
    ssl_form_values = {'all': 0, '1': 1, '0': 2}
    anonymity_form_values = {'all': 0, 'a+h': 1, 'noa': 2, 'anm': 3, 'hia': 4}

    return (
        protocol_form_values[args.protocol],
        number_form_values[args.number],
        ssl_form_values[args.ssl],
        anonymity_form_values[args.anonymity],
        args.date,
    )


def scrape(client, base_url, protocol, number, ssl, anonymity, date=False):
    """Yield proxy lines from ``base_url`` for the given filter codes.

    Args:
        client: HTTP client exposing ``post(url, data=...)`` (an
            ``httpx.Client`` in ``main``).
        base_url: URL of the proxy list page.
        protocol, number, ssl, anonymity: Integer form codes as returned by
            ``return_args``.
        date: When true, each line also carries the last-check timestamp.

    Raises:
        httpx.HTTPStatusError: If either request returns an error status.
        ValueError: If the page lacks the expected form or cipher script.
    """
    import bs4

    # post instead of get to simulate first visit without form data
    first_visit = client.post(base_url)
    first_visit.raise_for_status()
    soup = bs4.BeautifulSoup(first_visit.content, 'lxml')

    form_data = return_form_data(soup, protocol, number, ssl, anonymity)
    filtered = client.post(base_url, data=form_data)
    # Do not try to parse an error page as if it were the proxy table.
    filtered.raise_for_status()
    yield from parse_proxy_results(filtered.text, date)


def return_form_data(soup, protocol=0, number=0, ssl=0, anonymity=0):
    """Collect the page's form fields and override the filter fields.

    Every named ``<input>`` contributes its ``value`` (empty string when
    absent) and every named ``<select>`` contributes the value of its first
    ``<option>``. The fields ``xpp``, ``xf1``, ``xf2`` and ``xf5`` are then
    set to ``number``, ``anonymity``, ``ssl`` and ``protocol`` respectively.

    Raises:
        ValueError: If the document contains no ``<form>`` element.
    """
    form = soup.select_one('form')
    if form is None:
        raise ValueError('no <form> element found in the response')

    form_data = {}
    for input_tag in form.find_all('input'):
        name = input_tag.get('name')
        if name:  # nameless inputs are never submitted by a browser
            form_data[name] = input_tag.get('value', '')
    for select_tag in form.find_all('select'):
        name = select_tag.get('name')
        first_option = select_tag.option
        if name and first_option is not None:
            form_data[name] = first_option.get('value', '')

    form_data['xpp'] = number
    form_data['xf1'] = anonymity
    form_data['xf2'] = ssl
    form_data['xf5'] = protocol
    return form_data


def _build_cypher(response):
    """Return the ``{name: int}`` table defined by the page's first script."""
    script = _SCRIPT_RE.search(response)
    if script is None:
        raise ValueError('port cipher script not found in the response')

    cypher = {}
    for key_value in script.group(1).split(';'):
        if not key_value.strip():
            continue  # tolerate empty statements such as ';;'
        key, value = key_value.split('=')
        if value.isdigit():
            cypher[key] = int(value)
        else:
            # "<literal>^<name>": XOR a number with an earlier entry.
            literal, name = value.split('^')
            cypher[key] = int(literal) ^ cypher[name]
    return cypher


def _decode_port(port_script, cypher):
    """Decode a '+'-joined list of ``(x^y)`` terms into the port string."""
    pieces = []
    for term in port_script.split('+'):
        left, right = term[1:-1].split('^')  # drop the surrounding parentheses
        pieces.append(str(cypher[left] ^ cypher[right]))
    return ''.join(pieces)


def parse_proxy_results(response, date):
    """Yield ``ip:port`` strings parsed from the HTML text ``response``.

    Args:
        response: HTML of the proxy list page as text.
        date: When true, append ``' <date> <time> <since>'`` to each line.

    Raises:
        ValueError: If the cipher script cannot be located.
        KeyError: If a port expression references an unknown cipher name.
    """
    cypher = _build_cypher(response)
    proxy_matches = _PROXY_RE.findall(response)

    if not date:
        # Dates are not needed, so a row whose timestamp does not match the
        # date pattern must not cause proxies to be dropped.
        for proxy, port_script in proxy_matches:
            yield proxy + ':' + _decode_port(port_script, cypher)
        return

    # NOTE(review): proxies and timestamps are paired by position; a row with
    # an unparsable timestamp would shift the pairing and shorten the output.
    date_time_matches = _DATE_TIME_RE.findall(response)
    for (proxy, port_script), date_time_data in zip(proxy_matches, date_time_matches):
        last_check_date, last_check_time, since = date_time_data
        port = _decode_port(port_script, cypher)
        yield proxy + ':' + port + ' ' + last_check_date + ' ' + last_check_time + ' ' + since


if __name__ == '__main__':
    main()