diff options
Diffstat (limited to 'spysone_proxy_scraper.py')
-rw-r--r-- | spysone_proxy_scraper.py | 103 |
1 files changed, 103 insertions, 0 deletions
diff --git a/spysone_proxy_scraper.py b/spysone_proxy_scraper.py
new file mode 100644
index 0000000..0fcf844
--- /dev/null
+++ b/spysone_proxy_scraper.py
@@ -0,0 +1,103 @@
+import argparse
+import re
+
+# dependencies
+import bs4
+import httpx
+import user_agent
+
def main():
    """Scrape spys.one with the command-line filters and print each result.

    Command-line arguments are parsed first, so an invalid invocation exits
    before any HTTP client is created.  The client is used as a context
    manager so that its connections are released even if scraping fails.
    """
    base_url = 'https://spys.one/en/http-proxy-list/'
    protocol, number, ssl, anonymity, date = return_args()

    with httpx.Client(
        headers={
            'Host': 'spys.one',
            'User-Agent': user_agent.generate_user_agent(),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
        },
        timeout=30,
    ) as client:
        # scrape() is a generator: it must be consumed while the client is
        # still open, hence the loop lives inside the ``with`` block.
        for line in scrape(client, base_url, protocol, number, ssl, anonymity, date):
            print(line)
+
def return_args():
    """Parse the command line and map each option to its numeric form value.

    Returns:
        A 5-tuple ``(protocol, number, ssl, anonymity, date)``.  The first
        four items are the integers looked up from the tables below; ``date``
        is the boolean state of the ``-d/--date`` flag.
    """
    # Lookup tables: command-line choice -> integer form value.
    protocol_codes = {'all': 0, 'http': 1, 'socks': 2}
    number_codes = {'30': 0, '50': 1, '100': 2, '200': 3, '300': 4, '500': 5}
    ssl_codes = {'all': 0, '1': 1, '0': 2}
    anonymity_codes = {'all': 0, 'a+h': 1, 'noa': 2, 'anm': 3, 'hia': 4}

    cli = argparse.ArgumentParser(description='Proxy-scraper for spys.one')
    cli.add_argument(
        '-p', '--protocol',
        choices=list(protocol_codes),
        default='all',
        help='select protocol (default is all)',
    )
    cli.add_argument(
        '-n', '--number',
        choices=list(number_codes),
        default='30',
        help='select number of proxys to show (default is 30)',
    )
    cli.add_argument(
        '-s', '--ssl',
        choices=list(ssl_codes),
        default='all',
        help='turn ssl on/off (default is all)',
    )
    cli.add_argument(
        '-a', '--anonymity',
        choices=list(anonymity_codes),
        default='all',
        help='select level of anonymity (default is all)',
    )
    cli.add_argument(
        '-d', '--date',
        action='store_true',
        help='shows when proxy was last checked (default is off)',
    )

    options = cli.parse_args()

    return (
        protocol_codes[options.protocol],
        number_codes[options.number],
        ssl_codes[options.ssl],
        anonymity_codes[options.anonymity],
        options.date,
    )
+
def scrape(client, base_url, protocol, number, ssl, anonymity, date=False):
    """Yield the proxy entries returned for the given filter values.

    Two POST requests are sent to ``base_url`` through ``client``: the first
    one without a body to obtain the page holding the form, the second one
    with the form data built by ``return_form_data``.  The text of the second
    response is handed to ``parse_proxy_results``, whose items are yielded
    unchanged.
    """
    # post instead of get to simulate first visit without form data
    landing_page = client.post(base_url)
    parsed_page = bs4.BeautifulSoup(landing_page.content, 'lxml')
    payload = return_form_data(parsed_page, protocol, number, ssl, anonymity)

    filtered_html = client.post(base_url, data=payload).text
    yield from parse_proxy_results(filtered_html, date)
+
def return_form_data(soup, protocol=0, number=0, ssl=0, anonymity=0):
    """Build the POST payload for the first ``<form>`` found in *soup*.

    Every named ``<input>`` contributes its ``value`` attribute (empty string
    when absent) and every named ``<select>`` contributes the ``value`` of
    its first ``<option>`` (empty string when absent).  The four filter
    fields are then overwritten with the requested settings.

    Args:
        soup: parsed page exposing ``select_one`` (BeautifulSoup-like).
        protocol: value stored under ``xf5``.
        number: value stored under ``xpp``.
        ssl: value stored under ``xf2``.
        anonymity: value stored under ``xf1``.

    Returns:
        dict mapping form field names to the values to submit.

    Raises:
        ValueError: if the page contains no ``<form>`` element.
    """
    form = soup.select_one('form')
    if form is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError('no <form> element found in the page; the site layout may have changed')

    form_data = {}
    for input_tag in form.find_all('input'):
        name = input_tag.get('name')
        # Controls without a name cannot be submitted, so skip them instead
        # of raising KeyError.
        if name:
            form_data[name] = input_tag.get('value', '')

    for select_tag in form.find_all('select'):
        name = select_tag.get('name')
        first_option = select_tag.option
        # A select without a name or without any option has nothing to send.
        if name and first_option is not None:
            form_data[name] = first_option.get('value', '')

    form_data['xpp'] = number
    form_data['xf1'] = anonymity
    form_data['xf2'] = ssl
    form_data['xf5'] = protocol
    return form_data
+
def parse_proxy_results(response, date):
    """Yield ``ip:port`` strings extracted from the HTML text *response*.

    The page hides each port behind a small script: a first script block
    assigns integers to variable names, and every proxy row carries an
    expression of the form ``(x^y)+(z^w)...`` whose XOR results, written one
    after another, spell the port.

    Args:
        response: HTML text of the result page.
        date: when true, each yielded string is followed by the last-check
            date, time and age, separated by single spaces.

    Yields:
        ``'ip:port'`` or, with *date*, ``'ip:port date time (age)'``.

    Raises:
        ValueError: if the script block defining the port variables is not
            found in *response*.
    """
    SCRIPT_PATTERN = r'javascript">(.*?);<'
    PROXY_PATTERN = r'spy14>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}).*?"\+(.*?)\)<'
    DATE_TIME_PATTERN = r'spy14>(\d{2}-\w{3}-\d{4})</font> (\d{2}:\d{2}) <font class=spy5>([(]\d{1,2} \w{4,5} \w{3}[)])'

    script = re.search(SCRIPT_PATTERN, response)
    if script is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError('port-obfuscation script not found in response; the page layout may have changed')
    cypher = _build_port_cypher(script.group(1))

    proxy_matches = re.findall(PROXY_PATTERN, response)

    if not date:
        # Dates are not needed, so do not let a failed date match drop
        # proxies: iterate over the proxy matches alone.
        for proxy, port_script in proxy_matches:
            yield proxy + ':' + _decode_port(port_script, cypher)
        return

    date_time_matches = re.findall(DATE_TIME_PATTERN, response)
    # Proxies and dates are paired by position; zip stops at the shorter list.
    for (proxy, port_script), (last_check_date, last_check_time, since) in zip(proxy_matches, date_time_matches):
        port = _decode_port(port_script, cypher)
        yield f'{proxy}:{port} {last_check_date} {last_check_time} {since}'


def _build_port_cypher(script_body):
    """Return the name -> integer table defined by ``name=value`` statements."""
    cypher = {}
    for statement in script_body.split(';'):
        if not statement.strip():
            # Tolerate empty statements such as a doubled ';'.
            continue
        key, value = statement.split('=')
        if value.isdigit():
            cypher[key] = int(value)
        else:
            # Form ``<literal>^<name>``: the name must already be defined.
            literal, name = value.split('^')
            cypher[key] = int(literal) ^ cypher[name]
    return cypher


def _decode_port(port_script, cypher):
    """Evaluate a ``(x^y)+(z^w)...`` expression and join the results as text."""
    # Each term is wrapped in parentheses; strip them before splitting on '^'.
    pairs = (term[1:-1].split('^') for term in port_script.split('+'))
    return ''.join(str(cypher[left] ^ cypher[right]) for left, right in pairs)
+
+if __name__ == '__main__':  # run the scraper only when executed as a script, not on import
+ main()
|