search.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
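"""
Scrape Yahoo! Japan search result counts for a list of search terms.

Reads search terms from the file named by `input_name` (the term is the text
before the first space on each line), queries
https://search.yahoo.co.jp/search?p=<term> either directly or through free
HTTP proxies scraped from sslproxies.org, extracts the approximate hit count
("約N件", "about N results") from each results page, and appends
"<input line> -- <count>" to the file named by `output_name`. Input lines
already present in the output file are skipped.
"""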
# import codecs
import time
import re
import random
from fake_useragent import UserAgent
from urllib.request import Request, urlopen
import urllib.parse
from bs4 import BeautifulSoup

# Local variables
input_name = "input"    # file with search terms, one per line
output_name = "output"  # results are appended here as "<input line> -- <count>"
ua = UserAgent()        # used to generate random User-Agent headers
proxies = []            # list of proxy dicts: {'ip': ..., 'port': ...}
no_proxy = True         # start by querying directly, without a proxy


def get_proxies():
    """Fetch a fresh list of free HTTP proxies from sslproxies.org."""
    global proxies
    print("Getting new proxies")
    # Retry until the proxy list page has been downloaded
    try_proxy = True
    while try_proxy:
        try:
            proxies_req = Request('https://www.sslproxies.org/')
            proxies_req.add_header('User-Agent', ua.random)
            proxies_doc = urlopen(proxies_req).read().decode('utf8')
            try_proxy = False
        except Exception:
            print("Failed to get proxies, trying again in 5 seconds")
            time.sleep(5)
    soup = BeautifulSoup(proxies_doc, 'html.parser')
    proxies_table = soup.find(id='proxylisttable')
    # Save each proxy's ip and port in the global list
    for row in proxies_table.tbody.find_all('tr'):
        proxies.append({
            'ip': row.find_all('td')[0].string,
            'port': row.find_all('td')[1].string
        })
    proxies = proxies[:75]  # trim the list so it is exhausted (and the no-proxy path re-enabled) more often
    print("Got new proxies")


def random_proxy():
    """Return a random index into the proxies list (the index is needed so a dead proxy can be deleted)."""
    global no_proxy
    if len(proxies) < 10:  # list almost exhausted, refresh it
        get_proxies()
        no_proxy = True    # re-enable the direct (no-proxy) path
    return random.randint(0, len(proxies) - 1)


def get_no_proxy(entry):
    """Query Yahoo! Japan search directly (no proxy); return the page HTML, or -1 on failure."""
    url = ("https://search.yahoo.co.jp/search?p="
           + urllib.parse.quote(entry, encoding="utf-8"))
    try:
        return urlopen(url).read().decode('utf8')
    except Exception:
        return -1


def get_proxy(entry, proxy_index):
    """Query Yahoo! Japan search through the given proxy; return the page HTML, or -1 on failure."""
    proxy = proxies[proxy_index]
    url = ("https://search.yahoo.co.jp/search?p="
           + urllib.parse.quote(entry, encoding="utf-8"))
    req = Request(url)
    req.add_header('User-Agent', ua.random)
    req.set_proxy(proxy['ip'] + ':' + proxy['port'], 'http')
    # Make the call
    try:
        return urlopen(req, timeout=30).read().decode('utf8')
    except Exception:  # on error the caller deletes this proxy and picks another one
        return -1


def soup(html):
    """Extract the approximate result count from a Yahoo! Japan search results page."""
    page = BeautifulSoup(html, 'html.parser')
    info = page.find_all("div", id="inf")
    # The hit count appears as "約N件" ("about N results"); strip the thousands separators
    count = re.search(r'^.*約(.*)件', info[0].text).group(1)
    count = count.replace(',', '')
    return count


if __name__ == "__main__":
    input_file = open(input_name, 'r')
    output = open(output_name, 'r+')
    results = list()
    get_proxies()
    # Choose a random proxy
    proxy_index = random_proxy()
    first_scan = True
    for line in input_file:
        # The search term is everything before the first space on the line
        entry = re.search(r'^(.*?) .*$', line).group(1)
        if first_scan:
            # Skip entries already present in the output file
            # (assumes output lines are in the same order as the input)
            check_output = output.readline()
            if re.search(r'^' + entry, check_output):
                # already searched
                continue
            else:
                first_scan = False
        print(entry)
        # Found an entry that was not searched yet
        while True:
            if no_proxy:
                result = get_no_proxy(entry)
                if result != -1:  # no page error
                    count = soup(result)
                    break
                else:
                    print("Failed to scrape without a proxy")
                    no_proxy = False
            else:
                result = get_proxy(entry, proxy_index)
                if result != -1:
                    count = soup(result)
                    break
                else:
                    # The proxy failed: delete it and pick another one
                    proxy = proxies[proxy_index]
                    del proxies[proxy_index]
                    print('Proxy ' + proxy['ip'] + ':' +
                          proxy['port'] + ' deleted.')
                    proxy_index = random_proxy()
        print(line[:-1] + " -- " + count)
        output.write(line[:-1] + " -- " + count + "\n")
        # Close and reopen in append mode so each result is flushed to disk immediately
        output.close()
        output = open(output_name, 'a')
        count = "AAA"  # sentinel so a stale count is never reused
    output.close()
    input_file.close()
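
# To run: create an "input" file (one term per line, each term followed by a space
# and any extra text) and an existing "output" file (opened in 'r+' mode above),
# then execute ./search.py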