Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

remove is_need_unlock #147

Merged: 2 commits, Oct 1, 2017
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,5 @@ build/
.hypothesis/
test/.hypothesis/
t.py
y.py
y.py
tencent_captcha/
2 changes: 1 addition & 1 deletion wechatsogou/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

"""
WechatSogou Crawler Library
~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~

"""

Expand Down
24 changes: 15 additions & 9 deletions wechatsogou/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,15 @@

class WechatSogouAPI(object):
def __init__(self, captcha_break_time=1, proxies=None):
"""初始化参数

Parameters
----------
captcha_break_time : int
验证码输入错误重试次数
proxies : dict
代理
"""
assert isinstance(captcha_break_time, int) and 0 < captcha_break_time < 20

self.captcha_break_times = captcha_break_time
Expand All @@ -31,8 +40,10 @@ def __set_cookie(self, suv=None, snuid=None, referer=None):
suv = ws_cache.get('suv') if suv is None else suv
snuid = ws_cache.get('snuid') if snuid is None else snuid

return {'Cookie': 'SUV={};SNUID={};'.format(suv, snuid)} if referer is None else {
'Cookie': 'SUV={};SNUID={};'.format(suv, snuid), 'Referer': referer}
_headers = {'Cookie': 'SUV={};SNUID={};'.format(suv, snuid)}
if referer is not None:
_headers['Referer'] = referer
return _headers

def __set_cache(self, suv, snuid):
ws_cache.set('suv', suv)
Expand Down Expand Up @@ -79,9 +90,8 @@ def __unlock_wechat(self, url, resp, session, unlock_callback=None, identify_ima
'[WechatSogouAPI identify image] code: {ret}, msg: {errmsg}, cookie_count: {cookie_count}'.format(
ret=r_unlock.get('ret'), errmsg=r_unlock.get('errmsg'), cookie_count=r_unlock.get('cookie_count')))

def __get_by_unlock(self, url, referer=None, is_need_unlock=None, unlock_platform=None, unlock_callback=None,
def __get_by_unlock(self, url, referer=None, unlock_platform=None, unlock_callback=None,
identify_image_callback=None):
assert is_need_unlock is None or callable(is_need_unlock)
assert unlock_platform is None or callable(unlock_platform)

if identify_image_callback is None:
Expand All @@ -92,7 +102,7 @@ def __get_by_unlock(self, url, referer=None, is_need_unlock=None, unlock_platfor
session = requests.session()
resp = self.__get(url, session, headers=self.__set_cookie(referer=referer))

if is_need_unlock(resp):
if 'antispider' in resp.url or '请输入验证码' in resp.text:
for i in range(self.captcha_break_times):
try:
unlock_platform(url, resp, session, unlock_callback, identify_image_callback)
Expand Down Expand Up @@ -179,7 +189,6 @@ def search_gzh(self, keyword, page=1, unlock_callback=None, identify_image_callb
"""
url = WechatSogouRequest.gen_search_gzh_url(keyword, page)
resp = self.__get_by_unlock(url,
is_need_unlock=lambda x: 'antispider' in x.url,
unlock_platform=self.__unlock_sogou,
unlock_callback=unlock_callback,
identify_image_callback=identify_image_callback)
Expand Down Expand Up @@ -243,7 +252,6 @@ def search_article(self, keyword, page=1, timesn=WechatSogouConst.search_article
"""
url = WechatSogouRequest.gen_search_article_url(keyword, page, timesn, article_type, ft, et)
resp = self.__get_by_unlock(url, WechatSogouRequest.gen_search_article_url(keyword),
is_need_unlock=lambda x: 'antispider' in x.url,
unlock_platform=self.__unlock_sogou,
unlock_callback=unlock_callback,
identify_image_callback=identify_image_callback)
Expand Down Expand Up @@ -321,7 +329,6 @@ def get_gzh_artilce_by_history(self, keyword=None, url=None,
url = gzh_list['profile_url']

resp = self.__get_by_unlock(url, WechatSogouRequest.gen_search_article_url(keyword),
is_need_unlock=lambda x: '请输入验证码' in x.text,
unlock_platform=self.__unlock_wechat,
unlock_callback=unlock_callback_weixin,
identify_image_callback=identify_image_callback_weixin)
Expand Down Expand Up @@ -362,7 +369,6 @@ def get_gzh_artilce_by_hot(self, hot_index, page=1, unlock_callback=None, identi

url = WechatSogouRequest.gen_hot_url(hot_index, page)
resp = self.__get_by_unlock(url,
is_need_unlock=lambda x: 'antispider' in x.url,
unlock_platform=self.__unlock_sogou,
unlock_callback=unlock_callback,
identify_image_callback=identify_image_callback)
Expand Down
5 changes: 4 additions & 1 deletion wechatsogou/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,10 @@ class _WechatSogouSearchArticleTypeConst(object):

@Const
class _WechatSogouSearchArticleTimeConst(object):
"""时间 0 没有限制 / 1一天 / 2一周 / 3一月 / 4一年 / 5自定"""
"""搜索条件 时间

0 没有限制 / 1一天 / 2一周 / 3一月 / 4一年 / 5自定
"""
anytime = 0
day = 1
week = 2
Expand Down
7 changes: 5 additions & 2 deletions wechatsogou/identify_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from wechatsogou.five import readimg, input
from wechatsogou.filecache import WechatCache
from wechatsogou.exceptions import WechatSogouVcodeOcrException

ws_cache = WechatCache()

Expand Down Expand Up @@ -69,7 +70,8 @@ def unlock_sogou_callback_example(url, req, resp, img, identify_image_callback):
}
r_unlock = req.post(unlock_url, data, headers=headers)
if not r_unlock.ok:
raise Exception() # todo use ws exception
raise WechatSogouVcodeOcrException(
'unlock[{}] failed: {}'.format(unlock_url, r_unlock.text, r_unlock.status_code))

return r_unlock.json()

Expand Down Expand Up @@ -113,6 +115,7 @@ def unlock_weixin_callback_example(url, req, resp, img, identify_image_callback)
}
r_unlock = req.post(unlock_url, data, headers=headers)
if not r_unlock.ok:
raise Exception() # todo use ws exception
raise WechatSogouVcodeOcrException(
'unlock[{}] failed: {}[{}]'.format(unlock_url, r_unlock.text, r_unlock.status_code))

return r_unlock.json()
24 changes: 12 additions & 12 deletions wechatsogou/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
from wechatsogou.five import urlencode
from wechatsogou.const import WechatSogouConst

_search_type_gzh = 1 # 1 是公号
_search_type_article = 2 # 2 是文章
_search_type_gzh = 1 # 公众号
_search_type_article = 2 # 文章


class WechatSogouRequest(object):
Expand Down Expand Up @@ -66,24 +66,24 @@ def gen_search_article_url(keyword, page=1, timesn=WechatSogouConst.search_artic
else:
interation = ''

qsDict = OrderedDict()
qsDict['type'] = _search_type_article
qsDict['page'] = page
qsDict['ie'] = 'utf8'
qsDict['query'] = keyword
qs_dict = OrderedDict()
qs_dict['type'] = _search_type_article
qs_dict['page'] = page
qs_dict['ie'] = 'utf8'
qs_dict['query'] = keyword
qs_dict['interation'] = interation
if timesn != 0:
qsDict['tsn'] = timesn
qsDict['ft'] = str(ft)
qsDict['et'] = str(et)
qsDict['interation'] = interation
qs_dict['tsn'] = timesn
qs_dict['ft'] = str(ft)
qs_dict['et'] = str(et)

# TODO 账号内搜索
# '账号内 http://weixin.sogou.com/weixin?type=2&ie=utf8&query=%E9%AB%98%E8%80%83&tsn=3&ft=&et=&interation=458754
# &wxid=oIWsFt1tmWoG6vO6BcsS7St61bRE&usip=nanhangqinggong'
# qs['wxid'] = wxid
# qs['usip'] = usip

return 'http://weixin.sogou.com/weixin?{}'.format(urlencode(qsDict))
return 'http://weixin.sogou.com/weixin?{}'.format(urlencode(qs_dict))

@staticmethod
def gen_search_gzh_url(keyword, page=1):
Expand Down
2 changes: 1 addition & 1 deletion wechatsogou/structuring.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,7 @@ def get_gzh_artilce_by_hot(text):

try:
send_time = int(send_time[0])
except:
except ValueError:
send_time = send_time[0]

gzh_article_list.append({
Expand Down