diff --git a/.gitignore b/.gitignore index e3d0557..784e884 100644 --- a/.gitignore +++ b/.gitignore @@ -20,4 +20,5 @@ build/ .hypothesis/ test/.hypothesis/ t.py -y.py \ No newline at end of file +y.py +tencent_captcha/ \ No newline at end of file diff --git a/wechatsogou/__init__.py b/wechatsogou/__init__.py index 2a4aefe..0640c9b 100644 --- a/wechatsogou/__init__.py +++ b/wechatsogou/__init__.py @@ -9,7 +9,7 @@ """ WechatSogou Crawler Library -~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~ """ diff --git a/wechatsogou/api.py b/wechatsogou/api.py index 8e1b008..f7fd6f3 100644 --- a/wechatsogou/api.py +++ b/wechatsogou/api.py @@ -22,6 +22,15 @@ class WechatSogouAPI(object): def __init__(self, captcha_break_time=1, proxies=None): + """初始化参数 + + Parameters + ---------- + captcha_break_time : int + 验证码输入错误重试次数 + proxies : dict + 代理 + """ assert isinstance(captcha_break_time, int) and 0 < captcha_break_time < 20 self.captcha_break_times = captcha_break_time @@ -31,8 +40,10 @@ def __set_cookie(self, suv=None, snuid=None, referer=None): suv = ws_cache.get('suv') if suv is None else suv snuid = ws_cache.get('snuid') if snuid is None else snuid - return {'Cookie': 'SUV={};SNUID={};'.format(suv, snuid)} if referer is None else { - 'Cookie': 'SUV={};SNUID={};'.format(suv, snuid), 'Referer': referer} + _headers = {'Cookie': 'SUV={};SNUID={};'.format(suv, snuid)} + if referer is not None: + _headers['Referer'] = referer + return _headers def __set_cache(self, suv, snuid): ws_cache.set('suv', suv) @@ -79,9 +90,8 @@ def __unlock_wechat(self, url, resp, session, unlock_callback=None, identify_ima '[WechatSogouAPI identify image] code: {ret}, msg: {errmsg}, cookie_count: {cookie_count}'.format( ret=r_unlock.get('ret'), errmsg=r_unlock.get('errmsg'), cookie_count=r_unlock.get('cookie_count'))) - def __get_by_unlock(self, url, referer=None, is_need_unlock=None, unlock_platform=None, unlock_callback=None, + def __get_by_unlock(self, url, referer=None, unlock_platform=None, unlock_callback=None, identify_image_callback=None): - assert is_need_unlock is None or callable(is_need_unlock) assert unlock_platform is None or callable(unlock_platform) if identify_image_callback is None: @@ -92,7 +102,7 @@ def __get_by_unlock(self, url, referer=None, is_need_unlock=None, unlock_platfor session = requests.session() resp = self.__get(url, session, headers=self.__set_cookie(referer=referer)) - if is_need_unlock(resp): + if 'antispider' in resp.url or '请输入验证码' in resp.text: for i in range(self.captcha_break_times): try: unlock_platform(url, resp, session, unlock_callback, identify_image_callback) @@ -179,7 +189,6 @@ def search_gzh(self, keyword, page=1, unlock_callback=None, identify_image_callb """ url = WechatSogouRequest.gen_search_gzh_url(keyword, page) resp = self.__get_by_unlock(url, - is_need_unlock=lambda x: 'antispider' in x.url, unlock_platform=self.__unlock_sogou, unlock_callback=unlock_callback, identify_image_callback=identify_image_callback) @@ -243,7 +252,6 @@ def search_article(self, keyword, page=1, timesn=WechatSogouConst.search_article """ url = WechatSogouRequest.gen_search_article_url(keyword, page, timesn, article_type, ft, et) resp = self.__get_by_unlock(url, WechatSogouRequest.gen_search_article_url(keyword), - is_need_unlock=lambda x: 'antispider' in x.url, unlock_platform=self.__unlock_sogou, unlock_callback=unlock_callback, identify_image_callback=identify_image_callback) @@ -321,7 +329,6 @@ def get_gzh_artilce_by_history(self, keyword=None, url=None, url = gzh_list['profile_url'] resp = self.__get_by_unlock(url, WechatSogouRequest.gen_search_article_url(keyword), - is_need_unlock=lambda x: '请输入验证码' in x.text, unlock_platform=self.__unlock_wechat, unlock_callback=unlock_callback_weixin, identify_image_callback=identify_image_callback_weixin) @@ -362,7 +369,6 @@ def get_gzh_artilce_by_hot(self, hot_index, page=1, unlock_callback=None, identi url = WechatSogouRequest.gen_hot_url(hot_index, page) resp = self.__get_by_unlock(url, - is_need_unlock=lambda x: 'antispider' in x.url, unlock_platform=self.__unlock_sogou, unlock_callback=unlock_callback, identify_image_callback=identify_image_callback) diff --git a/wechatsogou/const.py b/wechatsogou/const.py index 99ea5ed..ed41d8a 100644 --- a/wechatsogou/const.py +++ b/wechatsogou/const.py @@ -24,7 +24,10 @@ class _WechatSogouSearchArticleTypeConst(object): @Const class _WechatSogouSearchArticleTimeConst(object): - """时间 0 没有限制 / 1一天 / 2一周 / 3一月 / 4一年 / 5自定""" + """搜索条件 时间 + + 0 没有限制 / 1一天 / 2一周 / 3一月 / 4一年 / 5自定 + """ anytime = 0 day = 1 week = 2 diff --git a/wechatsogou/identify_image.py b/wechatsogou/identify_image.py index 0eb3578..a03510a 100644 --- a/wechatsogou/identify_image.py +++ b/wechatsogou/identify_image.py @@ -8,6 +8,7 @@ from wechatsogou.five import readimg, input from wechatsogou.filecache import WechatCache +from wechatsogou.exceptions import WechatSogouVcodeOcrException ws_cache = WechatCache() @@ -69,7 +70,8 @@ def unlock_sogou_callback_example(url, req, resp, img, identify_image_callback): } r_unlock = req.post(unlock_url, data, headers=headers) if not r_unlock.ok: - raise Exception() # todo use ws exception + raise WechatSogouVcodeOcrException( + 'unlock[{}] failed: {}'.format(unlock_url, r_unlock.text, r_unlock.status_code)) return r_unlock.json() @@ -113,6 +115,7 @@ def unlock_weixin_callback_example(url, req, resp, img, identify_image_callback) } r_unlock = req.post(unlock_url, data, headers=headers) if not r_unlock.ok: - raise Exception() # todo use ws exception + raise WechatSogouVcodeOcrException( + 'unlock[{}] failed: {}[{}]'.format(unlock_url, r_unlock.text, r_unlock.status_code)) return r_unlock.json() diff --git a/wechatsogou/request.py b/wechatsogou/request.py index 7e95dd9..f479a54 100644 --- a/wechatsogou/request.py +++ b/wechatsogou/request.py @@ -8,8 +8,8 @@ from wechatsogou.five import urlencode from wechatsogou.const import WechatSogouConst -_search_type_gzh = 1 # 1 是公号 -_search_type_article = 2 # 2 是文章 +_search_type_gzh = 1 # 公众号 +_search_type_article = 2 # 文章 class WechatSogouRequest(object): @@ -66,16 +66,16 @@ def gen_search_article_url(keyword, page=1, timesn=WechatSogouConst.search_artic else: interation = '' - qsDict = OrderedDict() - qsDict['type'] = _search_type_article - qsDict['page'] = page - qsDict['ie'] = 'utf8' - qsDict['query'] = keyword + qs_dict = OrderedDict() + qs_dict['type'] = _search_type_article + qs_dict['page'] = page + qs_dict['ie'] = 'utf8' + qs_dict['query'] = keyword + qs_dict['interation'] = interation if timesn != 0: - qsDict['tsn'] = timesn - qsDict['ft'] = str(ft) - qsDict['et'] = str(et) - qsDict['interation'] = interation + qs_dict['tsn'] = timesn + qs_dict['ft'] = str(ft) + qs_dict['et'] = str(et) # TODO 账号内搜索 # '账号内 http://weixin.sogou.com/weixin?type=2&ie=utf8&query=%E9%AB%98%E8%80%83&tsn=3&ft=&et=&interation=458754 @@ -83,7 +83,7 @@ def gen_search_article_url(keyword, page=1, timesn=WechatSogouConst.search_artic # qs['wxid'] = wxid # qs['usip'] = usip - return 'http://weixin.sogou.com/weixin?{}'.format(urlencode(qsDict)) + return 'http://weixin.sogou.com/weixin?{}'.format(urlencode(qs_dict)) @staticmethod def gen_search_gzh_url(keyword, page=1): diff --git a/wechatsogou/structuring.py b/wechatsogou/structuring.py index 8c8f1d6..2f7299b 100644 --- a/wechatsogou/structuring.py +++ b/wechatsogou/structuring.py @@ -391,7 +391,7 @@ def get_gzh_artilce_by_hot(text): try: send_time = int(send_time[0]) - except: + except ValueError: send_time = send_time[0] gzh_article_list.append({