chyroc · chyroc · Oct 1, 2017 · Oct 1, 2017 · Oct 1, 2017
diff --git a/.gitignore b/.gitignore
@@ -20,4 +20,5 @@ build/
 .hypothesis/
 test/.hypothesis/
 t.py
-y.py
+y.py
+tencent_captcha/
diff --git a/wechatsogou/__init__.py b/wechatsogou/__init__.py
@@ -9,7 +9,7 @@
 
 """
 WechatSogou Crawler Library
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 """
 

diff --git a/wechatsogou/api.py b/wechatsogou/api.py
@@ -22,6 +22,15 @@
 
 class WechatSogouAPI(object):
     def __init__(self, captcha_break_time=1, proxies=None):
+        """初始化参数
+
+        Parameters
+        ----------
+        captcha_break_time : int
+            验证码输入错误重试次数
+        proxies : dict
+            代理
+        """
         assert isinstance(captcha_break_time, int) and 0 < captcha_break_time < 20
 
         self.captcha_break_times = captcha_break_time
@@ -31,8 +40,10 @@ def __set_cookie(self, suv=None, snuid=None, referer=None):
         suv = ws_cache.get('suv') if suv is None else suv
         snuid = ws_cache.get('snuid') if snuid is None else snuid
 
-        return {'Cookie': 'SUV={};SNUID={};'.format(suv, snuid)} if referer is None else {
-            'Cookie': 'SUV={};SNUID={};'.format(suv, snuid), 'Referer': referer}
+        _headers = {'Cookie': 'SUV={};SNUID={};'.format(suv, snuid)}
+        if referer is not None:
+            _headers['Referer'] = referer
+        return _headers
 
     def __set_cache(self, suv, snuid):
         ws_cache.set('suv', suv)
@@ -79,9 +90,8 @@ def __unlock_wechat(self, url, resp, session, unlock_callback=None, identify_ima
                 '[WechatSogouAPI identify image] code: {ret}, msg: {errmsg}, cookie_count: {cookie_count}'.format(
                     ret=r_unlock.get('ret'), errmsg=r_unlock.get('errmsg'), cookie_count=r_unlock.get('cookie_count')))
 
-    def __get_by_unlock(self, url, referer=None, is_need_unlock=None, unlock_platform=None, unlock_callback=None,
+    def __get_by_unlock(self, url, referer=None, unlock_platform=None, unlock_callback=None,
                         identify_image_callback=None):
-        assert is_need_unlock is None or callable(is_need_unlock)
         assert unlock_platform is None or callable(unlock_platform)
 
         if identify_image_callback is None:
@@ -92,7 +102,7 @@ def __get_by_unlock(self, url, referer=None, is_need_unlock=None, unlock_platfor
         session = requests.session()
         resp = self.__get(url, session, headers=self.__set_cookie(referer=referer))
 
-        if is_need_unlock(resp):
+        if 'antispider' in resp.url or '请输入验证码' in resp.text:
             for i in range(self.captcha_break_times):
                 try:
                     unlock_platform(url, resp, session, unlock_callback, identify_image_callback)
@@ -179,7 +189,6 @@ def search_gzh(self, keyword, page=1, unlock_callback=None, identify_image_callb
         """
         url = WechatSogouRequest.gen_search_gzh_url(keyword, page)
         resp = self.__get_by_unlock(url,
-                                    is_need_unlock=lambda x: 'antispider' in x.url,
                                     unlock_platform=self.__unlock_sogou,
                                     unlock_callback=unlock_callback,
                                     identify_image_callback=identify_image_callback)
@@ -243,7 +252,6 @@ def search_article(self, keyword, page=1, timesn=WechatSogouConst.search_article
         """
         url = WechatSogouRequest.gen_search_article_url(keyword, page, timesn, article_type, ft, et)
         resp = self.__get_by_unlock(url, WechatSogouRequest.gen_search_article_url(keyword),
-                                    is_need_unlock=lambda x: 'antispider' in x.url,
                                     unlock_platform=self.__unlock_sogou,
                                     unlock_callback=unlock_callback,
                                     identify_image_callback=identify_image_callback)
@@ -321,7 +329,6 @@ def get_gzh_artilce_by_history(self, keyword=None, url=None,
             url = gzh_list['profile_url']
 
         resp = self.__get_by_unlock(url, WechatSogouRequest.gen_search_article_url(keyword),
-                                    is_need_unlock=lambda x: '请输入验证码' in x.text,
                                     unlock_platform=self.__unlock_wechat,
                                     unlock_callback=unlock_callback_weixin,
                                     identify_image_callback=identify_image_callback_weixin)
@@ -362,7 +369,6 @@ def get_gzh_artilce_by_hot(self, hot_index, page=1, unlock_callback=None, identi
 
         url = WechatSogouRequest.gen_hot_url(hot_index, page)
         resp = self.__get_by_unlock(url,
-                                    is_need_unlock=lambda x: 'antispider' in x.url,
                                     unlock_platform=self.__unlock_sogou,
                                     unlock_callback=unlock_callback,
                                     identify_image_callback=identify_image_callback)

diff --git a/wechatsogou/const.py b/wechatsogou/const.py
@@ -24,7 +24,10 @@ class _WechatSogouSearchArticleTypeConst(object):
 
 @Const
 class _WechatSogouSearchArticleTimeConst(object):
-    """时间 0 没有限制 / 1一天 / 2一周 / 3一月 / 4一年 / 5自定"""
+    """搜索条件 时间
+
+    0 没有限制 / 1一天 / 2一周 / 3一月 / 4一年 / 5自定
+    """
     anytime = 0
     day = 1
     week = 2

diff --git a/wechatsogou/identify_image.py b/wechatsogou/identify_image.py
@@ -8,6 +8,7 @@
 
 from wechatsogou.five import readimg, input
 from wechatsogou.filecache import WechatCache
+from wechatsogou.exceptions import WechatSogouVcodeOcrException
 
 ws_cache = WechatCache()
 
@@ -69,7 +70,8 @@ def unlock_sogou_callback_example(url, req, resp, img, identify_image_callback):
     }
     r_unlock = req.post(unlock_url, data, headers=headers)
     if not r_unlock.ok:
-        raise Exception()  # todo use ws exception
+        raise WechatSogouVcodeOcrException(
+            'unlock[{}] failed: {}[{}]'.format(unlock_url, r_unlock.text, r_unlock.status_code))
 
     return r_unlock.json()
 
@@ -113,6 +115,7 @@ def unlock_weixin_callback_example(url, req, resp, img, identify_image_callback)
     }
     r_unlock = req.post(unlock_url, data, headers=headers)
     if not r_unlock.ok:
-        raise Exception()  # todo use ws exception
+        raise WechatSogouVcodeOcrException(
+            'unlock[{}] failed: {}[{}]'.format(unlock_url, r_unlock.text, r_unlock.status_code))
 
     return r_unlock.json()
diff --git a/wechatsogou/request.py b/wechatsogou/request.py
@@ -8,8 +8,8 @@
 from wechatsogou.five import urlencode
 from wechatsogou.const import WechatSogouConst
 
-_search_type_gzh = 1  # 1 是公号
-_search_type_article = 2  # 2 是文章
+_search_type_gzh = 1  # 公众号
+_search_type_article = 2  # 文章
 
 
 class WechatSogouRequest(object):
@@ -66,24 +66,24 @@ def gen_search_article_url(keyword, page=1, timesn=WechatSogouConst.search_artic
         else:
             interation = ''
 
-        qsDict = OrderedDict()
-        qsDict['type'] = _search_type_article
-        qsDict['page'] = page
-        qsDict['ie'] = 'utf8'
-        qsDict['query'] = keyword
+        qs_dict = OrderedDict()
+        qs_dict['type'] = _search_type_article
+        qs_dict['page'] = page
+        qs_dict['ie'] = 'utf8'
+        qs_dict['query'] = keyword
+        qs_dict['interation'] = interation
         if timesn != 0:
-            qsDict['tsn'] = timesn
-            qsDict['ft'] = str(ft)
-            qsDict['et'] = str(et)
-        qsDict['interation'] = interation
+            qs_dict['tsn'] = timesn
+            qs_dict['ft'] = str(ft)
+            qs_dict['et'] = str(et)
 
         # TODO 账号内搜索
         # '账号内 http://weixin.sogou.com/weixin?type=2&ie=utf8&query=%E9%AB%98%E8%80%83&tsn=3&ft=&et=&interation=458754
         # &wxid=oIWsFt1tmWoG6vO6BcsS7St61bRE&usip=nanhangqinggong'
         # qs['wxid'] = wxid
         # qs['usip'] = usip
 
-        return 'http://weixin.sogou.com/weixin?{}'.format(urlencode(qsDict))
+        return 'http://weixin.sogou.com/weixin?{}'.format(urlencode(qs_dict))
 
     @staticmethod
     def gen_search_gzh_url(keyword, page=1):

diff --git a/wechatsogou/structuring.py b/wechatsogou/structuring.py
@@ -391,7 +391,7 @@ def get_gzh_artilce_by_hot(text):
 
             try:
                 send_time = int(send_time[0])
-            except:
+            except ValueError:
                 send_time = send_time[0]
 
             gzh_article_list.append({