Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix/re ocr for get gzh article by url text #43

Merged
merged 2 commits into from
Dec 15, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 18 additions & 4 deletions wechatsogou/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@ def _ocr_for_get_gzh_article_by_url_text(self, url):
result = self._ocr.create(coder.content, 2040)
img_code = result['Result']
else:
result = None
im = readimg(coder.content)
im.show()
img_code = printf("please input code: ")
Expand All @@ -231,6 +232,7 @@ def _ocr_for_get_gzh_article_by_url_text(self, url):
raise WechatSogouVcodeException('cannot jiefeng get_gzh_article because ' + remsg['errmsg'])
self._cache.set(config.cache_session_name, self._session)
logger.debug('ocr ', remsg['errmsg'])
return result

def _replace_html(self, s):
"""替换html‘"’等转义内容为正常内容
Expand Down Expand Up @@ -339,12 +341,24 @@ def _get_gzh_article_by_url_text(self, url):
Returns:
text: 返回的文本
"""
if hasattr(self, '_get_gzh_article_by_url_text_counter'):
self._get_gzh_article_by_url_text_counter += 1
text = ''
else:
text = self._get(url, 'get', host='mp.weixin.qq.com')
self._get_gzh_article_by_url_text_counter = 1

text = self._get(url, 'get', host='mp.weixin.qq.com')
if u'为了保护你的网络安全,请输入验证码' in text:
self._ocr_for_get_gzh_article_by_url_text(url)

if u'为了保护你的网络安全,请输入验证码' in text or self._get_gzh_article_by_url_text_counter > 1:
result = self._ocr_for_get_gzh_article_by_url_text(url)
text = self._get(url, 'get', host='mp.weixin.qq.com')
if '验证码有误' in text:
print('验证时输入错误')
if result:
self._ocr.report_error(result['Id'])
if self._get_gzh_article_by_url_text_counter > 1:
raise WechatSogouVcodeOcrException('验证码识别错误 url:{}'.format(url))

self._get_gzh_article_by_url_text(url)
return text

def _get_gzh_article_gzh_by_url_dict(self, text, url):
Expand Down
4 changes: 4 additions & 0 deletions wechatsogou/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ class WechatSogouVcodeException(WechatSogouException):
pass


class WechatSogouVcodeOcrException(WechatSogouException):
    """Verification-code (captcha) recognition error.

    Exception class of the Sogou-search-based WeChat official-account
    crawler interface, used for verification-code recognition errors.
    """
class WechatSogouJsonException(WechatSogouException):
"""基于搜狗搜索的的微信公众号爬虫接口 非标准json数据 异常类
"""
Expand Down