diff --git a/test/TestOne.py b/test/TestOne.py deleted file mode 100644 index 781b0c5..0000000 --- a/test/TestOne.py +++ /dev/null @@ -1,10 +0,0 @@ -import unittest - - -class TestOne(unittest.TestCase): - def test_for_fun(self): - pass - - -if __name__ == '__main__': - unittest.main() diff --git a/test/TestTools.py b/test/TestTools.py new file mode 100644 index 0000000..979659e --- /dev/null +++ b/test/TestTools.py @@ -0,0 +1,60 @@ +import unittest +from nose.tools import assert_raises, assert_equal + +from lxml import etree +from wechatsogou.tools import ( + list_or_empty, + get_elem_text, + replace_html, + str_to_dict, + replace_space, + get_url_param +) + + +class TestTools(unittest.TestCase): + def test_list_or_empty(self): + with assert_raises(AssertionError): + list_or_empty('test for fun') + + assert_equal(list_or_empty(['1', '2'], int), 1) + assert_equal(list_or_empty(['1', '2']), '1') + assert_equal(list_or_empty([], int), 0) + assert_equal(list_or_empty([], str), '') + assert_equal(list_or_empty([], list), []) + + def test_get_elem_text(self): + html = ''' +
+
111
+
222
+
+ ''' + elem = etree.HTML(html) + assert_equal(get_elem_text(elem), '111222') + + def test_replace_html(self): + html = ''''"&¥amp;<> \\''' + assert_equal(replace_html(html), '\'"&¥<> ') + + html = [''', '"', '&', '¥', 'amp;', '<', '>', ' ', '\\'] + assert_equal(replace_html(html), ['\'', '"', '&', '¥', '', '<', '>', ' ', '']) + + html = {''': '"'} + assert_equal(replace_html(html), {'\'': '"'}) + + def test_str_to_dict(self): + string = "{'a':'a'}" + assert_equal(str_to_dict(string), {'a': 'a'}) + + def test_replace_space(self): + string = 'ss ss' + assert_equal(replace_space(string), 'ssss') + + def test_get_url_param(self): + url = 'http://example.com?a=1&b=2&a=3' + assert_equal(get_url_param(url), {'a': ['1', '3'], 'b': ['2']}) + + +if __name__ == '__main__': + unittest.main() diff --git a/wechatsogou/basic.py b/wechatsogou/basic.py index db3b834..e260fbf 100644 --- a/wechatsogou/basic.py +++ b/wechatsogou/basic.py @@ -4,6 +4,7 @@ import time import random import logging +from builtins import input import requests from lxml import etree @@ -43,8 +44,7 @@ def readimg(content): from .ruokuaicode import RClient from .filecache import WechatCache from .tools import ( - input, - replace_all, + replace_html, replace_space, get_encoding_from_reponse ) @@ -328,7 +328,7 @@ def _get_gzh_article_by_url_dict(self, text): msglist = re.findall("var msgList =(.+?)};", text, re.S)[0] msglist = msglist + '}' msgdict = eval(msglist) - msgdict = replace_all(msgdict) + msgdict = replace_html(msgdict) return msgdict def _deal_gzh_article_dict(self, msgdict, **kwargs): diff --git a/wechatsogou/tools.py b/wechatsogou/tools.py index 7fe2771..4abffb7 100644 --- a/wechatsogou/tools.py +++ b/wechatsogou/tools.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -import json +import ast import requests @@ -10,29 +10,23 @@ import urllib.parse as url_parse -def prdict(content): - msg = json.dumps(content, indent=1, ensure_ascii=False) - print(msg) - - def list_or_empty(content, contype=None): - if isinstance(content, list): - if content: - return contype(content[0]) if contype else content[0] - else: - if contype: - if contype == int: - return 0 - elif contype == str: - return '' - elif contype == list: - return [] - else: - raise Exception('only cna deal int str list') - else: - return '' + assert isinstance(content, list), 'content is not list: {}'.format(content) + + if content: + return contype(content[0]) if contype else content[0] else: - raise Exception('need list') + if contype: + if contype == int: + return 0 + elif contype == str: + return '' + elif contype == list: + return [] + else: + raise Exception('only can deal int str list') + else: + return '' def get_elem_text(elem): @@ -44,10 +38,7 @@ def get_elem_text(elem): Returns: elem中文字 """ - rc = [] - for node in elem.itertext(): - rc.append(node.strip()) - return ''.join(rc) + return ''.join([node.strip() for node in elem.itertext()]) def get_encoding_from_reponse(r): @@ -63,7 +54,7 @@ def get_encoding_from_reponse(r): return encoding[0] if encoding else requests.utils.get_encoding_from_headers(r.headers) -def _replace_html(s): +def _replace_str_html(s): """替换html‘"’等转义内容为正常内容 Args: @@ -72,68 +63,42 @@ def _replace_html(s): Returns: s: 处理反转义后的文字 """ - s = s.replace(''', '\'') - s = s.replace('"', '"') - s = s.replace('&', '&') - s = s.replace('>', '>') - s = s.replace('<', '<') - s = s.replace('¥', '¥') - s = s.replace('amp;', '') - s = s.replace('<', '<') - s = s.replace('>', '>') - s = s.replace(' ', ' ') - s = s.replace('\\', '') + html_str_list = [ + (''', '\''), + ('"', '"'), + ('&', '&'), + ('¥', '¥'), + ('amp;', ''), + ('<', '<'), + ('>', '>'), + (' ', ' '), + ('\\', '') + ] + for i in html_str_list: + s = s.replace(i[0], i[1]) return s -def _replace_dict(dicts): - retu_dict = dict() - for k, v in dicts.items(): - retu_dict[replace_all(k)] = replace_all(v) - return retu_dict - - -def _replace_list(lists): - retu_list = list() - for l in lists: - retu_list.append(replace_all(l)) - return retu_list - - -def replace_all(data): +def replace_html(data): if isinstance(data, dict): - return _replace_dict(data) + return dict([(replace_html(k), replace_html(v)) for k, v in data.items()]) elif isinstance(data, list): - return _replace_list(data) + return [replace_html(l) for l in data] elif isinstance(data, str): - return _replace_html(data) + return _replace_str_html(data) else: return data def str_to_dict(json_str): - json_dict = eval(json_str) - return replace_all(json_dict) + json_dict = ast.literal_eval(json_str) + return replace_html(json_dict) def replace_space(s): - s = s.replace(' ', '') - s = s.replace('\r\n', '') - return s + return s.replace(' ', '').replace('\r\n', '') def get_url_param(url): result = url_parse.urlparse(url) return url_parse.parse_qs(result.query, True) - - -def input(msg=''): - try: - return raw_input(msg) - except NameError: - return input(msg) - - -if __name__ == '__main__': - aa = list_or_empty(['list']) - print(aa, type(aa))