Given a text file, count its words case-insensitively and display the ten most frequently occurring words.
from pathlib import Path
from collections import defaultdict

_dict = defaultdict(int)
file = Path.cwd()/'_path'   # '_path' is a placeholder for the target file
with file.open(encoding='utf-8') as f:
    for line in f:
        for j in line.lower().split():   # lowercase so counting is case-insensitive
            _dict[j] += 1
order = sorted(_dict.items(), key=lambda x: x[1], reverse=True)
print(order)
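As an aside, the standard library already covers this pattern: collections.Counter plus its most_common method replaces the defaultdict-and-sorted combination. A minimal sketch, keeping the '_path' placeholder from above:

from pathlib import Path
from collections import Counter

counter = Counter()
with (Path.cwd()/'_path').open(encoding='utf-8') as f:
    for line in f:
        # update() accepts any iterable of hashables
        counter.update(word.lower() for word in line.split())
print(counter.most_common(10))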
Keep optimizing: treat punctuation as word separators.
from pathlib import Path
from collections import defaultdict

_dict = defaultdict(int)
file = Path.cwd()/'_path'

def make_key(files, chars=set(r'''!~@#$%^&*(){}[]:'"/\.,''')):
    # Replace every separator character with a space, then split on whitespace.
    ret = []
    for i in files.lower():
        if i in chars:
            ret.append(' ')
        else:
            ret.append(i)
    return ''.join(ret).split()

with file.open(encoding='utf-8') as f:
    for line in f:
        for j in make_key(line):
            _dict[j] += 1
order = sorted(_dict.items(), key=lambda x: x[1], reverse=True)
print(order)
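Replacing each separator with a space is exactly what str.translate was built for; a sketch of the same idea (make_key_translate is a hypothetical name, the separator set is copied from above):

SEPARATORS = r'''!~@#$%^&*(){}[]:'"/\.,'''
TABLE = str.maketrans(SEPARATORS, ' ' * len(SEPARATORS))

def make_key_translate(line):
    # translate() substitutes per character in C; split() then tokenizes
    return line.lower().translate(TABLE).split()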
Keep optimizing: scan with a sliding window and slice each word out directly.
from pathlib import Path
from collections import defaultdict

_dict = defaultdict(int)
file = Path.cwd()/'_path'

def _make_key(files, chars=set(r'''! ~@#$%^&*(){}[]:'"/\.,''')):
    # Slide a window over the line: `start` marks the beginning of the current word.
    start, ret, length = 0, [], len(files)
    for i, j in enumerate(files):
        if j in chars:
            if start == i:          # consecutive separators: just advance the window
                start += 1
                continue
            ret.append(files[start:i])
            start = i + 1
    else:                           # for/else: runs once the scan completes
        if start < length:          # flush the trailing word, if any
            ret.append(files[start:])
    return ret

with file.open(encoding='utf-8') as f:
    for line in f:
        for j in _make_key(line):
            _dict[j.lower()] += 1   # the slices keep their original case, so lowercase here
order = sorted(_dict.items(), key=lambda x: x[1], reverse=True)
print(order)
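For comparison, a regular expression performs the same windowed scan: each match is a maximal run of word characters, so consecutive separators can never produce an empty token. A hedged sketch (the \w+ pattern is an assumption and differs slightly from the chars set above, e.g. it keeps digits and underscores):

import re

WORD = re.compile(r"\w+")

def make_key_regex(line):
    # findall returns every non-overlapping maximal run of word characters
    return WORD.findall(line.lower())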
Keep optimizing: use yield to turn the scanner into a generator.
from pathlib import Path
from collections import defaultdict

_dict = defaultdict(int)
file = Path.cwd()/'_path'

def _make_key(files, chars=set('''! ~@#$%^&*(){}[]:'"/\n\r.,><''')):
    # Same sliding window as before, but yield each word lazily instead of building a list.
    start, length = 0, len(files)
    for i, j in enumerate(files):
        if j in chars:
            if start == i:          # consecutive separators: just advance the window
                start += 1
                continue
            yield files[start:i]
            start = i + 1
    else:                           # for/else: runs once the scan completes
        if start < length:          # flush the trailing word, if any
            yield files[start:]

with file.open(encoding='utf-8') as f:
    for line in f:
        for j in map(str.lower, _make_key(line)):
            _dict[j] += 1
order = sorted(_dict.items(), key=lambda x: x[1], reverse=True)[:10]
print(order)
# The output looks like this:
[('path', 212), ('the', 170), ('a', 117), ('p', 93), ('is', 88), ('to', 74), ('py', 68), ('if', 67), ('file', 61), ('pureposixpath', 59)]
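Sorting the whole dictionary only to keep ten entries does more work than necessary; heapq.nlargest maintains a ten-element heap while scanning instead. A sketch reusing _dict from above:

import heapq

top10 = heapq.nlargest(10, _dict.items(), key=lambda item: item[1])
print(top10)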
Keep optimizing: filter out stop words.
drop_words = {'a', 'is', 'an', 'this', 'that', 'the'}
_dict = defaultdict(int)   # start from a fresh counter instead of accumulating into the old one
with file.open(encoding='utf-8') as f:
    for line in f:
        for j in map(str.lower, _make_key(line)):
            if j not in drop_words:   # skip stop words before counting
                _dict[j] += 1
order = sorted(_dict.items(), key=lambda x: x[1], reverse=True)[:10]
print(order)
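The same stop-word filter reads naturally as a generator expression feeding a Counter; a sketch reusing file, _make_key, and drop_words from above:

from collections import Counter

counter = Counter()
with file.open(encoding='utf-8') as f:
    for line in f:
        counter.update(w for w in map(str.lower, _make_key(line)) if w not in drop_words)
print(counter.most_common(10))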
Wrap it all up in reusable functions.
from pathlib import Path
from collections import defaultdict

def _make_key(list4src: str, chars=set(r'''!~@#$%^&*(){}[]:-'"/\., ‘“''') | {'\n', '\r'}):
    """Yield the words of a line, treating every character in `chars` as a separator."""
    # Equivalent eager one-liner:
    # return ''.join(' ' if c in chars else c for c in list4src).split()
    start = 0
    for index, char in enumerate(list4src):
        if char in chars:
            if start != index:
                yield list4src[start:index]
            start = index + 1
    if start < len(list4src):   # flush the trailing word; without this the last word of a line is dropped
        yield list4src[start:]

def wordcount(filename, method, directory='./', encoding='utf-8') -> defaultdict:
    d = defaultdict(int)
    drop_words = {'a', 'is', 'an', 'this', 'that', 'the'}
    try:
        f = Path(directory, filename).open(encoding=encoding)
    except OSError:
        print(f"Error: could not open {filename!r} in directory {directory!r}")
    else:
        with f:   # open once and reuse the handle instead of opening the file twice
            for line in f:
                for i in filter(lambda key: key.lower() not in drop_words, method(line)):
                    d[i.lower()] += 1
    return d

def topn(src2dict, key, n, reverse=True):
    """Yield the n largest items of an iterable according to `key`."""
    yield from sorted(src2dict, key=key, reverse=reverse)[:n]

d = wordcount('simple.txt', method=_make_key)
print(*topn(d.items(), lambda x: x[1], 10))
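A quick sanity check of the tokenizer on a made-up line (the sample text is purely illustrative):

sample = "The path, the PATH - a path!\n"
print(list(_make_key(sample)))
# -> ['The', 'path', 'the', 'PATH', 'a', 'path']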