Skip to content

Latest commit

 

History

History
176 lines (132 loc) · 4.39 KB

单词统计.md

File metadata and controls

176 lines (132 loc) · 4.39 KB

单词统计

有一个文件,对其进行单词统计(不区分大小写),并显示出现次数最多的十个单词。

from  pathlib import Path
from collections import defaultdict

# First pass: split each line on whitespace and tally every token as-is.
_dict = defaultdict(int)
file = Path.cwd() / '_path'
with file.open(encoding='utf-8') as handle:
    for text in handle:
        for token in text.split():
            _dict[token] = _dict[token] + 1

# Rank the (token, count) pairs from most to least frequent.
order = list(_dict.items())
order.sort(key=lambda pair: pair[1], reverse=True)
print(order)

继续优化

from  pathlib import Path
from collections import defaultdict

# Word -> occurrence count; a word that has not been seen yet starts at zero.
_dict = defaultdict(int)
# Input file: the entry named '_path' inside the current working directory.
file = Path.cwd() / '_path'

def make_key(files, chars=set(r'''!~@#$%^&*(){}[]:'"/\.,''')):
    """Split a piece of text into lower-cased words.

    Every character of ``files`` is lower-cased on its own; any character
    found in ``chars`` is replaced by a space, and the resulting text is
    split on whitespace.

    Args:
        files: The text to tokenize (the caller passes one line at a time).
        chars: Characters that act as word separators.

    Returns:
        A list of the words, in order of appearance.
    """
    lowered = (ch.lower() for ch in files)
    blanked = ''.join(' ' if ch in chars else ch for ch in lowered)
    return blanked.split()


# Tally every normalised word that make_key produces, one line at a time.
with file.open(encoding='utf-8') as handle:
    for text in handle:
        for word in make_key(text):
            _dict[word] = _dict[word] + 1

# Rank the (word, count) pairs from most to least frequent.
order = list(_dict.items())
order.sort(key=lambda pair: pair[1], reverse=True)
print(order)

继续优化,用窗口切片

from  pathlib import Path
from collections import defaultdict

# Word -> occurrence count; a word that has not been seen yet starts at zero.
_dict = defaultdict(int)
# Input file: the entry named '_path' inside the current working directory.
file = Path.cwd() / '_path'

def _make_key(files, chars=set(r'''! ~@#$%^&*(){}[]:'"/\.,''')):
    """Split a line into lower-cased words by slicing between separators.

    The text is scanned once. ``start`` marks where the current word begins,
    and each time a separator is reached the slice ``files[start:i]`` is
    emitted (if it is not empty).

    A character is a separator when its lower-cased form is in ``chars`` or
    when it is whitespace. The whitespace rule matters because the default
    ``chars`` contains a space but no newline, so without it the last word of
    every line read from a file would keep its trailing line break.

    Args:
        files: The text to tokenize (the caller passes one line at a time).
        chars: Characters that act as word separators.

    Returns:
        A list of lower-cased words, in order of appearance.
    """
    start, ret = 0, []
    for i, ch in enumerate(files):
        if ch.lower() in chars or ch.isspace():
            # Only emit when at least one non-separator character was seen.
            if start < i:
                # Lower-case the emitted word itself so counting ignores case.
                ret.append(files[start:i].lower())
            start = i + 1
    # Flush the word that runs up to the end of the text, if any.
    if start < len(files):
        ret.append(files[start:].lower())
    return ret


# Tally every word that _make_key produces, one line at a time.
with file.open(encoding='utf-8') as handle:
    for text in handle:
        for word in _make_key(text):
            _dict[word] = _dict[word] + 1

# Rank the (word, count) pairs from most to least frequent.
order = list(_dict.items())
order.sort(key=lambda pair: pair[1], reverse=True)
print(order)

继续优化,使用yield

from pathlib import Path
from collections import defaultdict

# Word -> occurrence count; a word that has not been seen yet starts at zero.
_dict = defaultdict(int)
# Input file: the entry named '_path' inside the current working directory.
file = Path.cwd() / '_path'

def _make_key(files, chars=set('''! ~@#$%^&*(){}[]:'"/\n\r.,><''')):
    """Lazily yield the words of ``files``, cut at separator characters.

    The text is scanned once; ``begin`` marks where the current word starts.
    Words are yielded exactly as they appear in the text (no case folding).

    Args:
        files: The text to tokenize (the caller passes one line at a time).
        chars: Characters that act as word separators.

    Yields:
        Each non-empty run of characters that contains no separator.
    """
    total = len(files)
    begin = 0
    for pos, ch in enumerate(files):
        if ch not in chars:
            continue
        # A separator ends the current word; skip empty runs between
        # consecutive separators.
        if begin < pos:
            yield files[begin:pos]
        begin = pos + 1
    # Flush the word that runs up to the end of the text, if any.
    if begin < total:
        yield files[begin:]


# Tally the lower-cased form of every word that _make_key yields.
with file.open(encoding='utf-8') as handle:
    for text in handle:
        for word in _make_key(text):
            _dict[word.lower()] += 1

# Keep only the ten most frequent (word, count) pairs.
ranked = list(_dict.items())
ranked.sort(key=lambda pair: pair[1], reverse=True)
order = ranked[:10]
print(order)

# 结果如下
[('path', 212), ('the', 170), ('a', 117), ('p', 93), ('is', 88), ('to', 74), ('py', 68), ('if', 67), ('file', 61), ('pureposixpath', 59)]

继续优化,停用词

# Stop words: very common words that are left out of the tally.
drop_words = {'a', 'is', 'an', 'this', 'that', 'the'}
with file.open(encoding='utf-8') as handle:
    for text in handle:
        for word in _make_key(text):
            lowered = word.lower()
            if lowered in drop_words:
                continue
            _dict[lowered] += 1

# Keep only the ten most frequent (word, count) pairs.
ranked = list(_dict.items())
ranked.sort(key=lambda pair: pair[1], reverse=True)
order = ranked[:10]
print(order)

封装函数

from pathlib import Path
from collections import defaultdict


def _make_key(list4src: str, chars=set(r'''!~@#$%^&*(){}[]:-'"/\., ‘“''')):
    """Lazily yield the words of a line, cut at separator characters.

    The text is scanned once; ``start`` marks where the current word begins.
    A character is a separator when it is in ``chars`` or when it is
    whitespace. The whitespace rule is needed because the default ``chars``
    contains a space but no line break.

    Words are yielded exactly as they appear in the text (no case folding).

    Args:
        list4src: The text to tokenize (the caller passes one line at a time).
        chars: Characters that act as word separators.

    Yields:
        Each non-empty run of characters that contains no separator.
    """
    start = 0
    for index, char in enumerate(list4src):
        if char in chars or char.isspace():
            # Skip empty runs between consecutive separators.
            if start != index:
                yield list4src[start:index]
            start = index + 1
    # Flush the word that runs up to the end of the text. Without this, a
    # word not followed by a separator would be lost.
    if start < len(list4src):
        yield list4src[start:]


def wordcount(filename, method, directory='./', encoding='utf-8') -> defaultdict:
    """Count how often each word occurs in a text file, ignoring case.

    The file is read line by line. Each line is handed to ``method``, and
    every word it produces is lower-cased and counted unless it is one of the
    built-in stop words.

    If the file cannot be opened, an error message is printed and an empty
    mapping is returned instead of raising.

    Args:
        filename: Name of the file to read.
        method: Callable that takes one line of text and returns an iterable
            of words.
        directory: Directory that contains ``filename``.
        encoding: Text encoding used to read the file.

    Returns:
        A ``defaultdict`` mapping each lower-cased word to its count; missing
        words read as ``0``.
    """
    d = defaultdict(int)
    drop_words = {'a', 'is', 'an', 'this', 'that', 'the'}

    # Open the file exactly once. Only the open itself is guarded, so errors
    # raised while reading or tokenizing still propagate to the caller.
    try:
        f = Path(f"{directory}/{filename}").open(encoding=encoding)
    except (OSError, LookupError, ValueError):
        # OSError: missing/unreadable file; LookupError: unknown encoding;
        # ValueError: a path the OS layer rejects (e.g. embedded NUL).
        print("Error, the given filename:{} or directory:{} were somethings wrong!".format(filename, directory))
        return d

    with f:
        for line in f:
            for word in method(line):
                key = word.lower()  # fold case once, reuse for filter and count
                if key not in drop_words:
                    d[key] += 1
    return d


def topn(src2dict: set, key, n, reverse=True):
    """Yield the first ``n`` items of ``src2dict`` after ordering by ``key``.

    Args:
        src2dict: The items to rank. Any iterable works, since the items are
            copied into a list before sorting.
        key: One-argument callable that returns the sort key of an item.
        n: Number of leading items to yield after sorting.
        reverse: When true (the default), items are ordered from the largest
            key to the smallest.

    Yields:
        The leading items of the sorted sequence, one at a time.
    """
    ranked = list(src2dict)
    ranked.sort(key=key, reverse=reverse)
    for item in ranked[:n]:
        yield item


# Count the words of simple.txt, then print the ten most frequent
# (word, count) pairs.
d = wordcount(filename='simple.txt', method=_make_key)
top_ten = topn(d.items(), key=lambda pair: pair[1], n=10)
print(*top_ten)