-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpreprocess.py
80 lines (65 loc) · 2.41 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import os
import json
import glob
import unicodedata
from collections import deque
import lzma
import aiohttp
import asyncio
from tqdm import tqdm
def extract_text(raw):
    """Build a cleaned text string from a record's title and abstract.

    The values stored under ``"title"`` and ``"abstract"`` are used when they
    are truthy, in that order, and joined with ``". "``.  Every character
    whose Unicode general category starts with ``S`` (symbols), ``M``
    (marks) or ``C`` (control/format/other) is then dropped.

    Args:
        raw: Mapping offering ``.get()``; both keys are optional.

    Returns:
        The filtered string, ``""`` when neither field is present.
    """
    parts = [raw.get(field) for field in ("title", "abstract")
             if raw.get(field)]
    joined = ". ".join(parts)

    kept = []
    for char in joined:
        # Only the major class (first letter of the category) matters here.
        major_class = unicodedata.category(char)[0]
        if major_class in ('S', 'M', 'C'):
            continue
        kept.append(char)
    return ''.join(kept)
async def lang_detect(dictToSend):
    """POST ``dictToSend`` as JSON to the local ``batch_lang_detect`` endpoint.

    A fresh ``aiohttp.ClientSession`` is opened for the request and closed
    again before returning.

    Args:
        dictToSend: JSON-serialisable request body.

    Returns:
        The response body decoded as JSON.
    """
    endpoint = 'http://localhost:5002/batch_lang_detect'
    async with aiohttp.ClientSession() as http:
        async with http.post(endpoint, json=dictToSend) as reply:
            payload = await reply.json()
    return payload
async def tokenize(dictToSend):
    """POST ``dictToSend`` as JSON to the local ``batch_tokenize`` endpoint.

    A fresh ``aiohttp.ClientSession`` is opened for the request and closed
    again before returning.

    Args:
        dictToSend: JSON-serialisable request body.

    Returns:
        The response body decoded as JSON.
    """
    endpoint = 'http://localhost:5002/batch_tokenize'
    async with aiohttp.ClientSession() as http:
        async with http.post(endpoint, json=dictToSend) as reply:
            payload = await reply.json()
    return payload
def batch_preprocess(docs, batch_size=25000):
    """Feed ``docs`` to ``preprocess`` in bounded batches, emptying ``docs``.

    Documents are removed from the END of ``docs`` one at a time, so each
    batch holds the tail of the remaining documents in reverse order and
    ``docs`` is empty when the function returns.  ``preprocess`` is only
    called with non-empty batches, so an empty input (or a document count
    that is an exact multiple of ``batch_size``) no longer causes a call
    with an empty list.

    Args:
        docs: Sequence supporting ``len()`` and ``pop()``; it is consumed.
        batch_size: Maximum number of documents handed to ``preprocess``
            per call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    while len(docs) > 0:
        # Popping (rather than slicing a copy) releases each document from
        # the caller's list as soon as it has been moved into the batch.
        batch = [docs.pop() for _ in range(min(batch_size, len(docs)))]
        preprocess(batch)
def preprocess(docs):
    """Filter ``docs`` by language, tokenize them and append them to disk.

    Steps:
      1. Send the documents to ``lang_detect`` and keep those whose detected
         language equals the module-level ``lang``.
      2. Send the remaining documents to ``tokenize``.
      3. Join each document's tokens with single spaces, lower-case the
         result and append one document per line to
         ``<interimpath>/documents_<lang>``.

    Nothing is written when no document matches the language or when the
    tokenizer reply carries no ``"tokens"`` key.

    Args:
        docs: List of document strings.  An empty list returns immediately
            without contacting the service.
    """
    if not docs:
        # Nothing to classify; skip the round trip to the language service.
        return

    # asyncio.run creates and closes its own event loop per call, replacing
    # the deprecated asyncio.get_event_loop() + run_until_complete pairing.
    detection = asyncio.run(lang_detect({'lang': lang, 'docs': docs}))

    # Compare with the configured language rather than a hard-coded 'en' so
    # the filter agrees with the request payload and the output file name.
    # Assumes the service reports languages using the same codes as ``lang``.
    kept = [doc
            for doc, detected in zip(docs, detection["detected_langs"])
            if detected == lang]
    if not kept:
        return

    tokenized = asyncio.run(tokenize({'lang': lang, 'docs': kept}))
    if "tokens" not in tokenized:
        # Best effort: a reply without tokens drops this batch silently.
        return

    lines = [" ".join(tokens).lower() for tokens in tokenized["tokens"]]
    # Explicit UTF-8: the documents can contain non-ASCII letters and the
    # platform default encoding is not guaranteed to represent them.
    out_path = os.path.join(interimpath, "documents_" + lang)
    with open(out_path, "a", encoding="utf-8") as outfile:
        outfile.write("\n".join(lines) + "\n")
# Language code sent to the local NLP service and used as the suffix of the
# output file name.
lang = "en"
# Directory scanned for xz-compressed input collections.
rawpath = "/home/chris/data/CORE/fulltext/"
# Directory that receives the "documents_<lang>" output file.
interimpath = "/home/chris/data/CORE/interim/"
modelpath = "models"  # not referenced by the code visible in this file

rawfiles = glob.glob(rawpath+"*.json.xz")

# done.txt lists the collections finished by earlier runs, one path per line.
# It does not exist before the first run, which simply means nothing is done.
try:
    with open("done.txt", "r") as infile:
        done = set(infile.read().splitlines())
except FileNotFoundError:
    done = set()
rawfiles = [r for r in rawfiles if r not in done]

for r in tqdm(rawfiles, desc="collections", mininterval=10, maxinterval=120):
    # Every non-blank line of the decompressed file is parsed as one JSON
    # record; blank lines are skipped instead of crashing json.loads.
    with lzma.open(r) as infile:
        docs = [extract_text(json.loads(l.decode('utf-8')))
                for l in infile if l.strip()]
    batch_preprocess(docs)
    # Mark the collection as done only after it has been processed, so an
    # interrupted run picks it up again next time.
    with open("done.txt", "a") as outfile:
        outfile.write(r+"\n")