-
Notifications
You must be signed in to change notification settings - Fork 0
/
DataExtraction.py
41 lines (32 loc) · 1.06 KB
/
DataExtraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#-*- coding: utf-8 -*-
import re
import locale
import os
def parseFile(fileName):
    """Return every word found in a UTF-8 text file.

    A "word" is a maximal run of characters matched by the regex class
    ``\\w`` (letters, digits and underscore, Unicode-aware).

    Args:
        fileName: Path of the file to read; it is decoded as UTF-8.

    Returns:
        A list of the words in the order they appear in the file. The
        list is empty when the file contains no word characters.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The ``with`` statement closes the file; no explicit close() is needed.
    with open(fileName, 'r', encoding='utf-8') as f:
        text = f.read()
    # Raw string: ``\w`` in a normal literal is an invalid escape sequence.
    return re.findall(r'\w+', text, re.UNICODE)
def parseTaggedFile(fileName):
    """Return the words of a UTF-8 text file that directly precede a newline.

    Only a run of ``\\w`` characters immediately followed by ``\\n`` is
    captured, so for each line this yields its trailing word (if the line
    ends with one). A final word that is not followed by a newline is
    not captured.

    Args:
        fileName: Path of the file to read; it is decoded as UTF-8.

    Returns:
        A list of the captured words (without the newline) in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The ``with`` statement closes the file; no explicit close() is needed.
    with open(fileName, 'r', encoding='utf-8') as f:
        text = f.read()
    # Raw string: ``\w`` in a normal literal is an invalid escape sequence.
    return re.findall(r'(\w+)\n', text, re.UNICODE)
def countWords(pathToFiles, uselessWordsFileName, tagged):
    """Count word occurrences in each entry of a directory.

    Every entry returned by ``os.listdir(pathToFiles)`` is parsed with
    ``parseTaggedFile`` when ``tagged`` is truthy, otherwise with
    ``parseFile``. Words that also appear in the file named by
    ``uselessWordsFileName`` are ignored.

    Args:
        pathToFiles: Directory whose entries are parsed. A trailing path
            separator is accepted but not required.
        uselessWordsFileName: Path of a file, read with ``parseFile``,
            listing the words to exclude from the counts.
        tagged: Selects the parser used for the directory entries.

    Returns:
        A list with one dict per directory entry, in ``os.listdir`` order,
        mapping each kept word to its number of occurrences.
    """
    listDico = []
    filesList = os.listdir(pathToFiles)
    # A set makes each "is this word excluded?" test O(1) instead of a scan.
    uselessWords = set(parseFile(uselessWordsFileName))
    # The parser does not change between files, so choose it once.
    parse = parseTaggedFile if tagged else parseFile
    for fileName in filesList:
        wordCounter = {}
        # os.path.join inserts the separator when pathToFiles lacks one.
        for word in parse(os.path.join(pathToFiles, fileName)):
            if word not in uselessWords:
                wordCounter[word] = wordCounter.get(word, 0) + 1
        listDico.append(wordCounter)
    return listDico