-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataloader.py
95 lines (84 loc) · 3.15 KB
/
dataloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# -*- coding: utf-8 -*-
# PyWordPredictor - XML-RPC word prediction server
# Copyright (C) 2010 - Etienne Membrives
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import nltk, re, locale
import worddatabase as wdb
class DataImporter(object):
    """Import text files into the word database.

    Every sentence is stored as individual ``wdb.Word`` rows (with an
    occurrence counter) and as ``wdb.Quadruplet`` rows covering each run of
    four consecutive words, padded with an empty-string word at the start
    and end of the sentence.

    Constructing an instance opens ``data.sql`` through ``wdb.Database``,
    clears it and immediately imports every file in *files*.
    """

    def __init__(self, files):
        """Open and clear the database, then import all of *files*.

        :param files: iterable of paths of the text files to import.
        """
        self.files = files
        self.database = wdb.Database("data.sql")
        self.database.clear()
        self._prepare_text()

    def _prepare_text(self):
        """Read each file, split it into paragraphs and import them.

        Paragraphs are separated by blank lines; line breaks inside a
        paragraph are turned into spaces. A paragraph is kept only if it
        starts with ``--`` or contains more than one space.
        """
        for path in self.files:
            # ``with`` guarantees the handle is closed even if read() fails.
            with open(path, 'r') as handle:
                raw = handle.read()
            paragraphs = [chunk.replace('\n', ' ')
                          for chunk in raw.split('\n\n')]
            kept = [par for par in paragraphs
                    if par.startswith('--') or par.count(' ') > 1]
            self.import_text(kept)

    def _count_word(self, data):
        """Return the ``wdb.Word`` for *data*, counting one more occurrence.

        The word is created with ``occurences=1`` when no row matches;
        otherwise the first matching row has its counter incremented.
        """
        matches = wdb.Word.select(data=data)
        # Keep the explicit len() test: the type returned by select() is
        # defined in worddatabase and its truthiness is not known here.
        if len(matches) == 0:
            return wdb.Word(data=data, occurences=1)
        word = matches[0]
        word.set(occurences=word.occurences + 1)
        return word

    def import_words_to_database(self, sentence):
        """Record the words of *sentence* and every 4-word window over it.

        :param sentence: sequence of token strings. Sentences with fewer
            than two tokens are ignored.
        :returns: ``None``.
        """
        if len(sentence) < 2:
            return
        # The empty-string word marks both the start and the end of the
        # sentence; its counter is incremented once per sentence.
        boundary = self._count_word("")
        window = [boundary]
        for token in sentence:
            # Flush a full window before growing it, so it never exceeds
            # four entries.
            window = self._addtodb(window)
            window.append(self._count_word(token))
        window = self._addtodb(window)
        window.append(boundary)
        self._addtodb(window)
        return

    def _addtodb(self, window):
        """Store *window* as a quadruplet once it holds four words.

        :param window: list of at most four words.
        :returns: *window* itself while it holds fewer than four words;
            otherwise the window without its first word, after the matching
            ``wdb.Quadruplet`` was created or had its counter incremented.
        :raises IndexError: if *window* holds more than four words.
        """
        if len(window) < 4:
            return window
        if len(window) > 4:
            raise IndexError(str(window) + " has more than 4 elements")
        matches = wdb.Quadruplet.select(w0=window[0], w1=window[1],
                                        w2=window[2], w3=window[3])
        if len(matches) == 0:
            wdb.Quadruplet(w0=window[0], w1=window[1],
                           w2=window[2], w3=window[3], occurences=1)
        else:
            quad = matches[0]
            quad.set(occurences=quad.occurences + 1)
        return window[1:]

    def import_text(self, text):
        """Split paragraphs into sentences and import each sentence.

        :param text: iterable of paragraph strings.
        """
        sent_tokenizer = nltk.data.load('tokenizers/punkt/french.pickle')
        # Explicit loops instead of map(): the import is a side effect, and
        # a lazy map() (Python 3) would never perform it.
        for paragraph in text:
            for sent in sent_tokenizer.tokenize(paragraph):
                self.import_words_to_database(self.tokenize(sent))

    def tokenize(self, sent):
        """Split the sentence *sent* into a list of word tokens.

        A space is inserted after each of the characters ``'``, ``"``,
        ``«``, ``»`` and ``:`` so that they end a token, every period is
        removed, and the result is split on single spaces.

        :returns: list of token strings.
        """
        prepared = sent
        for char in "'\"«»:":
            prepared = prepared.replace(char, char + " ")
        # str.replace returns a new string; the result must be kept for the
        # periods to actually be removed.
        prepared = prepared.replace(".", "")
        # NOTE(review): consecutive spaces produce empty-string tokens, which
        # share data="" with the sentence-boundary word used in
        # import_words_to_database -- confirm whether that is intended.
        return prepared.split(" ")