-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathcleanText.py
50 lines (42 loc) · 1.98 KB
/
cleanText.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/usr/bin/python3
import sys
import indic_transliteration.sanscript as sanscript
import re
def replaceText():
    """Rewrite the files named on the command line with a fixed substitution table.

    Every path in ``sys.argv[1:]`` is read as UTF-8, each ``(old, new)`` pair
    of the table is applied to every line in table order (later pairs see the
    output of earlier ones), and the file is written back only when its
    content actually changed.  A progress message is written to stderr for
    every file processed.
    """
    # NOTE(review): several search strings below are empty as seen here --
    # presumably private-use code points that did not survive copying;
    # confirm against the original file before relying on this table.
    repl_list = [
        # ('र्द्ध्', 'र्ध्'), ('र्द्ध', 'र्ध'),
        ('त्स्', 'थ्स्'), ('प्स्', 'फ्स्'),
        ('त्स', 'थ्स'), ('प्स', 'फ्स'),
        ('', '᳚'),
        ('॑', 'ः॑'), ('॒', 'ः॒'), ('᳚', 'ः᳚'),
        ('', 'ꣳ'), ('', 'ꣳ॑'), ('', 'ꣳ᳚'), ('', 'ꣳ॒'),
        ('', 'ꣴ'), ('', 'ꣴ॒'), ('', 'ꣴ॑'), ('', 'ꣴ᳚'),
    ]
    # str.replace('', new) inserts `new` between every character, which would
    # wreck the file, so pairs with an empty search string are never applied.
    active_pairs = [(old, new) for old, new in repl_list if old]

    for path in sys.argv[1:]:
        sys.stderr.write('Updating %s\n' % (path))
        # The data is Devanagari/Vedic text; do not depend on the locale's
        # default encoding.
        with open(path, 'r', encoding='utf-8') as in_f:
            original_lines = in_f.readlines()

        updated_lines = []
        for line in original_lines:
            for old, new in active_pairs:
                line = line.replace(old, new)
            updated_lines.append(line)

        # Leave untouched files alone so they are not needlessly rewritten.
        if updated_lines != original_lines:
            with open(path, 'w', encoding='utf-8') as out_f:
                out_f.writelines(updated_lines)
def main():
    """Report words whose ITRANS form has k/g/c/j/t/d/p/b directly before ``s``.

    Each file named in ``sys.argv[1:]`` is read as UTF-8 and every line is
    split on single spaces.  Each piece is transliterated from Devanagari to
    ITRANS; pieces that the transliteration leaves unchanged are skipped.
    When the ITRANS form contains one of the letters ``k g c j t d p b``
    immediately followed by ``s``, a record of the form
    ``<file>:<line number>:: <ITRANS word>`` is printed to stdout.
    """
    # Compiled once; a match here is exactly the case where inserting an
    # ``h`` between the consonant and the ``s`` would alter the word.
    consonant_before_s = re.compile(r'[kgcjtdpb]s')

    for path in sys.argv[1:]:
        # The input is Devanagari text; do not depend on the locale's
        # default encoding.
        with open(path, 'r', encoding='utf-8') as in_f:
            for line_no, line in enumerate(in_f, start=1):
                # Split on a single space only, so the last piece of a line
                # still carries its trailing newline.
                for word in line.split(' '):
                    trans_word = sanscript.transliterate(
                        word, 'devanagari', 'itrans')
                    if trans_word == word:
                        continue
                    if consonant_before_s.search(trans_word):
                        print('%40s:%05d:: %s' % (path, line_no, trans_word))
# Run the checker only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()