forked from drgulevich/gtexfix
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfrom.py
executable file
·99 lines (89 loc) · 2.87 KB
/
from.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/usr/bin/env python
#-----------------------------------------
# Google translate fix for LaTeX documents
# Copyright (c) Dmitry R. Gulevich 2020
# GNU General Public License v3.0
#-----------------------------------------
import re
import sys
import pickle
import argparse
### Tokens look like "[1.12]" or "[ 2,3]": a type digit, '.' or ',', then an index.
### Translation may insert spaces after '[' or turn '.' into ',', hence the loose match.
TOKEN_RE = re.compile(r'\[ *([012])[.,]([0-9]+)\]')
### Comment placeholders look like "___GTEXFIXCOMMENT7___" (digits may be missing if mangled).
COMMENT_RE = re.compile(r'___GTEXFIXCOMMENT([0-9]*)___')


def fix_spacing(text):
    """Return *text* with translation artefacts cleaned up.

    Zero-width spaces (U+200B) become ordinary spaces, a space after a
    backslash is removed, ' ~ ' is collapsed to '~' and a space before '{'
    is removed. The replacements are applied in that order.
    """
    text = text.replace('\u200B', ' ')
    text = text.replace('\\ ', '\\')
    text = text.replace(' ~ ', '~')
    text = text.replace(' {', '{')
    return text


def restore_tokens(text, latex, commands):
    """Replace "[1.n]" tokens with latex[n] and "[2.n]" tokens with commands[n].

    Tokens of each type are expected in increasing order starting from 0.
    Skipped indices are recorded as corrupted. A token whose index is lower
    than expected, or has no stored entry, is reported on stdout and stops
    the restoration; the remaining text is then kept unchanged.

    Returns a tuple (restored_text, corrupted) where corrupted is a list of
    strings such as '[1.4]' naming the tokens that were expected but skipped.
    """
    stores = {1: latex, 2: commands}
    expected = {1: 0, 2: 0}
    pieces = []
    corrupted = []
    here = 0
    for m in TOKEN_RE.finditer(text):
        t = int(m.group(1))
        n = int(m.group(2))
        store = stores.get(t)
        if store is None:
            # Type 0 has nothing stored for it. Leave the token and the text
            # before it in place instead of advancing past them and losing both.
            continue
        if n < expected[t] or n >= len(store):
            print('Token ',m.group(),'found in place of [%d.%d]. Edit manually and run again.'%(t,expected[t]))
            break
        # Every index skipped between the expected one and n went missing.
        corrupted.extend('[%d.%d]'%(t,k) for k in range(expected[t], n))
        pieces.append(text[here:m.start()])
        pieces.append(store[n])
        expected[t] = n + 1
        here = m.end()
    pieces.append(text[here:])
    return ''.join(pieces), corrupted


def restore_comments(text, comments):
    """Replace "___GTEXFIXCOMMENTn___" placeholders with comments[n].

    Placeholders must appear as 0, 1, 2, ... in order. The first placeholder
    that has no number, a wrong number, or no stored comment is reported on
    stdout and stops the restoration; the remaining text is kept unchanged.
    """
    pieces = []
    here = 0
    expected = 0
    for m in COMMENT_RE.finditer(text):
        digits = m.group(1)
        if not digits or int(digits) != expected or expected >= len(comments):
            print('Comment token ',m.group(),'is broken. Stopping.')
            break
        pieces.append(text[here:m.start()])
        pieces.append(comments[expected])
        expected += 1
        here = m.end()
    pieces.append(text[here:])
    return ''.join(pieces)


def load_pickle(path):
    """Return the object unpickled from the file at *path*."""
    # NOTE(review): pickle.load can execute arbitrary code from a crafted file.
    # These files are presumably written by the companion extraction step;
    # only run this script in a directory whose gtexfix_* files are trusted.
    with open(path, 'rb') as fp:
        return pickle.load(fp)


def main(argv=None):
    """Restore LaTeX in a translated .txt file and write the matching .tex file.

    argv is the argument list to parse (defaults to sys.argv[1:]). Exits via
    sys.exit with a message when the input file name does not end in '.txt'.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('filename')
    args = parser.parse_args(argv)

    # Literal suffix test: the old pattern '.txt$' also accepted e.g. 'notes_txt'.
    if not args.filename.endswith('.txt'):
        sys.exit('The input should be .txt file. Exit.')

    print('Input file:',args.filename)

    ### Load LaTeX data from binary files
    with open(args.filename, 'r') as fin:
        source = fin.read()
    comments = load_pickle('gtexfix_comments')
    commands = load_pickle('gtexfix_commands')
    latex = load_pickle('gtexfix_latex')

    ### Replace weird characters introduced by translation and fix spacing
    trtext = fix_spacing(source)

    ### Restore LaTeX and formulas
    trtext, corrupted = restore_tokens(trtext, latex, commands)

    ### Restore comments
    trtext = restore_comments(trtext, comments)

    ### Save the processed output to .tex file
    output_filename = args.filename[:-len('.txt')] + '.tex'
    with open(output_filename, 'w') as translation_file:
        translation_file.write(trtext)
    print('Output file:',output_filename)

    ### Report the corrupted tokens
    if not corrupted:
        print('No corrupted tokens. The translation is ready.')
    else:
        print('Corrupted tokens detected:',end=' ')
        for c in corrupted:
            print(c,end=' ')
        print()
        print('To improve the output manually change the corrupted tokens in file',args.filename,'and run from.py again.')


if __name__ == '__main__':
    main()