split.py

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import sys
import os
import timeit

# default settings
connector = "_"
extension = ".txt"
split_size = 200
maxline = 60000
original_file = "coca60000.txt"
output_path = "output"

# remove duplicates while maintaining the original order
def remove_duplicates(input_file):
    with open(input_file, "r") as fin:
        lines = fin.readlines()
    all_words = [x.strip() for x in lines]
    # keep the first occurrence of each word, compacting the list in place
    seen = {}
    pos = 0
    for word in all_words:
        if word not in seen:
            seen[word] = True
            all_words[pos] = word
            pos += 1
    del all_words[pos:]
    outfile = os.path.splitext(input_file)[0] + "_removed_duplicates" + extension
    with open(outfile, "w") as fout:
        for word in all_words:
            fout.write(word)
            fout.write("\n")

# strip spaces and surrounding symbols from each word
def strip_spaces(input_file):
    outfile = os.path.splitext(input_file)[0] + "_removed_spaces" + extension
    with open(input_file, "r") as fin, open(outfile, "w") as fout:
        for line in fin:
            # drop surrounding whitespace, then surrounding parentheses
            line = line.strip().strip("()")
            fout.write(line)
            fout.write("\n")

# generate a single file, with group headers, that can be used for batch import into Eudic
def batch_import(input_file, num_of_words):
    if len(sys.argv) < 3:
        print("split size is not specified, use default:%d" % split_size)
    else:
        num_of_words = int(sys.argv[2])
    outfile = os.path.splitext(input_file)[0] + "_batch_import" + extension
    with open(input_file, "r") as fin, open(outfile, "w") as fout:
        base, cursor = 0, 0
        for line in fin:
            if base == 0:
                # header for the first group, e.g. "#1_200"
                fout.write("#1_" + str(num_of_words))
                fout.write("\n")
                base = num_of_words
            if cursor > num_of_words - 1:
                # current group is full: write the header for the next group
                start, end = str(base + 1), str(base + cursor)
                title = "#" + start + connector + end
                fout.write(title)
                fout.write("\n")
                base = base + cursor
                cursor = 0
            fout.write(line.strip())
            fout.write("\n")
            cursor += 1

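# A sketch of what the batch-import file produced above looks like with
# num_of_words = 2 (the words themselves are only illustrative):
#   #1_2
#   abandon
#   ability
#   #3_4
#   able
#   ...
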
# strip the Chinese meanings, output only the English words
def strip_meanings(input_file):
    outfile = os.path.splitext(input_file)[0] + "_no_meaning" + extension
    with open(input_file, "r") as fin, open(outfile, "w") as fout:
        # the word is everything before the first space on each line
        words = [line.strip().split(" ")[0] for line in fin]
        # join with newlines so the file does not end with an empty line
        # (text-mode files do not support relative seek/truncate in Python 3)
        fout.write("\n".join(words))

# split the words into groups of the specified size, one file per group
def splitwords(input_file, num_of_words):
    if len(sys.argv) < 3:
        print("split size is not specified, use default:%d" % split_size)
    else:
        num_of_words = int(sys.argv[2])
    wordlist = []
    with open(input_file, "r") as fin:
        base, cursor = 0, 0
        for line in fin:
            # if base > maxline:
            #     sys.exit(0)
            if cursor > num_of_words - 1:
                # 1. assemble the output filename, e.g. output/1_200.txt
                start, end = str(base + 1), str(base + cursor)
                outfile = os.path.join(output_path, start + connector + end + extension)
                # 2. write the current group; joining with newlines avoids a
                #    trailing empty line (text-mode files cannot seek backwards)
                with open(outfile, "w") as fout:
                    fout.write("\n".join(wordlist))
                # 3. update base, reset cursor and wordlist
                base = base + cursor
                cursor = 0
                wordlist = []
            # collect the word and advance the cursor
            wordlist.append(line.strip())
            cursor += 1
    # write the remaining, possibly partial, group
    if cursor > 0:
        start, end = str(base + 1), str(base + cursor)
        outfile = os.path.join(output_path, start + connector + end + extension)
        with open(outfile, "w") as fout:
            fout.write("\n".join(wordlist))

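# splitwords() writes one plain file per group under output_path, named after
# the word range it covers, e.g. output/1_200.txt, output/201_400.txt, ...
# when the default split_size of 200 is used.
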
###########################################################################
# usage:
#   ./split.py filename 200
def main(argv):
    global output_path
    input_file = original_file
    if len(argv) < 2:
        print("must specify filename")
        sys.exit(1)
    else:
        input_file = argv[1]
    # create the output directory if it does not exist
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    start = timeit.default_timer()
    # splitwords(input_file, split_size)
    batch_import(input_file, split_size)
    # remove_duplicates(input_file)
    # strip_spaces(input_file)
    # strip_meanings(input_file)
    stop = timeit.default_timer()
    print('Time elapsed: {}'.format(stop - start))


if __name__ == "__main__":
    main(sys.argv)

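# Example invocations (wordlist.txt is a hypothetical input file with one
# word per line, optionally followed by its meaning):
#   ./split.py wordlist.txt 100   # group size 100 -> wordlist_batch_import.txt
#   ./split.py wordlist.txt       # falls back to the default split_size of 200
# Uncomment splitwords()/remove_duplicates()/strip_spaces()/strip_meanings()
# in main() to run the other transformations instead of batch_import().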