-
Notifications
You must be signed in to change notification settings - Fork 4
/
convert_to_json.py
executable file
·172 lines (123 loc) · 6.29 KB
/
convert_to_json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/usr/bin/env python3
import json
import re
def all_columns_to_json(column_dict, columns_line):
    '''Convert every value of columns_line into a named dict entry.

    A value whose position is a key of column_dict is stored under the
    name found there; any other value is stored under its position
    converted to a string.
    '''
    return {
        column_dict.get(position, str(position)): value
        for position, value in enumerate(columns_line)
    }
def filter_columns_in_dict_to_json(column_dict, columns_line):
    '''Keep only the columns listed in column_dict and name them.

    column_dict maps a position in columns_line to the name the value
    at that position gets in the returned dict. Positions that are not
    keys of column_dict are dropped.

    Raises ValueError if columns_line has no element at one of the
    requested positions.
    '''
    selected = {}
    for position in column_dict:
        name = column_dict[position]
        try:
            value = columns_line[position]
        except IndexError as missing:
            # The line is too short to contain the requested position.
            message = 'Invalid table line ({}) : no {} element.'.format(
                columns_line, position)
            raise ValueError(message) from missing
        selected[name] = value
    return selected
def columns_line_to_json(column_dict, columns_line, should_filter_colunms):
    '''Turn a list of column values into a dict keyed by column names.

    When should_filter_colunms is true only the columns listed in
    column_dict are kept; otherwise every column is kept.
    '''
    # Choose the converter first, then apply it to the line.
    if should_filter_colunms:
        converter = filter_columns_in_dict_to_json
    else:
        converter = all_columns_to_json
    return converter(column_dict, columns_line)
def regex_from_delims_list(delims_list):
    '''Get a compiled regex pattern that matches any delimiter in a list.

    Every element of delims_list has the form ``delim[:min[-max]]``:
    ``delim`` is the literal delimiter text, ``min`` and ``max`` bound
    the number of consecutive repetitions that count as one separator.
    Without a count the delimiter must repeat at least once, with no
    upper bound.

    Raises ValueError if an element contains more than one ':', has an
    empty delimiter, or has a count that is not an integer.
    '''
    alternatives = []
    for delim in delims_list:
        delim_and_maybe_min_max = delim.split(':')
        if len(delim_and_maybe_min_max) > 2:
            raise ValueError("Invalid ':' count in the delimiter argument")
        literal = delim_and_maybe_min_max[0]
        if not literal:
            # An empty delimiter would leave the quantifier with nothing
            # to repeat; report it clearly instead.
            raise ValueError('Empty delimiter in the delimiter argument')
        # Group the escaped delimiter so the repetition count applies to
        # the whole delimiter and not only to its last character.
        current_pattern = '(?:%s)' % re.escape(literal)
        if len(delim_and_maybe_min_max) == 1:
            # No count was given: one or more repetitions.
            current_pattern += '{1,}'
        else:
            min_and_maybe_max = delim_and_maybe_min_max[1].split('-')
            # Add the count to the regex (only min, or min and max).
            if len(min_and_maybe_max) == 2:
                current_pattern += '{%d,%d}' % (int(min_and_maybe_max[0]),
                                                int(min_and_maybe_max[1]))
            else:
                current_pattern += '{%d,}' % int(min_and_maybe_max[0])
        alternatives.append(current_pattern)
    # The delimiters are alternatives of one pattern, tried in list order.
    return re.compile('|'.join(alternatives))
def main(args):
    '''Convert the delimited lines of the input files to a JSON list.

    args is expected to provide:
      columns_and_names -- comma separated column definitions, each
                           either ``index``, ``index=name`` or ``*``.
      delim             -- list of delimiter definitions, as accepted
                           by regex_from_delims_list.
      infiles           -- iterable of open files (or any iterables of
                           lines, as str or bytes).

    Every input line becomes one JSON object and the list of all the
    objects is printed to the standard output.

    Raises ValueError for an invalid column definition.
    '''
    column_dict = {}
    # Split the user's argument by a comma, and parse each column
    # separately.
    for column_spec in args.columns_and_names.split(','):
        # Split the name from the column.
        column_and_name = column_spec.split('=')
        if len(column_and_name) > 2:
            raise ValueError("Invalid column: {}".format(str(column_and_name)))
        # If there is no name, set it to the column index.
        if len(column_and_name) == 1:
            column_and_name.append(str(column_and_name[0]))
        # Try to convert the column index if it isn't '*'.
        if column_and_name[0] != '*':
            try:
                column_and_name[0] = int(column_and_name[0])
            except ValueError as err:
                raise ValueError('Invalid column index: {} (not an integer)'.format(column_and_name[0])) from err
        # Add this column definition.
        column_dict[column_and_name[0]] = column_and_name[1]

    # Check if column_dict has the '*' member.
    # If it does, we will print all of the columns (even ones that
    # are not in column_dict).
    should_filter_colunms = ('*' not in column_dict)
    # We have checked it, no need for it now.
    if not should_filter_colunms:
        del column_dict['*']

    # Parse the delim list into a regex pattern.
    strip_regex_pattern = regex_from_delims_list(args.delim)

    json_objects_list = []
    for fd in args.infiles:
        for line in fd:
            # Convert bytes object to string.
            if isinstance(line, bytes):
                line = line.decode('utf-8')
            # Strip the line terminator. Files may be read in binary
            # mode, so a Windows '\r\n' ending is not translated for us
            # and would otherwise leave '\r' on the last column.
            line = line.rstrip('\r\n')
            # Split the line by the delims.
            splitted_line = re.split(strip_regex_pattern, line)
            json_objects_list.append(columns_line_to_json(column_dict, splitted_line, should_filter_colunms))
    print(json.dumps(json_objects_list))
def comma_list(string):
    '''Convert a comma list '1,2,3,4' to a list ['1', '2', '3', '4'].

    A comma separates items unless it is escaped as '\\,'; a literal
    backslash is written as '\\\\'. A backslash followed by any other
    character (or ending the string) is kept as it is.
    '''
    items = []
    current = []
    chars = iter(string)
    # Scan left to right so that every backslash consumes the character
    # it escapes. This keeps an escaped backslash followed by a
    # separating comma from being mistaken for an escaped comma.
    for char in chars:
        if char == '\\':
            following = next(chars, None)
            if following in (',', '\\'):
                # Recognised escape: keep only the escaped character.
                current.append(following)
            elif following is None:
                # Lone backslash at the end of the string.
                current.append(char)
            else:
                # Unknown escape: keep both characters untouched.
                current.append(char + following)
        elif char == ',':
            items.append(''.join(current))
            current = []
        else:
            current.append(char)
    items.append(''.join(current))
    return items
if __name__ == '__main__':
    # Command line entry point: build the argument parser and hand the
    # parsed arguments to main().
    import argparse
    from sys import stdin

    parser = argparse.ArgumentParser()
    # nargs='?' makes this positional optional; without it argparse
    # always requires a value and the '*' default is never used.
    parser.add_argument('columns_and_names', nargs='?', default='*',
                        help='The columns and its names to print out (format: n=name)')
    # Backslashes in the help text are doubled so the literal contains
    # no invalid escape sequences.
    parser.add_argument('--delim', '-d', type=comma_list,
                        help='A list of input columns delimiters. Format: delim[:min[-max]]. '
                             'Where `min` and `max` are the numbers of times `delim` should repeat. '
                             'As default min=1 and max is not set. '
                             'Enter "\\," for the delimiter "," and "\\\\" for "\\"',
                        default=(' ', '\t'),
                        metavar='delim[:min-max]')
    # With no file arguments the standard input is read.
    parser.add_argument('infiles', type=argparse.FileType('rb'), default=(stdin,), metavar='file', nargs='*')
    main(parser.parse_args())