-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdf_operations.py
245 lines (201 loc) · 9.31 KB
/
df_operations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Performs an operation on a dataframe using column and/or row.
The supported operations are listed below.
Most operations are column-based. To combine multiple columns, use query to
select or remove data (--option).
To select data :
> df_operations.py query data.csv Measures=FA,MD Bundle=AF
To remove data :
> df_operations.py query data.csv Measures=FA,MD Bundle=AF --option
Some operations to threshold, select or exclude value accept string/float/int
value as parameters.
> df_operations.py lower data.csv 'Section' 2
Dictionary option : column=row_argument
Use --my_dict to provide a sequence of parameters in the form ;
key1=value key2=value or key=value1,value2 for multiple values.
> df_operations.py query Measures=[FA, ihMTR] Section=1
______________________________________________________________________________
OPERATION LIST:
"""
import argparse
import json
import logging
import os
from scilpy.io.utils import add_overwrite_arg, assert_inputs_exist
from dataframe.utils import load_df
from dataframe.operations import get_df_ops, get_operations_doc
OPERATIONS = get_df_ops()
__doc__ += get_operations_doc(OPERATIONS)
class ParseDictArgs(argparse.Action):
def __init__(self, option_strings, dest, nargs=None, **kwargs):
self._nargs = nargs
super(ParseDictArgs, self).__init__(option_strings, dest,
nargs=nargs, **kwargs)
def __call__(self, parser, namespace, values, option_string=None):
parse_dict = {}
for key_val in values:
parse_key, parse_val = key_val.split("=")
if len(parse_val.split(',')) == 2:
parse_val = parse_val.split(',')
parse_dict[parse_key] = parse_val
setattr(namespace, self.dest, parse_dict)
def _build_arg_parser():
p = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
description=__doc__)
p.add_argument('operation',
choices=OPERATIONS.keys(),
help='The type of operation to be performed on the '
'dataframe.')
p.add_argument('in_csv',
help='CSV data (.csv).')
p.add_argument('out_name',
help='Filename to save csv outputs.')
dict_fct = p.add_mutually_exclusive_group()
dict_fct.add_argument('--my_dict', nargs='+', action=ParseDictArgs,
metavar="KEY=VAL",
help='Parameters used to build a dictionary. '
'Example: key=value or key=value1,value2. '
'Use a space to provide multiple keys.')
dict_fct.add_argument('--param',
help='Json file used to replace several elements '
'in a specific column. See ex. in data.')
p.add_argument('--my_cols', nargs='+',
help='A column name or list of column names. ')
p.add_argument('--pattern',
help='String or column name used as argument on columns '
'or rows.')
p.add_argument('--value',
help='Value used for numeric operations on rows.')
p.add_argument('--option', action='store_true',
help='Use for additional options depending on operation.')
p.add_argument('--out_dir',
help='Output directory to save CSV files. ')
add_overwrite_arg(p)
return p
def main():
parser = _build_arg_parser()
args = parser.parse_args()
assert_inputs_exist(parser, args.in_csv)
if args.out_dir is None:
args.out_dir = './'
if args.operation not in OPERATIONS.keys():
parser.error('Operation {} not implement.'.format(args.operation))
if args.param:
input_param = json.load(open(args.param))
df = load_df(args.in_csv)
result_df = []
operations_args = []
# Operations requires only dataframe
print_function = ['display', 'column', 'info', 'check_empty']
if args.operation in print_function:
try:
OPERATIONS[args.operation](df)
except ValueError as msg:
logging.error('{} operation failed.'.format(
args.operation.capitalize()))
logging.error(msg)
return
exit()
# Operations requires only dataframe and save it
single_operations = ['drop_empty_column', 'drop_nan']
if args.operation in single_operations:
operations_args = [df]
# Operations requires dataframe and dictionnary
operations_on_column_with_dict = ['rename', 'delete']
if args.operation in operations_on_column_with_dict:
if not args.my_dict:
parser.error('This operation must be used with --my_dict.')
operations_args = [df, args.my_dict]
# Single column operations
# Operations requires dataframe and single column
operations_on_column = ['unique', 'remove_column', 'split_by']
if args.operation in operations_on_column:
operations_args = [df, str(args.my_cols[0])]
# Operations requires dataframe, single column and specific pattern
operations_on_column_with_pattern = ['get_where', 'get_from', 'remove_row']
if args.operation in operations_on_column_with_pattern:
if not args.pattern:
parser.error('This operation must be used with --pattern.')
operations_args = [df, str(args.my_cols[0]), args.pattern]
# Operations requires dataframe, single column and specific value
operations_on_value = ['lower', 'upper', 'exclude', 'select']
if args.operation in operations_on_value:
if not args.value:
parser.error('Value operations must be used with --value.')
operations_args = [df, str(args.my_cols[0]), args.value]
# Multi columns operations
# Operations requires dataframe and multi columns
operations_on_multi_columns = ['average', 'sum']
if args.operation in operations_on_multi_columns:
operations_args = [df, args.my_cols]
# Operations requires dataframe, multi columns and specific pattern
operations_on_multi_columns_with_pattern = ['convert', 'split_col'] #
if args.operation in operations_on_multi_columns_with_pattern:
if not args.pattern:
parser.error('This operation must be used with --pattern.')
operations_args = [df, args.my_cols, args.pattern]
# Operations requires dataframe, multi columns and dictionnary
if args.operation == 'replace':
if not args.my_cols and not (args.my_dict or args.param):
parser.error('Merge operation must be used with --my_cols and '
'--my_dict or --param.')
if args.param:
args.my_dict = input_param
operations_args = [df, args.my_cols, args.my_dict]
# Operations requires dataframe, multi columns and dictionnary
if args.operation == 'replace_where':
if not args.my_cols and not (args.my_dict or args.param or args.pattern):
parser.error('Merge operation must be used with --my_cols and '
'--my_dict or --param.')
if args.param:
args.my_dict = input_param
operations_args = [df, args.my_cols, args.pattern, args.my_dict]
# Operations requires dataframe, multi columns and dictionnary with option
if args.operation == 'merged':
if not args.my_cols and not (args.my_dict or args.param):
parser.error('Merge operation must be used with --my_cols and '
'--my_dict or --param.')
if args.param:
args.my_dict = input_param
operations_args = [df, args.my_cols, args.my_dict, args.option]
# Operations requires dataframe, multi columns, pattern and value
if args.operation == 'factor':
if not args.my_cols and not args.pattern and not args.value:
parser.error('Factor operation must be used with --my_cols and '
'--pattern and --value.')
if args.param:
args.my_dict = input_param
operations_args = [df, args.my_cols, args.pattern, args.value]
# Operations requires dataframe, dictionnary, pattern and options
if args.operation == 'query':
if not (args.my_dict or args.param):
parser.error('Query operation must be used with --my_dict or '
' --param.')
if args.param:
args.my_dict = input_param
operations_args = [df, args.my_dict, args.option, args.pattern]
# Called and run operations with specific arguments required
try:
result_df = OPERATIONS[args.operation](*operations_args)
except ValueError as msg:
logging.error('{} operation failed.'.format(
args.operation.capitalize()))
logging.error(msg)
return
# Save output dataframe
if args.operation == 'unique':
exit()
output_df = result_df
if len(output_df) == 0:
raise ValueError('Dataframe is empty.')
elif args.operation == 'split_by':
for name, frame in zip(output_df[0], output_df[1]):
frame.to_csv(os.path.join(args.out_dir, name + '_' + args.out_name),
index=False)
else:
output_df.to_csv(os.path.join(args.out_dir, args.out_name),
index=False)
if __name__ == '__main__':
main()