diff --git a/hone/hone.py b/hone/hone.py index 47cf053..9b1ae03 100644 --- a/hone/hone.py +++ b/hone/hone.py @@ -1,173 +1,187 @@ -from hone.utils import csv_utils -import copy - -class Hone: - def __init__(self): - self.delimit_chars = [",", "_", " "] - self.csv_filepath = None - self.csv = csv_utils.CSVUtils(self.csv_filepath) - ''' - Perform CSV to nested JSON conversion and return resulting JSON. - ''' - def convert(self, csv_filepath): - self.set_csv_filepath(csv_filepath) - column_names = self.csv.get_column_names() - data = self.csv.get_data_rows() - column_schema = self.generate_full_structure(column_names) - json_struct = self.populate_structure_with_data(column_schema, column_names, data) - return json_struct - ''' - Returns dictionary with given data rows fitted to given structure. - ''' - - def populate_structure_with_data(self, structure, column_names, data_rows): - json_struct = [] - num_columns = len(column_names) - mapping = self.get_leaves(structure) - for row in data_rows: - json_row = copy.deepcopy(structure) - i = 0 - while i < num_columns: - cell = row[i] - column_name = column_names[i] - key_path = mapping[column_name] - exec("json_row"+key_path+"="+"'"+cell+"'") - i += 1 - json_struct.append(json_row) - return json_struct - - ''' - Get generated JSON schema. - ''' - - def get_schema(self, csv_filepath): - self.set_csv_filepath(csv_filepath) - column_names = self.csv.get_column_names() - data = self.csv.get_data_rows() - column_struct = self.generate_full_structure(column_names) - return column_struct - - ''' - Generate recursively-nested JSON structure from column_names. - ''' - - def generate_full_structure(self, column_names): - visited = set() - structure = {} - sorted(column_names) - column_names = column_names[::-1] - for c1 in column_names: - if c1 in visited: - continue - splits = self.get_valid_splits(c1) - for split in splits: - nodes = {split: {}} - if split in column_names: - continue - for c2 in column_names: - if c2 not in visited and self.is_valid_prefix(split, c2): - nodes[split][self.get_split_suffix(split, c2)] = c2 - if len(nodes[split].keys()) > 1: - structure[split] = self.get_nested_structure(nodes[split]) - for val in nodes[split].values(): - visited.add(val) - if c1 not in visited: # if column_name not nestable - structure[c1] = c1 - return structure - - ''' - Generate nested JSON structure given parent structure generated from initial call to get_full_structure - ''' - - def get_nested_structure(self, parent_structure): - column_names = list(parent_structure.keys()) - visited = set() - structure = {} - sorted(column_names, reverse=True) - for c1 in column_names: - if c1 in visited: - continue - splits = self.get_valid_splits(c1) - for split in splits: - nodes = {split: {}} - if split in column_names: - continue - for c2 in column_names: - if c2 not in visited and self.is_valid_prefix(split, c2): - nodes[split][self.get_split_suffix(split, c2)] = parent_structure[c2] - visited.add(c2) - if len(nodes[split].keys()) > 1: - structure[split] = self.get_nested_structure(nodes[split]) - if c1 not in visited: # if column_name not nestable - structure[c1] = parent_structure[c1] - return structure - - ''' - Get the leaf nodes of a nested structure and the path to those nodes. - Ex: {"a":{"b":"c"}} => {"c":"['a']['b']"} - ''' - - def get_leaves(self, structure, path="", result={}): - for key, value in structure.items(): - if type(value) is dict: - self.get_leaves(value, path+"['"+key+"']", result) - else: - result[value] = path+"['"+key+"']" - return result - - ''' - Returns all valid splits for a given column name in descending order by length - ''' - - def get_valid_splits(self, column_name): - splits = [] - i = len(column_name) - 1 - while i >= 0: - c = column_name[i] - if c in self.delimit_chars: - split = self.clean_split(column_name[0:i]) - splits.append(split) - i -= 1 - return sorted(list(set(splits))) - - ''' - Returns string after split without delimiting characters. - ''' - - def get_split_suffix(self, split, column_name=""): - suffix = column_name[len(split) + 1:] - i = 0 - while i < len(suffix): - c = suffix[i] - if c not in self.delimit_chars: - return suffix[i:] - i += 1 - return suffix - - ''' - Returns split with no trailing delimiting characters. - ''' - - def clean_split(self, split): - i = len(split) - 1 - while i >= 0: - c = split[i] - if c not in self.delimit_chars: - return split[0:i + 1] - i -= 1 - return split - - ''' - Returns true if str_a is a valid prefix of str_b - ''' - - def is_valid_prefix(self, prefix, base): - if base.startswith(prefix): - if base[len(prefix)] in self.delimit_chars: - return True - return False - - def set_csv_filepath(self, csv_filepath): - self.csv_filepath = csv_filepath - self.csv.filepath = self.csv_filepath - +from hone.utils import csv_utils +import copy + +class Hone: + def __init__(self): + self.delimit_chars = [",", "_", " "] + self.csv_filepath = None + self.csv = csv_utils.CSVUtils(self.csv_filepath) + ''' + Perform CSV to nested JSON conversion and return resulting JSON. + ''' + def convert(self, csv_filepath): + self.set_csv_filepath(csv_filepath) + column_names = self.csv.get_column_names() + data = self.csv.get_data_rows() + cell_type = self.csv.data_type_check(data,column_names) + column_schema = self.generate_full_structure(column_names) + # change + json_struct = self.populate_structure_with_data(column_schema, column_names, data, cell_type) + # was json_struct = self.populate_structure_with_data(column_schema, column_names, data) + return json_struct + ''' + Returns dictionary with given data rows fitted to given structure. + ''' + + def populate_structure_with_data(self, structure, column_names, data_rows, cell_type): + # change ..added cell_type.. + json_struct = [] + num_columns = len(column_names) + mapping = self.get_leaves(structure) + row = 0 + while row < len(data_rows): + # change - was ...for row in data_rows:... + json_row = copy.deepcopy(structure) + col = 0 + # changed i to col + while col < len(column_names): + column_name = column_names[col] + key_path = mapping[column_name] + # Changes + if cell_type[row][col] == 'string': + exec("json_row"+key_path+"="+"'"+data_rows[row][col]+"'") + else: + exec("json_row"+key_path+"="+data_rows[row][col]) + col += 1 + json_struct.append(json_row) + row +=1 + + return json_struct + + ''' + Get generated JSON schema. + ''' + + def get_schema(self, csv_filepath): + self.set_csv_filepath(csv_filepath) + column_names = self.csv.get_column_names() + data = self.csv.get_data_rows() + column_struct = self.generate_full_structure(column_names) + return column_struct + + ''' + Generate recursively-nested JSON structure from column_names. + ''' + + def generate_full_structure(self, column_names): + visited = set() + structure = {} + sorted(column_names) + column_names = column_names[::-1] + for c1 in column_names: + if c1 in visited: + continue + splits = self.get_valid_splits(c1) + for split in splits: + nodes = {split: {}} + if split in column_names: + continue + for c2 in column_names: + #print(c1,c2) + if c2 not in visited and self.is_valid_prefix(split, c2): + nodes[split][self.get_split_suffix(split, c2)] = c2 + if len(nodes[split].keys()) > 1: + structure[split] = self.get_nested_structure(nodes[split]) + for val in nodes[split].values(): + visited.add(val) + if c1 not in visited: # if column_name not nestable + structure[c1] = c1 + return structure + + ''' + Generate nested JSON structure given parent structure generated from initial call to get_full_structure + ''' + + def get_nested_structure(self, parent_structure): + column_names = list(parent_structure.keys()) + visited = set() + structure = {} + sorted(column_names, reverse=True) + for c1 in column_names: + if c1 in visited: + continue + splits = self.get_valid_splits(c1) + for split in splits: + nodes = {split: {}} + if split in column_names: + continue + for c2 in column_names: + if c2 not in visited and self.is_valid_prefix(split, c2): + nodes[split][self.get_split_suffix(split, c2)] = parent_structure[c2] + visited.add(c2) + if len(nodes[split].keys()) > 1: + structure[split] = self.get_nested_structure(nodes[split]) + if c1 not in visited: # if column_name not nestable + structure[c1] = parent_structure[c1] + return structure + + ''' + Get the leaf nodes of a nested structure and the path to those nodes. + Ex: {"a":{"b":"c"}} => {"c":"['a']['b']"} + ''' + + def get_leaves(self, structure, path="", result={}): + for key, value in structure.items(): + if type(value) is dict: + self.get_leaves(value, path+"['"+key+"']", result) + else: + result[value] = path+"['"+key+"']" + + return result + + ''' + Returns all valid splits for a given column name in descending order by length + ''' + + def get_valid_splits(self, column_name): + splits = [] + i = len(column_name) - 1 + while i >= 0: + c = column_name[i] + if c in self.delimit_chars: + split = self.clean_split(column_name[0:i]) + splits.append(split) + i -= 1 + return sorted(list(set(splits))) + + ''' + Returns string after split without delimiting characters. + ''' + + def get_split_suffix(self, split, column_name=""): + suffix = column_name[len(split) + 1:] + i = 0 + while i < len(suffix): + c = suffix[i] + if c not in self.delimit_chars: + return suffix[i:] + i += 1 + return suffix + + ''' + Returns split with no trailing delimiting characters. + ''' + + def clean_split(self, split): + i = len(split) - 1 + while i >= 0: + c = split[i] + if c not in self.delimit_chars: + return split[0:i + 1] + i -= 1 + return split + + ''' + Returns true if str_a is a valid prefix of str_b + ''' + + def is_valid_prefix(self, prefix, base): + if base.startswith(prefix): + if base[len(prefix)] in self.delimit_chars: + return True + return False + + def set_csv_filepath(self, csv_filepath): + self.csv_filepath = csv_filepath + self.csv.filepath = self.csv_filepath + diff --git a/hone/utils/csv_utils.py b/hone/utils/csv_utils.py index 34203b8..4db71e2 100644 --- a/hone/utils/csv_utils.py +++ b/hone/utils/csv_utils.py @@ -1,65 +1,93 @@ -""" -Simple methods for processing CSV files -""" - -from contextlib import contextmanager - -class CSVUtils: - def __init__(self, csv_filepath): - self.filepath = csv_filepath - - # Parses and returns first row of CSV (column names) - def get_column_names(self): - with self.open_csv() as f: - cols = self.parse_row(f.readline()) - return cols - - # Returns parsed rows of CSV (excluding column names) - def get_data_rows(self): - data_rows = [] - with self.open_csv() as f: - f.readline() # discard column names - while True: - row = f.readline() - if row != '': - data_rows.append(self.parse_row(row)) - else: - break - return data_rows - - # Open CSV in given mode (default is read mode) - @contextmanager - def open_csv(self, mode='r'): - f = open(self.filepath, mode, encoding='utf-8-sig') - try: - yield f - finally: - f.close() - - # Parse given row (list of cells) - @staticmethod - def parse_row(row): - row = row[:-1] - cells = [] - - quote_flag = False - split_index = 0 - i = 0 - - # parse rows - while i < len(row): - c = row[i] - if c == '"': - quote_flag = not quote_flag - elif c == ',': - if not quote_flag: - cells.append(row[split_index:i]) - split_index = i+1 - i += 1 - - # add last cell in row - if split_index != len(row): - cells.append(row[split_index:]) - - cells = [cell.strip(' "') for cell in cells] - return cells +""" +Simple methods for processing CSV files +""" + +from contextlib import contextmanager +import re + +class CSVUtils: + def __init__(self, csv_filepath): + self.filepath = csv_filepath + + # Parses and returns first row of CSV (column names) + def get_column_names(self): + with self.open_csv() as f: + cols = self.parse_row(f.readline()) + return cols + + # Returns parsed rows of CSV (excluding column names) + def get_data_rows(self): + data_rows = [] + with self.open_csv() as f: + f.readline() # discard column names + while True: + row = f.readline() + if row != '': + data_rows.append(self.parse_row(row)) + else: + break + return data_rows + + def data_type_check(self, cell_data,column_names): + # return data type, only checks for accountID, integer or string, boolean could be added + accountid = re.compile("^[0-9]{12}$") + number = re.compile("^\d+$") + row = 0 + col = 0 + cell_type = [] + inner_array = [] + + while row < len(cell_data): + while col < (len(column_names)): + if accountid.match(cell_data[row][col]): + inner_array.append('string') + else: + if number.match(cell_data[row][col]): + inner_array.append('integer') + else: + inner_array.append('string') + col += 1 + cell_type.append(inner_array) + row += 1 + + #print(cell_type) + return cell_type + + # Open CSV in given mode (default is read mode) + @contextmanager + def open_csv(self, mode='r'): + f = open(self.filepath, mode, encoding='utf-8-sig') + try: + yield f + finally: + f.close() + + # Parse given row (list of cells) + @staticmethod + def parse_row(row): + row = row[:-1] + cells = [] + + quote_flag = False + split_index = 0 + i = 0 + + # parse rows + while i < len(row): + + c = row[i] + + if c == '"': + quote_flag = not quote_flag + elif c == ',': + if not quote_flag: + cells.append(row[split_index:i]) + split_index = i+1 + i += 1 + + # add last cell in row + if split_index != len(row): + cells.append(row[split_index:]) + + cells = [cell.strip(' "') for cell in cells] + return cells