From cf37d068ab67daff76fdd6db6eda400779a160ff Mon Sep 17 00:00:00 2001 From: bmenasha Date: Tue, 11 Aug 2020 13:13:24 -0400 Subject: [PATCH] Prune columns over 10,000. (#535) Corrects issue 533 by removing any properties over the 10,000 limit. Also speed up remove_duplicates by processing each dict once. Co-authored-by: Jacob Ferriero --- .../asset_inventory/bigquery_schema.py | 75 +++++++++++-------- .../tests/test_bigquery_schema.py | 17 +++++ 2 files changed, 62 insertions(+), 30 deletions(-) diff --git a/tools/asset-inventory/asset_inventory/bigquery_schema.py b/tools/asset-inventory/asset_inventory/bigquery_schema.py index c8d40dd777..f497c45e71 100644 --- a/tools/asset-inventory/asset_inventory/bigquery_schema.py +++ b/tools/asset-inventory/asset_inventory/bigquery_schema.py @@ -36,6 +36,7 @@ """ import copy +from collections import defaultdict from numbers import Number import re @@ -48,8 +49,10 @@ DATE_REGEX = re.compile(r'^\d\d\d\d-\d\d-\d\d$') BQ_MAX_NUMERIC = 99999999999999999999999999999.999999999 BQ_MIN_NUMERIC = -99999999999999999999999999999.999999999 -MAX_BQ_COL_NAME_LENGTH = 128 +BQ_MAX_COL_NAME_LENGTH = 128 BQ_NUMERIC_SCALE_DIGITS = 9 +BQ_MAX_DEPTH = 15 +BQ_MAX_COLUMNS = 10000 def is_number(s): @@ -223,10 +226,10 @@ def _convert_labels_dict_to_list(parent): return parent -def _sanitize_property(property_name, parent, depth): +def _sanitize_property(property_name, parent, depth, num_properties): """Clean up json property for import into BigQuery. - Enforces some BigQuery requirements (see _santize_property_value for some + Enforces some BigQuery requirements (see _sanitize_property_value for some others): 1. Covert all properties named "labels" from maps into list of "name", @@ -250,6 +253,7 @@ def _sanitize_property(property_name, parent, depth): property_name: Name of the property in the json oject. parent: The json object containing the property. depth: How nested within the original document we are. + num_properties: How many properties into the document we are. """ # if property was removed earlier, nothing to sanitize. if property_name not in parent: @@ -260,7 +264,7 @@ def _sanitize_property(property_name, parent, depth): first_character = new_property_name[0] if not first_character.isalpha() and first_character != '_': new_property_name = '_' + new_property_name - new_property_name = new_property_name[:MAX_BQ_COL_NAME_LENGTH] + new_property_name = new_property_name[:BQ_MAX_COL_NAME_LENGTH] # check if property was changed. if property_name != new_property_name: @@ -275,7 +279,8 @@ def _sanitize_property(property_name, parent, depth): property_value = parent[new_property_name] # recursivly descend. - sanitized = sanitize_property_value(property_value, depth=depth + 1) + sanitized = sanitize_property_value(property_value, depth=depth + 1, + num_properties=num_properties) # else the value could have changed. parent[new_property_name] = sanitized @@ -288,32 +293,31 @@ def _sanitize_property(property_name, parent, depth): # prune the value. parent.pop(new_property_name) - # remove duplicates (condition #4) - remove_duplicates(new_property_name, parent) - - -def remove_duplicates(property_name, properties): - """Ensure no other property in properties share the same name. +def remove_duplicates(properties): + """Ensure no two property in properties share the same name. Args: - property_name: name of property to check for. properties: dictionary to modify. BigQuery is case insensitive, remove any lexically greater property in the dictionary that differ only by case. """ - duplicates = [] - for k in properties.keys(): - if k.lower() == property_name.lower(): - duplicates.append(k) - if len(duplicates) > 1: - selected_property = min(duplicates) - for p in duplicates: - if p != selected_property: - properties.pop(p) - - -def sanitize_property_value(property_value, depth=0): + duplicates = defaultdict(list) + # find duplicate properties + for k in properties: + duplicates[k.casefold()] += [k] + + for k in duplicates: + duplicate_properties = duplicates[k] + # remove any properties that are duplicate + if len(duplicate_properties) > 1: + selected_property = min(duplicate_properties) + for p in duplicate_properties: + if p != selected_property: + properties.pop(p) + + +def sanitize_property_value(property_value, depth=0, num_properties=0): """Modifies supplied json object for BigQuery load. Traverses the json object and modifies it to conform to BigQuery @@ -326,17 +330,24 @@ def sanitize_property_value(property_value, depth=0): 2. rounds/truncates numeric values to be between BigQuery limits for NUMERIC values. + 3. Prunes any value after the 10,000'th property. + Args: property_value: Json object. - depth: Level of embedding within the json document. + depth: Level of embedding within the document. + num_properties: Number of properties processed within the document. Returns: Modified json object. """ - # BigQuery can't deal with more then 15 nested fields. + # BigQuery can't deal with too many nested fields. # prune it. - if depth > 15: + if depth > BQ_MAX_DEPTH: + return {} + + # BigQuery can't handle too many columns. + if num_properties > BQ_MAX_COLUMNS: return {} # NUMERIC data type is an exact numeric value with 38 digits of precision @@ -350,13 +361,17 @@ def sanitize_property_value(property_value, depth=0): # sanitize each nested list element. if isinstance(property_value, list): for array_item in property_value: - sanitize_property_value(array_item, depth) + sanitize_property_value(array_item, depth, num_properties) # and each nested json object. if isinstance(property_value, dict): + remove_duplicates(property_value) for child_property in dict(property_value): - _sanitize_property(child_property, property_value, depth) - + # count it. + num_properties += 1 + # sanitize each property. + _sanitize_property(child_property, property_value, depth, + num_properties) return property_value diff --git a/tools/asset-inventory/tests/test_bigquery_schema.py b/tools/asset-inventory/tests/test_bigquery_schema.py index 580328adb7..734c1e1934 100644 --- a/tools/asset-inventory/tests/test_bigquery_schema.py +++ b/tools/asset-inventory/tests/test_bigquery_schema.py @@ -269,6 +269,23 @@ def test_remove_duplicate_property(self): self.assertEqual(sanitized['IPAddress'], 'other_value') self.assertEqual(sanitized['array'], [{'IPAddress': 'other_value'}]) + def test_prune_max_properties(self): + doc = {'prop-' + str(i): 'value' for i in range(0, 10000)} + sanitized = bigquery_schema.sanitize_property_value(doc) + self.assertEqual(len(sanitized), 10000) + + # prune the 10,000'th + doc['prop-10001'] = 'value' + sanitized = bigquery_schema.sanitize_property_value(doc) + self.assertEqual(len(sanitized), 10000) + + # prune last added property + doc['z'] = 'value' + sanitized = bigquery_schema.sanitize_property_value(doc) + self.assertEqual(len(sanitized), 10000) + self.assertNotIn('z', sanitized) + + if __name__ == '__main__': unittest.main()