From 962e7eaa2da2105489ed0a180d42c4cfeb7c0753 Mon Sep 17 00:00:00 2001
From: ursin
Date: Thu, 16 Sep 2021 11:42:15 +0200
Subject: [PATCH] Refactor pre-processing scripts and finish tutorial for pre-processing.

---
 README.md                          | 68 ++++++++++++++++++-
 .../api_ner/extract_values.py      |  3 +-
 src/preprocessing/pre_process.py   | 67 +++++++++++-------
 3 files changed, 109 insertions(+), 29 deletions(-)

diff --git a/README.md b/README.md
index 3fdedb0..91a7a85 100644
--- a/README.md
+++ b/README.md
@@ -139,12 +139,76 @@ Be aware that not the full SQL Syntax is supported. A table alias for example ne
 After you added your data (both _TRAIN_ and _TEST_ split), run the [training_data_builder.py](src/tools/training_data_builder/training_data_builder.py). It will take your custom data and transform it into the Spider representation:

 ```bash
-python src/tools/training_data_builder/training_data_builder.py --data hack_zurich
+PYTHONPATH=$(pwd)/src python src/tools/training_data_builder/training_data_builder.py --data hack_zurich
 ```

 You will now find your custom data in the two files [data/hack_zurich/original/train.json](data/hack_zurich/original/train.json) and [data/hack_zurich/original/dev.json](data/hack_zurich/original/dev.json).

 #### Extract Value Candidates using Named Entity Recognition
-TODO
+In this step we try to extract potential _values_ from the natural language question. In the example *'What is the share of electric cars in 2016 for Wetzikon?'*,
+we would like to identify *2016* and *Wetzikon* as value candidates.
+
+To do so, we use a mix of Named Entity Recognition (NER) and heuristics. As the NER engine we use the Google Natural Language API.
+You can get your own API key at [https://cloud.google.com/natural-language/docs/analyzing-entities](https://cloud.google.com/natural-language/docs/analyzing-entities) or ask us for a key during the hack.
+
+Run the script twice, once for the *TRAIN* and once for the *DEV* split:
+
+```bash
+PYTHONPATH=$(pwd)/src python src/named_entity_recognition/api_ner/extract_values.py --data_path=data/hack_zurich/original/train.json --output_path=data/hack_zurich/ner_train.json --ner_api_secret=GOOGLE_API_SECRET
+PYTHONPATH=$(pwd)/src python src/named_entity_recognition/api_ner/extract_values.py --data_path=data/hack_zurich/original/dev.json --output_path=data/hack_zurich/ner_dev.json --ner_api_secret=GOOGLE_API_SECRET
+```
+
+We then also extract the ground truth values from the SQL queries. This allows us to verify whether our NER approach returns the values required to synthesize the query successfully.
+
+```bash
+PYTHONPATH=$(pwd)/src python src/tools/get_values_from_sql.py --data_path data/hack_zurich/original/train.json --table_path data/hack_zurich/original/tables.json --ner_path data/hack_zurich/ner_train.json
+PYTHONPATH=$(pwd)/src python src/tools/get_values_from_sql.py --data_path data/hack_zurich/original/dev.json --table_path data/hack_zurich/original/tables.json --ner_path data/hack_zurich/ner_dev.json
+```
+
+This last script doesn't create a new file, but adds the ground truth values to the *ner_train.json* and *ner_dev.json* files; see the new attribute *values*:
+
+```json
+  "values": [
+    "Wetzikon",
+    "2016"
+  ]
+```
+
+#### Pre-processing
+The next step is the actual pre-processing. It contains some classical NLP pre-processing, but mostly it focuses on encoding the database schema and the values in a way the model can learn from.
+As an example, we go through the content of the database and create so-called *value hints*: a column gets marked if it contains a value that we extracted via NER in the previous step.
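+
+To illustrate the idea, here is a simplified, hypothetical sketch of such a hint. The flag name `full_value_match` is the one used internally by `pre_process.py`; the column name is made up and the real pre-processing output contains considerably more information:
+
+```python
+# Hypothetical, simplified value hint for 'What is the share of electric cars in 2016 for Wetzikon?':
+# the column in which the extracted value was found in the database gets flagged.
+value_hint = {
+    "column": "municipality.name",  # made-up column name, for illustration only
+    "full_value_match": True        # 'Wetzikon' appears verbatim in this column
+}
+```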
+
+This step is highly parallelized, but it might still take some time if your database is large or you have a large amount of training data.
+
+To execute, run the following script (once for *TRAIN*, once for *DEV*):
+
+```bash
+PYTHONPATH=$(pwd)/src python src/preprocessing/pre_process.py --data_path=data/hack_zurich/original/train.json --ner_data_path=data/hack_zurich/ner_train.json --table_path=data/hack_zurich/original/tables.json --output=data/hack_zurich/preprocessed_train.json --database_host=your_database_host --database_port=5432 --database_user=postgres --database_password=DB_PW --database_schema=public
+PYTHONPATH=$(pwd)/src python src/preprocessing/pre_process.py --data_path=data/hack_zurich/original/dev.json --ner_data_path=data/hack_zurich/ner_dev.json --table_path=data/hack_zurich/original/tables.json --output=data/hack_zurich/preprocessed_dev.json --database_host=your_database_host --database_port=5432 --database_user=postgres --database_password=DB_PW --database_schema=public
+```
+
+#### Modelling JOINs and SQL-to-SemQL
+
+In this last step we model some special JOIN situations as filters and then transform each SQL statement into a SemQL (Semantic Query Language) AST (abstract syntax tree).
+
+The idea of training the model on an intermediate query representation (SemQL) instead of pure SQL stems from the [IRNet paper](https://arxiv.org/pdf/1905.08205.pdf);
+the goal is to abstract away implementation details of SQL (e.g. the difference between *WHERE* and *HAVING*).
+
+At inference time, ValueNet transforms the SemQL representation back into classical SQL.
+
+We start by modelling some JOINs as filters **(of minor importance - it most probably has no effect on your data, so you might skip it)**:
+
+```bash
+PYTHONPATH=$(pwd)/src python src/preprocessing/model_joins_as_filter.py --data_path=data/hack_zurich/preprocessed_train.json --table_path=data/hack_zurich/original/tables.json --output=data/hack_zurich/preprocessed_with_joins_train.json
+PYTHONPATH=$(pwd)/src python src/preprocessing/model_joins_as_filter.py --data_path=data/hack_zurich/preprocessed_dev.json --table_path=data/hack_zurich/original/tables.json --output=data/hack_zurich/preprocessed_with_joins_dev.json
+```
+
+And then we transform SQL to SemQL:
+
+```bash
+PYTHONPATH=$(pwd)/src python src/preprocessing/sql2SemQL.py --data_path data/hack_zurich/preprocessed_with_joins_train.json --table_path data/hack_zurich/original/tables.json --output data/hack_zurich/train.json
+PYTHONPATH=$(pwd)/src python src/preprocessing/sql2SemQL.py --data_path data/hack_zurich/preprocessed_with_joins_dev.json --table_path data/hack_zurich/original/tables.json --output data/hack_zurich/dev.json
+```
+
+With this, the pre-processing is done. The files `train.json` and `dev.json` will be the input for training the neural network in the next chapter.
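+
+As an optional sanity check (not part of the pipeline), you can verify that both output files parse and contain one entry per example, for instance with a few lines of Python:
+
+```python
+# Optional sanity check: both final pre-processing outputs should load as JSON
+# and contain the same number of examples as the corresponding original split.
+import json
+
+for path in ("data/hack_zurich/train.json", "data/hack_zurich/dev.json"):
+    with open(path, encoding="utf-8") as f:
+        print(path, len(json.load(f)))
+```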

 ### Train Model
diff --git a/src/named_entity_recognition/api_ner/extract_values.py b/src/named_entity_recognition/api_ner/extract_values.py
index c38fd77..918ceb9 100644
--- a/src/named_entity_recognition/api_ner/extract_values.py
+++ b/src/named_entity_recognition/api_ner/extract_values.py
@@ -8,6 +8,7 @@
     arg_parser = argparse.ArgumentParser()
     arg_parser.add_argument('--data_path', type=str, required=True)
     arg_parser.add_argument('--output_path', type=str, required=True)
+    arg_parser.add_argument('--ner_api_secret', type=str, required=True)

     args = arg_parser.parse_args()

@@ -17,7 +18,7 @@
     error_count = 0
     ner_data = []
     for doc in data:
-        extracted_values = remote_named_entity_recognition(doc['question'])
+        extracted_values = remote_named_entity_recognition(doc['question'], args.ner_api_secret)
         if extracted_values:
             ner_data.append({
                 'entities': extracted_values['entities'],
diff --git a/src/preprocessing/pre_process.py b/src/preprocessing/pre_process.py
index 13f3277..1eb3438 100644
--- a/src/preprocessing/pre_process.py
+++ b/src/preprocessing/pre_process.py
@@ -47,19 +47,17 @@ def add_value_match(token, columns_list, column_matches):
             column_matches[column_ix]['full_value_match'] = True


-def build_db_value_finder(database_path, db_name, schema_path):
-    if db_name != 'cordis_temporary':
-        return DatabaseValueFinderSQLite(database_path, db_name, schema_path)
+def build_db_value_finder(db_name, schema_path, args):
+    # If a database path is provided, we assume an SQLite database and initialize the DatabaseValueFinderSQLite.
+    # In case there is no database path, we assume PostgreSQL connection details and initialize the DatabaseValueFinderPostgreSQL.
+
+    if 'database_path' in args and args.database_path:
+        return DatabaseValueFinderSQLite(args.database_path, db_name, schema_path)
     else:
-        # a bit of a hack, when more postgres-db gets added for training we have to improve this.
-        config = {'database': db_name,
-                  'database_host': 'testbed.inode.igd.fraunhofer.de',
-                  'database_port': '18001',
-                  'database_user': 'postgres',
-                  'database_password': 'dummy_password',
-                  'database_schema': 'unics_cordis'}
+        connection_config = {k: v for k, v in vars(args).items() if k.startswith('database')}
+        connection_config['database'] = db_name

-        return DatabaseValueFinderPostgreSQL(config['database'], schema_path, config)
+        return DatabaseValueFinderPostgreSQL(db_name, schema_path, connection_config)


 def add_likely_value_candidates(value_candidates, potential_value_candidates):
@@ -84,7 +82,8 @@ def add_likely_value_candidates(value_candidates, potential_value_candidates):
                                        value_candidates))


-def lookup_database(example, ner_information, columns, question_tokens, column_matches, database_value_finder, add_values_from_ground_truth):
+def lookup_database(example, ner_information, columns, question_tokens, column_matches, database_value_finder,
+                    add_values_from_ground_truth):
     """
     Now we use the base data (database) for two things:
     * to put together a list of values from which the neural network later hase to pick the right one.
     * give the model a hint if a column is a good candidate, based on a value match.

     As as input we use the entities extracted by the NER and then boil it down with the help of the base data (database).
""" - potential_value_candidates = pre_process_ner_candidates(ner_information['entities'], example['question'], example['question_toks']) + potential_value_candidates = pre_process_ner_candidates(ner_information['entities'], example['question'], + example['question_toks']) # Here we use the power of the base-data: if we find a potential value in the database, we mark the column we found the value in with a "full value match". # TODO: also use the information on table level include_primary_key_columns = 'id' in question_tokens # here we do the actual database lookup - database_matches = match_values_in_database(database_value_finder, potential_value_candidates, include_primary_key_columns) + database_matches = match_values_in_database(database_value_finder, potential_value_candidates, + include_primary_key_columns) # and add the hint to the corresponding column for value, column, table in database_matches: @@ -241,7 +242,10 @@ def pre_process(idx, example, ner_information, db_value_finder, is_training): column_matches[column_idx]['partial_column_match'] += 1 # a lot of interesting stuff happens here - make sure you are aware of it! - value_candidates, all_values_found, column_matches = lookup_database(example, ner_information, columns, question_tokens, column_matches, db_value_finder, add_values_from_ground_truth=is_training) + value_candidates, all_values_found, column_matches = lookup_database(example, ner_information, columns, + question_tokens, column_matches, + db_value_finder, + add_values_from_ground_truth=is_training) return token_grouped, token_types, column_matches, value_candidates, all_values_found @@ -249,8 +253,13 @@ def pre_process(idx, example, ner_information, db_value_finder, is_training): def main(): arg_parser = argparse.ArgumentParser() arg_parser.add_argument('--data_path', type=str, help='dataset', required=True) - arg_parser.add_argument('--ner_data_path', type=str, help='NER results (e.g. from Google API)', required=True) - arg_parser.add_argument('--database_path', type=str, help='database files', required=True) + arg_parser.add_argument('--ner_data_path', type=str, help='NER results (e.g. from Google API), including actual values extracted from SQL', required=True) + arg_parser.add_argument('--database_path', type=str, help='Database file in case of SQLite', required=False) + arg_parser.add_argument('--database_host', type=str, help='Database host in case of PostgreSQL', required=False) + arg_parser.add_argument('--database_port', type=str, help='Host port in case of PostgreSQL', required=False) + arg_parser.add_argument('--database_user', type=str, help='Database user in case of PostgreSQL', required=False) + arg_parser.add_argument('--database_password', type=str, help='Database password in case of PostgreSQL', required=False) + arg_parser.add_argument('--database_schema', type=str, help='Database schema in case of PostgreSQL', required=False) arg_parser.add_argument('--table_path', type=str, help='schema data', required=True) arg_parser.add_argument('--output', type=str, help='output data') args = arg_parser.parse_args() @@ -261,7 +270,8 @@ def main(): with open(os.path.join(args.ner_data_path), 'r', encoding='utf-8') as json_file: ner_data = json.load(json_file) - assert len(data) == len(ner_data), 'Both, NER data and actual data (e.g. ner_train.json and train.json) need to have the same amount of rows!' + assert len(data) == len( + ner_data), 'Both, NER data and actual data (e.g. ner_train.json and train.json) need to have the same amount of rows!' 

     not_found_count = 0

@@ -273,18 +283,22 @@
     # data = data[7646:7647]
     # ner_data = ner_data[7646:7647]

-    results = Parallel(n_jobs=NUM_CORES)(delayed(pre_process)(idx, example, ner_information, build_db_value_finder(args.database_path, example['db_id'], args.table_path), is_training=True) for idx, (example, ner_information) in enumerate(zip(data, ner_data)))
+    results = Parallel(n_jobs=NUM_CORES)(delayed(pre_process)(idx, example, ner_information,
+                                                              build_db_value_finder(example['db_id'], args.table_path, args),
+                                                              is_training=True) for idx, (example, ner_information) in
+                                         enumerate(zip(data, ner_data)))

     # To better debug this code, use the non-parallelized version of the code
     # results = [pre_process(idx, example, ner_information, build_db_value_finder(args.database_path, example['db_id'], args.table_path), is_training=True) for idx, (example, ner_information) in enumerate(zip(data, ner_data))]

-    all_token_grouped, all_token_types, all_column_matches, all_value_candidates, all_complete_values_found = zip(*results)
+    all_token_grouped, all_token_types, all_column_matches, all_value_candidates, all_complete_values_found = zip(
+        *results)

     for example, token_grouped, token_types, column_matches, value_candidates, complete_values_found in zip(data,
-                                                                                                            all_token_grouped,
-                                                                                                            all_token_types,
-                                                                                                            all_column_matches,
-                                                                                                            all_value_candidates,
-                                                                                                            all_complete_values_found):
+                                                                                                             all_token_grouped,
+                                                                                                             all_token_types,
+                                                                                                             all_column_matches,
+                                                                                                             all_value_candidates,
+                                                                                                             all_complete_values_found):
         # this are the only additional information we store after pre-processing
         example['question_arg'] = token_grouped
@@ -297,7 +311,8 @@
                 not_found_count += 1

     t.toc(msg="Total pre-processing took")
-    print(f"Could not find all values in {not_found_count} examples. All examples where values could not get extracted, will get disable on evaluation")
+    print(
+        f"Could not find all values in {not_found_count} examples. All examples where values could not get extracted will get disabled during evaluation.")

     with open(args.output, 'w') as f:
         json.dump(data, f, sort_keys=True, indent=4)