From 2b0da4169414012371c4e34b893e66f4843a1f79 Mon Sep 17 00:00:00 2001 From: Jake Cohen <101183561+JakeCohenSol@users.noreply.github.com> Date: Mon, 12 Feb 2024 16:24:49 +0000 Subject: [PATCH] DIAC-232 python scripts (#1954) * DIAC-226 setup majority of scripts to work. Searches for exported csvs and redacts, transforms and outputs them ready for importing * DIAC-226 updated python scripts * Finished scripts to properly output all wanted files with correct permissions * Added comments to explain how to use script * Adding proper CSV row searcher and try clauses for errors. Also added additional fields to be redacted * QOL changes * Various refactoring. Created filepath_settings file to manage filepaths and place redacted fields dict, refactored scripts to now use input and output directories * Added dummy files within directories * Added extra fields, wrote first unit test for txt converter and renamed file --- .../python_scripts/convert-json-to-txt.py | 18 --- .../python_scripts/convert_json_to_txt.py | 20 ++++ .../create_jsons_from_event_csv.py | 32 +++++ bin/utils/python_scripts/filepath_settings.py | 105 ++++++++++++++++ bin/utils/python_scripts/init.py | 0 .../python_scripts/input_csv_files/dummy.txt | 0 .../python_scripts/output_csv_files/dummy.txt | 0 .../python_scripts/output_jsons/dummy.txt | 0 bin/utils/python_scripts/prep_import_data.py | 68 +++++++++++ .../python_scripts/redact_info_from_json.py | 112 ++++-------------- bin/utils/python_scripts/tests.py | 46 +++++++ 11 files changed, 291 insertions(+), 110 deletions(-) delete mode 100644 bin/utils/python_scripts/convert-json-to-txt.py create mode 100755 bin/utils/python_scripts/convert_json_to_txt.py create mode 100755 bin/utils/python_scripts/create_jsons_from_event_csv.py create mode 100644 bin/utils/python_scripts/filepath_settings.py create mode 100755 bin/utils/python_scripts/init.py create mode 100644 bin/utils/python_scripts/input_csv_files/dummy.txt create mode 100644 
bin/utils/python_scripts/output_csv_files/dummy.txt create mode 100644 bin/utils/python_scripts/output_jsons/dummy.txt create mode 100755 bin/utils/python_scripts/prep_import_data.py mode change 100644 => 100755 bin/utils/python_scripts/redact_info_from_json.py create mode 100644 bin/utils/python_scripts/tests.py diff --git a/bin/utils/python_scripts/convert-json-to-txt.py b/bin/utils/python_scripts/convert-json-to-txt.py deleted file mode 100644 index fe431a560a..0000000000 --- a/bin/utils/python_scripts/convert-json-to-txt.py +++ /dev/null @@ -1,18 +0,0 @@ -import json -import os - -# Your JSON data (example) 1677498210980054-data-annotated.json -input_file = '/Users/jacobcohensolirius/HMCTS/IA/ia-case-api/bin/utils/python_scripts/SNi_tickets/SNI-5296/latest_data_class.json' -base_name, extension = os.path.splitext(input_file) -output_filename = f'{base_name}.txt' - -# Read JSON data from the input file -with open(input_file, "r") as json_file: - json_data = json.load(json_file) - -# Convert JSON data to a compact text representation -text_data = json.dumps(json_data, separators=(',', ':')) - -# Write the compact text data to a text file -with open(output_filename, "w") as text_file: - text_file.write(text_data) diff --git a/bin/utils/python_scripts/convert_json_to_txt.py b/bin/utils/python_scripts/convert_json_to_txt.py new file mode 100755 index 0000000000..c32bfdab65 --- /dev/null +++ b/bin/utils/python_scripts/convert_json_to_txt.py @@ -0,0 +1,20 @@ +import json + + +# Your JSON data (example) 1677498210980054-data-annotated.json +def convert_json_to_txt(file_path): + # Read JSON data from the input file + with open(file_path, "r") as json_file: + json_data = json.load(json_file) + + # Convert JSON data to a compact text representation + text_data = json.dumps(json_data, separators=(',', ':')) + + output_file = file_path.split(".")[0] + ".txt" + + # Write the compact text data to a text file + with open(output_file, "w") as text_file: + 
text_file.write(text_data) + + +# convert_json_to_txt("event_1.json") diff --git a/bin/utils/python_scripts/create_jsons_from_event_csv.py b/bin/utils/python_scripts/create_jsons_from_event_csv.py new file mode 100755 index 0000000000..dbf7519595 --- /dev/null +++ b/bin/utils/python_scripts/create_jsons_from_event_csv.py @@ -0,0 +1,32 @@ +import csv +import json +import os + +from filepath_settings import settings + + +def create_jsons_from_csv(csv_file, events: list[int] = None, output_dir_name_suffix: str = 'latest'): + with open(csv_file, 'r') as file: + csv_reader = csv.DictReader(file) + events_counter = 1 + dir_name = make_output_dir(output_dir_name_suffix) + for row in csv_reader: + if 'data' in row and events_counter in events: + data = json.loads(row['data']) + filename = f"event_{events_counter}.json" + full_filepath = os.path.join(dir_name, filename) + with open(full_filepath, 'w') as json_file: + json.dump(data, json_file, indent=2) + os.chmod(full_filepath, 0o777) + events_counter += 1 + + +def make_output_dir(case_name: str) -> str: + directory_name = f'output_jsons_{case_name}' + full_filepath = os.path.join(settings.output_json_directory, directory_name) + if not os.path.exists(full_filepath): + os.makedirs(full_filepath) + os.chmod(full_filepath, 0o777) + return full_filepath + +# create_jsons_from_csv('case_event_202402071630.csv', events=list(range(1, 7)), output_dir_name_prefix='5405') diff --git a/bin/utils/python_scripts/filepath_settings.py b/bin/utils/python_scripts/filepath_settings.py new file mode 100644 index 0000000000..117466143d --- /dev/null +++ b/bin/utils/python_scripts/filepath_settings.py @@ -0,0 +1,105 @@ +import os + +PROJECT_DIR = os.path.dirname(__file__) +PYTHON_SCRIPTS_DIR = os.path.join(PROJECT_DIR) +EXPORTED_CSV_INPUT_DIR = os.path.join(PYTHON_SCRIPTS_DIR, 'input_csv_files') +OUTPUT_CSV_DIR = os.path.join(PYTHON_SCRIPTS_DIR, "output_csv_files") +OUTPUT_JSON_DIRECTORY = os.path.join(PYTHON_SCRIPTS_DIR, "output_jsons") + + 
+class Settings: + project_dir = PROJECT_DIR + scripts_dir = PYTHON_SCRIPTS_DIR + exported_csv_dir = EXPORTED_CSV_INPUT_DIR + output_csv_dir = OUTPUT_CSV_DIR + output_json_directory = OUTPUT_JSON_DIRECTORY + + replace_mapping_dict = { + 'email': 'email@email.com', + 'postcode': 'N11 1yz', + 'dateofbirth': '1980-01-01', + 'document_filename': 'redacted.pdf', + 'document_url': 'http://dm-store-aat.service.core-compute-aat.internal/documents/9ce3f9d5-31ef-4021-9aa5-4d017c404cfe', + 'document_binary_url': 'http://dm-store-aat.service.core-compute-aat.internal/documents/9ce3f9d5-31ef-4021-9aa5-4d017c404cfe/binary', + 'name': 'redacted.pdf', + 'filename': 'redacted.pdf', + 'addressline1': '10 street', + 'addressline2': 'town', + 'addressline3': 'city', + 'attendingjudge': 'redacted', + 'feedescription': 'redacted', + 'searchpostcode': 'n11 1yz', + 'posttown': 'town', + 'details': 'redacted', + 'decisionreason': 'redacted', + 'appellantfamilyname': 'redacted', + 'appellantgivennames': 'redacted', + 'appellantdateofbirth': '1980-01-01', + 'appellantemailaddress': 'email@email.com', + 'casenotedescription': 'redacted', + 'casenotesubject': 'redacted', + 'user': 'redacted', + 'question': 'redacted', + 'explanation': 'redacted', + 'address': 'redacted', + 'bundlefilenameprefix': 'EA 50111 2023', + 'casenamehmctsinternal': 'redacted', + 'hmctscasenameinternal': 'redacted', + 'appellantnamefordisplay': 'redacted', + 'reasonsforappealdecision': 'redacted', + 'description': 'redacted', + 'answer': 'redacted', + "legalrepname": "redacted", + "mobilenumber": "07451111111", + "legalrepcompany": "redacted", + "partyname": "redacted", + "legalrepcompanyname": "redacted", + "county": "redacted", + "legalrepresentativename": "redacted", + "legalrepreferencenumber": "AA/1234", + "directioneditexplanation": 'redacted', + "fullname": "redacted", + "dayofbirth": 1, + "familyname": "redacted", + "yearofbirth": 1980, + "monthofbirth": 1, + "displaydateofbirth": "10 Jan 1980", + 
"documentreference": "012345678", + "displayappellantdetailstitle": "redacted", + "displayapplicationdetailstitle": "redacted", + "homeofficesearchresponse": 'redacted', + "remotevideocalldescription": 'redacted', + "hearingdaterangedescription": 'redacted', + "interpreterlanguagereadonly": "Language\t\tEnglish\nDialect\t\t\tENG", + "legalrepresentativeemailaddress": "email@email.co.uk", + "remotevideocalltribunalresponse": "redacted", + "appellantnationalitiesdescription": "France", + "language": "English", + "languagedialect": "redacted", + "legalaidaccountnumber": "OG123V1", + "appellantphonenumber": "07451111111", + "givenname": "redacted", + "data": "redacted", + "witnessname": "redacted", + "witnessdetailsreadonly": "redacted", + "multimediaTribunalResponse": "redacted", + "appellantfullname": "redacted", + "endappealapprovername": "redacted", + "dateToAvoidReason": "redacted", + "sponsorNameForDisplay": "redacted", + "sponsorAddressForDisplay": "10 street", + "sponsorMobileNumber": "07451111111", + "Name": "redacted", + "attendingAppellant": "redacted", + "attendingHomeOfficeLegalRepresentative": "redacted", + "attendingAppellantsLegalRepresentative": "redacted", + "newMatters": "redacted" + } + + csv_rows_to_redact = { + "user_first_name": "redacted", + "user_last_name": "redacted" + } + + +settings = Settings() diff --git a/bin/utils/python_scripts/init.py b/bin/utils/python_scripts/init.py new file mode 100755 index 0000000000..e69de29bb2 diff --git a/bin/utils/python_scripts/input_csv_files/dummy.txt b/bin/utils/python_scripts/input_csv_files/dummy.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/bin/utils/python_scripts/output_csv_files/dummy.txt b/bin/utils/python_scripts/output_csv_files/dummy.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/bin/utils/python_scripts/output_jsons/dummy.txt b/bin/utils/python_scripts/output_jsons/dummy.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git 
a/bin/utils/python_scripts/prep_import_data.py b/bin/utils/python_scripts/prep_import_data.py new file mode 100755 index 0000000000..01422c1e7e --- /dev/null +++ b/bin/utils/python_scripts/prep_import_data.py @@ -0,0 +1,68 @@ +import os +import csv + +from filepath_settings import settings +from create_jsons_from_event_csv import create_jsons_from_csv +from redact_info_from_json import redact_values_from_csv + + +def prep_import_data(directory: str = settings.exported_csv_dir, events_to_get_individual_json: list[int] = None): + """ + Function to prep exported CSV data for importing. Redacts and transforms most recent exported files within + python_scripts directory. + + Exported files should be in format: case_event_202402080516.csv and case_data_202402080518.csv + (default export pattern) + """ + latest_case_event_data = get_latest_file(directory, 'case_event') + latest_case_data = get_latest_file(directory, 'case_data') + if events_to_get_individual_json: + create_jsons_from_csv(latest_case_event_data, events=events_to_get_individual_json) + redacted_case_data = redact_values_from_csv(latest_case_data) + os.chmod(redacted_case_data, 0o777) + case_data_id = input( + 'Import the redacted case data CSV and retrieve the correct case id.\nEnter the new case data id:')\ + .encode('utf-8').decode('utf-8') + redacted_case_event_data = redact_values_from_csv(latest_case_event_data) + redacted_replaced_event_data = replace_case_data_id(case_data_id, redacted_case_event_data) + os.chmod(redacted_replaced_event_data, 0o777) + + +def get_latest_file(dir_path: str, file_prefix: str) -> str: + dir_files = os.listdir(dir_path) + files = [file for file in dir_files if file_prefix in file] + times = [] + for file in files: + try: + file = file.split('.')[0] + times.append(int(file[-12:])) + except Exception as e: + print(e) + pass + latest = max(times) + filename = f"{file_prefix}_{latest}.csv" + full_filepath = os.path.join(dir_path, filename) + return full_filepath + + +def 
replace_case_data_id(new_id: str, file_path: str): + # Read the CSV file and create a list of dictionaries + with open(file_path, 'r') as file: + reader = csv.DictReader(file) + data_list = list(reader) + + # Replace values in the 'case_data_id' column with the new_id + for row in data_list: + if 'case_data_id' in row: + row['case_data_id'] = new_id + + # Write the modified data back to the CSV file + with open(file_path, 'w', newline='') as file: + fieldnames = reader.fieldnames + writer = csv.DictWriter(file, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(data_list) + return file_path + + +prep_import_data(events_to_get_individual_json=range(1,13)) \ No newline at end of file diff --git a/bin/utils/python_scripts/redact_info_from_json.py b/bin/utils/python_scripts/redact_info_from_json.py old mode 100644 new mode 100755 index 2b085f93e9..7da34aa4c5 --- a/bin/utils/python_scripts/redact_info_from_json.py +++ b/bin/utils/python_scripts/redact_info_from_json.py @@ -2,109 +2,32 @@ import json import csv +from filepath_settings import settings + """ Script for redacting data from a case data JSON or CSV. Usage: Replace filepath with absolute filepath of JSON/CSV requiring redacting at the bottom of this file where -desired function is called (comment out other if not needed). +desired function is called. -Run python bin/utils/python_scripts/redact_info_from_json.py while in ia/case/api directory - Script will output redacted JSON/CSV file in the same directory as original with '_redacted' suffix. Notes: - -Fields to be redacted are hardcoded in replace mapping dict, so use with caution and double check redacted JSON -(I suggest comparing two files in IDE) as it's only been tested on a few cases' data. 
-Add any additionally required fields (in full lowercase) to be redacted in replace_mapping_dict with their corresponding +Fields to be redacted are hardcoded in replace mapping dict in filepath_settings.py, so use with caution and double +check redacted JSON (I suggest comparing two files in IDE) as it's only been tested on a few cases' data. + +Add any additionally required fields to be redacted in replace_mapping_dict with their corresponding replace data Replaces all document filenames so if testing any specifics you may want to manually change the filenames """ -replace_mapping_dict = { - 'email': 'email@email.com', - 'postcode': 'N11 1yz', - 'dateofbirth': '1980-01-01', - 'document_filename': 'redacted.pdf', - 'document_url': 'http://dm-store-aat.service.core-compute-aat.internal/documents/9ce3f9d5-31ef-4021-9aa5-4d017c404cfe', - 'document_binary_url': 'http://dm-store-aat.service.core-compute-aat.internal/documents/9ce3f9d5-31ef-4021-9aa5-4d017c404cfe/binary', - 'name': 'redacted.pdf', - 'filename': 'redacted.pdf', - 'addressline1': '10 street', - 'addressline2': 'town', - 'addressline3': 'city', - 'attendingjudge': 'redacted', - 'feedescription': 'redacted', - 'searchpostcode': 'n11 1yz', - 'posttown': 'town', - 'details': 'redacted', - 'decisionreason': 'redacted', - 'appellantfamilyname': 'redacted', - 'appellantgivennames': 'redacted', - 'appellantdateofbirth': '1980-01-01', - 'appellantemailaddress': 'email@email.com', - 'casenotedescription': 'redacted', - 'casenotesubject': 'redacted', - 'user': 'redacted', - 'question': 'redacted', - 'explanation': 'redacted', - 'address': 'redacted', - 'bundlefilenameprefix': 'EA 50111 2023', - 'casenamehmctsinternal': 'redacted', - 'hmctscasenameinternal': 'redacted', - 'appellantnamefordisplay': 'redacted', - 'reasonsforappealdecision': 'redacted', - 'description': 'redacted', - 'answer': 'redacted', - "legalrepname": "redacted", - "mobilenumber": "07451111111", - "legalrepcompany": "redacted", - "partyname": 
"redacted", - "legalrepcompanyname": "redacted", - "county": "redacted", - "legalrepresentativename": "redacted", - "legalrepreferencenumber": "AA/1234", - "directioneditexplanation": 'redacted', - "fullname": "redacted", - "dayofbirth": 1, - "familyname": "redacted", - "yearofbirth": 1980, - "monthofbirth": 1, - "displaydateofbirth": "10 Jan 1980", - "documentreference": "012345678", - "displayappellantdetailstitle": "redacted", - "displayapplicationdetailstitle": "redacted", - "homeofficesearchresponse": 'redacted', - "remotevideocalldescription": 'redacted', - "hearingdaterangedescription": 'redacted', - "interpreterlanguagereadonly": "Language\t\tEnglish\nDialect\t\t\tENG", - "legalrepresentativeemailaddress": "email@email.co.uk", - "remotevideocalltribunalresponse": "redacted", - "appellantnationalitiesdescription": "France", - "language": "English", - "languagedialect": "redacted", - "legalaidaccountNumber": "OG123V1", - "appellantPhoneNumber": "07451111111", - "givenName": "redacted", - "appellantfullname": "redacted", - "endappealapprovername": "redacted", - - # Refactor to separate list for CSV rows - "data": "redacted", -} - -csv_rows_to_redact = { - "user_first_name": "redacted", - "user_last_name": "redacted" -} - -replace_mapping_keys = list(replace_mapping_dict.keys()) -replace_csv_mapping_keys = list(csv_rows_to_redact.keys()) +replace_mapping_keys = [x.lower() for x in list(settings.replace_mapping_dict.keys())] +replace_csv_mapping_keys = [x.lower() for x in list(settings.csv_rows_to_redact.keys())] def redact_values_from_json(file_path, keys_to_redact): @@ -131,11 +54,13 @@ def redact_values(json_data, keys_to_redact): redact_values(item, keys_to_redact) -def redact_values_from_csv(input_file_path, keys_to_redact): +def redact_values_from_csv(input_file_path, keys_to_redact = replace_mapping_keys) -> str: with open(input_file_path, 'r', newline='') as input_file: reader = csv.DictReader(input_file) rows = list(reader) + redact_csv_rows(rows, 
keys_to_redact) + output_file_path = get_redacted_file_path(input_file_path) with open(output_file_path, 'w', newline='') as output_file: @@ -143,6 +68,7 @@ def redact_values_from_csv(input_file_path, keys_to_redact): writer = csv.DictWriter(output_file, fieldnames=fieldnames) writer.writeheader() writer.writerows(rows) + return output_file_path def redact_csv_rows(rows, keys_to_redact): @@ -155,7 +81,6 @@ def redact_csv_rows(rows, keys_to_redact): redact_values(json_value, keys_to_redact) row[key] = json.dumps(json_value) except json.JSONDecodeError: - print(f'Not a valid json within CSV field: {key}') pass elif key.lower() in replace_csv_mapping_keys: row[key] = 'redacted' @@ -164,17 +89,20 @@ def redact_csv_rows(rows, keys_to_redact): def get_redacted_file_path(original_file_path): + original_file_path = original_file_path.replace("input", "output") base_name, extension = os.path.splitext(original_file_path) - return f"{base_name}_redacted{extension}" + file_name = f"{base_name}_redacted{extension}" + return file_name def get_replace_term(key): key = key.lower() - return replace_mapping_dict.get(key, 'redacted') + return settings.replace_mapping_dict.get(key, 'redacted') # redact_values_from_json( -# '/Users/jacobcohensolirius/HMCTS/IA/ia-case-api/bin/utils/python_scripts/SNi_tickets/SNI-5296/latest_data.json', replace_mapping_keys +# 'latest_data.json', replace_mapping_keys # ) -redact_values_from_csv('/Users/jacobcohensolirius/HMCTS/IA/ia-case-api/bin/utils/python_scripts/SNi_tickets/SNI_5340/case_event_202401261547.csv', replace_mapping_keys) +# redact_values_from_csv( +# 'case_event_202401261547.csv') diff --git a/bin/utils/python_scripts/tests.py b/bin/utils/python_scripts/tests.py new file mode 100644 index 0000000000..cedf4d5424 --- /dev/null +++ b/bin/utils/python_scripts/tests.py @@ -0,0 +1,46 @@ +import unittest +import json +import os +import shutil + +from convert_json_to_txt import convert_json_to_txt + + +class 
TestConvertJsonToTxt(unittest.TestCase): + + def setUp(self): + # Create a temporary directory for testing + self.test_dir = "test_temp_dir" + os.makedirs(self.test_dir, exist_ok=True) + + def tearDown(self): + # Remove the temporary directory and its contents + shutil.rmtree(self.test_dir) + + def test_convert_json_to_txt(self): + # Prepare a sample JSON data + json_data = {'name': 'John', 'age': 30, 'city': 'New York'} + + # Write the sample JSON data to a file + input_file_path = os.path.join(self.test_dir, "input.json") + with open(input_file_path, "w") as json_file: + json.dump(json_data, json_file) + + # Call the function to convert JSON to TXT + convert_json_to_txt(input_file_path) + + # Check if the output file exists + output_file_path = os.path.join(self.test_dir, "input.txt") + self.assertTrue(os.path.exists(output_file_path)) + + # Read the content of the output file + with open(output_file_path, "r") as text_file: + text_data = text_file.read() + + # Check if the content matches the expected compact text representation + expected_text_data = '{"name":"John","age":30,"city":"New York"}' + self.assertEqual(text_data, expected_text_data) + + +if __name__ == '__main__': + unittest.main()