From 2b0da4169414012371c4e34b893e66f4843a1f79 Mon Sep 17 00:00:00 2001
From: Jake Cohen <101183561+JakeCohenSol@users.noreply.github.com>
Date: Mon, 12 Feb 2024 16:24:49 +0000
Subject: [PATCH] DIAC-232 python scripts (#1954)

* DIAC-226 set up majority of scripts to work. Searches for exported CSVs, then redacts, transforms and outputs them ready for importing

* DIAC-226 updated python scripts

* Finished scripts to properly output all wanted files with correct permissions

* Added comments to explain how to use script

* Adding proper CSV row searcher and try clauses for errors. Also added additional fields to be redacted

* QOL changes

* Various refactoring. Created filepath_settings file to manage filepaths and place redacted fields dict, refactored scripts to now use input and output directories

* Added dummy files within directories

* Added extra fields, wrote first unit test for txt converter and renamed file
---
 .../python_scripts/convert-json-to-txt.py     |  18 ---
 .../python_scripts/convert_json_to_txt.py     |  20 ++++
 .../create_jsons_from_event_csv.py            |  32 +++++
 bin/utils/python_scripts/filepath_settings.py | 105 ++++++++++++++++
 bin/utils/python_scripts/init.py              |   0
 .../python_scripts/input_csv_files/dummy.txt  |   0
 .../python_scripts/output_csv_files/dummy.txt |   0
 .../python_scripts/output_jsons/dummy.txt     |   0
 bin/utils/python_scripts/prep_import_data.py  |  68 +++++++++++
 .../python_scripts/redact_info_from_json.py   | 112 ++++--------------
 bin/utils/python_scripts/tests.py             |  46 +++++++
 11 files changed, 291 insertions(+), 110 deletions(-)
 delete mode 100644 bin/utils/python_scripts/convert-json-to-txt.py
 create mode 100755 bin/utils/python_scripts/convert_json_to_txt.py
 create mode 100755 bin/utils/python_scripts/create_jsons_from_event_csv.py
 create mode 100644 bin/utils/python_scripts/filepath_settings.py
 create mode 100755 bin/utils/python_scripts/init.py
 create mode 100644 bin/utils/python_scripts/input_csv_files/dummy.txt
 create mode 100644 bin/utils/python_scripts/output_csv_files/dummy.txt
 create mode 100644 bin/utils/python_scripts/output_jsons/dummy.txt
 create mode 100755 bin/utils/python_scripts/prep_import_data.py
 mode change 100644 => 100755 bin/utils/python_scripts/redact_info_from_json.py
 create mode 100644 bin/utils/python_scripts/tests.py

diff --git a/bin/utils/python_scripts/convert-json-to-txt.py b/bin/utils/python_scripts/convert-json-to-txt.py
deleted file mode 100644
index fe431a560a..0000000000
--- a/bin/utils/python_scripts/convert-json-to-txt.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import json
-import os
-
-# Your JSON data (example) 1677498210980054-data-annotated.json
-input_file = '/Users/jacobcohensolirius/HMCTS/IA/ia-case-api/bin/utils/python_scripts/SNi_tickets/SNI-5296/latest_data_class.json'
-base_name, extension = os.path.splitext(input_file)
-output_filename = f'{base_name}.txt'
-
-# Read JSON data from the input file
-with open(input_file, "r") as json_file:
-    json_data = json.load(json_file)
-
-# Convert JSON data to a compact text representation
-text_data = json.dumps(json_data, separators=(',', ':'))
-
-# Write the compact text data to a text file
-with open(output_filename, "w") as text_file:
-    text_file.write(text_data)
diff --git a/bin/utils/python_scripts/convert_json_to_txt.py b/bin/utils/python_scripts/convert_json_to_txt.py
new file mode 100755
index 0000000000..c32bfdab65
--- /dev/null
+++ b/bin/utils/python_scripts/convert_json_to_txt.py
@@ -0,0 +1,34 @@
+import json
+import os
+
+
+def convert_json_to_txt(file_path):
+    """Write a compact, single-line copy of a JSON file next to the original.
+
+    The output path is ``file_path`` with its final extension replaced by
+    ``.txt``, e.g. ``1677498210980054-data-annotated.json`` becomes
+    ``1677498210980054-data-annotated.txt``.
+
+    Args:
+        file_path: Path of the JSON file to convert.
+
+    Returns:
+        The path of the ``.txt`` file that was written.
+    """
+    with open(file_path, "r") as json_file:
+        json_data = json.load(json_file)
+
+    # No whitespace after separators gives the most compact representation.
+    text_data = json.dumps(json_data, separators=(',', ':'))
+
+    # splitext strips only the final extension, so dots in a directory name
+    # ("./out/event_1.json") or in the file stem ("case.v2.json") survive.
+    output_file = os.path.splitext(file_path)[0] + ".txt"
+
+    with open(output_file, "w") as text_file:
+        text_file.write(text_data)
+
+    return output_file
+
+
+# convert_json_to_txt("event_1.json")
diff --git a/bin/utils/python_scripts/create_jsons_from_event_csv.py b/bin/utils/python_scripts/create_jsons_from_event_csv.py
new file mode 100755
index 0000000000..dbf7519595
--- /dev/null
+++ b/bin/utils/python_scripts/create_jsons_from_event_csv.py
@@ -0,0 +1,49 @@
+import csv
+import json
+import os
+
+from filepath_settings import settings
+
+
+def create_jsons_from_csv(csv_file, events: list[int] = None, output_dir_name_suffix: str = 'latest'):
+    """Write the ``data`` column of selected CSV rows to individual JSON files.
+
+    Rows are numbered from 1 in file order; row N is written as
+    ``event_N.json`` into the directory returned by ``make_output_dir``.
+
+    Args:
+        csv_file: Path of the CSV file to read.
+        events: 1-based row numbers to export. ``None`` (the default) exports
+            every row.
+        output_dir_name_suffix: Suffix passed to ``make_output_dir``.
+    """
+    # newline='' lets the csv module handle newlines inside quoted fields.
+    with open(csv_file, 'r', newline='') as file:
+        dir_name = make_output_dir(output_dir_name_suffix)
+        for event_number, row in enumerate(csv.DictReader(file), start=1):
+            # ``events`` defaults to None, and a membership test on None raises
+            # TypeError, so None is treated as "no filter".
+            if events is not None and event_number not in events:
+                continue
+            if 'data' not in row:
+                continue
+            data = json.loads(row['data'])
+            full_filepath = os.path.join(dir_name, f"event_{event_number}.json")
+            with open(full_filepath, 'w') as json_file:
+                json.dump(data, json_file, indent=2)
+            os.chmod(full_filepath, 0o777)
+
+
+def make_output_dir(case_name: str) -> str:
+    """Create ``output_jsons_<case_name>`` under the JSON output directory.
+
+    Returns:
+        The full path of the directory, which is left with mode 0o777.
+    """
+    full_filepath = os.path.join(settings.output_json_directory, f'output_jsons_{case_name}')
+    # exist_ok makes repeated runs for the same case reuse the directory.
+    os.makedirs(full_filepath, exist_ok=True)
+    os.chmod(full_filepath, 0o777)
+    return full_filepath
+
+# create_jsons_from_csv('case_event_202402071630.csv', events=list(range(1, 7)), output_dir_name_suffix='5405')
diff --git a/bin/utils/python_scripts/filepath_settings.py b/bin/utils/python_scripts/filepath_settings.py
new file mode 100644
index 0000000000..117466143d
--- /dev/null
+++ b/bin/utils/python_scripts/filepath_settings.py
@@ -0,0 +1,113 @@
+import os
+
+# Every directory the scripts read from or write to lives next to this file.
+PROJECT_DIR = os.path.dirname(__file__)
+PYTHON_SCRIPTS_DIR = os.path.join(PROJECT_DIR)
+EXPORTED_CSV_INPUT_DIR = os.path.join(PYTHON_SCRIPTS_DIR, 'input_csv_files')
+OUTPUT_CSV_DIR = os.path.join(PYTHON_SCRIPTS_DIR, "output_csv_files")
+OUTPUT_JSON_DIRECTORY = os.path.join(PYTHON_SCRIPTS_DIR, "output_jsons")
+
+
+class Settings:
+    """Directory locations and redaction mappings shared by the scripts."""
+
+    project_dir = PROJECT_DIR
+    scripts_dir = PYTHON_SCRIPTS_DIR
+    exported_csv_dir = EXPORTED_CSV_INPUT_DIR
+    output_csv_dir = OUTPUT_CSV_DIR
+    output_json_directory = OUTPUT_JSON_DIRECTORY
+
+    # Field name -> replacement value. Keys must be lowercase:
+    # get_replace_term (redact_info_from_json.py) lowercases the field name
+    # before looking it up here, so a mixed-case key is never found and the
+    # field falls back to the generic 'redacted' instead of its replacement.
+    replace_mapping_dict = {
+        'email': 'email@email.com',
+        'postcode': 'N11 1yz',
+        'dateofbirth': '1980-01-01',
+        'document_filename': 'redacted.pdf',
+        'document_url': 'http://dm-store-aat.service.core-compute-aat.internal/documents/9ce3f9d5-31ef-4021-9aa5-4d017c404cfe',
+        'document_binary_url': 'http://dm-store-aat.service.core-compute-aat.internal/documents/9ce3f9d5-31ef-4021-9aa5-4d017c404cfe/binary',
+        'name': 'redacted.pdf',
+        'filename': 'redacted.pdf',
+        'addressline1': '10 street',
+        'addressline2': 'town',
+        'addressline3': 'city',
+        'attendingjudge': 'redacted',
+        'feedescription': 'redacted',
+        'searchpostcode': 'n11 1yz',
+        'posttown': 'town',
+        'details': 'redacted',
+        'decisionreason': 'redacted',
+        'appellantfamilyname': 'redacted',
+        'appellantgivennames': 'redacted',
+        'appellantdateofbirth': '1980-01-01',
+        'appellantemailaddress': 'email@email.com',
+        'casenotedescription': 'redacted',
+        'casenotesubject': 'redacted',
+        'user': 'redacted',
+        'question': 'redacted',
+        'explanation': 'redacted',
+        'address': 'redacted',
+        'bundlefilenameprefix': 'EA 50111 2023',
+        'casenamehmctsinternal': 'redacted',
+        'hmctscasenameinternal': 'redacted',
+        'appellantnamefordisplay': 'redacted',
+        'reasonsforappealdecision': 'redacted',
+        'description': 'redacted',
+        'answer': 'redacted',
+        "legalrepname": "redacted",
+        "mobilenumber": "07451111111",
+        "legalrepcompany": "redacted",
+        "partyname": "redacted",
+        "legalrepcompanyname": "redacted",
+        "county": "redacted",
+        "legalrepresentativename": "redacted",
+        "legalrepreferencenumber": "AA/1234",
+        "directioneditexplanation": 'redacted',
+        "fullname": "redacted",
+        "dayofbirth": 1,
+        "familyname": "redacted",
+        "yearofbirth": 1980,
+        "monthofbirth": 1,
+        "displaydateofbirth": "10 Jan 1980",
+        "documentreference": "012345678",
+        "displayappellantdetailstitle": "redacted",
+        "displayapplicationdetailstitle": "redacted",
+        "homeofficesearchresponse": 'redacted',
+        "remotevideocalldescription": 'redacted',
+        "hearingdaterangedescription": 'redacted',
+        "interpreterlanguagereadonly": "Language\t\tEnglish\nDialect\t\t\tENG",
+        "legalrepresentativeemailaddress": "email@email.co.uk",
+        "remotevideocalltribunalresponse": "redacted",
+        "appellantnationalitiesdescription": "France",
+        "language": "English",
+        "languagedialect": "redacted",
+        "legalaidaccountnumber": "OG123V1",
+        "appellantphonenumber": "07451111111",
+        "givenname": "redacted",
+        "data": "redacted",
+        "witnessname": "redacted",
+        "witnessdetailsreadonly": "redacted",
+        "multimediatribunalresponse": "redacted",
+        "appellantfullname": "redacted",
+        "endappealapprovername": "redacted",
+        "datetoavoidreason": "redacted",
+        "sponsornamefordisplay": "redacted",
+        "sponsoraddressfordisplay": "10 street",
+        "sponsormobilenumber": "07451111111",
+        "Name": "redacted",  # NOTE: never matched; lookups are lowercased, so the 'name' entry above applies
+        "attendingappellant": "redacted",
+        "attendinghomeofficelegalrepresentative": "redacted",
+        "attendingappellantslegalrepresentative": "redacted",
+        "newmatters": "redacted"
+    }
+
+    # CSV column names whose values are replaced wholesale.
+    csv_rows_to_redact = {
+        "user_first_name": "redacted",
+        "user_last_name": "redacted"
+    }
+
+
+settings = Settings()
diff --git a/bin/utils/python_scripts/init.py b/bin/utils/python_scripts/init.py
new file mode 100755
index 0000000000..e69de29bb2
diff --git a/bin/utils/python_scripts/input_csv_files/dummy.txt b/bin/utils/python_scripts/input_csv_files/dummy.txt
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/bin/utils/python_scripts/output_csv_files/dummy.txt b/bin/utils/python_scripts/output_csv_files/dummy.txt
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/bin/utils/python_scripts/output_jsons/dummy.txt b/bin/utils/python_scripts/output_jsons/dummy.txt
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/bin/utils/python_scripts/prep_import_data.py b/bin/utils/python_scripts/prep_import_data.py
new file mode 100755
index 0000000000..01422c1e7e
--- /dev/null
+++ b/bin/utils/python_scripts/prep_import_data.py
@@ -0,0 +1,87 @@
+import csv
+import os
+import re
+
+from filepath_settings import settings
+from create_jsons_from_event_csv import create_jsons_from_csv
+from redact_info_from_json import redact_values_from_csv
+
+
+def prep_import_data(directory: str = settings.exported_csv_dir, events_to_get_individual_json: list[int] = None):
+    """
+    Function to prep exported CSV data for importing. Redacts and transforms most recent exported files within
+    ``directory`` (the input CSV directory by default).
+
+    Exported files should be in format: case_event_202402080516.csv and case_data_202402080518.csv
+    (default export pattern)
+
+    Prompts on stdin for the id the redacted case data received when it was imported, then writes that id into
+    the redacted case event CSV.
+
+    Args:
+        directory: Directory holding the exported CSV files.
+        events_to_get_individual_json: 1-based case event rows to also dump as individual JSON files;
+            nothing is dumped when empty or None.
+    """
+    latest_case_event_data = get_latest_file(directory, 'case_event')
+    latest_case_data = get_latest_file(directory, 'case_data')
+    if events_to_get_individual_json:
+        create_jsons_from_csv(latest_case_event_data, events=events_to_get_individual_json)
+    redacted_case_data = redact_values_from_csv(latest_case_data)
+    os.chmod(redacted_case_data, 0o777)
+    # strip(): whitespace typed or pasted around the id would otherwise be written into every case_data_id cell.
+    case_data_id = input(
+        'Import the redacted case data CSV and retrieve the correct case id.\nEnter the new case data id:').strip()
+    redacted_case_event_data = redact_values_from_csv(latest_case_event_data)
+    redacted_replaced_event_data = replace_case_data_id(case_data_id, redacted_case_event_data)
+    os.chmod(redacted_replaced_event_data, 0o777)
+
+
+def get_latest_file(dir_path: str, file_prefix: str) -> str:
+    """Return the path of the newest ``<file_prefix>_<timestamp>.csv`` in ``dir_path``.
+
+    The timestamp is the 12-digit suffix of the default export name (e.g. ``case_event_202402080516.csv``);
+    the file with the largest timestamp wins.
+
+    Raises:
+        FileNotFoundError: If no file in ``dir_path`` matches the pattern.
+    """
+    # Match the whole file name so look-alikes (other extensions, extra suffixes) are skipped, not half-parsed.
+    pattern = re.compile(rf'{re.escape(file_prefix)}_(\d{{12}})\.csv')
+    matches = (pattern.fullmatch(name) for name in os.listdir(dir_path))
+    # Equal-length digit strings sort the same lexicographically and numerically.
+    timestamps = [match.group(1) for match in matches if match]
+    if not timestamps:
+        raise FileNotFoundError(f"No '{file_prefix}_<timestamp>.csv' file found in {dir_path}")
+    return os.path.join(dir_path, f"{file_prefix}_{max(timestamps)}.csv")
+
+
+def replace_case_data_id(new_id: str, file_path: str):
+    """Overwrite every ``case_data_id`` value in a CSV file with ``new_id``.
+
+    The file is rewritten in place.
+
+    Returns:
+        ``file_path``, unchanged.
+    """
+    # newline='' on the read as well as the write, per the csv module docs, so newlines inside quoted fields
+    # round-trip unchanged.
+    with open(file_path, 'r', newline='') as file:
+        reader = csv.DictReader(file)
+        data_list = list(reader)
+        fieldnames = reader.fieldnames
+
+    for row in data_list:
+        if 'case_data_id' in row:
+            row['case_data_id'] = new_id
+
+    with open(file_path, 'w', newline='') as file:
+        writer = csv.DictWriter(file, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(data_list)
+    return file_path
+
+
+# Guarded so importing this module does not start the interactive run.
+if __name__ == '__main__':
+    prep_import_data(events_to_get_individual_json=range(1, 13))
diff --git a/bin/utils/python_scripts/redact_info_from_json.py b/bin/utils/python_scripts/redact_info_from_json.py
old mode 100644
new mode 100755
index 2b085f93e9..7da34aa4c5
--- a/bin/utils/python_scripts/redact_info_from_json.py
+++ b/bin/utils/python_scripts/redact_info_from_json.py
@@ -2,109 +2,32 @@
 import json
 import csv
 
+from filepath_settings import settings
+
 """
 Script for redacting data from a case data JSON or CSV.
 
 Usage: 
 
 Replace filepath with absolute filepath of JSON/CSV requiring redacting at the bottom of this file where 
-desired function is called (comment out other if not needed).
+desired function is called.
 
-Run python bin/utils/python_scripts/redact_info_from_json.py while in ia/case/api directory
- 
 Script will output redacted JSON/CSV file in the same directory as original
 with '_redacted' suffix.
 
 Notes:
- 
-Fields to be redacted are hardcoded in replace mapping dict, so use with caution and double check redacted JSON 
-(I suggest comparing two files in IDE) as it's only been tested on a few cases' data.
 
-Add any additionally required fields (in full lowercase) to be redacted in replace_mapping_dict with their corresponding
+Fields to be redacted are hardcoded in replace mapping dict in filepath_settings.py, so use with caution and double 
+check redacted JSON (I suggest comparing two files in IDE) as it's only been tested on a few cases' data.
+
+Add any additionally required fields to be redacted in replace_mapping_dict with their corresponding
 replace data
 
 Replaces all document filenames so if testing any specifics you may want to manually change the filenames
 """
 
-replace_mapping_dict = {
-    'email': 'email@email.com',
-    'postcode': 'N11 1yz',
-    'dateofbirth': '1980-01-01',
-    'document_filename': 'redacted.pdf',
-    'document_url': 'http://dm-store-aat.service.core-compute-aat.internal/documents/9ce3f9d5-31ef-4021-9aa5-4d017c404cfe',
-    'document_binary_url': 'http://dm-store-aat.service.core-compute-aat.internal/documents/9ce3f9d5-31ef-4021-9aa5-4d017c404cfe/binary',
-    'name': 'redacted.pdf',
-    'filename': 'redacted.pdf',
-    'addressline1': '10 street',
-    'addressline2': 'town',
-    'addressline3': 'city',
-    'attendingjudge': 'redacted',
-    'feedescription': 'redacted',
-    'searchpostcode': 'n11 1yz',
-    'posttown': 'town',
-    'details': 'redacted',
-    'decisionreason': 'redacted',
-    'appellantfamilyname': 'redacted',
-    'appellantgivennames': 'redacted',
-    'appellantdateofbirth': '1980-01-01',
-    'appellantemailaddress': 'email@email.com',
-    'casenotedescription': 'redacted',
-    'casenotesubject': 'redacted',
-    'user': 'redacted',
-    'question': 'redacted',
-    'explanation': 'redacted',
-    'address': 'redacted',
-    'bundlefilenameprefix': 'EA 50111 2023',
-    'casenamehmctsinternal': 'redacted',
-    'hmctscasenameinternal': 'redacted',
-    'appellantnamefordisplay': 'redacted',
-    'reasonsforappealdecision': 'redacted',
-    'description': 'redacted',
-    'answer': 'redacted',
-    "legalrepname": "redacted",
-    "mobilenumber": "07451111111",
-    "legalrepcompany": "redacted",
-    "partyname": "redacted",
-    "legalrepcompanyname": "redacted",
-    "county": "redacted",
-    "legalrepresentativename": "redacted",
-    "legalrepreferencenumber": "AA/1234",
-    "directioneditexplanation": 'redacted',
-    "fullname": "redacted",
-    "dayofbirth": 1,
-    "familyname": "redacted",
-    "yearofbirth": 1980,
-    "monthofbirth": 1,
-    "displaydateofbirth": "10 Jan 1980",
-    "documentreference": "012345678",
-    "displayappellantdetailstitle": "redacted",
-    "displayapplicationdetailstitle": "redacted",
-    "homeofficesearchresponse": 'redacted',
-    "remotevideocalldescription": 'redacted',
-    "hearingdaterangedescription": 'redacted',
-    "interpreterlanguagereadonly": "Language\t\tEnglish\nDialect\t\t\tENG",
-    "legalrepresentativeemailaddress": "email@email.co.uk",
-    "remotevideocalltribunalresponse": "redacted",
-    "appellantnationalitiesdescription": "France",
-    "language": "English",
-    "languagedialect": "redacted",
-    "legalaidaccountNumber": "OG123V1",
-    "appellantPhoneNumber": "07451111111",
-    "givenName": "redacted",
-    "appellantfullname": "redacted",
-    "endappealapprovername": "redacted",
-    
-    # Refactor to separate list for CSV rows
-    "data": "redacted",
-}
-
-csv_rows_to_redact = {
-    "user_first_name": "redacted",
-    "user_last_name": "redacted"
-}
-
-replace_mapping_keys = list(replace_mapping_dict.keys())
-replace_csv_mapping_keys = list(csv_rows_to_redact.keys())
+replace_mapping_keys = [field.lower() for field in settings.replace_mapping_dict]  # lowercased for case-insensitive matching
+replace_csv_mapping_keys = [column.lower() for column in settings.csv_rows_to_redact]  # same, for CSV column names
 
 
 def redact_values_from_json(file_path, keys_to_redact):
@@ -131,11 +54,13 @@ def redact_values(json_data, keys_to_redact):
             redact_values(item, keys_to_redact)
 
 
-def redact_values_from_csv(input_file_path, keys_to_redact):
+def redact_values_from_csv(input_file_path, keys_to_redact = replace_mapping_keys) -> str:
     with open(input_file_path, 'r', newline='') as input_file:
         reader = csv.DictReader(input_file)
         rows = list(reader)
+
     redact_csv_rows(rows, keys_to_redact)
+
     output_file_path = get_redacted_file_path(input_file_path)
 
     with open(output_file_path, 'w', newline='') as output_file:
@@ -143,6 +68,7 @@ def redact_values_from_csv(input_file_path, keys_to_redact):
         writer = csv.DictWriter(output_file, fieldnames=fieldnames)
         writer.writeheader()
         writer.writerows(rows)
+    return output_file_path
 
 
 def redact_csv_rows(rows, keys_to_redact):
@@ -155,7 +81,6 @@ def redact_csv_rows(rows, keys_to_redact):
                     redact_values(json_value, keys_to_redact)
                     row[key] = json.dumps(json_value)
                 except json.JSONDecodeError:
-                    print(f'Not a valid json within CSV field: {key}')
                     pass
             elif key.lower() in replace_csv_mapping_keys:
                 row[key] = 'redacted'
@@ -164,17 +89,32 @@ def redact_csv_rows(rows, keys_to_redact):
 
 
 def get_redacted_file_path(original_file_path):
+    """Return the path the redacted copy of ``original_file_path`` is written to.
+
+    A file sitting directly in the configured input CSV directory is sent to
+    the configured output CSV directory; any other file stays in its own
+    directory. ``_redacted`` is inserted before the file extension.
+    """
+    # Swap only the immediate parent directory. A blanket
+    # str.replace("input", "output") also rewrites the file name and every
+    # ancestor directory containing "input", yielding a path that may not exist.
+    directory, file_name = os.path.split(original_file_path)
+    parent_dir, dir_name = os.path.split(directory)
+    if dir_name == os.path.basename(settings.exported_csv_dir):
+        directory = os.path.join(parent_dir, os.path.basename(settings.output_csv_dir))
+    original_file_path = os.path.join(directory, file_name)
     base_name, extension = os.path.splitext(original_file_path)
     return f"{base_name}_redacted{extension}"
 
 
 def get_replace_term(key):
     key = key.lower()
-    return replace_mapping_dict.get(key, 'redacted')
+    return settings.replace_mapping_dict.get(key, 'redacted')
 
 
 # redact_values_from_json(
-#     '/Users/jacobcohensolirius/HMCTS/IA/ia-case-api/bin/utils/python_scripts/SNi_tickets/SNI-5296/latest_data.json', replace_mapping_keys
+#     'latest_data.json', replace_mapping_keys
 # )
 
-redact_values_from_csv('/Users/jacobcohensolirius/HMCTS/IA/ia-case-api/bin/utils/python_scripts/SNi_tickets/SNI_5340/case_event_202401261547.csv', replace_mapping_keys)
+# redact_values_from_csv(
+#     'case_event_202401261547.csv')
diff --git a/bin/utils/python_scripts/tests.py b/bin/utils/python_scripts/tests.py
new file mode 100644
index 0000000000..cedf4d5424
--- /dev/null
+++ b/bin/utils/python_scripts/tests.py
@@ -0,0 +1,46 @@
+import unittest
+import json
+import os
+import shutil
+
+from convert_json_to_txt import convert_json_to_txt
+
+
+class TestConvertJsonToTxt(unittest.TestCase):
+    """Tests for convert_json_to_txt."""
+
+    def setUp(self):
+        # Scratch directory, relative to the current working directory.
+        # Deliberately no exist_ok=True: tearDown deletes this directory
+        # recursively, so adopting a directory that already exists would
+        # destroy files the test did not create.
+        self.test_dir = "test_temp_dir"
+        os.makedirs(self.test_dir)
+
+    def tearDown(self):
+        # Remove the scratch directory and everything the test wrote into it.
+        shutil.rmtree(self.test_dir)
+
+    def test_convert_json_to_txt(self):
+        """The .txt output sits next to the input and holds compact JSON."""
+        json_data = {'name': 'John', 'age': 30, 'city': 'New York'}
+
+        input_file_path = os.path.join(self.test_dir, "input.json")
+        with open(input_file_path, "w") as json_file:
+            json.dump(json_data, json_file)
+
+        convert_json_to_txt(input_file_path)
+
+        output_file_path = os.path.join(self.test_dir, "input.txt")
+        self.assertTrue(os.path.exists(output_file_path))
+
+        with open(output_file_path, "r") as text_file:
+            text_data = text_file.read()
+
+        # Compact form: no whitespace between JSON tokens.
+        expected_text_data = '{"name":"John","age":30,"city":"New York"}'
+        self.assertEqual(text_data, expected_text_data)
+
+
+if __name__ == '__main__':
+    unittest.main()