EM: Edit step function to include athena log validation #6604

Merged · 49 commits · Jun 19, 2024
Commits
f4c687d
add new step function config
matt-heery Jun 14, 2024
f564e5f
finalising the step func
matt-heery Jun 14, 2024
c7d8a70
correct structure
matt-heery Jun 14, 2024
3a0e48e
comma!
matt-heery Jun 14, 2024
df50d05
jsonencoding
matt-heery Jun 14, 2024
0676ad9
removing constantly repeating tf
matt-heery Jun 14, 2024
7fceb4c
small edits to get athena working
matt-heery Jun 14, 2024
d288257
strings
matt-heery Jun 14, 2024
5e75509
iam update and context
matt-heery Jun 14, 2024
ce6b237
new lambda to transform output
matt-heery Jun 14, 2024
5e7c2cb
s3 bucket perms
matt-heery Jun 14, 2024
d0f670d
partitions
matt-heery Jun 14, 2024
f6270e4
remove result set
matt-heery Jun 14, 2024
d9751c5
id to arn
matt-heery Jun 14, 2024
8c2b276
small changes
matt-heery Jun 14, 2024
336e823
logging
matt-heery Jun 14, 2024
572141f
copy paste
matt-heery Jun 14, 2024
b014594
string
matt-heery Jun 14, 2024
8bec685
adding more falses
matt-heery Jun 14, 2024
acfe00c
update step func
matt-heery Jun 17, 2024
660e8b5
updating bucket version
matt-heery Jun 17, 2024
056e0e7
test in dev
matt-heery Jun 17, 2024
a053698
CI/CD database naming
matt-heery Jun 17, 2024
7cde83d
selector for dev
matt-heery Jun 17, 2024
8976449
changing glue job to include date
matt-heery Jun 17, 2024
829da47
adding new col in glue job
matt-heery Jun 18, 2024
ff0347b
glue fix
matt-heery Jun 18, 2024
314cd08
update step func
matt-heery Jun 18, 2024
1b4457f
add iam perms
matt-heery Jun 18, 2024
3143455
making pandas available
matt-heery Jun 18, 2024
57b3f92
containerise
matt-heery Jun 18, 2024
48987d4
make this as straightforward as possible
matt-heery Jun 18, 2024
0c38031
arn64
matt-heery Jun 18, 2024
6ddcfd6
removing timeout and handler
matt-heery Jun 18, 2024
270a5d0
iam update
matt-heery Jun 18, 2024
ccf55d1
arnarnarn
matt-heery Jun 18, 2024
fe26fe4
minor updates
matt-heery Jun 18, 2024
af264fa
small tweaks to lambda
matt-heery Jun 18, 2024
8c214dc
table to ap??
matt-heery Jun 18, 2024
b6f49b4
to to to t
matt-heery Jun 18, 2024
b458807
removing commented lines and adding locals
matt-heery Jun 18, 2024
0b90b4a
Merge branch 'main' into edit-step-function-athena-validation
matt-heery Jun 18, 2024
c49e27b
adding madhu old change after i removed
matt-heery Jun 19, 2024
53b7890
small tweak to glue job
matt-heery Jun 19, 2024
e6af545
minor updates to PR
matt-heery Jun 19, 2024
e2877f8
lambdas update
matt-heery Jun 19, 2024
e306d31
Merge branch 'main' into edit-step-function-athena-validation
matt-heery Jun 19, 2024
2058cf2
update glue job w date null value
matt-heery Jun 19, 2024
9e11330
remove csv value
matt-heery Jun 19, 2024
Diff view
@@ -1,7 +1,10 @@
module "dms_task" {
source = "./modules/dms"

for_each = toset(var.database_list)
for_each = toset(local.is-production? [
"g4s_cap_dw",
"g4s_emsys_mvp"
] : ["test"])

database_name = each.key

@@ -0,0 +1,17 @@
{
"rules": [
{
"rule-type": "selection",
"rule-id": "01",
"rule-name": "all",
"object-locator": {
"schema-name": "%",
"table-name": "%"
},
"rule-action": "include",
"filters": [],
"parallel-load": null,
"isAutoSegmentationChecked": false
}
]
}
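This selection-rules document includes every schema and table ("%" locators, no filters). Purely as a hedged illustration of where such a document ends up (the repo wires it through its Terraform dms module; the ARNs and identifiers below are placeholders), DMS expects the rules JSON as the TableMappings string of a replication task:

# Illustration only: how a selection-rules document like the one above is supplied
# to a DMS replication task. ARNs and identifiers are placeholders.
import boto3

with open("table_mappings.json") as f:  # hypothetical local copy of the rules above
    table_mappings = f.read()

dms = boto3.client("dms")
dms.create_replication_task(
    ReplicationTaskIdentifier="example-full-load-task",
    SourceEndpointArn="arn:aws:dms:eu-west-2:111111111111:endpoint:source",
    TargetEndpointArn="arn:aws:dms:eu-west-2:111111111111:endpoint:target",
    ReplicationInstanceArn="arn:aws:dms:eu-west-2:111111111111:rep:instance",
    MigrationType="full-load",
    TableMappings=table_mappings,  # the JSON rules document, passed as a string
)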
@@ -1,13 +1,3 @@
# This variable needs to be supplied with the list of database names to be migrated
variable "database_list" {
type = list(string)
# cap_dw
default = [
"g4s_cap_dw",
"g4s_emsys_mvp"
]
}

variable "dms_replication_instance_class" {
description = "Name of the replication instance class to be used"
type = string
@@ -0,0 +1,4 @@
module "ecr_lambda_repo" {
source = "./modules/ecr"
ecr_name = "lambdas/update_log_table"
}
@@ -74,7 +74,8 @@ def has_query_succeeded(execution_id):
CREATE EXTERNAL TABLE IF NOT EXISTS `{GLUE_CATALOG_DB_NAME}`.`{GLUE_CATALOG_TBL_NAME}`(
`run_datetime` timestamp,
`json_row` string,
`validation_msg` string)
`validation_msg` string,
`table_to_ap` string)
PARTITIONED BY (
`database_name` string,
`full_table_name` string)
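The hunk above extends the CREATE EXTERNAL TABLE statement for the validation log with a table_to_ap column; the surrounding helper, has_query_succeeded, implies the script polls Athena until each query finishes. A generic boto3 sketch of that polling pattern (not this repository's implementation) is:

# Generic sketch of Athena query polling, for context only; not code from this PR.
import time
import boto3

athena = boto3.client("athena")

def has_query_succeeded(execution_id, max_attempts=30):
    """Poll an Athena query execution and return True only if it reaches SUCCEEDED."""
    for _ in range(max_attempts):
        state = athena.get_query_execution(QueryExecutionId=execution_id)[
            "QueryExecution"]["Status"]["State"]
        if state in ("SUCCEEDED", "FAILED", "CANCELLED"):
            return state == "SUCCEEDED"
        time.sleep(2)
    return False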
@@ -96,7 +96,7 @@ def resolve_args(args_list):

NVL_DTYPE_DICT = {'string': "''", 'int': 0, 'double': 0, 'float': 0, 'smallint': 0, 'bigint':0,
'boolean': False,
'timestamp': "to_timestamp('1900-01-01', 'yyyy-MM-dd')"}
'timestamp': "to_timestamp('1900-01-01', 'yyyy-MM-dd')", 'date': "to_date('1900-01-01', 'yyyy-MM-dd')"}

# ===============================================================================
# USER-DEFINED-FUNCTIONS
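The hunk above adds a 'date' entry to NVL_DTYPE_DICT so date columns can be given a typed placeholder rather than a raw null. Purely to illustrate how such a dtype-to-default map can drive null handling in Spark SQL (the helper and example columns below are hypothetical, not taken from this script):

# Hypothetical illustration of applying a dtype-to-default map in Spark SQL; the
# helper name and example columns are not from this script.
NVL_DTYPE_DICT = {'string': "''", 'int': 0, 'double': 0, 'float': 0, 'smallint': 0,
                  'bigint': 0, 'boolean': False,
                  'timestamp': "to_timestamp('1900-01-01', 'yyyy-MM-dd')",
                  'date': "to_date('1900-01-01', 'yyyy-MM-dd')"}

def nvl_select_exprs(columns):
    """Build "nvl(col, default) as col" expressions for (name, dtype) pairs."""
    return [f"nvl({name}, {NVL_DTYPE_DICT[dtype]}) as {name}" for name, dtype in columns]

# Example:
# nvl_select_exprs([("order_date", "date"), ("amount", "double")])
# -> ["nvl(order_date, to_date('1900-01-01', 'yyyy-MM-dd')) as order_date",
#     "nvl(amount, 0) as amount"]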
@@ -311,7 +311,8 @@ def process_dv_for_table(rds_db_name, rds_tbl_name, total_files, total_size_mb,
cast(null as string) as json_row,
cast(null as string) as validation_msg,
cast(null as string) as database_name,
cast(null as string) as full_table_name
cast(null as string) as full_table_name,
cast(null as string) as table_to_ap
""".strip()

df_dv_output = spark.sql(sql_select_str).repartition(input_repartition_factor)
@@ -381,7 +382,8 @@ def process_dv_for_table(rds_db_name, rds_tbl_name, total_files, total_size_mb,
"'' as json_row",
f"""'{rds_tbl_name} - Validated.{additional_message}' as validation_msg""",
f"""'{rds_db_name}' as database_name""",
f"""'{db_sch_tbl}' as full_table_name"""
f"""'{db_sch_tbl}' as full_table_name""",
"""'False' as table_to_ap"""
)
LOGGER.info(f"Validation Successful - 1")
df_dv_output = df_dv_output.union(df_temp)
@@ -396,7 +398,8 @@ def process_dv_for_table(rds_db_name, rds_tbl_name, total_files, total_size_mb,
"json_row",
validation_msg,
f"""'{rds_db_name}' as database_name""",
f"""'{db_sch_tbl}' as full_table_name"""
f"""'{db_sch_tbl}' as full_table_name""",
"""'False' as table_to_ap"""
)
LOGGER.warn(f"Validation Failed - 2")
df_dv_output = df_dv_output.union(df_temp)
@@ -408,7 +411,8 @@ def process_dv_for_table(rds_db_name, rds_tbl_name, total_files, total_size_mb,
"'' as json_row",
validation_msg,
f"""'{rds_db_name}' as database_name""",
f"""'{db_sch_tbl}' as full_table_name"""
f"""'{db_sch_tbl}' as full_table_name""",
"""'False' as table_to_ap"""
)
LOGGER.warn(f"Validation Failed - 3")
df_dv_output = df_dv_output.union(df_temp)
@@ -423,7 +427,8 @@ def process_dv_for_table(rds_db_name, rds_tbl_name, total_files, total_size_mb,
"'' as json_row",
f"""'{db_sch_tbl} - S3-Parquet folder path does not exist !' as validation_msg""",
f"""'{rds_db_name}' as database_name""",
f"""'{db_sch_tbl}' as full_table_name"""
f"""'{db_sch_tbl}' as full_table_name""",
"""'False' as table_to_ap"""
)
LOGGER.warn(f"Validation not applicable - 4")
df_dv_output = df_dv_output.union(df_temp)
@@ -482,19 +487,24 @@ def write_parquet_to_s3(df_dv_output: DataFrame, database, table):

# -------------------------------------------------------
if args.get("select_rds_db_tbls", None) is None:

exclude_rds_db_tbls_list = [f"""{args['rds_sqlserver_db']}_{given_rds_sqlserver_db_schema}_{tbl.strip().strip("'").strip('"')}"""
for tbl in args['exclude_rds_db_tbls'].split(",")]
LOGGER.warn(f"""Given list of tables being exluded:\n{exclude_rds_db_tbls_list}""")

filtered_rds_sqlserver_db_tbl_list = [tbl for tbl in rds_sqlserver_db_tbl_list

if args.get("exclude_rds_db_tbls", None) is None:
exclude_rds_db_tbls_list = list()
else:
exclude_rds_db_tbls_list = [f"""{args['rds_sqlserver_db']}_{given_rds_sqlserver_db_schema}_{tbl.strip().strip("'").strip('"')}"""
for tbl in args['exclude_rds_db_tbls'].split(",")]
LOGGER.warn(f"""Given list of tables being exluded:\n{exclude_rds_db_tbls_list}""")
filtered_rds_sqlserver_db_tbl_list = [tbl for tbl in rds_sqlserver_db_tbl_list
if tbl not in exclude_rds_db_tbls_list]

if not filtered_rds_sqlserver_db_tbl_list:
LOGGER.error(f"""filtered_rds_sqlserver_db_tbl_list - is empty. Exiting ...!""")
LOGGER.error(
f"""filtered_rds_sqlserver_db_tbl_list - is empty. Exiting ...!""")
sys.exit(1)
else:
LOGGER.info(f"""List of tables to be processed: {filtered_rds_sqlserver_db_tbl_list}""")
LOGGER.info(
f"""List of tables to be processed: {filtered_rds_sqlserver_db_tbl_list}""")


for db_sch_tbl in filtered_rds_sqlserver_db_tbl_list:
rds_db_name, rds_tbl_name = db_sch_tbl.split(f"_{given_rds_sqlserver_db_schema}_")[0], \
@@ -145,8 +145,7 @@ resource "aws_iam_policy" "glue_user_restricted_notebook_service_role_iam_policy
],
"Resource": [
"arn:aws:iam::*:role/service-role/AwsGlueSessionServiceRoleUserRestrictedForNotebook*",
"arn:aws:iam::976799291502:role/${aws_iam_role.glue_notebook_iam_role.name}",
"arn:aws:iam::800964199911:role/${aws_iam_role.glue_notebook_iam_role.name}"
"arn:aws:iam::${local.env_account_id}:role/${aws_iam_role.glue_notebook_iam_role.name}"
],
"Condition": {
"StringLike": {
Binary file not shown.
@@ -0,0 +1,4 @@
def handler(event, context):
    # Rows[0] is the header row Athena returns with query results, so skip it
    data = event["queryOutput"]["ResultSet"]["Rows"][1:]
    output_list = [{row["Data"][0]["VarCharValue"]: row["Data"][1]["VarCharValue"]} for row in data]
    return output_list
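This new Lambda reshapes the Athena GetQueryResults payload passed in by the state machine under queryOutput into a flat list of key/value pairs. A minimal sketch of the expected input and output, with made-up column values:

# Sketch of the event shape this handler expects; the column values are invented.
sample_event = {
    "queryOutput": {
        "ResultSet": {
            "Rows": [
                # Rows[0] is Athena's header row and is skipped by the handler
                {"Data": [{"VarCharValue": "full_table_name"}, {"VarCharValue": "validation_msg"}]},
                {"Data": [{"VarCharValue": "g4s_cap_dw_dbo_Financials"}, {"VarCharValue": "Financials - Validated."}]},
            ]
        }
    }
}

# handler(sample_event, None) would return:
# [{"g4s_cap_dw_dbo_Financials": "Financials - Validated."}]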
@@ -56,4 +56,4 @@ def handler(event, context):
logger.error(msg)
raise Exception(msg)

return {"statusCode": 200, "body": json.dumps(f"{copy_object} has been Successfully Copied to the AP")}
return (database_name, schema_name, table_name)
@@ -0,0 +1,9 @@
FROM public.ecr.aws/lambda/python:3.11

COPY requirements.txt .

RUN pip install -r requirements.txt --target "${LAMBDA_TASK_ROOT}"

COPY update_log_table.py ${LAMBDA_TASK_ROOT}

CMD ["update_log_table.handler"]
@@ -0,0 +1,34 @@
APP_NAME = update_log_table
APP_VERSION := $(shell terraform -chdir=../../ output -raw account_suffix)

.PHONY: print-account-suffix
print-account-suffix:
@echo APP_VERSION=$(APP_VERSION)

AWS_ECR_ACCOUNT_ID := $(shell terraform -chdir=../../ output -raw account_id)

.PHONY: print-account-id
print-account-id:
@echo AWS_ECR_ACCOUNT_ID=$(AWS_ECR_ACCOUNT_ID)


AWS_ECR_REGION = eu-west-2
AWS_ECR_REPO = lambdas/$(APP_NAME)

TAG = $(APP_VERSION)

.PHONY: docker/build docker/push docker/run docker/test

docker/build :
docker build -t $(APP_NAME):$(APP_VERSION) .

docker/push: docker/build
aws ecr get-login-password --region $(AWS_ECR_REGION) | docker login --username AWS --password-stdin $(AWS_ECR_ACCOUNT_ID).dkr.ecr.$(AWS_ECR_REGION).amazonaws.com
docker tag $(APP_NAME):$(APP_VERSION) $(AWS_ECR_ACCOUNT_ID).dkr.ecr.$(AWS_ECR_REGION).amazonaws.com/$(AWS_ECR_REPO):$(TAG)
docker push $(AWS_ECR_ACCOUNT_ID).dkr.ecr.$(AWS_ECR_REGION).amazonaws.com/$(AWS_ECR_REPO):$(TAG)

docker/run:
docker run -p 9000:8080 $(AWS_ECR_ACCOUNT_ID).dkr.ecr.$(AWS_ECR_REGION).amazonaws.com/$(AWS_ECR_REPO):$(TAG)

docker/test:
curl -XPOST 'http://localhost:9000/2015-03-31/functions/function/invocations' -d '{"input": {"test/*/Financials": "s3://dms-rds-to-parquet-20240606142913727200000001/test/dbo/Financials/LOAD00000001.parquet","db_info": ["test","dbo","Financials"]},"inputDetails": {"truncated": false},"resource": "arn:aws:lambda:eu-west-2:800964199911:function:update_log_table"}'
@@ -0,0 +1 @@
pandas[pyarrow]==2.2.1
@@ -0,0 +1,87 @@
import pandas as pd
import boto3
import logging
import os

logger = logging.getLogger(__name__)

logger.setLevel(logging.INFO)

S3_LOG_BUCKET = os.environ.get("S3_LOG_BUCKET")
DATABASE_NAME = os.environ.get("DATABASE_NAME")
TABLE_NAME = os.environ.get("TABLE_NAME")

def s3_path_to_bucket_key(s3_path):
    """
    Splits out s3 file path to bucket key combination
    """
    return s3_path.replace("s3://", "").split("/", 1)


def bucket_key_to_s3_path(bucket, key):
    """
    Takes an S3 bucket and key combination and returns the
    full S3 path to that location.
    """
    return f"s3://{bucket}/{key}"


def _add_slash(s):
    """
    Adds slash to end of string
    """
    return s if s[-1] == "/" else s + "/"


def get_filepaths_from_s3_folder(
    s3_folder_path, file_extension=None, exclude_zero_byte_files=True
):
    """
    Get a list of filepaths from a bucket. If extension is set to a string
    then only return files with that extension otherwise if set to None (default)
    all filepaths are returned.
    :param s3_folder_path: "s3://...."
    :param file_extension: file extension, e.g. .json
    :param exclude_zero_byte_files: Whether to filter out results of zero size: True
    :return: A list of full s3 paths that were in the given s3 folder path
    """

    s3_resource = boto3.resource("s3")

    if file_extension is not None:
        if file_extension[0] != ".":
            file_extension = "." + file_extension

    # This guarantees that the path the user has given is really a 'folder'.
    s3_folder_path = _add_slash(s3_folder_path)

    bucket, key = s3_path_to_bucket_key(s3_folder_path)

    s3b = s3_resource.Bucket(bucket)
    obs = s3b.objects.filter(Prefix=key)

    if file_extension is not None:
        obs = [o for o in obs if o.key.endswith(file_extension)]

    if exclude_zero_byte_files:
        obs = [o for o in obs if o.size != 0]

    ob_keys = [o.key for o in obs]

    paths = sorted([bucket_key_to_s3_path(bucket, o) for o in ob_keys])

    return paths

def handler(event, context):
    database_name, schema_name, table_name = event.get("db_info")
    s3_path = f"s3://{S3_LOG_BUCKET}/{DATABASE_NAME}/{TABLE_NAME}/database_name={database_name}/full_table_name={database_name}_{schema_name}_{table_name}"
    file_names = [file.split("/")[-1] for file in get_filepaths_from_s3_folder(s3_path)]
    log_table = pd.read_parquet(s3_path)
    log_table["table_to_ap"] = "True"
    try:
        log_table.to_parquet(f"{s3_path}/{file_names[0]}")
    except Exception as e:
        msg = f"An error has occurred: {e}"
        logger.error(msg)
        raise Exception(msg)
    return {}
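For a rough local smoke test of this handler, assuming AWS credentials and an existing log prefix; the bucket, database and table values below are placeholders, and the event shape is inferred from the handler rather than from the state machine definition:

# Rough local smoke test; all names are placeholders, not values from this repo.
import os

# The module reads these at import time, so set them first.
os.environ["S3_LOG_BUCKET"] = "example-dms-validation-logs"
os.environ["DATABASE_NAME"] = "example_validation_db"
os.environ["TABLE_NAME"] = "example_log_table"

import update_log_table

event = {"db_info": ["g4s_cap_dw", "dbo", "Financials"]}
# Rewrites the first parquet file under the partition with table_to_ap = "True".
update_log_table.handler(event, None)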