From 96d864fbadd3a44f4bd67e0e53448d1bcbca8282 Mon Sep 17 00:00:00 2001 From: Michael Chouinard <46358556+chouinar@users.noreply.github.com> Date: Wed, 9 Oct 2024 14:20:51 -0400 Subject: [PATCH] [Issue #2429] Various small fixes to get ELT process working in prod (#2430) ## Summary Fixes #2429 ### Time to review: __5 mins__ ## Changes proposed Actually set the default chunk size to 800 Handle cases where lookup values in the legacy system are ` ` (whitespace) and treat them the same as null/empty strings Handle deleting the `current_opportunity_summary` record when an `opportunity_summary` is deleted ## Context for reviewers All issues found during testing in the prod ELT runs I was doing today Chunk size affects how "big" the batches are when we do queries to the Oracle foreign data wrapper as Oracle has a hard limit on the size of queries that we were hitting. For the lookup values, found a dozen or so opportunities where the opportunity category was literally a ` ` (space) and we couldn't process them. This fixes that and pre-emptively does it for other similar rows (although I think most of them haven't had the issue). The delete just takes advantage of SQLAlchemy's approach to deletes which will automatically delete through relationships when you do `db_session.delete(some_record)`. Now it'll know it needs to first delete the current opportunity summary record as part of that. ## Additional information Tests updated accordingly --- .../data_migration/command/load_transform.py | 2 +- .../transformation/transform_util.py | 26 ++++++++++++------- api/src/db/models/opportunity_models.py | 7 +++++ .../data_migration/transformation/conftest.py | 7 ++++- .../test_transform_opportunity_summary.py | 2 ++ .../transformation/test_transform_util.py | 5 +++- 6 files changed, 37 insertions(+), 12 deletions(-) diff --git a/api/src/data_migration/command/load_transform.py b/api/src/data_migration/command/load_transform.py index 85569d1b2..8617e9158 100644 --- a/api/src/data_migration/command/load_transform.py +++ b/api/src/data_migration/command/load_transform.py @@ -29,7 +29,7 @@ "--set-current/--no-set-current", default=True, help="run SetCurrentOpportunitiesTask" ) @click.option( - "--insert-chunk-size", default=4000, help="chunk size for load inserts", show_default=True + "--insert-chunk-size", default=800, help="chunk size for load inserts", show_default=True ) @click.option("--tables-to-load", "-t", help="table to load", multiple=True) @flask_db.with_db_session() diff --git a/api/src/data_migration/transformation/transform_util.py b/api/src/data_migration/transformation/transform_util.py index 57dc7b74a..6dc742425 100644 --- a/api/src/data_migration/transformation/transform_util.py +++ b/api/src/data_migration/transformation/transform_util.py @@ -94,6 +94,12 @@ } +def is_empty_str(value: str | None) -> bool: + # null/None, "" and any amount of just whitespace + # are treated as an empty string + return value is None or value.strip() == "" + + def transform_opportunity( source_opportunity: Topportunity, existing_opportunity: Opportunity | None ) -> Opportunity: @@ -125,7 +131,7 @@ def transform_opportunity( def transform_opportunity_category(value: str | None) -> OpportunityCategory | None: - if value is None or value == "": + if is_empty_str(value): return None if value not in OPPORTUNITY_CATEGORY_MAP: @@ -135,7 +141,7 @@ def transform_opportunity_category(value: str | None) -> OpportunityCategory | N def transform_applicant_type(value: str | None) -> ApplicantType | None: - if value is None or value == "": + if is_empty_str(value): return None if value not in APPLICANT_TYPE_MAP: @@ -145,7 +151,7 @@ def transform_applicant_type(value: str | None) -> ApplicantType | None: def transform_funding_category(value: str | None) -> FundingCategory | None: - if value is None or value == "": + if is_empty_str(value): return None if value not in FUNDING_CATEGORY_MAP: @@ -155,7 +161,7 @@ def transform_funding_category(value: str | None) -> FundingCategory | None: def transform_funding_instrument(value: str | None) -> FundingInstrument | None: - if value is None or value == "": + if is_empty_str(value): return None if value not in FUNDING_INSTRUMENT_MAP: @@ -428,7 +434,7 @@ def transform_update_create_timestamp( def convert_yn_bool(value: str | None) -> bool | None: # Booleans in the Oracle database are stored as varchar/char # columns with the values as Y/N (very rarely Yes/No) - if value is None or value == "": + if is_empty_str(value): return None if value in TRUTHY: @@ -442,7 +448,7 @@ def convert_yn_bool(value: str | None) -> bool | None: def convert_true_false_bool(value: str | None) -> bool | None: - if value is None or value == "": + if is_empty_str(value): return None return value == "TRUE" @@ -463,7 +469,7 @@ def convert_action_type_to_is_deleted(value: str | None) -> bool: # however many older records seem to not have this set at all # The legacy system looks like it treats anything that isn't D # the same, so we'll go with that assumption as well. - if value is None or value == "": + if is_empty_str(value): return False if value == "D": # D = Delete @@ -476,11 +482,13 @@ def convert_action_type_to_is_deleted(value: str | None) -> bool: def convert_numeric_str_to_int(value: str | None) -> int | None: - if value is None or value == "": + if is_empty_str(value): return None try: - return int(value) + # We know the value is a string and not None from the above + # function call but mypy can't see that, so ignore it here. + return int(value) # type: ignore except ValueError: # From what we've found in the legacy data, some of these numeric strings # are written out as "none", "not available", "n/a" or similar. All of these diff --git a/api/src/db/models/opportunity_models.py b/api/src/db/models/opportunity_models.py index 36921b14d..e57bce5e3 100644 --- a/api/src/db/models/opportunity_models.py +++ b/api/src/db/models/opportunity_models.py @@ -216,6 +216,13 @@ class OpportunitySummary(ApiSchemaTable, TimestampMixin): creator=lambda obj: LinkOpportunitySummaryApplicantType(applicant_type=obj), ) + # We configure a relationship from a summary to the current opportunity summary + # Just in case we delete this record, we can cascade to deleting the current_opportunity_summary + # record as well automatically. + current_opportunity_summary: Mapped["CurrentOpportunitySummary | None"] = relationship( + back_populates="opportunity_summary", single_parent=True, cascade="delete" + ) + def for_json(self) -> dict: json_valid_dict = super().for_json() diff --git a/api/tests/src/data_migration/transformation/conftest.py b/api/tests/src/data_migration/transformation/conftest.py index d78af140d..fdc7b8458 100644 --- a/api/tests/src/data_migration/transformation/conftest.py +++ b/api/tests/src/data_migration/transformation/conftest.py @@ -98,6 +98,7 @@ def setup_synopsis_forecast( opportunity: Opportunity | None, is_delete: bool = False, is_already_processed: bool = False, + is_existing_current_opportunity_summary: bool = False, source_values: dict | None = None, ): if source_values is None: @@ -128,9 +129,13 @@ def setup_synopsis_forecast( ) if create_existing: - f.OpportunitySummaryFactory.create( + opportunity_summary = f.OpportunitySummaryFactory.create( opportunity=opportunity, is_forecast=is_forecast, revision_number=revision_number ) + if is_existing_current_opportunity_summary: + f.CurrentOpportunitySummaryFactory.create( + opportunity=opportunity, opportunity_summary=opportunity_summary + ) return source_summary diff --git a/api/tests/src/data_migration/transformation/subtask/test_transform_opportunity_summary.py b/api/tests/src/data_migration/transformation/subtask/test_transform_opportunity_summary.py index 5dcec4f56..883737e7c 100644 --- a/api/tests/src/data_migration/transformation/subtask/test_transform_opportunity_summary.py +++ b/api/tests/src/data_migration/transformation/subtask/test_transform_opportunity_summary.py @@ -69,6 +69,7 @@ def test_process_opportunity_summaries(self, db_session, transform_opportunity_s create_existing=True, is_delete=True, opportunity=opportunity3, + is_existing_current_opportunity_summary=True, ) synopsis_delete1 = setup_synopsis_forecast( is_forecast=False, @@ -98,6 +99,7 @@ def test_process_opportunity_summaries(self, db_session, transform_opportunity_s create_existing=False, is_delete=True, opportunity=opportunity4, + is_existing_current_opportunity_summary=True, ) synopsis_update_invalid_yn_field = setup_synopsis_forecast( is_forecast=False, diff --git a/api/tests/src/data_migration/transformation/test_transform_util.py b/api/tests/src/data_migration/transformation/test_transform_util.py index becd6fec8..c674621b2 100644 --- a/api/tests/src/data_migration/transformation/test_transform_util.py +++ b/api/tests/src/data_migration/transformation/test_transform_util.py @@ -16,6 +16,7 @@ ("M", OpportunityCategory.MANDATORY), ("O", OpportunityCategory.OTHER), (None, None), + (" ", None), ("", None), ], ) @@ -90,6 +91,7 @@ def test_transform_update_create_timestamp( ("yes", True), ("no", False), ("", None), + (" ", None), (None, None), ], ) @@ -104,7 +106,7 @@ def test_convert_yn_boolean_unexpected_value(value): @pytest.mark.parametrize( - "value,expected_value", [("D", True), ("U", False), ("", False), (None, False)] + "value,expected_value", [("D", True), ("U", False), ("", False), (" ", False), (None, False)] ) def test_convert_action_type_to_is_deleted(value, expected_value): assert transform_util.convert_action_type_to_is_deleted(value) == expected_value @@ -124,6 +126,7 @@ def test_convert_action_type_to_is_deleted_unexpected_value(value): ("123123123", 123123123), ("-5", -5), ("", None), + (" ", None), (None, None), ("words", None), ("zero", None),