From b5ff8c8f3980824eaab09407c03dbd959549d304 Mon Sep 17 00:00:00 2001 From: Michael Chouinard <46358556+chouinar@users.noreply.github.com> Date: Mon, 16 Sep 2024 10:13:37 -0400 Subject: [PATCH] [Issue #106] Transform agency data (#157) ## Summary Fixes #106 ### Time to review: __10 mins__ ## Changes proposed Add transformations for agency data ## Context for reviewers Agency data is structured oddly in the existing system, instead of being in ordinary tables, it's in a `tgroups` table that has values stored as key-value pairs. We want to normalize that into something more workable, so the transformation needs to work a bit differently than the transformations of other tables. For simplicity, I load all of the data for every agency (and later filter to just what changed) as this removes a lot of weird edge cases that we would have otherwise needed to consider. Only modified rows actually get used, but we know we have the full set of data now. ## Additional information I have a snapshot of the prod tgroups table and loaded it into my DB locally and ran the transform script. In total, it takes ~2 seconds to run and didn't hit any issues. A set of the relevant metrics: ``` total_records_processed=1152 total_records_deleted=0 total_records_inserted=1152 total_records_updated=0 total_error_count=0 agency.total_records_processed=1152 agency.total_records_inserted=1152 TransformAgency_subtask_duration_sec=2.14 task_duration_sec=2.14 ``` As a sanity test, I also loaded in the tgroups data from dev and tried running it through. While it generally worked, there were 12 agencies that failed because they were missing the ldapGp and AgencyContactCity fields. I'm not certain if we want to do anything about that as they all seemed to be test agencies based on the names. 
--------- Co-authored-by: nava-platform-bot --- .../subtask/transform_agency.py | 422 ++++++++++++++++++ .../transformation/transform_constants.py | 1 + .../transform_oracle_data_task.py | 5 + .../transformation/transform_util.py | 69 ++- api/src/db/models/agency_models.py | 13 +- api/src/db/models/staging/staging_base.py | 4 + api/src/db/models/staging/tgroups.py | 11 + .../data_migration/transformation/conftest.py | 92 +++- .../subtask/test_transform_agency.py | 277 ++++++++++++ api/tests/src/db/models/factories.py | 38 +- 10 files changed, 897 insertions(+), 35 deletions(-) create mode 100644 api/src/data_migration/transformation/subtask/transform_agency.py create mode 100644 api/tests/src/data_migration/transformation/subtask/test_transform_agency.py diff --git a/api/src/data_migration/transformation/subtask/transform_agency.py b/api/src/data_migration/transformation/subtask/transform_agency.py new file mode 100644 index 000000000..b7414901f --- /dev/null +++ b/api/src/data_migration/transformation/subtask/transform_agency.py @@ -0,0 +1,422 @@ +import logging +from dataclasses import dataclass, field +from datetime import datetime +from typing import Any + +from pydantic import Field +from sqlalchemy import select +from sqlalchemy.orm import selectinload + +import src.data_migration.transformation.transform_constants as transform_constants +import src.data_migration.transformation.transform_util as transform_util +from src.constants.lookup_constants import ( + AgencyDownloadFileType, + AgencySubmissionNotificationSetting, +) +from src.data_migration.transformation.subtask.abstract_transform_subtask import ( + AbstractTransformSubTask, +) +from src.db.models.agency_models import Agency, AgencyContactInfo, LinkAgencyDownloadFileType +from src.db.models.staging.tgroups import Tgroups +from src.task.task import Task +from src.util.env_config import PydanticBaseEnvConfig + +logger = logging.getLogger(__name__) + +NULLABLE_FIELDS = { + "AgencyCode", # Note this is 
the sub_agency_code in our system + "AgencyContactEMail2", +} + +AGENCY_FIELD_MAP = { + "AgencyName": "agency_name", + "AgencyCode": "sub_agency_code", + "AgencyCFDA": "assistance_listing_number", + "AgencyDownload": "agency_download_file_types", + "AgencyNotify": "agency_submission_notification_setting", + "ldapGp": "ldap_group", + "description": "description", + "label": "label", + "multilevel": "is_multilevel_agency", + "HasS2SCert": "has_system_to_system_certificate", + "ViewPkgsInGracePeriod": "can_view_packages_in_grace_period", + "multiproject": "is_multiproject", + "ImageWS": "is_image_workspace_enabled", + "ValidationWS": "is_validation_workspace_enabled", +} + +AGENCY_CONTACT_INFO_FIELD_MAP = { + "AgencyContactName": "contact_name", + "AgencyContactAddress1": "address_line_1", + "AgencyContactAddress2": "address_line_2", + "AgencyContactCity": "city", + "AgencyContactState": "state", + "AgencyContactZipCode": "zip_code", + "AgencyContactTelephone": "phone_number", + "AgencyContactEMail": "primary_email", + "AgencyContactEMail2": "secondary_email", +} + +NOT_MAPPED_FIELDS = { + "AgencyEnroll", + "ForecastPOC", + "ForecastPOCEmail", + "ForecastPOCEmailDesc", + "ForecastPOCPhone", + "SynopsisPOC", + "SynopsisPOCEmail", + "SynopsisPOCEmailDesc", + "PackagePOC", + # These fields were only found in the test environment + "ASSISTCompatible", + "SAMValidation", +} + +REQUIRED_FIELDS = { + "AgencyName", + "AgencyCFDA", + "AgencyDownload", + "AgencyNotify", + "ldapGp", + "description", + "label", + "AgencyContactName", + "AgencyContactAddress1", + "AgencyContactCity", + "AgencyContactState", + "AgencyContactZipCode", + "AgencyContactTelephone", + "AgencyContactEMail", +} + + +class AgencyConfig(PydanticBaseEnvConfig): + # TODO - we might want to put this somewhere more central + # as we might want to filter these out in other places + test_agency_config: set[str] = Field( + default={"GDIT", "IVV", "IVPDF", "0001", "FGLT", "NGMS", "NGMS-Sub1", "SECSCAN"} + ) + + 
+@dataclass +class TgroupAgency: + """ + Container class for holding all tgroup records for + a given agency. + """ + + agency_code: str + tgroups: list[Tgroups] = field(default_factory=list) + + has_update: bool = False + + def add_tgroup(self, tgroup: Tgroups) -> None: + if tgroup.transformed_at is None: + self.has_update = True + + self.tgroups.append(tgroup) + + def get_updated_field_names(self) -> set[str]: + return {tgroup.get_field_name() for tgroup in self.tgroups if tgroup.transformed_at is None} + + +@dataclass +class AgencyUpdates: + """ + Container class for holding all of the necessary updates + for an agency + """ + + agency_updates: dict[str, Any] = field(default_factory=dict) + agency_contact_info_updates: dict[str, Any] = field(default_factory=dict) + agency_download_file_types: set[AgencyDownloadFileType] = field(default_factory=set) + + agency_created_at: datetime | None = None + agency_updated_at: datetime | None = None + + +class TransformAgency(AbstractTransformSubTask): + def __init__(self, task: Task, agency_config: AgencyConfig | None = None) -> None: + super().__init__(task) + + if agency_config is None: + agency_config = AgencyConfig() + + self.agency_config = agency_config + + def transform_records(self) -> None: + # fetch tgroup records + tgroup_map = self.fetch_tgroup_mapping() + + # Fetch all existing agencies + agency_map = self.fetch_agency_mapping() + + for agency_code, tgroup_agency in tgroup_map.items(): + agency = agency_map.get(agency_code) + + try: + self.process_tgroups(tgroup_agency, agency) + except ValueError: + self.increment( + transform_constants.Metrics.TOTAL_ERROR_COUNT, + prefix=transform_constants.AGENCY, + ) + logger.exception("Failed to process agency", extra={"agency_code": agency_code}) + + def fetch_tgroup_mapping(self) -> dict[str, TgroupAgency]: + tgroups = self.db_session.scalars(select(Tgroups)) + + tgroup_mapping: dict[str, TgroupAgency] = {} + + for tgroup in tgroups: + agency_code = 
tgroup.get_agency_code() + + if agency_code not in tgroup_mapping: + tgroup_mapping[agency_code] = TgroupAgency(agency_code) + + tgroup_mapping[agency_code].add_tgroup(tgroup) + + return tgroup_mapping + + def fetch_agency_mapping(self) -> dict[str, Agency]: + agencies = self.db_session.scalars( + select(Agency).options(selectinload(Agency.agency_contact_info)) + ) + + return {agency.agency_code: agency for agency in agencies} + + def process_tgroups(self, tgroup_agency: TgroupAgency, agency: Agency | None) -> None: + log_extra = {"agency_code": tgroup_agency.agency_code} + logger.info("Processing agency", extra=log_extra) + if not tgroup_agency.has_update: + logger.info("No updates for agency", extra=log_extra) + return + + # Only increment counter for agencies with something to update + self.increment( + transform_constants.Metrics.TOTAL_RECORDS_PROCESSED, prefix=transform_constants.AGENCY + ) + + # New agency insert case + is_insert = False + if agency is None: + is_insert = True + # If any field that is required for creating an agency is missing, we want to error + missing_required_fields = REQUIRED_FIELDS - tgroup_agency.get_updated_field_names() + if missing_required_fields: + raise ValueError( + "Cannot create agency %s as required fields are missing: %s" + % (tgroup_agency.agency_code, ",".join(missing_required_fields)) + ) + + logger.info("Creating new agency", extra=log_extra) + agency = Agency(agency_code=tgroup_agency.agency_code) + agency.agency_contact_info = AgencyContactInfo() + else: + logger.info("Updating agency", extra=log_extra) + + updates = get_agency_updates(tgroup_agency) + apply_updates( + agency, updates.agency_updates, updates.agency_created_at, updates.agency_updated_at + ) + apply_updates( + agency.agency_contact_info, + updates.agency_contact_info_updates, + updates.agency_created_at, + updates.agency_updated_at, + ) + self.update_agency_download_file_types(agency, updates.agency_download_file_types) + + # Set whether the agency is a 
test agency based on the config + is_test_agency = tgroup_agency.agency_code in self.agency_config.test_agency_config + agency.is_test_agency = is_test_agency + + # After we have fully updated the agency, set the transformed_at timestamp + # for all tgroup records that weren't already set. + for tgroup in tgroup_agency.tgroups: + if tgroup.transformed_at is None: + tgroup.transformed_at = self.transform_time + + if is_insert: + self.increment( + transform_constants.Metrics.TOTAL_RECORDS_INSERTED, + prefix=transform_constants.AGENCY, + ) + else: + self.increment( + transform_constants.Metrics.TOTAL_RECORDS_UPDATED, prefix=transform_constants.AGENCY + ) + + self.db_session.add(agency) + logger.info("Processed agency", extra=log_extra) + + def update_agency_download_file_types( + self, agency: Agency, agency_download_file_types: set[AgencyDownloadFileType] + ) -> None: + # If the download file types we have set is already the same, just return + if agency.agency_download_file_types == agency_download_file_types: + return + + file_types_to_delete = set(agency.agency_download_file_types) - agency_download_file_types + file_types_to_add = agency_download_file_types - set(agency.agency_download_file_types) + + for link_agency_download_file_type in agency.link_agency_download_file_types: + if link_agency_download_file_type.agency_download_file_type in file_types_to_delete: + self.db_session.delete(link_agency_download_file_type) + + for file_type_to_add in file_types_to_add: + self.db_session.add( + LinkAgencyDownloadFileType( + agency=agency, agency_download_file_type=file_type_to_add + ) + ) + + +############################ +# Transformation / utility functions +############################ + +AGENCY_DOWNLOAD_FILE_TYPE_MAP = { + "0": set(), + "1": {AgencyDownloadFileType.XML}, + "2": {AgencyDownloadFileType.XML, AgencyDownloadFileType.PDF}, + "3": {AgencyDownloadFileType.PDF}, +} + +AGENCY_SUBMISSION_NOTIFICATION_SETTING_MAP = { + "1": 
AgencySubmissionNotificationSetting.NEVER, + "2": AgencySubmissionNotificationSetting.FIRST_APPLICATION_ONLY, + "3": AgencySubmissionNotificationSetting.ALWAYS, +} + + +def get_agency_updates(tgroup_agency: TgroupAgency) -> AgencyUpdates: + updates = AgencyUpdates() + + for tgroup in tgroup_agency.tgroups: + if not tgroup.is_modified: + continue + + tgroup_field_name = tgroup.get_field_name() + + # TODO - how we want to actually handle deleted rows likely needs more investigation + # and discussion - do we assume that if certain fields are deleted that the + # entire agency should be deleted? Can they even be deleted once an opportunity refers to them? + # Rather than focus too much on that detail right now, I'm deferring + # a more thorough investigation to later + # For now - we'll error any agency that has deleted rows except for a few + # specific fields we know are safe to delete. + if tgroup.is_deleted: + if tgroup_field_name not in NULLABLE_FIELDS: + raise ValueError( + "Field %s in tgroups cannot be deleted as it is not nullable" + % tgroup_field_name + ) + value = None + else: + value = convert_field_values(tgroup_field_name, tgroup.value) + + if tgroup_field_name == "AgencyDownload": + updates.agency_download_file_types = value # type: ignore[assignment] + + elif tgroup_field_name in AGENCY_FIELD_MAP: + field_name = AGENCY_FIELD_MAP[tgroup_field_name] + updates.agency_updates[field_name] = value + + elif tgroup_field_name in AGENCY_CONTACT_INFO_FIELD_MAP: + field_name = AGENCY_CONTACT_INFO_FIELD_MAP[tgroup_field_name] + updates.agency_contact_info_updates[field_name] = value + + elif tgroup_field_name in NOT_MAPPED_FIELDS: + logger.info( + "Skipping processing of field %s for %s", + tgroup_field_name, + tgroup_agency.agency_code, + ) + continue + + else: + raise ValueError("Unknown tgroups agency field %s" % tgroup_field_name) + + # We effectively need to merge the created_at/updated_at timestamps to the earliest/latest respectively + created_at, 
updated_at = transform_util.get_create_update_timestamps( + tgroup.created_date, tgroup.last_upd_date + ) + + if updates.agency_created_at is None or created_at < updates.agency_created_at: + updates.agency_created_at = created_at + + if updates.agency_updated_at is None or updated_at > updates.agency_updated_at: + updates.agency_updated_at = updated_at + + return updates + + +def convert_field_values(field_name: str, value: str | None) -> Any: + if field_name == "AgencyDownload": + return transform_agency_download_file_types(value) + elif field_name == "AgencyNotify": + return transform_agency_notify(value) + elif field_name == "multilevel": + return transform_util.convert_true_false_bool(value) + elif field_name == "HasS2SCert": + return transform_util.convert_yn_bool(value) + elif field_name == "multiproject": + return transform_util.convert_yn_bool(value) + elif field_name == "ViewPkgsInGracePeriod": + return transform_util.convert_yn_bool(value) + elif field_name == "ImageWS": + return transform_util.convert_yn_bool(value) + elif field_name == "ValidationWS": + return transform_util.convert_yn_bool(value) + elif field_name == "AgencyContactAddress2": + return transform_util.convert_null_like_to_none(value) + + return value + + +def transform_agency_download_file_types(value: str | None) -> set[AgencyDownloadFileType]: + if value not in AGENCY_DOWNLOAD_FILE_TYPE_MAP: + raise ValueError("Unrecognized agency download file type value %s" % value) + + return AGENCY_DOWNLOAD_FILE_TYPE_MAP[value] + + +def transform_agency_notify(value: str | None) -> AgencySubmissionNotificationSetting: + if value not in AGENCY_SUBMISSION_NOTIFICATION_SETTING_MAP: + raise ValueError("Unrecognized agency notify setting value: %s" % value) + + return AGENCY_SUBMISSION_NOTIFICATION_SETTING_MAP[value] + + +def apply_updates( + record: Agency | AgencyContactInfo | None, + updates: dict[str, Any], + created_at: datetime | None, + updated_at: datetime | None, +) -> None: + # Note MyPy 
doesn't quite follow the typing in this function because it thinks + # created_at/updated_at aren't ever None. While they aren't ever null in the DB, + # before we insert a record they may not be set. Hence the type:ignores here + + if record is None: + # This shouldn't happen but need to make mypy happy because agency contact info + # can technically be null + raise ValueError("Cannot pass none value into apply_updates") + + for field_name, value in updates.items(): + setattr(record, field_name, value) + + # We will only set created_at if the value doesn't already exist on the record + # It would be confusing to change the created_at timestamp after the initial insert + if record.created_at is None and created_at is not None: # type: ignore[unreachable] + record.created_at = created_at # type: ignore[unreachable] + + # Updated at we'll either set if the value currently is null (ie. we're doing an insert) + # or if it is greater than whatever already exists. + if record.updated_at is None and updated_at is not None: # type: ignore[unreachable] + record.updated_at = updated_at # type: ignore[unreachable] + elif ( + record.updated_at is not None and updated_at is not None and record.updated_at < updated_at + ): + record.updated_at = updated_at diff --git a/api/src/data_migration/transformation/transform_constants.py b/api/src/data_migration/transformation/transform_constants.py index 9d50e2069..16d023fbd 100644 --- a/api/src/data_migration/transformation/transform_constants.py +++ b/api/src/data_migration/transformation/transform_constants.py @@ -34,6 +34,7 @@ APPLICANT_TYPE = "applicant_type" FUNDING_CATEGORY = "funding_category" FUNDING_INSTRUMENT = "funding_instrument" +AGENCY = "agency" class Metrics(StrEnum): diff --git a/api/src/data_migration/transformation/transform_oracle_data_task.py b/api/src/data_migration/transformation/transform_oracle_data_task.py index ed5f33a3c..b7ce8e0fd 100644 --- 
a/api/src/data_migration/transformation/transform_oracle_data_task.py +++ b/api/src/data_migration/transformation/transform_oracle_data_task.py @@ -5,6 +5,7 @@ import src.data_migration.transformation.transform_constants as transform_constants from src.adapters import db +from src.data_migration.transformation.subtask.transform_agency import TransformAgency from src.data_migration.transformation.subtask.transform_applicant_type import ( TransformApplicantType, ) @@ -37,6 +38,7 @@ class TransformOracleDataTaskConfig(PydanticBaseEnvConfig): enable_applicant_type: bool = True # TRANSFORM_ORACLE_DATA_ENABLE_APPLICANT_TYPE enable_funding_category: bool = True # TRANSFORM_ORACLE_DATA_ENABLE_FUNDING_CATEGORY enable_funding_instrument: bool = True # TRANSFORM_ORACLE_DATA_ENABLE_FUNDING_INSTRUMENT + enable_agency: bool = False # TRANSFORM_ORACLE_DATA_ENABLE_AGENCY class TransformOracleDataTask(Task): @@ -76,3 +78,6 @@ def run_task(self) -> None: if self.transform_config.enable_funding_instrument: TransformFundingInstrument(self).run() + + if self.transform_config.enable_agency: + TransformAgency(self).run() diff --git a/api/src/data_migration/transformation/transform_util.py b/api/src/data_migration/transformation/transform_util.py index d8bf58a1b..216134a4c 100644 --- a/api/src/data_migration/transformation/transform_util.py +++ b/api/src/data_migration/transformation/transform_util.py @@ -1,5 +1,6 @@ import logging from datetime import datetime +from typing import Tuple from src.constants.lookup_constants import ( ApplicantType, @@ -377,38 +378,47 @@ def convert_est_timestamp_to_utc(timestamp: datetime | None) -> datetime | None: return datetime_util.adjust_timezone(aware_timestamp, "UTC") -def transform_update_create_timestamp( - source: StagingBase, target: TimestampMixin, log_extra: dict | None = None -) -> None: - # Convert the source timestamps to UTC - # Note: the type ignores are because created_date/last_upd_date are added - # on the individual class definitions, 
not the base class - due to how - # we need to maintain the column order of the legacy system. - # Every legacy table does have these columns. - created_timestamp = convert_est_timestamp_to_utc(source.created_date) # type: ignore[attr-defined] - updated_timestamp = convert_est_timestamp_to_utc(source.last_upd_date) # type: ignore[attr-defined] +def get_create_update_timestamps( + source_created_date: datetime | None, + source_last_upd_date: datetime | None, + log_extra: dict | None = None, +) -> Tuple[datetime, datetime]: + created_timestamp = convert_est_timestamp_to_utc(source_created_date) + updated_timestamp = convert_est_timestamp_to_utc(source_last_upd_date) - if created_timestamp is not None: - target.created_at = created_timestamp - else: - # This is incredibly rare, but possible - because our system requires - # we set something, we'll default to the current time and log a warning. + # This is incredibly rare, but possible - because our system requires + # we set something, we'll default to the current time and log a warning. + if created_timestamp is None: if log_extra is None: log_extra = {} logger.warning( - f"{source.__class__} does not have a created_date timestamp set, setting value to now.", + "Record does not have a created_date timestamp set, assuming value to be now.", extra=log_extra, ) - target.created_at = datetime_util.utcnow() + created_timestamp = datetime_util.utcnow() - if updated_timestamp is not None: - target.updated_at = updated_timestamp - else: + if updated_timestamp is None: # In the legacy system, they don't set whether something was updated # until it receives an update. We always set the value, and on initial insert # want it to be the same as the created_at. 
- target.updated_at = target.created_at + updated_timestamp = created_timestamp + + return created_timestamp, updated_timestamp + + +def transform_update_create_timestamp( + source: StagingBase, target: TimestampMixin, log_extra: dict | None = None +) -> None: + # Convert the source timestamps to UTC + # Note: the type ignores are because created_date/last_upd_date are added + # on the individual class definitions, not the base class - due to how + # we need to maintain the column order of the legacy system. + # Every legacy table does have these columns. + created_timestamp, updated_timestamp = get_create_update_timestamps(source.created_date, source.last_upd_date, log_extra) # type: ignore[attr-defined] + + target.created_at = created_timestamp + target.updated_at = updated_timestamp TRUTHY = {"Y", "Yes"} @@ -431,6 +441,23 @@ def convert_yn_bool(value: str | None) -> bool | None: raise ValueError("Unexpected Y/N bool value: %s" % value) +def convert_true_false_bool(value: str | None) -> bool | None: + if value is None or value == "": + return None + + return value == "TRUE" + + +def convert_null_like_to_none(value: str | None) -> str | None: + if value is None: + return None + + if value.lower() == "null": + return None + + return value + + def convert_action_type_to_is_deleted(value: str | None) -> bool: # Action type can be U (update) or D (delete) # however many older records seem to not have this set at all diff --git a/api/src/db/models/agency_models.py b/api/src/db/models/agency_models.py index 6759b9fb2..14075233d 100644 --- a/api/src/db/models/agency_models.py +++ b/api/src/db/models/agency_models.py @@ -73,12 +73,13 @@ class Agency(ApiSchemaTable, TimestampMixin): ldap_group: Mapped[str] description: Mapped[str] label: Mapped[str] - is_multilevel_agency: Mapped[bool] - is_multiproject: Mapped[bool] - has_system_to_system_certificate: Mapped[bool] - can_view_packages_in_grace_period: Mapped[bool] - is_image_workspace_enabled: Mapped[bool] - 
is_validation_workspace_enabled: Mapped[bool] + + is_multilevel_agency: Mapped[bool] = mapped_column(default=False) + is_multiproject: Mapped[bool] = mapped_column(default=False) + has_system_to_system_certificate: Mapped[bool] = mapped_column(default=False) + can_view_packages_in_grace_period: Mapped[bool] = mapped_column(default=False) + is_image_workspace_enabled: Mapped[bool] = mapped_column(default=False) + is_validation_workspace_enabled: Mapped[bool] = mapped_column(default=False) link_agency_download_file_types: Mapped[list["LinkAgencyDownloadFileType"]] = relationship( back_populates="agency", uselist=True, cascade="all, delete-orphan" diff --git a/api/src/db/models/staging/staging_base.py b/api/src/db/models/staging/staging_base.py index 0705a866e..12bca9685 100644 --- a/api/src/db/models/staging/staging_base.py +++ b/api/src/db/models/staging/staging_base.py @@ -74,3 +74,7 @@ class StagingParamMixin: ) transformation_notes: Mapped[str | None] + + @property + def is_modified(self) -> bool: + return self.transformed_at is None diff --git a/api/src/db/models/staging/tgroups.py b/api/src/db/models/staging/tgroups.py index fda264834..97e70f3f0 100644 --- a/api/src/db/models/staging/tgroups.py +++ b/api/src/db/models/staging/tgroups.py @@ -4,3 +4,14 @@ class Tgroups(StagingBase, tgroups_mixin.TGroupsMixin, StagingParamMixin): __tablename__ = "tgroups" + + def get_agency_code(self) -> str: + # The keyfield is formatted as: + # Agency-<agency code>-<field name> + # so to get the agency code, we need to parse out the middle bit + # so we split and drop the first + last field and rejoin it. 
+ tokens = self.keyfield.split("-") + return "-".join(tokens[1:-1]) + + def get_field_name(self) -> str: + return self.keyfield.split("-")[-1] diff --git a/api/tests/src/data_migration/transformation/conftest.py b/api/tests/src/data_migration/transformation/conftest.py index 443c113b6..d78af140d 100644 --- a/api/tests/src/data_migration/transformation/conftest.py +++ b/api/tests/src/data_migration/transformation/conftest.py @@ -7,6 +7,7 @@ from src.constants.lookup_constants import ApplicantType, FundingCategory, FundingInstrument from src.data_migration.transformation.transform_oracle_data_task import TransformOracleDataTask from src.db.models import staging +from src.db.models.agency_models import Agency from src.db.models.opportunity_models import ( LinkOpportunitySummaryApplicantType, LinkOpportunitySummaryFundingCategory, @@ -299,13 +300,42 @@ def setup_funding_category( return source_funding_category +def setup_agency( + agency_code: str, + create_existing: bool, + is_already_processed: bool = False, + deleted_fields: set | None = None, + already_processed_fields: set | None = None, + source_values: dict | None = None, +): + if source_values is None: + source_values = {} + + tgroups = f.create_tgroups_agency( + agency_code, + is_already_processed=is_already_processed, + deleted_fields=deleted_fields, + already_processed_fields=already_processed_fields, + **source_values, + ) + + if create_existing: + f.AgencyFactory.create(agency_code=agency_code) + + return tgroups + + def validate_matching_fields( source, destination, fields: list[Tuple[str, str]], expect_all_to_match: bool ): mismatched_fields = [] for source_field, destination_field in fields: - source_value = getattr(source, source_field) + if isinstance(source, dict): + source_value = source.get(source_field) + else: + source_value = getattr(source, source_field) + destination_value = getattr(destination, destination_field) # Some fields that we copy in are datetime typed (although behave as dates and 
we convert as such) @@ -657,3 +687,63 @@ def validate_funding_category( [("creator_id", "created_by"), ("last_upd_id", "updated_by")], expect_values_to_match, ) + + +AGENCY_FIELD_MAPPING = [ + ("AgencyName", "agency_name"), + ("AgencyCode", "sub_agency_code"), + ("AgencyCFDA", "assistance_listing_number"), + ("ldapGp", "ldap_group"), + ("description", "description"), + ("label", "label"), +] + +AGENCY_CONTACT_FIELD_MAPPING = [ + ("AgencyContactName", "contact_name"), + ("AgencyContactAddress1", "address_line_1"), + ("AgencyContactCity", "city"), + ("AgencyContactState", "state"), + ("AgencyContactZipCode", "zip_code"), + ("AgencyContactTelephone", "phone_number"), + ("AgencyContactEMail", "primary_email"), +] + + +def validate_agency( + db_session, + source_tgroups: list[staging.tgroups.Tgroups], + expect_in_db: bool = True, + expect_values_to_match: bool = True, + is_test_agency: bool = False, + non_matching_fields: set | None = None, +): + agency_code = source_tgroups[0].get_agency_code() + agency = db_session.query(Agency).filter(Agency.agency_code == agency_code).one_or_none() + + if not expect_in_db: + assert agency is None + return + + assert agency is not None + + # need to restructure the tgroups into a dict + tgroup_map = {tgroup.get_field_name(): tgroup.value for tgroup in source_tgroups} + + if non_matching_fields is not None: + agency_field_mapping = [m for m in AGENCY_FIELD_MAPPING if m[0] not in non_matching_fields] + else: + agency_field_mapping = AGENCY_FIELD_MAPPING + + validate_matching_fields(tgroup_map, agency, agency_field_mapping, expect_values_to_match) + assert agency.is_test_agency == is_test_agency + + if non_matching_fields is not None: + agency_contact_field_mapping = [ + m for m in AGENCY_CONTACT_FIELD_MAPPING if m[0] not in non_matching_fields + ] + else: + agency_contact_field_mapping = AGENCY_CONTACT_FIELD_MAPPING + + validate_matching_fields( + tgroup_map, agency.agency_contact_info, agency_contact_field_mapping, 
expect_values_to_match + ) diff --git a/api/tests/src/data_migration/transformation/subtask/test_transform_agency.py b/api/tests/src/data_migration/transformation/subtask/test_transform_agency.py new file mode 100644 index 000000000..906ee8e64 --- /dev/null +++ b/api/tests/src/data_migration/transformation/subtask/test_transform_agency.py @@ -0,0 +1,277 @@ +from datetime import datetime + +import pytest + +import src.data_migration.transformation.transform_constants as transform_constants +from src.constants.lookup_constants import ( + AgencyDownloadFileType, + AgencySubmissionNotificationSetting, +) +from src.data_migration.transformation.subtask.transform_agency import ( + TgroupAgency, + TransformAgency, + apply_updates, + transform_agency_download_file_types, + transform_agency_notify, +) +from tests.src.data_migration.transformation.conftest import ( + BaseTransformTestClass, + setup_agency, + validate_agency, +) +from tests.src.db.models.factories import AgencyFactory + + +class TestTransformAgency(BaseTransformTestClass): + @pytest.fixture() + def transform_agency(self, transform_oracle_data_task): + return TransformAgency(transform_oracle_data_task) + + def test_process_agencies(self, db_session, transform_agency): + insert_agency1 = setup_agency("INSERT-AGENCY-1", create_existing=False) + insert_agency2 = setup_agency("INSERT-AGENCY-2", create_existing=False) + insert_agency3 = setup_agency("INSERT-AGENCY-3", create_existing=False) + insert_agency4 = setup_agency("INSERT-AGENCY-4", create_existing=False) + insert_test_agency = setup_agency("GDIT", create_existing=False) + + # Already processed fields are ones that were handled on a prior run and won't be updated + # during this specific run + update_agency1 = setup_agency("UPDATE-AGENCY-1", create_existing=True) + update_agency2 = setup_agency( + "UPDATE-AGENCY-2", create_existing=True, deleted_fields={"AgencyContactEMail2"} + ) + update_agency3 = setup_agency( + "UPDATE-AGENCY-3", + create_existing=True, 
+ already_processed_fields={ + "AgencyName", + "AgencyCFDA", + "description", + "AgencyContactName", + "AgencyContactAddress1", + }, + ) + update_test_agency = setup_agency("SECSCAN", create_existing=True) + + already_processed1 = setup_agency( + "ALREADY-PROCESSED-1", create_existing=True, is_already_processed=True + ) + already_processed2 = setup_agency( + "ALREADY-PROCESSED-2", create_existing=True, is_already_processed=True + ) + already_processed3 = setup_agency( + "ALREADY-PROCESSED-3", create_existing=True, is_already_processed=True + ) + + insert_error = setup_agency( + "INSERT-ERROR", create_existing=False, source_values={"AgencyName": None} + ) + update_error1 = setup_agency( + "UPDATE-ERROR-1", create_existing=True, source_values={"AgencyDownload": "xyz"} + ) + update_error2 = setup_agency( + "UPDATE-ERROR-2", create_existing=True, source_values={"UnknownField": "xyz"} + ) + + transform_agency.run_subtask() + + validate_agency(db_session, insert_agency1) + validate_agency(db_session, insert_agency2) + validate_agency(db_session, insert_agency3) + validate_agency(db_session, insert_agency4) + validate_agency(db_session, insert_test_agency, is_test_agency=True) + + validate_agency(db_session, update_agency1) + validate_agency(db_session, update_agency2) + validate_agency( + db_session, + update_agency3, + non_matching_fields={ + "AgencyName", + "AgencyCFDA", + "description", + "AgencyContactName", + "AgencyContactAddress1", + }, + ) + validate_agency(db_session, update_test_agency, is_test_agency=True) + + validate_agency(db_session, already_processed1, expect_values_to_match=False) + validate_agency(db_session, already_processed2, expect_values_to_match=False) + validate_agency(db_session, already_processed3, expect_values_to_match=False) + + validate_agency(db_session, insert_error, expect_in_db=False) + validate_agency(db_session, update_error1, expect_values_to_match=False) + validate_agency(db_session, update_error2, expect_values_to_match=False) + + 
metrics = transform_agency.metrics + assert metrics[transform_constants.Metrics.TOTAL_RECORDS_PROCESSED] == 12 + assert metrics[transform_constants.Metrics.TOTAL_RECORDS_INSERTED] == 5 + assert metrics[transform_constants.Metrics.TOTAL_RECORDS_UPDATED] == 4 + assert metrics[transform_constants.Metrics.TOTAL_ERROR_COUNT] == 3 + + # Rerunning does mostly nothing, it will attempt to re-process the three that errored + # but otherwise won't find anything else + db_session.commit() # commit to end any existing transactions as run_subtask starts a new one + transform_agency.run_subtask() + + assert metrics[transform_constants.Metrics.TOTAL_RECORDS_PROCESSED] == 15 + assert metrics[transform_constants.Metrics.TOTAL_RECORDS_INSERTED] == 5 + assert metrics[transform_constants.Metrics.TOTAL_RECORDS_UPDATED] == 4 + assert metrics[transform_constants.Metrics.TOTAL_ERROR_COUNT] == 6 + + def test_process_tgroups_missing_fields_for_insert(self, db_session, transform_agency): + # Fields set to None don't get a tgroup record created + insert_that_will_fail = setup_agency( + "ERROR-CASE-MISSING-FIELDS", + create_existing=False, + source_values={"AgencyName": None, "AgencyContactCity": None}, + ) + + with pytest.raises( + ValueError, + match="Cannot create agency ERROR-CASE-MISSING-FIELDS as required fields are missing", + ): + transform_agency.process_tgroups( + TgroupAgency("ERROR-CASE-MISSING-FIELDS", insert_that_will_fail, has_update=True), + None, + ) + + validate_agency(db_session, insert_that_will_fail, expect_in_db=False) + + def test_process_tgroups_unknown_field(self, db_session, transform_agency): + insert_that_will_fail = setup_agency( + "ERROR-CASE-UNKNOWN-FIELD", create_existing=False, source_values={"MysteryField": "X"} + ) + + with pytest.raises(ValueError, match="Unknown tgroups agency field"): + transform_agency.process_tgroups( + TgroupAgency("ERROR-CASE-UNKNOWN-FIELD", insert_that_will_fail, has_update=True), + None, + ) + + validate_agency(db_session, 
insert_that_will_fail, expect_in_db=False) + + def test_process_tgroups_disallowed_deleted_fields(self, db_session, transform_agency): + update_that_will_fail = setup_agency( + "ERROR-CASE-DELETED-FIELD", create_existing=True, deleted_fields={"AgencyContactCity"} + ) + + with pytest.raises( + ValueError, + match="Field AgencyContactCity in tgroups cannot be deleted as it is not nullable", + ): + transform_agency.process_tgroups( + TgroupAgency("ERROR-CASE-DELETED-FIELD", update_that_will_fail, has_update=True), + None, + ) + + validate_agency(db_session, update_that_will_fail, expect_values_to_match=False) + + def test_process_tgroups_invalid_file_type(self, db_session, transform_agency): + insert_that_will_fail = setup_agency( + "ERROR-CASE-BAD-DOWNLOAD", create_existing=False, source_values={"AgencyDownload": "X"} + ) + + with pytest.raises(ValueError, match="Unrecognized agency download file type value"): + transform_agency.process_tgroups( + TgroupAgency("ERROR-CASE-BAD-DOWNLOAD", insert_that_will_fail, has_update=True), + None, + ) + + validate_agency(db_session, insert_that_will_fail, expect_in_db=False) + + def test_process_tgroups_invalid_agency_notify(self, db_session, transform_agency): + insert_that_will_fail = setup_agency( + "ERROR-CASE-BAD-NOTIFY", create_existing=False, source_values={"AgencyNotify": "4"} + ) + + with pytest.raises(ValueError, match="Unrecognized agency notify setting value"): + transform_agency.process_tgroups( + TgroupAgency("ERROR-CASE-BAD-NOTIFY", insert_that_will_fail, has_update=True), None + ) + + validate_agency(db_session, insert_that_will_fail, expect_in_db=False) + + +@pytest.mark.parametrize( + "value,expected_value", + [ + ("0", set()), + ("1", {AgencyDownloadFileType.XML}), + ("2", {AgencyDownloadFileType.XML, AgencyDownloadFileType.PDF}), + ("3", {AgencyDownloadFileType.PDF}), + ], +) +def test_transform_agency_download_file_types(value, expected_value): + assert transform_agency_download_file_types(value) == 
expected_value + + +@pytest.mark.parametrize("value", ["A", "B", "NULL", "", None]) +def test_transform_agency_download_file_types_unexpected_values(value): + with pytest.raises(ValueError, match="Unrecognized agency download file type value"): + transform_agency_download_file_types(value) + + +@pytest.mark.parametrize( + "value,expected_value", + [ + ("1", AgencySubmissionNotificationSetting.NEVER), + ("2", AgencySubmissionNotificationSetting.FIRST_APPLICATION_ONLY), + ("3", AgencySubmissionNotificationSetting.ALWAYS), + ], +) +def test_transform_agency_notify(value, expected_value): + assert transform_agency_notify(value) == expected_value + + +@pytest.mark.parametrize("value", ["A", "B", "NULL", "", None]) +def test_transform_agency_notify_unexpected_value(value): + with pytest.raises(ValueError, match="Unrecognized agency notify setting value"): + transform_agency_notify(value) + + +@pytest.mark.parametrize( + "agency_created_at,agency_updated_at,created_at,updated_at,expect_created_at_to_change,expect_updated_at_to_change", + [ + (None, None, None, None, False, False), + (None, None, datetime.now(), datetime.now(), True, True), + ( + datetime(2020, 1, 1), + datetime(2021, 1, 1), + datetime(2019, 12, 31), + datetime(2021, 1, 2), + False, + True, + ), + ( + datetime(2020, 1, 1), + datetime(2021, 1, 1), + datetime(2020, 12, 31), + datetime(2020, 1, 1), + False, + False, + ), + ], +) +def test_apply_updates_timestamps( + agency_created_at, + agency_updated_at, + created_at, + updated_at, + expect_created_at_to_change, + expect_updated_at_to_change, +): + agency = AgencyFactory.build(created_at=agency_created_at, updated_at=agency_updated_at) + + apply_updates(agency, {}, created_at, updated_at) + + if expect_created_at_to_change: + assert agency.created_at == created_at + else: + assert agency.created_at == agency_created_at + + if expect_updated_at_to_change: + assert agency.updated_at == updated_at + else: + assert agency.updated_at == agency_updated_at diff 
--git a/api/tests/src/db/models/factories.py b/api/tests/src/db/models/factories.py index 3fafacdce..9e215a78f 100644 --- a/api/tests/src/db/models/factories.py +++ b/api/tests/src/db/models/factories.py @@ -1310,6 +1310,11 @@ class Meta: last_upd_id = factory.Faker("first_name") creator_id = factory.Faker("first_name") + class Params: + already_transformed = factory.Trait( + transformed_at=factory.Faker("date_time_between", start_date="-7d", end_date="-1d") + ) + #################################### # Transfer Table Factories @@ -1652,15 +1657,21 @@ class StagingTgroupsAgencyFactory(factory.DictFactory): description = factory.LazyAttribute(lambda g: g.AgencyName) label = factory.LazyAttribute(lambda g: g.AgencyName) multilevel = sometimes_none("TRUE", none_chance=0.8) - HasS2SCert = sometimes_none(factory.Faker("yn_yesno_boolean"), none_chance=0.8) - ViewPkgsInGracePeriod = sometimes_none(factory.Faker("yn_yesno_boolean"), none_chance=0.8) - multiproject = sometimes_none(factory.Faker("yn_yesno_boolean"), none_chance=0.8) - ImageWS = sometimes_none(factory.Faker("yn_yesno_boolean"), none_chance=0.8) - ValidationWS = sometimes_none(factory.Faker("yn_yesno_boolean"), none_chance=0.8) + + HasS2SCert = sometimes_none(factory.Faker("yn_boolean"), none_chance=0.8) + ViewPkgsInGracePeriod = sometimes_none(factory.Faker("yn_boolean"), none_chance=0.8) + multiproject = sometimes_none(factory.Faker("yn_boolean"), none_chance=0.8) + ImageWS = sometimes_none(factory.Faker("yn_boolean"), none_chance=0.8) + ValidationWS = sometimes_none(factory.Faker("yn_boolean"), none_chance=0.8) def create_tgroups_agency( - agency_code: str, is_deleted: bool = False, **kwargs + agency_code: str, + is_deleted: bool = False, + is_already_processed: bool = False, + deleted_fields: set | None = None, + already_processed_fields: set | None = None, + **kwargs, ) -> list[staging.tgroups.Tgroups]: # The agency_code value is actually just the first bit (the top-level agency) kwargs["AgencyCode"] = 
agency_code.split("-")[0] @@ -1673,11 +1684,24 @@ def create_tgroups_agency( field_prefix = f"Agency-{agency_code}-" + if already_processed_fields is None: + already_processed_fields = set() + + if deleted_fields is None: + deleted_fields = set() + for field_name, value in field_values.items(): if value is None: continue + + is_field_already_processed = is_already_processed or field_name in already_processed_fields + is_field_deleted = is_deleted or field_name in deleted_fields + tgroup = StagingTgroupsFactory.create( - keyfield=field_prefix + field_name, value=value, is_deleted=is_deleted + keyfield=field_prefix + field_name, + value=str(value), + is_deleted=is_field_deleted, + already_transformed=is_field_already_processed, ) groups.append(tgroup)