Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Commit

Permalink
[Issue #106] Transform agency data (#157)
Browse files Browse the repository at this point in the history
## Summary
Fixes #106

### Time to review: __10 mins__

## Changes proposed
Add transformations for agency data

## Context for reviewers
Agency data is structured oddly in the existing system: instead of being
stored in ordinary tables, it lives in a `tgroups` table whose values are
stored as key-value pairs. We want to normalize that into something more
workable, so the transformation needs to work a bit differently than the
transformations of other tables.

For simplicity, I load all of the data for every agency (and later
filter to just what changed) as this removes a lot of weird edge cases
that we would have otherwise needed to consider. Only modified rows
actually get used, but we know we have the full set of data now.

## Additional information
I loaded a snapshot of the prod tgroups table into my local DB and ran the
transform script against it. In total, it takes ~2 seconds to run and didn't
hit any issues.

A set of the relevant metrics:
```
total_records_processed=1152
total_records_deleted=0
total_records_inserted=1152
total_records_updated=0
total_error_count=0
agency.total_records_processed=1152
agency.total_records_inserted=1152
TransformAgency_subtask_duration_sec=2.14
task_duration_sec=2.14
```

As a sanity test, I also loaded in the tgroups data from dev and tried
running it through. While it generally worked, there were 12 agencies
that failed because they were missing the ldapGp and AgencyContactCity
fields. I'm not certain if we want to do anything about that as they all
seemed to be test agencies based on the names.

---------

Co-authored-by: nava-platform-bot <[email protected]>
  • Loading branch information
chouinar and nava-platform-bot authored Sep 16, 2024
1 parent 4b0bd5b commit b5ff8c8
Show file tree
Hide file tree
Showing 10 changed files with 897 additions and 35 deletions.
422 changes: 422 additions & 0 deletions api/src/data_migration/transformation/subtask/transform_agency.py

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
# Record-type identifiers used to namespace per-table transformation metrics
# (e.g. "agency.total_records_processed").
APPLICANT_TYPE = "applicant_type"
FUNDING_CATEGORY = "funding_category"
FUNDING_INSTRUMENT = "funding_instrument"
AGENCY = "agency"


class Metrics(StrEnum):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import src.data_migration.transformation.transform_constants as transform_constants
from src.adapters import db
from src.data_migration.transformation.subtask.transform_agency import TransformAgency
from src.data_migration.transformation.subtask.transform_applicant_type import (
TransformApplicantType,
)
Expand Down Expand Up @@ -37,6 +38,7 @@ class TransformOracleDataTaskConfig(PydanticBaseEnvConfig):
enable_applicant_type: bool = True # TRANSFORM_ORACLE_DATA_ENABLE_APPLICANT_TYPE
enable_funding_category: bool = True # TRANSFORM_ORACLE_DATA_ENABLE_FUNDING_CATEGORY
enable_funding_instrument: bool = True # TRANSFORM_ORACLE_DATA_ENABLE_FUNDING_INSTRUMENT
enable_agency: bool = False # TRANSFORM_ORACLE_DATA_ENABLE_AGENCY


class TransformOracleDataTask(Task):
Expand Down Expand Up @@ -76,3 +78,6 @@ def run_task(self) -> None:

if self.transform_config.enable_funding_instrument:
TransformFundingInstrument(self).run()

if self.transform_config.enable_agency:
TransformAgency(self).run()
69 changes: 48 additions & 21 deletions api/src/data_migration/transformation/transform_util.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
from datetime import datetime
from typing import Tuple

from src.constants.lookup_constants import (
ApplicantType,
Expand Down Expand Up @@ -377,38 +378,47 @@ def convert_est_timestamp_to_utc(timestamp: datetime | None) -> datetime | None:
return datetime_util.adjust_timezone(aware_timestamp, "UTC")


def get_create_update_timestamps(
    source_created_date: datetime | None,
    source_last_upd_date: datetime | None,
    log_extra: dict | None = None,
) -> Tuple[datetime, datetime]:
    """Derive UTC created/updated timestamps from legacy EST source timestamps.

    Args:
        source_created_date: created_date from the legacy staging record (EST, may be None)
        source_last_upd_date: last_upd_date from the legacy staging record (EST, may be None)
        log_extra: extra context attached to the warning log if created_date is missing

    Returns:
        A ``(created_timestamp, updated_timestamp)`` tuple, both non-None UTC datetimes.
    """
    created_timestamp = convert_est_timestamp_to_utc(source_created_date)
    updated_timestamp = convert_est_timestamp_to_utc(source_last_upd_date)

    # This is incredibly rare, but possible - because our system requires
    # we set something, we'll default to the current time and log a warning.
    if created_timestamp is None:
        if log_extra is None:
            log_extra = {}

        logger.warning(
            "Record does not have a created_date timestamp set, assuming value to be now.",
            extra=log_extra,
        )
        created_timestamp = datetime_util.utcnow()

    if updated_timestamp is None:
        # In the legacy system, they don't set whether something was updated
        # until it receives an update. We always set the value, and on initial insert
        # want it to be the same as the created_at.
        updated_timestamp = created_timestamp

    return created_timestamp, updated_timestamp


def transform_update_create_timestamp(
    source: StagingBase, target: TimestampMixin, log_extra: dict | None = None
) -> None:
    """Copy the legacy created/updated timestamps from source onto target, converted to UTC."""
    # Note: the type ignores are because created_date/last_upd_date are added
    # on the individual class definitions, not the base class - due to how
    # we need to maintain the column order of the legacy system.
    # Every legacy table does have these columns.
    timestamps = get_create_update_timestamps(
        source.created_date,  # type: ignore[attr-defined]
        source.last_upd_date,  # type: ignore[attr-defined]
        log_extra,
    )

    target.created_at, target.updated_at = timestamps


TRUTHY = {"Y", "Yes"}
Expand All @@ -431,6 +441,23 @@ def convert_yn_bool(value: str | None) -> bool | None:
raise ValueError("Unexpected Y/N bool value: %s" % value)


def convert_true_false_bool(value: str | None) -> bool | None:
if value is None or value == "":
return None

return value == "TRUE"


def convert_null_like_to_none(value: str | None) -> str | None:
if value is None:
return None

if value.lower() == "null":
return None

return value


def convert_action_type_to_is_deleted(value: str | None) -> bool:
# Action type can be U (update) or D (delete)
# however many older records seem to not have this set at all
Expand Down
13 changes: 7 additions & 6 deletions api/src/db/models/agency_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,13 @@ class Agency(ApiSchemaTable, TimestampMixin):
ldap_group: Mapped[str]
description: Mapped[str]
label: Mapped[str]
is_multilevel_agency: Mapped[bool]
is_multiproject: Mapped[bool]
has_system_to_system_certificate: Mapped[bool]
can_view_packages_in_grace_period: Mapped[bool]
is_image_workspace_enabled: Mapped[bool]
is_validation_workspace_enabled: Mapped[bool]

is_multilevel_agency: Mapped[bool] = mapped_column(default=False)
is_multiproject: Mapped[bool] = mapped_column(default=False)
has_system_to_system_certificate: Mapped[bool] = mapped_column(default=False)
can_view_packages_in_grace_period: Mapped[bool] = mapped_column(default=False)
is_image_workspace_enabled: Mapped[bool] = mapped_column(default=False)
is_validation_workspace_enabled: Mapped[bool] = mapped_column(default=False)

link_agency_download_file_types: Mapped[list["LinkAgencyDownloadFileType"]] = relationship(
back_populates="agency", uselist=True, cascade="all, delete-orphan"
Expand Down
4 changes: 4 additions & 0 deletions api/src/db/models/staging/staging_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,7 @@ class StagingParamMixin:
)

transformation_notes: Mapped[str | None]

    @property
    def is_modified(self) -> bool:
        # A staging row counts as modified (i.e. still pending transformation)
        # until the transform process stamps transformed_at on it.
        return self.transformed_at is None
11 changes: 11 additions & 0 deletions api/src/db/models/staging/tgroups.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,14 @@

class Tgroups(StagingBase, tgroups_mixin.TGroupsMixin, StagingParamMixin):
    __tablename__ = "tgroups"

    def get_agency_code(self) -> str:
        """Extract the agency code from the keyfield.

        The keyfield is formatted as ``Agency-<AGENCY CODE>-<field name>``,
        so everything between the first and last dash-delimited tokens is
        the agency code (which may itself contain dashes).
        """
        tokens = self.keyfield.split("-")
        middle_tokens = tokens[1:-1]
        return "-".join(middle_tokens)

    def get_field_name(self) -> str:
        """Return the trailing field-name token of the keyfield."""
        _prefix, _sep, field_name = self.keyfield.rpartition("-")
        return field_name
92 changes: 91 additions & 1 deletion api/tests/src/data_migration/transformation/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from src.constants.lookup_constants import ApplicantType, FundingCategory, FundingInstrument
from src.data_migration.transformation.transform_oracle_data_task import TransformOracleDataTask
from src.db.models import staging
from src.db.models.agency_models import Agency
from src.db.models.opportunity_models import (
LinkOpportunitySummaryApplicantType,
LinkOpportunitySummaryFundingCategory,
Expand Down Expand Up @@ -299,13 +300,42 @@ def setup_funding_category(
return source_funding_category


def setup_agency(
    agency_code: str,
    create_existing: bool,
    is_already_processed: bool = False,
    deleted_fields: set | None = None,
    already_processed_fields: set | None = None,
    source_values: dict | None = None,
):
    """Create tgroups staging rows for an agency, optionally alongside an existing Agency record.

    Returns the list of created tgroups staging rows.
    """
    extra_values = {} if source_values is None else source_values

    tgroup_records = f.create_tgroups_agency(
        agency_code,
        is_already_processed=is_already_processed,
        deleted_fields=deleted_fields,
        already_processed_fields=already_processed_fields,
        **extra_values,
    )

    # Pre-create the destination Agency so the transform exercises the update path.
    if create_existing:
        f.AgencyFactory.create(agency_code=agency_code)

    return tgroup_records


def validate_matching_fields(
source, destination, fields: list[Tuple[str, str]], expect_all_to_match: bool
):
mismatched_fields = []

for source_field, destination_field in fields:
source_value = getattr(source, source_field)
if isinstance(source, dict):
source_value = source.get(source_field)
else:
source_value = getattr(source, source_field)

destination_value = getattr(destination, destination_field)

# Some fields that we copy in are datetime typed (although behave as dates and we convert as such)
Expand Down Expand Up @@ -657,3 +687,63 @@ def validate_funding_category(
[("creator_id", "created_by"), ("last_upd_id", "updated_by")],
expect_values_to_match,
)


# (tgroups field name, Agency model attribute) pairs compared by validate_agency.
AGENCY_FIELD_MAPPING = [
    ("AgencyName", "agency_name"),
    ("AgencyCode", "sub_agency_code"),
    ("AgencyCFDA", "assistance_listing_number"),
    ("ldapGp", "ldap_group"),
    ("description", "description"),
    ("label", "label"),
]

# (tgroups field name, AgencyContactInfo model attribute) pairs compared by validate_agency.
AGENCY_CONTACT_FIELD_MAPPING = [
    ("AgencyContactName", "contact_name"),
    ("AgencyContactAddress1", "address_line_1"),
    ("AgencyContactCity", "city"),
    ("AgencyContactState", "state"),
    ("AgencyContactZipCode", "zip_code"),
    ("AgencyContactTelephone", "phone_number"),
    ("AgencyContactEMail", "primary_email"),
]


def validate_agency(
    db_session,
    source_tgroups: list[staging.tgroups.Tgroups],
    expect_in_db: bool = True,
    expect_values_to_match: bool = True,
    is_test_agency: bool = False,
    non_matching_fields: set | None = None,
):
    """Assert that an Agency record transformed from tgroups rows matches expectations.

    Looks up the Agency by the agency code derived from the first tgroups row,
    then compares agency and agency-contact fields against the tgroups values,
    excluding any fields listed in non_matching_fields.
    """
    agency_code = source_tgroups[0].get_agency_code()
    agency = (
        db_session.query(Agency).filter(Agency.agency_code == agency_code).one_or_none()
    )

    if not expect_in_db:
        assert agency is None
        return

    assert agency is not None

    # Restructure the tgroups rows into a field-name -> value dict
    tgroup_map = {}
    for tgroup in source_tgroups:
        tgroup_map[tgroup.get_field_name()] = tgroup.value

    def _without_excluded(mapping):
        # Drop any mapping pairs whose source field is expected not to match
        if non_matching_fields is None:
            return mapping
        return [pair for pair in mapping if pair[0] not in non_matching_fields]

    validate_matching_fields(
        tgroup_map, agency, _without_excluded(AGENCY_FIELD_MAPPING), expect_values_to_match
    )
    assert agency.is_test_agency == is_test_agency

    validate_matching_fields(
        tgroup_map,
        agency.agency_contact_info,
        _without_excluded(AGENCY_CONTACT_FIELD_MAPPING),
        expect_values_to_match,
    )
Loading

0 comments on commit b5ff8c8

Please sign in to comment.