Skip to content

Commit

Permalink
fix/metadata_from_g_sheet: Added checks suggested in google sheet
Browse files Browse the repository at this point in the history
  • Loading branch information
venu-sambarapu-DS committed Apr 23, 2024
1 parent 4023e9f commit 0867f1f
Show file tree
Hide file tree
Showing 4 changed files with 264 additions and 101 deletions.
80 changes: 62 additions & 18 deletions app/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ class Settings(BaseSettings):
SERVICE_ACCOUNT_CONF: Dict[str, str] = {"<CHANGE_ME>": "<CHANGE_ME>"}
GSHEET_SCOPES: List[str] = ["https://www.googleapis.com/auth/spreadsheets"]

# Metadata File Parameters
METADATA_COLUMN_ORDER_STRING = ""

class Config:
env_file = ".env"

Expand Down Expand Up @@ -397,7 +400,7 @@ class MetadataSettings(BaseSettings):

SECTOR_KEYWORD = "sector"
ORGANIZATION_KEYWORD = "organization"
SHORT_FORM_KEYWORD = "short_form"
# SHORT_FORM_KEYWORD = "short_form"

DESCRIPTION_KEYWORD = "description"
DATASET_NAME_FOR_FACTLY_KEYWORD = "dataset_name_for_factly"
Expand All @@ -412,66 +415,107 @@ class MetadataSettings(BaseSettings):
VARIABLE_MEASURED_KEYWORD = "variable_measured"
DATA_NEXT_UPDATE_KEYWORD = "data_next_update"
SOURCE_KEYWORD = "source"
SECTOR_EXPECTATION = {
DATASET_NAME_FOR_FACTLY_EXPECTATION = {
"data_asset_type": None,
"expectation_suite_name": "sector_expectation_suite",
"expectation_suite_name": "dataset_name_for_factly_expectation_suite",
"expectations": [
{
"expectation_type": "expect_column_values_to_be_in_set",
"expectation_type": "expect_column_value_lengths_to_be_between",
"kwargs": {
"column": "sector",
"value_set": [],
"column": "dataset_name_for_factly",
"min_value": 5,
"max_value": 200,
"result_format": "SUMMARY",
},
"meta": {
"expectation_name": "Sector Name in set of values",
"expectation_name": "Dataset Name For Factly Length",
"cleaning_pdf_link": "https://wp.me/ad1WQ9-dvg",
"expectation_error_message": "Sector Name should be from the Data Dictionary",
"expectation_error_message": "Dataset Name For Factly Length should be less than 200",
},
}
],
}

ORGANIZATION_EXPECTATION = {
DESCRIPTION_EXPECTATION = {
"data_asset_type": None,
"expectation_suite_name": "organization_expectation_suite",
"expectation_suite_name": "description_expectation_suite",
"expectations": [
{
"expectation_type": "expect_column_value_lengths_to_be_between",
"kwargs": {
"column": "description",
"min_value": 50,
"max_value": 5000,
"result_format": "SUMMARY",
},
"meta": {
"expectation_name": "Description Length",
"cleaning_pdf_link": "https://wp.me/ad1WQ9-dvg",
"expectation_error_message": "Description should be grater than 50",
},
}
],
}
SECTOR_EXPECTATION = {
"data_asset_type": None,
"expectation_suite_name": "sector_expectation_suite",
"expectations": [
{
"expectation_type": "expect_column_values_to_be_in_set",
"kwargs": {
"column": "organization",
"column": "sector",
"value_set": [],
"result_format": "SUMMARY",
},
"meta": {
"expectation_name": "Organization Name in set of values",
"expectation_name": "Sector Name in set of values",
"cleaning_pdf_link": "https://wp.me/ad1WQ9-dvg",
"expectation_error_message": "Organization Name should be from the Data Dictionary",
"expectation_error_message": "Sector Name should be from the Data Dictionary",
},
}
],
}

SHORT_FORM_EXPECTATION = {
ORGANIZATION_EXPECTATION = {
"data_asset_type": None,
"expectation_suite_name": "short_form_expectation_suite",
"expectation_suite_name": "organization_expectation_suite",
"expectations": [
{
"expectation_type": "expect_column_values_to_be_in_set",
"kwargs": {
"column": "short_form",
"column": "organization",
"value_set": [],
"result_format": "SUMMARY",
},
"meta": {
"expectation_name": "Short Form in set of values",
"expectation_name": "Organization Name in set of values",
"cleaning_pdf_link": "https://wp.me/ad1WQ9-dvg",
"expectation_error_message": "Short Form should be from the Data Dictionary",
"expectation_error_message": "Organization Name should be from the Data Dictionary",
},
}
],
}

# SHORT_FORM_EXPECTATION = {
# "data_asset_type": None,
# "expectation_suite_name": "short_form_expectation_suite",
# "expectations": [
# {
# "expectation_type": "expect_column_values_to_be_in_set",
# "kwargs": {
# "column": "short_form",
# "value_set": [],
# "result_format": "SUMMARY",
# },
# "meta": {
# "expectation_name": "Short Form in set of values",
# "cleaning_pdf_link": "https://wp.me/ad1WQ9-dvg",
# "expectation_error_message": "Short Form should be from the Data Dictionary",
# },
# }
# ],
# }

FREQUENCY_OF_UPDATE_EXPECTATION = {
"data_asset_type": None,
"expectation_suite_name": "frequency_of_update_expectation_suite",
Expand Down
14 changes: 7 additions & 7 deletions app/utils/column_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,9 +170,9 @@ async def find_metadata_columns(columns: set):
organization_pattern = re.compile(
r".*({}).*".format(metadata_settings.ORGANIZATION_KEYWORD)
)
short_form_pattern = re.compile(
r".*({}).*".format(metadata_settings.SHORT_FORM_KEYWORD)
)
# short_form_pattern = re.compile(
# r".*({}).*".format(metadata_settings.SHORT_FORM_KEYWORD)
# )
description_pattern = re.compile(
r".*({}).*".format(metadata_settings.DESCRIPTION_KEYWORD)
)
Expand Down Expand Up @@ -217,9 +217,9 @@ async def find_metadata_columns(columns: set):
organization_column, columns = extract_pattern_from_columns(
columns, organization_pattern
)
short_form_column, columns = extract_pattern_from_columns(
columns, short_form_pattern
)
# short_form_column, columns = extract_pattern_from_columns(
# columns, short_form_pattern
# )
description_column, columns = extract_pattern_from_columns(
columns, description_pattern
)
Expand Down Expand Up @@ -261,7 +261,7 @@ async def find_metadata_columns(columns: set):
return {
"sector": list(sector_column),
"organization": list(organization_column),
"short_form": list(short_form_column),
# "short_form": list(short_form_column),
"description": list(description_column),
"tags": list(tags_column),
"temporal_coverage": list(temporal_coverage_column),
Expand Down
26 changes: 26 additions & 0 deletions app/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,18 @@ async def modify_default_expectation_suite(
return expectation_suite


async def modify_column_order_expectation_suite(
    expectation_suite: dict, column_order: list
):
    """Apply an expected column order to an expectation suite.

    Every expectation whose ``expectation_type`` is
    ``expect_table_columns_to_match_ordered_list`` gets its
    ``kwargs["column_list"]`` replaced by ``column_order``. All other
    expectations are carried over unchanged.

    Args:
        expectation_suite: Suite dict holding an ``"expectations"`` iterable
            of expectation dicts.
        column_order: Column names, in the order the table must match.

    Returns:
        The same ``expectation_suite`` object. Its ``"expectations"`` entry
        is rebound to a new list containing the same expectation dicts, which
        are updated in place.
    """
    ordered_list_type = "expect_table_columns_to_match_ordered_list"
    # Copy into a fresh list first so the suite ends up with a new list
    # object, while the expectation dicts themselves are mutated in place.
    updated = list(expectation_suite["expectations"])
    for item in updated:
        if item["expectation_type"] != ordered_list_type:
            continue
        item["kwargs"]["column_list"] = column_order
    expectation_suite["expectations"] = updated
    return expectation_suite


async def modify_values_to_be_in_between(
changed_config: dict, default_config: str
):
Expand All @@ -126,6 +138,20 @@ async def modify_values_to_be_in_between(
return default_config


async def modify_values_length_to_be_between(
    changed_config: dict, default_config: dict
):
    """Override the length bounds of a default expectation suite.

    For every expectation in ``default_config["expectations"]`` whose
    ``expectation_type`` is ``expect_column_value_lengths_to_be_between``,
    the expectation's ``kwargs`` are updated with the mapping stored under
    that same key in ``changed_config``. Other expectations are untouched.

    Args:
        changed_config: Mapping keyed by expectation type. The entry for
            ``expect_column_value_lengths_to_be_between`` holds the kwargs
            to apply.
        default_config: Expectation suite dict with an ``"expectations"``
            iterable. It is mutated in place.

    Returns:
        The same ``default_config`` object, after the in-place update.

    Raises:
        KeyError: If a matching expectation exists but ``changed_config``
            has no ``expect_column_value_lengths_to_be_between`` entry.
    """
    expectation_type = "expect_column_value_lengths_to_be_between"
    for expectation in default_config["expectations"]:
        if expectation["expectation_type"] == expectation_type:
            # Look the override up only when it is needed, so a suite with
            # no matching expectation never requires the key to be present.
            expectation["kwargs"].update(changed_config[expectation_type])
    return default_config


async def modify_values_to_be_in_set(
changed_config: dict, default_config: str
):
Expand Down
Loading

0 comments on commit 0867f1f

Please sign in to comment.