Skip to content

Commit

Permalink
Merge pull request #41 from factly/fix/sheet_issues
Browse files Browse the repository at this point in the history
Fix/sheet issues
  • Loading branch information
paul-tharun authored Mar 19, 2024
2 parents 06ef96a + 2a121b0 commit 4023e9f
Show file tree
Hide file tree
Showing 9 changed files with 355 additions and 18 deletions.
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
repos:
- repo: https://github.com/psf/black
rev: 22.3.0
rev: 24.3.0
hooks:
- id: black
language_version: python3
- repo: https://github.com/pycqa/flake8
rev: 6.1.0
rev: 7.0.0
hooks:
- id: flake8
- repo: https://github.com/timothycrosley/isort
rev: 5.12.0
rev: 5.13.2
hooks:
- id: isort
15 changes: 11 additions & 4 deletions app/api/api_v1/routers/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from app.models.enums import ExpectationResultType
from app.models.metadata_gsheet import MetadataGsheetRequest
from app.utils.common import read_dataset
from app.utils.gsheets import get_records_from_gsheets
from app.utils.metadata import metadata_expectation_suite

Expand All @@ -20,16 +21,22 @@ async def execute_metadata_expectation_from_file(
ExpectationResultType.SUMMARY,
description="Level of Details for a Expectation result",
),
datasets: UploadFile = File(...),
file: UploadFile = File(...),
):

# read the dataset from uploaded CSV file
logger.info(f"dataset: {datasets.filename}")
df = pd.read_csv(datasets.file)
logger.info(f"dataset: {file.filename}")
dataset = await read_dataset(file, is_file=True)
# df = pd.read_csv(datasets.file)

# # metadata expectation
# expectation = await metadata_expectation_suite(
# df, result_type, dataset_name=datasets.filename
# )

# metadata expectation
expectation = await metadata_expectation_suite(
df, result_type, dataset_name=datasets.filename
dataset, result_type, dataset_name=file.filename
)

return expectation
Expand Down
45 changes: 41 additions & 4 deletions app/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,11 @@ class Settings(BaseSettings):
MODE: str = "development"
DOCS_URL: str = "/api/docs"
EXAMPLE_FOLDER: str = "/Users/somitragupta/factly/news-room-datasets"
EXAMPLE_URL: str = "/Users/somitragupta/factly/factly-datasets/projects/rbi/\
EXAMPLE_URL: str = (
"/Users/somitragupta/factly/factly-datasets/projects/rbi/\
data/processed/1_timeseries/5_handbook-of-statistics-on-the-indian-economy/\
hbs-mb-scb-select-aggregates-weekly/output.csv"
)
EXAMPLE_URL_COUNTRY: str = """https://storage.factly.org/mande/\
edu-ministry/data/processed/statistics/1_AISHE_report/19_enrolment_foreign/output.csv"""
EXAMPLE_URL_STATE: str = """https://storage.factly.org/mande/edu-ministry/data/\
Expand Down Expand Up @@ -298,7 +300,7 @@ class NoteSettings(BaseSettings):
{
"expectation_type": "expect_column_values_to_match_regex_list",
"kwargs": {
"column": "unit",
"column": "note",
"regex_list": [",?.+?:[^,]+[,]?"],
"result_format": "SUMMARY",
},
Expand All @@ -314,7 +316,33 @@ class NoteSettings(BaseSettings):

class CustomExpectationsSettings(BaseSettings):

NULL_DATETIME_VALUE_NAME: str = "Null date values Flag - {column}"
NULL_DATETIME_VALUE_MSG: str = (
"Null values should not be permitted for datetime values"
)

NUMERIC_COLUMNS_TYPES = ["float64", "int64"]
NUMERIC_VALUES_PATTERN = re.compile(r"^-?\d+(\.\d{1,2})?$")
NUMERIC_EXPECTATION_NAME: str = (
"Numeric values in specific pattern - {column}"
)
NUMERIC_EXPECTATION_ERR_MSG: str = (
"Numeric values should be in proper format both integer and float(roundoff to two decimal places)"
)

NEGATIVE_NUMERIC_VALUES_PATTERN = re.compile(r"^-\d+(\.\d{1,})?$")
NEGATIVE_NUMERIC_EXPECTATION_NAME: str = (
"Negative Numeric values Flag - {column}"
)
NEGATIVE_NUMERIC_EXPECTATION_ERR_MSG: str = (
"Flag Numeric values that are negative"
)

COLUMN_NAMES_PATTERN = re.compile(r"^[a-z]+(?:_[a-z]+)*$")
COLUMN_NAMES_EXPECTATION_NAME: str = "Column names in specific pattern"
COLUMN_NAMES_EXPECTATION_ERR_MSG: str = (
"Column names should be in lower case and separated by underscore - {column}"
)

TRAIL_OR_LEAD_WHITESPACE_PATTERN = re.compile(r"^\s+.*|.*\s+$")
LEADING_TRAILING_WHITE_SPACE_EXPECTATION_NAME: str = (
Expand All @@ -334,7 +362,9 @@ class CustomExpectationsSettings(BaseSettings):
SPECIAL_CHARACTER_EXPECTATION_NAME: str = (
"No special characters in Columns"
)
SPECIAL_CHARACTER_EXPECTATION_ERR_MSG: str = "There should be no special character in the category name and measured value, like Telangana** , and any additional information should be captured in notes instead of using a special character"
SPECIAL_CHARACTER_EXPECTATION_ERR_MSG: str = (
"There should be no special character in the category name and measured value, like Telangana** , and any additional information should be captured in notes instead of using a special character"
)

BRACKET_PATTERN = re.compile(r".*([\[\(].+?[\)\]]).*")
BRACKETS_EXPECTATION_NAME: str = "No unnecessary brackets in Categories"
Expand All @@ -358,7 +388,9 @@ class CustomExpectationsSettings(BaseSettings):

MINIMUM_DATASET_OBSERVATION_THRESH: int = 10
OBSERVATIONS_MORE_THAN_THRESH_NAME: str = "Minimum required observation"
OBSERVATIONS_MORE_THAN_THRESH_MSG: str = "Generally the datasets must be more a threshold number of observation ({thresh})"
OBSERVATIONS_MORE_THAN_THRESH_MSG: str = (
"Generally the datasets must be more a threshold number of observation ({thresh})"
)


class MetadataSettings(BaseSettings):
Expand Down Expand Up @@ -504,6 +536,11 @@ class MetadataSettings(BaseSettings):
],
}

DESCRIPTION_NAME: str = "Description"
DESCRIPTION_ERROR_MSG: str = (
"Description should be in the range of 50 to 5000"
)

TIME_SAVED_IN_HOURS_NAME: str = "Null values in columns - {column}"
TIME_SAVED_IN_HOURS_MSG: str = (
"Null values should not present in these columns"
Expand Down
1 change: 0 additions & 1 deletion app/core/sector.csv
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,3 @@ Youth and Sports
Banking
Trade
Water Resources
Youth and Sports
56 changes: 56 additions & 0 deletions app/expectations/custom_expectations.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import logging
from datetime import date

import numpy as np
import pandas as pd
from great_expectations.dataset import MetaPandasDataset, PandasDataset

from app.core.config import CustomExpectationsSettings

custom_expectation_settings = CustomExpectationsSettings()

CURRENT_YEAR = str(date.today().year)
logging.basicConfig(level=logging.INFO)


class GenericCustomExpectations(PandasDataset):
Expand Down Expand Up @@ -78,3 +81,56 @@ def expect_multicolumn_dataset_to_have_more_than_x_rows(self, column_list):
),
length,
)

@MetaPandasDataset.multicolumn_map_expectation
def expect_numerical_values_to_be_in_specific_pattern(
    self,
    column_list,
    pattern=custom_expectation_settings.NUMERIC_VALUES_PATTERN,
    meta={
        "expectation_name": "Numeric values in specific pattern",
    },
    include_meta=True,
):
    """Test whether stringified cell values match ``pattern``.

    Each cell of ``column_list`` is converted with ``str`` and checked with
    ``pattern.match``. Although every column is evaluated, only the boolean
    result for the first column of ``column_list`` is returned.

    Args:
        column_list: DataFrame-like object; ``applymap`` and ``columns``
            are used on it.
        pattern: Object exposing ``match``; defaults to
            ``custom_expectation_settings.NUMERIC_VALUES_PATTERN``.
        meta: Not read in this body.
        include_meta: Not read in this body.

    Returns:
        The first column of the element-wise match result (``True`` where
        the value matched).

    NOTE(review): ``meta`` / ``include_meta`` are presumably consumed by
    the expectation decorator - confirm before changing their defaults.
    """

    def _is_match(cell):
        # str() so non-string cells (numbers, NaN) can be matched as text.
        return bool(pattern.match(str(cell)))

    match_frame = column_list.applymap(_is_match)
    first_label = match_frame.columns[0]
    return match_frame[first_label]

@MetaPandasDataset.multicolumn_map_expectation
def flag_negative_numerical_values(
    self,
    column_list,
    pattern=custom_expectation_settings.NEGATIVE_NUMERIC_VALUES_PATTERN,
    meta={
        "expectation_name": "Negative Numeric values Flag",
    },
    include_meta=True,
):
    """Flag cells whose stringified value matches ``pattern``.

    Each cell of ``column_list`` is converted with ``str`` and checked with
    ``pattern.match``. A match yields ``False`` and a non-match yields
    ``True``. Only the result for the first column of ``column_list`` is
    returned.

    Args:
        column_list: DataFrame-like object; ``applymap`` and ``columns``
            are used on it.
        pattern: Object exposing ``match``; defaults to
            ``custom_expectation_settings.NEGATIVE_NUMERIC_VALUES_PATTERN``.
        meta: Not read in this body.
        include_meta: Not read in this body.

    Returns:
        The first column of the element-wise result (``False`` where the
        value matched ``pattern``).

    NOTE(review): ``meta`` / ``include_meta`` are presumably consumed by
    the expectation decorator - confirm before changing their defaults.
    """

    def _is_not_match(cell):
        # Inverted on purpose: a match marks the value as flagged (False).
        return not pattern.match(str(cell))

    flag_frame = column_list.applymap(_is_not_match)
    first_label = flag_frame.columns[0]
    return flag_frame[first_label]

@MetaPandasDataset.multicolumn_map_expectation
def expect_column_names_to_be_in_specific_pattern(
    self,
    column_list,
    pattern=custom_expectation_settings.COLUMN_NAMES_PATTERN,
    meta={
        "expectation_name": "Values in specific pattern",
    },
    include_meta=True,
    find_columns=False,
):
    """Check that every column name of ``column_list`` matches ``pattern``.

    The column labels (not the cell values) are converted with ``str`` and
    tested with ``pattern.match``.

    Args:
        column_list: Object exposing ``columns``.
        pattern: Object exposing ``match``; defaults to
            ``custom_expectation_settings.COLUMN_NAMES_PATTERN``.
        meta: Not read in this body.
        include_meta: Not read in this body.
        find_columns: Not read in this body.

    Returns:
        The result of ``Series.all()`` over the per-name match flags, i.e.
        a single truth value rather than one value per row.

    NOTE(review): the other expectations here return a per-row Series;
    confirm that the decorator is meant to receive a scalar from this one.
    """

    def _name_ok(name):
        return bool(pattern.match(str(name)))

    column_names = pd.Series(column_list.columns)
    name_flags = column_names.apply(_name_ok)
    return name_flags.all()
34 changes: 32 additions & 2 deletions app/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,11 @@ def get_encoding(obj):


async def read_dataset(
source: str, s3_client=None, bucket_name: Union[str, None] = None, **kwargs
source: str,
s3_client=None,
bucket_name: Union[str, None] = None,
is_file: bool = False,
**kwargs,
) -> ge.dataset.pandas_dataset.PandasDataset:
if s3_client:
# dataset should be downloaded from s3 storage
Expand All @@ -42,7 +46,19 @@ async def read_dataset(
finally:
response.close()
response.release_conn()

elif is_file:
try:
file = source.file.read()
dataset = ge.read_csv(BytesIO(file))
logger.info(f"Dataset read from : {source.filename}")
except UnicodeDecodeError:
encoding = get_encoding(obj=file)
dataset = ge.read_csv(BytesIO(file), encoding=encoding)
logger.info(
f"Dataset read from : {source.filename} with non-utf8 encoding"
)
except Exception as e:
logger.info(f"Error reading Dataset from : {source.filename}: {e}")
else:
session = kwargs.pop("session")
try:
Expand Down Expand Up @@ -96,6 +112,20 @@ async def modify_default_expectation_suite(
return expectation_suite


async def modify_values_to_be_in_between(
    changed_config: dict, default_config: dict
) -> dict:
    """Apply "values between" overrides to a default expectation config.

    For every entry in ``default_config["expectations"]`` whose
    ``expectation_type`` is ``"expect_column_values_to_be_between"``, the
    entry's ``kwargs`` dict is updated with
    ``changed_config["expect_column_values_to_be_between"]``.

    Args:
        changed_config: Mapping holding the override kwargs under the key
            ``"expect_column_values_to_be_between"``.
        default_config: Mapping with an ``"expectations"`` list of dicts,
            each having ``"expectation_type"`` and ``"kwargs"`` keys. It is
            modified in place.

    Returns:
        The same ``default_config`` object, after modification.

    Raises:
        KeyError: If a matching expectation exists but ``changed_config``
            has no ``"expect_column_values_to_be_between"`` entry.
    """
    target_type = "expect_column_values_to_be_between"
    for expectation in default_config["expectations"]:
        if expectation["expectation_type"] == target_type:
            # The override is looked up only when a matching expectation is
            # found, so a config without one never needs the override key.
            expectation["kwargs"].update(changed_config[target_type])
    return default_config


async def modify_values_to_be_in_set(
changed_config: dict, default_config: str
):
Expand Down
Loading

0 comments on commit 4023e9f

Please sign in to comment.