-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcommon.py
218 lines (185 loc) · 7.04 KB
/
common.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
import logging
import re
from io import BytesIO
from typing import Union
import great_expectations as ge
import pandas as pd
from charset_normalizer import from_bytes
from fastapi.logger import logger
from app.core.config import APP_DIR, GeographySettings
# Configure the root logger at INFO level as soon as this module is imported.
logging.basicConfig(level=logging.INFO)
# Module-level settings instance. It is not referenced by the code visible in
# this part of the file; presumably it is consumed by importers (TODO confirm).
geographic_settings = GeographySettings()
async def get_file(session, url):
    """Fetch ``url`` through ``session`` and return the raw response body.

    ``session.get(url)`` is used as an async context manager and the body is
    obtained by awaiting ``response.read()``.
    """
    async with session.get(url) as resp:
        body = await resp.read()
        return body
def get_encoding(obj):
    """Return the encoding name of the best charset match detected for ``obj``.

    ``obj`` is handed to ``charset_normalizer.from_bytes``; the ``encoding``
    attribute of the result of ``.best()`` is returned without further checks.
    """
    best_match = from_bytes(obj).best()
    return best_match.encoding
def _read_csv_bytes(payload, label):
    """Parse CSV ``payload`` bytes, retrying with a detected encoding.

    The first attempt uses ``ge.read_csv`` defaults; on ``UnicodeDecodeError``
    the encoding is detected with ``get_encoding`` and the read is retried.
    ``label`` is only used in the log messages.
    """
    try:
        dataset = ge.read_csv(BytesIO(payload))
    except UnicodeDecodeError:
        encoding = get_encoding(obj=payload)
        dataset = ge.read_csv(BytesIO(payload), encoding=encoding)
        logger.info(f"Dataset read from : {label} with non-utf8 encoding")
    else:
        logger.info(f"Dataset read from : {label}")
    return dataset


async def read_dataset(
    source: str,
    s3_client=None,
    bucket_name: Union[str, None] = None,
    is_file: bool = False,
    **kwargs,
) -> ge.dataset.pandas_dataset.PandasDataset:
    """Read a CSV into a Great Expectations dataset from one of three sources.

    The source is selected in this order:

    * ``s3_client`` is truthy: ``source`` is the object name and the object is
      fetched with ``s3_client.get_object(bucket_name, source)``. The response
      is expected to expose ``.data``, ``.close()`` and ``.release_conn()``
      (presumably a MinIO/urllib3-style response; confirm against callers).
    * ``is_file`` is true: ``source`` is an upload-like object exposing
      ``.file.read()`` and ``.filename`` (despite the ``str`` annotation).
    * otherwise: ``source`` is passed directly to ``ge.read_csv`` together
      with the remaining ``kwargs``. A ``session`` keyword argument is
      required on this path; it is removed from ``kwargs`` and used to
      download the raw bytes if the default decoding fails.

    On every path the first read uses the default decoding; when that raises
    ``UnicodeDecodeError`` the encoding is detected with ``get_encoding`` and
    the read is retried from the raw bytes.

    Returns:
        The dataset produced by ``ge.read_csv``.

    Raises:
        KeyError: if ``session`` is missing on the plain-source path.
        Exception: any error raised while fetching or parsing is logged and
            re-raised, instead of surfacing later as an ``UnboundLocalError``
            on the unset result.
    """
    if s3_client:
        # dataset should be downloaded from s3 storage
        response = None
        try:
            response = s3_client.get_object(bucket_name, source)
            dataset = _read_csv_bytes(response.data, source)
        except Exception as e:
            logger.error(f"Error reading Dataset from : {source}: {e}")
            raise
        finally:
            # get_object itself may have failed, leaving nothing to release.
            if response is not None:
                response.close()
                response.release_conn()
    elif is_file:
        try:
            file = source.file.read()
            dataset = _read_csv_bytes(file, source.filename)
        except Exception as e:
            logger.error(f"Error reading Dataset from : {source.filename}: {e}")
            raise
    else:
        # Must be removed from kwargs so it is not forwarded to ge.read_csv.
        session = kwargs.pop("session")
        try:
            try:
                dataset = ge.read_csv(source, **kwargs)
            except UnicodeDecodeError:
                # Re-download the raw bytes so the encoding can be detected.
                file = await get_file(url=source, session=session)
                encoding = get_encoding(obj=file)
                dataset = ge.read_csv(BytesIO(file), encoding=encoding)
                logger.info(
                    f"Dataset read from : {source} with non-utf8 encoding"
                )
            else:
                logger.info(f"Dataset read from : {source}")
        except Exception as e:
            logger.error(f"Error reading Dataset from : {source}: {e}")
            raise
    return dataset
async def read_pandas_dataset(source: str, **kwargs):
    """Read ``source`` with ``pandas.read_csv`` and return the result.

    All extra keyword arguments are forwarded unchanged to ``pd.read_csv``.
    """
    return pd.read_csv(source, **kwargs)
async def load_values_to_be_in_set(domain: str):
    """Load the allowed values for ``domain`` from its bundled CSV file.

    The file ``<APP_DIR>/core/<domain>.csv`` is read with pandas and the
    unique values of the column named after ``domain`` are returned (e.g. the
    set of states or countries that values are required to belong to).
    """
    csv_path = APP_DIR / "core" / f"{domain}.csv"
    column_name = f"{domain}"
    frame = pd.read_csv(csv_path)
    return frame[column_name].unique()
async def modify_column_names_to_expectation_suite(
    expectation_suite: dict, expectation_config: dict
):
    """Merge ``expectation_config`` into the kwargs of every expectation.

    Each entry of ``expectation_suite["expectations"]`` has its ``"kwargs"``
    dict updated in place; the entries are then stored back as a fresh list
    and the same ``expectation_suite`` object is returned.
    """
    entries = list(expectation_suite["expectations"])
    for entry in entries:
        entry["kwargs"].update(expectation_config)
    expectation_suite["expectations"] = entries
    return expectation_suite
async def modify_default_expectation_suite(
    expectation_suite: dict, expectation_config: dict
):
    """Apply per-expectation-type kwarg overrides to a suite.

    ``expectation_config`` maps an expectation type to a dict of kwargs. Every
    expectation whose ``"expectation_type"`` is a key of that mapping has its
    ``"kwargs"`` updated in place; all other expectations are kept untouched.
    The same ``expectation_suite`` object is returned.
    """
    entries = list(expectation_suite["expectations"])
    for entry in entries:
        kind = entry["expectation_type"]
        if kind in expectation_config:
            entry["kwargs"].update(expectation_config[kind])
    expectation_suite["expectations"] = entries
    return expectation_suite
async def modify_column_order_expectation_suite(
    expectation_suite: dict, column_order: list
):
    """Set the expected column order on the ordered-list expectation(s).

    Every expectation of type ``expect_table_columns_to_match_ordered_list``
    gets ``kwargs["column_list"]`` replaced by ``column_order``; other
    expectations are left as they are. The same ``expectation_suite`` object
    is returned.
    """
    ordered_list_type = "expect_table_columns_to_match_ordered_list"
    entries = list(expectation_suite["expectations"])
    for entry in entries:
        if entry["expectation_type"] != ordered_list_type:
            continue
        entry["kwargs"]["column_list"] = column_order
    expectation_suite["expectations"] = entries
    return expectation_suite
async def modify_values_to_be_in_between(
    changed_config: dict, default_config: dict
):
    """Override kwargs of ``expect_column_values_to_be_between`` expectations.

    Args:
        changed_config: mapping holding the replacement kwargs under the key
            ``"expect_column_values_to_be_between"``.
        default_config: expectation suite dict with an ``"expectations"``
            list; matching entries are updated in place.

    Returns:
        The same ``default_config`` object.

    Raises:
        KeyError: if a matching expectation exists but ``changed_config`` has
            no ``"expect_column_values_to_be_between"`` entry.
    """
    target_type = "expect_column_values_to_be_between"
    for expectation in default_config["expectations"]:
        if expectation["expectation_type"] == target_type:
            # Looked up per match, so a suite without this expectation does
            # not require the key to be present in changed_config.
            expectation["kwargs"].update(changed_config[target_type])
    return default_config
async def modify_values_length_to_be_between(
    changed_config: dict, default_config: dict
):
    """Override kwargs of ``expect_column_value_lengths_to_be_between``.

    Args:
        changed_config: mapping holding the replacement kwargs under the key
            ``"expect_column_value_lengths_to_be_between"``.
        default_config: expectation suite dict with an ``"expectations"``
            list; matching entries are updated in place.

    Returns:
        The same ``default_config`` object.

    Raises:
        KeyError: if a matching expectation exists but ``changed_config`` has
            no ``"expect_column_value_lengths_to_be_between"`` entry.
    """
    target_type = "expect_column_value_lengths_to_be_between"
    for expectation in default_config["expectations"]:
        if expectation["expectation_type"] == target_type:
            # Looked up per match, so a suite without this expectation does
            # not require the key to be present in changed_config.
            expectation["kwargs"].update(changed_config[target_type])
    return default_config
async def modify_values_to_be_in_set(
    changed_config: dict, default_config: dict
):
    """Override kwargs of ``expect_column_values_to_be_in_set`` expectations.

    Args:
        changed_config: mapping holding the replacement kwargs under the key
            ``"expect_column_values_to_be_in_set"``.
        default_config: expectation suite dict with an ``"expectations"``
            list; matching entries are updated in place.

    Returns:
        The same ``default_config`` object.

    Raises:
        KeyError: if a matching expectation exists but ``changed_config`` has
            no ``"expect_column_values_to_be_in_set"`` entry.
    """
    target_type = "expect_column_values_to_be_in_set"
    for expectation in default_config["expectations"]:
        if expectation["expectation_type"] == target_type:
            # Looked up per match, so a suite without this expectation does
            # not require the key to be present in changed_config.
            expectation["kwargs"].update(changed_config[target_type])
    return default_config
async def modify_values_to_match_regex_list(
    changed_config: dict, default_config: dict
):
    """Override kwargs of ``expect_column_values_to_match_regex_list``.

    Args:
        changed_config: mapping holding the replacement kwargs under the key
            ``"expect_column_values_to_match_regex_list"``.
        default_config: expectation suite dict with an ``"expectations"``
            list; matching entries are updated in place.

    Returns:
        The same ``default_config`` object.

    Raises:
        KeyError: if a matching expectation exists but ``changed_config`` has
            no ``"expect_column_values_to_match_regex_list"`` entry.
    """
    target_type = "expect_column_values_to_match_regex_list"
    for expectation in default_config["expectations"]:
        if expectation["expectation_type"] == target_type:
            # Looked up per match, so a suite without this expectation does
            # not require the key to be present in changed_config.
            expectation["kwargs"].update(changed_config[target_type])
    return default_config
async def modify_values_to_match_regex(
    changed_config: dict, default_config: dict
):
    """Override kwargs of ``expect_column_values_to_match_regex`` expectations.

    Args:
        changed_config: mapping holding the replacement kwargs under the key
            ``"expect_column_values_to_match_regex"``.
        default_config: expectation suite dict with an ``"expectations"``
            list; matching entries are updated in place.

    Returns:
        The same ``default_config`` object.

    Raises:
        KeyError: if a matching expectation exists but ``changed_config`` has
            no ``"expect_column_values_to_match_regex"`` entry.
    """
    target_type = "expect_column_values_to_match_regex"
    for expectation in default_config["expectations"]:
        if expectation["expectation_type"] == target_type:
            # Looked up per match, so a suite without this expectation does
            # not require the key to be present in changed_config.
            expectation["kwargs"].update(changed_config[target_type])
    return default_config
async def modify_values_to_match_strftime_format(
    changed_config: dict, default_config: dict
):
    """Override kwargs of ``expect_column_values_to_match_strftime_format``.

    Args:
        changed_config: mapping holding the replacement kwargs under the key
            ``"expect_column_values_to_match_strftime_format"``.
        default_config: expectation suite dict with an ``"expectations"``
            list; matching entries are updated in place.

    Returns:
        The same ``default_config`` object.

    Raises:
        KeyError: if a matching expectation exists but ``changed_config`` has
            no ``"expect_column_values_to_match_strftime_format"`` entry.
    """
    target_type = "expect_column_values_to_match_strftime_format"
    for expectation in default_config["expectations"]:
        if expectation["expectation_type"] == target_type:
            # Looked up per match, so a suite without this expectation does
            # not require the key to be present in changed_config.
            expectation["kwargs"].update(changed_config[target_type])
    return default_config
def slugify(text: str):
    """Return a lowercase slug of ``text``.

    After lowercasing, every character other than ``/``, ``.``, ``a-z``,
    ``0-9``, ``_`` and ``-`` is replaced by ``-``, and runs of two or more
    hyphens are collapsed into a single one. Leading and trailing hyphens are
    kept.
    """
    hyphenated = re.sub(r"[^/.a-z0-9_-]", "-", text.lower())
    return re.sub(r"-{2,}", "-", hyphenated)