
Commit

Merge branch 'main' into dependabot/pip/production-dependencies-c52d62cd3d
LeonLuttenberger authored Sep 5, 2023
2 parents ff4901f + f19b0ad commit df21f4d
Showing 5 changed files with 94 additions and 55 deletions.
12 changes: 6 additions & 6 deletions awswrangler/catalog/_get.py
@@ -961,7 +961,7 @@ def get_columns_comments(
     query_as_of_time: Optional[str] = None,
     catalog_id: Optional[str] = None,
     boto3_session: Optional[boto3.Session] = None,
-) -> Dict[str, str]:
+) -> Dict[str, Optional[str]]:
     """Get all columns comments.

     Note
@@ -987,8 +987,8 @@ def get_columns_comments(
     Returns
     -------
-    Dict[str, str]
-        Columns comments. e.g. {"col1": "foo boo bar"}.
+    Dict[str, Optional[str]]
+        Columns comments. e.g. {"col1": "foo boo bar", "col2": None}.

     Examples
     --------
@@ -1005,12 +1005,12 @@ def get_columns_comments(
             ),
         )
     )
-    comments: Dict[str, str] = {}
+    comments: Dict[str, Optional[str]] = {}
     for c in response["Table"]["StorageDescriptor"]["Columns"]:
-        comments[c["Name"]] = c["Comment"]
+        comments[c["Name"]] = c.get("Comment")
     if "PartitionKeys" in response["Table"]:
         for p in response["Table"]["PartitionKeys"]:
-            comments[p["Name"]] = p["Comment"]
+            comments[p["Name"]] = p.get("Comment")
     return comments


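After this change, a column with no comment set in the Glue catalog yields None instead of raising a KeyError on the missing "Comment" key. A minimal usage sketch (the database and table names are hypothetical):

    import awswrangler as wr

    # Values may now be None for columns without a comment in the Glue catalog.
    comments = wr.catalog.get_columns_comments(database="my_db", table="my_table")
    for column, comment in comments.items():
        print(column, comment if comment is not None else "<no comment>")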
19 changes: 12 additions & 7 deletions awswrangler/s3/_write_parquet.py
@@ -432,9 +432,10 @@ def to_parquet( # pylint: disable=too-many-arguments,too-many-locals,too-many-b
         Useful when you have columns with undetermined or mixed data types.
         (e.g. {'col name': 'bigint', 'col2 name': 'int'})
     athena_partition_projection_settings: typing.AthenaPartitionProjectionSettings, optional
-        Parameters of the Athena Partition Projection (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html).
-        AthenaPartitionProjectionSettings is a `TypedDict`, meaning the passed parameter can be instantiated either as an
-        instance of AthenaPartitionProjectionSettings or as a regular Python dict.
+        Parameters of the Athena Partition Projection
+        (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html).
+        AthenaPartitionProjectionSettings is a `TypedDict`, meaning the passed parameter can be instantiated either as
+        an instance of AthenaPartitionProjectionSettings or as a regular Python dict.
     Following projection parameters are supported:
@@ -758,6 +759,7 @@ def store_parquet_metadata( # pylint: disable=too-many-arguments,too-many-local
     path_suffix: Optional[str] = None,
     path_ignore_suffix: Union[str, List[str], None] = None,
     ignore_empty: bool = True,
+    ignore_null: bool = False,
     dtype: Optional[Dict[str, str]] = None,
     sampling: float = 1.0,
     dataset: bool = False,
@@ -810,6 +812,8 @@ def store_parquet_metadata( # pylint: disable=too-many-arguments,too-many-local
         Suffix or List of suffixes for S3 keys to be ignored.
     ignore_empty: bool
         Ignore files with 0 bytes.
+    ignore_null: bool
+        Ignore columns with null type.
     dtype : Dict[str, str], optional
         Dictionary of columns names and Athena/Glue types to be casted.
         Useful when you have columns with undetermined data types as partitions columns.
@@ -844,9 +848,10 @@ def store_parquet_metadata( # pylint: disable=too-many-arguments,too-many-local
         Keep enabled even when working with projections is useful to keep
         Redshift Spectrum working with the regular partitions.
     athena_partition_projection_settings: typing.AthenaPartitionProjectionSettings, optional
-        Parameters of the Athena Partition Projection (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html).
-        AthenaPartitionProjectionSettings is a `TypedDict`, meaning the passed parameter can be instantiated either as an
-        instance of AthenaPartitionProjectionSettings or as a regular Python dict.
+        Parameters of the Athena Partition Projection
+        (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html).
+        AthenaPartitionProjectionSettings is a `TypedDict`, meaning the passed parameter can be instantiated either as
+        an instance of AthenaPartitionProjectionSettings or as a regular Python dict.
     Following projection parameters are supported:
@@ -934,7 +939,7 @@ def store_parquet_metadata( # pylint: disable=too-many-arguments,too-many-local
         path_suffix=path_suffix,
         path_ignore_suffix=path_ignore_suffix,
         ignore_empty=ignore_empty,
-        ignore_null=False,
+        ignore_null=ignore_null,
         use_threads=use_threads,
         s3_additional_kwargs=s3_additional_kwargs,
         boto3_session=boto3_session,
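The previously hard-coded ignore_null=False is now forwarded from the new keyword, so callers can opt in. A minimal sketch (the S3 path and Glue names are hypothetical; the dtype override mirrors the new tests below):

    import awswrangler as wr

    # With ignore_null=True, columns whose Parquet type is null are ignored
    # during schema inference instead of failing the metadata crawl.
    wr.s3.store_parquet_metadata(
        path="s3://my-bucket/my-dataset/",
        database="my_db",
        table="my_table",
        dataset=True,
        ignore_null=True,
        dtype={"c2_null": "int", "c3_null": "int"},  # cast null columns explicitly
    )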
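Since both reworded docstrings stress that AthenaPartitionProjectionSettings is a `TypedDict`, a plain dict is interchangeable with the typed form. A sketch with hypothetical projection values:

    import awswrangler as wr

    # Typed form...
    settings = wr.typing.AthenaPartitionProjectionSettings(
        projection_types={"year": "integer"},
        projection_ranges={"year": "2020,2023"},
    )
    # ...or the equivalent plain dict, accepted because it is a TypedDict.
    settings = {
        "projection_types": {"year": "integer"},
        "projection_ranges": {"year": "2020,2023"},
    }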
4 changes: 3 additions & 1 deletion building/lambda/Dockerfile
@@ -23,7 +23,9 @@ WORKDIR /root
 FROM ${python_version}
 COPY pyproject.toml poetry.lock ./

-RUN pip3 install --upgrade pip wheel && pip3 install --upgrade six cython cmake hypothesis poetry
+RUN pip3 install --upgrade pip wheel
+RUN pip3 install --upgrade urllib3==1.26.16 # temporary to avoid https://github.com/urllib3/urllib3/issues/2168 (TODO remove when the AL2 image updates to support OpenSSL 1.1.1+)
+RUN pip3 install --upgrade six cython cmake hypothesis poetry
 RUN poetry config virtualenvs.create false --local && poetry install --no-root --no-dev

 RUN rm -f pyproject.toml poetry.lock
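The pin exists because urllib3 2.x requires OpenSSL 1.1.1+, which the AL2-based Lambda image does not yet provide (per the TODO above). A quick sanity check one could run inside the image (a sketch; the printed versions depend on the base image):

    import ssl

    import urllib3

    # urllib3 2.x refuses to run against OpenSSL < 1.1.1, hence the 1.26.16 pin.
    print("OpenSSL:", ssl.OPENSSL_VERSION)  # AL2 may still ship an older OpenSSL
    print("urllib3:", urllib3.__version__)  # expected: 1.26.16 after the pin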
82 changes: 41 additions & 41 deletions poetry.lock

Some generated files are not rendered by default.

32 changes: 32 additions & 0 deletions tests/unit/test_athena_parquet.py
@@ -373,6 +373,38 @@ def test_store_metadata_partitions_sample_dataset(glue_database, glue_table, pat
     assert df.c2.sum() * num_files == df2.c2.sum()


+def test_store_metadata_ignore_null_columns(glue_database, glue_table, path):
+    df = pd.DataFrame({"c0": [0, 1, 2], "c1": [3, 4, 5], "c2_null": [None, None, None], "c3_null": [None, None, None]})
+    wr.s3.to_parquet(df=df, path=path, dataset=True, dtype={"c2_null": "int", "c3_null": "int"})
+    wr.s3.store_parquet_metadata(
+        path=path,
+        database=glue_database,
+        table=glue_table,
+        ignore_null=True,
+        dataset=True,
+        dtype={"c2_null": "int", "c3_null": "int"},
+    )
+
+
+@pytest.mark.parametrize("partition_cols", [None, ["c0"], ["c0", "c1"]])
+def test_store_metadata_ignore_null_columns_partitions(glue_database, glue_table, path, partition_cols):
+    # only partition on non-null columns
+    num_files = 10
+    df = pd.DataFrame({"c0": [0, 1, 2], "c1": [3, 4, 5], "c2_null": [None, None, None], "c3_null": [None, None, None]})
+    for _ in range(num_files):
+        wr.s3.to_parquet(
+            df=df, path=path, dataset=True, dtype={"c2_null": "int", "c3_null": "int"}, partition_cols=partition_cols
+        )
+    wr.s3.store_parquet_metadata(
+        path=path,
+        database=glue_database,
+        table=glue_table,
+        ignore_null=True,
+        dtype={"c2_null": "int", "c3_null": "int"},
+        dataset=True,
+    )
+
+
 @pytest.mark.parametrize("partition_cols", [None, ["c1"], ["c2"], ["c1", "c2"], ["c2", "c1"]])
 def test_to_parquet_reverse_partitions(glue_database, glue_table, path, partition_cols):
     df = pd.DataFrame({"c0": [0, 1, 2], "c1": [3, 4, 5], "c2": [6, 7, 8]})
