
Commit

Merge branch 'main' into dependabot/pip/production-dependencies-c52d62cd3d
LeonLuttenberger authored Sep 5, 2023
2 parents ff4901f + f19b0ad commit df21f4d
Showing 5 changed files with 94 additions and 55 deletions.
12 changes: 6 additions & 6 deletions awswrangler/catalog/_get.py
@@ -961,7 +961,7 @@ def get_columns_comments(
     query_as_of_time: Optional[str] = None,
     catalog_id: Optional[str] = None,
     boto3_session: Optional[boto3.Session] = None,
-) -> Dict[str, str]:
+) -> Dict[str, Optional[str]]:
     """Get all columns comments.

     Note
@@ -987,8 +987,8 @@ def get_columns_comments(
     Returns
     -------
-    Dict[str, str]
-        Columns comments. e.g. {"col1": "foo boo bar"}.
+    Dict[str, Optional[str]]
+        Columns comments. e.g. {"col1": "foo boo bar", "col2": None}.

     Examples
     --------
@@ -1005,12 +1005,12 @@ def get_columns_comments(
             ),
         )
     )
-    comments: Dict[str, str] = {}
+    comments: Dict[str, Optional[str]] = {}
     for c in response["Table"]["StorageDescriptor"]["Columns"]:
-        comments[c["Name"]] = c["Comment"]
+        comments[c["Name"]] = c.get("Comment")
     if "PartitionKeys" in response["Table"]:
         for p in response["Table"]["PartitionKeys"]:
-            comments[p["Name"]] = p["Comment"]
+            comments[p["Name"]] = p.get("Comment")
     return comments


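After this change, a column with no comment set in the Glue catalog yields None instead of raising a KeyError on the missing "Comment" key. A minimal usage sketch (the database and table names are hypothetical):

    import awswrangler as wr

    # Values may now be None for columns without a comment in the Glue catalog.
    comments = wr.catalog.get_columns_comments(database="my_db", table="my_table")
    for column, comment in comments.items():
        print(column, comment if comment is not None else "<no comment>")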
19 changes: 12 additions & 7 deletions awswrangler/s3/_write_parquet.py
@@ -432,9 +432,10 @@ def to_parquet( # pylint: disable=too-many-arguments,too-many-locals,too-many-b
         Useful when you have columns with undetermined or mixed data types.
         (e.g. {'col name': 'bigint', 'col2 name': 'int'})
     athena_partition_projection_settings: typing.AthenaPartitionProjectionSettings, optional
-        Parameters of the Athena Partition Projection (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html).
-        AthenaPartitionProjectionSettings is a `TypedDict`, meaning the passed parameter can be instantiated either as an
-        instance of AthenaPartitionProjectionSettings or as a regular Python dict.
+        Parameters of the Athena Partition Projection
+        (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html).
+        AthenaPartitionProjectionSettings is a `TypedDict`, meaning the passed parameter can be instantiated either as
+        an instance of AthenaPartitionProjectionSettings or as a regular Python dict.
     Following projection parameters are supported:
@@ -758,6 +759,7 @@ def store_parquet_metadata( # pylint: disable=too-many-arguments,too-many-local
     path_suffix: Optional[str] = None,
     path_ignore_suffix: Union[str, List[str], None] = None,
     ignore_empty: bool = True,
+    ignore_null: bool = False,
     dtype: Optional[Dict[str, str]] = None,
     sampling: float = 1.0,
     dataset: bool = False,
@@ -810,6 +812,8 @@ def store_parquet_metadata( # pylint: disable=too-many-arguments,too-many-local
         Suffix or List of suffixes for S3 keys to be ignored.
     ignore_empty: bool
         Ignore files with 0 bytes.
+    ignore_null: bool
+        Ignore columns with null type.
     dtype : Dict[str, str], optional
         Dictionary of columns names and Athena/Glue types to be casted.
         Useful when you have columns with undetermined data types as partitions columns.
@@ -844,9 +848,10 @@ def store_parquet_metadata( # pylint: disable=too-many-arguments,too-many-local
         Keep enabled even when working with projections is useful to keep
         Redshift Spectrum working with the regular partitions.
     athena_partition_projection_settings: typing.AthenaPartitionProjectionSettings, optional
-        Parameters of the Athena Partition Projection (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html).
-        AthenaPartitionProjectionSettings is a `TypedDict`, meaning the passed parameter can be instantiated either as an
-        instance of AthenaPartitionProjectionSettings or as a regular Python dict.
+        Parameters of the Athena Partition Projection
+        (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html).
+        AthenaPartitionProjectionSettings is a `TypedDict`, meaning the passed parameter can be instantiated either as
+        an instance of AthenaPartitionProjectionSettings or as a regular Python dict.
     Following projection parameters are supported:
@@ -934,7 +939,7 @@ def store_parquet_metadata( # pylint: disable=too-many-arguments,too-many-local
         path_suffix=path_suffix,
         path_ignore_suffix=path_ignore_suffix,
         ignore_empty=ignore_empty,
-        ignore_null=False,
+        ignore_null=ignore_null,
         use_threads=use_threads,
         s3_additional_kwargs=s3_additional_kwargs,
         boto3_session=boto3_session,
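The previously hard-coded ignore_null=False is now forwarded from the new keyword, so callers can opt in. A minimal sketch (the S3 path and Glue names are hypothetical; the dtype override mirrors the new tests below):

    import awswrangler as wr

    # With ignore_null=True, columns whose Parquet type is null are ignored
    # during schema inference instead of failing the metadata crawl.
    wr.s3.store_parquet_metadata(
        path="s3://my-bucket/my-dataset/",
        database="my_db",
        table="my_table",
        dataset=True,
        ignore_null=True,
        dtype={"c2_null": "int", "c3_null": "int"},  # cast null columns explicitly
    )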
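Since both reworded docstrings stress that AthenaPartitionProjectionSettings is a `TypedDict`, a plain dict is interchangeable with the typed form. A sketch with hypothetical projection values:

    import awswrangler as wr

    # Typed form...
    settings = wr.typing.AthenaPartitionProjectionSettings(
        projection_types={"year": "integer"},
        projection_ranges={"year": "2020,2023"},
    )
    # ...or the equivalent plain dict, accepted because it is a TypedDict.
    settings = {
        "projection_types": {"year": "integer"},
        "projection_ranges": {"year": "2020,2023"},
    }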
4 changes: 3 additions & 1 deletion building/lambda/Dockerfile
@@ -23,7 +23,9 @@ WORKDIR /root
 FROM ${python_version}
 COPY pyproject.toml poetry.lock ./

-RUN pip3 install --upgrade pip wheel && pip3 install --upgrade six cython cmake hypothesis poetry
+RUN pip3 install --upgrade pip wheel
+RUN pip3 install --upgrade urllib3==1.26.16 # temporary to avoid https://github.com/urllib3/urllib3/issues/2168 (TODO remove when the AL2 image updates to support OpenSSL 1.1.1+)
+RUN pip3 install --upgrade six cython cmake hypothesis poetry
 RUN poetry config virtualenvs.create false --local && poetry install --no-root --no-dev

 RUN rm -f pyproject.toml poetry.lock
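The pin exists because urllib3 2.x requires OpenSSL 1.1.1+, which the AL2-based Lambda image does not yet provide (per the TODO above). A quick sanity check one could run inside the image (a sketch; the printed versions depend on the base image):

    import ssl

    import urllib3

    # urllib3 2.x refuses to run against OpenSSL < 1.1.1, hence the 1.26.16 pin.
    print("OpenSSL:", ssl.OPENSSL_VERSION)  # AL2 may still ship an older OpenSSL
    print("urllib3:", urllib3.__version__)  # expected: 1.26.16 after the pin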
82 changes: 41 additions & 41 deletions poetry.lock

Some generated files are not rendered by default.

32 changes: 32 additions & 0 deletions tests/unit/test_athena_parquet.py
@@ -373,6 +373,38 @@ def test_store_metadata_partitions_sample_dataset(glue_database, glue_table, pat
     assert df.c2.sum() * num_files == df2.c2.sum()


+def test_store_metadata_ignore_null_columns(glue_database, glue_table, path):
+    df = pd.DataFrame({"c0": [0, 1, 2], "c1": [3, 4, 5], "c2_null": [None, None, None], "c3_null": [None, None, None]})
+    wr.s3.to_parquet(df=df, path=path, dataset=True, dtype={"c2_null": "int", "c3_null": "int"})
+    wr.s3.store_parquet_metadata(
+        path=path,
+        database=glue_database,
+        table=glue_table,
+        ignore_null=True,
+        dataset=True,
+        dtype={"c2_null": "int", "c3_null": "int"},
+    )
+
+
+@pytest.mark.parametrize("partition_cols", [None, ["c0"], ["c0", "c1"]])
+def test_store_metadata_ignore_null_columns_partitions(glue_database, glue_table, path, partition_cols):
+    # only partition on non-null columns
+    num_files = 10
+    df = pd.DataFrame({"c0": [0, 1, 2], "c1": [3, 4, 5], "c2_null": [None, None, None], "c3_null": [None, None, None]})
+    for _ in range(num_files):
+        wr.s3.to_parquet(
+            df=df, path=path, dataset=True, dtype={"c2_null": "int", "c3_null": "int"}, partition_cols=partition_cols
+        )
+    wr.s3.store_parquet_metadata(
+        path=path,
+        database=glue_database,
+        table=glue_table,
+        ignore_null=True,
+        dtype={"c2_null": "int", "c3_null": "int"},
+        dataset=True,
+    )
+
+
 @pytest.mark.parametrize("partition_cols", [None, ["c1"], ["c2"], ["c1", "c2"], ["c2", "c1"]])
 def test_to_parquet_reverse_partitions(glue_database, glue_table, path, partition_cols):
     df = pd.DataFrame({"c0": [0, 1, 2], "c1": [3, 4, 5], "c2": [6, 7, 8]})
