Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial read_parquet MVP implementation #21

Merged
merged 4 commits into from
Apr 9, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/nested_pandas/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from .example_module import greetings, meaning
from .nestedframe import NestedFrame
from .nestedframe.io import read_parquet

# Import for registering
from .series.accessor import NestSeriesAccessor # noqa: F401
from .series.dtype import NestedDtype

__all__ = ["greetings", "meaning", "NestedDtype", "NestedFrame"]
__all__ = ["greetings", "meaning", "NestedDtype", "NestedFrame", "read_parquet"]
1 change: 1 addition & 0 deletions src/nested_pandas/nestedframe/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from .core import NestedFrame # noqa
from .io import read_parquet # noqa
69 changes: 69 additions & 0 deletions src/nested_pandas/nestedframe/io.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# typing.Self and "|" union syntax don't exist in Python 3.9
from __future__ import annotations

import pandas as pd

from .core import NestedFrame


def read_parquet(
    data: str,
    to_pack: dict,
    engine: str = "auto",
    columns: list[str] | None = None,
    pack_columns: dict | None = None,
) -> NestedFrame:
    """
    Load a parquet object from a file path and load a set of other
    parquet objects to pack into the resulting NestedFrame.

    Docstring based on the Pandas equivalent.

    #TODO after MVP: Include full kwarg-set

    Parameters
    ----------
    data : str, path object or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``read()`` function.
        The string could be a URL. Valid URL schemes include http, ftp, s3,
        gs, and file. For file URLs, a host is expected. A local file could be:
        ``file://localhost/path/to/table.parquet``.
        A file URL can also be a path to a directory that contains multiple
        partitioned parquet files. Both pyarrow and fastparquet support
        paths to directories as well as file URLs. A directory path could be:
        ``file://localhost/path/to/tables`` or ``s3://bucket/partition_dir``.
    to_pack : dict
        A dictionary of parquet data paths (same criteria as `data`), where
        each key reflects the desired column name to pack the data into and
        each value reflects the parquet data to pack.
    engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
        Parquet library to use. If 'auto', then the option
        ``io.parquet.engine`` is used. The default ``io.parquet.engine``
        behavior is to try 'pyarrow', falling back to 'fastparquet' if
        'pyarrow' is unavailable.

        When using the ``'pyarrow'`` engine and no storage options are provided
        and a filesystem is implemented by both ``pyarrow.fs`` and ``fsspec``
        (e.g. "s3://"), then the ``pyarrow.fs`` filesystem is attempted first.
        Use the filesystem keyword with an instantiated fsspec filesystem
        if you wish to use its implementation.
    columns : list, default=None
        If not None, only these columns will be read from the file.
    pack_columns : dict, default=None
        If not None, selects a set of columns from each keyed nested parquet
        object to read from the nested files.

    Returns
    -------
    NestedFrame
    """
    # Pass engine/columns as keywords: positional passing is fragile if the
    # pandas signature ever changes, and keeps this call consistent with the
    # nested read below.
    df = NestedFrame(pd.read_parquet(data, engine=engine, columns=columns))

    for pack_key, pack_path in to_pack.items():
        # Restrict the nested table to the requested column subset, if any.
        col_subset = pack_columns.get(pack_key, None) if pack_columns is not None else None
        packed = pd.read_parquet(pack_path, engine=engine, columns=col_subset)
        df = df.add_nested(packed, pack_key)

    return df
59 changes: 59 additions & 0 deletions tests/nested_pandas/nestedframe/test_io.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import os

import pandas as pd
import pytest
from nested_pandas import read_parquet


@pytest.mark.parametrize("columns", [["a"], None])
@pytest.mark.parametrize("pack_columns", [{"nested1": ["c"], "nested2": ["e"]}, {"nested1": ["d"]}, None])
def test_read_parquet(tmp_path, columns, pack_columns):
    """Test nested parquet loading with optional base/nested column subsets."""
    # tmp_path is already a unique per-test directory; no need to join "." to it.

    # Generate some test data: one base table and two nested tables whose
    # (repeated) index values line up with the base table's index.
    base = pd.DataFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])

    nested1 = pd.DataFrame(
        data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )

    nested2 = pd.DataFrame(
        data={"e": [0, 2, 4, 1, 4, 3, 1, 4, 1], "f": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )

    # Save each table to its own parquet file
    base.to_parquet(os.path.join(tmp_path, "base.parquet"))
    nested1.to_parquet(os.path.join(tmp_path, "nested1.parquet"))
    nested2.to_parquet(os.path.join(tmp_path, "nested2.parquet"))

    # Read back, packing the two nested tables into the base frame
    nf = read_parquet(
        data=os.path.join(tmp_path, "base.parquet"),
        to_pack={
            "nested1": os.path.join(tmp_path, "nested1.parquet"),
            "nested2": os.path.join(tmp_path, "nested2.parquet"),
        },
        columns=columns,
        pack_columns=pack_columns,
    )

    # Check base columns: requested subset (or all) plus the packed columns
    if columns is not None:
        assert nf.columns.tolist() == columns + ["nested1", "nested2"]
    else:
        assert nf.columns.tolist() == base.columns.tolist() + ["nested1", "nested2"]

    # Check nested columns: subset when requested, otherwise all source fields
    if pack_columns is not None:
        for nested_col in pack_columns:
            assert nf[nested_col].nest.fields == pack_columns[nested_col]
    else:
        for nested_col in nf.nested_columns:
            if nested_col == "nested1":
                assert nf[nested_col].nest.fields == nested1.columns.tolist()
            elif nested_col == "nested2":
                assert nf[nested_col].nest.fields == nested2.columns.tolist()