Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pyarrow: Check compatibility of pyarrow-backed pandas objects with numeric dtypes #2774

Merged
merged 16 commits into from
Dec 16, 2023
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ jobs:
optional-packages: ''
- python-version: '3.12'
numpy-version: '1.26'
optional-packages: ' contextily geopandas ipython rioxarray sphinx-gallery'
optional-packages: ' contextily geopandas ipython pyarrow rioxarray sphinx-gallery'

timeout-minutes: 30
defaults:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci_tests_dev.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ jobs:
python -m pip install --pre --prefer-binary \
--extra-index https://pypi.anaconda.org/scientific-python-nightly-wheels/simple \
numpy pandas xarray netCDF4 packaging \
build contextily dvc geopandas ipython rioxarray \
build contextily dvc geopandas ipython pyarrow rioxarray \
'pytest>=6.0' pytest-cov pytest-doctestplus pytest-mpl \
sphinx-gallery

Expand Down
1 change: 1 addition & 0 deletions .github/workflows/ci_tests_legacy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ jobs:
contextily
geopandas
ipython
pyarrow
rioxarray
sphinx-gallery
build
Expand Down
9 changes: 9 additions & 0 deletions doc/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,15 @@ The following are optional dependencies:
* `GeoPandas <https://geopandas.org>`__: For using and plotting GeoDataFrame objects.
* `RioXarray <https://corteva.github.io/rioxarray>`__: For saving multi-band rasters to GeoTIFFs.

.. note::

If you have `PyArrow <https://arrow.apache.org/docs/python/index.html>`__
installed, PyGMT has some initial support for ``pandas.Series`` and
``pandas.DataFrame`` objects with Apache Arrow-backed arrays. Specifically,
only uint/int/float dtypes are supported for now. Support for datetime and
string Arrow dtypes is still a work in progress. For more details, see
weiji14 marked this conversation as resolved.
Show resolved Hide resolved
https://github.com/GenericMappingTools/pygmt/issues/2800.
weiji14 marked this conversation as resolved.
Show resolved Hide resolved

Installing GMT and other dependencies
-------------------------------------

Expand Down
20 changes: 15 additions & 5 deletions pygmt/tests/test_clib_virtualfiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@
from pygmt.helpers import GMTTempFile
from pygmt.tests.test_clib import mock

try:
import pyarrow as pa
except ImportError:
pa = None
seisman marked this conversation as resolved.
Show resolved Hide resolved

TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
POINTS_DATA = os.path.join(TEST_DATA_DIR, "points.txt")

Expand Down Expand Up @@ -321,16 +326,21 @@ def test_virtualfile_from_matrix_slice(dtypes):

def test_virtualfile_from_vectors_pandas(dtypes):
"""
Pass vectors to a dataset using pandas Series.
Pass vectors to a dataset using pandas Series, checking both numpy and
weiji14 marked this conversation as resolved.
Show resolved Hide resolved
pyarrow dtypes.
"""
size = 13
if pa is not None:
dtypes.extend([f"{dtype}[pyarrow]" for dtype in dtypes])
seisman marked this conversation as resolved.
Show resolved Hide resolved

for dtype in dtypes:
data = pd.DataFrame(
data={
"x": np.arange(size, dtype=dtype),
"y": np.arange(size, size * 2, 1, dtype=dtype),
"z": np.arange(size * 2, size * 3, 1, dtype=dtype),
}
"x": np.arange(size),
"y": np.arange(size, size * 2, 1),
"z": np.arange(size * 2, size * 3, 1),
},
dtype=dtype,
)
with clib.Session() as lib:
with lib.virtualfile_from_vectors(data.x, data.y, data.z) as vfile:
Expand Down
19 changes: 19 additions & 0 deletions pygmt/tests/test_geopandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""
import numpy.testing as npt
import pandas as pd
import pandas.util._test_decorators as td
import pytest
from pygmt import Figure, info, makecpt, which

Expand Down Expand Up @@ -141,6 +142,24 @@ def test_geopandas_plot3d_non_default_circle():
"int64",
pd.Int32Dtype(),
pd.Int64Dtype(),
pytest.param(
"int32[pyarrow]",
marks=[
td.skip_if_no(package="pyarrow"),
pytest.mark.xfail(
reason="geopandas doesn't support writing columns with pyarrow dtypes to OGR_GMT yet."
),
],
),
pytest.param(
"int64[pyarrow]",
marks=[
td.skip_if_no(package="pyarrow"),
pytest.mark.xfail(
reason="geopandas doesn't support writing columns with pyarrow dtypes to OGR_GMT yet."
),
],
),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So geopandas.GeoDataFrame objects with columns that have pyarrow dtypes can't be plotted in PyGMT yet, because we write them to a temporary OGR_GMT file, and geopandas doesn't support writing int32[pyarrow] or int64[pyarrow] columns. We'll either need to wait for geopandas to support this, or switch PyGMT away from using temporary OGR_GMT files.

],
)
@pytest.mark.mpl_image_compare(filename="test_geopandas_plot_int_dtypes.png")
Expand Down
37 changes: 34 additions & 3 deletions pygmt/tests/test_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,13 @@
from pygmt import info
from pygmt.exceptions import GMTInvalidInput

try:
import pyarrow # noqa: F401

HAS_PYARROW = True
except ImportError:
HAS_PYARROW = False

seisman marked this conversation as resolved.
Show resolved Hide resolved
TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
POINTS_DATA = os.path.join(TEST_DATA_DIR, "points.txt")

Expand Down Expand Up @@ -74,16 +81,40 @@ def test_info_2d_list():
assert output == expected_output


def test_info_series():
@pytest.mark.parametrize(
"dtype",
[
"int64",
pytest.param(
"int64[pyarrow]",
marks=pytest.mark.skipif(
condition=not HAS_PYARROW, reason="Could not import 'pyarrow'"
),
),
],
)
def test_info_series(dtype):
"""
Make sure info works on a pandas.Series input.
"""
output = info(pd.Series(data=[0, 4, 2, 8, 6]))
output = info(pd.Series(data=[0, 4, 2, 8, 6], dtype=dtype))
expected_output = "<vector memory>: N = 5 <0/8>\n"
assert output == expected_output


def test_info_dataframe():
@pytest.mark.parametrize(
"dtype",
[
"float64",
pytest.param(
"float64[pyarrow]",
marks=pytest.mark.skipif(
condition=not HAS_PYARROW, reason="Could not import 'pyarrow'"
),
),
],
)
def test_info_dataframe(dtype):
"""
Make sure info works on pandas.DataFrame inputs.
"""
Expand Down