Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor the virtualfile_in function to accept more 1-D arrays #2744

Closed
seisman wants to merge 30 commits into main from refactor/virtualfile-to-data
Closed
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
66c4b97
Refactor the data_kind and the virtualfile_to_data functions
seisman Oct 13, 2023
78c28cd
Update more functions
seisman Oct 14, 2023
f849e5a
Merge branch 'main' into refactor/virtualfile-to-data
seisman Oct 15, 2023
f37413b
Change ncols to names
seisman Oct 15, 2023
3de7666
Fix more tests
seisman Oct 15, 2023
93b91d0
Fix project
seisman Oct 15, 2023
2eecf48
Merge branch 'main' into refactor/virtualfile-to-data
seisman Oct 16, 2023
1d6e568
Fix more tests
seisman Oct 16, 2023
6f9fc19
Fixes
seisman Oct 16, 2023
68034ed
Merge branch 'main' into refactor/virtualfile-to-data
seisman Oct 17, 2023
0db21bc
Fix triangulate
seisman Oct 17, 2023
7cf5290
Fix text
seisman Oct 17, 2023
b0b6d2a
Fix more failing tests
seisman Oct 17, 2023
fa875ef
More fixes
seisman Oct 17, 2023
2ee0df2
Fix linting issues
seisman Oct 17, 2023
d5c8340
Fix linting issues
seisman Oct 17, 2023
30bacb1
Fix linting issues
seisman Oct 18, 2023
4465f9b
Merge branch 'main' into refactor/virtualfile-to-data
seisman Oct 20, 2023
593f252
Update pygmt/clib/session.py
seisman Oct 20, 2023
409337f
Apply suggestions from code review
seisman Oct 25, 2023
872fd59
Merge branch 'main' into refactor/virtualfile-to-data
seisman Dec 25, 2023
3ed0eb2
Merge branch 'main' into refactor/virtualfile-to-data
seisman Jan 16, 2024
efa7a11
Merge branch 'main' into refactor/virtualfile-to-data
seisman Jan 18, 2024
23fc3ea
Merge branch 'main' into refactor/virtualfile-to-data
seisman Mar 1, 2024
aa05333
Merge branch 'main' into refactor/virtualfile-to-data
seisman Jul 11, 2024
5c10fc4
Fix plot and plot3d
seisman Jul 11, 2024
525a353
Fix errors in merging the main branch
seisman Jul 11, 2024
2f3fcc4
Merge branch 'main' into refactor/virtualfile-to-data
seisman Jul 20, 2024
b55a9ad
Fix merging issue
seisman Jul 20, 2024
46be0fa
Merge branch 'main' into refactor/virtualfile-to-data
seisman Jul 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 17 additions & 19 deletions pygmt/clib/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
fmt_docstring,
tempfile_from_geojson,
tempfile_from_image,
validate_data_input,
)

FAMILIES = [
Expand Down Expand Up @@ -1467,11 +1468,8 @@ def virtualfile_from_data( # noqa: PLR0912
self,
check_kind=None,
data=None,
x=None,
y=None,
z=None,
extra_arrays=None,
required_z=False,
vectors=None,
names="xy",
required_data=True,
):
"""
Expand All @@ -1490,13 +1488,12 @@ def virtualfile_from_data( # noqa: PLR0912
Any raster or vector data format. This could be a file name or
path, a raster grid, a vector matrix/arrays, or other supported
data input.
x/y/z : 1-D arrays or None
x, y, and z columns as numpy arrays.
extra_arrays : list of 1-D arrays
Optional. A list of numpy arrays in addition to x, y, and z.
All of these arrays must be of the same size as the x/y/z arrays.
required_z : bool
State whether the 'z' column is required.
vectors : list of 1-D arrays or None
A list of 1-D arrays. Each array will be a column in the table.
All of these arrays must be of the same size.
names : str or list of str
A list of names for each of the columns. Must be of the same size
as the number of vectors. Default is ``"xy"``.
required_data : bool
Set to True when 'data' is required, or False when dealing with
optional virtual files. [Default is True].
Expand Down Expand Up @@ -1530,8 +1527,13 @@ def virtualfile_from_data( # noqa: PLR0912
...
<vector memory>: N = 3 <7/9> <4/6> <1/3>
"""
kind = data_kind(
data, x, y, z, required_z=required_z, required_data=required_data
kind = data_kind(data, required=required_data)
validate_data_input(
data=data,
vectors=vectors,
names=names,
required_data=required_data,
kind=kind,
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The validation checks have been moved from within data_kind to virtualfile_from_data here. But in plot.py, we actually use data_kind on its own here:

kind = data_kind(data, x, y)

Are we ok with raising GMTInvalidInput much later here in virtualfile_from_data (after all the keyword argument parsing), rather than early on in data_kind?


if check_kind:
Expand Down Expand Up @@ -1572,11 +1574,7 @@ def virtualfile_from_data( # noqa: PLR0912
warnings.warn(message=msg, category=RuntimeWarning, stacklevel=2)
_data = (data,) if not isinstance(data, pathlib.PurePath) else (str(data),)
elif kind == "vectors":
_data = [np.atleast_1d(x), np.atleast_1d(y)]
if z is not None:
_data.append(np.atleast_1d(z))
if extra_arrays:
_data.extend(extra_arrays)
_data = [np.atleast_1d(v) for v in vectors]
elif kind == "matrix": # turn 2-D arrays into list of vectors
if hasattr(data, "items") and not hasattr(data, "to_frame"):
# pandas.DataFrame or xarray.Dataset types.
Expand Down
1 change: 1 addition & 0 deletions pygmt/helpers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,6 @@
is_nonstr_iter,
launch_external_viewer,
non_ascii_to_octal,
validate_data_input,
)
from pygmt.helpers.validators import validate_output_table_type
187 changes: 102 additions & 85 deletions pygmt/helpers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,127 +16,154 @@
from pygmt.exceptions import GMTInvalidInput


def _validate_data_input(
data=None, x=None, y=None, z=None, required_z=False, required_data=True, kind=None
def validate_data_input(
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's more useful to pass the list of column names instead, i.e., replacing ncols=2 with names=["x", "y"].

So, for most modules, vectors=[x, y] and names=["x", "y"], or vectors=[x, y, z] and names=["x", "y", "z"].

For more complicated modules like plot or plot3d, the names can be
names=["x", "y", "direction_arg1", "direction_arg2", "fill", "size", "symbol", "transparency"].

The column names will be very useful when the GMTInvalidInput exception is raised.
For example, instead of "Column 5 can't be None.", we can say "Column 5 ('size') can't be None.". Instead of "data must have at least 8 columns.", we can say

data must have at least 8 columns:
x y direction_arg1 direction_arg2 fill size symbol transparency

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in f37413b

seisman marked this conversation as resolved.
Show resolved Hide resolved
data=None, vectors=None, names="xy", required_data=True, kind=None
):
"""
Check if the combination of data/x/y/z is valid.
Check if the data input is valid.

Parameters
----------
data : str, pathlib.PurePath, None, bool, xarray.DataArray or {table-like}
Pass in either a file name or :class:`pathlib.Path` to an ASCII data
table, an :class:`xarray.DataArray`, a 1-D/2-D
{table-classes} or an option argument.
vectors : list of 1-D arrays
A list of 1-D arrays with the data columns.
names : list of str
List of column names.
required_data : bool
Set to True when 'data' is required, or False when dealing with
optional virtual files [Default is True].
kind : str or None
The kind of data that will be passed to a module. If not given, it
will be determined by calling :func:`data_kind`.

Examples
--------
>>> _validate_data_input(data="infile")
>>> _validate_data_input(x=[1, 2, 3], y=[4, 5, 6])
>>> _validate_data_input(x=[1, 2, 3], y=[4, 5, 6], z=[7, 8, 9])
>>> _validate_data_input(data=None, required_data=False)
>>> _validate_data_input()
>>> validate_data_input(data="infile")
>>> validate_data_input(vectors=[[1, 2, 3], [4, 5, 6]], names="xy")
>>> validate_data_input(
... vectors=[[1, 2, 3], [4, 5, 6], [7, 8, 9]], names="xyz"
... )
>>> validate_data_input(data=None, required_data=False)
>>> validate_data_input()
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: No input data provided.
>>> _validate_data_input(x=[1, 2, 3])
>>> validate_data_input(vectors=[[1, 2, 3], None], names="xy")
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: Must provide both x and y.
>>> _validate_data_input(y=[4, 5, 6])
pygmt.exceptions.GMTInvalidInput: Column 1 ('y') can't be None.
>>> validate_data_input(vectors=[None, [4, 5, 6]], names="xy")
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: Must provide both x and y.
>>> _validate_data_input(x=[1, 2, 3], y=[4, 5, 6], required_z=True)
pygmt.exceptions.GMTInvalidInput: Column 0 ('x') can't be None.
>>> validate_data_input(vectors=[[1, 2, 3], [4, 5, 6], None], names="xyz")
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: Must provide x, y, and z.
pygmt.exceptions.GMTInvalidInput: Column 2 ('z') can't be None.
>>> import numpy as np
>>> import pandas as pd
>>> import xarray as xr
>>> data = np.arange(8).reshape((4, 2))
>>> _validate_data_input(data=data, required_z=True, kind="matrix")
>>> validate_data_input(data=data, names="xyz", kind="matrix")
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: data must provide x, y, and z columns.
>>> _validate_data_input(
pygmt.exceptions.GMTInvalidInput: data must have at least 3 columns.
x y z
>>> validate_data_input(
... data=pd.DataFrame(data, columns=["x", "y"]),
... required_z=True,
... names="xyz",
... kind="matrix",
... )
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: data must provide x, y, and z columns.
>>> _validate_data_input(
pygmt.exceptions.GMTInvalidInput: data must have at least 3 columns.
x y z
>>> validate_data_input(
... data=xr.Dataset(pd.DataFrame(data, columns=["x", "y"])),
... required_z=True,
... names="xyz",
... kind="matrix",
... )
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: data must provide x, y, and z columns.
>>> _validate_data_input(data="infile", x=[1, 2, 3])
pygmt.exceptions.GMTInvalidInput: data must have at least 3 columns.
x y z
>>> validate_data_input(data="infile", vectors=[[1, 2, 3], None])
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: Too much data. Use either data or x/y/z.
>>> _validate_data_input(data="infile", y=[4, 5, 6])
pygmt...GMTInvalidInput: Too much data. Use either 'data' or 1-D arrays.
>>> validate_data_input(data="infile", vectors=[None, [4, 5, 6]])
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: Too much data. Use either data or x/y/z.
>>> _validate_data_input(data="infile", z=[7, 8, 9])
pygmt...GMTInvalidInput: Too much data. Use either 'data' or 1-D arrays.
>>> validate_data_input(data="infile", vectors=[None, None, [7, 8, 9]])
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: Too much data. Use either data or x/y/z.
pygmt...GMTInvalidInput: Too much data. Use either 'data' or 1-D arrays.

Raises
------
GMTInvalidInput
If the data input is not valid.
"""
if data is None: # data is None
if x is None and y is None: # both x and y are None
if required_data: # data is not optional
raise GMTInvalidInput("No input data provided.")
elif x is None or y is None: # either x or y is None
raise GMTInvalidInput("Must provide both x and y.")
if required_z and z is None: # both x and y are not None, now check z
raise GMTInvalidInput("Must provide x, y, and z.")
else: # data is not None
if x is not None or y is not None or z is not None:
raise GMTInvalidInput("Too much data. Use either data or x/y/z.")
# For 'matrix' kind, check if data has the required z column
if kind == "matrix" and required_z:
if kind is None:
kind = data_kind(data=data, required=required_data)

if kind == "vectors": # From data_kind, we know that data is None
if vectors is None:
raise GMTInvalidInput("No input data provided.")
if len(vectors) < len(names):
raise GMTInvalidInput(

Check warning on line 118 in pygmt/helpers/utils.py

View check run for this annotation

Codecov / codecov/patch

pygmt/helpers/utils.py#L118

Added line #L118 was not covered by tests
f"Requires {len(names)} 1-D arrays but got {len(vectors)}."
)
Comment on lines +122 to +125
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing unit test for this if-condition.

for i, v in enumerate(vectors[: len(names)]):
if v is None:
raise GMTInvalidInput(f"Column {i} ('{names[i]}') can't be None.")
else:
if vectors is not None and any(v is not None for v in vectors):
raise GMTInvalidInput("Too much data. Use either 'data' or 1-D arrays.")
if kind == "matrix": # check number of columns for matrix-like data
msg = f"data must have at least {len(names)} columns.\n" + " ".join(names)
if hasattr(data, "shape"): # np.ndarray or pd.DataFrame
if len(data.shape) == 1 and data.shape[0] < 3:
raise GMTInvalidInput("data must provide x, y, and z columns.")
if len(data.shape) > 1 and data.shape[1] < 3:
raise GMTInvalidInput("data must provide x, y, and z columns.")
if hasattr(data, "data_vars") and len(data.data_vars) < 3: # xr.Dataset
raise GMTInvalidInput("data must provide x, y, and z columns.")
if len(data.shape) == 1 and data.shape[0] < len(names):
raise GMTInvalidInput(msg)

Check warning on line 131 in pygmt/helpers/utils.py

View check run for this annotation

Codecov / codecov/patch

pygmt/helpers/utils.py#L131

Added line #L131 was not covered by tests
Comment on lines +135 to +136
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing unit test for this if-condition.

if len(data.shape) > 1 and data.shape[1] < len(names):
raise GMTInvalidInput(msg)
if hasattr(data, "data_vars") and len(data.data_vars) < len(
names
): # xr.Dataset
raise GMTInvalidInput(msg)


def data_kind(data=None, x=None, y=None, z=None, required_z=False, required_data=True):
def data_kind(data=None, required=True):
"""
Check what kind of data is provided to a module.
Determine the kind of data that will be passed to a module.

Possible types:
It checks the type of the ``data`` argument and determines the kind of
data. Falls back to ``"vectors"`` if ``data`` is None but required.

* a file name provided as 'data'
* a pathlib.PurePath object provided as 'data'
* an xarray.DataArray object provided as 'data'
* a 2-D matrix provided as 'data'
* 1-D arrays x and y (and z, optionally)
* an optional argument (None, bool, int or float) provided as 'data'
Possible data kinds:

Arguments should be ``None`` if not used. If doesn't fit any of these
categories (or fits more than one), will raise an exception.
- ``'file'``: a file name or a pathlib.PurePath object provided as 'data'
- ``'arg'``: an optional argument (None, bool, int or float) provided
as 'data'
- ``'grid'``: an xarray.DataArray with 2 dimensions provided as 'data'
- ``'image'``: an xarray.DataArray with 3 dimensions provided as 'data'
- ``'geojson'``: a geo-like Python object that implements
``__geo_interface__`` (geopandas.GeoDataFrame or shapely.geometry)
provided as 'data'
- ``'matrix'``: a 2-D array provided as 'data'
- ``'vectors'``: a list of 1-D arrays provided as 'vectors'

Parameters
----------
data : str, pathlib.PurePath, None, bool, xarray.DataArray or {table-like}
Pass in either a file name or :class:`pathlib.Path` to an ASCII data
table, an :class:`xarray.DataArray`, a 1-D/2-D
{table-classes} or an option argument.
x/y : 1-D arrays or None
x and y columns as numpy arrays.
z : 1-D array or None
z column as numpy array. To be used optionally when x and y are given.
required_z : bool
State whether the 'z' column is required.
required_data : bool
required : bool
Set to True when 'data' is required, or False when dealing with
optional virtual files. [Default is True].

Expand All @@ -152,49 +179,39 @@
>>> import numpy as np
>>> import xarray as xr
>>> import pathlib
>>> data_kind(data=None, x=np.array([1, 2, 3]), y=np.array([4, 5, 6]))
>>> data_kind(data=None)
'vectors'
>>> data_kind(data=np.arange(10).reshape((5, 2)), x=None, y=None)
>>> data_kind(data=np.arange(10).reshape((5, 2)))
'matrix'
>>> data_kind(data="my-data-file.txt", x=None, y=None)
>>> data_kind(data="my-data-file.txt")
'file'
>>> data_kind(data=pathlib.Path("my-data-file.txt"), x=None, y=None)
>>> data_kind(data=pathlib.Path("my-data-file.txt"))
'file'
>>> data_kind(data=None, x=None, y=None, required_data=False)
>>> data_kind(data=None, required=False)
'arg'
>>> data_kind(data=2.0, x=None, y=None, required_data=False)
>>> data_kind(data=2.0, required=False)
'arg'
>>> data_kind(data=True, x=None, y=None, required_data=False)
>>> data_kind(data=True, required=False)
'arg'
>>> data_kind(data=xr.DataArray(np.random.rand(4, 3)))
'grid'
>>> data_kind(data=xr.DataArray(np.random.rand(3, 4, 5)))
'image'
"""
# determine the data kind
if isinstance(data, (str, pathlib.PurePath)):
kind = "file"
elif isinstance(data, (bool, int, float)) or (data is None and not required_data):
elif isinstance(data, (bool, int, float)) or (data is None and not required):
kind = "arg"
elif isinstance(data, xr.DataArray):
kind = "image" if len(data.dims) == 3 else "grid"
elif hasattr(data, "__geo_interface__"):
# geo-like Python object that implements ``__geo_interface__``
# (geopandas.GeoDataFrame or shapely.geometry)
kind = "geojson"
elif data is not None:
elif data is not None: # anything but None is taken as a matrix
kind = "matrix"
else:
else: # fallback to vectors if data is None but required
kind = "vectors"
_validate_data_input(
data=data,
x=x,
y=y,
z=z,
required_z=required_z,
required_data=required_data,
kind=kind,
)
return kind


Expand Down
2 changes: 1 addition & 1 deletion pygmt/src/blockm.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def _blockm(block_method, data, x, y, z, outfile, **kwargs):
with GMTTempFile(suffix=".csv") as tmpfile:
with Session() as lib:
table_context = lib.virtualfile_from_data(
check_kind="vector", data=data, x=x, y=y, z=z, required_z=True
check_kind="vector", data=data, vectors=[x, y, z], names="xyz"
)
# Run blockm* on data table
with table_context as infile:
Expand Down
2 changes: 1 addition & 1 deletion pygmt/src/contour.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def contour(self, data=None, x=None, y=None, z=None, **kwargs):

with Session() as lib:
file_context = lib.virtualfile_from_data(
check_kind="vector", data=data, x=x, y=y, z=z, required_z=True
check_kind="vector", data=data, vectors=[x, y, z], names="xyz"
)
with file_context as fname:
lib.call_module(
Expand Down
2 changes: 1 addition & 1 deletion pygmt/src/nearneighbor.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def nearneighbor(data=None, x=None, y=None, z=None, **kwargs):
with GMTTempFile(suffix=".nc") as tmpfile:
with Session() as lib:
table_context = lib.virtualfile_from_data(
check_kind="vector", data=data, x=x, y=y, z=z, required_z=True
check_kind="vector", data=data, vectors=[x, y, z], names="xyz"
)
with table_context as infile:
if (outgrid := kwargs.get("G")) is None:
Expand Down
Loading
Loading