Skip to content

Commit

Permalink
GMT_DATASET: Return an empty DataFrame if the file has no data
Browse files Browse the repository at this point in the history
  • Loading branch information
seisman committed Mar 21, 2024
1 parent dd8e0cd commit 175ba3c
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 4 deletions.
10 changes: 6 additions & 4 deletions pygmt/datatypes/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ class _GMT_DATASET(ctp.Structure): # noqa: N801
"""
GMT dataset structure for holding multiple tables (files).
This class is only meant for internal use by PyGMT and is not exposed to users.
See the GMT source code gmt_resources.h for the original C struct definitions.
This class is only meant for internal use and is not exposed to users. See the GMT
source code ``gmt_resources.h`` for the original C struct definitions.
Examples
--------
Expand Down Expand Up @@ -151,6 +151,8 @@ def to_dataframe(self) -> pd.DataFrame:
the same. The same column in all segments of all tables are concatenated. The
trailing text column is also concatenated as a single string column.
If the object has no data, an empty DataFrame will be returned.
Returns
-------
df
Expand Down Expand Up @@ -185,8 +187,8 @@ def to_dataframe(self) -> pd.DataFrame:
>>> df.dtypes.to_list()
[dtype('float64'), dtype('float64'), dtype('float64'), string[python]]
"""
# Deal with numeric columns
vectors = []
# Deal with numeric columns
for icol in range(self.n_columns):
colvector = []
for itbl in range(self.n_tables):
Expand All @@ -211,5 +213,5 @@ def to_dataframe(self) -> pd.DataFrame:
pd.Series(data=np.char.decode(textvector), dtype=pd.StringDtype())
)

df = pd.concat(objs=vectors, axis=1)
df = pd.concat(objs=vectors, axis=1) if vectors else pd.DataFrame()
return df
83 changes: 83 additions & 0 deletions pygmt/tests/test_datatypes_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""
Tests for GMT_DATASET data type.
"""

from pathlib import Path

import pandas as pd
import pytest
from pygmt.clib import Session
from pygmt.helpers import GMTTempFile


def dataframe_from_pandas(filepath_or_buffer, sep=r"\s+", comment="#", header=None):
    """
    Read tabular data as a pandas.DataFrame object using pandas.read_csv().

    The result is post-processed so that all text columns are merged into a single
    trailing string column, which is how GMT represents trailing text.

    Parameters
    ----------
    filepath_or_buffer, sep, comment, header
        Same meaning as in ``pandas.read_csv()``.

    Returns
    -------
    df : pandas.DataFrame
        The table that was read, or an empty DataFrame if the input has no data.
    """
    try:
        df = pd.read_csv(filepath_or_buffer, sep=sep, comment=comment, header=header)
    except pd.errors.EmptyDataError:
        # Return an empty DataFrame if the file has no data
        return pd.DataFrame()

    # By default, pandas reads text strings with whitespaces as multiple columns, but
    # GMT concatenates all trailing text as a single string column. Need to find all
    # string columns (with dtype="object") and combine them into a single string column.
    string_columns = df.select_dtypes(include=["object"]).columns
    if len(string_columns) > 1:
        # Rows with fewer trailing words than the first row are padded with missing
        # values by pandas; drop them so that only the actual words are joined.
        df[string_columns[0]] = df[string_columns].apply(
            lambda row: " ".join(row.dropna()), axis=1
        )
        df = df.drop(string_columns[1:], axis=1)
    # Convert 'object' to 'string' type
    return df.convert_dtypes(
        convert_string=True,
        convert_integer=False,
        convert_boolean=False,
        convert_floating=False,
    )


def dataframe_from_gmt(fname):
    """
    Load a table into a pandas.DataFrame by routing it through a GMT virtual file.

    The file ``fname`` is passed to the GMT ``read`` module, which writes it to an
    output virtual file of kind "dataset"; that virtual file is then converted by
    ``virtualfile_to_dataset``.
    """
    with Session() as session, session.virtualfile_out(kind="dataset") as vouttbl:
        session.call_module("read", f"{fname} {vouttbl} -Td")
        result = session.virtualfile_to_dataset(vfname=vouttbl)
    return result


@pytest.mark.benchmark
def test_dataset():
    """
    Check that a two-segment table with trailing text is read identically by GMT and
    by the pandas-based reference reader.
    """
    records = (
        ">",
        "1.0 2.0 3.0 TEXT1 TEXT23",
        "4.0 5.0 6.0 TEXT4 TEXT567",
        ">",
        "7.0 8.0 9.0 TEXT8 TEXT90",
        "10.0 11.0 12.0 TEXT123 TEXT456789",
    )
    with GMTTempFile(suffix=".txt") as tmpfile:
        # One record per line, each terminated by a newline.
        Path(tmpfile.name).write_text("".join(f"{record}\n" for record in records))

        actual = dataframe_from_gmt(tmpfile.name)
        expected = dataframe_from_pandas(tmpfile.name, comment=">")
        pd.testing.assert_frame_equal(actual, expected)


def test_dataset_empty():
    """
    Check that a file containing only a comment line gives an empty DataFrame.
    """
    with GMTTempFile(suffix=".txt") as tmpfile:
        Path(tmpfile.name).write_text("# This is a comment line.\n")

        actual = dataframe_from_gmt(tmpfile.name)
        assert actual.empty  # Empty DataFrame
        expected = dataframe_from_pandas(tmpfile.name)
        pd.testing.assert_frame_equal(actual, expected)

0 comments on commit 175ba3c

Please sign in to comment.