Support multiple start/end dates

oceanmodeling · Jul 28, 2024 · 3d439ab · 3d439ab
1 parent dc23d01
commit 3d439ab
Show file tree

Hide file tree

Showing 3 changed files with 91 additions and 34 deletions.
diff --git a/examples/NDBC_data.ipynb b/examples/NDBC_data.ipynb
@@ -12,7 +12,7 @@
     "import hvplot.pandas\n",
     "import pandas as pd\n",
     "\n",
-    "from searvey._ndbc_api import fetch_ndbc_station,get_ndbc_stations\n",
+    "from searvey._ndbc_api import fetch_ndbc_station,get_ndbc_stations,_fetch_ndbc\n",
     "logging.basicConfig(\n",
     "    level=20,\n",
     "    style=\"{\",\n",
@@ -59,7 +59,7 @@
     "def plot_map(data, title):\n",
     "\n",
     "    # Plot the world map\n",
-    "    world_plot = data.hvplot(geo=True, tiles=True, hover_cols=[\"Station\", \"location\"])\n",
+    "    world_plot = data.hvplot(geo=True, tiles=True, hover_cols=[\"Station\", \"location\"], title=title)\n",
     "    return world_plot.opts(width=800, height=500)\n",
     "\n",
     "\n",
@@ -144,18 +144,16 @@
     "end_date = \"2023-01-10\"\n",
     "\n",
     "# Get data for selected stations (mode = 'stdmet' for standard meteorological data)\n",
-    "station_ids = east_coast_stations[\"Station\"].tolist()[1:3]\n",
     "data_df = fetch_ndbc_station(\n",
     "    station_id=\"SRST2\",\n",
     "    mode=\"stdmet\",\n",
     "    start_date=start_date,\n",
     "    end_date=end_date,\n",
     ")\n",
     "\n",
-    "e\n",
-    "#output is a dictionary of {station id : data}\n",
-    "data_to_plot = data_df[\"SRST2\"]\n",
-    "data_to_plot"
+    "# Remove columns with all NaN values (unavailable data)\n",
+    "data_df = data_df.dropna(axis=1, how='all')\n",
+    "data_df"
    ]
   },
   {
@@ -172,7 +170,40 @@
    "outputs": [],
    "source": [
     "# Plot the data of data_to_plot\n",
-    "data_to_plot[[\"WSPD\", \"GST\", \"ATMP\"]].rename(columns={\"WSPD\": \"Wind Speed\", \"GST\": \"Wind Gust\", \"ATMP\": \"Air Temperature\"}).hvplot(grid=True)"
+    "data_df[[\"WSPD\", \"GST\", \"ATMP\"]].rename(columns={\"WSPD\": \"Wind Speed\", \"GST\": \"Wind Gust\", \"ATMP\": \"Air Temperature\"}).hvplot(grid=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Get data from multiple station id using the internal method\n",
+    "\n",
+    "The internal method supports multiple start dates, this system works exactly the same as other data sources, where each item in the dates arrays correspond to the item in the station id.\n",
+    "\n",
+    "In the example below, data between the days 2023-01-10 and 2023-01-12 is fetched for station SRST2, data between 2023-01-12 and 2023-01-20 is fetched for station AGXC1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#You can get the stations in the region and store it in a list,this example stores 2 stations in a list. This can be used in the fetch ndbc function to get data for the stations\n",
+    "station_ids = east_coast_stations[\"Station\"].tolist()[1:3]\n",
+    "\n",
+    "data = _fetch_ndbc(\n",
+    "    station_ids=[\"SRST2\",\"AGXC1\",\"BAXC1\"],\n",
+    "    mode=\"stdmet\",\n",
+    "    start_dates=[\"2023-01-10\", \"2023-01-12\", \"2023-01-14\"],\n",
+    "    end_dates=[\"2023-01-12\", \"2023-01-20\", \"2023-01-30\"],\n",
+    "    columns=[\"WSPD\", \"GST\", \"ATMP\"],\n",
+    ")\n",
+    "\n",
+    "data\n",
+    "#output is a dictionary of {station id : data}\n",
+    "#data_df[\"SRST2\"] to get the data for station SRST2"
    ]
   }
  ],
@@ -192,7 +223,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.14"
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,

diff --git a/searvey/_ndbc_api.py b/searvey/_ndbc_api.py
@@ -2,7 +2,7 @@
 
 import functools
 import logging
-from typing import List
+from typing import List, Union
 
 import geopandas as gpd
 import multifutures
@@ -84,8 +84,8 @@ def get_ndbc_stations(
 def _fetch_ndbc(
     station_ids: List[str],
     mode: str,
-    start_date: DatetimeLike | None = None,
-    end_date: DatetimeLike | None = None,
+    start_dates: Union[DatetimeLike, List[DatetimeLike]] = None,
+    end_dates: Union[DatetimeLike, List[DatetimeLike]] = None,
     columns: list[str] | None = None,
     multithreading_executor: multifutures.ExecutorProtocol | None = None,
 ) -> dict[str, pd.DataFrame]:
@@ -94,25 +94,35 @@ def _fetch_ndbc(
 
     :param station_ids: A list of station identifiers.
     :param mode: Data mode. One of  ``'txt'``, ``'json'``, ``'spec'``.
-    :param start_date: The starting date of the query. Defaults to 7 days ago.
-    :param end_date: The finishing date of the query. Defaults to "now".
+    :param start_dates: The starting date of the query. Defaults to 7 days ago.
+    :param end_dates: The finishing date of the query. Defaults to "now".
     :param columns:
     :param multithreading_executor: A multithreading executor.
     :return: A dictionary mapping station identifiers to their respective
         TimeSeries.
     """
     now = pd.Timestamp.now("utc")
 
+    # Ensure start_dates and end_dates are lists
+    if not isinstance(start_dates, list):
+        start_dates = [start_dates] * len(station_ids)
+    if not isinstance(end_dates, list):
+        end_dates = [end_dates] * len(station_ids)
+
+    # Ensure that each station has a start_date and end_date
+    if len(start_dates) != len(station_ids) or len(end_dates) != len(station_ids):
+        raise ValueError("Each station must have a start_date and end_date")
+
     # Prepare arguments for each function call
     func_kwargs = [
         {
             "station_id": station_id,
             "mode": mode,
-            "start_time": _resolve_start_date(now, start_date)[0],
-            "end_time": _resolve_end_date(now, end_date)[0],
+            "start_date": _resolve_start_date(now, start_dates)[0],
+            "end_date": _resolve_end_date(now, end_dates)[0],
             "columns": columns,
         }
-        for station_id in station_ids
+        for station_id, start_dates, end_dates in zip(station_ids, start_dates, end_dates)
     ]
 
     # Fetch data concurrently using multithreading
@@ -124,35 +134,51 @@ def _fetch_ndbc(
 
     # Check for errors and collect results
     multifutures.check_results(results)
+
     dataframes = {
         result.kwargs["station_id"]: result.result for result in results if result.exception is None  # type: ignore[index]
+
     }
     return dataframes
 
 
 def fetch_ndbc_station(
     station_id: str,
     mode: str,
-    start_time: pd.Timestamp,
-    end_time: pd.Timestamp,
+    start_date: pd.Timestamp,
+    end_date: pd.Timestamp,
     columns: list[str] | None = None,
 ) -> pd.DataFrame:
-    """Retrieve the TimeSeries of a single NDBC station."""
+    """
+    Retrieve the TimeSeries of a single NDBC station.
+    Make a query to the NDBC API for data for ``station_id``
+    and return the results as a pandas dataframe.
+
+    :param station_id: The station identifier.
+    :param mode: Data mode. Read the example ndbc file for more info.
+    :param start_date: The starting date of the query.
+    :param end_date: The finishing date of the query.
+    :param columns: List of columns to retrieve.
+    :return: ``pandas.DataFrame`` with the station data.
+    """
+    logger.info("NDBC-%s: Starting data retrieval: %s - %s", station_id, start_date, end_date)
     try:
         df = ndbc_api.get_data(
             station_id=station_id,
             mode=mode,
-            start_time=start_time,
-            end_time=end_time,
+            start_time=start_date,
+            end_time=end_date,
             as_df=True,
             cols=columns,
         )
 
         if df.empty:
             logger.warning(f"No data available for station {station_id}")
+
+        logger.info("NDBC-%s: Finished data retrieval: %s - %s", station_id, start_date, end_date)
+
         return df
-    
+
     except Exception as e:
         logger.error(f"Error fetching data for station {station_id}: {str(e)}")
-        return pd.DataFrame()
-
+        return pd.DataFrame()
diff --git a/tests/ndbc_test.py b/tests/ndbc_test.py
@@ -38,8 +38,8 @@ def test_fetch_ndbc_station_data():
         station_id="SRST2",
         mode="stdmet",
         # test that both formats work
-        start_date=datetime.date(2023, 1, 1),
-        end_date="2023-01-10",
+        start_dates=datetime.date(2023, 1, 1),
+        end_dates="2023-01-10"
     )
 
     assert isinstance(dataframes, dict)
@@ -70,7 +70,7 @@ def test_fetch_ndbc_station_data():
     assert df.index[-1] == pd.to_datetime("2023-01-10 00:00:00")
 
 
-def test__fetch_ndbc_data_multiple():
+def test_fetch_ndbc_data_multiple():
     """
     This test will attempt to get data for multiple stations.
     """
@@ -79,8 +79,8 @@ def test__fetch_ndbc_data_multiple():
         station_ids=["STDM4", "TPLM2"],
         mode="stdmet",
         # test that both formats work
-        start_date=datetime.date(2023, 1, 1),
-        end_date="2023-01-10",
+        start_dates=[datetime.date(2023, 1, 1),"2023-01-01"],
+        end_dates=["2023-01-10","2023-01-20"]
     )
 
     assert isinstance(dataframes, dict)
@@ -133,16 +133,16 @@ def test__fetch_ndbc_data_multiple():
     assert df1.index[0] == pd.to_datetime("2023-01-01 00:00:00")
 
 
-def test__fetch_ndbc_data_multiple_unavaliable_avaliable_data():
+def test_fetch_ndbc_data_multiple_unavaliable_avaliable_data():
     """
     This is a test that makes sure that the function can handle when some stations have data and some don't.
     """
     dataframes = ndbc._fetch_ndbc(
         station_ids=["41001", "STDM4"],
         mode="stdmet",
         # test that both formats work
-        start_date=datetime.date(2023, 1, 1),
-        end_date="2023-01-10",
+        start_dates=[datetime.date(2023, 1, 1),"2023-01-01"],
+        end_dates=["2023-01-10","2023-01-20"]
     )
 
     assert isinstance(dataframes, dict)
@@ -172,4 +172,4 @@ def test__fetch_ndbc_data_multiple_unavaliable_avaliable_data():
         ]
     )
     assert df.index[0] == pd.to_datetime("2023-01-01 00:00:00")
-    assert df.index[-1] == pd.to_datetime("2023-01-10 00:00:00")
+    assert df.index[-1] == pd.to_datetime("2023-01-10 00:00:00")