From 3814574fa8f05d82c4999f6b8b7e632fe9898c57 Mon Sep 17 00:00:00 2001
From: Konstantin Malanchev <hombit@gmail.com>
Date: Tue, 22 Aug 2023 17:25:06 -0400
Subject: [PATCH] Initial support of light-curve

---
 pyproject.toml                              |  1 +
 src/tape/analysis/__init__.py               |  1 +
 src/tape/analysis/feature_extraction.py     | 94 +++++++++++++++++++++
 tests/tape_tests/test_feature_extraction.py | 58 +++++++++++++
 4 files changed, 154 insertions(+)
 create mode 100644 src/tape/analysis/feature_extraction.py
 create mode 100644 tests/tape_tests/test_feature_extraction.py

diff --git a/pyproject.toml b/pyproject.toml
index 026f8301..d288af33 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,6 +27,7 @@ dependencies = [
     'coverage',
     "deprecated",
     "ipykernel", # Support for Jupyter notebooks
+    "light-curve>=0.7.3,<0.8.0",
 ]
 
 # On a mac, install optional dependencies with `pip install '.[dev]'` (include the single quotes)
diff --git a/src/tape/analysis/__init__.py b/src/tape/analysis/__init__.py
index 1622a47c..24602a73 100644
--- a/src/tape/analysis/__init__.py
+++ b/src/tape/analysis/__init__.py
@@ -1,3 +1,4 @@
+from .feature_extraction import extract_features  # noqa
 from .light_curve import LightCurve  # noqa
 from .stetsonj import *  # noqa
 from .structurefunction2 import *  # noqa
diff --git a/src/tape/analysis/feature_extraction.py b/src/tape/analysis/feature_extraction.py
new file mode 100644
index 00000000..f22c541c
--- /dev/null
+++ b/src/tape/analysis/feature_extraction.py
@@ -0,0 +1,94 @@
+"""
+Auxiliary code for time-series feature extraction with "light-curve"  package
+"""
+
+from typing import List
+
+import numpy as np
+import pandas as pd
+from light_curve.light_curve_ext import _FeatureEvaluator as BaseLightCurveFeature
+
+
+__all__ = ["cols", "meta", "extract_features", "BaseLightCurveFeature"]
+
+
+def cols(ens: "Ensemble") -> List[str]:
+    """Return the columns required for the time-series feature extraction
+
+    Parameters
+    ----------
+    ens : `Ensemble`
+        Ensemble of light curves
+
+    Returns
+    -------
+    cols : `list` of `str`
+        List of input column names
+    """
+    return [ens._time_col, ens._flux_col, ens._err_col, ens._band_col]
+
+
+def meta(feature: BaseLightCurveFeature) -> pd.DataFrame:
+    """Return the meta required by Dask
+
+    Parameters
+    ----------
+    feature : `BaseLightCurveFeature`
+        Feature extractor in use, its `.names` attribute will be used
+
+    Returns
+    -------
+    meta : `list` of `str`
+        List of output column names
+    """
+    return pd.DataFrame(dtype=np.float64, columns=feature.names)
+
+
+def extract_features(
+    time, flux, err, band, *, feature: BaseLightCurveFeature, band_to_calc: str, **kwargs
+) -> pd.Series:
+    """
+    Apply a feature extractor to a light curve, concatenating the results over
+    all bands.
+
+    Parameters
+    ----------
+    feature : `BaseLightCurveFeature`
+        Feature extractor to apply, see "light-curve" package for more details.
+    time : `numpy.ndarray`
+        Time values
+    flux : `numpy.ndarray`
+        Brightness values, flux or magnitudes
+    err : `numpy.ndarray`
+        Errors for "flux"
+    band : `numpy.ndarray`
+        Passband names.
+    **kwargs : `dict`
+        Additional keyword arguments to pass to the feature extractor.
+
+    Returns
+    -------
+    features : pandas.DataFrame
+        Feature values for each band, dtype is a common type for input arrays.
+    """
+
+    # Select passband to calculate
+    band_mask = band == band_to_calc
+    time, flux, err = (a[band_mask] for a in (time, flux, err))
+
+    # Sort inputs by time if not already sorted
+    if not kwargs.get("sorted", False):
+        sort_idx = np.argsort(time)
+        time, flux, err, band = (a[sort_idx] for a in (time, flux, err, band))
+        # Now we can update the kwargs for better performance
+        kwargs = kwargs.copy()
+        kwargs["sorted"] = True
+
+    # Convert the numerical arrays to a common dtype
+    dtype = np.find_common_type([a.dtype for a in (time, flux, err)], [])
+    time, flux, err = (a.astype(dtype) for a in (time, flux, err))
+
+    values = feature(time, flux, err, **kwargs)
+
+    series = pd.Series(dict(zip(feature.names, values)))
+    return series
diff --git a/tests/tape_tests/test_feature_extraction.py b/tests/tape_tests/test_feature_extraction.py
new file mode 100644
index 00000000..71ecd41b
--- /dev/null
+++ b/tests/tape_tests/test_feature_extraction.py
@@ -0,0 +1,58 @@
+"""Test feature extraction with light_curve package"""
+
+import light_curve as licu
+import numpy as np
+from numpy.testing import assert_array_equal, assert_allclose
+
+from tape import Ensemble
+from tape.analysis.feature_extraction import extract_features, cols, meta
+from tape.utils import ColumnMapper
+
+
+def test_stetsonk():
+    stetson_k = licu.StetsonK()
+
+    time = np.array([5.0, 4.0, 3.0, 2.0, 1.0, 0.0] * 2)
+    flux = 1.0 + time**2.0
+    err = np.full_like(time, 0.1, dtype=np.float32)
+    band = np.r_[["g"] * 6, ["r"] * 6]
+
+    result = extract_features(feature=stetson_k, time=time, flux=flux, err=err, band=band, band_to_calc="g")
+    assert result.shape == (1,)
+    assert_array_equal(result.index, ["stetson_K"])
+    assert_allclose(result.values, 0.84932, rtol=1e-5)
+    assert_array_equal(result.dtypes, np.float64)
+
+
+def test_stetsonk_with_ensemble():
+    n = 5
+
+    object1 = {
+        "id": np.full(n, 1),
+        "time": np.arange(n, dtype=np.float64),
+        "flux": np.linspace(1.0, 2.0, n),
+        "err": np.full(n, 0.1),
+        "band": np.full(n, "g"),
+    }
+    object2 = {
+        "id": np.full(2 * n, 2),
+        "time": np.arange(2 * n, dtype=np.float64),
+        "flux": np.r_[np.linspace(1.0, 2.0, n), np.linspace(1.0, 2.0, n)],
+        "err": np.full(2 * n, 0.01),
+        "band": np.r_[np.full(n, "g"), np.full(n, "r")],
+    }
+    rows = {column: np.concatenate([object1[column], object2[column]]) for column in object1}
+
+    cmap = ColumnMapper(id_col="id", time_col="time", flux_col="flux", err_col="err", band_col="band")
+    ens = Ensemble().from_source_dict(rows, cmap)
+
+    stetson_k = licu.StetsonK()
+    result = ens.batch(
+        extract_features,
+        *cols(ens),
+        meta=meta(stetson_k),
+        feature=stetson_k,
+        band_to_calc="g",
+    )
+
+    assert_allclose(result, 0.848528, rtol=1e-5)