From 3814574fa8f05d82c4999f6b8b7e632fe9898c57 Mon Sep 17 00:00:00 2001 From: Konstantin Malanchev Date: Tue, 22 Aug 2023 17:25:06 -0400 Subject: [PATCH] Initial support of light-curve --- pyproject.toml | 1 + src/tape/analysis/__init__.py | 1 + src/tape/analysis/feature_extraction.py | 94 +++++++++++++++++++++ tests/tape_tests/test_feature_extraction.py | 58 +++++++++++++ 4 files changed, 154 insertions(+) create mode 100644 src/tape/analysis/feature_extraction.py create mode 100644 tests/tape_tests/test_feature_extraction.py diff --git a/pyproject.toml b/pyproject.toml index 026f8301..d288af33 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ dependencies = [ 'coverage', "deprecated", "ipykernel", # Support for Jupyter notebooks + "light-curve>=0.7.3,<0.8.0", ] # On a mac, install optional dependencies with `pip install '.[dev]'` (include the single quotes) diff --git a/src/tape/analysis/__init__.py b/src/tape/analysis/__init__.py index 1622a47c..24602a73 100644 --- a/src/tape/analysis/__init__.py +++ b/src/tape/analysis/__init__.py @@ -1,3 +1,4 @@ +from .feature_extraction import extract_features # noqa from .light_curve import LightCurve # noqa from .stetsonj import * # noqa from .structurefunction2 import * # noqa diff --git a/src/tape/analysis/feature_extraction.py b/src/tape/analysis/feature_extraction.py new file mode 100644 index 00000000..f22c541c --- /dev/null +++ b/src/tape/analysis/feature_extraction.py @@ -0,0 +1,94 @@ +""" +Auxiliary code for time-series feature extraction with "light-curve" package +""" + +from typing import List + +import numpy as np +import pandas as pd +from light_curve.light_curve_ext import _FeatureEvaluator as BaseLightCurveFeature + + +__all__ = ["cols", "meta", "extract_features", "BaseLightCurveFeature"] + + +def cols(ens: "Ensemble") -> List[str]: + """Return the columns required for the time-series feature extraction + + Parameters + ---------- + ens : `Ensemble` + Ensemble of light curves + + Returns + ------- + cols : `list` of `str` + List of input column names + """ + return [ens._time_col, ens._flux_col, ens._err_col, ens._band_col] + + +def meta(feature: BaseLightCurveFeature) -> pd.DataFrame: + """Return the meta required by Dask + + Parameters + ---------- + feature : `BaseLightCurveFeature` + Feature extractor in use, its `.names` attribute will be used + + Returns + ------- + meta : `list` of `str` + List of output column names + """ + return pd.DataFrame(dtype=np.float64, columns=feature.names) + + +def extract_features( + time, flux, err, band, *, feature: BaseLightCurveFeature, band_to_calc: str, **kwargs +) -> pd.Series: + """ + Apply a feature extractor to a light curve, concatenating the results over + all bands. + + Parameters + ---------- + feature : `BaseLightCurveFeature` + Feature extractor to apply, see "light-curve" package for more details. + time : `numpy.ndarray` + Time values + flux : `numpy.ndarray` + Brightness values, flux or magnitudes + err : `numpy.ndarray` + Errors for "flux" + band : `numpy.ndarray` + Passband names. + **kwargs : `dict` + Additional keyword arguments to pass to the feature extractor. + + Returns + ------- + features : pandas.DataFrame + Feature values for each band, dtype is a common type for input arrays. + """ + + # Select passband to calculate + band_mask = band == band_to_calc + time, flux, err = (a[band_mask] for a in (time, flux, err)) + + # Sort inputs by time if not already sorted + if not kwargs.get("sorted", False): + sort_idx = np.argsort(time) + time, flux, err, band = (a[sort_idx] for a in (time, flux, err, band)) + # Now we can update the kwargs for better performance + kwargs = kwargs.copy() + kwargs["sorted"] = True + + # Convert the numerical arrays to a common dtype + dtype = np.find_common_type([a.dtype for a in (time, flux, err)], []) + time, flux, err = (a.astype(dtype) for a in (time, flux, err)) + + values = feature(time, flux, err, **kwargs) + + series = pd.Series(dict(zip(feature.names, values))) + return series diff --git a/tests/tape_tests/test_feature_extraction.py b/tests/tape_tests/test_feature_extraction.py new file mode 100644 index 00000000..71ecd41b --- /dev/null +++ b/tests/tape_tests/test_feature_extraction.py @@ -0,0 +1,58 @@ +"""Test feature extraction with light_curve package""" + +import light_curve as licu +import numpy as np +from numpy.testing import assert_array_equal, assert_allclose + +from tape import Ensemble +from tape.analysis.feature_extraction import extract_features, cols, meta +from tape.utils import ColumnMapper + + +def test_stetsonk(): + stetson_k = licu.StetsonK() + + time = np.array([5.0, 4.0, 3.0, 2.0, 1.0, 0.0] * 2) + flux = 1.0 + time**2.0 + err = np.full_like(time, 0.1, dtype=np.float32) + band = np.r_[["g"] * 6, ["r"] * 6] + + result = extract_features(feature=stetson_k, time=time, flux=flux, err=err, band=band, band_to_calc="g") + assert result.shape == (1,) + assert_array_equal(result.index, ["stetson_K"]) + assert_allclose(result.values, 0.84932, rtol=1e-5) + assert_array_equal(result.dtypes, np.float64) + + +def test_stetsonk_with_ensemble(): + n = 5 + + object1 = { + "id": np.full(n, 1), + "time": np.arange(n, dtype=np.float64), + "flux": np.linspace(1.0, 2.0, n), + "err": np.full(n, 0.1), + "band": np.full(n, "g"), + } + object2 = { + "id": np.full(2 * n, 2), + "time": np.arange(2 * n, dtype=np.float64), + "flux": np.r_[np.linspace(1.0, 2.0, n), np.linspace(1.0, 2.0, n)], + "err": np.full(2 * n, 0.01), + "band": np.r_[np.full(n, "g"), np.full(n, "r")], + } + rows = {column: np.concatenate([object1[column], object2[column]]) for column in object1} + + cmap = ColumnMapper(id_col="id", time_col="time", flux_col="flux", err_col="err", band_col="band") + ens = Ensemble().from_source_dict(rows, cmap) + + stetson_k = licu.StetsonK() + result = ens.batch( + extract_features, + *cols(ens), + meta=meta(stetson_k), + feature=stetson_k, + band_to_calc="g", + ) + + assert_allclose(result, 0.848528, rtol=1e-5)