From 797e8a6800673e33d0901ea362eb29eff9d7e683 Mon Sep 17 00:00:00 2001 From: Konstantin Malanchev Date: Tue, 22 Aug 2023 17:25:06 -0400 Subject: [PATCH 1/5] Initial support of light-curve --- pyproject.toml | 1 + src/tape/analysis/__init__.py | 1 + src/tape/analysis/feature_extractor.py | 95 +++++++++++++++++++++ src/tape/ensemble.py | 5 ++ tests/tape_tests/test_analysis.py | 8 +- tests/tape_tests/test_feature_extraction.py | 58 +++++++++++++ 6 files changed, 166 insertions(+), 2 deletions(-) create mode 100644 src/tape/analysis/feature_extractor.py create mode 100644 tests/tape_tests/test_feature_extraction.py diff --git a/pyproject.toml b/pyproject.toml index 026f8301..d288af33 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ dependencies = [ 'coverage', "deprecated", "ipykernel", # Support for Jupyter notebooks + "light-curve>=0.7.3,<0.8.0", ] # On a mac, install optional dependencies with `pip install '.[dev]'` (include the single quotes) diff --git a/src/tape/analysis/__init__.py b/src/tape/analysis/__init__.py index 3676d2b1..150edcf2 100644 --- a/src/tape/analysis/__init__.py +++ b/src/tape/analysis/__init__.py @@ -1,4 +1,5 @@ from .base import AnalysisFunction # noqa +from .feature_extractor import FeatureExtractor # noqa from .light_curve import LightCurve # noqa from .stetsonj import * # noqa from .structurefunction2 import * # noqa diff --git a/src/tape/analysis/feature_extractor.py b/src/tape/analysis/feature_extractor.py new file mode 100644 index 00000000..58560d9e --- /dev/null +++ b/src/tape/analysis/feature_extractor.py @@ -0,0 +1,95 @@ +""" +Auxiliary code for time-series feature extraction with "light-curve" package +""" + +from typing import List + +import numpy as np +import pandas as pd +from light_curve.light_curve_ext import _FeatureEvaluator as BaseLightCurveFeature + +from tape.analysis.base import AnalysisFunction + + +__all__ = ["FeatureExtractor", "BaseLightCurveFeature"] + + +class FeatureExtractor(AnalysisFunction): + """Apply light-curve package feature extractor to a light curve + + Parameters + ---------- + feature : light_curve.light_curve_ext._FeatureEvaluator + Feature extractor to apply, see "light-curve" package for more details. + + Attributes + ---------- + feature : light_curve.light_curve_ext._FeatureEvaluator + Feature extractor to apply, see "light-curve" package for more details. + """ + + def __init__(self, feature: BaseLightCurveFeature): + self.feature = feature + + def cols(self, ens: "Ensemble") -> List[str]: + return [ens._time_col, ens._flux_col, ens._err_col, ens._band_col] + + def meta(self, ens: "Ensemble") -> pd.DataFrame: + """Return the schema of the analysis function output. + + It always returns a pandas.DataFrame with the same columns as + `self.feature.names` and dtype `np.float64`. However, if + input columns are all single precision floats then the output dtype + will be `np.float32`. + """ + return pd.DataFrame(dtype=np.float64, columns=self.feature.names) + + def on(self, ens: "Ensemble") -> List[str]: + return [ens._id_col] + + def __call__(self, time, flux, err, band, *, band_to_calc: str, **kwargs) -> pd.DataFrame: + """ + Apply a feature extractor to a light curve, concatenating the results over + all bands. + + Parameters + ---------- + time : `numpy.ndarray` + Time values + flux : `numpy.ndarray` + Brightness values, flux or magnitudes + err : `numpy.ndarray` + Errors for "flux" + band : `numpy.ndarray` + Passband names. + band_to_calc : `str` + Name of the passband to calculate features for. + **kwargs : `dict` + Additional keyword arguments to pass to the feature extractor. + + Returns + ------- + features : pandas.DataFrame + Feature values for each band, dtype is a common type for input arrays. + """ + + # Select passband to calculate + band_mask = band == band_to_calc + time, flux, err = (a[band_mask] for a in (time, flux, err)) + + # Sort inputs by time if not already sorted + if not kwargs.get("sorted", False): + sort_idx = np.argsort(time) + time, flux, err, band = (a[sort_idx] for a in (time, flux, err, band)) + # Now we can update the kwargs for better performance + kwargs = kwargs.copy() + kwargs["sorted"] = True + + # Convert the numerical arrays to a common dtype + dtype = np.find_common_type([a.dtype for a in (time, flux, err)], []) + time, flux, err = (a.astype(dtype) for a in (time, flux, err)) + + values = self.feature(time, flux, err, **kwargs) + + series = pd.Series(dict(zip(self.feature.names, values))) + return series diff --git a/src/tape/ensemble.py b/src/tape/ensemble.py index 3b46900b..239da8f9 100644 --- a/src/tape/ensemble.py +++ b/src/tape/ensemble.py @@ -12,6 +12,7 @@ from dask.distributed import Client from .analysis.base import AnalysisFunction +from .analysis.feature_extractor import BaseLightCurveFeature, FeatureExtractor from .analysis.structure_function import SF_METHODS from .analysis.structurefunction2 import calc_sf2 from .timeseries import TimeSeries @@ -705,6 +706,10 @@ def batch(self, func, *args, meta=None, use_map=True, compute=True, on=None, **k """ self._lazy_sync_tables(table="all") + # Convert light-curve package feature into analysis function + if isinstance(func, BaseLightCurveFeature): + func = FeatureExtractor(func) + # Extract function information if TAPE analysis function if isinstance(func, AnalysisFunction): args = func.cols(self) meta = func.meta(self) diff --git a/tests/tape_tests/test_analysis.py b/tests/tape_tests/test_analysis.py index ece3bad5..ebc35fca 100644 --- a/tests/tape_tests/test_analysis.py +++ b/tests/tape_tests/test_analysis.py @@ -14,6 +14,12 @@ def test_analysis_function(cls): """ Test AnalysisFunction child classes """ + # We skip child classes with non-trivial constructors + try: + obj = cls() + except TypeError: + pytest.skip(f"Class {cls} has non-trivial constructor") + rows = { "id": [8001, 8001, 8001, 8001, 8002, 8002, 8002, 8002, 8002], "time": [10.1, 10.2, 10.2, 11.1, 11.2, 11.3, 11.4, 15.0, 15.1], @@ -24,8 +30,6 @@ def test_analysis_function(cls): cmap = ColumnMapper(id_col="id", time_col="time", flux_col="flux", err_col="err", band_col="band") ens = Ensemble().from_source_dict(rows, column_mapper=cmap) - obj = cls() - assert isinstance(obj.cols(ens), list) assert len(obj.cols(ens)) > 0 assert isinstance(obj.on(ens), list) diff --git a/tests/tape_tests/test_feature_extraction.py b/tests/tape_tests/test_feature_extraction.py new file mode 100644 index 00000000..16912727 --- /dev/null +++ b/tests/tape_tests/test_feature_extraction.py @@ -0,0 +1,58 @@ +"""Test feature extraction with light_curve package""" + +import light_curve as licu +import numpy as np +from numpy.testing import assert_array_equal, assert_allclose + +from tape import Ensemble +from tape.analysis.feature_extractor import FeatureExtractor +from tape.utils import ColumnMapper + + +def test_stetsonk(): + stetson_k = licu.StetsonK() + + time = np.array([5.0, 4.0, 3.0, 2.0, 1.0, 0.0] * 2) + flux = 1.0 + time**2.0 + err = np.full_like(time, 0.1, dtype=np.float32) + band = np.r_[["g"] * 6, ["r"] * 6] + + extract_features = FeatureExtractor(stetson_k) + result = extract_features(time=time, flux=flux, err=err, band=band, band_to_calc="g") + assert result.shape == (1,) + assert_array_equal(result.index, ["stetson_K"]) + assert_allclose(result.values, 0.84932, rtol=1e-5) + assert_array_equal(result.dtypes, np.float64) + + +def test_stetsonk_with_ensemble(): + n = 5 + + object1 = { + "id": np.full(n, 1), + "time": np.arange(n, dtype=np.float64), + "flux": np.linspace(1.0, 2.0, n), + "err": np.full(n, 0.1), + "band": np.full(n, "g"), + } + object2 = { + "id": np.full(2 * n, 2), + "time": np.arange(2 * n, dtype=np.float64), + "flux": np.r_[np.linspace(1.0, 2.0, n), np.linspace(1.0, 2.0, n)], + "err": np.full(2 * n, 0.01), + "band": np.r_[np.full(n, "g"), np.full(n, "r")], + } + rows = {column: np.concatenate([object1[column], object2[column]]) for column in object1} + + cmap = ColumnMapper(id_col="id", time_col="time", flux_col="flux", err_col="err", band_col="band") + ens = Ensemble().from_source_dict(rows, cmap) + + stetson_k = licu.Extractor(licu.AndersonDarlingNormal(), licu.InterPercentileRange(0.25), licu.StetsonK()) + result = ens.batch( + stetson_k, + band_to_calc="g", + ) + + assert result.shape == (2, 3) + assert_array_equal(result.columns, ["anderson_darling_normal", "inter_percentile_range_25", "stetson_K"]) + assert_allclose(result, [[0.114875, 0.625, 0.848528]] * 2, atol=1e-5) From 3e8dafbec10165b307d9daa2912811cab454434e Mon Sep 17 00:00:00 2001 From: Konstantin Malanchev Date: Wed, 30 Aug 2023 09:55:00 -0400 Subject: [PATCH 2/5] batch docs for light-curve pckg --- src/tape/ensemble.py | 50 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 10 deletions(-) diff --git a/src/tape/ensemble.py b/src/tape/ensemble.py index 239da8f9..79f869d3 100644 --- a/src/tape/ensemble.py +++ b/src/tape/ensemble.py @@ -668,16 +668,24 @@ def batch(self, func, *args, meta=None, use_map=True, compute=True, on=None, **k Parameters ---------- func : `function` - A function to apply to all objects in the ensemble + A function to apply to all objects in the ensemble. The function + could be a TAPE function, an initialized feature extractor from + `light-curve` package or a user-defined function. In the least + case the function must have the following signature: + `func(*cols, **kwargs)`, where the names of the `cols` are + specified in `args`, `kwargs` are keyword arguments passed to the + function, and the return value schema is described by `meta`. + For TAPE and `light-curve` functions `args`, `meta` and `on` are + populated automatically. *args: Denotes the ensemble columns to use as inputs for a function, order must be correct for function. If passing a TAPE - function, these are populated automatically. + or `light-curve` function, these are populated automatically. meta : `pd.Series`, `pd.DataFrame`, `dict`, or `tuple-like` Dask's meta parameter, which lays down the expected structure of - the results. Overridden by TAPE for TAPE + the results. Overridden by TAPE for TAPE and `light-curve` functions. If none, attempts to coerce the result to a - pandas.series. + pandas.Series. use_map : `boolean` Determines whether `dask.dataframe.DataFrame.map_partitions` is used (True). Using map_partitions is generally more efficient, but @@ -688,21 +696,43 @@ def batch(self, func, *args, meta=None, use_map=True, compute=True, on=None, **k later compute call. on: 'str' or 'list' Designates which column(s) to groupby. Columns may be from the - source or object tables. + source or object tables. For TAPE and `light-curve` functions + this is populated automatically. **kwargs: Additional optional parameters passed for the selected function Returns - ---------- + ------- result: `Dask.Series` Series of function results - Example - ---------- - ` + Examples + -------- + Run a TAPE function on the ensemble: + ``` from tape.analysis.stetsonj import calc_stetson_J + ens = Ensemble().from_dataset('rrlyr82') ensemble.batch(calc_stetson_J, band_to_calc='i') - ` + ``` + + Run a light-curve function on the ensemble: + ``` + from light_curve import EtaE + ens.batch(EtaE(), band_to_calc='g') + ``` + + Run a custom function on the ensemble: + ``` + def s2n_inter_quartile_range(flux, err): + first, third = np.quantile(flux / err, [0.25, 0.75]) + return third - first + + ens.batch(s2n_inter_quartile_range, ens._flux_col, ens._err_col) + ``` + Or even a numpy built-in function: + ``` + amplitudes = ens.batch(np.ptp, ens._flux_col) + ``` """ self._lazy_sync_tables(table="all") From 25698c6ac5e5e397f2f051e46a3942e0117cfc7c Mon Sep 17 00:00:00 2001 From: Konstantin Malanchev Date: Wed, 30 Aug 2023 10:24:52 -0400 Subject: [PATCH 3/5] Fix tests for using dask_client --- tests/tape_tests/test_analysis.py | 4 ++-- tests/tape_tests/test_feature_extraction.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/tape_tests/test_analysis.py b/tests/tape_tests/test_analysis.py index ebc35fca..c75a9621 100644 --- a/tests/tape_tests/test_analysis.py +++ b/tests/tape_tests/test_analysis.py @@ -10,7 +10,7 @@ @pytest.mark.parametrize("cls", analysis.AnalysisFunction.__subclasses__()) -def test_analysis_function(cls): +def test_analysis_function(cls, dask_client): """ Test AnalysisFunction child classes """ @@ -28,7 +28,7 @@ def test_analysis_function(cls): "flux": [1.0, 2.0, 5.0, 3.0, 1.0, 2.0, 3.0, 4.0, 5.0], } cmap = ColumnMapper(id_col="id", time_col="time", flux_col="flux", err_col="err", band_col="band") - ens = Ensemble().from_source_dict(rows, column_mapper=cmap) + ens = Ensemble(client=dask_client).from_source_dict(rows, column_mapper=cmap) assert isinstance(obj.cols(ens), list) assert len(obj.cols(ens)) > 0 diff --git a/tests/tape_tests/test_feature_extraction.py b/tests/tape_tests/test_feature_extraction.py index 16912727..c2567934 100644 --- a/tests/tape_tests/test_feature_extraction.py +++ b/tests/tape_tests/test_feature_extraction.py @@ -25,7 +25,7 @@ def test_stetsonk(): assert_array_equal(result.dtypes, np.float64) -def test_stetsonk_with_ensemble(): +def test_stetsonk_with_ensemble(dask_client): n = 5 object1 = { @@ -45,7 +45,7 @@ def test_stetsonk_with_ensemble(): rows = {column: np.concatenate([object1[column], object2[column]]) for column in object1} cmap = ColumnMapper(id_col="id", time_col="time", flux_col="flux", err_col="err", band_col="band") - ens = Ensemble().from_source_dict(rows, cmap) + ens = Ensemble(dask_client).from_source_dict(rows, cmap) stetson_k = licu.Extractor(licu.AndersonDarlingNormal(), licu.InterPercentileRange(0.25), licu.StetsonK()) result = ens.batch( From 5be3f852216cc7d1a107cd7cfa9b37326ebe4a07 Mon Sep 17 00:00:00 2001 From: Konstantin Malanchev Date: Wed, 30 Aug 2023 11:00:22 -0400 Subject: [PATCH 4/5] Add light_curve to tutorial --- .../tutorials/working_with_the_ensemble.ipynb | 459 +++++++++++++++--- 1 file changed, 402 insertions(+), 57 deletions(-) diff --git a/docs/tutorials/working_with_the_ensemble.ipynb b/docs/tutorials/working_with_the_ensemble.ipynb index b4aede41..c5098095 100644 --- a/docs/tutorials/working_with_the_ensemble.ipynb +++ b/docs/tutorials/working_with_the_ensemble.ipynb @@ -19,8 +19,13 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-30T14:58:34.203827Z", + "start_time": "2023-08-30T14:58:34.187300Z" + } + }, "outputs": [], "source": [ "import numpy as np\n", @@ -53,9 +58,23 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-30T14:58:36.125402Z", + "start_time": "2023-08-30T14:58:34.190790Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": "" + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from tape.ensemble import Ensemble\n", "\n", @@ -90,9 +109,23 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-30T14:58:36.209050Z", + "start_time": "2023-08-30T14:58:36.115521Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": "" + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from tape.utils import ColumnMapper\n", "\n", @@ -127,9 +160,24 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-30T14:58:36.219081Z", + "start_time": "2023-08-30T14:58:36.205629Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": "Dask DataFrame Structure:\n time flux error band\nnpartitions=1 \n0 float64 float64 float64 string\n9 ... ... ... ...\nDask Name: sort_index, 4 graph layers", + "text/html": "
Dask DataFrame Structure:
\n
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
timefluxerrorband
npartitions=1
0float64float64float64string
9............
\n
\n
Dask Name: sort_index, 4 graph layers
" + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ens._source # We have not actually loaded any data into memory" ] @@ -143,9 +191,24 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-30T14:58:36.484627Z", + "start_time": "2023-08-30T14:58:36.213215Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": " time flux error band\nid \n0 1.0 120.851100 11.633225 g\n0 2.0 136.016225 12.635291 g\n0 3.0 100.005719 14.429710 g\n0 4.0 115.116629 11.786349 g\n0 5.0 107.337795 14.542676 g\n.. ... ... ... ...\n9 96.0 138.371176 12.237541 r\n9 97.0 104.060829 10.920638 r\n9 98.0 149.920678 14.143664 r\n9 99.0 119.480601 10.154990 r\n9 100.0 145.260138 14.733641 r\n\n[1000 rows x 4 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
timefluxerrorband
id
01.0120.85110011.633225g
02.0136.01622512.635291g
03.0100.00571914.429710g
04.0115.11662911.786349g
05.0107.33779514.542676g
...............
996.0138.37117612.237541r
997.0104.06082910.920638r
998.0149.92067814.143664r
999.0119.48060110.154990r
9100.0145.26013814.733641r
\n

1000 rows × 4 columns

\n
" + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ens.compute(\"source\") # Compute lets dask know we're ready to bring the data into memory" ] @@ -180,9 +243,44 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-30T14:58:36.696142Z", + "start_time": "2023-08-30T14:58:36.361967Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Object Table\n", + "\n", + "Index: 10 entries, 0 to 9\n", + "Data columns (total 3 columns):\n", + " # Column Non-Null Count Dtype\n", + "--- ------ -------------- -----\n", + " 0 nobs_g 10 non-null float64\n", + " 1 nobs_r 10 non-null float64\n", + " 2 nobs_total 10 non-null float64\n", + "dtypes: float64(3)\n", + "memory usage: 320.0 bytes\n", + "Source Table\n", + "\n", + "Index: 1000 entries, 0 to 9\n", + "Data columns (total 4 columns):\n", + " # Column Non-Null Count Dtype\n", + "--- ------ -------------- -----\n", + " 0 time 1000 non-null float64\n", + " 1 flux 1000 non-null float64\n", + " 2 error 1000 non-null float64\n", + " 3 band 1000 non-null string\n", + "dtypes: float64(3), string(1)\n", + "memory usage: 36.1 KB\n" + ] + } + ], "source": [ "# Inspection\n", "\n", @@ -198,18 +296,48 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-30T14:58:36.696879Z", + "start_time": "2023-08-30T14:58:36.510953Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": "band nobs_g nobs_r nobs_total\nid \n0 50 50 100\n1 50 50 100\n2 50 50 100\n3 50 50 100\n4 50 50 100", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
bandnobs_gnobs_rnobs_total
id
05050100
15050100
25050100
35050100
45050100
\n
" + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ens.head(\"object\", 5) # Grabs the first 5 rows of the object table" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-30T14:58:36.697259Z", + "start_time": "2023-08-30T14:58:36.561399Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": " time flux error band\nid \n9 96.0 138.371176 12.237541 r\n9 97.0 104.060829 10.920638 r\n9 98.0 149.920678 14.143664 r\n9 99.0 119.480601 10.154990 r\n9 100.0 145.260138 14.733641 r", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
timefluxerrorband
id
996.0138.37117612.237541r
997.0104.06082910.920638r
998.0149.92067814.143664r
999.0119.48060110.154990r
9100.0145.26013814.733641r
\n
" + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ens.tail(\"source\", 5) # Grabs the last 5 rows of the source table" ] @@ -223,9 +351,24 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-30T14:58:36.697769Z", + "start_time": "2023-08-30T14:58:36.592238Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": " time flux error band\nid \n0 1.0 120.851100 11.633225 g\n0 2.0 136.016225 12.635291 g\n0 3.0 100.005719 14.429710 g\n0 4.0 115.116629 11.786349 g\n0 5.0 107.337795 14.542676 g\n.. ... ... ... ...\n9 96.0 138.371176 12.237541 r\n9 97.0 104.060829 10.920638 r\n9 98.0 149.920678 14.143664 r\n9 99.0 119.480601 10.154990 r\n9 100.0 145.260138 14.733641 r\n\n[1000 rows x 4 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
timefluxerrorband
id
01.0120.85110011.633225g
02.0136.01622512.635291g
03.0100.00571914.429710g
04.0115.11662911.786349g
05.0107.33779514.542676g
...............
996.0138.37117612.237541r
997.0104.06082910.920638r
998.0149.92067814.143664r
999.0119.48060110.154990r
9100.0145.26013814.733641r
\n

1000 rows × 4 columns

\n
" + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ens.compute(\"source\")" ] @@ -243,9 +386,24 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 10, + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-30T14:58:36.698305Z", + "start_time": "2023-08-30T14:58:36.615492Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": " time flux error band\nid \n0 2.0 136.016225 12.635291 g\n0 12.0 134.260975 10.685679 g\n0 14.0 143.905872 13.484091 g\n0 16.0 133.523376 13.777315 g\n0 21.0 140.037228 10.099401 g\n.. ... ... ... ...\n9 91.0 140.368263 14.320720 r\n9 92.0 148.476901 12.239495 r\n9 96.0 138.371176 12.237541 r\n9 98.0 149.920678 14.143664 r\n9 100.0 145.260138 14.733641 r\n\n[422 rows x 4 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
timefluxerrorband
id
02.0136.01622512.635291g
012.0134.26097510.685679g
014.0143.90587213.484091g
016.0133.52337613.777315g
021.0140.03722810.099401g
...............
991.0140.36826314.320720r
992.0148.47690112.239495r
996.0138.37117612.237541r
998.0149.92067814.143664r
9100.0145.26013814.733641r
\n

422 rows × 4 columns

\n
" + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ens.query(f\"{ens._flux_col} > 130.0\", table=\"source\")\n", "ens.compute(\"source\")" @@ -260,9 +418,23 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-30T14:58:36.754980Z", + "start_time": "2023-08-30T14:58:36.669055Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": "id\n0 False\n0 True\n0 False\n0 False\n0 True\n ... \n9 False\n9 False\n9 False\n9 False\n9 False\nName: error, Length: 422, dtype: bool" + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "keep_rows = ens._source[\"error\"] < 12.0\n", "keep_rows.compute()" @@ -277,9 +449,24 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-30T14:58:36.792088Z", + "start_time": "2023-08-30T14:58:36.690772Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": " time flux error band\nid \n0 12.0 134.260975 10.685679 g\n0 21.0 140.037228 10.099401 g\n0 22.0 148.413079 10.131055 g\n0 24.0 134.616131 11.231055 g\n0 30.0 143.907125 11.395918 g\n.. ... ... ... ...\n9 81.0 149.016644 10.755373 r\n9 85.0 130.071670 11.960329 r\n9 86.0 136.297942 11.419338 r\n9 88.0 134.215481 11.202422 r\n9 89.0 147.302751 11.271162 r\n\n[169 rows x 4 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
timefluxerrorband
id
012.0134.26097510.685679g
021.0140.03722810.099401g
022.0148.41307910.131055g
024.0134.61613111.231055g
030.0143.90712511.395918g
...............
981.0149.01664410.755373r
985.0130.07167011.960329r
986.0136.29794211.419338r
988.0134.21548111.202422r
989.0147.30275111.271162r
\n

169 rows × 4 columns

\n
" + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ens.filter_from_series(keep_rows, table=\"source\")\n", "ens.compute(\"source\")" @@ -294,9 +481,44 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 13, + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-30T14:58:37.026887Z", + "start_time": "2023-08-30T14:58:36.715537Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Object Table\n", + "\n", + "Index: 10 entries, 0 to 9\n", + "Data columns (total 3 columns):\n", + " # Column Non-Null Count Dtype\n", + "--- ------ -------------- -----\n", + " 0 nobs_g 10 non-null float64\n", + " 1 nobs_r 10 non-null float64\n", + " 2 nobs_total 10 non-null float64\n", + "dtypes: float64(3)\n", + "memory usage: 320.0 bytes\n", + "Source Table\n", + "\n", + "Index: 169 entries, 0 to 9\n", + "Data columns (total 4 columns):\n", + " # Column Non-Null Count Dtype\n", + "--- ------ -------------- -----\n", + " 0 time 169 non-null float64\n", + " 1 flux 169 non-null float64\n", + " 2 error 169 non-null float64\n", + " 3 band 169 non-null string\n", + "dtypes: float64(3), string(1)\n", + "memory usage: 6.1 KB\n" + ] + } + ], "source": [ "# Cleaning nans\n", "ens.dropna(table=\"source\") # clean nans from source table\n", @@ -327,9 +549,24 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 14, + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-30T14:58:37.095991Z", + "start_time": "2023-08-30T14:58:36.917820Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": " time flux error band band2\nid \n0 12.0 134.260975 10.685679 g g2\n0 21.0 140.037228 10.099401 g g2\n0 22.0 148.413079 10.131055 g g2\n0 24.0 134.616131 11.231055 g g2\n0 30.0 143.907125 11.395918 g g2\n.. ... ... ... ... ...\n9 81.0 149.016644 10.755373 r r2\n9 85.0 130.071670 11.960329 r r2\n9 86.0 136.297942 11.419338 r r2\n9 88.0 134.215481 11.202422 r r2\n9 89.0 147.302751 11.271162 r r2\n\n[169 rows x 5 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
timefluxerrorbandband2
id
012.0134.26097510.685679gg2
021.0140.03722810.099401gg2
022.0148.41307910.131055gg2
024.0134.61613111.231055gg2
030.0143.90712511.395918gg2
..................
981.0149.01664410.755373rr2
985.0130.07167011.960329rr2
986.0136.29794211.419338rr2
988.0134.21548111.202422rr2
989.0147.30275111.271162rr2
\n

169 rows × 5 columns

\n
" + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Add a new column so we can filter it out later.\n", "ens._source = ens._source.assign(band2=ens._source[\"band\"] + \"2\")\n", @@ -338,9 +575,24 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 15, + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-30T14:58:37.096860Z", + "start_time": "2023-08-30T14:58:36.937579Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": " time flux error band\nid \n0 12.0 134.260975 10.685679 g\n0 21.0 140.037228 10.099401 g\n0 22.0 148.413079 10.131055 g\n0 24.0 134.616131 11.231055 g\n0 30.0 143.907125 11.395918 g\n.. ... ... ... ...\n9 81.0 149.016644 10.755373 r\n9 85.0 130.071670 11.960329 r\n9 86.0 136.297942 11.419338 r\n9 88.0 134.215481 11.202422 r\n9 89.0 147.302751 11.271162 r\n\n[169 rows x 4 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
timefluxerrorband
id
012.0134.26097510.685679g
021.0140.03722810.099401g
022.0148.41307910.131055g
024.0134.61613111.231055g
030.0143.90712511.395918g
...............
981.0149.01664410.755373r
985.0130.07167011.960329r
986.0136.29794211.419338r
988.0134.21548111.202422r
989.0147.30275111.271162r
\n

169 rows × 4 columns

\n
" + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ens.select([\"time\", \"flux\", \"error\", \"band\"], table=\"source\")\n", "ens.compute(\"source\")" @@ -359,9 +611,24 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 16, + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-30T14:58:37.097571Z", + "start_time": "2023-08-30T14:58:36.958927Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": " time flux error band lower_bnd\nid \n0 12.0 134.260975 10.685679 g 112.889618\n0 21.0 140.037228 10.099401 g 119.838427\n0 22.0 148.413079 10.131055 g 128.150969\n0 24.0 134.616131 11.231055 g 112.154020\n0 30.0 143.907125 11.395918 g 121.115288\n.. ... ... ... ... ...\n9 81.0 149.016644 10.755373 r 127.505899\n9 85.0 130.071670 11.960329 r 106.151012\n9 86.0 136.297942 11.419338 r 113.459267\n9 88.0 134.215481 11.202422 r 111.810638\n9 89.0 147.302751 11.271162 r 124.760428\n\n[169 rows x 5 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
timefluxerrorbandlower_bnd
id
012.0134.26097510.685679g112.889618
021.0140.03722810.099401g119.838427
022.0148.41307910.131055g128.150969
024.0134.61613111.231055g112.154020
030.0143.90712511.395918g121.115288
..................
981.0149.01664410.755373r127.505899
985.0130.07167011.960329r106.151012
986.0136.29794211.419338r113.459267
988.0134.21548111.202422r111.810638
989.0147.30275111.271162r124.760428
\n

169 rows × 5 columns

\n
" + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ens.assign(table=\"source\", lower_bnd=lambda x: x[\"flux\"] - 2.0 * x[\"error\"])\n", "ens.compute(table=\"source\")" @@ -379,9 +646,23 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 17, + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-30T14:58:37.492980Z", + "start_time": "2023-08-30T14:58:36.981314Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": "id\n0 {'g': -0.8833723170736909, 'r': -0.81291313232...\n1 {'g': -0.7866661902102343, 'r': -0.79927945599...\n2 {'g': -0.8650811883274131, 'r': -0.87939085289...\n3 {'g': -0.9140015912865537, 'r': -0.90284371456...\n4 {'g': -0.8232578922439672, 'r': -0.81922455220...\n5 {'g': -0.668795976899231, 'r': -0.784477243304...\n6 {'g': -0.8115552290707235, 'r': -0.90666227394...\n7 {'g': -0.6217573153267577, 'r': -0.60999974938...\n8 {'g': -0.7001359525394822, 'r': -0.73620435205...\n9 {'g': -0.7266040976469818, 'r': -0.68878460237...\nName: stetsonJ, dtype: object" + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# using tape analysis functions\n", "from tape.analysis import calc_stetson_J\n", @@ -390,6 +671,46 @@ "res" ] }, + { + "cell_type": "markdown", + "source": [ + "## Using light-curve package features\n", + "\n", + "`Ensemble.batch` also supports the use of [light-curve](https://pypi.org/project/light-curve/) package feature extractor:" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 18, + "outputs": [ + { + "data": { + "text/plain": " amplitude anderson_darling_normal stetson_K\nid \n0 7.076052 0.177751 0.834036\n1 8.591493 0.513749 0.769344\n2 8.141189 0.392628 0.856307\n3 5.751674 0.295631 0.809191\n4 7.871321 0.555775 0.849305\n5 8.666473 0.342937 0.823194\n6 8.649326 0.241117 0.832815\n7 8.856443 1.141906 0.772267\n8 9.297713 0.984247 0.968132\n9 8.774109 0.335798 0.754355", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
amplitudeanderson_darling_normalstetson_K
id
07.0760520.1777510.834036
18.5914930.5137490.769344
28.1411890.3926280.856307
35.7516740.2956310.809191
47.8713210.5557750.849305
58.6664730.3429370.823194
68.6493260.2411170.832815
78.8564431.1419060.772267
89.2977130.9842470.968132
98.7741090.3357980.754355
\n
" + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import light_curve as licu\n", + "\n", + "extractor = licu.Extractor(licu.Amplitude(), licu.AndersonDarlingNormal(), licu.StetsonK())\n", + "res = ens.batch(extractor, compute=True, band_to_calc=\"g\")\n", + "res" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-08-30T14:58:37.514514Z", + "start_time": "2023-08-30T14:58:37.494001Z" + } + } + }, { "attachments": {}, "cell_type": "markdown", @@ -403,8 +724,13 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 19, + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-30T14:58:37.519972Z", + "start_time": "2023-08-30T14:58:37.515404Z" + } + }, "outputs": [], "source": [ "import numpy as np\n", @@ -434,9 +760,23 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 20, + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-30T14:58:37.583850Z", + "start_time": "2023-08-30T14:58:37.519056Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": "id\n0 {'g': 140.03722843377682, 'r': 138.955084796142}\n1 {'g': 140.91515408243285, 'r': 141.44229039903...\n2 {'g': 139.42093950235392, 'r': 142.21649742828...\n3 {'g': 137.01337116218363, 'r': 139.05032340951...\n4 {'g': 134.61800608117045, 'r': 139.76505837028...\n5 {'g': 135.55144382138587, 'r': 139.41361800167...\n6 {'g': 142.93611137557423, 'r': 137.20679606847...\n7 {'g': 144.52647796976, 'r': 132.2470836256106}\n8 {'g': 144.7469760076462, 'r': 137.5226773361662}\n9 {'g': 136.89977482019205, 'r': 136.29794229244...\nName: id, dtype: object" + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Applying the function to the ensemble\n", "res = ens.batch(my_flux_average, \"flux\", \"band\", compute=True, meta=None, method=\"median\")\n", @@ -452,8 +792,13 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 21, + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-30T14:58:37.764841Z", + "start_time": "2023-08-30T14:58:37.539014Z" + } + }, "outputs": [], "source": [ "ens.client.close() # Tear down the ensemble client" From 6120262bc3ae202de6d388a280be326f00395783 Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Wed, 30 Aug 2023 10:27:06 -0700 Subject: [PATCH 5/5] coalesce fix --- src/tape/ensemble.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/tape/ensemble.py b/src/tape/ensemble.py index 79f869d3..6a93f0c9 100644 --- a/src/tape/ensemble.py +++ b/src/tape/ensemble.py @@ -443,15 +443,22 @@ def coalesce(self, input_cols, output_col, table="object", drop_inputs=False): else: raise ValueError(f"{table} is not one of 'object' or 'source'") + # Create a subset dataframe with the coalesced columns + # Drop index for dask series operations - unfortunate + coal_ddf = table_ddf[input_cols].reset_index() + # Coalesce each column iteratively i = 0 - coalesce_col = table_ddf[input_cols[0]] + coalesce_col = coal_ddf[input_cols[0]] while i < len(input_cols) - 1: - coalesce_col = coalesce_col.combine_first(table_ddf[input_cols[i + 1]]) + coalesce_col = coalesce_col.combine_first(coal_ddf[input_cols[i + 1]]) i += 1 + print("am I using this code") + # Assign the new column to the subset df, and reintroduce index + coal_ddf = coal_ddf.assign(**{output_col: coalesce_col}).set_index(self._id_col) # assign the result to the desired column name - table_ddf = table_ddf.assign(**{output_col: coalesce_col}) + table_ddf = table_ddf.assign(**{output_col: coal_ddf[output_col]}) # Drop the input columns if wanted if drop_inputs: