-
Notifications
You must be signed in to change notification settings - Fork 4
/
light_curve_fit.py
164 lines (137 loc) · 8 KB
/
light_curve_fit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# Standard modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import validation_curve, ShuffleSplit
from sklearn.metrics import explained_variance_score, make_scorer
from sklearn.svm import SVR
# Custom modules
from jpm_time_conversions import metatimes_to_seconds_since_start, datetimeindex_to_human
from jpm_number_printing import latex_float
import jedi_config
__author__ = 'James Paul Mason'
__contact__ = '[email protected]'
def light_curve_fit(light_curve_df, gamma=np.logspace(-10, -5, num=20, base=10), minimum_score=0.5,
plots_save_path=None):
"""Automatically fit the best support vector machine regression (SVR) model for the input light curve.
Inputs:
light_curve_df [pd DataFrame]: A pandas DataFrame with a DatetimeIndex, and columns for irradiance and uncertainty.
Optional Inputs:
gamma [np.array]: Set this to an array of value(s), e.g., with np.logspace or np.linspace, to to use as the
tunable hyperparameter in the support vector regression fitting.
Note that the more elements in the array, the longer it will take to process the fits.
If a single value is provided, the validation curve plot can't and won't be produced,
but this is perfectly valid to do.
Default is np.logspace(-10, -5, num=20, base=10).
minimum_score [float]: Set this to the minimum explained variance score (0 - 1) acceptable for fits. If the
best fit score is < minimum_score, this function will return np.nan for light_curve_fit.
Default value is 0.5.
plots_save_path [str]: Set to a path in order to save the validation curve and best fit overplot on the data to disk.
Default is None, meaning no plots will be saved to disk.
Outputs:
light_curve_fit_df [pd DataFrame]: A pandas DataFrame with a DatetimeIndex, and columns for fitted irradiance and uncertainty.
best_fit_gamma [float]: The best found gamma hyper parameter for the SVR.
best_fit_score [float]: The best explained variance score.
Optional Outputs:
None
Example:
light_curve_fit_df, best_fit_gamma, best_fit_score = light_curve_fit(light_curve_df)
"""
if jedi_config.verbose:
jedi_config.logger.info("Running on event with light curve start time of {0}.".format(light_curve_df.index[0]))
# Pull data out of the DataFrame for compatibility formatting
X = metatimes_to_seconds_since_start(light_curve_df.index)
y = light_curve_df['irradiance'].values
# Make sure the dtype is float rather than object so numpy doesn't crash
if y.dtype == 'O':
y = y.astype(np.float64)
# Check for NaNs and issue warning that they are being removed from the dataset
if jedi_config.verbose:
if np.isnan(y).any():
jedi_config.logger.warning("There are NaN values in light curve. Dropping them.")
finite_irradiance_indices = np.isfinite(y)
X = X[finite_irradiance_indices]
X = X.reshape(len(X), 1) # Format to be compatible with validation_curve and SVR.fit()
uncertainty = light_curve_df.uncertainty[np.isfinite(y)]
y = y[finite_irradiance_indices]
if jedi_config.verbose:
jedi_config.logger.info("Fitting %s points." % len(y))
# Helper function for compatibility with validation_curve
def jpm_svr(gamma=1e-6, **kwargs):
return make_pipeline(SVR(kernel='rbf', C=1e3, gamma=gamma, **kwargs))
# Overwrite the default scorer (R^2) with explained variance score
evs = make_scorer(explained_variance_score)
# Split the data between training/testing 50/50 but across the whole time range rather than the default consecutive Kfolds
shuffle_split = ShuffleSplit(n_splits=20, train_size=0.5, test_size=0.5, random_state=None)
# Generate the validation curve -- test all them gammas!
# Parallel with n_jobs has absolutely no impact on processing time
train_score, val_score = validation_curve(jpm_svr(), X, y,
'svr__gamma',
gamma, cv=shuffle_split, scoring=evs)
if jedi_config.verbose:
jedi_config.logger.info("Validation curve complete.")
# Identify the best score
scores = np.median(val_score, axis=1)
best_fit_score = np.max(scores)
best_fit_gamma = gamma[np.argmax(scores)]
if jedi_config.verbose:
jedi_config.logger.info('Scores: ' + str(scores))
jedi_config.logger.info('Best score: ' + str(best_fit_score))
jedi_config.logger.info('Best fit gamma: ' + str(best_fit_gamma))
if plots_save_path and np.size(gamma) > 1:
plt.clf()
plt.style.use('jpm-transparent-light')
p1 = plt.plot(gamma, np.median(train_score, 1), label='training score')
p2 = plt.plot(gamma, np.median(val_score, 1), label='validation score')
ax = plt.axes()
plt.title("t$_0$ = " + datetimeindex_to_human(light_curve_df.index)[0])
ax.set_xscale('log')
plt.xlabel('gamma')
plt.ylabel('score')
plt.ylim(0, 1)
p3 = plt.axhline(y=minimum_score, linestyle='dashed', color=p2[0].get_color(), label='minimum score')
p4 = plt.axvline(x=best_fit_gamma, linestyle='dashed', color='black')
t1 = plt.text(best_fit_gamma, minimum_score - 0.05, 'best score = ' + latex_float(best_fit_score) + '\nbest gamma = ' + latex_float(best_fit_gamma),
ha='left', va='top')
plt.legend(loc='best')
filename = plots_save_path + 'Validation Curve t0 ' + datetimeindex_to_human(light_curve_df.index)[0] + '.png'
plt.savefig(filename)
if jedi_config.verbose:
jedi_config.logger.info("Validation curve saved to %s" % filename)
# Return np.nan if only got bad fits
if best_fit_score < minimum_score:
if jedi_config.verbose:
jedi_config.logger.warning("Uh oh. Best fit score {0:.2f} is < user-defined minimum score {1:.2f}".format(best_fit_score, minimum_score))
return np.nan, best_fit_gamma, best_fit_score
# Otherwise train and fit the best model
sample_weight = 1 / uncertainty
model = SVR(kernel='rbf', C=1e3, gamma=best_fit_gamma).fit(X, y, sample_weight)
y_fit = model.predict(X)
if jedi_config.verbose:
jedi_config.logger.info("Best model trained and fitted.")
if plots_save_path:
plt.clf()
plt.errorbar(X.ravel(), y, yerr=uncertainty, color='black', fmt='o', label='Input light curve', zorder=1)
plt.plot(X.ravel(), y_fit, linewidth=6, label='Fit', zorder=2)
plt.title("t$_0$ = " + datetimeindex_to_human(light_curve_df.index)[0])
plt.xlabel('time [seconds since start]')
plt.ylabel('irradiance [%]')
t1 = plt.text(0.03, 0.03,
'fit score = ' + latex_float(best_fit_score),
ha='left', va='bottom', transform=plt.gca().transAxes)
plt.legend(loc='best')
filename = plots_save_path + 'Fit t0 ' + datetimeindex_to_human(light_curve_df.index)[0] + '.png'
plt.savefig(filename)
if jedi_config.verbose:
jedi_config.logger.info("Fitted curve saved to %s" % filename)
# TODO: Get uncertainty of fit at each point... if that's even possible
# Placeholder for now just so that the function can complete: output uncertainty = input uncertainty
fit_uncertainty = uncertainty
# Construct a pandas DataFrame with DatetimeIndex, y_fit, and fit_uncertainty
light_curve_fit_df = pd.DataFrame({'irradiance': y_fit,
'uncertainty': fit_uncertainty})
light_curve_fit_df.index = light_curve_df.index[finite_irradiance_indices]
if jedi_config.verbose:
jedi_config.logger.info("Created output DataFrame")
return light_curve_fit_df, best_fit_gamma, best_fit_score