Implements data transformations in TSA module (#2121)

* Adds scikit-learn data transformations to TSA module * adds the test XML file * adds masking transforms, facilitates common FunctionTransformers, and adds the ability to use custom transformers from a separate python file * refactored to use statsmodels OLS for better handling of missing values * reworks handling data with missing values * removes excess whitespace * updates documentation to include TSA.Transformer class * regolded due to change in regression solver used in TSA.Fourier * adds docstrings * reworks TimeSeriesAnalyzer inheritance structure, adds transformer base classes, makes transformer XML tags explicit * regold to reflect change in transformers XML * WIP unit testing for TSA transformers * simplifies inheritance requirements for ARMA and RWD * minor fixes and additions * minor fix in generation step * reverts RWD to being a characterizer only * TSA README * makes scikit-learn wrappers abstract classes and makes it so abstract classes are not included in User Manual * updates documentation for TSA module * makes docstrings compliant with RAVEN standard * reduces the number of time steps for ARMA and LogARMA tests and regolds * makes TSA.ARMA a transformer * more docstring fixes * removes algorithms Venn diagram * adds updated algorithm Venn diagram * replaces the algorithms Venn diagram with a table in the README file * missing comma in array index * fix for masking in ARMA * regold due to new solver used in Fourier fit * changes gold file for Basic test for mac
idaholab · Jul 10, 2023 · 137d53f · 137d53f
1 parent 9f4cdc5
commit 137d53f
Show file tree

Hide file tree

Showing 49 changed files with 5,621 additions and 5,314 deletions.
diff --git a/doc/user_manual/generated/internalRom.tex b/doc/user_manual/generated/internalRom.tex
diff --git a/ravenframework/Models/PostProcessors/TSACharacterizer.py b/ravenframework/Models/PostProcessors/TSACharacterizer.py
@@ -56,6 +56,8 @@ def _handleInput(self, inp):
     """
     super()._handleInput(inp)
     self.readTSAInput(inp)
+    if not self.canCharacterize():
+      self.raiseAnError(IOError, 'TSACharacterizer requires at least one TSA algorithm that can characterize!')
 
   def initialize(self, runInfo, inputs, initDict=None):
     """

diff --git a/ravenframework/TSA/ARMA.py b/ravenframework/TSA/ARMA.py
@@ -25,11 +25,11 @@
 statsmodels = importerUtils.importModuleLazy('statsmodels', globals())
 
 from .. import Distributions
-from .TimeSeriesAnalyzer import TimeSeriesGenerator, TimeSeriesCharacterizer
+from .TimeSeriesAnalyzer import TimeSeriesGenerator, TimeSeriesCharacterizer, TimeSeriesTransformer
 
 
 # utility methods
-class ARMA(TimeSeriesGenerator, TimeSeriesCharacterizer):
+class ARMA(TimeSeriesGenerator, TimeSeriesCharacterizer, TimeSeriesTransformer):
   r"""
     AutoRegressive Moving Average time series analyzer algorithm
   """
@@ -39,12 +39,15 @@ class ARMA(TimeSeriesGenerator, TimeSeriesCharacterizer):
                'ma',
                'sigma2',
                'const']
+  _acceptsMissingValues = True
+  _isStochastic = True
 
   @classmethod
   def getInputSpecification(cls):
     """
       Method to get a reference to a class that specifies the input data for
       class cls.
+      @ In, None
       @ Out, inputSpecification, InputData.ParameterInput, class to use for
         specifying input of cls.
     """
@@ -94,7 +97,7 @@ def __init__(self, *args, **kwargs):
   def handleInput(self, spec):
     """
       Reads user inputs into this object.
-      @ In, inp, InputData.InputParams, input specifications
+      @ In, spec, InputData.InputParams, input specifications
       @ Out, settings, dict, initialization settings for this algorithm
     """
     settings = super().handleInput(spec)
@@ -119,7 +122,7 @@ def setDefaults(self, settings):
       settings['reduce_memory'] = False
     return settings
 
-  def characterize(self, signal, pivot, targets, settings):
+  def fit(self, signal, pivot, targets, settings):
     """
       Determines the charactistics of the signal based on this algorithm.
       @ In, signal, np.ndarray, time series with dims [time, target]
@@ -143,13 +146,15 @@ def characterize(self, signal, pivot, targets, settings):
     for tg, target in enumerate(targets):
       params[target] = {}
       history = signal[:, tg]
+      mask = ~np.isnan(history)
       if settings.get('gaussianize', True):
         # Transform data to obatain normal distrbuted series. See
         # J.M.Morales, R.Minguez, A.J.Conejo "A methodology to generate statistically dependent wind speed scenarios,"
         # Applied Energy, 87(2010) 843-855
         # -> then train independent ARMAs
-        params[target]['cdf'] = mathUtils.characterizeCDF(history, binOps=2, minBins=self._minBins)
-        normed = mathUtils.gaussianize(history, params[target]['cdf'])
+        params[target]['cdf'] = mathUtils.characterizeCDF(history[mask], binOps=2, minBins=self._minBins)
+        normed = history
+        normed[mask] = mathUtils.gaussianize(history[mask], params[target]['cdf'])
       else:
         normed = history
       # TODO correlation (VARMA) as well as singular -> maybe should be independent TSA algo?
@@ -190,6 +195,44 @@ def characterize(self, signal, pivot, targets, settings):
         params[target]['arma']['results'] = res
     return params
 
+  def getResidual(self, initial, params, pivot, settings):
+    """
+      Removes trained signal from data and find residual
+      @ In, initial, np.array, original signal shaped [pivotValues, targets], targets MUST be in
+                               same order as self.target
+      @ In, params, dict, training parameters as from self.characterize
+      @ In, pivot, np.array, time-like array values
+      @ In, settings, dict, additional settings specific to algorithm
+      @ Out, residual, np.array, reduced signal shaped [pivotValues, targets]
+    """
+    # The residual for an ARMA model can be useful, and we want to return that if it's available.
+    # If the 'reduce_memory' option was used, then the ARIMAResults object from fitting the model
+    # where that residual is stored is not available. In that case, we simply return the original.
+    if settings['reduce_memory']:
+      return initial
+
+    residual = initial.copy()
+    for t, (target, data) in enumerate(params.items()):
+      residual[:, t] = data['arma']['results'].resid
+
+    return residual
+
+  def getComposite(self, initial, params, pivot, settings):
+    """
+      Combines two component signals to form a composite signal. This is essentially the inverse
+      operation of the getResidual method.
+      @ In, initial, np.array, original signal shaped [pivotValues, targets], targets MUST be in
+                               same order as self.target
+      @ In, params, dict, training parameters as from self.characterize
+      @ In, pivot, np.array, time-like array values
+      @ In, settings, dict, additional settings specific to algorithm
+      @ Out, composite, np.array, resulting composite signal
+    """
+    # Add a generated ARMA signal to the initial signal.
+    synthetic = self.generate(params, pivot, settings)
+    composite = initial + synthetic
+    return composite
+
   def getParamNames(self, settings):
     """
       Return list of expected variable names based on the parameters
@@ -224,30 +267,6 @@ def getParamsAsVars(self, params):
         rlz[f'{base}__MA__{q}'] = ma
     return rlz
 
-  def getResidual(self, initial, params, pivot, settings):
-    """
-      @ In, initial, np.array, original signal shaped [pivotValues, targets], targets MUST be in
-                               same order as self.target
-      @ In, params, dict, training parameters as from self.characterize
-      @ In, pivot, np.array, time-like array values
-      @ In, settings, dict, additional settings specific to algorithm
-      @ Out, residual, np.array, reduced signal shaped [pivotValues, targets]
-    """
-    raise NotImplementedError('ARMA cannot provide a residual yet; it must be the last TSA used!')
-    # FIXME how to get a useful residual?
-    # -> the "residual" of the ARMA is ideally white noise, not a 0 vector, even if perfectly fit
-    #    so what does it mean to provide the residual from the ARMA training?
-    # in order to use "predict" (in-sample forecasting) can't be in low-memory mode
-    # if settings['reduce_memory']:
-    #   raise RuntimeError('Cannot get residual of ARMA if in reduced memory mode!')
-    # for tg, (target, data) in enumerate(params.items()):
-    #   armaData = data['arma']
-    #   modelParams = np.hstack([[armaData.get('const', 0)],
-    #                            armaData['ar'],
-    #                            armaData['ma'],
-    #                            [armaData.get('var', 1)]])
-    #   new = armaData['model'].predict(modelParams)
-
   def generate(self, params, pivot, settings):
     """
       Generates a synthetic history from fitted parameters.

diff --git a/ravenframework/TSA/Factory.py b/ravenframework/TSA/Factory.py
@@ -24,11 +24,24 @@
 from .Wavelet import Wavelet
 from .PolynomialRegression import PolynomialRegression
 from .RWD import RWD
+from .Transformers import ZeroFilter, LogTransformer, ArcsinhTransformer, TanhTransformer, SigmoidTransformer, \
+                          OutTruncation, MaxAbsScaler, MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer
 
 factory = EntityFactory('TimeSeriesAnalyzer')
 # TODO map lower case to upper case, because of silly ROM namespace problems
 aliases = {'Fourier': 'fourier',
            'ARMA': 'arma',
            'RWD': 'rwd',
-           'Wavelet': 'wavelet'}
+           'Wavelet': 'wavelet',
+           'ZeroFilter': 'zerofilter',
+           'OutTruncation': 'outtruncation',
+           'LogTransformer': 'logtransformer',
+           'ArcsinhTransformer': 'arcsinhtransformer',
+           'TanhTransformer': 'tanhtransformer',
+           'SigmoidTransformer': 'sigmoidtransformer',
+           'MaxAbsScaler': 'maxabsscaler',
+           'MinMaxScaler': 'minmaxscaler',
+           'StandardScaler': 'standardscaler',
+           'RobustScaler': 'robustscaler',
+           'QuantileTransformer': 'quantiletransformer'}
 factory.registerAllSubtypes(TimeSeriesAnalyzer, alias=aliases)
diff --git a/ravenframework/TSA/Fourier.py b/ravenframework/TSA/Fourier.py
@@ -19,13 +19,14 @@
 import collections
 import numpy as np
 import sklearn.linear_model
+import statsmodels.api as sm
 
 from ..utils import InputData, InputTypes, randomUtils, xmlUtils, mathUtils, utils
-from .TimeSeriesAnalyzer import TimeSeriesGenerator, TimeSeriesCharacterizer
+from .TimeSeriesAnalyzer import TimeSeriesTransformer, TimeSeriesCharacterizer, TimeSeriesGenerator
 
 
 # utility methods
-class Fourier(TimeSeriesGenerator, TimeSeriesCharacterizer):
+class Fourier(TimeSeriesTransformer, TimeSeriesCharacterizer, TimeSeriesGenerator):
   """
     Perform Fourier analysis; note this is not Fast Fourier, where all Fourier modes are used to fit a
     signal. Instead, detect the presence of specifically-requested Fourier bases.
@@ -35,12 +36,14 @@ class Fourier(TimeSeriesGenerator, TimeSeriesCharacterizer):
   _features = ['sin',       # amplitude of sine coefficients
                'cos',       # amplitude of cosine coefficients
                'intercept'] # mean of signal
+  _acceptsMissingValues = True
 
   @classmethod
   def getInputSpecification(cls):
     """
       Method to get a reference to a class that specifies the input data for
       class cls.
+      @ In, None
       @ Out, inputSpecification, InputData.ParameterInput, class to use for
         specifying input of cls.
     """
@@ -73,14 +76,14 @@ def __init__(self, *args, **kwargs):
   def handleInput(self, spec):
     """
       Reads user inputs into this object.
-      @ In, inp, InputData.InputParams, input specifications
+      @ In, spec, InputData.InputParams, input specifications
       @ Out, settings, dict, initialization settings for this algorithm
     """
     settings = super().handleInput(spec)
     settings['periods'] = spec.findFirst('periods').value
     return settings
 
-  def characterize(self, signal, pivot, targets, settings, simultFit=True):
+  def fit(self, signal, pivot, targets, settings, simultFit=True):
     """
       Determines the charactistics of the signal based on this algorithm.
       @ In, signal, np.ndarray, time series with dims [time, target]
@@ -105,13 +108,11 @@ def characterize(self, signal, pivot, targets, settings, simultFit=True):
     params = {}
     for tg, target in enumerate(targets):
       history = signal[:, tg] # TODO need to keep in sync with SyntheticSignal ROM!
+      mask = ~np.isnan(history)  # some values might be NaN due to masking
       if simultFit and cond < 30:
         print(f'Fourier fitting condition number is {cond:1.1e} for "{target}". ',
                         ' Calculating all Fourier coefficients at once.')
-        fourierEngine = sklearn.linear_model.LinearRegression(normalize=False)
-        fourierEngine.fit(fourierSignals, history)
-        intercept = fourierEngine.intercept_
-        coeffs = fourierEngine.coef_
+        intercept, coeffs = self._fitSignal(fourierSignals, history)
       else:
         print(f'Fourier fitting condition number is {cond:1.1e} for "{target}"! ',
                         'Calculating iteratively instead of all at once.')
@@ -124,10 +125,7 @@ def characterize(self, signal, pivot, targets, settings, simultFit=True):
         coeffs = np.zeros(F2) # amplitude coeffs for sine, cosine
         for fn in range(F2):
           fSignal = fourierSignals[:,fn] # Fourier base signal for this waveform
-          eng = sklearn.linear_model.LinearRegression(normalize=False) # regressor
-          eng.fit(fSignal.reshape(H,1), signalToFit)
-          thisIntercept = eng.intercept_
-          thisCoeff = eng.coef_[0]
+          thisIntercept, thisCoeff = self._fitSignal(fSignal.reshape(H,1), signalToFit)
           coeffs[fn] = thisCoeff
           intercept += thisIntercept
           # remove this signal from the signal to fit
@@ -160,6 +158,63 @@ def characterize(self, signal, pivot, targets, settings, simultFit=True):
       # END for target in targets
     return params
 
+  def _fitSignal(self, baseSignal, signalToFit):
+    """
+      Utility for calculating least-squares approximation of Fourier coefficients
+      @ In, baseSignal, np.ndarray, signal composes of Fourier base(s)
+      @ In, signalToFit, np.ndarray, signal being detrended
+      @ Out, intercept, np.ndarray, intercept for the base Fourier signal
+      @ Out, coeffs, np.ndarray, coefficients of the various Fourier components
+    """
+    fitResult = sm.OLS(signalToFit, sm.add_constant(baseSignal), missing='drop').fit()
+    intercept = fitResult.params[0]
+    coeffs = fitResult.params[1:]
+    return intercept, coeffs
+
+  def getResidual(self, initial, params, pivot, settings):
+    """
+      Removes fitted Fourier signal from data
+      @ In, initial, np.array, original signal shaped [pivotValues, targets], targets MUST be in
+                               same order as self.target
+      @ In, params, dict, training parameters as from self.characterize
+      @ In, pivot, np.array, time-like array values
+      @ In, settings, dict, additional settings specific to algorithm
+      @ Out, residual, np.array, reduced signal shaped [pivotValues, targets]
+    """
+    synthetic = self.generate(params, pivot)
+    residual = initial - synthetic
+    return residual
+
+  def getComposite(self, initial, params, pivot, settings):
+    """
+      Adds the Fourier signal back into initial signal
+      @ In, initial, np.array, original signal shaped [pivotValues, targets], targets MUST be in
+                               same order as self.target
+      @ In, params, dict, training parameters as from self.characterize
+      @ In, pivot, np.array, time-like array values
+      @ In, settings, dict, additional settings specific to algorithm
+      @ Out, composite, np.array, resulting composite signal
+    """
+    synthetic = self.generate(params, pivot)
+    residual = initial + synthetic
+    return residual
+
+  def generate(self, params, pivot):
+    """
+      Generates a synthetic history from fitted parameters.
+      @ In, params, dict, characterization such as otained from self.characterize()
+      @ In, pivot, np.array(float), pivot parameter values
+      @ Out, synthetic, np.array(float), synthetic ARMA signal
+    """
+    synthetic = np.zeros((len(pivot), len(params)))
+    for t, (target, data) in enumerate(params.items()):
+      synthetic[:, t] += data['intercept']
+      for period, coeffs in data['coeffs'].items():
+        C = coeffs['amplitude']
+        s = coeffs['phase']
+        synthetic[:, t] += mathUtils.evalFourier(period, C, s, pivot)
+    return synthetic
+
   def getParamNames(self, settings):
     """
       Return list of expected variable names based on the parameters
@@ -192,24 +247,6 @@ def getParamsAsVars(self, params):
           rlz[f'{baseWave}__{stat}'] = value
     return rlz
 
-
-  def generate(self, params, pivot, settings):
-    """
-      Generates a synthetic history from fitted parameters.
-      @ In, params, dict, characterization such as otained from self.characterize()
-      @ In, pivot, np.array(float), pivot parameter values
-      @ In, settings, dict, additional settings specific to algorithm
-      @ Out, synthetic, np.array(float), synthetic ARMA signal
-    """
-    synthetic = np.zeros((len(pivot), len(params)))
-    for t, (target, data) in enumerate(params.items()):
-      synthetic[:, t] += data['intercept']
-      for period, coeffs in data['coeffs'].items():
-        C = coeffs['amplitude']
-        s = coeffs['phase']
-        synthetic[:, t] += mathUtils.evalFourier(period, C, s, pivot)
-    return synthetic
-
   def writeXML(self, writeTo, params):
     """
       Allows the engine to put whatever it wants into an XML to print to file.