Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support categorical data for pyvinecopulib #13

Merged
merged 4 commits into from
Nov 12, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 20 additions & 13 deletions synthia/generators/copula.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def _log(self, msg: str) -> None:

def fit(self, data: Union[np.ndarray, xr.DataArray, xr.Dataset],
copula: Copula,
is_discrete: Optional[Union[bool, Dict[int, bool], Dict[str, bool]]]=None,
types: Optional[Union[str, Dict[int, str], Dict[str, str]]]=None,
parameterize_by: Optional[Union[Parameterizer, Dict[int, Parameterizer], Dict[str, Parameterizer]]]=None):
"""Fit the marginal distributions and copula model for all features.

Expand All @@ -94,7 +94,13 @@ def fit(self, data: Union[np.ndarray, xr.DataArray, xr.Dataset],

copula: The underlying copula to use, for example a GaussianCopula object.

is_discrete : indicates whether features are discrete or continuous
types (str or mapping, optional): Indicates whether features
are categorical ('cat'), discrete ('disc'), or continuous ('cont').
The following forms are valid:

- str
- per-feature mapping {feature idx: str} -- ndarray/DataArray only
- per-variable mapping {var name: str} -- Dataset only

parameterize_by (Parameterizer or mapping, optional): The
following forms are valid:
Expand All @@ -107,22 +113,23 @@ def fit(self, data: Union[np.ndarray, xr.DataArray, xr.Dataset],
None
"""

data, self.data_info = to_feature_array(data)
data, self.data_info = to_feature_array(data, types)

self.dtype = data.dtype
self.n_features = data.shape[1]

self.is_discrete = per_feature(is_discrete, self.data_info)
if any(self.is_discrete) and not isinstance(copula, VineCopula):
raise TypeError('Discrete samples can only be modelled in vine copulas')
self.types = per_feature(types, self.data_info, default='cont')
is_discrete = [t != 'cont' for t in self.types]
if any(is_discrete) and not isinstance(copula, VineCopula):
raise TypeError('Discrete/categorical data can only be modelled in vine copulas')

self._log('computing rank data')
rank_standardized = compute_rank_standardized(data, self.is_discrete)
rank_standardized = compute_rank_standardized(data, is_discrete)

self._log('fitting copula')
if any(self.is_discrete):
if any(is_discrete):
assert isinstance(copula, VineCopula)
copula.fit_with_discrete(rank_standardized, self.is_discrete)
copula.fit_with_discrete(rank_standardized, is_discrete)
else:
copula.fit(rank_standardized)

Expand Down Expand Up @@ -188,10 +195,10 @@ def generate(self, n_samples: int,
else:
feature_samples = self.parameterizers[i].generate(n_samples)

if self.is_discrete[i]:
interp = 'nearest'
else:
if self.types[i] == 'cont':
interp = 'linear'
else:
interp = 'nearest'

samples[:,i] = np.quantile(feature_samples, q=u[:, i], interpolation=interp)

Expand Down Expand Up @@ -241,7 +248,7 @@ def compute_rank_standardized(data: xr.DataArray, is_discrete: List[bool]) -> np
feature_rank = feature.rank(feature.dims[0]).values
ranks.append(feature_rank.reshape(-1, 1))

feature_rank_min = rankdata(data[:, is_discrete], method='min') - 1
feature_rank_min = rankdata(data[:, is_discrete], axis=1, method='min') - 1
ranks.append(feature_rank_min)

rank = np.concatenate(ranks, axis=1)
Expand Down
104 changes: 92 additions & 12 deletions synthia/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,25 +100,90 @@ def to_unstacked_dataset(arr: np.ndarray, stack_info: StackInfo) -> xr.Dataset:
ds = xr.Dataset(unstacked)
return ds

def to_feature_array(data: Union[np.ndarray, xr.DataArray, xr.Dataset],
                     types: Optional[Union[str, Dict[str,str], Dict[int,str]]]=None) -> Tuple[xr.DataArray, dict]:
    """Normalize input into a 2D (sample, feature) DataArray.

    Datasets are stacked into a single array; ndarray/DataArray inputs are
    validated as 2D. Features marked 'cat' in `types` are replaced by
    one-hot encoded column blocks. Returns the feature array together with
    a `data_info` dict holding everything needed to reverse the transform.
    """
    data_info: dict = {}
    if isinstance(data, xr.Dataset):
        stacked, stack_info = to_stacked_array(data)
        data_info['stack_info'] = stack_info
        data = stacked
    else:
        if isinstance(data, xr.DataArray):
            # Remember metadata so the original DataArray can be rebuilt.
            data_info['da_info'] = dict(
                coords=data.coords,
                dims=data.dims,
                name=data.name,
                attrs=data.attrs
            )
        data = xr.DataArray(data)
        assert data.ndim == 2, f'Input array must be 2D, given: {data.ndim}D'
        data_info['n_features'] = data.shape[1]

    feature_types = per_feature(types, data_info)
    if not any(t == 'cat' for t in feature_types):
        # Nothing categorical: no encoding step required.
        return data, data_info

    # Replace each categorical feature with its block of one-hot columns.
    categories: Dict[int, np.ndarray] = {}
    columns = []
    for idx, feature_type in enumerate(feature_types):
        if feature_type == 'cat':
            encoded, categories[idx] = categories_to_one_hot(data[:, idx].values)
            columns.append(encoded)
        else:
            columns.append(data[:, idx:idx+1])
    data = xr.DataArray(np.concatenate(columns, axis=1), dims=data.dims)
    data_info['categories'] = categories

    return data, data_info

def categories_to_one_hot(data: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """One-hot encode a 1D array of categorical values.

    Args:
        data: 1D array of category values (any dtype supported by np.unique).

    Returns:
        Tuple of (one_hot, categories) where one_hot has shape
        (len(data), n_categories) and categories is the sorted array of
        distinct values; column j of one_hot corresponds to categories[j].
    """
    assert data.ndim == 1
    # Determine all categories occurring in the data (np.unique sorts them).
    categories = np.unique(data)
    assert categories.ndim == 1
    n_categories = len(categories)
    # Pick the smallest unsigned integer dtype able to hold the indices.
    # Bug fix: the loop previously did not stop at the first fitting dtype,
    # so indices_dtype always ended up as np.uint64.
    indices_dtype = None
    for dtype in [np.uint8, np.uint16, np.uint32, np.uint64]:
        if n_categories < np.iinfo(dtype).max:
            indices_dtype = dtype
            break
    assert indices_dtype
    # categories is sorted, so searchsorted maps each value to its index.
    indices = np.searchsorted(categories, data).astype(indices_dtype)
    # Build one-hot rows by indexing the identity matrix.
    one_hot = np.eye(n_categories)[indices]
    assert one_hot.shape == (len(data), n_categories)
    return one_hot, categories

def one_hot_to_categories(data: np.ndarray, categories: np.ndarray) -> np.ndarray:
    """Map one-hot encoded rows back to their original category values."""
    assert isinstance(data, np.ndarray)
    assert data.ndim == 2
    assert categories.ndim == 1
    assert data.shape[1] == len(categories)
    # The hot (maximal) column of each row selects the category.
    winning_columns = data.argmax(axis=1)
    result = categories[winning_columns]
    assert result.shape == (data.shape[0],)
    return result

def from_feature_array(data: np.ndarray, data_info: dict) -> Union[np.ndarray, xr.DataArray, xr.Dataset]:
categories = data_info.get('categories')
if categories:
# Transform one-hot encoded categories into original values.
features = []
i_onehot = 0
i_normal = 0
while i_onehot < data.shape[1]:
if i_normal in categories:
n_categories = len(categories[i_normal])
feature = one_hot_to_categories(data[:, i_onehot:i_onehot+n_categories], categories[i_normal])
features.append(feature.reshape(-1, 1))
i_onehot += n_categories
else:
features.append(data[:, i_onehot:i_onehot+1])
i_onehot += 1
i_normal += 1
data = np.concatenate(features, axis=1)

stack_info = data_info.get('stack_info')
if stack_info:
return to_unstacked_dataset(data, stack_info)
Expand All @@ -130,39 +195,54 @@ def from_feature_array(data: np.ndarray, data_info: dict) -> Union[np.ndarray, x
def prod(iterable) -> int:
    """Return the product of all items in *iterable* (1 for an empty one)."""
    result = 1
    for factor in iterable:
        result = result * factor
    return result

def per_feature(val: Optional[Union[Any, Dict[str,Any], Dict[int,Any]]], data_info: dict, default=None) -> List[Any]:
    """Expand a scalar or mapping into one value per feature column.

    Args:
        val: None, a single value applied to all features, a mapping
            {feature idx: value} (ndarray/DataArray input), or a mapping
            {var name: value} (Dataset input).
        data_info: Metadata produced by to_feature_array().
        default: Value used for features not covered by `val`.

    Returns:
        List with one entry per feature column. Entries for categorical
        features are repeated once per one-hot column.
    """
    n_features = data_info.get('n_features')
    stack_info = data_info.get('stack_info')

    if n_features is None:
        # Dataset input: derive the column count from the stacked shapes.
        n_features_ = sum(prod(info.shape) for info in stack_info)
    else:
        n_features_ = n_features

    if val is None:
        out = [default] * n_features_
    elif isinstance(val, dict):
        if stack_info:
            # Keyed by variable name; repeat per flattened feature column.
            # (Removed a dead `current` accumulator that was never read.)
            feature_values = []
            for info in stack_info:
                feature_count = prod(info.shape)
                p = val.get(info.name, default)
                feature_values.extend(copy.copy(p) for _ in range(feature_count))
        else:
            # Keyed by feature index.
            feature_values = [copy.copy(val.get(i, default)) for i in range(n_features_)] # type: ignore
        out = feature_values
    else:
        # Single value: copy per feature to avoid shared mutable state.
        out = [copy.copy(val) for _ in range(n_features_)]

    # Repeat values for categorical features since they are one-hot encoded
    categories = data_info.get('categories')
    if categories:
        out_ = []
        for i in range(n_features_):
            if i in categories:
                out_ += [out[i]] * len(categories[i])
            else:
                out_ += [out[i]]
        out = out_

    return out

def load_dataset(name='SAF-Synthetic') -> xr.Dataset:
""" Retun a dataframe of 25 000 syntheic temperature profiles
from the SAF dataset
""" Return a dataset of 25 000 synthetic temperature profiles
from the SAF dataset.
"""
from urllib.request import urlopen
import pickle

if name != 'SAF-Synthetic':
raise RuntimeError('Only SAF-Synthetic is currerlty supported')
raise RuntimeError('Only SAF-Synthetic is currently supported')

url = 'https://raw.githubusercontent.com/dmey/synthia/data/generator_saf_temperature_fpca_6.pkl'
generator = pickle.load(urlopen(url))
Expand Down
25 changes: 25 additions & 0 deletions tests/test_generators.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,31 @@ def test_vine_copula_feature_generation():

assert generator.generate(10, seed=42).equals(generator.generate(10, seed=42))

@pytest.mark.skipif(pv == None, reason="Skip test if pyvinecopulib not installed")
def test_vine_copula_with_categorical():
    """Fit a vine copula on a Dataset with one categorical variable and
    check that generated samples keep the original variable shapes."""
    feature_matrix = np.array([[0, 1, 2], [3, 4, 5]])
    category_values = np.array([6, 7])
    input_data = xr.Dataset({
        'a': (('sample', 'foo'), feature_matrix),
        'b': (('sample'), category_values)
    })

    generator = syn.CopulaDataGenerator(verbose=True)
    controls = pv.FitControlsVinecop(family_set=[pv.gaussian], trunc_lvl=1, select_trunc_lvl=False, show_trace=True)
    generator.fit(input_data, types={ 'b': 'cat' }, copula=syn.VineCopula(controls=controls))

    # The fitted generator must survive a pickle round trip.
    generator = pickle.loads(pickle.dumps(generator))

    sample_count = 50
    synthetic = generator.generate(n_samples=sample_count)
    assert synthetic['a'].shape == (sample_count, 3)
    assert synthetic['b'].shape == (sample_count,)

    # The same seed must reproduce the same samples.
    assert generator.generate(10, seed=42).equals(generator.generate(10, seed=42))

def test_copula_ndarray_feature_generation():
n_samples = 200
n_features = 100
Expand Down
80 changes: 79 additions & 1 deletion tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,12 @@
import numpy as np
import xarray as xr

from synthia.util import to_stacked_array, to_unstacked_dataset
from synthia.util import (
to_stacked_array, to_unstacked_dataset,
categories_to_one_hot, one_hot_to_categories,
to_feature_array, from_feature_array,
per_feature
)

def test_stacking_roundtrip():
a = np.arange(12).reshape(2,6)
Expand All @@ -18,3 +23,76 @@ def test_stacking_roundtrip():
arr, info = to_stacked_array(ds)
ds_u = to_unstacked_dataset(arr.values, info)
assert ds.identical(ds_u)

def test_one_hot_roundtrip():
    """Encoding then decoding must reproduce the original values exactly."""
    for original in (np.array([1, 4, 2, 1, 5]), np.array(['foo', 'bar', 'foo'])):
        encoded, cats = categories_to_one_hot(original)
        decoded = one_hot_to_categories(encoded, cats)
        np.testing.assert_equal(decoded, original)

def test_feature_array_roundtrip():
    """to_feature_array followed by from_feature_array must be lossless."""
    ds = xr.Dataset({
        'a': (('sample', 'foo'), np.array([[0, 1, 2], [3, 4, 5]])),
        'b': (('sample'), np.array([6, 7])),
        'c': (('sample'), np.array([8, 9]))
    })

    arr, info = to_feature_array(ds, { 'a': 'cat' })
    restored = from_feature_array(arr.values, info)
    assert isinstance(restored, xr.Dataset)
    assert ds.identical(restored)

def test_to_feature_array():
    """Categorical variables are replaced by one-hot columns in order."""
    a = np.array([[0, 1, 2], [3, 4, 5]])
    b = np.array([6, 7])
    c = np.array([8, 9])
    ds = xr.Dataset({
        'a': (('sample', 'foo'), a),
        'b': (('sample'), b),
        'c': (('sample'), c)
    })

    # 'c' one-hot encoded: its two distinct values become two trailing columns.
    arr, _ = to_feature_array(ds, { 'c': 'cat' })
    expected = np.array([
        [0, 1, 2, 6, 1, 0],
        [3, 4, 5, 7, 0, 1],
    ])
    np.testing.assert_equal(expected, arr)

    # 'a' one-hot encoded: each of its three columns expands to two columns.
    arr, _ = to_feature_array(ds, { 'a': 'cat' })
    expected = np.array([
        [1, 0, 1, 0, 1, 0, 6, 8],
        [0, 1, 0, 1, 0, 1, 7, 9],
    ])
    np.testing.assert_equal(expected, arr)

def test_per_feature():
    """per_feature expands scalars and per-variable mappings across columns."""
    ds = xr.Dataset({
        'a': (('sample', 'foo'), np.array([[0, 1, 2], [3, 4, 5]])),
        'b': (('sample'), np.array([6, 7]))
    })

    _, data_info = to_feature_array(ds)
    # A scalar is broadcast to every feature column.
    assert per_feature(True, data_info) == [True, True, True, True]
    # Per-variable values are repeated per flattened column.
    assert per_feature({'a': 1, 'b': 2}, data_info) == [1, 1, 1, 2]
    # Variables missing from the mapping fall back to the default.
    assert per_feature({'a': True}, data_info, default=False) == [True, True, True, False]

    # With 'b' categorical, its entry repeats once per one-hot column.
    _, data_info = to_feature_array(ds, { 'b': 'cat' })
    assert per_feature({'a': 'foo', 'b': 'bar'}, data_info) == ['foo', 'foo', 'foo', 'bar', 'bar']