Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support categorical data for pyvinecopulib #13

Merged
merged 4 commits into from
Nov 12, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 20 additions & 13 deletions synthia/generators/copula.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def _log(self, msg: str) -> None:

def fit(self, data: Union[np.ndarray, xr.DataArray, xr.Dataset],
copula: Copula,
is_discrete: Optional[Union[bool, Dict[int, bool], Dict[str, bool]]]=None,
types: Optional[Union[str, Dict[int, str], Dict[str, str]]]=None,
parameterize_by: Optional[Union[Parameterizer, Dict[int, Parameterizer], Dict[str, Parameterizer]]]=None):
"""Fit the marginal distributions and copula model for all features.

Expand All @@ -94,7 +94,13 @@ def fit(self, data: Union[np.ndarray, xr.DataArray, xr.Dataset],

copula: The underlying copula to use, for example a GaussianCopula object.

is_discrete : indicates whether features are discrete or continuous
types (str or mapping, optional): Indicates whether features
are categorical ('cat'), discrete ('disc'), or continuous ('cont').
The following forms are valid:

- str
- per-feature mapping {feature idx: str} -- ndarray/DataArray only
- per-variable mapping {var name: str} -- Dataset only

parameterize_by (Parameterizer or mapping, optional): The
following forms are valid:
Expand All @@ -107,22 +113,23 @@ def fit(self, data: Union[np.ndarray, xr.DataArray, xr.Dataset],
None
"""

data, self.data_info = to_feature_array(data)
data, self.data_info = to_feature_array(data, types)

self.dtype = data.dtype
self.n_features = data.shape[1]

self.is_discrete = per_feature(is_discrete, self.data_info)
if any(self.is_discrete) and not isinstance(copula, VineCopula):
raise TypeError('Discrete samples can only be modelled in vine copulas')
self.types = per_feature(types, self.data_info, default='cont')
is_discrete = [t != 'cont' for t in self.types]
if any(is_discrete) and not isinstance(copula, VineCopula):
raise TypeError('Discrete/categorical data can only be modelled in vine copulas')

self._log('computing rank data')
rank_standardized = compute_rank_standardized(data, self.is_discrete)
rank_standardized = compute_rank_standardized(data, is_discrete)

self._log('fitting copula')
if any(self.is_discrete):
if any(is_discrete):
assert isinstance(copula, VineCopula)
copula.fit_with_discrete(rank_standardized, self.is_discrete)
copula.fit_with_discrete(rank_standardized, is_discrete)
else:
copula.fit(rank_standardized)

Expand Down Expand Up @@ -188,10 +195,10 @@ def generate(self, n_samples: int,
else:
feature_samples = self.parameterizers[i].generate(n_samples)

if self.is_discrete[i]:
interp = 'nearest'
else:
if self.types[i] == 'cont':
interp = 'linear'
else:
interp = 'nearest'

samples[:,i] = np.quantile(feature_samples, q=u[:, i], interpolation=interp)

Expand Down Expand Up @@ -241,7 +248,7 @@ def compute_rank_standardized(data: xr.DataArray, is_discrete: List[bool]) -> np
feature_rank = feature.rank(feature.dims[0]).values
ranks.append(feature_rank.reshape(-1, 1))

feature_rank_min = rankdata(data[:, is_discrete], method='min') - 1
feature_rank_min = rankdata(data[:, is_discrete], axis=1, method='min') - 1
ranks.append(feature_rank_min)

rank = np.concatenate(ranks, axis=1)
Expand Down
104 changes: 92 additions & 12 deletions synthia/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,25 +100,90 @@ def to_unstacked_dataset(arr: np.ndarray, stack_info: StackInfo) -> xr.Dataset:
ds = xr.Dataset(unstacked)
return ds

def to_feature_array(data: Union[np.ndarray, xr.DataArray, xr.Dataset],
                     types: Optional[Union[str, Dict[str,str], Dict[int,str]]]=None) -> Tuple[xr.DataArray, dict]:
    """Normalize input into a 2D (sample, feature) DataArray.

    Datasets are stacked into a single array; ndarray/DataArray inputs are
    validated as 2D. Features marked 'cat' in `types` are replaced by
    one-hot encoded column blocks. Returns the feature array together with
    a `data_info` dict holding everything needed to reverse the transform.
    """
    data_info: dict = {}
    if isinstance(data, xr.Dataset):
        stacked, stack_info = to_stacked_array(data)
        data_info['stack_info'] = stack_info
        data = stacked
    else:
        if isinstance(data, xr.DataArray):
            # Remember metadata so the original DataArray can be rebuilt.
            data_info['da_info'] = dict(
                coords=data.coords,
                dims=data.dims,
                name=data.name,
                attrs=data.attrs
            )
        data = xr.DataArray(data)
        assert data.ndim == 2, f'Input array must be 2D, given: {data.ndim}D'
        data_info['n_features'] = data.shape[1]

    feature_types = per_feature(types, data_info)
    if not any(t == 'cat' for t in feature_types):
        # Nothing categorical: no encoding step required.
        return data, data_info

    # Replace each categorical feature with its block of one-hot columns.
    categories: Dict[int, np.ndarray] = {}
    columns = []
    for idx, feature_type in enumerate(feature_types):
        if feature_type == 'cat':
            encoded, categories[idx] = categories_to_one_hot(data[:, idx].values)
            columns.append(encoded)
        else:
            columns.append(data[:, idx:idx+1])
    data = xr.DataArray(np.concatenate(columns, axis=1), dims=data.dims)
    data_info['categories'] = categories

    return data, data_info

def categories_to_one_hot(data: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """One-hot encode a 1D array of categorical values.

    Args:
        data: 1D array of category values (any dtype supported by np.unique).

    Returns:
        Tuple of (one_hot, categories) where one_hot has shape
        (len(data), n_categories) and categories is the sorted array of
        distinct values; column j of one_hot corresponds to categories[j].
    """
    assert data.ndim == 1
    # Determine all categories occurring in the data (np.unique sorts them).
    categories = np.unique(data)
    assert categories.ndim == 1
    n_categories = len(categories)
    # Pick the smallest unsigned integer dtype able to hold the indices.
    # Bug fix: the loop previously did not stop at the first fitting dtype,
    # so indices_dtype always ended up as np.uint64.
    indices_dtype = None
    for dtype in [np.uint8, np.uint16, np.uint32, np.uint64]:
        if n_categories < np.iinfo(dtype).max:
            indices_dtype = dtype
            break
    assert indices_dtype
    # categories is sorted, so searchsorted maps each value to its index.
    indices = np.searchsorted(categories, data).astype(indices_dtype)
    # Build one-hot rows by indexing the identity matrix.
    one_hot = np.eye(n_categories)[indices]
    assert one_hot.shape == (len(data), n_categories)
    return one_hot, categories

def one_hot_to_categories(data: np.ndarray, categories: np.ndarray) -> np.ndarray:
    """Map one-hot encoded rows back to their original category values."""
    assert isinstance(data, np.ndarray)
    assert data.ndim == 2
    assert categories.ndim == 1
    assert data.shape[1] == len(categories)
    # The hot (maximal) column of each row selects the category.
    winning_columns = data.argmax(axis=1)
    result = categories[winning_columns]
    assert result.shape == (data.shape[0],)
    return result

def from_feature_array(data: np.ndarray, data_info: dict) -> Union[np.ndarray, xr.DataArray, xr.Dataset]:
categories = data_info.get('categories')
if categories:
# Transform one-hot encoded categories into original values.
features = []
i_onehot = 0
i_normal = 0
while i_onehot < data.shape[1]:
if i_normal in categories:
n_categories = len(categories[i_normal])
feature = one_hot_to_categories(data[:, i_onehot:i_onehot+n_categories], categories[i_normal])
features.append(feature.reshape(-1, 1))
i_onehot += n_categories
else:
features.append(data[:, i_onehot:i_onehot+1])
i_onehot += 1
i_normal += 1
data = np.concatenate(features, axis=1)

stack_info = data_info.get('stack_info')
if stack_info:
return to_unstacked_dataset(data, stack_info)
Expand All @@ -130,39 +195,54 @@ def from_feature_array(data: np.ndarray, data_info: dict) -> Union[np.ndarray, x
def prod(iterable) -> int:
    """Return the product of all items in *iterable* (1 for an empty one)."""
    result = 1
    for factor in iterable:
        result = result * factor
    return result

def per_feature(val: Optional[Union[Any, Dict[str,Any], Dict[int,Any]]], data_info: dict, default=None) -> List[Any]:
    """Expand a scalar or mapping into one value per feature column.

    Args:
        val: None, a single value applied to all features, a mapping
            {feature idx: value} (ndarray/DataArray input), or a mapping
            {var name: value} (Dataset input).
        data_info: Metadata produced by to_feature_array().
        default: Value used for features not covered by `val`.

    Returns:
        List with one entry per feature column. Entries for categorical
        features are repeated once per one-hot column.
    """
    n_features = data_info.get('n_features')
    stack_info = data_info.get('stack_info')

    if n_features is None:
        # Dataset input: derive the column count from the stacked shapes.
        n_features_ = sum(prod(info.shape) for info in stack_info)
    else:
        n_features_ = n_features

    if val is None:
        out = [default] * n_features_
    elif isinstance(val, dict):
        if stack_info:
            # Keyed by variable name; repeat per flattened feature column.
            # (Removed a dead `current` accumulator that was never read.)
            feature_values = []
            for info in stack_info:
                feature_count = prod(info.shape)
                p = val.get(info.name, default)
                feature_values.extend(copy.copy(p) for _ in range(feature_count))
        else:
            # Keyed by feature index.
            feature_values = [copy.copy(val.get(i, default)) for i in range(n_features_)] # type: ignore
        out = feature_values
    else:
        # Single value: copy per feature to avoid shared mutable state.
        out = [copy.copy(val) for _ in range(n_features_)]

    # Repeat values for categorical features since they are one-hot encoded
    categories = data_info.get('categories')
    if categories:
        out_ = []
        for i in range(n_features_):
            if i in categories:
                out_ += [out[i]] * len(categories[i])
            else:
                out_ += [out[i]]
        out = out_

    return out

def load_dataset(name='SAF-Synthetic') -> xr.Dataset:
""" Retun a dataframe of 25 000 syntheic temperature profiles
from the SAF dataset
""" Return a dataset of 25 000 synthetic temperature profiles
from the SAF dataset.
"""
from urllib.request import urlopen
import pickle

if name != 'SAF-Synthetic':
raise RuntimeError('Only SAF-Synthetic is currerlty supported')
raise RuntimeError('Only SAF-Synthetic is currently supported')

url = 'https://raw.githubusercontent.com/dmey/synthia/data/generator_saf_temperature_fpca_6.pkl'
generator = pickle.load(urlopen(url))
Expand Down
25 changes: 25 additions & 0 deletions tests/test_generators.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,31 @@ def test_vine_copula_feature_generation():

assert generator.generate(10, seed=42).equals(generator.generate(10, seed=42))

@pytest.mark.skipif(pv == None, reason="Skip test if pyvinecopulib not installed")
def test_vine_copula_with_categorical():
    """Fit a vine copula on a Dataset with one categorical variable and
    check that generated samples keep the original variable shapes."""
    feature_matrix = np.array([[0, 1, 2], [3, 4, 5]])
    category_values = np.array([6, 7])
    input_data = xr.Dataset({
        'a': (('sample', 'foo'), feature_matrix),
        'b': (('sample'), category_values)
    })

    generator = syn.CopulaDataGenerator(verbose=True)
    controls = pv.FitControlsVinecop(family_set=[pv.gaussian], trunc_lvl=1, select_trunc_lvl=False, show_trace=True)
    generator.fit(input_data, types={ 'b': 'cat' }, copula=syn.VineCopula(controls=controls))

    # The fitted generator must survive a pickle round trip.
    generator = pickle.loads(pickle.dumps(generator))

    sample_count = 50
    synthetic = generator.generate(n_samples=sample_count)
    assert synthetic['a'].shape == (sample_count, 3)
    assert synthetic['b'].shape == (sample_count,)

    # The same seed must reproduce the same samples.
    assert generator.generate(10, seed=42).equals(generator.generate(10, seed=42))

def test_copula_ndarray_feature_generation():
n_samples = 200
n_features = 100
Expand Down
80 changes: 79 additions & 1 deletion tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,12 @@
import numpy as np
import xarray as xr

from synthia.util import to_stacked_array, to_unstacked_dataset
from synthia.util import (
to_stacked_array, to_unstacked_dataset,
categories_to_one_hot, one_hot_to_categories,
to_feature_array, from_feature_array,
per_feature
)

def test_stacking_roundtrip():
a = np.arange(12).reshape(2,6)
Expand All @@ -18,3 +23,76 @@ def test_stacking_roundtrip():
arr, info = to_stacked_array(ds)
ds_u = to_unstacked_dataset(arr.values, info)
assert ds.identical(ds_u)

def test_one_hot_roundtrip():
    """Encoding then decoding must reproduce the original values exactly."""
    for original in (np.array([1, 4, 2, 1, 5]), np.array(['foo', 'bar', 'foo'])):
        encoded, cats = categories_to_one_hot(original)
        decoded = one_hot_to_categories(encoded, cats)
        np.testing.assert_equal(decoded, original)

def test_feature_array_roundtrip():
    """to_feature_array followed by from_feature_array must be lossless."""
    ds = xr.Dataset({
        'a': (('sample', 'foo'), np.array([[0, 1, 2], [3, 4, 5]])),
        'b': (('sample'), np.array([6, 7])),
        'c': (('sample'), np.array([8, 9]))
    })

    arr, info = to_feature_array(ds, { 'a': 'cat' })
    restored = from_feature_array(arr.values, info)
    assert isinstance(restored, xr.Dataset)
    assert ds.identical(restored)

def test_to_feature_array():
    """Categorical variables are replaced by one-hot columns in order."""
    a = np.array([[0, 1, 2], [3, 4, 5]])
    b = np.array([6, 7])
    c = np.array([8, 9])
    ds = xr.Dataset({
        'a': (('sample', 'foo'), a),
        'b': (('sample'), b),
        'c': (('sample'), c)
    })

    # 'c' one-hot encoded: its two distinct values become two trailing columns.
    arr, _ = to_feature_array(ds, { 'c': 'cat' })
    expected = np.array([
        [0, 1, 2, 6, 1, 0],
        [3, 4, 5, 7, 0, 1],
    ])
    np.testing.assert_equal(expected, arr)

    # 'a' one-hot encoded: each of its three columns expands to two columns.
    arr, _ = to_feature_array(ds, { 'a': 'cat' })
    expected = np.array([
        [1, 0, 1, 0, 1, 0, 6, 8],
        [0, 1, 0, 1, 0, 1, 7, 9],
    ])
    np.testing.assert_equal(expected, arr)

def test_per_feature():
    """per_feature expands scalars and per-variable mappings across columns."""
    ds = xr.Dataset({
        'a': (('sample', 'foo'), np.array([[0, 1, 2], [3, 4, 5]])),
        'b': (('sample'), np.array([6, 7]))
    })

    _, data_info = to_feature_array(ds)
    # A scalar is broadcast to every feature column.
    assert per_feature(True, data_info) == [True, True, True, True]
    # Per-variable values are repeated per flattened column.
    assert per_feature({'a': 1, 'b': 2}, data_info) == [1, 1, 1, 2]
    # Variables missing from the mapping fall back to the default.
    assert per_feature({'a': True}, data_info, default=False) == [True, True, True, False]

    # With 'b' categorical, its entry repeats once per one-hot column.
    _, data_info = to_feature_array(ds, { 'b': 'cat' })
    assert per_feature({'a': 'foo', 'b': 'bar'}, data_info) == ['foo', 'foo', 'foo', 'bar', 'bar']