
Merge pull request #236 from cnellington/network_quicktests
add network quicktests to main test file, fix bug with averaged corre…
cnellington authored Apr 21, 2024
2 parents 1576003 + 6bdcc19 commit 12843bb
Showing 43 changed files with 984 additions and 740 deletions.
1 change: 1 addition & 0 deletions contextualized/analysis/__init__.py
@@ -3,6 +3,7 @@
 """

 from contextualized.analysis.accuracy_split import print_acc_by_covars
+from contextualized.analysis.bootstraps import select_good_bootstraps
 from contextualized.analysis.embeddings import (
     plot_lowdim_rep,
     plot_embedding_for_all_covars,
5 changes: 3 additions & 2 deletions contextualized/analysis/accuracy_split.py
@@ -1,14 +1,15 @@
 """
 Utilities for post-hoc analysis of trained Contextualized models.
 """
+
 from typing import *

 import numpy as np
 import pandas as pd
 from sklearn.metrics import roc_auc_score as roc


-def get_roc(Y_true, Y_pred):
+def get_roc(Y_true: np.ndarray, Y_pred: np.ndarray) -> float:
     """Measures ROC. Return np.nan if no valid ROC value."""
     try:
         return roc(Y_true, Y_pred)
@@ -20,7 +21,7 @@ def print_acc_by_covars(
     Y_true: np.ndarray, Y_pred: np.ndarray, covar_df: pd.DataFrame, **kwargs
 ) -> None:
     """
-    Prints Accuracy for different data splits with covariates.
+    Prints AUROC for each class for different covariate splits. Should only be used with ContextualizedClassifier.

     Args:
         Y_true (np.ndarray): True labels.
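For reference, a minimal sketch of calling the updated function, with made-up labels, predicted probabilities, and a single hypothetical covariate column (the function may expect other shapes or extra kwargs in practice):

    import numpy as np
    import pandas as pd
    from contextualized.analysis import print_acc_by_covars

    rng = np.random.default_rng(0)
    Y_true = rng.integers(0, 2, size=(100, 1))   # placeholder binary labels
    Y_pred = rng.uniform(0, 1, size=(100, 1))    # placeholder predicted probabilities
    covar_df = pd.DataFrame({"site": rng.choice(["A", "B"], size=100)})  # hypothetical covariate
    print_acc_by_covars(Y_true, Y_pred, covar_df)  # prints AUROC within each covariate split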
29 changes: 17 additions & 12 deletions contextualized/analysis/bootstraps.py
@@ -1,25 +1,30 @@
 # Utility functions for bootstraps
 import numpy as np
+from contextualized.easy.wrappers import SKLearnWrapper


-def select_good_bootstraps(sklearn_wrapper, train_errs, tol=2, **kwargs):
+def select_good_bootstraps(
+    sklearn_wrapper: SKLearnWrapper, train_errs: np.ndarray, tol: float = 2
+) -> SKLearnWrapper:
     """
     Select bootstraps that are good for a given model.

-    Parameters
-    ----------
-    sklearn_wrapper : contextualized.easy.wrappers.SKLearnWrapper
-    train_errs : np.ndarray of shape (n_bootstraps, n_samples, n_outcomes)
-    tol : float tolerance for the mean of the train_errs
+    Args:
+        sklearn_wrapper (contextualized.easy.wrappers.SKLearnWrapper): Wrapper for the sklearn model.
+        train_errs (np.ndarray): Training errors for each bootstrap (n_bootstraps, n_samples, n_outcomes).
+        tol (float): Only bootstraps with mean train_errs below tol * min(train_errs) are kept.

-    Returns
-    -------
-    sklearn_wrapper : sklearn_wrapper with only selected bootstraps
+    Returns:
+        contextualized.easy.wrappers.SKLearnWrapper: The input model with only selected bootstraps.
     """
     if len(train_errs.shape) == 2:
         train_errs = train_errs[:, :, None]

     train_errs_by_bootstrap = np.mean(train_errs, axis=(1, 2))
-    sklearn_wrapper.models = sklearn_wrapper.models[
-        train_errs_by_bootstrap < tol * np.min(train_errs_by_bootstrap)
+    train_errs_min = np.min(train_errs_by_bootstrap)
+    sklearn_wrapper.models = [
+        model
+        for train_err, model in zip(train_errs_by_bootstrap, sklearn_wrapper.models)
+        if train_err < train_errs_min * tol
     ]
     sklearn_wrapper.n_bootstraps = len(sklearn_wrapper.models)
     return sklearn_wrapper
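A minimal usage sketch of the rewritten selection logic, assuming a fitted bootstrapped model on random placeholder data (predict(..., individual_preds=True) returning per-bootstrap predictions is taken from the new tests below):

    import numpy as np
    from contextualized.analysis import select_good_bootstraps
    from contextualized.easy import ContextualizedRegressor

    C = np.random.uniform(0, 1, size=(100, 2))
    X = np.random.uniform(0, 1, size=(100, 2))
    Y = np.random.uniform(0, 1, size=(100, 2))
    model = ContextualizedRegressor(n_bootstraps=5)
    model.fit(C, X, Y)
    Y_pred = model.predict(C, X, individual_preds=True)  # (n_bootstraps, n_samples, n_outcomes)
    train_errs = (Y - Y_pred) ** 2                       # per-bootstrap squared training errors
    model = select_good_bootstraps(model, train_errs, tol=2)  # drop bootstraps above 2x the best mean error

The list comprehension replaces boolean-mask indexing, which plain Python lists do not support, and n_bootstraps is re-synced to the number of surviving models.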
1 change: 1 addition & 0 deletions contextualized/analysis/effects.py
@@ -1,6 +1,7 @@
 """
 Utilities for plotting effects learned by Contextualized models.
 """
+
 from typing import *

 import numpy as np
20 changes: 17 additions & 3 deletions contextualized/analysis/embeddings.py
@@ -1,14 +1,28 @@
 """
 Utilities for plotting embeddings of fitted Contextualized models.
 """
+
 from typing import *

 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 import matplotlib as mpl

-from contextualized.analysis import utils
+
+def convert_to_one_hot(col: Collection[Any]) -> Tuple[np.ndarray, List[Any]]:
+    """
+    Converts a categorical variable to a one-hot vector.
+
+    Args:
+        col (Collection[Any]): The categorical variable.
+
+    Returns:
+        Tuple[np.ndarray, List[Any]]: The one-hot vector and the possible values.
+    """
+    vals = list(set(col))
+    one_hot_vars = np.array([vals.index(x) for x in col], dtype=np.float32)
+    return one_hot_vars, vals


 def plot_embedding_for_all_covars(
@@ -89,7 +103,7 @@ def plot_lowdim_rep(
     cmap = mpl.colors.LinearSegmentedColormap.from_list(
         "Custom cmap", [cmap(i) for i in range(cmap.N)], cmap.N
     )
-    tag, tag_names = utils.convert_to_one_hot(labels)
+    tag, tag_names = convert_to_one_hot(labels)
     order = np.argsort(tag_names)
     tag_names = np.array(tag_names)[order]
     tag = np.array([list(order).index(int(x)) for x in tag])
@@ -100,7 +114,7 @@ def plot_lowdim_rep(
     tag_names = np.array(tag_names)[good_tags]
     good_idxs = np.array([good_tags[int(tag[i])] for i in range(len(tag))])
     tag = tag[good_idxs]
-    tag, _ = utils.convert_to_one_hot(tag)
+    tag, _ = convert_to_one_hot(tag)
     bounds = np.linspace(0, len(tag_names), len(tag_names) + 1)
     try:
         norm = mpl.colors.BoundaryNorm(bounds, cmap.N)
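The helper was moved here from the deleted contextualized/analysis/utils.py (see below). Despite the name, it returns integer index codes plus the list of distinct values rather than a one-hot matrix; a quick sketch of its behavior:

    import numpy as np
    from contextualized.analysis.embeddings import convert_to_one_hot

    tag, vals = convert_to_one_hot(["b", "a", "b", "a"])
    # vals is the list of distinct values (set-ordered, so unordered across runs);
    # tag is a float32 array of indices into vals, e.g. array([0., 1., 0., 1.])
    assert tag.dtype == np.float32 and len(vals) == 2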
29 changes: 13 additions & 16 deletions contextualized/analysis/pvals.py
@@ -2,6 +2,7 @@
 Analysis tools for generating pvalues from bootstrap replicates.
 """
+
 from typing import *

 import numpy as np
@@ -41,7 +42,9 @@ def _validate_args(n_bootstraps: int, verbose: bool = False) -> None:
         ValueError: If the number of bootstraps is less than 2.
     """
     if n_bootstraps < 2:
-        raise ValueError(f"P-values are not well defined without multiple bootstrap samples.")
+        raise ValueError(
+            f"P-values are not well defined without multiple bootstrap samples."
+        )
     min_pval, max_pval = get_possible_pvals(n_bootstraps)
     if verbose:
         print(
@@ -103,11 +106,11 @@ def calc_homogeneous_context_effects_pvals(
     Returns:
         np.ndarray: P-values of shape (n_contexts, n_outcomes) testing whether the
             sign of the direct effect of context on outcomes is consistent across bootstraps.

     Raises:
         ValueError: If the model's n_bootstraps is less than 2.
     """
-    _validate_args(model.n_bootstraps, verbose = verbose)
+    _validate_args(model.n_bootstraps, verbose=verbose)
     _, effects = get_homogeneous_context_effects(model, C, **kwargs)
     # effects.shape: (n_contexts, n_bootstraps, n_context_vals, n_outcomes)
     diffs = effects[:, :, -1] - effects[:, :, 0]  # Test whether the sign is consistent
@@ -146,7 +149,7 @@ def calc_homogeneous_predictor_effects_pvals(
     Raises:
         ValueError: If the model's n_bootstraps is less than 2.
     """
-    _validate_args(model.n_bootstraps, verbose = verbose)
+    _validate_args(model.n_bootstraps, verbose=verbose)
     _, effects = get_homogeneous_predictor_effects(model, C, **kwargs)
     # effects.shape: (n_predictors, n_bootstraps, n_outcomes)
     pvals = np.array(
@@ -184,7 +187,7 @@ def calc_heterogeneous_predictor_effects_pvals(
     Raises:
         ValueError: If the model's n_bootstraps is less than 2.
     """
-    _validate_args(model.n_bootstraps, verbose = verbose)
+    _validate_args(model.n_bootstraps, verbose=verbose)
     _, effects = get_heterogeneous_predictor_effects(model, C, **kwargs)
     # effects.shape is (n_contexts, n_predictors, n_bootstraps, n_context_vals, n_outcomes)
     diffs = (
@@ -219,7 +222,8 @@ def test_each_context(
     X: pd.DataFrame,
     Y: pd.DataFrame,
     verbose: bool = True,
-    **kwargs,
+    model_kwargs: Dict = {"encoder_type": "linear"},
+    fit_kwargs: Dict = {"max_epochs": 3, "learning_rate": 1e-2, "n_bootstraps": 20},
 ) -> pd.DataFrame:
     """
     Test heterogeneous predictor effects attributed to every individual context feature.
@@ -239,24 +243,17 @@ def test_each_context(
     Raises:
         ValueError: If the model's n_bootstraps is less than 2.
     """
-    default_fit_params = {
-        "encoder_type": "mlp",
-        "max_epochs": 3,
-        "learning_rate": 1e-2,
-        "n_bootstraps": 20,
-    }
-    fit_params = {**default_fit_params, **kwargs}
     pvals_dict = {
         "Context": [],
         "Predictor": [],
         "Target": [],
         "Pvals": [],
     }
-    _validate_args(fit_params["n_bootstraps"], verbose = verbose)
+    _validate_args(fit_kwargs["n_bootstraps"], verbose=verbose)
     for context in C.columns:
         context_col = C[[context]].values
-        model = model_constructor(**fit_params)
-        model.fit(context_col, X.values, Y.values, **fit_params)
+        model = model_constructor(**model_kwargs)
+        model.fit(context_col, X.values, Y.values, **fit_kwargs)
         pvals = calc_heterogeneous_predictor_effects_pvals(
             model, context_col, verbose=False
         )
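With the old **kwargs split into explicit model_kwargs and fit_kwargs, a call to test_each_context now looks like the following sketch (placeholder DataFrames; the kwargs mirror the new defaults and the unit test below):

    import numpy as np
    import pandas as pd
    from contextualized.analysis import test_each_context
    from contextualized.easy import ContextualizedRegressor

    C = pd.DataFrame(np.random.uniform(0, 1, (200, 2)), columns=["C0", "C1"])
    X = pd.DataFrame(np.random.uniform(0, 1, (200, 2)), columns=["X0", "X1"])
    Y = pd.DataFrame(np.random.uniform(0, 1, (200, 1)), columns=["Y"])
    pvals = test_each_context(
        ContextualizedRegressor,
        C,
        X,
        Y,
        model_kwargs={"encoder_type": "linear"},
        fit_kwargs={"max_epochs": 3, "learning_rate": 1e-2, "n_bootstraps": 20},
    )
    # Long-format DataFrame with one row per (Context, Predictor, Target) and its p-value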
149 changes: 149 additions & 0 deletions contextualized/analysis/tests.py
@@ -0,0 +1,149 @@
"""
Unit tests for analysis utilities.
"""

import unittest
import copy
import torch
import numpy as np
import pandas as pd


from contextualized.analysis import (
    test_each_context,
    select_good_bootstraps,
    calc_heterogeneous_predictor_effects_pvals,
    calc_homogeneous_context_effects_pvals,
    calc_homogeneous_predictor_effects_pvals,
)

from contextualized.easy import ContextualizedRegressor


class TestTestEachContext(unittest.TestCase):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def setUp(self):
        """
        Shared data setup.
        """
        torch.manual_seed(0)
        np.random.seed(0)
        n_samples = 1000
        C = np.random.uniform(0, 1, size=(n_samples, 2))
        X = np.random.uniform(0, 1, size=(n_samples, 2))
        beta = np.concatenate([np.ones((n_samples, 1)), C], axis=1)
        Y = np.sum(
            beta[:, :2] * X, axis=1
        )  # X1 changes effect under C0. C1 has no effect, X0 is constant

        self.C_train_df = pd.DataFrame(C, columns=["C0", "C1"])
        self.X_train_df = pd.DataFrame(X, columns=["X0", "X1"])
        self.Y_train_df = pd.DataFrame(Y, columns=["Y"])

    def test_test_each_context(self):
        """
        Test that the output shape of the test_each_context function is as expected.
        """
        pvals = test_each_context(
            ContextualizedRegressor,
            self.C_train_df,
            self.X_train_df,
            self.Y_train_df,
            model_kwargs={"encoder_type": "mlp", "layers": 1},
            fit_kwargs={"max_epochs": 1, "learning_rate": 1e-2, "n_bootstraps": 10},
        )

        expected_shape = (
            self.C_train_df.shape[1]
            * self.X_train_df.shape[1]
            * self.Y_train_df.shape[1],
            4,
        )
        self.assertEqual(pvals.shape, expected_shape)
        self.assertTrue(all(0 <= pval <= 1 for pval in pvals["Pvals"]))

        pval_c0_x1 = pvals.loc[1, "Pvals"]
        self.assertTrue(pval_c0_x1 < 0.2, "C0 X1 p-value is not significant.")

        other_pvals = pvals.drop(1)
        self.assertTrue(
            all(pval >= 0.2 for pval in other_pvals["Pvals"]),
            "Other p-values are significant.",
        )


class TestSelectGoodBootstraps(unittest.TestCase):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def setUp(self):
        self.C = np.random.uniform(0, 1, size=(100, 2))
        self.X = np.random.uniform(0, 1, size=(100, 2))
        self.Y = np.random.uniform(0, 1, size=(100, 2))

    def test_model_has_fewer_bootstraps(self):
        """
        Test that the model has fewer bootstraps after calling select_good_bootstraps.
        """
        model = ContextualizedRegressor(n_bootstraps=3)
        model.fit(self.C, self.X, self.Y)
        Y_pred = model.predict(self.C, self.X, individual_preds=True)
        train_errs = np.zeros_like((self.Y - Y_pred) ** 2)
        train_errs[0] = 0.1
        train_errs[1] = 0.2
        train_errs[2] = 0.3
        model_copy = copy.deepcopy(model)
        select_good_bootstraps(model, train_errs)
        self.assertEqual(len(model.models), 1)
        self.assertEqual(len(model_copy.models), 3)
        self.assertLess(len(model.models), len(model_copy.models))


class TestPvals(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def setUp(self):
        np.random.seed(0)
        torch.manual_seed(0)
        # X1 is a heterogeneous predictor under C0, X0 is a homogeneous predictor
        # C0 is a homogeneous context predictor on Y0, C1 is a heterogeneous context predictor on Y1
        self.C = np.random.uniform(-1, 1, size=(100, 2))
        self.X = np.random.uniform(-1, 1, size=(100, 2))
        betas = np.concatenate([np.ones((100, 1)), self.C[:, 0, None]], axis=1)
        Y0 = np.sum(betas * self.X, axis=1) + self.C[:, 0]
        Y1 = np.sum(betas * self.X, axis=1) + self.C[:, 1]
        self.Y = np.column_stack([Y0, Y1])

    def test_homogeneous_context_effect_pvals(self):
        model = ContextualizedRegressor(n_bootstraps=10)
        model.fit(self.C, self.X, self.Y)
        pvals = calc_homogeneous_context_effects_pvals(model, self.C)
        assert pvals.shape == (self.C.shape[1], self.Y.shape[1])
        assert pvals[0, 0] < 0.2 and pvals[1, 1] < 0.2
        assert pvals[0, 1] > 0.2 and pvals[1, 0] > 0.2

    def test_homogeneous_predictor_effect_pvals(self):
        model = ContextualizedRegressor(n_bootstraps=10)
        model.fit(self.C, self.X, self.Y)
        pvals = calc_homogeneous_predictor_effects_pvals(model, self.X)
        assert pvals.shape == (self.X.shape[1], self.Y.shape[1])
        assert pvals[0, 0] < 0.2 and pvals[0, 1] < 0.2
        assert pvals[1, 0] > 0.2 and pvals[1, 1] > 0.2

    def test_heterogeneous_predictor_effect_pvals(self):
        model = ContextualizedRegressor(n_bootstraps=10)
        model.fit(self.C, self.X, self.Y)
        pvals = calc_heterogeneous_predictor_effects_pvals(model, self.C)
        assert pvals.shape == (self.C.shape[1], self.X.shape[1], self.Y.shape[1])
        assert pvals[0, 1, 0] < 0.2 and pvals[0, 1, 1] < 0.2
        pvals[0, 1, 0] = pvals[0, 1, 1] = 1
        assert (pvals > 0.2).all()


if __name__ == "__main__":
    unittest.main()
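Because of the __main__ guard, these quicktests can be run directly with python contextualized/analysis/tests.py, or through the standard runner, e.g. python -m unittest contextualized.analysis.tests.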
22 changes: 0 additions & 22 deletions contextualized/analysis/utils.py

This file was deleted.

(Diffs for the remaining changed files did not load.)
