From f0d7a56b533a53d3a4308274dcaf5cf9c64ec96e Mon Sep 17 00:00:00 2001 From: OliverSchacht <65898638+OliverSchacht@users.noreply.github.com> Date: Mon, 6 May 2024 09:19:14 +0200 Subject: [PATCH 01/16] add fast kci Signed-off-by: Oliver Schacht --- causallearn/utils/cit.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/causallearn/utils/cit.py b/causallearn/utils/cit.py index 8119dd72..d842281a 100644 --- a/causallearn/utils/cit.py +++ b/causallearn/utils/cit.py @@ -5,6 +5,7 @@ from scipy.stats import chi2, norm from causallearn.utils.KCI.KCI import KCI_CInd, KCI_UInd +from causallearn.utils.FastKCI.FastKCI import FastKCI_CInd, FastKCI_UInd from causallearn.utils.PCUtils import Helper CONST_BINCOUNT_UNIQUE_THRESHOLD = 1e5 @@ -13,6 +14,7 @@ mv_fisherz = "mv_fisherz" mc_fisherz = "mc_fisherz" kci = "kci" +fastkci = "fastkci" chisq = "chisq" gsq = "gsq" d_separation = "d_separation" @@ -32,6 +34,8 @@ def CIT(data, method='fisherz', **kwargs): return FisherZ(data, **kwargs) elif method == kci: return KCI(data, **kwargs) + elif method == fastkci: + return FastKCI(data, **kwargs) elif method in [chisq, gsq]: return Chisq_or_Gsq(data, method_name=method, **kwargs) elif method == mv_fisherz: @@ -193,6 +197,28 @@ def __call__(self, X, Y, condition_set=None): self.pvalue_cache[cache_key] = p return p +class FastKCI(CIT_Base): + def __init__(self, data, **kwargs): + super().__init__(data, **kwargs) + kci_ui_kwargs = {k: v for k, v in kwargs.items() if k in + ['kernelX', 'kernelY', 'null_ss', 'approx', 'est_width', 'polyd', 'kwidthx', 'kwidthy']} + kci_ci_kwargs = {k: v for k, v in kwargs.items() if k in + ['K', 'J', 'alpha', 'use_gp']} + self.check_cache_method_consistent( + 'kci', hashlib.md5(json.dumps(kci_ci_kwargs, sort_keys=True).encode('utf-8')).hexdigest()) + self.assert_input_data_is_valid() + self.kci_ui = KCI_UInd(**kci_ui_kwargs) + self.kci_ci = FastKCI_CInd(**kci_ci_kwargs) + + def __call__(self, X, Y, condition_set=None): + # Kernel-based conditional independence test. 
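+ # Dispatch sketch: an empty condition_set routes to the unconditional test (self.kci_ui),
+ # otherwise FastKCI_CInd (self.kci_ci) is applied to X and Y given the conditioning columns.
+ # Computed p-values are memoized in self.pvalue_cache under cache_key.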
+ Xs, Ys, condition_set, cache_key = self.get_formatted_XYZ_and_cachekey(X, Y, condition_set) + if cache_key in self.pvalue_cache: return self.pvalue_cache[cache_key] + p = self.kci_ui.compute_pvalue(self.data[:, Xs], self.data[:, Ys])[0] if len(condition_set) == 0 else \ + self.kci_ci.compute_pvalue(self.data[:, Xs], self.data[:, Ys], self.data[:, condition_set])[0] + self.pvalue_cache[cache_key] = p + return p + class Chisq_or_Gsq(CIT_Base): def __init__(self, data, method_name, **kwargs): def _unique(column): From b35b9bb876848137d3e1db4bd836a2c8b6a44c66 Mon Sep 17 00:00:00 2001 From: OliverSchacht <65898638+OliverSchacht@users.noreply.github.com> Date: Mon, 6 May 2024 09:20:46 +0200 Subject: [PATCH 02/16] safe null dist for debug Signed-off-by: Oliver Schacht --- causallearn/utils/KCI/KCI.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/causallearn/utils/KCI/KCI.py b/causallearn/utils/KCI/KCI.py index 41cbd467..681d48d4 100644 --- a/causallearn/utils/KCI/KCI.py +++ b/causallearn/utils/KCI/KCI.py @@ -306,8 +306,8 @@ def compute_pvalue(self, data_x=None, data_y=None, data_z=None): k_appr, theta_appr = self.get_kappa(uu_prod) pvalue = 1 - stats.gamma.cdf(test_stat, k_appr, 0, theta_appr) else: - null_samples = self.null_sample_spectral(uu_prod, size_u, Kx.shape[0]) - pvalue = sum(null_samples > test_stat) / float(self.nullss) + self.null_samples = self.null_sample_spectral(uu_prod, size_u, Kx.shape[0]) + pvalue = sum(self.null_samples > test_stat) / float(self.nullss) return pvalue, test_stat def kernel_matrix(self, data_x, data_y, data_z): From 4d1ca1251ba764e8437ba7d37ef82d63104aa03e Mon Sep 17 00:00:00 2001 From: OliverSchacht <65898638+OliverSchacht@users.noreply.github.com> Date: Mon, 6 May 2024 09:25:17 +0200 Subject: [PATCH 03/16] add fast kci module Signed-off-by: Oliver Schacht --- causallearn/utils/FastKCI/FastKCI.py | 480 ++++++++++++++++++++++++++ causallearn/utils/FastKCI/__init__.py | 0 2 files changed, 480 insertions(+) create mode 100644 causallearn/utils/FastKCI/FastKCI.py create mode 100644 causallearn/utils/FastKCI/__init__.py diff --git a/causallearn/utils/FastKCI/FastKCI.py b/causallearn/utils/FastKCI/FastKCI.py new file mode 100644 index 00000000..d9e2d245 --- /dev/null +++ b/causallearn/utils/FastKCI/FastKCI.py @@ -0,0 +1,480 @@ +from causallearn.utils.KCI.KCI import GaussianKernel, Kernel +import numpy as np +from numpy.linalg import eigh +import scipy.stats as stats +from scipy.special import logsumexp +from sklearn.preprocessing import LabelEncoder +from joblib import Parallel, delayed +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process.kernels import ConstantKernel, WhiteKernel, RBF +import warnings + +class FastKCI_CInd(object): + """ + Python implementation of as speed-up version of the Kernel-based Conditional Independence (KCI) test. Unconditional version. + + References + ---------- + [1] K. Zhang, J. Peters, D. Janzing, and B. Schölkopf, + "A kernel-based conditional independence test and application in causal discovery," In UAI 2011. + [2] M. Zhang, and S. Williamson, + "Embarrassingly Parallel Inference for Gaussian Processes" In JMLR 20 (2019) + + """ + def __init__(self, K=10, J=8, alpha=500, epsilon=1e-3, eig_thresh = 1e-6, trimming_thresh = 1e-3, use_gp=False): + """ + Initialize the FastKCI_CInd object. + + Parameters + ---------- + K: Number of Gaussians that are assumed to be in the mixture. + J: Number of independent repittitions. + alpha: Parameter for the Dirichlet distribution. 
+ epsilon: Penalty for the matrix ridge regression. + eig_threshold: Threshold for Eigenvalues. + use_gp: Whether to use Gaussian Process Regression to determine the kernel widths + """ + self.K = K + self.J = J + self.alpha = alpha + self.epsilon = epsilon + self.eig_thresh = eig_thresh + self.trimming_thresh = trimming_thresh + self.use_gp = use_gp + self.nullss = 5000 + # TODO: Adjust to causal-learn API + + def compute_pvalue(self, data_x=None, data_y=None, data_z=None): + """ + Main function: compute the p value of H_0: X|Y conditional on Z. + + Parameters + ---------- + data_x: input data for x (nxd1 array) + data_y: input data for y (nxd2 array) + data_z: input data for z (nxd3 array) + + Returns + _________ + pvalue: p value (scalar) + test_stat: test statistic (scalar) + """ + self.data = [data_x, data_y] + self.data_z = data_z + self.n = data_x.shape[0] + + Z_proposal = Parallel(n_jobs=-1)(delayed(self.partition_data)() for i in range(self.J)) + self.Z_proposal, self.prob_Z = zip(*Z_proposal) + block_res = Parallel(n_jobs=-1)(delayed(self.pvalue_onblocks)(self.Z_proposal[i]) for i in range(self.J)) + test_stat, null_samples, log_likelihood = zip(*block_res) + + log_likelihood = np.array(log_likelihood) + self.prob_Z += log_likelihood + self.prob_Z = np.around(np.exp(self.prob_Z-logsumexp(self.prob_Z)), 6) # experimental, not used + self.all_null_samples = np.vstack(null_samples) + self.all_p = np.array([np.sum(self.all_null_samples[i,] > test_stat[i]) / float(self.nullss) for i in range(self.J)]) + self.prob_weights = np.around(np.exp(log_likelihood-logsumexp(log_likelihood)), 6) + self.all_test_stats = np.array(test_stat) + self.test_stat = np.average(np.array(test_stat), weights = self.prob_weights) + self.null_samples = np.average(null_samples, axis = 0, weights = self.prob_weights) + # experimental, not used + self.pvalue_alt = np.sum(np.average(null_samples, axis = 0, weights = self.prob_Z) > np.average(np.array(test_stat), weights = self.prob_Z)) / float(self.nullss) + self.pvalue = np.sum(self.null_samples > self.test_stat) / float(self.nullss) + + return self.pvalue, self.test_stat + + def partition_data(self): + """ + Partitions data into K Gaussians following an expectation maximization approach on Z. + + Returns + _________ + Z: Latent Gaussian that was drawn for each observation (nx1 array) + prob_Z: Log-Likelihood of that random assignment (scalar) + """ + Z_mean = self.data_z.mean(axis=0) + Z_sd = self.data_z.std(axis=0) + mu_k = np.random.normal(Z_mean, Z_sd, size = (self.K,self.data_z.shape[1])) + sigma_k = np.eye(self.data_z.shape[1]) + pi_j = np.random.dirichlet([self.alpha]*self.K) + ll = np.tile(np.log(pi_j),(self.n,1)) + for k in range(self.K): + ll[:,k] += stats.multivariate_normal.logpdf(self.data_z, mu_k[k,:], cov=sigma_k, allow_singular=True) + Z = np.array([ np.random.multinomial(1,np.exp(ll[n,:]-logsumexp(ll[n,:]))).argmax() for n in range(self.n)]) + le = LabelEncoder() + Z = le.fit_transform(Z) + prob_Z = np.take_along_axis(ll, Z[:, None], axis=1).sum() # experimental, might be removed + return Z, prob_Z + + def pvalue_onblocks(self, Z_proposal): + """ + Calculate p value on given partitions of the data. 
+ + Parameters + ---------- + Z_proposal: partition of the data into K clusters (nxd1 array) + + Returns + _________ + test_stat: test statistic (scalar) + null_samples: bootstrapped sampled of the null distribution (self.nullssx1 array) + log_likelihood: estimated probability P(X,Y|Z,k) + """ + unique_Z_j = np.unique(Z_proposal) + test_stat = 0 + log_likelihood = 0 + null_samples = np.zeros((1,self.nullss)) + for k in unique_Z_j: + K_mask = (Z_proposal==k) + X_k = np.copy(self.data[0][K_mask]) + Y_k = np.copy(self.data[1][K_mask]) + Z_k = np.copy(self.data_z[K_mask]) + if (Z_k.shape[0]<6): # small blocks cause problems in GP, experimental + continue + Kx, Ky, Kzx, Kzy, epsilon_x, epsilon_y, likelihood_x, likelihood_y = self.kernel_matrix(X_k, Y_k, Z_k) + KxR, Rzx = Kernel.center_kernel_matrix_regression(Kx, Kzx, epsilon_x) + if epsilon_x != epsilon_y: + KyR, Rzy = Kernel.center_kernel_matrix_regression(Ky, Kzy, epsilon_y) + else: + KyR = Rzx.dot(Ky.dot(Rzx)) + test_stat += np.einsum('ij,ji->', KxR, KyR) + uu_prod, size_u = self.get_uuprod(KxR, KyR) + null_samples += self.null_sample_spectral(uu_prod, size_u, Kx.shape[0]) + log_likelihood += likelihood_x + likelihood_y + return test_stat, null_samples, log_likelihood + + + def kernel_matrix(self, data_x, data_y, data_z): + """ + Calculates the Gaussian Kernel for given data inputs as well as the shared kernel. + + Returns + _________ + K: Kernel matrices (n_kxn_k array) + """ + kernel_obj = GaussianKernel() + kernel_obj.set_width_empirical_kci(data_z) + + data_x = stats.zscore(data_x, ddof=1, axis=0) + data_x[np.isnan(data_x)] = 0. + + data_y = stats.zscore(data_y, ddof=1, axis=0) + data_y[np.isnan(data_y)] = 0. + + data_z = stats.zscore(data_z, ddof=1, axis=0) + data_z[np.isnan(data_z)] = 0. + + data_x = np.concatenate((data_x, 0.5 * data_z), axis=1) + + kernelX = GaussianKernel() + kernelX.set_width_empirical_kci(data_z) + kernelY = GaussianKernel() + kernelY.set_width_empirical_kci(data_z) + + Kx = kernelX.kernel(data_x) + Ky = kernelY.kernel(data_y) + + # centering kernel matrix + Kx = Kernel.center_kernel_matrix(Kx) + Ky = Kernel.center_kernel_matrix(Ky) + + if not self.use_gp: + kernelZ = GaussianKernel() + kernelZ.set_width_empirical_kci(data_z) + Kzx = kernelZ.kernel(data_z) + Kzx = Kernel.center_kernel_matrix(Kzx) + Kzy = Kzx + epsilon_x, epsilon_y = self.epsilon, self.epsilon + gpx = GaussianProcessRegressor() + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=Warning) + # P(X|Z) + gpx.fit(X = data_z, y = data_x) + likelihood_x = gpx.log_marginal_likelihood_value_ + gpy = GaussianProcessRegressor() + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=Warning) + # P(Y|X,Z) + gpy.fit(X = np.c_[data_x,data_z], y=data_y) + likelihood_y = gpy.log_marginal_likelihood_value_ + + else: + n, Dz = data_z.shape + + widthz = np.sqrt(1.0 / (kernelX.width * data_x.shape[1])) + + # Instantiate a Gaussian Process model for x + wx, vx = eigh(Kx) + topkx = int(np.max([np.min([400, np.floor(n / 4)]), np.min([n+1,8])])) + idx = np.argsort(-wx) + wx = wx[idx] + vx = vx[:, idx] + wx = wx[0:topkx] + vx = vx[:, 0:topkx] + vx = vx[:, np.abs(wx) > np.abs(wx).max() * 1e-10] + wx = wx[np.abs(wx) > np.abs(wx).max() * 1e-10] + vx = 2 * np.sqrt(n) * vx.dot(np.diag(np.sqrt(wx))) / np.sqrt(wx[0]) + kernelx = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(widthz * np.ones(Dz), (1e-2, 1e2)) + WhiteKernel(0.1, (1e-10, 1e+1)) + gpx = GaussianProcessRegressor(kernel=kernelx) + # fit Gaussian process, including 
hyperparameter optimization + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=Warning) + gpx.fit(data_z, vx) + + # construct Gaussian kernels according to learned hyperparameters + Kzx = gpx.kernel_.k1(data_z, data_z) + epsilon_x = np.exp(gpx.kernel_.theta[-1]) + likelihood_x = gpx.log_marginal_likelihood_value_ + + # Instantiate a Gaussian Process model for y + wy, vy = eigh(Ky) + topky = int(np.max([np.min([400, np.floor(n / 4)]), np.min([n+1,8])])) + idy = np.argsort(-wy) + wy = wy[idy] + vy = vy[:, idy] + wy = wy[0:topky] + vy = vy[:, 0:topky] + vy = vy[:, np.abs(wy) > np.abs(wy).max() * 1e-10] + wy = wy[np.abs(wy) > np.abs(wy).max() * 1e-10] + vy = 2 * np.sqrt(n) * vy.dot(np.diag(np.sqrt(wy))) / np.sqrt(wy[0]) + kernely = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(widthz * np.ones(Dz), (1e-2, 1e2)) + WhiteKernel(0.1, (1e-10, 1e+1)) + gpy = GaussianProcessRegressor(kernel=kernely) + # fit Gaussian process, including hyperparameter optimization + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=Warning) + gpy.fit(data_z, vy) + + # construct Gaussian kernels according to learned hyperparameters + Kzy = gpy.kernel_.k1(data_z, data_z) + epsilon_y = np.exp(gpy.kernel_.theta[-1]) + likelihood_y = gpy.log_marginal_likelihood_value_ + + return Kx, Ky, Kzx, Kzy, epsilon_x, epsilon_y, likelihood_x, likelihood_y + + def get_uuprod(self, Kx, Ky): + """ + Compute eigenvalues for null distribution estimation + + Parameters + ---------- + Kx: centralized kernel matrix for data_x (nxn) + Ky: centralized kernel matrix for data_y (nxn) + + Returns + _________ + uu_prod: product of the eigenvectors of Kx and Ky + size_u: number of produced eigenvectors + + """ + n_block = Kx.shape[0] + wx, vx = eigh(Kx) + wy, vy = eigh(Ky) + idx = np.argsort(-wx) + idy = np.argsort(-wy) + wx = wx[idx] + vx = vx[:, idx] + wy = wy[idy] + vy = vy[:, idy] + vx = vx[:, wx > np.max(wx) * self.eig_thresh] + wx = wx[wx > np.max(wx) * self.eig_thresh] + vy = vy[:, wy > np.max(wy) * self.eig_thresh] + wy = wy[wy > np.max(wy) * self.eig_thresh] + vx = vx.dot(np.diag(np.sqrt(wx))) + vy = vy.dot(np.diag(np.sqrt(wy))) + + # calculate their product + num_eigx = vx.shape[1] + num_eigy = vy.shape[1] + size_u = num_eigx * num_eigy + uu = np.zeros((n_block, size_u)) + for i in range(0, num_eigx): + for j in range(0, num_eigy): + uu[:, i * num_eigy + j] = vx[:, i] * vy[:, j] + + if size_u > self.n: + uu_prod = uu.dot(uu.T) + else: + uu_prod = uu.T.dot(uu) + + return uu_prod, size_u + + def get_kappa(self, mean_appr, var_appr): + """ + Get parameters for the approximated gamma distribution + Parameters + ---------- + uu_prod: product of the eigenvectors of Kx and Ky + + Returns + ---------- + k_appr, theta_appr: approximated parameters of the gamma distribution + + """ + k_appr = mean_appr ** 2 / var_appr + theta_appr = var_appr / mean_appr + return k_appr, theta_appr + + def null_sample_spectral(self, uu_prod, size_u, T): + """ + Simulate data from null distribution + + Parameters + ---------- + uu_prod: product of the eigenvectors of Kx and Ky + size_u: number of produced eigenvectors + T: sample size + + Returns + _________ + null_dstr: samples from the null distribution + + """ + eig_uu = np.linalg.eigvalsh(uu_prod) + eig_uu = -np.sort(-eig_uu) + eig_uu = eig_uu[0:np.min((T, size_u))] + eig_uu = eig_uu[eig_uu > np.max(eig_uu) * self.eig_thresh] + + f_rand = np.random.chisquare(1, (eig_uu.shape[0], self.nullss)) + null_dstr = eig_uu.T.dot(f_rand) + return null_dstr + +class 
FastKCI_UInd(object): + """ + Python implementation of as speed-up version of the Kernel-based Conditional Independence (KCI) test. Unconditional version. + + References + ---------- + [1] K. Zhang, J. Peters, D. Janzing, and B. Schölkopf, + "A kernel-based conditional independence test and application in causal discovery," In UAI 2011. + [2] M. Zhang, and S. Williamson, + "Embarrassingly Parallel Inference for Gaussian Processes" In JMLR 20 (2019) + """ + def __init__(self, K=10, J=8, alpha=500, trimming_thresh = 1e-3): + """ + Construct the FastKCI_UInd model. + + Parameters + ---------- + K: Number of Gaussians that are assumed to be in the mixture + J: Number of independent repittitions. + alpha: Parameter for the Dirichlet distribution. + trimming_thresh: Threshold for trimming the propensity weights. + """ + self.K = K + self.J = J + self.alpha = alpha + self.trimming_thresh = trimming_thresh + # TODO: Adjust to causal-learn API + + def compute_pvalue(self, data_x=None, data_y=None): + """ + Main function: compute the p value and return it together with the test statistic + + Parameters + ---------- + data_x: input data for x (nxd1 array) + data_y: input data for y (nxd2 array) + + Returns + _________ + pvalue: p value (scalar) + test_stat: test statistic (scalar) + """ + self.data_x = data_x + self.data_y = data_y + self.n = data_x.shape[0] + + Z_proposal = Parallel(n_jobs=-1)(delayed(self.partition_data)(self.data) for i in range(self.J)) + self.Z_proposal, self.prob_Y = zip(*Z_proposal) + block_res = Parallel(n_jobs=-1)(delayed(self.pvalue_onblocks)(self.Z_proposal[i]) for i in range(self.J)) + test_stat, mean_appr, var_appr, log_likelihood = zip(*block_res) + self.prob_Y = np.array(self.prob_Y) + self.prob_weights = np.around(np.exp(log_likelihood-logsumexp(log_likelihood))) + self.test_stat = np.average(np.array(test_stat), weights = self.prob_weights) + self.mean_appr = np.average(np.array(mean_appr), weights = self.prob_weights) + self.var_appr = np.average(np.array(var_appr), weights = self.prob_weights) + + k_appr, theta_appr = self.get_kappa(self.mean_appr, self.var_appr) + self.pvalue = 1 - stats.gamma.cdf(self.test_stat, k_appr, 0, theta_appr) + + return self.pvalue, self.test_stat + + def partition_data(self): + """ + Partitions data into K Gaussians + + Returns + _________ + Z: Latent Gaussian that was drawn for each observation (nx1 array) + prob_Y: Log-Likelihood of that random assignment (scalar) + """ + Y_mean = self.data_y.mean(axis=0) + Y_sd = self.data_y.std(axis=0) + mu_k = np.random.normal(Y_mean, Y_sd, size = (self.K,self.shape[1])) + sigma_k = np.eye(self.data_y.shape[1]) + pi_j = np.random.dirichlet([self.alpha]*self.K) + ll = np.tile(np.log(pi_j),(self.n,1)) + for k in range(self.K): + ll[:,k] += stats.multivariate_normal.logpdf(self.data_y, mu_k[k,:], cov=sigma_k, allow_singular=True) + Z = np.array([ np.random.multinomial(1,np.exp(ll[n,:]-logsumexp(ll[n,:]))).argmax() for n in range(self.n)]) + prop_Y = np.take_along_axis(ll, Z[:, None], axis=1).sum() + le = LabelEncoder() + Z = le.fit_transform(Z) + return (Z, prop_Y) + + def pvalue_onblocks(self, Z_proposal): + unique_Z_j = np.unique(Z_proposal) + test_stat = 0 + prob_weight = 0 + mean_appr = 0 + var_appr = 0 + for k in unique_Z_j: + K_mask = (Z_proposal==k) + X_k = np.copy(self.data[0][K_mask]) + Y_k = np.copy(self.data[1][K_mask]) + + Kx = self.kernel_matrix(X_k) + Ky = self.kernel_matrix(Y_k) + n_block = Kx.shape[0] + + Kxc = Kernel.center_kernel_matrix(Kx) + Kyc = Kernel.center_kernel_matrix(Ky) + 
test_stat += np.einsum('ij,ji->', Kxc, Kyc) + mean_appr += np.trace(Kxc) * np.trace(Kyc) / n_block + var_appr += 2 * np.sum(Kxc ** 2) * np.sum(Kyc ** 2) / n_block / n_block + var_appr /= unique_Z_j.shape[0] + return (test_stat, mean_appr, var_appr, prob_weight) + + def kernel_matrix(self, data): + """ + Calculates the Gaussian Kernel for given data inputs. + + Returns + _________ + K: Kernel matrix (n_kxn_k array) + """ + kernel_obj = GaussianKernel() + kernel_obj.set_width_empirical_hsic(data) + + data = stats.zscore(data, ddof=1, axis=0) + data[np.isnan(data)] = 0. + + K = kernel_obj.kernel(data) + + return K + + def get_kappa(self, mean_appr, var_appr): + """ + Get parameters for the approximated gamma distribution + Parameters + ---------- + Kx: kernel matrix for data_x (nxn) + Ky: kernel matrix for data_y (nxn) + + Returns + _________ + k_appr, theta_appr: approximated parameters of the gamma distribution + """ + k_appr = mean_appr ** 2 / var_appr + theta_appr = var_appr / mean_appr + return k_appr, theta_appr diff --git a/causallearn/utils/FastKCI/__init__.py b/causallearn/utils/FastKCI/__init__.py new file mode 100644 index 00000000..e69de29b From 14b9c6e398822a0157b5841722cc4ac773fc9a11 Mon Sep 17 00:00:00 2001 From: OliverSchacht <65898638+OliverSchacht@users.noreply.github.com> Date: Tue, 28 May 2024 12:01:48 +0200 Subject: [PATCH 04/16] update UInd test Signed-off-by: Oliver Schacht --- causallearn/utils/FastKCI/FastKCI.py | 102 ++++++++++++++++++++------- causallearn/utils/cit.py | 4 +- 2 files changed, 80 insertions(+), 26 deletions(-) diff --git a/causallearn/utils/FastKCI/FastKCI.py b/causallearn/utils/FastKCI/FastKCI.py index d9e2d245..5b614dd9 100644 --- a/causallearn/utils/FastKCI/FastKCI.py +++ b/causallearn/utils/FastKCI/FastKCI.py @@ -364,6 +364,8 @@ def __init__(self, K=10, J=8, alpha=500, trimming_thresh = 1e-3): self.J = J self.alpha = alpha self.trimming_thresh = trimming_thresh + self.nullss = 5000 + self.eig_thresh = 1e-5 # TODO: Adjust to causal-learn API def compute_pvalue(self, data_x=None, data_y=None): @@ -384,18 +386,14 @@ def compute_pvalue(self, data_x=None, data_y=None): self.data_y = data_y self.n = data_x.shape[0] - Z_proposal = Parallel(n_jobs=-1)(delayed(self.partition_data)(self.data) for i in range(self.J)) + Z_proposal = Parallel(n_jobs=-1)(delayed(self.partition_data)() for i in range(self.J)) self.Z_proposal, self.prob_Y = zip(*Z_proposal) block_res = Parallel(n_jobs=-1)(delayed(self.pvalue_onblocks)(self.Z_proposal[i]) for i in range(self.J)) - test_stat, mean_appr, var_appr, log_likelihood = zip(*block_res) - self.prob_Y = np.array(self.prob_Y) - self.prob_weights = np.around(np.exp(log_likelihood-logsumexp(log_likelihood))) + test_stat, null_samples, log_likelihood = zip(*block_res) + self.prob_weights = np.around(np.exp(log_likelihood-logsumexp(log_likelihood)), 6) self.test_stat = np.average(np.array(test_stat), weights = self.prob_weights) - self.mean_appr = np.average(np.array(mean_appr), weights = self.prob_weights) - self.var_appr = np.average(np.array(var_appr), weights = self.prob_weights) - - k_appr, theta_appr = self.get_kappa(self.mean_appr, self.var_appr) - self.pvalue = 1 - stats.gamma.cdf(self.test_stat, k_appr, 0, theta_appr) + self.null_samples = np.average(null_samples, axis = 0, weights = self.prob_weights) + self.pvalue = np.sum(self.null_samples > self.test_stat) / float(self.nullss) return self.pvalue, self.test_stat @@ -410,7 +408,7 @@ def partition_data(self): """ Y_mean = self.data_y.mean(axis=0) Y_sd = 
self.data_y.std(axis=0) - mu_k = np.random.normal(Y_mean, Y_sd, size = (self.K,self.shape[1])) + mu_k = np.random.normal(Y_mean, Y_sd, size = (self.K,self.data_y.shape[1])) sigma_k = np.eye(self.data_y.shape[1]) pi_j = np.random.dirichlet([self.alpha]*self.K) ll = np.tile(np.log(pi_j),(self.n,1)) @@ -425,25 +423,30 @@ def partition_data(self): def pvalue_onblocks(self, Z_proposal): unique_Z_j = np.unique(Z_proposal) test_stat = 0 - prob_weight = 0 - mean_appr = 0 - var_appr = 0 + log_likelihood = 0 + null_samples = np.zeros((1,self.nullss)) for k in unique_Z_j: K_mask = (Z_proposal==k) - X_k = np.copy(self.data[0][K_mask]) - Y_k = np.copy(self.data[1][K_mask]) + X_k = np.copy(self.data_x[K_mask]) + Y_k = np.copy(self.data_y[K_mask]) Kx = self.kernel_matrix(X_k) Ky = self.kernel_matrix(Y_k) - n_block = Kx.shape[0] - - Kxc = Kernel.center_kernel_matrix(Kx) - Kyc = Kernel.center_kernel_matrix(Ky) - test_stat += np.einsum('ij,ji->', Kxc, Kyc) - mean_appr += np.trace(Kxc) * np.trace(Kyc) / n_block - var_appr += 2 * np.sum(Kxc ** 2) * np.sum(Kyc ** 2) / n_block / n_block - var_appr /= unique_Z_j.shape[0] - return (test_stat, mean_appr, var_appr, prob_weight) + + v_stat, Kxc, Kyc = self.HSIC_V_statistic(Kx, Ky) + + null_samples += self.null_sample_spectral(Kxc, Kyc) + test_stat += v_stat + + gpx = GaussianProcessRegressor() + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=Warning) + # P(X|Y) + gpx.fit(X = Y_k, y = X_k) + likelihood = gpx.log_marginal_likelihood_value_ + log_likelihood += likelihood + + return test_stat, null_samples, log_likelihood def kernel_matrix(self, data): """ @@ -478,3 +481,54 @@ def get_kappa(self, mean_appr, var_appr): k_appr = mean_appr ** 2 / var_appr theta_appr = var_appr / mean_appr return k_appr, theta_appr + + def null_sample_spectral(self, Kxc, Kyc): + """ + Simulate data from null distribution + + Parameters + ---------- + Kxc: centralized kernel matrix for data_x (nxn) + Kyc: centralized kernel matrix for data_y (nxn) + + Returns + _________ + null_dstr: samples from the null distribution + + """ + T = Kxc.shape[0] + if T > 1000: + num_eig = int(np.floor(T / 2)) + else: + num_eig = T + lambdax = np.linalg.eigvalsh(Kxc) + lambday = np.linalg.eigvalsh(Kyc) + lambdax = -np.sort(-lambdax) + lambday = -np.sort(-lambday) + lambdax = lambdax[0:num_eig] + lambday = lambday[0:num_eig] + lambda_prod = np.dot(lambdax.reshape(num_eig, 1), lambday.reshape(1, num_eig)).reshape( + (num_eig ** 2, 1)) + lambda_prod = lambda_prod[lambda_prod > lambda_prod.max() * self.eig_thresh] + f_rand = np.random.chisquare(1, (lambda_prod.shape[0], self.nullss)) + null_dstr = lambda_prod.T.dot(f_rand) / T + return null_dstr + + def HSIC_V_statistic(self, Kx, Ky): + """ + Compute V test statistic from kernel matrices Kx and Ky + Parameters + ---------- + Kx: kernel matrix for data_x (nxn) + Ky: kernel matrix for data_y (nxn) + + Returns + _________ + Vstat: HSIC v statistics + Kxc: centralized kernel matrix for data_x (nxn) + Kyc: centralized kernel matrix for data_y (nxn) + """ + Kxc = Kernel.center_kernel_matrix(Kx) + Kyc = Kernel.center_kernel_matrix(Ky) + V_stat = np.einsum('ij,ij->', Kxc, Kyc) + return V_stat, Kxc, Kyc diff --git a/causallearn/utils/cit.py b/causallearn/utils/cit.py index d842281a..be168e40 100644 --- a/causallearn/utils/cit.py +++ b/causallearn/utils/cit.py @@ -201,13 +201,13 @@ class FastKCI(CIT_Base): def __init__(self, data, **kwargs): super().__init__(data, **kwargs) kci_ui_kwargs = {k: v for k, v in kwargs.items() if k in - ['kernelX', 
'kernelY', 'null_ss', 'approx', 'est_width', 'polyd', 'kwidthx', 'kwidthy']} + ['K', 'J', 'alpha']} kci_ci_kwargs = {k: v for k, v in kwargs.items() if k in ['K', 'J', 'alpha', 'use_gp']} self.check_cache_method_consistent( 'kci', hashlib.md5(json.dumps(kci_ci_kwargs, sort_keys=True).encode('utf-8')).hexdigest()) self.assert_input_data_is_valid() - self.kci_ui = KCI_UInd(**kci_ui_kwargs) + self.kci_ui = FastKCI_UInd(**kci_ui_kwargs) self.kci_ci = FastKCI_CInd(**kci_ci_kwargs) def __call__(self, X, Y, condition_set=None): From fc74434ecabe7b246840ec3bf2c922b8531c19ab Mon Sep 17 00:00:00 2001 From: OliverSchacht <65898638+OliverSchacht@users.noreply.github.com> Date: Wed, 18 Sep 2024 13:33:33 +0200 Subject: [PATCH 05/16] add rcit Signed-off-by: Oliver Schacht --- causallearn/utils/RCIT/RCIT.py | 251 +++++++++++++++++++++++++++++ causallearn/utils/RCIT/__init__.py | 0 causallearn/utils/cit.py | 26 +++ setup.py | 3 +- 4 files changed, 279 insertions(+), 1 deletion(-) create mode 100644 causallearn/utils/RCIT/RCIT.py create mode 100644 causallearn/utils/RCIT/__init__.py diff --git a/causallearn/utils/RCIT/RCIT.py b/causallearn/utils/RCIT/RCIT.py new file mode 100644 index 00000000..c9c931cb --- /dev/null +++ b/causallearn/utils/RCIT/RCIT.py @@ -0,0 +1,251 @@ +import numpy as np +import itertools +from scipy.stats import chi2 +from scipy.linalg import pinv +from momentchi2 import hbe, sw, lpb4 +from scipy.spatial.distance import pdist, squareform + + +class RCIT(object): + def __init__(self, approx="lpd4", num_f=100, num_f2=5, rcit=True): + self.approx = approx + self.num_f = num_f + self.num_f2 = num_f2 + self.rcit = rcit + + def compute_pvalue(self, data_x, data_y, data_z): + d = data_z.shape[1] + r = data_x.shape[0] + r1 = 500 if (r > 500) else r + + data_x = (data_x - data_x.mean(axis=0)) / data_x.std(axis=0, ddof=1) + data_y = (data_y - data_y.mean(axis=0)) / data_y.std(axis=0, ddof=1) + data_z = (data_z - data_z.mean(axis=0)) / data_z.std(axis=0, ddof=1) + + if self.rcit: + data_y = np.column_stack((data_y, data_z)) + + sigma = dict() + for key, value in [("x", data_x), ("y", data_y), ("z", data_z)]: + distances = pdist(value[:r1, :], metric='euclidean') + flattened_distances = squareform(distances).ravel() + non_zero_distances = flattened_distances[flattened_distances != 0] + sigma[key] = np.median(non_zero_distances) + + four_z = self.random_fourier_features(data_z, num_f=self.num_f, sigma=sigma["z"]) + four_x = self.random_fourier_features(data_x, num_f=self.num_f2, sigma=sigma["x"]) + four_y = self.random_fourier_features(data_y, num_f=self.num_f2, sigma=sigma["y"]) + + f_x = (four_x - four_x.mean(axis=0)) / four_x.std(axis=0, ddof=1) + f_y = (four_y - four_y.mean(axis=0)) / four_y.std(axis=0, ddof=1) + f_z = (four_z - four_z.mean(axis=0)) / four_z.std(axis=0, ddof=1) + + Cxy = f_x.T @ f_y / (f_x.shape[0] - 1) + + Czz = np.cov(f_z, rowvar=False) + + regularized_Czz = Czz + np.eye(self.num_f) * 1e-10 + L = np.linalg.cholesky(regularized_Czz) + i_Czz = np.linalg.inv(L).T.dot(np.linalg.inv(L)) + + Cxz = f_x.T @ f_z / (f_x.shape[0] - 1) + Czy = f_z.T @ f_y / (f_z.shape[0] - 1) + + z_i_Czz = f_z @ i_Czz + e_x_z = z_i_Czz @ Cxz.T + e_y_z = z_i_Czz @ Czy + + res_x = f_x - e_x_z + res_y = f_y - e_y_z + + if self.num_f2 == 1: + self.approx = "hbe" + + if self.approx == "perm": + Cxy_z = self.matrix_cov(res_x, res_y) + sta = r * np.sum(Cxy_z**2) + + nperm = 1000 + + stas = [] + for _ in range(nperm): + perm = np.random.choice(np.arange(r), size=r, replace=False) + Cxy = 
self.matrix_cov(res_x[perm, ], res_y) + sta_p = r * np.sum(Cxy**2) + stas.append(sta_p) + + p = 1 - (np.sum(sta >= stas) / len(stas)) + + else: + Cxy_z = Cxy - Cxz @ i_Czz @ Czy + sta = r * np.sum(Cxy_z**2) + + d = list(itertools.product(range(f_x.shape[1]), range(f_y.shape[1]))) + res = np.array([res_x[:, idx_x] * res_y[:, idx_y] for idx_x, idx_y in d]).T + Cov = 1/r * res.T @ res + + if self.approx == "chi2": + i_Cov = pinv(Cov) + + sta = r * (np.dot(Cxy_z.flatten(), np.dot(i_Cov, Cxy_z.flatten()))) + p = 1 - chi2.cdf(sta, Cxy_z.size) + + else: + eigenvalues, eigenvectors = np.linalg.eigh(Cov) + eig_d = eigenvalues[eigenvalues > 0] + + if self.approx == "gamma": + p = 1 - sw(eig_d, sta) + + elif self.approx == "hbe": + p = 1 - hbe(eig_d, sta) + + elif self.approx == "lpd4": + try: + p = 1 - lpb4(eig_d, sta) + except Exception: + p = 1 - hbe(eig_d, sta) + if np.isnan(p): + p = 1 - hbe(eig_d, sta) + + if (p < 0): + p = 0 + + return p, sta + + def random_fourier_features(self, x, w=None, b=None, num_f=None, sigma=None): + + if num_f is None: + num_f = 25 + + r = x.shape[0] + c = x.shape[1] + + if ((sigma == 0) | (sigma is None)): + sigma = 1 + + if w is None: + w = (1/sigma) * np.random.normal(size=(num_f, c)) + b = np.tile(2*np.pi*np.random.uniform(size=(num_f, 1)), (1, r)) + + feat = np.sqrt(2) * (np.cos(w[0:num_f, 0:c] @ x.T + b[0:num_f, :])).T + + return feat + + def matrix_cov(mat_a, mat_b): + n_obs = mat_a.shape[0] + + assert mat_a.shape == mat_b.shape + mat_a = mat_a - mat_a.mean(axis=0) + mat_b = mat_b - mat_b.mean(axis=0) + + mat_cov = mat_a.T @ mat_b / (n_obs - 1) + + return mat_cov + + +class RIT(object): + def __init__(self, approx="lpd4"): + self.approx = approx + + def compute_pvalue(self, data_x, data_y): + r = data_x.shape[0] + r1 = 500 if (r > 500) else r + + data_x = (data_x - data_x.mean(axis=0)) / data_x.std(axis=0, ddof=1) + data_y = (data_y - data_y.mean(axis=0)) / data_y.std(axis=0, ddof=1) + + sigma = dict() + for key, value in [("x", data_x), ("y", data_y)]: + distances = pdist(value[:r1, :], metric='euclidean') + flattened_distances = squareform(distances).ravel() + non_zero_distances = flattened_distances[flattened_distances != 0] + sigma[key] = np.median(non_zero_distances) + + four_x = self.random_fourier_features(data_x, num_f=5, sigma=sigma["x"]) + four_y = self.random_fourier_features(data_y, num_f=5, sigma=sigma["y"]) + + f_x = (four_x - four_x.mean(axis=0)) / four_x.std(axis=0, ddof=1) + f_y = (four_y - four_y.mean(axis=0)) / four_y.std(axis=0, ddof=1) + + Cxy = self.matrix_cov(f_x, f_y) + sta = r * np.sum(Cxy**2) + + if self.approx == "perm": + nperm = 1000 + + stas = [] + for _ in range(nperm): + perm = np.random.choice(np.arange(r), size=r, replace=False) + Cxy = self.matrix_cov(f_x[perm, ], f_y) + sta_p = r * np.sum(Cxy**2) + stas.append(sta_p) + + p = 1 - (np.sum(sta >= stas) / len(stas)) + + else: + res_x = f_x - np.meshgrid(f_x.mean(axis=0), (r, 1)) + res_y = f_y - np.meshgrid(f_y.mean(axis=0), (r, 1)) + + d = list(itertools.product(range(f_x.shape[1]), range(f_y.shape[1]))) + res = np.array([res_x[:, idx_x] * res_y[:, idx_y] for idx_x, idx_y in d]).T + Cov = 1/r * res.T @ res + + if self.approx == "chi2": + i_Cov = pinv(Cov) + + sta = r * (np.dot(Cxy.flatten(), np.dot(i_Cov, Cxy.flatten()))) + p = 1 - chi2.cdf(sta, Cxy.size) + + else: + eigenvalues, eigenvectors = np.linalg.eigh(Cov) + eig_d = eigenvalues[eigenvalues > 0] + + if self.approx == "gamma": + p = 1 - sw(eig_d, sta) + + elif self.approx == "hbe": + p = 1 - hbe(eig_d, sta) + + elif 
self.approx == "lpd4": + try: + p = 1 - lpb4(eig_d, sta) + except Exception: + p = 1 - hbe(eig_d, sta) + if np.isnan(p): + p = 1 - hbe(eig_d, sta) + + if (p < 0): + p = 0 + + return p, sta + + def random_fourier_features(self, x, w=None, b=None, num_f=None, sigma=None): + + if num_f is None: + num_f = 25 + + r = x.shape[0] + c = x.shape[1] + + if ((sigma == 0) | (sigma is None)): + sigma = 1 + + if w is None: + w = (1/sigma) * np.random.normal(size=(num_f, c)) + b = np.tile(2*np.pi*np.random.uniform(size=(num_f, 1)), (1, r)) + + feat = np.sqrt(2) * (np.cos(w[0:num_f, 0:c] @ x.T + b[0:num_f, :])).T + + return feat + + def matrix_cov(mat_a, mat_b): + n_obs = mat_a.shape[0] + + assert mat_a.shape == mat_b.shape + mat_a = mat_a - mat_a.mean(axis=0) + mat_b = mat_b - mat_b.mean(axis=0) + + mat_cov = mat_a.T @ mat_b / (n_obs - 1) + + return mat_cov diff --git a/causallearn/utils/RCIT/__init__.py b/causallearn/utils/RCIT/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/causallearn/utils/cit.py b/causallearn/utils/cit.py index be168e40..c8858a0b 100644 --- a/causallearn/utils/cit.py +++ b/causallearn/utils/cit.py @@ -6,6 +6,7 @@ from causallearn.utils.KCI.KCI import KCI_CInd, KCI_UInd from causallearn.utils.FastKCI.FastKCI import FastKCI_CInd, FastKCI_UInd +from causallearn.utils.RCIT.RCIT import RCIT, RIT from causallearn.utils.PCUtils import Helper CONST_BINCOUNT_UNIQUE_THRESHOLD = 1e5 @@ -14,6 +15,7 @@ mv_fisherz = "mv_fisherz" mc_fisherz = "mc_fisherz" kci = "kci" +rcit = "rcit" fastkci = "fastkci" chisq = "chisq" gsq = "gsq" @@ -36,6 +38,8 @@ def CIT(data, method='fisherz', **kwargs): return KCI(data, **kwargs) elif method == fastkci: return FastKCI(data, **kwargs) + elif method == rcit: + return RCIT(data, **kwargs) elif method in [chisq, gsq]: return Chisq_or_Gsq(data, method_name=method, **kwargs) elif method == mv_fisherz: @@ -218,6 +222,28 @@ def __call__(self, X, Y, condition_set=None): self.kci_ci.compute_pvalue(self.data[:, Xs], self.data[:, Ys], self.data[:, condition_set])[0] self.pvalue_cache[cache_key] = p return p + +class RCIT(CIT_Base): + def __init__(self, data, **kwargs): + super().__init__(data, **kwargs) + rit_kwargs = {k: v for k, v in kwargs.items() if k in + ['approx']} + rcit_kwargs = {k: v for k, v in kwargs.items() if k in + ['approx', 'num_f', 'num_f2', 'rcit']} + self.check_cache_method_consistent( + 'kci', hashlib.md5(json.dumps(rcit_kwargs, sort_keys=True).encode('utf-8')).hexdigest()) + self.assert_input_data_is_valid() + self.rit = RIT(**rit_kwargs) + self.rcit = RCIT(**rcit_kwargs) + + def __call__(self, X, Y, condition_set=None): + # Kernel-based conditional independence test. 
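+ # Dispatch sketch: an empty condition_set routes to the unconditional RIT test (self.rit),
+ # otherwise the randomized conditional test (self.rcit) is used; p-values are cached
+ # under cache_key, mirroring the KCI and FastKCI wrappers above.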
+ Xs, Ys, condition_set, cache_key = self.get_formatted_XYZ_and_cachekey(X, Y, condition_set) + if cache_key in self.pvalue_cache: return self.pvalue_cache[cache_key] + p = self.rit.compute_pvalue(self.data[:, Xs], self.data[:, Ys])[0] if len(condition_set) == 0 else \ + self.rcit.compute_pvalue(self.data[:, Xs], self.data[:, Ys], self.data[:, condition_set])[0] + self.pvalue_cache[cache_key] = p + return p class Chisq_or_Gsq(CIT_Base): def __init__(self, data, method_name, **kwargs): diff --git a/setup.py b/setup.py index f132a020..f40a536d 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,8 @@ 'matplotlib', 'networkx', 'pydot', - 'tqdm' + 'tqdm', + 'momentchi2' ], url='https://github.com/py-why/causal-learn', packages=setuptools.find_packages(), From cd61fa424cef3b2f337e9dc37de610c7a13e7021 Mon Sep 17 00:00:00 2001 From: OliverSchacht <65898638+OliverSchacht@users.noreply.github.com> Date: Thu, 19 Sep 2024 10:15:47 +0200 Subject: [PATCH 06/16] redo naming of rcit class Signed-off-by: Oliver Schacht --- causallearn/utils/cit.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/causallearn/utils/cit.py b/causallearn/utils/cit.py index c8858a0b..2d664e9b 100644 --- a/causallearn/utils/cit.py +++ b/causallearn/utils/cit.py @@ -6,7 +6,8 @@ from causallearn.utils.KCI.KCI import KCI_CInd, KCI_UInd from causallearn.utils.FastKCI.FastKCI import FastKCI_CInd, FastKCI_UInd -from causallearn.utils.RCIT.RCIT import RCIT, RIT +from causallearn.utils.RCIT.RCIT import RCIT as RCIT_CInd +from causallearn.utils.RCIT.RCIT import RIT as RCIT_UInd from causallearn.utils.PCUtils import Helper CONST_BINCOUNT_UNIQUE_THRESHOLD = 1e5 @@ -222,7 +223,7 @@ def __call__(self, X, Y, condition_set=None): self.kci_ci.compute_pvalue(self.data[:, Xs], self.data[:, Ys], self.data[:, condition_set])[0] self.pvalue_cache[cache_key] = p return p - + class RCIT(CIT_Base): def __init__(self, data, **kwargs): super().__init__(data, **kwargs) @@ -233,8 +234,8 @@ def __init__(self, data, **kwargs): self.check_cache_method_consistent( 'kci', hashlib.md5(json.dumps(rcit_kwargs, sort_keys=True).encode('utf-8')).hexdigest()) self.assert_input_data_is_valid() - self.rit = RIT(**rit_kwargs) - self.rcit = RCIT(**rcit_kwargs) + self.rit = RCIT_UInd(**rit_kwargs) + self.rcit = RCIT_CInd(**rcit_kwargs) def __call__(self, X, Y, condition_set=None): # Kernel-based conditional independence test. 
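A minimal usage sketch of the two new tests through the CIT dispatcher (the toy data, variable indices, and parameter values below are illustrative; the keyword arguments are the ones filtered by the FastKCI and RCIT wrapper classes above):

import numpy as np
from causallearn.utils.cit import CIT

np.random.seed(0)
data = np.random.randn(300, 3)  # toy sample with columns X0, X1, X2

# FastKCI: K Gaussian blocks, J random partitions, Dirichlet concentration alpha
fastkci_test = CIT(data, "fastkci", K=10, J=8, alpha=500)
p_cond = fastkci_test(0, 1, (2,))   # H0: X0 independent of X1 given X2
p_marg = fastkci_test(0, 1)         # empty conditioning set -> unconditional branch

# RCIT: random Fourier features with the LPB4 approximation of the null distribution
rcit_test = CIT(data, "rcit", approx="lpd4", num_f=100, num_f2=5)
p_rcit = rcit_test(0, 1, (2,))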
From 173b91330a9c7e5f56805655d6daa48e6aa7a50c Mon Sep 17 00:00:00 2001 From: OliverSchacht <65898638+OliverSchacht@users.noreply.github.com> Date: Thu, 19 Sep 2024 11:11:44 +0200 Subject: [PATCH 07/16] debugging Signed-off-by: Oliver Schacht --- causallearn/utils/RCIT/RCIT.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/causallearn/utils/RCIT/RCIT.py b/causallearn/utils/RCIT/RCIT.py index c9c931cb..e1bb8dbc 100644 --- a/causallearn/utils/RCIT/RCIT.py +++ b/causallearn/utils/RCIT/RCIT.py @@ -132,7 +132,7 @@ def random_fourier_features(self, x, w=None, b=None, num_f=None, sigma=None): return feat - def matrix_cov(mat_a, mat_b): + def matrix_cov(self, mat_a, mat_b): n_obs = mat_a.shape[0] assert mat_a.shape == mat_b.shape @@ -184,8 +184,8 @@ def compute_pvalue(self, data_x, data_y): p = 1 - (np.sum(sta >= stas) / len(stas)) else: - res_x = f_x - np.meshgrid(f_x.mean(axis=0), (r, 1)) - res_y = f_y - np.meshgrid(f_y.mean(axis=0), (r, 1)) + res_x = f_x - f_x.mean(axis=0) + res_y = f_y - f_y.mean(axis=0) d = list(itertools.product(range(f_x.shape[1]), range(f_y.shape[1]))) res = np.array([res_x[:, idx_x] * res_y[:, idx_y] for idx_x, idx_y in d]).T @@ -239,7 +239,7 @@ def random_fourier_features(self, x, w=None, b=None, num_f=None, sigma=None): return feat - def matrix_cov(mat_a, mat_b): + def matrix_cov(self, mat_a, mat_b): n_obs = mat_a.shape[0] assert mat_a.shape == mat_b.shape From e83e4f05a948dde022b3f414c281476b610b1618 Mon Sep 17 00:00:00 2001 From: OliverSchacht <65898638+OliverSchacht@users.noreply.github.com> Date: Wed, 16 Oct 2024 15:09:09 +0200 Subject: [PATCH 08/16] add docstrings Signed-off-by: Oliver Schacht --- causallearn/utils/RCIT/RCIT.py | 156 ++++++++++++++++++++++++++++++++- 1 file changed, 154 insertions(+), 2 deletions(-) diff --git a/causallearn/utils/RCIT/RCIT.py b/causallearn/utils/RCIT/RCIT.py index e1bb8dbc..11608ed0 100644 --- a/causallearn/utils/RCIT/RCIT.py +++ b/causallearn/utils/RCIT/RCIT.py @@ -7,13 +7,56 @@ class RCIT(object): + """ + Python implementation of Randomized Conditional Independence Test (RCIT) test. + The original R implementation can be found at https://github.com/ericstrobl/RCIT/tree/master + + References + ---------- + [1] Strobl, E. V., Zhang, K., and Visweswaran, S. (2019). "Approximate kernel-based conditional + independence tests for fast non-parametric causal discovery." Journal of Causal Inference, 7(1), 20180017. + """ def __init__(self, approx="lpd4", num_f=100, num_f2=5, rcit=True): + """ + Initialize the RCIT object. + + Parameters + ---------- + approx : str + Method for approximating the null distribution. + - "lpd4" for the Lindsay-Pilla-Basak method + - "hbe" for the Hall-Buckley-Eagleson method + - "gamma" for the Satterthwaite-Welch method + - "chi2" for a normalized chi-squared statistic + - "perm" for permutation testing + Default is "lpd4". + num_f : int + Number of features for conditioning set. Default is 25. + num_f2 : int + Number of features for non-conditioning sets. Default is 5. + rcit : bool + Whether to use RCIT or RCoT. Default is True. + """ self.approx = approx self.num_f = num_f self.num_f2 = num_f2 self.rcit = rcit def compute_pvalue(self, data_x, data_y, data_z): + """ + Compute the p value and return it together with the test statistic. 
+ + Parameters + ---------- + data_x: input data for x (nxd1 array) + data_y: input data for y (nxd2 array) + data_z: input data for z (nxd3 array) + + Returns + ------- + p: p value + sta: test statistic + """ d = data_z.shape[1] r = data_x.shape[0] r1 = 500 if (r > 500) else r @@ -114,7 +157,27 @@ def compute_pvalue(self, data_x, data_y, data_z): return p, sta def random_fourier_features(self, x, w=None, b=None, num_f=None, sigma=None): - + """ + Generate random Fourier features. + + Parameters + ---------- + x : np.ndarray + Random variable x. + w : np.ndarray + RRandom coefficients. + b : np.ndarray + Random offsets. + num_f : int + Number of random Fourier features. + sigma : float + Smooth parameter of RBF kernel. + + Returns + ------- + feat : np.ndarray + Random Fourier features. + """ if num_f is None: num_f = 25 @@ -133,6 +196,22 @@ def random_fourier_features(self, x, w=None, b=None, num_f=None, sigma=None): return feat def matrix_cov(self, mat_a, mat_b): + """ + Compute the covariance matrix between two matrices. + Equivalent to ``cov()`` between two matrices in R. + + Parameters + ---------- + mat_a : np.ndarray + First data matrix. + mat_b : np.ndarray + Second data matrix. + + Returns + ------- + mat_cov : np.ndarray + Covariance matrix. + """ n_obs = mat_a.shape[0] assert mat_a.shape == mat_b.shape @@ -145,10 +224,47 @@ def matrix_cov(self, mat_a, mat_b): class RIT(object): + """ + Python implementation of Randomized Independence Test (RIT) test. + The original R implementation can be found at https://github.com/ericstrobl/RCIT/tree/master + + References + ---------- + [1] Strobl, E. V., Zhang, K., and Visweswaran, S. (2019). "Approximate kernel-based conditional + independence tests for fast non-parametric causal discovery." Journal of Causal Inference, 7(1), 20180017. + """ def __init__(self, approx="lpd4"): + """ + Initialize the RIT object. + + Parameters + ---------- + approx : str + Method for approximating the null distribution. + - "lpd4" for the Lindsay-Pilla-Basak method + - "hbe" for the Hall-Buckley-Eagleson method + - "gamma" for the Satterthwaite-Welch method + - "chi2" for a normalized chi-squared statistic + - "perm" for permutation testing + Default is "lpd4". + """ self.approx = approx def compute_pvalue(self, data_x, data_y): + """ + Compute the p value and return it together with the test statistic. + + Parameters + ---------- + data_x: input data for x (nxd1 array) + data_y: input data for y (nxd2 array) + data_z: input data for z (nxd3 array) + + Returns + ------- + p: p value + sta: test statistic + """ r = data_x.shape[0] r1 = 500 if (r > 500) else r @@ -221,7 +337,27 @@ def compute_pvalue(self, data_x, data_y): return p, sta def random_fourier_features(self, x, w=None, b=None, num_f=None, sigma=None): - + """ + Generate random Fourier features. + + Parameters + ---------- + x : np.ndarray + Random variable x. + w : np.ndarray + RRandom coefficients. + b : np.ndarray + Random offsets. + num_f : int + Number of random Fourier features. + sigma : float + Smooth parameter of RBF kernel. + + Returns + ------- + feat : np.ndarray + Random Fourier features. + """ if num_f is None: num_f = 25 @@ -240,6 +376,22 @@ def random_fourier_features(self, x, w=None, b=None, num_f=None, sigma=None): return feat def matrix_cov(self, mat_a, mat_b): + """ + Compute the covariance matrix between two matrices. + Equivalent to ``cov()`` between two matrices in R. + + Parameters + ---------- + mat_a : np.ndarray + First data matrix. 
+ mat_b : np.ndarray + Second data matrix. + + Returns + ------- + mat_cov : np.ndarray + Covariance matrix. + """ n_obs = mat_a.shape[0] assert mat_a.shape == mat_b.shape From fba553f59ba927c045631537dad0cf9e460cc248 Mon Sep 17 00:00:00 2001 From: OliverSchacht <65898638+OliverSchacht@users.noreply.github.com> Date: Wed, 16 Oct 2024 15:30:57 +0200 Subject: [PATCH 09/16] linting Signed-off-by: Oliver Schacht --- causallearn/utils/FastKCI/FastKCI.py | 110 +++++++++++++-------------- 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/causallearn/utils/FastKCI/FastKCI.py b/causallearn/utils/FastKCI/FastKCI.py index 5b614dd9..407977dd 100644 --- a/causallearn/utils/FastKCI/FastKCI.py +++ b/causallearn/utils/FastKCI/FastKCI.py @@ -9,19 +9,21 @@ from sklearn.gaussian_process.kernels import ConstantKernel, WhiteKernel, RBF import warnings + class FastKCI_CInd(object): """ - Python implementation of as speed-up version of the Kernel-based Conditional Independence (KCI) test. Unconditional version. + Python implementation of as speed-up version of the Kernel-based Conditional Independence (KCI) test. + Unconditional version. References ---------- [1] K. Zhang, J. Peters, D. Janzing, and B. Schölkopf, - "A kernel-based conditional independence test and application in causal discovery," In UAI 2011. + "A kernel-based conditional independence test and application in causal discovery", In UAI 2011. [2] M. Zhang, and S. Williamson, - "Embarrassingly Parallel Inference for Gaussian Processes" In JMLR 20 (2019) + "Embarrassingly Parallel Inference for Gaussian Processes", In JMLR 20 (2019) - """ - def __init__(self, K=10, J=8, alpha=500, epsilon=1e-3, eig_thresh = 1e-6, trimming_thresh = 1e-3, use_gp=False): + """ + def __init__(self, K=10, J=8, alpha=500, epsilon=1e-3, eig_thresh=1e-6, trimming_thresh=1e-3, use_gp=False): """ Initialize the FastKCI_CInd object. @@ -33,7 +35,7 @@ def __init__(self, K=10, J=8, alpha=500, epsilon=1e-3, eig_thresh = 1e-6, trimmi epsilon: Penalty for the matrix ridge regression. eig_threshold: Threshold for Eigenvalues. 
use_gp: Whether to use Gaussian Process Regression to determine the kernel widths - """ + """ self.K = K self.J = J self.alpha = alpha @@ -42,7 +44,6 @@ def __init__(self, K=10, J=8, alpha=500, epsilon=1e-3, eig_thresh = 1e-6, trimmi self.trimming_thresh = trimming_thresh self.use_gp = use_gp self.nullss = 5000 - # TODO: Adjust to causal-learn API def compute_pvalue(self, data_x=None, data_y=None, data_z=None): """ @@ -67,21 +68,21 @@ def compute_pvalue(self, data_x=None, data_y=None, data_z=None): self.Z_proposal, self.prob_Z = zip(*Z_proposal) block_res = Parallel(n_jobs=-1)(delayed(self.pvalue_onblocks)(self.Z_proposal[i]) for i in range(self.J)) test_stat, null_samples, log_likelihood = zip(*block_res) - + log_likelihood = np.array(log_likelihood) self.prob_Z += log_likelihood - self.prob_Z = np.around(np.exp(self.prob_Z-logsumexp(self.prob_Z)), 6) # experimental, not used + self.prob_Z = np.around(np.exp(self.prob_Z-logsumexp(self.prob_Z)), 6) # experimental, not used self.all_null_samples = np.vstack(null_samples) self.all_p = np.array([np.sum(self.all_null_samples[i,] > test_stat[i]) / float(self.nullss) for i in range(self.J)]) self.prob_weights = np.around(np.exp(log_likelihood-logsumexp(log_likelihood)), 6) self.all_test_stats = np.array(test_stat) - self.test_stat = np.average(np.array(test_stat), weights = self.prob_weights) - self.null_samples = np.average(null_samples, axis = 0, weights = self.prob_weights) + self.test_stat = np.average(np.array(test_stat), weights=self.prob_weights) + self.null_samples = np.average(null_samples, axis=0, weights=self.prob_weights) # experimental, not used - self.pvalue_alt = np.sum(np.average(null_samples, axis = 0, weights = self.prob_Z) > np.average(np.array(test_stat), weights = self.prob_Z)) / float(self.nullss) + self.pvalue_alt = np.sum(np.average(null_samples, axis=0, weights=self.prob_Z) > np.average(np.array(test_stat), weights=self.prob_Z)) / float(self.nullss) self.pvalue = np.sum(self.null_samples > self.test_stat) / float(self.nullss) - return self.pvalue, self.test_stat + return self.pvalue, self.test_stat def partition_data(self): """ @@ -94,18 +95,18 @@ def partition_data(self): """ Z_mean = self.data_z.mean(axis=0) Z_sd = self.data_z.std(axis=0) - mu_k = np.random.normal(Z_mean, Z_sd, size = (self.K,self.data_z.shape[1])) + mu_k = np.random.normal(Z_mean, Z_sd, size=(self.K, self.data_z.shape[1])) sigma_k = np.eye(self.data_z.shape[1]) pi_j = np.random.dirichlet([self.alpha]*self.K) - ll = np.tile(np.log(pi_j),(self.n,1)) + ll = np.tile(np.log(pi_j), (self.n, 1)) for k in range(self.K): - ll[:,k] += stats.multivariate_normal.logpdf(self.data_z, mu_k[k,:], cov=sigma_k, allow_singular=True) - Z = np.array([ np.random.multinomial(1,np.exp(ll[n,:]-logsumexp(ll[n,:]))).argmax() for n in range(self.n)]) + ll[:, k] += stats.multivariate_normal.logpdf(self.data_z, mu_k[k, :], cov=sigma_k, allow_singular=True) + Z = np.array([np.random.multinomial(1, np.exp(ll[n, :]-logsumexp(ll[n, :]))).argmax() for n in range(self.n)]) le = LabelEncoder() Z = le.fit_transform(Z) - prob_Z = np.take_along_axis(ll, Z[:, None], axis=1).sum() # experimental, might be removed + prob_Z = np.take_along_axis(ll, Z[:, None], axis=1).sum() # experimental, might be removed return Z, prob_Z - + def pvalue_onblocks(self, Z_proposal): """ Calculate p value on given partitions of the data. 
@@ -113,7 +114,7 @@ def pvalue_onblocks(self, Z_proposal): Parameters ---------- Z_proposal: partition of the data into K clusters (nxd1 array) - + Returns _________ test_stat: test statistic (scalar) @@ -123,13 +124,13 @@ def pvalue_onblocks(self, Z_proposal): unique_Z_j = np.unique(Z_proposal) test_stat = 0 log_likelihood = 0 - null_samples = np.zeros((1,self.nullss)) + null_samples = np.zeros((1, self.nullss)) for k in unique_Z_j: - K_mask = (Z_proposal==k) + K_mask = (Z_proposal == k) X_k = np.copy(self.data[0][K_mask]) Y_k = np.copy(self.data[1][K_mask]) Z_k = np.copy(self.data_z[K_mask]) - if (Z_k.shape[0]<6): # small blocks cause problems in GP, experimental + if (Z_k.shape[0] < 6): # small blocks cause problems in GP, experimental continue Kx, Ky, Kzx, Kzy, epsilon_x, epsilon_y, likelihood_x, likelihood_y = self.kernel_matrix(X_k, Y_k, Z_k) KxR, Rzx = Kernel.center_kernel_matrix_regression(Kx, Kzx, epsilon_x) @@ -143,7 +144,6 @@ def pvalue_onblocks(self, Z_proposal): log_likelihood += likelihood_x + likelihood_y return test_stat, null_samples, log_likelihood - def kernel_matrix(self, data_x, data_y, data_z): """ Calculates the Gaussian Kernel for given data inputs as well as the shared kernel. @@ -157,10 +157,10 @@ def kernel_matrix(self, data_x, data_y, data_z): data_x = stats.zscore(data_x, ddof=1, axis=0) data_x[np.isnan(data_x)] = 0. - + data_y = stats.zscore(data_y, ddof=1, axis=0) data_y[np.isnan(data_y)] = 0. - + data_z = stats.zscore(data_z, ddof=1, axis=0) data_z[np.isnan(data_z)] = 0. @@ -189,23 +189,23 @@ def kernel_matrix(self, data_x, data_y, data_z): with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=Warning) # P(X|Z) - gpx.fit(X = data_z, y = data_x) + gpx.fit(X=data_z, y=data_x) likelihood_x = gpx.log_marginal_likelihood_value_ gpy = GaussianProcessRegressor() with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=Warning) # P(Y|X,Z) - gpy.fit(X = np.c_[data_x,data_z], y=data_y) + gpy.fit(X=np.c_[data_x, data_z], y=data_y) likelihood_y = gpy.log_marginal_likelihood_value_ - else: + else: n, Dz = data_z.shape - + widthz = np.sqrt(1.0 / (kernelX.width * data_x.shape[1])) # Instantiate a Gaussian Process model for x wx, vx = eigh(Kx) - topkx = int(np.max([np.min([400, np.floor(n / 4)]), np.min([n+1,8])])) + topkx = int(np.max([np.min([400, np.floor(n / 4)]), np.min([n+1, 8])])) idx = np.argsort(-wx) wx = wx[idx] vx = vx[:, idx] @@ -228,7 +228,7 @@ def kernel_matrix(self, data_x, data_y, data_z): # Instantiate a Gaussian Process model for y wy, vy = eigh(Ky) - topky = int(np.max([np.min([400, np.floor(n / 4)]), np.min([n+1,8])])) + topky = int(np.max([np.min([400, np.floor(n / 4)]), np.min([n+1, 8])])) idy = np.argsort(-wy) wy = wy[idy] vy = vy[:, idy] @@ -297,7 +297,7 @@ def get_uuprod(self, Kx, Ky): uu_prod = uu.T.dot(uu) return uu_prod, size_u - + def get_kappa(self, mean_appr, var_appr): """ Get parameters for the approximated gamma distribution @@ -333,11 +333,12 @@ def null_sample_spectral(self, uu_prod, size_u, T): eig_uu = -np.sort(-eig_uu) eig_uu = eig_uu[0:np.min((T, size_u))] eig_uu = eig_uu[eig_uu > np.max(eig_uu) * self.eig_thresh] - + f_rand = np.random.chisquare(1, (eig_uu.shape[0], self.nullss)) null_dstr = eig_uu.T.dot(f_rand) return null_dstr + class FastKCI_UInd(object): """ Python implementation of as speed-up version of the Kernel-based Conditional Independence (KCI) test. Unconditional version. 
@@ -348,8 +349,8 @@ class FastKCI_UInd(object): "A kernel-based conditional independence test and application in causal discovery," In UAI 2011. [2] M. Zhang, and S. Williamson, "Embarrassingly Parallel Inference for Gaussian Processes" In JMLR 20 (2019) - """ - def __init__(self, K=10, J=8, alpha=500, trimming_thresh = 1e-3): + """ + def __init__(self, K=10, J=8, alpha=500, trimming_thresh=1e-3): """ Construct the FastKCI_UInd model. @@ -359,14 +360,13 @@ def __init__(self, K=10, J=8, alpha=500, trimming_thresh = 1e-3): J: Number of independent repittitions. alpha: Parameter for the Dirichlet distribution. trimming_thresh: Threshold for trimming the propensity weights. - """ + """ self.K = K self.J = J self.alpha = alpha self.trimming_thresh = trimming_thresh self.nullss = 5000 self.eig_thresh = 1e-5 - # TODO: Adjust to causal-learn API def compute_pvalue(self, data_x=None, data_y=None): """ @@ -385,17 +385,17 @@ def compute_pvalue(self, data_x=None, data_y=None): self.data_x = data_x self.data_y = data_y self.n = data_x.shape[0] - + Z_proposal = Parallel(n_jobs=-1)(delayed(self.partition_data)() for i in range(self.J)) self.Z_proposal, self.prob_Y = zip(*Z_proposal) block_res = Parallel(n_jobs=-1)(delayed(self.pvalue_onblocks)(self.Z_proposal[i]) for i in range(self.J)) test_stat, null_samples, log_likelihood = zip(*block_res) self.prob_weights = np.around(np.exp(log_likelihood-logsumexp(log_likelihood)), 6) - self.test_stat = np.average(np.array(test_stat), weights = self.prob_weights) - self.null_samples = np.average(null_samples, axis = 0, weights = self.prob_weights) + self.test_stat = np.average(np.array(test_stat), weights=self.prob_weights) + self.null_samples = np.average(null_samples, axis=0, weights=self.prob_weights) self.pvalue = np.sum(self.null_samples > self.test_stat) / float(self.nullss) - return self.pvalue, self.test_stat + return self.pvalue, self.test_stat def partition_data(self): """ @@ -408,25 +408,25 @@ def partition_data(self): """ Y_mean = self.data_y.mean(axis=0) Y_sd = self.data_y.std(axis=0) - mu_k = np.random.normal(Y_mean, Y_sd, size = (self.K,self.data_y.shape[1])) + mu_k = np.random.normal(Y_mean, Y_sd, size=(self.K, self.data_y.shape[1])) sigma_k = np.eye(self.data_y.shape[1]) pi_j = np.random.dirichlet([self.alpha]*self.K) - ll = np.tile(np.log(pi_j),(self.n,1)) + ll = np.tile(np.log(pi_j), (self.n, 1)) for k in range(self.K): - ll[:,k] += stats.multivariate_normal.logpdf(self.data_y, mu_k[k,:], cov=sigma_k, allow_singular=True) - Z = np.array([ np.random.multinomial(1,np.exp(ll[n,:]-logsumexp(ll[n,:]))).argmax() for n in range(self.n)]) + ll[:, k] += stats.multivariate_normal.logpdf(self.data_y, mu_k[k, :], cov=sigma_k, allow_singular=True) + Z = np.array([np.random.multinomial(1, np.exp(ll[n, :]-logsumexp(ll[n, :]))).argmax() for n in range(self.n)]) prop_Y = np.take_along_axis(ll, Z[:, None], axis=1).sum() le = LabelEncoder() Z = le.fit_transform(Z) return (Z, prop_Y) - + def pvalue_onblocks(self, Z_proposal): unique_Z_j = np.unique(Z_proposal) test_stat = 0 log_likelihood = 0 - null_samples = np.zeros((1,self.nullss)) + null_samples = np.zeros((1, self.nullss)) for k in unique_Z_j: - K_mask = (Z_proposal==k) + K_mask = (Z_proposal == k) X_k = np.copy(self.data_x[K_mask]) Y_k = np.copy(self.data_y[K_mask]) @@ -442,12 +442,12 @@ def pvalue_onblocks(self, Z_proposal): with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=Warning) # P(X|Y) - gpx.fit(X = Y_k, y = X_k) + gpx.fit(X=Y_k, y=X_k) likelihood = 
gpx.log_marginal_likelihood_value_ log_likelihood += likelihood - + return test_stat, null_samples, log_likelihood - + def kernel_matrix(self, data): """ Calculates the Gaussian Kernel for given data inputs. @@ -460,12 +460,12 @@ def kernel_matrix(self, data): kernel_obj.set_width_empirical_hsic(data) data = stats.zscore(data, ddof=1, axis=0) - data[np.isnan(data)] = 0. + data[np.isnan(data)] = 0. K = kernel_obj.kernel(data) return K - + def get_kappa(self, mean_appr, var_appr): """ Get parameters for the approximated gamma distribution @@ -513,7 +513,7 @@ def null_sample_spectral(self, Kxc, Kyc): f_rand = np.random.chisquare(1, (lambda_prod.shape[0], self.nullss)) null_dstr = lambda_prod.T.dot(f_rand) / T return null_dstr - + def HSIC_V_statistic(self, Kx, Ky): """ Compute V test statistic from kernel matrices Kx and Ky From 96518245d8f0e13b884e4a81e56b0a5031a63ae4 Mon Sep 17 00:00:00 2001 From: OliverSchacht <65898638+OliverSchacht@users.noreply.github.com> Date: Thu, 17 Oct 2024 13:59:32 +0200 Subject: [PATCH 10/16] edit docstring Signed-off-by: Oliver Schacht --- causallearn/utils/cit.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/causallearn/utils/cit.py b/causallearn/utils/cit.py index 2d664e9b..fbc95e95 100644 --- a/causallearn/utils/cit.py +++ b/causallearn/utils/cit.py @@ -28,8 +28,8 @@ def CIT(data, method='fisherz', **kwargs): Parameters ---------- data: numpy.ndarray of shape (n_samples, n_features) - method: str, in ["fisherz", "mv_fisherz", "mc_fisherz", "kci", "chisq", "gsq"] - kwargs: placeholder for future arguments, or for KCI specific arguments now + method: str, in ["fisherz", "mv_fisherz", "mc_fisherz", "kci", "rcit", "fastkci", "chisq", "gsq"] + kwargs: placeholder for future arguments, or for KCI, FastKCI or RCIT specific arguments now TODO: utimately kwargs should be replaced by explicit named parameters. check https://github.com/cmu-phil/causal-learn/pull/62#discussion_r927239028 ''' @@ -52,6 +52,7 @@ def CIT(data, method='fisherz', **kwargs): else: raise ValueError("Unknown method: {}".format(method)) + class CIT_Base(object): # Base class for CIT, contains basic operations for input check and caching, etc. def __init__(self, data, cache_path=None, **kwargs): From 3d159b6d5d4b6edbd3b2aae4385d722c7b1464ac Mon Sep 17 00:00:00 2001 From: OliverSchacht <65898638+OliverSchacht@users.noreply.github.com> Date: Thu, 17 Oct 2024 14:00:21 +0200 Subject: [PATCH 11/16] remove experimental stuff Signed-off-by: Oliver Schacht --- causallearn/utils/FastKCI/FastKCI.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/causallearn/utils/FastKCI/FastKCI.py b/causallearn/utils/FastKCI/FastKCI.py index 407977dd..91b99ad8 100644 --- a/causallearn/utils/FastKCI/FastKCI.py +++ b/causallearn/utils/FastKCI/FastKCI.py @@ -21,6 +21,9 @@ class FastKCI_CInd(object): "A kernel-based conditional independence test and application in causal discovery", In UAI 2011. [2] M. Zhang, and S. Williamson, "Embarrassingly Parallel Inference for Gaussian Processes", In JMLR 20 (2019) + [3] O. Schacht, and B. Huang + "FastKCI: A fast Kernel-based Conditional Indepdence test with application to causal discovery", + Working Paper. 
""" def __init__(self, K=10, J=8, alpha=500, epsilon=1e-3, eig_thresh=1e-6, trimming_thresh=1e-3, use_gp=False): @@ -65,21 +68,17 @@ def compute_pvalue(self, data_x=None, data_y=None, data_z=None): self.n = data_x.shape[0] Z_proposal = Parallel(n_jobs=-1)(delayed(self.partition_data)() for i in range(self.J)) - self.Z_proposal, self.prob_Z = zip(*Z_proposal) + self.Z_proposal = zip(*Z_proposal) block_res = Parallel(n_jobs=-1)(delayed(self.pvalue_onblocks)(self.Z_proposal[i]) for i in range(self.J)) test_stat, null_samples, log_likelihood = zip(*block_res) log_likelihood = np.array(log_likelihood) - self.prob_Z += log_likelihood - self.prob_Z = np.around(np.exp(self.prob_Z-logsumexp(self.prob_Z)), 6) # experimental, not used self.all_null_samples = np.vstack(null_samples) self.all_p = np.array([np.sum(self.all_null_samples[i,] > test_stat[i]) / float(self.nullss) for i in range(self.J)]) self.prob_weights = np.around(np.exp(log_likelihood-logsumexp(log_likelihood)), 6) self.all_test_stats = np.array(test_stat) self.test_stat = np.average(np.array(test_stat), weights=self.prob_weights) self.null_samples = np.average(null_samples, axis=0, weights=self.prob_weights) - # experimental, not used - self.pvalue_alt = np.sum(np.average(null_samples, axis=0, weights=self.prob_Z) > np.average(np.array(test_stat), weights=self.prob_Z)) / float(self.nullss) self.pvalue = np.sum(self.null_samples > self.test_stat) / float(self.nullss) return self.pvalue, self.test_stat @@ -104,8 +103,7 @@ def partition_data(self): Z = np.array([np.random.multinomial(1, np.exp(ll[n, :]-logsumexp(ll[n, :]))).argmax() for n in range(self.n)]) le = LabelEncoder() Z = le.fit_transform(Z) - prob_Z = np.take_along_axis(ll, Z[:, None], axis=1).sum() # experimental, might be removed - return Z, prob_Z + return Z def pvalue_onblocks(self, Z_proposal): """ @@ -130,7 +128,7 @@ def pvalue_onblocks(self, Z_proposal): X_k = np.copy(self.data[0][K_mask]) Y_k = np.copy(self.data[1][K_mask]) Z_k = np.copy(self.data_z[K_mask]) - if (Z_k.shape[0] < 6): # small blocks cause problems in GP, experimental + if (Z_k.shape[0] < 6): # small blocks cause problems in GP continue Kx, Ky, Kzx, Kzy, epsilon_x, epsilon_y, likelihood_x, likelihood_y = self.kernel_matrix(X_k, Y_k, Z_k) KxR, Rzx = Kernel.center_kernel_matrix_regression(Kx, Kzx, epsilon_x) @@ -214,7 +212,8 @@ def kernel_matrix(self, data_x, data_y, data_z): vx = vx[:, np.abs(wx) > np.abs(wx).max() * 1e-10] wx = wx[np.abs(wx) > np.abs(wx).max() * 1e-10] vx = 2 * np.sqrt(n) * vx.dot(np.diag(np.sqrt(wx))) / np.sqrt(wx[0]) - kernelx = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(widthz * np.ones(Dz), (1e-2, 1e2)) + WhiteKernel(0.1, (1e-10, 1e+1)) + kernelx = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(widthz * np.ones(Dz), (1e-2, 1e2)) \ + + WhiteKernel(0.1, (1e-10, 1e+1)) gpx = GaussianProcessRegressor(kernel=kernelx) # fit Gaussian process, including hyperparameter optimization with warnings.catch_warnings(): @@ -237,7 +236,8 @@ def kernel_matrix(self, data_x, data_y, data_z): vy = vy[:, np.abs(wy) > np.abs(wy).max() * 1e-10] wy = wy[np.abs(wy) > np.abs(wy).max() * 1e-10] vy = 2 * np.sqrt(n) * vy.dot(np.diag(np.sqrt(wy))) / np.sqrt(wy[0]) - kernely = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(widthz * np.ones(Dz), (1e-2, 1e2)) + WhiteKernel(0.1, (1e-10, 1e+1)) + kernely = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(widthz * np.ones(Dz), (1e-2, 1e2)) \ + + WhiteKernel(0.1, (1e-10, 1e+1)) gpy = GaussianProcessRegressor(kernel=kernely) # fit Gaussian process, including hyperparameter optimization with 
warnings.catch_warnings(): @@ -341,7 +341,8 @@ def null_sample_spectral(self, uu_prod, size_u, T): class FastKCI_UInd(object): """ - Python implementation of as speed-up version of the Kernel-based Conditional Independence (KCI) test. Unconditional version. + Python implementation of as speed-up version of the Kernel-based Conditional Independence (KCI) test. + Unconditional version. References ---------- @@ -349,6 +350,9 @@ class FastKCI_UInd(object): "A kernel-based conditional independence test and application in causal discovery," In UAI 2011. [2] M. Zhang, and S. Williamson, "Embarrassingly Parallel Inference for Gaussian Processes" In JMLR 20 (2019) + [3] O. Schacht, and B. Huang + "FastKCI: A fast Kernel-based Conditional Indepdence test with application to causal discovery", + Working Paper. """ def __init__(self, K=10, J=8, alpha=500, trimming_thresh=1e-3): """ From cf9d425ea6cb62637dfd0b2cba0fe785422b94d5 Mon Sep 17 00:00:00 2001 From: OliverSchacht <65898638+OliverSchacht@users.noreply.github.com> Date: Mon, 21 Oct 2024 11:13:00 +0200 Subject: [PATCH 12/16] remove propensity trimming Signed-off-by: Oliver Schacht --- causallearn/utils/FastKCI/FastKCI.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/causallearn/utils/FastKCI/FastKCI.py b/causallearn/utils/FastKCI/FastKCI.py index 91b99ad8..6e431568 100644 --- a/causallearn/utils/FastKCI/FastKCI.py +++ b/causallearn/utils/FastKCI/FastKCI.py @@ -26,7 +26,7 @@ class FastKCI_CInd(object): Working Paper. """ - def __init__(self, K=10, J=8, alpha=500, epsilon=1e-3, eig_thresh=1e-6, trimming_thresh=1e-3, use_gp=False): + def __init__(self, K=10, J=8, alpha=500, epsilon=1e-3, eig_thresh=1e-6, use_gp=False): """ Initialize the FastKCI_CInd object. @@ -44,7 +44,6 @@ def __init__(self, K=10, J=8, alpha=500, epsilon=1e-3, eig_thresh=1e-6, trimming self.alpha = alpha self.epsilon = epsilon self.eig_thresh = eig_thresh - self.trimming_thresh = trimming_thresh self.use_gp = use_gp self.nullss = 5000 @@ -67,8 +66,7 @@ def compute_pvalue(self, data_x=None, data_y=None, data_z=None): self.data_z = data_z self.n = data_x.shape[0] - Z_proposal = Parallel(n_jobs=-1)(delayed(self.partition_data)() for i in range(self.J)) - self.Z_proposal = zip(*Z_proposal) + self.Z_proposal = Parallel(n_jobs=-1)(delayed(self.partition_data)() for i in range(self.J)) block_res = Parallel(n_jobs=-1)(delayed(self.pvalue_onblocks)(self.Z_proposal[i]) for i in range(self.J)) test_stat, null_samples, log_likelihood = zip(*block_res) @@ -354,7 +352,7 @@ class FastKCI_UInd(object): "FastKCI: A fast Kernel-based Conditional Indepdence test with application to causal discovery", Working Paper. """ - def __init__(self, K=10, J=8, alpha=500, trimming_thresh=1e-3): + def __init__(self, K=10, J=8, alpha=500): """ Construct the FastKCI_UInd model. @@ -363,12 +361,10 @@ def __init__(self, K=10, J=8, alpha=500, trimming_thresh=1e-3): K: Number of Gaussians that are assumed to be in the mixture J: Number of independent repittitions. alpha: Parameter for the Dirichlet distribution. - trimming_thresh: Threshold for trimming the propensity weights. 
""" self.K = K self.J = J self.alpha = alpha - self.trimming_thresh = trimming_thresh self.nullss = 5000 self.eig_thresh = 1e-5 From 2642628bc003a737740bb5617a76678f2d7db04d Mon Sep 17 00:00:00 2001 From: OliverSchacht <65898638+OliverSchacht@users.noreply.github.com> Date: Mon, 21 Oct 2024 11:15:40 +0200 Subject: [PATCH 13/16] add unit tests Signed-off-by: Oliver Schacht --- tests/TestCIT_FastKCI.py | 33 +++++++++++++++++++++++++++++++++ tests/TestCIT_RCIT.py | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 tests/TestCIT_FastKCI.py create mode 100644 tests/TestCIT_RCIT.py diff --git a/tests/TestCIT_FastKCI.py b/tests/TestCIT_FastKCI.py new file mode 100644 index 00000000..06a84baf --- /dev/null +++ b/tests/TestCIT_FastKCI.py @@ -0,0 +1,33 @@ +import unittest + +import numpy as np + +import causallearn.utils.cit as cit + + +class TestCIT_FastKCI(unittest.TestCase): + def test_Gaussian_dist(self): + np.random.seed(10) + X = np.random.randn(1200, 1) + X_prime = np.random.randn(1200, 1) + Y = X + 0.5 * np.random.randn(1200, 1) + Z = Y + 0.5 * np.random.randn(1200, 1) + data = np.hstack((X, X_prime, Y, Z)) + + pvalue01 = [] + pvalue03 = [] + pvalue032 = [] + for K in [3, 10]: + for J in [8, 16]: + for use_gp in [True, False]: + cit_CIT = cit.CIT(data, 'fastkci', K=K, J=J, use_gp=use_gp) + pvalue01.append(round(cit_CIT(0, 1), 4)) + pvalue03.append(round(cit_CIT(0, 3), 4)) + pvalue032.append(round(cit_CIT(0, 3, {2}), 4)) + + self.assertTrue(np.all((0.0 <= pvalue01) & (pvalue01 <= 1.0)), + "pvalue01 contains invalid values") + self.assertTrue(np.all((0.0 <= pvalue03) & (pvalue03 <= 1.0)), + "pvalue03 contains invalid values") + self.assertTrue(np.all((0.0 <= pvalue032) & (pvalue032 <= 1.0)), + "pvalue032 contains invalid values") diff --git a/tests/TestCIT_RCIT.py b/tests/TestCIT_RCIT.py new file mode 100644 index 00000000..2482f971 --- /dev/null +++ b/tests/TestCIT_RCIT.py @@ -0,0 +1,35 @@ +import unittest + +import numpy as np + +import causallearn.utils.cit as cit + + +class TestCIT_RCIT(unittest.TestCase): + def test_Gaussian_dist(self): + np.random.seed(10) + X = np.random.randn(300, 1) + X_prime = np.random.randn(300, 1) + Y = X + 0.5 * np.random.randn(300, 1) + Z = Y + 0.5 * np.random.randn(300, 1) + data = np.hstack((X, X_prime, Y, Z)) + + pvalue01 = [] + pvalue03 = [] + pvalue032 = [] + for approx in ["lpd4", "hbe", "gamma", "chi2", "perm"]: + for num_f in [50, 100]: + for num_f2 in [5, 10]: + for rcit in [True, False]: + cit_CIT = cit.CIT(data, 'rcit', approx=approx, num_f=num_f, + num_f2=num_f2, rcit=rcit) + pvalue01.append(round(cit_CIT(0, 1), 4)) + pvalue03.append(round(cit_CIT(0, 3), 4)) + pvalue032.append(round(cit_CIT(0, 3, {2}), 4)) + + self.assertTrue(np.all((0.0 <= pvalue01) & (pvalue01 <= 1.0)), + "pvalue01 contains invalid values") + self.assertTrue(np.all((0.0 <= pvalue03) & (pvalue03 <= 1.0)), + "pvalue03 contains invalid values") + self.assertTrue(np.all((0.0 <= pvalue032) & (pvalue032 <= 1.0)), + "pvalue032 contains invalid values") From 6a837f6ee4499868aeca8171401faba92676b33a Mon Sep 17 00:00:00 2001 From: OliverSchacht <65898638+OliverSchacht@users.noreply.github.com> Date: Mon, 21 Oct 2024 13:06:09 +0200 Subject: [PATCH 14/16] spelling Signed-off-by: Oliver Schacht --- causallearn/utils/FastKCI/FastKCI.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/causallearn/utils/FastKCI/FastKCI.py b/causallearn/utils/FastKCI/FastKCI.py index 6e431568..f5fcc506 100644 --- a/causallearn/utils/FastKCI/FastKCI.py 
+++ b/causallearn/utils/FastKCI/FastKCI.py @@ -22,7 +22,7 @@ class FastKCI_CInd(object): [2] M. Zhang, and S. Williamson, "Embarrassingly Parallel Inference for Gaussian Processes", In JMLR 20 (2019) [3] O. Schacht, and B. Huang - "FastKCI: A fast Kernel-based Conditional Indepdence test with application to causal discovery", + "FastKCI: A fast Kernel-based Conditional Independence test with application to causal discovery", Working Paper. """ @@ -349,7 +349,7 @@ class FastKCI_UInd(object): [2] M. Zhang, and S. Williamson, "Embarrassingly Parallel Inference for Gaussian Processes" In JMLR 20 (2019) [3] O. Schacht, and B. Huang - "FastKCI: A fast Kernel-based Conditional Indepdence test with application to causal discovery", + "FastKCI: A fast Kernel-based Conditional Independence test with application to causal discovery", Working Paper. """ def __init__(self, K=10, J=8, alpha=500): From 6a580ed2e52aa48f19b5ea78bc9f7dc69222290d Mon Sep 17 00:00:00 2001 From: OliverSchacht <65898638+OliverSchacht@users.noreply.github.com> Date: Mon, 21 Oct 2024 13:06:22 +0200 Subject: [PATCH 15/16] fix: array dtype Signed-off-by: Oliver Schacht --- tests/TestCIT_FastKCI.py | 3 +++ tests/TestCIT_RCIT.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/tests/TestCIT_FastKCI.py b/tests/TestCIT_FastKCI.py index 06a84baf..b8507f78 100644 --- a/tests/TestCIT_FastKCI.py +++ b/tests/TestCIT_FastKCI.py @@ -25,6 +25,9 @@ def test_Gaussian_dist(self): pvalue03.append(round(cit_CIT(0, 3), 4)) pvalue032.append(round(cit_CIT(0, 3, {2}), 4)) + pvalue01 = np.array(pvalue01) + pvalue03 = np.array(pvalue03) + pvalue032 = np.array(pvalue032) self.assertTrue(np.all((0.0 <= pvalue01) & (pvalue01 <= 1.0)), "pvalue01 contains invalid values") self.assertTrue(np.all((0.0 <= pvalue03) & (pvalue03 <= 1.0)), diff --git a/tests/TestCIT_RCIT.py b/tests/TestCIT_RCIT.py index 2482f971..ef6406fd 100644 --- a/tests/TestCIT_RCIT.py +++ b/tests/TestCIT_RCIT.py @@ -27,6 +27,9 @@ def test_Gaussian_dist(self): pvalue03.append(round(cit_CIT(0, 3), 4)) pvalue032.append(round(cit_CIT(0, 3, {2}), 4)) + pvalue01 = np.array(pvalue01) + pvalue03 = np.array(pvalue03) + pvalue032 = np.array(pvalue032) self.assertTrue(np.all((0.0 <= pvalue01) & (pvalue01 <= 1.0)), "pvalue01 contains invalid values") self.assertTrue(np.all((0.0 <= pvalue03) & (pvalue03 <= 1.0)), From b803533ea492a2c394eb71ba16c7fbd302289d1a Mon Sep 17 00:00:00 2001 From: Haoyue Dai Date: Tue, 5 Nov 2024 02:30:42 -0500 Subject: [PATCH 16/16] Update KCI.py --- causallearn/utils/KCI/KCI.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/causallearn/utils/KCI/KCI.py b/causallearn/utils/KCI/KCI.py index 681d48d4..41cbd467 100644 --- a/causallearn/utils/KCI/KCI.py +++ b/causallearn/utils/KCI/KCI.py @@ -306,8 +306,8 @@ def compute_pvalue(self, data_x=None, data_y=None, data_z=None): k_appr, theta_appr = self.get_kappa(uu_prod) pvalue = 1 - stats.gamma.cdf(test_stat, k_appr, 0, theta_appr) else: - self.null_samples = self.null_sample_spectral(uu_prod, size_u, Kx.shape[0]) - pvalue = sum(self.null_samples > test_stat) / float(self.nullss) + null_samples = self.null_sample_spectral(uu_prod, size_u, Kx.shape[0]) + pvalue = sum(null_samples > test_stat) / float(self.nullss) return pvalue, test_stat def kernel_matrix(self, data_x, data_y, data_z):
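
For quick reference, the tests added in this series exercise the new methods through the standard CIT interface. The following is a minimal usage sketch, not part of the patches: the data layout mirrors the unit tests above (X independent of X', X -> Y -> Z), and the keyword arguments are the ones the tests pass through (K, J, use_gp for "fastkci"; approx, num_f, num_f2, rcit for "rcit").

    import numpy as np
    from causallearn.utils.cit import CIT

    np.random.seed(10)
    X = np.random.randn(600, 1)
    X_prime = np.random.randn(600, 1)
    Y = X + 0.5 * np.random.randn(600, 1)
    Z = Y + 0.5 * np.random.randn(600, 1)
    data = np.hstack((X, X_prime, Y, Z))  # columns: 0=X, 1=X', 2=Y, 3=Z

    # FastKCI: K mixture components, J parallel random partitions of the data.
    fastkci_test = CIT(data, "fastkci", K=10, J=8, use_gp=False)
    p_uncond = fastkci_test(0, 1)      # X vs X' (unconditional branch, KCI_UInd)
    p_cond = fastkci_test(0, 3, {2})   # X vs Z given Y (FastKCI_CInd)

    # RCIT: random-feature approximation, as exercised in TestCIT_RCIT.py.
    rcit_test = CIT(data, "rcit", approx="lpd4", num_f=100, num_f2=10, rcit=True)
    p_rcit = rcit_test(0, 3, {2})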
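
The conditional test follows the embarrassingly-parallel scheme of reference [2]: each of the J repetitions draws a random partition of the samples, block-wise statistics and null samples are computed independently, and the repetitions are averaged with weights proportional to the exponentiated log marginal likelihoods, exactly as in FastKCI_CInd.compute_pvalue above. A minimal sketch of that combination step, assuming the per-repetition quantities have already been computed (the name combine_repetitions is illustrative, not part of causal-learn):

    import numpy as np
    from scipy.special import logsumexp

    def combine_repetitions(test_stats, null_samples, log_likelihoods, nullss=5000):
        # test_stats: length-J array of block-aggregated test statistics
        # null_samples: (J, nullss) array of null-distribution draws per repetition
        # log_likelihoods: length-J array of marginal log-likelihoods of the partitions
        log_likelihoods = np.asarray(log_likelihoods)
        weights = np.exp(log_likelihoods - logsumexp(log_likelihoods))  # softmax weights
        stat = np.average(np.asarray(test_stats), weights=weights)
        null = np.average(np.asarray(null_samples), axis=0, weights=weights)
        return np.sum(null > stat) / float(nullss)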
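
Within each block, the null distribution of the statistic is approximated either by a two-parameter gamma fit (get_kappa) or by spectral sampling, mirroring null_sample_spectral in the diffs above: weighted sums of independent chi-square(1) draws, with weights given by the leading eigenvalues of the centered kernel product. A standalone sketch of the spectral route under the same truncation thresholds used in FastKCI (spectral_null_pvalue is an illustrative name, not the library API):

    import numpy as np
    from numpy.linalg import eigh

    def spectral_null_pvalue(uu_prod, test_stat, size_u, T, nullss=5000, eig_thresh=1e-6):
        # uu_prod: (size_u, size_u) product of the centered kernel factors
        # T: number of samples in the block
        eig_uu = eigh(uu_prod)[0]                    # eigenvalues, ascending order
        eig_uu = -np.sort(-eig_uu)[:min(T, size_u)]  # keep the largest min(T, size_u) values
        eig_uu = eig_uu[eig_uu > np.max(eig_uu) * eig_thresh]
        f_rand = np.random.chisquare(1, (eig_uu.shape[0], nullss))
        null_samples = eig_uu.T.dot(f_rand)          # one weighted chi-square sum per draw
        return np.sum(null_samples > test_stat) / float(nullss)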