Merge pull request #2 from ankitufl/ankitsri/dp_evaluation

Ankitsri/dp evaluation
opendp · Dec 12, 2019 · f07b868 · f07b868
2 parents 72014e7 + 41f772b
commit f07b868
Show file tree

Hide file tree

Showing 3 changed files with 251 additions and 136 deletions.
diff --git a/evaluation/Aggregation.py b/evaluation/Aggregation.py
@@ -4,36 +4,37 @@
 import numpy as np
 
 class Aggregation:
-    def __init__(self, epsilon=1.0, t = 1):
+    def __init__(self, epsilon=1.0, t=1, repeat_count=10000):
         self.epsilon = epsilon
         self.t = t
+        self.repeat_count = repeat_count
 
     # Taking df as a parameter it shall be passed both d1 and d2 that differ by 1 record
     def exact_count(self, df, colname):
-        return df[colname].count()
+        return np.zeros(self.repeat_count) + df[colname].count()
 
     def buggy_count(self, df, colname):
-        return df[colname].count() + (random.random()*10)
+        return df[colname].count() + np.random.random_sample((self.repeat_count,))*10
 
     def dp_count(self, df, colname):
         delta = 1/(len(df) * math.sqrt(len(df)))
         sigmacnt = math.sqrt(self.t)*((math.sqrt(math.log(1/delta)) + math.sqrt(math.log((1/delta)) + self.epsilon)) / (math.sqrt(2)*self.epsilon))
-        dp_noise = np.random.normal(0, sigmacnt, 1)[0]
+        dp_noise = np.random.normal(0, sigmacnt, self.repeat_count)
         return df[colname].count() + dp_noise
 
     def dp_sum(self, df, colname):
         delta = 1/(len(df) * math.sqrt(len(df)))
         M = abs(max(df[colname]) - min(df[colname]))
         sigmasum = math.sqrt(self.t)*M*((math.sqrt(math.log(1/delta)) + math.sqrt(math.log((1/delta)) + self.epsilon)) / (math.sqrt(2)*self.epsilon))
-        dp_noise = np.random.normal(0, sigmasum, 1)[0]
+        dp_noise = np.random.normal(0, sigmasum, self.repeat_count)
         return df[colname].sum() + dp_noise
 
     def dp_mean(self, df, colname):
-        return self.dp_sum(df, colname) / self.dp_count(df, colname)
+        return np.divide(self.dp_sum(df, colname), self.dp_count(df, colname))
 
     def dp_var(self, df, colname):
         cnt = self.dp_count(df, colname)
         sum = self.dp_sum(df, colname)
         df[colname + "squared"] = df[colname] ** 2
         sumsq = self.dp_sum(df, colname + "squared")
-        return (sumsq/cnt) - ((sum/cnt)**2)
+        return np.subtract(np.divide(sumsq, cnt), np.power(np.divide(sum, cnt), 2))
diff --git a/evaluation/DPVerification.py b/evaluation/DPVerification.py
@@ -10,23 +10,23 @@
 import math
 import matplotlib.pyplot as plt
 import Aggregation as agg
-import statistics as st
 from scipy import stats
 
 class DPVerification:
     # Set the epsilon parameter of differential privacy
-    def __init__(self, epsilon=1.0):
+    def __init__(self, epsilon=1.0, dataset_size=10000):
         self.epsilon = epsilon
+        self.dataset_size = dataset_size
         self.df = self.create_simulated_dataset()
         print("Loaded " + str(len(self.df)) + " records")
         self.N = len(self.df)
         self.delta = 1/(self.N * math.sqrt(self.N))
 
     def create_simulated_dataset(self):
         np.random.seed(1)
-        userids = list(range(1, 10001))
+        userids = list(range(1, self.dataset_size+1))
         userids = ["A" + str(user) for user in userids]
-        usage = np.random.geometric(p=0.5, size=10000).tolist()
+        usage = np.random.geometric(p=0.5, size=self.dataset_size).tolist()
         df = pd.DataFrame(list(zip(userids, usage)), columns=['UserId', 'Usage'])
         return df
 
@@ -36,10 +36,6 @@ def generate_neighbors(self):
             print("No records in dataframe to run the test")
             return None, None
 
-        if(self.N > 10000):
-            self.df = self.df.sample(n=10000, random_state=1)
-            self.N = 10000
-
         d1 = self.df
         drop_idx = np.random.choice(self.df.index, 1, replace=False)
         d2 = self.df.drop(drop_idx)
@@ -49,25 +45,23 @@ def generate_neighbors(self):
     # If there is an aggregation function that we need to test, we need to apply it on neighboring datasets
     # This function applies the aggregation repeatedly to log results in two vectors that are then used for generating histogram
     # The histogram is then passed through the DP test
-    def apply_aggregation_neighbors(self, f, args1, args2, repeatcount=10000):
-        fD1, fD2 = [], []
-        for i in range(repeatcount):
-            fD1.append(f(*args1))
-            fD2.append(f(*args2))
+    def apply_aggregation_neighbors(self, f, args1, args2):
+        fD1 = f(*args1)
+        fD2 = f(*args2)
 
-        print("Mean fD1: ", st.mean(fD1), " Stdev fD1: ", st.stdev(fD1), " Mean fD2: ", st.mean(fD2), " Stdev fD2: ", st.stdev(fD2))
+        print("Mean fD1: ", np.mean(fD1), " Stdev fD1: ", np.std(fD1), " Mean fD2: ", np.mean(fD2), " Stdev fD2: ", np.std(fD2))
         return fD1, fD2
 
     # Instead of applying function to dataframe, this'll pass a query through PrivSQL and get response
     # This way we can test actual SQLDP implementation
-    def apply_query(self, d1, d2, agg_query, repeatcount=100):
+    def apply_query(self, d1, d2, agg_query):
         # To do
         return None
 
     # Generate histograms given the vectors of repeated aggregation results applied on neighboring datasets
-    def generate_histogram_neighbors(self, fD1, fD2, numbins = 0, binsize = "auto", alpha = 0.05, exact = False):
-        d1 = np.array(fD1)
-        d2 = np.array(fD2)
+    def generate_histogram_neighbors(self, fD1, fD2, numbins=0, binsize="auto", exact=False):
+        d1 = fD1
+        d2 = fD2
         d = np.concatenate((d1, d2), axis=None)
         n = d.size
         binlist = []
@@ -89,25 +83,15 @@ def generate_histogram_neighbors(self, fD1, fD2, numbins = 0, binsize = "auto",
             binlist = np.arange(np.floor(minval),np.ceil(maxval))
 
         # Calculating histograms of fD1 and fD2
-        d1hist, bin_edges = np.histogram(d1, bins = binlist, density = True)
-        #print("Sum of probabilities in D1 Histogram: ", np.sum(d1hist))
-        d2hist, bin_edges = np.histogram(d2, bins = binlist, density = True)
-        #print("Sum of probabilities in D2 Histogram: ", np.sum(d2hist))
+        d1hist, bin_edges = np.histogram(d1, bins = binlist, density = False)
+        print("Sum of frequencies in D1 Histogram: ", np.sum(d1hist))
+        d2hist, bin_edges = np.histogram(d2, bins = binlist, density = False)
+        print("Sum of frequencies in D2 Histogram: ", np.sum(d2hist))
 
-        # Lower and Upper bound
-        if(not exact):
-            num_buckets = binlist.size - 1
-            critical_value = stats.norm.ppf(1-(alpha/2/num_buckets), loc=0.0, scale=1.0)
-            d1_error_interval = critical_value * math.sqrt(num_buckets / d1.size) / 2
-            d2_error_interval = critical_value * math.sqrt(num_buckets / d2.size) / 2
-        else:
-            d1_error_interval = 0.0
-            d2_error_interval = 0.0
-
-        return d1hist, d2hist, bin_edges, d1_error_interval, d2_error_interval
+        return d1hist, d2hist, bin_edges
 
     # Plot histograms given the vectors of repeated aggregation results applied on neighboring datasets
-    def plot_histogram_neighbors(self, fD1, fD2, d1hist, d2hist, binlist, d1error, d2error, bound = True, exact = False):
+    def plot_histogram_neighbors(self, fD1, fD2, d1hist, d2hist, binlist, d1size, d2size, bound=True, exact=False):
         plt.figure(figsize=(15,6))
         if(exact):
             ax = plt.subplot(1, 1, 1)
@@ -119,19 +103,16 @@ def plot_histogram_neighbors(self, fD1, fD2, d1hist, d2hist, binlist, d1error, d
             ax.legend(['D1', 'D2'], loc="upper right")
             return
 
-        d1histbound, d2histbound, d1upper, d2upper, d1lower, d2lower = \
-            self.get_bounded_histogram(d1hist, d2hist, binlist, d1error, d2error)
+        px, py, d1histupperbound, d2histupperbound, d1histbound, d2histbound, d1lower, d2lower = \
+            self.get_bounded_histogram(d1hist, d2hist, binlist, d1size, d2size, exact = exact)
 
-        if(bound):
-            d1histbound = d1upper * math.exp(self.epsilon) + self.delta
-            d2histbound = d2upper * math.exp(self.epsilon) + self.delta
         ax = plt.subplot(1, 2, 1)
         ax.ticklabel_format(useOffset=False)
         plt.xlabel('Bin')
-        plt.ylabel('Probability')
+        plt.ylabel('Frequency')
         if(bound):
-            plt.bar(binlist[:-1], d1histbound, alpha=0.5, width=np.diff(binlist), ec="k", align="edge")
-            plt.bar(binlist[:-1], d2lower, alpha=0.5, width=np.diff(binlist), ec="k", align="edge")
+            plt.bar(binlist[:-1], d2histupperbound, alpha=0.5, width=np.diff(binlist), ec="k", align="edge")
+            plt.bar(binlist[:-1], d1lower, alpha=0.5, width=np.diff(binlist), ec="k", align="edge")
             plt.legend(['D1', 'D2'], loc="upper right")
         else:
             plt.bar(binlist[:-1], d1hist, alpha=0.5, width=np.diff(binlist), ec="k", align="edge")
@@ -141,50 +122,74 @@ def plot_histogram_neighbors(self, fD1, fD2, d1hist, d2hist, binlist, d1error, d
         ax = plt.subplot(1, 2, 2)
         ax.ticklabel_format(useOffset=False)
         plt.xlabel('Bin')
-        plt.ylabel('Probability')
+        plt.ylabel('Frequency')
         if(bound):
-            plt.bar(binlist[:-1], d2histbound, alpha=0.5, width=np.diff(binlist), ec="k", align="edge")
-            plt.bar(binlist[:-1], d1lower, alpha=0.5, width=np.diff(binlist), ec="k", align="edge")
+            plt.bar(binlist[:-1], d1histupperbound, alpha=0.5, width=np.diff(binlist), ec="k", align="edge")
+            plt.bar(binlist[:-1], d2lower, alpha=0.5, width=np.diff(binlist), ec="k", align="edge")
             plt.legend(['D2', 'D1'], loc="upper right")
         else:
             plt.bar(binlist[:-1], d2hist, alpha=0.5, width=np.diff(binlist), ec="k", align="edge")
             plt.bar(binlist[:-1], d1hist, alpha=0.5, width=np.diff(binlist), ec="k", align="edge")
             plt.legend(['D2', 'D1'], loc="upper right")
         plt.show()
 
-    def get_bounded_histogram(self, d1hist, d2hist, binlist, d1error, d2error):
+    # Check if histogram of fD1 values multiplied by e^epsilon and summed by delta is bounding fD2 and vice versa
+    # Use the histogram results and create bounded histograms to compare in DP test
+    def get_bounded_histogram(self, d1hist, d2hist, binlist, d1size, d2size, exact, alpha=0.05):
+        d1_error_interval = 0.0
+        d2_error_interval = 0.0
+        # Lower and Upper bound
+        if(not exact):
+            num_buckets = binlist.size - 1
+            critical_value = stats.norm.ppf(1-(alpha / 2 / num_buckets), loc=0.0, scale=1.0)
+            d1_error_interval = critical_value * math.sqrt(num_buckets / d1size) / 2
+            d2_error_interval = critical_value * math.sqrt(num_buckets / d2size) / 2
+
         num_buckets = binlist.size - 1
-        d1upper = np.power(np.sqrt(d1hist * num_buckets) + d1error, 2) / num_buckets
-        d2upper = np.power(np.sqrt(d2hist * num_buckets) + d2error, 2) / num_buckets
-        d1lower = np.power(np.sqrt(d1hist * num_buckets) - d1error, 2) / num_buckets
-        d2lower = np.power(np.sqrt(d2hist * num_buckets) - d2error, 2) / num_buckets
+        px = np.divide(d1hist, d1size)
+        py = np.divide(d2hist, d2size)
+
+        d1histbound = px * math.exp(self.epsilon) + self.delta
+        d2histbound = py * math.exp(self.epsilon) + self.delta
+
+        d1upper = np.power(np.sqrt(px * num_buckets) + d1_error_interval, 2) / num_buckets
+        d2upper = np.power(np.sqrt(py * num_buckets) + d2_error_interval, 2) / num_buckets
+        d1lower = np.power(np.sqrt(px * num_buckets) - d1_error_interval, 2) / num_buckets
+        d2lower = np.power(np.sqrt(py * num_buckets) - d2_error_interval, 2) / num_buckets
+
         np.maximum(d1lower, 0.0, d1lower)
         np.maximum(d2lower, 0.0, d1lower)
-        d1histbound = d1upper * math.exp(self.epsilon) + self.delta
-        d2histbound = d2upper * math.exp(self.epsilon) + self.delta
-        return d1histbound, d2histbound, d1upper, d2upper, d1lower, d2lower
 
-    # Check if histogram of fD1 values multiplied by e^epsilon and summed by delta is bounding fD2 and vice versa
-    def dp_test(self, d1hist, d2hist, binlist, d1error, d2error, debug = False):
-        d1histbound, d2histbound, d1upper, d2upper, d1lower, d2lower = \
-            self.get_bounded_histogram(d1hist, d2hist, binlist, d1error, d2error)
+        d1histupperbound = d1upper * math.exp(self.epsilon) + self.delta
+        d2histupperbound = d2upper * math.exp(self.epsilon) + self.delta
+
+        return px, py, d1histupperbound, d2histupperbound, d1histbound, d2histbound, d1lower, d2lower
+
+    # Differentially Private Predicate Test
+    def dp_test(self, d1hist, d2hist, binlist, d1size, d2size, debug=False, exact=False):
+        px, py, d1histupperbound, d2histupperbound, d1histbound, d2histbound, d1lower, d2lower = \
+            self.get_bounded_histogram(d1hist, d2hist, binlist, d1size, d2size, exact)
         if(debug):
             print("Parameters")
             print("epsilon: ", self.epsilon, " delta: ", self.delta)
             print("Bins\n", binlist)
-            print("D1Error\n", d1error)
-            print("D2Error\n", d2error)
             print("Original D1 Histogram\n", d1hist)
+            print("Probability of D1 Histogram\n", px)
             print("D1 Lower\n", d1lower)
-            print("D1 Upper\n", d1upper)
+            print("D1 Upper\n", d1histupperbound)
             print("D1 Histogram to bound D2\n", d1histbound)
             print("Original D2 Histogram\n", d2hist)
+            print("Probability of D2 Histogram\n", py)
             print("D2 Lower\n", d2lower)
-            print("D2 Upper\n", d2upper)
+            print("D2 Upper\n", d2histupperbound)
             print("D2 Histogram to bound D1\n", d2histbound)
-            print("Comparison - D1 bound to D2\n", np.greater(d1histbound, d2lower))
-            print("Comparison - D2 bound to D1\n", np.greater(d2histbound, d1lower))
-        return np.all(np.greater(d1histbound, d2lower)) and np.all(np.greater(d2histbound, d1lower))
+            print("Comparison - D2 bound to D1\n", np.greater(d1hist, np.zeros(d1hist.size)), np.logical_and(np.greater(d1hist, np.zeros(d1hist.size)), np.greater(d1lower, d2histupperbound)))
+            print("Comparison - D1 bound to D2\n", np.greater(d2hist, np.zeros(d2hist.size)), np.logical_and(np.greater(d2hist, np.zeros(d2hist.size)), np.greater(d2lower, d1histupperbound)))
+
+        # Check if any of the bounds across the bins violate the relaxed DP condition
+        bound_exceeded = np.any(np.logical_and(np.greater(d1hist, np.zeros(d1hist.size)), np.greater(d1lower, d2histupperbound))) or \
+        np.any(np.logical_and(np.greater(d2hist, np.zeros(d2hist.size)), np.greater(d2lower, d1histupperbound)))
+        return not bound_exceeded
 
     # K-S Two sample test between the repeated query results on neighboring datasets
     def ks_test(self, fD1, fD2):
@@ -202,19 +207,20 @@ def kl_divergence(self, p, q):
     def wasserstein_distance(self, d1hist, d2hist):
         return stats.wasserstein_distance(d1hist, d2hist)
 
-    def aggtest(self, f, colname, repeatcount, numbins = 0, binsize = "auto", debug = False, plot = True, bound = True, exact = False):
+    def aggtest(self, f, colname, numbins=0, binsize="auto", debug=False, plot=True, bound=True, exact=False):
         d1, d2 = self.generate_neighbors()
 
-        fD1, fD2 = self.apply_aggregation_neighbors(f, (d1, colname), (d2, colname), repeatcount)
-
+        fD1, fD2 = self.apply_aggregation_neighbors(f, (d1, colname), (d2, colname))
+        d1size, d2size = fD1.size, fD2.size
+
         ks_res = self.ks_test(fD1, fD2)
         print("\nKS 2-sample Test Result: ", ks_res, "\n")
 
         #andderson_res = self.anderson_ksamp(fD1, fD2)
         #print("Anderson 2-sample Test Result: ", andderson_res, "\n")
 
-        d1hist, d2hist, bin_edges, d1error, d2error = \
-            self.generate_histogram_neighbors(fD1, fD2, numbins, binsize, exact = exact)
+        d1hist, d2hist, bin_edges = \
+            self.generate_histogram_neighbors(fD1, fD2, numbins, binsize, exact=exact)
 
         #kl_res = self.kl_divergence(d1hist, d2hist)
         #print("\nKL-Divergence Test: ", kl_res, "\n")
@@ -227,11 +233,11 @@ def aggtest(self, f, colname, repeatcount, numbins = 0, binsize = "auto", debug
 
         dp_res = False
         if(not exact):
-            dp_res = self.dp_test(d1hist, d2hist, bin_edges, d1error, d2error, debug)
+            dp_res = self.dp_test(d1hist, d2hist, bin_edges, d1size, d2size, debug, exact=exact)
         print("DP Predicate Test:", dp_res, "\n")
 
         if(plot):
-            self.plot_histogram_neighbors(fD1, fD2, d1hist, d2hist, bin_edges, d1error, d2error, bound, exact)
+            self.plot_histogram_neighbors(fD1, fD2, d1hist, d2hist, bin_edges, d1size, d2size, bound, exact)
         return dp_res, ks_res, ws_res
 
     # Main method listing all the DP verification steps