Skip to content

Commit

Permalink
Merge pull request #2 from ankitufl/ankitsri/dp_evaluation
Browse files Browse the repository at this point in the history
Ankitsri/dp evaluation
  • Loading branch information
ankit-oss authored Dec 12, 2019
2 parents 72014e7 + 41f772b commit f07b868
Show file tree
Hide file tree
Showing 3 changed files with 251 additions and 136 deletions.
15 changes: 8 additions & 7 deletions evaluation/Aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,36 +4,37 @@
import numpy as np

class Aggregation:
def __init__(self, epsilon=1.0, t = 1):
def __init__(self, epsilon=1.0, t=1, repeat_count=10000):
self.epsilon = epsilon
self.t = t
self.repeat_count = repeat_count

# Taking df as a parameter it shall be passed both d1 and d2 that differ by 1 record
def exact_count(self, df, colname):
return df[colname].count()
return np.zeros(self.repeat_count) + df[colname].count()

def buggy_count(self, df, colname):
return df[colname].count() + (random.random()*10)
return df[colname].count() + np.random.random_sample((self.repeat_count,))*10

def dp_count(self, df, colname):
delta = 1/(len(df) * math.sqrt(len(df)))
sigmacnt = math.sqrt(self.t)*((math.sqrt(math.log(1/delta)) + math.sqrt(math.log((1/delta)) + self.epsilon)) / (math.sqrt(2)*self.epsilon))
dp_noise = np.random.normal(0, sigmacnt, 1)[0]
dp_noise = np.random.normal(0, sigmacnt, self.repeat_count)
return df[colname].count() + dp_noise

def dp_sum(self, df, colname):
delta = 1/(len(df) * math.sqrt(len(df)))
M = abs(max(df[colname]) - min(df[colname]))
sigmasum = math.sqrt(self.t)*M*((math.sqrt(math.log(1/delta)) + math.sqrt(math.log((1/delta)) + self.epsilon)) / (math.sqrt(2)*self.epsilon))
dp_noise = np.random.normal(0, sigmasum, 1)[0]
dp_noise = np.random.normal(0, sigmasum, self.repeat_count)
return df[colname].sum() + dp_noise

def dp_mean(self, df, colname):
return self.dp_sum(df, colname) / self.dp_count(df, colname)
return np.divide(self.dp_sum(df, colname), self.dp_count(df, colname))

def dp_var(self, df, colname):
cnt = self.dp_count(df, colname)
sum = self.dp_sum(df, colname)
df[colname + "squared"] = df[colname] ** 2
sumsq = self.dp_sum(df, colname + "squared")
return (sumsq/cnt) - ((sum/cnt)**2)
return np.subtract(np.divide(sumsq, cnt), np.power(np.divide(sum, cnt), 2))
148 changes: 77 additions & 71 deletions evaluation/DPVerification.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,23 +10,23 @@
import math
import matplotlib.pyplot as plt
import Aggregation as agg
import statistics as st
from scipy import stats

class DPVerification:
# Set the epsilon parameter of differential privacy
def __init__(self, epsilon=1.0):
def __init__(self, epsilon=1.0, dataset_size=10000):
self.epsilon = epsilon
self.dataset_size = dataset_size
self.df = self.create_simulated_dataset()
print("Loaded " + str(len(self.df)) + " records")
self.N = len(self.df)
self.delta = 1/(self.N * math.sqrt(self.N))

def create_simulated_dataset(self):
np.random.seed(1)
userids = list(range(1, 10001))
userids = list(range(1, self.dataset_size+1))
userids = ["A" + str(user) for user in userids]
usage = np.random.geometric(p=0.5, size=10000).tolist()
usage = np.random.geometric(p=0.5, size=self.dataset_size).tolist()
df = pd.DataFrame(list(zip(userids, usage)), columns=['UserId', 'Usage'])
return df

Expand All @@ -36,10 +36,6 @@ def generate_neighbors(self):
print("No records in dataframe to run the test")
return None, None

if(self.N > 10000):
self.df = self.df.sample(n=10000, random_state=1)
self.N = 10000

d1 = self.df
drop_idx = np.random.choice(self.df.index, 1, replace=False)
d2 = self.df.drop(drop_idx)
Expand All @@ -49,25 +45,23 @@ def generate_neighbors(self):
# If there is an aggregation function that we need to test, we need to apply it on neighboring datasets
# This function applies the aggregation repeatedly to log results in two vectors that are then used for generating histogram
# The histogram is then passed through the DP test
def apply_aggregation_neighbors(self, f, args1, args2, repeatcount=10000):
fD1, fD2 = [], []
for i in range(repeatcount):
fD1.append(f(*args1))
fD2.append(f(*args2))
def apply_aggregation_neighbors(self, f, args1, args2):
fD1 = f(*args1)
fD2 = f(*args2)

print("Mean fD1: ", st.mean(fD1), " Stdev fD1: ", st.stdev(fD1), " Mean fD2: ", st.mean(fD2), " Stdev fD2: ", st.stdev(fD2))
print("Mean fD1: ", np.mean(fD1), " Stdev fD1: ", np.std(fD1), " Mean fD2: ", np.mean(fD2), " Stdev fD2: ", np.std(fD2))
return fD1, fD2

# Instead of applying function to dataframe, this'll pass a query through PrivSQL and get response
# This way we can test actual SQLDP implementation
def apply_query(self, d1, d2, agg_query, repeatcount=100):
def apply_query(self, d1, d2, agg_query):
# To do
return None

# Generate histograms given the vectors of repeated aggregation results applied on neighboring datasets
def generate_histogram_neighbors(self, fD1, fD2, numbins = 0, binsize = "auto", alpha = 0.05, exact = False):
d1 = np.array(fD1)
d2 = np.array(fD2)
def generate_histogram_neighbors(self, fD1, fD2, numbins=0, binsize="auto", exact=False):
d1 = fD1
d2 = fD2
d = np.concatenate((d1, d2), axis=None)
n = d.size
binlist = []
Expand All @@ -89,25 +83,15 @@ def generate_histogram_neighbors(self, fD1, fD2, numbins = 0, binsize = "auto",
binlist = np.arange(np.floor(minval),np.ceil(maxval))

# Calculating histograms of fD1 and fD2
d1hist, bin_edges = np.histogram(d1, bins = binlist, density = True)
#print("Sum of probabilities in D1 Histogram: ", np.sum(d1hist))
d2hist, bin_edges = np.histogram(d2, bins = binlist, density = True)
#print("Sum of probabilities in D2 Histogram: ", np.sum(d2hist))
d1hist, bin_edges = np.histogram(d1, bins = binlist, density = False)
print("Sum of frequencies in D1 Histogram: ", np.sum(d1hist))
d2hist, bin_edges = np.histogram(d2, bins = binlist, density = False)
print("Sum of frequencies in D2 Histogram: ", np.sum(d2hist))

# Lower and Upper bound
if(not exact):
num_buckets = binlist.size - 1
critical_value = stats.norm.ppf(1-(alpha/2/num_buckets), loc=0.0, scale=1.0)
d1_error_interval = critical_value * math.sqrt(num_buckets / d1.size) / 2
d2_error_interval = critical_value * math.sqrt(num_buckets / d2.size) / 2
else:
d1_error_interval = 0.0
d2_error_interval = 0.0

return d1hist, d2hist, bin_edges, d1_error_interval, d2_error_interval
return d1hist, d2hist, bin_edges

# Plot histograms given the vectors of repeated aggregation results applied on neighboring datasets
def plot_histogram_neighbors(self, fD1, fD2, d1hist, d2hist, binlist, d1error, d2error, bound = True, exact = False):
def plot_histogram_neighbors(self, fD1, fD2, d1hist, d2hist, binlist, d1size, d2size, bound=True, exact=False):
plt.figure(figsize=(15,6))
if(exact):
ax = plt.subplot(1, 1, 1)
Expand All @@ -119,19 +103,16 @@ def plot_histogram_neighbors(self, fD1, fD2, d1hist, d2hist, binlist, d1error, d
ax.legend(['D1', 'D2'], loc="upper right")
return

d1histbound, d2histbound, d1upper, d2upper, d1lower, d2lower = \
self.get_bounded_histogram(d1hist, d2hist, binlist, d1error, d2error)
px, py, d1histupperbound, d2histupperbound, d1histbound, d2histbound, d1lower, d2lower = \
self.get_bounded_histogram(d1hist, d2hist, binlist, d1size, d2size, exact = exact)

if(bound):
d1histbound = d1upper * math.exp(self.epsilon) + self.delta
d2histbound = d2upper * math.exp(self.epsilon) + self.delta
ax = plt.subplot(1, 2, 1)
ax.ticklabel_format(useOffset=False)
plt.xlabel('Bin')
plt.ylabel('Probability')
plt.ylabel('Frequency')
if(bound):
plt.bar(binlist[:-1], d1histbound, alpha=0.5, width=np.diff(binlist), ec="k", align="edge")
plt.bar(binlist[:-1], d2lower, alpha=0.5, width=np.diff(binlist), ec="k", align="edge")
plt.bar(binlist[:-1], d2histupperbound, alpha=0.5, width=np.diff(binlist), ec="k", align="edge")
plt.bar(binlist[:-1], d1lower, alpha=0.5, width=np.diff(binlist), ec="k", align="edge")
plt.legend(['D1', 'D2'], loc="upper right")
else:
plt.bar(binlist[:-1], d1hist, alpha=0.5, width=np.diff(binlist), ec="k", align="edge")
Expand All @@ -141,50 +122,74 @@ def plot_histogram_neighbors(self, fD1, fD2, d1hist, d2hist, binlist, d1error, d
ax = plt.subplot(1, 2, 2)
ax.ticklabel_format(useOffset=False)
plt.xlabel('Bin')
plt.ylabel('Probability')
plt.ylabel('Frequency')
if(bound):
plt.bar(binlist[:-1], d2histbound, alpha=0.5, width=np.diff(binlist), ec="k", align="edge")
plt.bar(binlist[:-1], d1lower, alpha=0.5, width=np.diff(binlist), ec="k", align="edge")
plt.bar(binlist[:-1], d1histupperbound, alpha=0.5, width=np.diff(binlist), ec="k", align="edge")
plt.bar(binlist[:-1], d2lower, alpha=0.5, width=np.diff(binlist), ec="k", align="edge")
plt.legend(['D2', 'D1'], loc="upper right")
else:
plt.bar(binlist[:-1], d2hist, alpha=0.5, width=np.diff(binlist), ec="k", align="edge")
plt.bar(binlist[:-1], d1hist, alpha=0.5, width=np.diff(binlist), ec="k", align="edge")
plt.legend(['D2', 'D1'], loc="upper right")
plt.show()

def get_bounded_histogram(self, d1hist, d2hist, binlist, d1error, d2error):
# Check if histogram of fD1 values multiplied by e^epsilon and summed by delta is bounding fD2 and vice versa
# Use the histogram results and create bounded histograms to compare in DP test
def get_bounded_histogram(self, d1hist, d2hist, binlist, d1size, d2size, exact, alpha=0.05):
d1_error_interval = 0.0
d2_error_interval = 0.0
# Lower and Upper bound
if(not exact):
num_buckets = binlist.size - 1
critical_value = stats.norm.ppf(1-(alpha / 2 / num_buckets), loc=0.0, scale=1.0)
d1_error_interval = critical_value * math.sqrt(num_buckets / d1size) / 2
d2_error_interval = critical_value * math.sqrt(num_buckets / d2size) / 2

num_buckets = binlist.size - 1
d1upper = np.power(np.sqrt(d1hist * num_buckets) + d1error, 2) / num_buckets
d2upper = np.power(np.sqrt(d2hist * num_buckets) + d2error, 2) / num_buckets
d1lower = np.power(np.sqrt(d1hist * num_buckets) - d1error, 2) / num_buckets
d2lower = np.power(np.sqrt(d2hist * num_buckets) - d2error, 2) / num_buckets
px = np.divide(d1hist, d1size)
py = np.divide(d2hist, d2size)

d1histbound = px * math.exp(self.epsilon) + self.delta
d2histbound = py * math.exp(self.epsilon) + self.delta

d1upper = np.power(np.sqrt(px * num_buckets) + d1_error_interval, 2) / num_buckets
d2upper = np.power(np.sqrt(py * num_buckets) + d2_error_interval, 2) / num_buckets
d1lower = np.power(np.sqrt(px * num_buckets) - d1_error_interval, 2) / num_buckets
d2lower = np.power(np.sqrt(py * num_buckets) - d2_error_interval, 2) / num_buckets

np.maximum(d1lower, 0.0, d1lower)
np.maximum(d2lower, 0.0, d1lower)
d1histbound = d1upper * math.exp(self.epsilon) + self.delta
d2histbound = d2upper * math.exp(self.epsilon) + self.delta
return d1histbound, d2histbound, d1upper, d2upper, d1lower, d2lower

# Check if histogram of fD1 values multiplied by e^epsilon and summed by delta is bounding fD2 and vice versa
def dp_test(self, d1hist, d2hist, binlist, d1error, d2error, debug = False):
d1histbound, d2histbound, d1upper, d2upper, d1lower, d2lower = \
self.get_bounded_histogram(d1hist, d2hist, binlist, d1error, d2error)
d1histupperbound = d1upper * math.exp(self.epsilon) + self.delta
d2histupperbound = d2upper * math.exp(self.epsilon) + self.delta

return px, py, d1histupperbound, d2histupperbound, d1histbound, d2histbound, d1lower, d2lower

# Differentially Private Predicate Test
def dp_test(self, d1hist, d2hist, binlist, d1size, d2size, debug=False, exact=False):
px, py, d1histupperbound, d2histupperbound, d1histbound, d2histbound, d1lower, d2lower = \
self.get_bounded_histogram(d1hist, d2hist, binlist, d1size, d2size, exact)
if(debug):
print("Parameters")
print("epsilon: ", self.epsilon, " delta: ", self.delta)
print("Bins\n", binlist)
print("D1Error\n", d1error)
print("D2Error\n", d2error)
print("Original D1 Histogram\n", d1hist)
print("Probability of D1 Histogram\n", px)
print("D1 Lower\n", d1lower)
print("D1 Upper\n", d1upper)
print("D1 Upper\n", d1histupperbound)
print("D1 Histogram to bound D2\n", d1histbound)
print("Original D2 Histogram\n", d2hist)
print("Probability of D2 Histogram\n", py)
print("D2 Lower\n", d2lower)
print("D2 Upper\n", d2upper)
print("D2 Upper\n", d2histupperbound)
print("D2 Histogram to bound D1\n", d2histbound)
print("Comparison - D1 bound to D2\n", np.greater(d1histbound, d2lower))
print("Comparison - D2 bound to D1\n", np.greater(d2histbound, d1lower))
return np.all(np.greater(d1histbound, d2lower)) and np.all(np.greater(d2histbound, d1lower))
print("Comparison - D2 bound to D1\n", np.greater(d1hist, np.zeros(d1hist.size)), np.logical_and(np.greater(d1hist, np.zeros(d1hist.size)), np.greater(d1lower, d2histupperbound)))
print("Comparison - D1 bound to D2\n", np.greater(d2hist, np.zeros(d2hist.size)), np.logical_and(np.greater(d2hist, np.zeros(d2hist.size)), np.greater(d2lower, d1histupperbound)))

# Check if any of the bounds across the bins violate the relaxed DP condition
bound_exceeded = np.any(np.logical_and(np.greater(d1hist, np.zeros(d1hist.size)), np.greater(d1lower, d2histupperbound))) or \
np.any(np.logical_and(np.greater(d2hist, np.zeros(d2hist.size)), np.greater(d2lower, d1histupperbound)))
return not bound_exceeded

# K-S Two sample test between the repeated query results on neighboring datasets
def ks_test(self, fD1, fD2):
Expand All @@ -202,19 +207,20 @@ def kl_divergence(self, p, q):
def wasserstein_distance(self, d1hist, d2hist):
return stats.wasserstein_distance(d1hist, d2hist)

def aggtest(self, f, colname, repeatcount, numbins = 0, binsize = "auto", debug = False, plot = True, bound = True, exact = False):
def aggtest(self, f, colname, numbins=0, binsize="auto", debug=False, plot=True, bound=True, exact=False):
d1, d2 = self.generate_neighbors()

fD1, fD2 = self.apply_aggregation_neighbors(f, (d1, colname), (d2, colname), repeatcount)

fD1, fD2 = self.apply_aggregation_neighbors(f, (d1, colname), (d2, colname))
d1size, d2size = fD1.size, fD2.size

ks_res = self.ks_test(fD1, fD2)
print("\nKS 2-sample Test Result: ", ks_res, "\n")

#andderson_res = self.anderson_ksamp(fD1, fD2)
#print("Anderson 2-sample Test Result: ", andderson_res, "\n")

d1hist, d2hist, bin_edges, d1error, d2error = \
self.generate_histogram_neighbors(fD1, fD2, numbins, binsize, exact = exact)
d1hist, d2hist, bin_edges = \
self.generate_histogram_neighbors(fD1, fD2, numbins, binsize, exact=exact)

#kl_res = self.kl_divergence(d1hist, d2hist)
#print("\nKL-Divergence Test: ", kl_res, "\n")
Expand All @@ -227,11 +233,11 @@ def aggtest(self, f, colname, repeatcount, numbins = 0, binsize = "auto", debug

dp_res = False
if(not exact):
dp_res = self.dp_test(d1hist, d2hist, bin_edges, d1error, d2error, debug)
dp_res = self.dp_test(d1hist, d2hist, bin_edges, d1size, d2size, debug, exact=exact)
print("DP Predicate Test:", dp_res, "\n")

if(plot):
self.plot_histogram_neighbors(fD1, fD2, d1hist, d2hist, bin_edges, d1error, d2error, bound, exact)
self.plot_histogram_neighbors(fD1, fD2, d1hist, d2hist, bin_edges, d1size, d2size, bound, exact)
return dp_res, ks_res, ws_res

# Main method listing all the DP verification steps
Expand Down
Loading

0 comments on commit f07b868

Please sign in to comment.