# 稀疏向量技术#

• 描述稀疏向量技术，以及使用此技术的原因

• 定义和实现”高于阈值”算法

• 在迭代算法中应用稀疏向量技术

## 高于阈值算法#

import random

# 满足ε-差分隐私
def above_threshold(queries, df, T, epsilon):
T_hat = T + np.random.laplace(loc=0, scale = 2/epsilon)
for idx, q in enumerate(queries):
nu_i = np.random.laplace(loc=0, scale = 4/epsilon)
if q(df) + nu_i >= T_hat:
return idx
return random.randint(0,len(queries)-1)

def above_threshold_fail_signal(queries, df, T, epsilon):
T_hat = T + np.random.laplace(loc=0, scale = 2/epsilon)

for idx, q in enumerate(queries):
nu_i = np.random.laplace(loc=0, scale = 4/epsilon)
if q(df) + nu_i >= T_hat:
return idx
# 返回一个无效的问询索引号
return None


AboveThreshold（近似）返回queries中回复结果超过阈值的第一个问询所对应的索引号。算法之所以满足差分隐私，是因为算法有可能返回错误的索引号，索引号对应的问询回复结果有可能未超过给定阈值，索引号对应的问询有可能不是第一个回复结果超过阈值的问询。

# 满足|queries|*ε-差分隐私
def naive_above_threshold(queries, df, T, epsilon):
for idx, q in enumerate(queries):
nu_i = np.random.laplace(loc=0, scale = 1/epsilon)
if q(df) + nu_i >= T:
return idx
return None


## 应用稀疏向量技术#

def age_sum_query(df, b):
return df['Age'].clip(lower=0, upper=b).sum()


913809


def naive_select_b(query, df, epsilon):
bs = range(1, 1000, 10)
best = 0
threshold = 10
epsilon_i = epsilon / len(bs)

for b in bs:
r = laplace_mech(query(df, b), b, epsilon_i)

# 如果新的求和结果与旧的求和结果很接近，则停止
if r - best <= threshold:
return b
# 否则，将"最佳"求和结果更新为当前求和结果
else:
best = r

return bs[-1]


81


Hide code cell source
bs = range(1,150,5)
query_results = [age_sum_query(adult, b) - age_sum_query(adult, b + 1) for b in bs]
plt.xlabel('"b"的值')
plt.ylabel('问询输出变化')
plt.plot(query_results);


def create_query(b):
return lambda df: age_sum_query(df, b) - age_sum_query(df, b + 1)

bs = range(1,150,5)
queries = [create_query(b) for b in bs]
epsilon = .1


91


Hide code cell source
plt.xlabel('备选"b"的值')
plt.ylabel('出现次数')
plt.hist([bs[above_threshold(queries, adult, 0, epsilon)] for i in range(20)]);


def auto_avg(df, epsilon):
def create_query(b):
return lambda df: df.clip(lower=0, upper=b).sum() - df.clip(lower=0, upper=b+1).sum()

# 构造问询流
bs = range(1,150000,5)
queries = [create_query(b) for b in bs]

# 使用1/3的隐私预算执行AboveThreshold，得到一个好的裁剪参数
epsilon_svt = epsilon / 3
final_b = bs[above_threshold(queries, df, 0, epsilon_svt)]

# 分别使用1/3的隐私预算来获得噪声求和值与噪声计数值
epsilon_sum = epsilon / 3
epsilon_count = epsilon / 3

noisy_sum = laplace_mech(df.clip(lower=0, upper=final_b).sum(), final_b, epsilon_sum)
noisy_count = laplace_mech(len(df), 1, epsilon_count)

return noisy_sum/noisy_count


38.58032933605027


auto_avg(adult['Capital Gain'], 1)

1074.7076376917673


## 返回多个问询结果#

1. 从问询流$$qs = \{q_1, \dots, q_k\}$$开始

2. 在问询流$$qs$$上执行AboveThreshold，得到第一个超过阈值的问询索引号$$i$$

3. 使用$$qs = \{q_{i+1}, \dots, q_k\}$$（即问询流的剩余部分）重启算法（转到步骤(1)）

def sparse(queries, df, c, T, epsilon):
idxs = []
pos = 0
epsilon_i = epsilon / c

# 如果我们执行完问询流中的所有问询，或者我们找到了c个超过阈值的问询回复，则停止
while pos < len(queries) and len(idxs) < c:
# 执行AboveThreshold，寻找下一个超过阈值的问询回复
next_idx = above_threshold_fail_signal(queries[pos:], df, T, epsilon_i)

# 如果AboveThreshold执行完了最后一个问询，则返回所有超过阈值的问询索引号
if next_idx == None:
return idxs

# 否则，更新pos，使其指向问询流中剩余的问询
pos = next_idx+pos
# 更新idxs，添加AboveThreshold找到的问询索引号
idxs.append(pos)
# 移动到问询流中的下一个问询
pos = pos + 1

return idxs

epsilon = 1

[18, 21, 24]


## 应用：范围问询#

def age_range_query(df, lower, upper):
df1 = df[df['Age'] > lower]
return len(df1[df1['Age'] < upper])

def create_age_range_query():
lower = np.random.randint(30, 50)
upper = np.random.randint(lower, 70)
return lambda df: age_range_query(df, lower, upper)

range_queries = [create_age_range_query() for i in range(10)]
results = [q(adult) for q in range_queries]
results

[0, 2903, 14408, 4345, 8372, 17629, 12919, 0, 1675, 13380]


def range_query_svt(queries, df, c, T, epsilon):
# 首先，执行sparse，得到"有价值的"问询
sparse_epsilon = epsilon / 2
indices = sparse(queries, adult, c, T, sparse_epsilon)

# 所有，为每个"有价值的"问询执行拉普拉斯机制
laplace_epsilon = epsilon / (2*c)
results = [laplace_mech(queries[i](df), 1, laplace_epsilon) for i in indices]
return results

range_query_svt(range_queries, adult, 5, 10000, 1)

[14417.524053935005, 17621.35957591745, 12933.616763821363, 13382.45145929881]