# 局部敏感度

• 定义局部敏感度并解释其与全局敏感度的区别

• 描述局部敏感度是如何泄露数据信息的

• 使用“建议-测试-发布”框架来安全地使用局部敏感度

• 描述平滑敏感度框架

• 使用“采样-聚合”框架来回复敏感度为任意值的问询

$$LS(f, x) = \max_{x': d(x,x') \leq 1} \lvert f(x) - f(x') \rvert \tag{15}$$

## 均值问询的局部敏感度

$$f(x) = \frac{\sum_{i=1}^{n} x_i}{n} \tag{16}$$

$$\begin{aligned}
\lvert f(x') - f(x) \rvert =& \bigg\lvert \frac{\sum_{i=1}^{n} x_i + u}{n+1} - \frac{\sum_{i=1}^{n} x_i}{n} \bigg\rvert \\
\leq& \bigg\lvert \frac{\sum_{i=1}^{n} x_i + u}{n+1} - \frac{\sum_{i=1}^{n} x_i}{n+1} \bigg\rvert \\
=& \bigg\lvert \frac{\sum_{i=1}^{n} x_i + u - \sum_{i=1}^{n} x_i}{n+1} \bigg\rvert \\
=& \bigg\lvert \frac{u}{n+1} \bigg\rvert
\end{aligned} \tag{17}$$

## 通过局部敏感度实现差分隐私？

$$F(x) = f(x) + \mathsf{Lap}\left(\frac{LS(f,x)}{\epsilon}\right) \tag{18}$$

$$\lvert x \rvert = \frac{u}{LS(f, x)} - 1 \tag{19}$$

### “建议-测试-发布”框架

$$A(f,x,k) = \max_{y: d(x,y) \leq k} LS(f, y) \tag{20}$$

$$D(f, x, b) = \text{argmin}_k A(f, x, k) > b \tag{21}$$

1. 建议一个局部敏感度的目标边界$$b$$

2. 如果$$D(f, x, b) + \mathsf{Lap}(\frac{1}{\epsilon}) < \frac{\log(2/\delta)}{2\epsilon}$$，返回$$\bot$$

3. 否则，返回$$f(x) + \mathsf{Lap}\left(\frac{b}{\epsilon}\right)$$

Hide code cell source
def ls_at_distance(df, u, k):
    """Return the local sensitivity of the clipped-mean query at distance k.

    Evaluates ``|u / (len(df) - k + 1)|``.

    Args:
        df: Sized collection; only ``len(df)`` is used.
        u: Upper clipping bound of the data.
        k: Distance from the original dataset.

    Returns:
        The absolute value of ``u`` divided by ``len(df) - k + 1``.
    """
    denominator = len(df) - k + 1
    return np.abs(u / denominator)

def dist_to_high_ls(df, u, b):
    """Return the smallest k >= 0 whose local sensitivity is not below b.

    Starting from k = 0, k is increased one step at a time until
    ``ls_at_distance(df, u, k) < b`` no longer holds.

    Args:
        df: Dataset forwarded to ``ls_at_distance``.
        u: Upper clipping bound forwarded to ``ls_at_distance``.
        b: Proposed bound on the local sensitivity.

    Returns:
        The first k at which the loop condition fails.
    """
    k = 0
    while True:
        # Negated "<" rather than ">=" so the stopping rule matches the
        # original comparison exactly for every value, NaN included.
        if not (ls_at_distance(df, u, k) < b):
            return k
        k += 1

Hide code cell source
def ptr_avg(df, u, b, epsilon, delta, logging=False):
    """Answer a mean query with the propose-test-release framework.

    The data is clipped from above at ``u``. The distance to a dataset whose
    local sensitivity reaches the proposed bound ``b`` is computed, perturbed
    via ``laplace_mech(..., 1, epsilon)`` and compared against
    ``log(2/delta) / (2*epsilon)``.

    Args:
        df: Data supporting ``.clip(upper=...)`` and ``.mean()``.
        u: Upper clipping bound.
        b: Proposed bound on the local sensitivity.
        epsilon: Privacy parameter passed to ``laplace_mech``.
        delta: Privacy parameter used in the threshold.
        logging: When true, print the noisy distance and the threshold.

    Returns:
        ``laplace_mech(clipped mean, b, epsilon)`` when the noisy distance is
        at least the threshold, otherwise ``None``.
    """
    clipped = df.clip(upper=u)
    distance = dist_to_high_ls(clipped, u, b)

    noisy_distance = laplace_mech(distance, 1, epsilon)
    threshold = np.log(2 / delta) / (2 * epsilon)

    if logging:
        print(f"噪声距离为{noisy_distance}，而门限值为{threshold}")

    # Test failed: refuse to answer.
    if not (noisy_distance >= threshold):
        return None

    return laplace_mech(clipped.mean(), b, epsilon)

# Parameters for the propose-test-release demo on the 'Age' column of `adult`.
df = adult['Age']
u = 100                    # upper bound on age used for clipping
epsilon = 1                # privacy parameter ε = 1
delta = 1/(len(df)**2)     # δ = 1/n^2, with n = len(df)
b = 0.005                  # proposed bound on the local sensitivity

ptr_avg(df, u, b, epsilon, delta, logging=True)

噪声距离为12562.509072913877，而门限值为10.73744412245554

38.56939238631368


Hide code cell source
def gs_avg(df, u, epsilon):
    """Answer a mean query as a noisy sum divided by a noisy count.

    The data is clipped from above at ``u``. The sum and the count are each
    passed through ``laplace_mech`` with half of ``epsilon``; the sum uses
    ``u`` and the count uses ``1`` as the second argument.

    Args:
        df: Data supporting ``.clip(upper=...)``, ``.sum()`` and ``len``.
        u: Upper clipping bound.
        epsilon: Total privacy parameter, split evenly between both releases.

    Returns:
        The noisy sum divided by the noisy count.
    """
    half_epsilon = .5 * epsilon
    clipped = df.clip(upper=u)

    # The sum is released before the count, as in the original ordering.
    noisy_total = laplace_mech(clipped.sum(), u, half_epsilon)
    noisy_n = laplace_mech(len(clipped), 1, half_epsilon)

    return noisy_total / noisy_n

Hide code cell source
# Mean of adult['Age'] via gs_avg, using the module-level u and epsilon.
gs_avg(adult['Age'], u, epsilon)

38.592278780419505

Hide code cell source
# Run each mechanism 100 times and pass every answer, together with the
# non-private mean of adult['Age'], to pct_error (defined elsewhere).
# NOTE(review): ptr_avg returns None when its test fails; confirm that
# pct_error tolerates a None answer.
gs_results  = [pct_error(np.mean(adult['Age']), gs_avg(df, u, epsilon)) for i in range(100)]
ptr_results = [pct_error(np.mean(adult['Age']), ptr_avg(df, u, b, epsilon, delta)) for i in range(100)]

# Reuse the bin edges of the first histogram so both are drawn on the same bins.
_, bins, _ = plt.hist(gs_results, label='全局敏感度');
plt.hist(ptr_results, alpha=.7, label='"建议-测试-发布"框架', bins=bins);
plt.xlabel('误差率')
plt.ylabel('尝试次数')
plt.legend();


## 平滑敏感度

1. 设置$$\beta = \frac{\epsilon}{2\log(2/\delta)}$$

2. $$S = \max_{k = 0, \dots, n} e^{-\beta k} A(f, x, k)$$

3. 发布$$f(x) + \mathsf{Lap}\left(\frac{2S}{\epsilon}\right)$$

Hide code cell source
# Smooth-sensitivity demo for the mean of the 'Age' column of `adult`.
df = adult['Age']
epsilon = 1           # privacy parameter ε = 1
delta = 1/len(df)**2  # δ = 1/n^2, with n = len(df)

# Step 1: set β
beta = epsilon / (2*np.log(2/delta))

# Step 2: compute the smoothed local sensitivity for different values of k.
# Only k = 0..199 is evaluated; `u` is the module-level bound set earlier.
r = [np.exp(- beta * k) * ls_at_distance(df, u, k) for k in range(0,200)]
plt.plot(r);
plt.xlabel('k的取值')
plt.ylabel('平滑后的局部敏感度');

# The largest smoothed value is S; the sensitivity used is 2*S.
S = np.max(r)
sensitivity = 2*S
print(f'最终敏感度: {sensitivity}')

最终敏感度: 0.006142128861863522


## “采样-聚合”框架

1. 将数据集$$X \in D$$拆分为$$k$$个不相交的数据块$$x_1, \dots, x_k$$

2. 计算每个数据块的裁剪回复值：$$a_i = \max(l, \min(u, f(x_i)))$$

3. 计算平均回复值并增加噪声：$$A = \left(\frac{1}{k} \sum_{i=1}^k a_i \right) + \mathsf{Lap}\left(\frac{u - l}{k\epsilon}\right)$$

Hide code cell content
def f(df):
    """Query evaluated on each chunk: return ``df.mean()``."""
    chunk_mean = df.mean()
    return chunk_mean

def saa_avg_age(k, epsilon, logging=False):
    """Answer the query ``f`` on the module-level ``df`` via sample-and-aggregate.

    The data is split into disjoint chunks, ``f`` is evaluated on every chunk,
    each answer is clipped to the range [20, 80], and the mean of the clipped
    answers is released through ``laplace_mech``.

    Args:
        k: Requested number of chunks.
        epsilon: Privacy parameter passed to ``laplace_mech``.
        logging: When true, print the chunk size.

    Returns:
        The noisy mean of the clipped per-chunk answers.
    """
    # Number of rows per chunk, rounded up so that at most k chunks result.
    chunk_size = int(np.ceil(df.shape[0] / k))

    if logging:
        print(f'数据块大小: {chunk_size}')

    # Step 1: split df into disjoint chunks.
    xs = [df[i:i + chunk_size] for i in range(0, df.shape[0], chunk_size)]

    # Step 2: run f on every chunk and clip each answer to [lower, upper].
    answers = [f(x_i) for x_i in xs]

    upper = 80
    lower = 20
    clipped_answers = np.clip(answers, lower, upper)

    # Step 3: average the clipped answers and add noise. One row affects a
    # single chunk, so the mean changes by at most (upper - lower) / #chunks.
    # Rounding chunk_size up can yield fewer than k chunks, so the noise is
    # calibrated to the number of chunks actually produced.
    sensitivity = (upper - lower) / len(clipped_answers)
    noisy_mean = laplace_mech(np.mean(clipped_answers), sensitivity, epsilon)
    return noisy_mean

# Call sample-and-aggregate with k = 600 and ε = 1, printing the chunk size.
saa_avg_age(600, 1, logging=True)

数据块大小: 55

38.74748083041064


Hide code cell content
def plot_results(k):
    """Overlay error histograms of ``saa_avg_age(k, ...)`` and ``gs_avg``.

    Each mechanism is run 100 times with the module-level ``epsilon``; every
    answer is passed with ``np.mean(df)`` to ``pct_error``. The second
    histogram reuses the bin edges of the first.

    Args:
        k: Number of chunks forwarded to ``saa_avg_age``.
    """
    saa_errors = [
        pct_error(np.mean(df), saa_avg_age(k, epsilon))
        for _trial in range(100)
    ]
    _, bins, _ = plt.hist(saa_errors)

    gs_errors = [
        pct_error(np.mean(df), gs_avg(df, u, epsilon))
        for _trial in range(100)
    ]
    plt.hist(gs_errors, alpha=.7, bins=bins)

Hide code cell source
# k = 10: the global-sensitivity answers are *much* more accurate
plot_results(10)
plt.xlabel('误差率')
plt.ylabel('尝试次数')

Text(0, 0.5, '尝试次数')

Hide code cell source
# k = 1000: the global-sensitivity answers are still more accurate
plot_results(1000)
plt.xlabel('误差率')
plt.ylabel('尝试次数')

Text(0, 0.5, '尝试次数')

Hide code cell source
# k = 6000: the sample-and-aggregate answers are now close to the
# global-sensitivity answers!
plot_results(6000)
plt.xlabel('误差率')
plt.ylabel('尝试次数')

Text(0, 0.5, '尝试次数')