import numpy as np
import pandas as pd
# Draw the original sample: n independent observations from a standard normal.
n = 10  # Original sample size
# loc=0.0 and scale=1.0 are np.random.normal's defaults, spelled out here.
x = np.random.normal(loc=0.0, scale=1.0, size=n)
print(x)
[ 1.11111278 0.21678038 0.45039085 0.27968718 0.71988904 -0.73534517 1.44172741 0.88015123 -0.20238122 0.76390085]
Let's investigate the sampling error (standard deviation) of the sample mean $\bar{x} = \frac{1}{n}\sum_{i=1}^{n} x_i$
# Plug-in estimate of the standard error of the sample mean: sigma_hat / sqrt(n).
sqrt_n = np.sqrt(n)
x_mean = x.mean()
x_std = x.std()  # sigma hat
x_mean_std = x_std / sqrt_n
# x is drawn from N(0, 1), so the true standard error of the mean is 1 / sqrt(n).
print("true_error: ", 1 / sqrt_n)
print("est_error: ", x_mean_std)
true_error: 0.31622776601683794 est_error: 0.1916481020854632
Bootstrap error:
B = 10000  # number of bootstrap resamples
# Each resample draws n values from x; np.random.choice samples with
# replacement by default, which is what the bootstrap requires.
boot = [np.random.choice(x, n) for _ in range(B)]
boot[0:3]
[array([-0.20238122, 1.44172741, 0.45039085, -0.20238122, 0.45039085, -0.73534517, -0.20238122, 0.45039085, -0.73534517, 0.88015123]), array([ 0.27968718, -0.20238122, 0.21678038, 1.11111278, 1.11111278, 0.71988904, 0.27968718, 0.76390085, 1.11111278, 0.88015123]), array([0.76390085, 0.21678038, 0.88015123, 0.21678038, 0.45039085, 0.88015123, 0.21678038, 0.45039085, 0.21678038, 1.44172741])]
# Mean of every bootstrap resample, collected into one float array.
boot_mean = np.array([np.mean(resample) for resample in boot])
boot_mean[0:3]
array([0.15952172, 0.6271053 , 0.57338339])
# Bootstrapped error estimation: the standard deviation of the resampled
# means, i.e. the root of their mean squared deviation over the B resamples.
boot_dev = boot_mean - np.mean(boot_mean)
x_mean_std_boot = np.sqrt(np.sum(boot_dev ** 2) / B)
👆 It should be similar to x_mean_std
# Show the analytic estimate next to the bootstrap estimate.
for _label, _value in (
    ("est_error: ", x_mean_std),
    ("bootstrapped_error: ", x_mean_std_boot),
):
    print(_label, _value)
est_error: 0.1916481020854632 bootstrapped_error: 0.19237618312744797
When n is small, x is not a good sample from N(0,1), so both estimates can be far from the true error:
# True error, analytic estimate and bootstrap estimate, in that order.
for _label, _value in (
    ("true_error: ", 1/np.sqrt(n)),
    ("est_error: ", x_mean_std),
    ("bootstrapped_error: ", x_mean_std_boot),
):
    print(_label, _value)
true_error: 0.31622776601683794 est_error: 0.1916481020854632 bootstrapped_error: 0.19237618312744797
The difference between the first and the second is determined by n (how well the sample represents N(0,1)); the difference between the second and the third is determined by B (how many bootstrap resamples are drawn).
When n is large, and when our sampling procedure is good (iid in our case), all three of the above will be close.
# Repeat the experiment with a large sample: true error, analytic estimate
# and bootstrap estimate of the standard error of the sample mean.
n = 10000  # Original sample size
x = np.random.normal(size=n)  # Normal(0,1) distribution, n samples

x_mean = np.mean(x)
x_std = np.std(x)  # sigma hat
x_mean_std = x_std / np.sqrt(n)  # analytic estimate: sigma_hat / sqrt(n)

B = 10000  # number of bootstrap resamples
# Reduce each resample to its mean as soon as it is drawn instead of keeping
# every resample in a list: B * n float64 values would occupy about 800 MB.
# The sequence of np.random.choice calls is unchanged, so for a given random
# state the resulting means are identical.
boot_mean = np.array([np.mean(np.random.choice(x, n)) for _ in range(B)])

# Bootstrapped error estimation: standard deviation of the resampled means.
x_mean_std_boot = np.sqrt(np.sum((boot_mean - np.mean(boot_mean)) ** 2) / B)

print("true_error: ", 1 / np.sqrt(n))
print("est_error: ", x_mean_std)
print("bootstrapped_error: ", x_mean_std_boot)
true_error: 0.01 est_error: 0.009997411955909488 bootstrapped_error: 0.010056950877825977