\lecture{11}{}{Intuition for the CLT}
\subsection{The Central Limit Theorem}
For $X_1, X_2,\ldots$ i.i.d.~we were looking
at $S_n \coloneqq \sum_{i=1}^n X_i$.
Then the LLN basically states that $S_n$ can be approximated by $n \bE[X_1]$.

\begin{question}
What is the error of this approximation?
\end{question}
We set $\mu\coloneqq \bE[X_1]$ and $\sigma^2 \coloneqq \Var(X_1) \in (0,\infty)$.
We know that $\bE[S_n] = n \mu$ and $\Var(S_n) = n\sigma^2$.
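Indeed, by linearity of expectation and, for the variance, by the independence of the $X_i$,
\[
\bE[S_n] = \sum_{i=1}^n \bE[X_i] = n \mu,
\qquad
\Var(S_n) = \sum_{i=1}^n \Var(X_i) = n \sigma^2.
\]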
The central limit theorem basically states that the distribution of $S_n$
can be approximated by a normal distribution with mean $n \mu$ and
variance $n \sigma^2$,
i.e.~$S_n \approx n \mu + \sigma \sqrt{n} N$ for $N \sim \cN(0,1)$,
where $\approx$ is to be made precise.

For intuition, watch \url{https://3blue1brown.com/lessons/clt}.
\begin{example}
We throw a fair die $n = 100$ times and denote the sum of the faces
by $S_n \coloneqq X_1 + \ldots + X_n$, where $X_1,\ldots, X_n$
are i.i.d.~and uniformly distributed on $\{1,\ldots,6\}$.
Then $\bE[S_n] = 350$ and $\sqrt{\Var(S_n)} = \sigma \sqrt{n} \approx 17.08$.
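Indeed,
\[
\bE[X_1] = \frac{1 + 2 + \dots + 6}{6} = \frac{7}{2},
\qquad
\Var(X_1) = \bE[X_1^2] - \bE[X_1]^2 = \frac{91}{6} - \frac{49}{4} = \frac{35}{12},
\]
so $\bE[S_{100}] = 350$ and $\sqrt{\Var(S_{100})} = 10 \sqrt{\frac{35}{12}} \approx 17.08$.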
\todo{Missing pictures}
\end{example}
\begin{question}
Why do statisticians care about $\sigma$ instead of $\sigma^2$?
\end{question}
By definition, $\Var(X) = \bE[(X- \bE(X))^2]$, hence $\sqrt{\Var(X)}$
can be interpreted as a distance; in particular, $\sigma$ has the same units as $X$.
One could also define the spread of $X$ as $\bE[|X - \bE(X)|]$, but this quantity is not as
well behaved.
\begin{example}
Let $X_1,\ldots,X_n$ be i.i.d.~and $X_1\sim \Exp(1)$.
We know that for $n \in \N$, $\bE[S_n] = n$
and $\sqrt{\Var(S_n)} = \sqrt{n}$.
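This follows since $\bE[X_1] = \Var(X_1) = 1$ for $X_1 \sim \Exp(1)$, hence
\[
\bE[S_n] = n \bE[X_1] = n, \qquad \Var(S_n) = n \Var(X_1) = n.
\]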
For $n = 100, 300, 500$, we get the following picture
\todo{Missing picture}
\end{example}
In order to make things nicer, we do the following:
\begin{enumerate}[1.]
\item center: $S_n - \bE[S_n]$,
\item normalize: $\frac{S_n - \bE[S_n]}{\sqrt{\Var(S_n)} }$.
\end{enumerate}
Then $\bE[\frac{S_n - \bE[S_n]}{\sqrt{\Var(S_n)}}] = 0$
and $\Var(\frac{S_n - \bE[S_n]}{\sqrt{\Var(S_n)}}) = 1$.
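Indeed, by linearity of expectation and since $\Var(a X + b) = a^2 \Var(X)$, the standardized sum $Z_n \coloneqq \frac{S_n - \bE[S_n]}{\sqrt{\Var(S_n)}}$ satisfies
\[
\bE[Z_n] = \frac{\bE[S_n] - \bE[S_n]}{\sqrt{\Var(S_n)}} = 0,
\qquad
\Var(Z_n) = \frac{\Var(S_n)}{\Var(S_n)} = 1.
\]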
\begin{theorem}[Central limit theorem, 1920s, Lindeberg and Lévy]%
\yalabel{Central Limit Theorem}{CLT}{clt}
Let $X_1,X_2,\ldots$ be i.i.d.~random variables
with $\bE[X_1] = \mu$ and $\Var(X_1) = \sigma^2 \in (0, \infty)$.
Let $S_n \coloneqq \sum_{i=1}^n X_i$.
Then
\[
\frac{S_n - n \mu}{\sigma \sqrt{n} } \xrightarrow{\text{d}} \cN(0,1),
\]
i.e.~$\forall x \in \R:$
\[
\lim_{n \to \infty} \bP\left[\frac{S_n - n \mu}{\sigma \sqrt{n} } \le x\right] = \Phi(x) = \int_{-\infty}^x \frac{1}{\sqrt{2 \pi}} e^{-\frac{t^2}{2}} \, dt.
\]
We will abbreviate the central limit theorem by \vocab{CLT}.

A special case of this theorem was proved much earlier:
\begin{theorem}[de Moivre (1730, $p = 0.5$), Laplace (1812, general $p$)]
\label{preclt}
Let $S_n \sim \Bin(n,p)$, where $p \in (0,1)$ is constant.
Then, for all $x \in \R$:
\[
\lim_{n \to \infty} \bP\left[ \frac{ S_n - np}{\sqrt{n p(1-p)}} \le x\right] = \Phi(x).
\]
\end{theorem}
\begin{proof}
Let $X_1, X_2,\ldots$ be i.i.d.~with $X_1 \sim \Ber(p)$.
Then $\bE[X_1] = p$ and $\Var(X_1) = p(1-p)$.
Furthermore $\sum_{i=1}^n X_i \sim \Bin(n,p)$,
and the special case follows from \yaref{clt}.
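Explicitly,
\[
\frac{S_n - np}{\sqrt{n p (1-p)}} = \frac{S_n - n \bE[X_1]}{\sqrt{n \Var(X_1)}} \xrightarrow{\text{d}} \cN(0,1).
\]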
\end{proof}
\yaref{preclt} is a useful tool for approximating the Binomial distribution with the normal distribution.
If $S_n \sim \Bin(n,p)$ and $[a,b] \subseteq \R$, we have
\[\bP[a \le S_n \le b] = \bP\left[\frac{a - np}{\sqrt{np(1-p)}} \le \frac{S_n -np}{\sqrt{n p (1-p)}} \le \frac{b - np}{\sqrt{n p (1-p)} }\right] \approx \Phi(b') - \Phi(a'),\]
where $a' \coloneqq \frac{a - np}{\sqrt{np(1-p)}}$ and $b' \coloneqq \frac{b - np}{\sqrt{np(1-p)}}$.
\begin{example}
We consider $n = 40$ independent Bernoulli trials with success probability $p = \frac{1}{2}$
and denote the number of successes by $S$.
Then $\bP[S \le 25] = 0.9597$, whereas the normal approximation gives $\Phi\left(\frac{25 - 20}{\sqrt{10}}\right) = \Phi\left(\frac{5}{\sqrt{10}}\right) \approx 0.9431$.

However, $S$ takes only integer values, which means $\bP[S \le 25] = \bP[S < 26]$.
With this in mind, a better approximation is
\[
\bP[S \le 25] = \bP[S \le 25.5] \approx \Phi\left( \frac{5.5}{\sqrt{10} } \right) \approx 0.9590.
\]
\end{example}
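This is an instance of the \emph{continuity correction}: for integer-valued $S_n \sim \Bin(n,p)$ and $k \in \Z$, one approximates
\[
\bP[S_n \le k] = \bP\left[S_n \le k + \tfrac{1}{2}\right] \approx \Phi\left(\frac{k + \frac{1}{2} - np}{\sqrt{n p (1-p)}}\right),
\]
which is typically closer to the true value than plugging in $k$ directly.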
\begin{example}
Consider a particle that starts at $0$ and moves on the lattice $\Z$.
In every step, it moves by $+1$ with probability $\frac{1}{2}$
or by $-1$ with probability $\frac{1}{2}$.

More formally: Let $X_1,X_2,\ldots$ be i.i.d.~with $\bP[X_1=1] = \bP[X_1=-1] = \frac{1}{2}$ and consider $S_n \coloneqq \sum_{i=1}^n X_i$.
Then the \yaref{clt} (with $\mu = 0$ and $\sigma^2 = 1$) states that $S_n \approx \cN(0,n)$.
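In particular, for every $x > 0$,
\[
\bP\left[|S_n| \le x \sqrt{n}\right] \xrightarrow{n \to \infty} 2 \Phi(x) - 1,
\]
so the typical distance of the particle from the origin after $n$ steps is of order $\sqrt{n}$.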
\end{example}
\begin{example}
Consider an election with two candidates $A$ and $B$.
The relative number of votes for $A$ is $p \in (0,1)$ (constant, but unknown).
How many ballots do we need to count to ensure that the probability of erring by more than $1\%$ is at most $5\%$?

Each ballot is a vote for $A$ with probability $p$.
We have $S_n \sim \Bin(n,p)$ and we want to find $n$ such that
$\bP[|S_n - np| \le 0.01 n] \ge 0.95$.
We have that
\begin{IEEEeqnarray*}{rCl}
&&\bP[|S_n - np| \le 0.01n] \\
&=& \bP[-0.01 n \le S_n - np \le 0.01n]\\
&=& \bP\left[-\frac{0.01 n}{\sqrt{n p (1-p)}} \le \frac{S_n - np}{\sqrt{n p (1-p)}} \le \frac{0.01 n}{\sqrt{n p (1-p)}}\right]\\
&\approx& \Phi\left(0.01 \sqrt{\frac{n}{p(1-p)}}\right) - \Phi\left(-0.01 \sqrt{\frac{n}{p(1-p)}}\right)\\
&=& 2\Phi\left(0.01 \sqrt{\frac{n}{p(1-p)}}\right) - 1.
\end{IEEEeqnarray*}
Hence, we want $2\Phi(0.01 \sqrt{\frac{n}{p(1-p)}}) - 1 \ge 0.95$,
i.e.~$\Phi(0.01 \sqrt{\frac{n}{p(1-p)}}) \ge 0.975 \approx \Phi(1.96)$,
which amounts to $n \ge (1.96)^2 \cdot 100^2 \cdot p (1-p)$.
We have $p (1-p) \le \frac{1}{4}$,
thus $n = (1.96)^2 \cdot 100^2 \cdot \frac{1}{4} = 9604$ suffices.
\end{example}