491 lines
18 KiB
TeX
491 lines
18 KiB
TeX
\documentclass[12pt]{report}
|
|
|
|
%%%%% title %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
\title{\tr{Introduction to Scientific Computing}{Einf\"uhrung in die wissenschaftliche Datenverarbeitung}}
|
|
\author{Jan Benda\\Abteilung Neuroethologie\\[2ex]\includegraphics[width=0.3\textwidth]{UT_WBMW_Rot_RGB}}
|
|
\date{WS 15/16}
|
|
|
|
%%%% language %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
% \newcommand{\tr}[2]{#1} % en
|
|
% \usepackage[english]{babel}
|
|
% Language switch: \tr{<English text>}{<German text>} expands to its second
% (German) argument only; the commented-out variant above selects English.
\newcommand{\tr}[2]{#2} % de
|
|
\usepackage[german]{babel}
|
|
|
|
%%%%% packages %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
\usepackage{pslatex} % nice font for pdf file
|
|
\usepackage[breaklinks=true,bookmarks=true,bookmarksopen=true,pdfpagemode=UseNone,pdfstartview=FitH,colorlinks=true,citecolor=blue]{hyperref}
|
|
|
|
%%%% layout %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
\usepackage[left=25mm,right=25mm,top=20mm,bottom=30mm]{geometry}
|
|
\setcounter{tocdepth}{1}
|
|
|
|
%%%% graphics %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
\usepackage{graphicx}
|
|
\usepackage{xcolor}
|
|
% \texpicture{<name>}: read the file <name>.tex and typeset its contents
% in small sans-serif type; the font change is confined to a group.
\newcommand{\texpicture}[1]{%
  {\sffamily\small\input{#1.tex}}%
}
|
|
|
|
%%%%% listings %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
\usepackage{listings}
|
|
% Global settings for all code listings: files are read from ../code and
% highlighted as Matlab.
% The caption key calls \filename@parse / \filename@base, LaTeX kernel macros
% whose names contain `@'.  They are only tokenized correctly while `@' is a
% letter, hence the \makeatletter ... \makeatother around \lstset.
% The caption shows the name of the listed file without its extension.
\makeatletter
\lstset{
  inputpath=../code,
  basicstyle=\ttfamily\footnotesize,
  numbers=left,
  showstringspaces=false,
  language=Matlab,
  commentstyle=\itshape\color{darkgray},
  keywordstyle=\color{blue},
  stringstyle=\color{green},
  backgroundcolor=\color{blue!10},
  breaklines=true,
  breakautoindent=true,
  columns=flexible,
  frame=single,
  caption={\protect\filename@parse{\lstname}\protect\filename@base},
  captionpos=t,
  xleftmargin=1em,
  xrightmargin=1em,
  aboveskip=10pt
}
\makeatother
|
|
|
|
%%%%% math stuff: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
\usepackage{amsmath}
|
|
\usepackage{bm}
|
|
\usepackage{dsfont}
|
|
% Number sets in double-stroke type (dsfont's \mathds).
% Superscripts and subscripts are placed outside \mathds: the dsfont math
% alphabet has no digit `0', so a subscript 0 set inside \mathds would be
% lost.  The outer braces keep each symbol a single unit, so it can still
% take further scripts.
\newcommand{\naZ}{\mathds{N}}             % N
\newcommand{\gaZ}{\mathds{Z}}             % Z
\newcommand{\raZ}{\mathds{Q}}             % Q
\newcommand{\reZ}{\mathds{R}}             % R
\newcommand{\reZp}{{\mathds{R}^{+}}}      % R with superscript +
\newcommand{\reZpN}{{\mathds{R}^{+}_{0}}} % R with superscript + and subscript 0
\newcommand{\koZ}{\mathds{C}}             % C
|
|
|
|
|
|
%%%%% structure: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
\usepackage{ifthen}
|
|
|
|
% \code{<text>}: typeset <text> in typewriter type (used for function and
% program names in running text).
\newcommand{\code}[1]{\texttt{#1}}
|
|
|
|
% \source{<url>}: print <url> as a small, gray, right-aligned source note.
\newcommand{\source}[1]{%
  \begin{flushright}%
    \color{gray}\scriptsize \url{#1}%
  \end{flushright}%
}
|
|
|
|
% definition environment: \begin{definition}[<term>] ... \end{definition}
% Prints a bold, unindented "Definition" heading, followed by the optional
% <term> (if non-empty) and a colon, then starts the body on a new line.
% Vertical space is added before and after.
\newenvironment{definition}[1][]%
  {\medskip\noindent\textbf{Definition}%
   \ifthenelse{\equal{#1}{}}{}{ #1}:\newline}%
  {\medskip}
|
|
|
|
% Switch for the solution listings: with "yes" the file named in the optional
% argument of an exercise is printed at the end of that exercise.
\newcommand{\showlisting}{yes}
%\newcommand{\showlisting}{no}
% Running exercise number: starts at 1 and is stepped right after the
% heading has been printed.
\newcounter{theexercise}
\setcounter{theexercise}{1}
% exercise environment: \begin{exercise}[<file>] ... \end{exercise}
% Prints a bold heading (Exercise / Uebung, selected by \tr) with the current
% number, then the body.  <file> is remembered in \exercisesource; at the end
% it is listed with \lstinputlisting if it is non-empty and \showlisting is
% "yes".
\newenvironment{exercise}[1][]%
  {\medskip\noindent
   \textbf{\tr{Exercise}{\"Ubung} \arabic{theexercise}:} %
   \stepcounter{theexercise}\newline
   \newcommand{\exercisesource}{#1}}%
  {\ifthenelse{\equal{\exercisesource}{}}{}{%
     \ifthenelse{\equal{\showlisting}{yes}}%
       {\medskip\lstinputlisting{\exercisesource}}{}}%
   \medskip}
|
|
|
|
\graphicspath{{figures/}}
|
|
|
|
|
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
\begin{document}
|
|
|
|
\maketitle
|
|
|
|
%\tableofcontents
|
|
|
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
\chapter{\tr{Descriptive statistics}{Deskriptive Statistik}}
|
|
|
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
\section{Statistics of real-valued data}
|
|
|
|
\begin{itemize}
|
|
\item Location, central tendency
|
|
\begin{itemize}
|
|
\item arithmetic mean
|
|
\item median
|
|
\item mode
|
|
\end{itemize}
|
|
\item Spread, dispersion
|
|
\begin{itemize}
|
|
\item variance
|
|
\item standard deviation
|
|
\item interquartile range
|
|
\item coefficient of variation
|
|
\item minimum, maximum
|
|
\end{itemize}
|
|
\item Shape
|
|
\begin{itemize}
|
|
\item skewness
|
|
\item kurtosis
|
|
\end{itemize}
|
|
\item Dependence
|
|
\begin{itemize}
|
|
\item Pearson correlation coefficient
|
|
\item Spearman's rank correlation coefficient
|
|
\end{itemize}
|
|
\end{itemize}
|
|
|
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
\subsection{Median, Quartile, Percentile}
|
|
|
|
\begin{figure}[t]
|
|
\includegraphics[width=1\textwidth]{median}
|
|
\caption{\label{medianfig} Median.}
|
|
\end{figure}
|
|
|
|
\begin{definition}[\tr{median}{Median}]
|
|
\tr{Half of the observations $X=(x_1, x_2, \ldots, x_n)$ are
|
|
larger than the median and half of them are smaller than the
|
|
median.} {Der Median teilt eine Liste von Messwerten so in zwei
|
|
H\"alften, dass die eine H\"alfte der Daten nicht gr\"o{\ss}er
|
|
und die andere H\"alfte nicht kleiner als der Median ist.}
|
|
\end{definition}
|
|
|
|
\begin{exercise}[mymedian.m]
|
|
\tr{Write a function that computes the median of a vector.}
|
|
{Schreibe eine Funktion, die den Median eines Vektors zur\"uckgibt.}
|
|
\end{exercise}
|
|
|
|
\code{matlab} stellt die Funktion \code{median()} zur Berechnung des Medians bereit.
|
|
|
|
\begin{exercise}[checkmymedian.m]
|
|
\tr{Write a script that tests whether your median function really
|
|
returns a median above which are the same number of data as
|
|
below. In particular the script should test data vectors of
|
|
different length.} {Schreibe ein Skript, das testet ob die
|
|
\code{mymedian} Funktion wirklich die Zahl zur\"uckgibt, \"uber
|
|
der genauso viele Datenwerte liegen wie darunter. Das Skript sollte
|
|
insbesondere verschieden lange Datenvektoren testen.}
|
|
\end{exercise}
|
|
|
|
\begin{figure}[t]
|
|
\includegraphics[width=1\textwidth]{quartile}
|
|
\caption{\label{quartilefig} Median und Quartile.}
|
|
\end{figure}
|
|
|
|
\begin{definition}[\tr{quartile}{Quartile}]
|
|
Die Quartile Q1, Q2 und Q3 unterteilen die Daten in vier gleich
|
|
gro{\ss}e Gruppen, die jeweils ein Viertel der Daten enthalten.
|
|
Das mittlere Quartil entspricht dem Median.
|
|
\end{definition}
|
|
|
|
\begin{exercise}[quartiles.m]
|
|
\tr{Write a function that computes the first, second, and third quartile of a vector.}
|
|
{Schreibe eine Funktion, die das erste, zweite und dritte Quartil als Vektor zur\"uckgibt.}
|
|
\end{exercise}
|
|
|
|
\subsection{Histogram}
|
|
|
|
Histogramme z\"ahlen die H\"aufigkeit $n_i$ des Auftretens von
|
|
$N=\sum_{i=1}^M n_i$ Messwerten in $M$ Messbereichsklassen $i$ (Bins).
|
|
Die Klassen unterteilen den Wertebereich meist in angrenzende und
|
|
gleich gro{\ss}e Intervalle. Histogramme sch\"atzen die
|
|
Wahrscheinlichkeitsverteilung der Messwerte ab.
|
|
|
|
\begin{exercise}[rollthedie.m]
|
|
\tr{Write a function that simulates rolling a die $n$ times.}
|
|
{Schreibe eine Funktion, die das $n$-malige W\"urfeln mit einem W\"urfel simuliert.}
|
|
\end{exercise}
|
|
|
|
\begin{exercise}[diehistograms.m]
|
|
\tr{Plot histograms from rolling the die 20, 100, 1000 times. Use
|
|
the plain hist(x) function, force 6 bins via hist( x, 6 ), and set
|
|
meaningful bin positions.} {Plotte Histogramme von 20, 100, und
|
|
1000-mal w\"urfeln. Benutze \code{hist(x)}, erzwinge sechs Bins
|
|
mit \code{hist(x,6)}, und setze selbst sinnvolle Bins. Normiere
|
|
anschlie{\ss}end das Histogramm auf geeignete Weise.}
|
|
\end{exercise}
|
|
|
|
\begin{figure}[t]
|
|
\includegraphics[width=1\textwidth]{diehistograms}
|
|
\caption{\label{diehistogramsfig} \tr{Histograms of rolling a die
|
|
100 or 500 times. Left: plain histograms counting the frequency
|
|
of the six possible outcomes. Right: the same data normalized
|
|
to their sum.}{Histogramme des Ergebnisses von 100 oder 500 mal
|
|
W\"urfeln. Links: das absolute Histogramm z\"ahlt die Anzahl des
|
|
Auftretens jeder Augenzahl. Rechts: Normiert auf die Summe des
|
|
Histogramms werden die beiden Messungen vergleichbar.}}
|
|
\end{figure}
|
|
|
|
Bei ganzzahligen Messdaten (z.B. die Augenzahl eines W\"urfels)
|
|
kann f\"ur jede auftretende Zahl eine Klasse definiert werden.
|
|
Damit die H\"ohe der Histogrammbalken unabh\"angig von der Anzahl der Messwerte wird,
|
|
normiert man das Histogramm auf die Anzahl der Messwerte.
|
|
Die H\"ohe der Histogrammbalken gibt dann die Wahrscheinlichkeit $P(x_i)$
|
|
des Auftretens der Gr\"o{\ss}e $x_i$ in der $i$-ten Klasse an
|
|
\[ P(x_i) = \frac{n_i}{N} = \frac{n_i}{\sum_{j=1}^M n_j} \; . \]
|
|
|
|
|
|
\subsection{Probability density function}
|
|
|
|
Meistens haben wir es jedoch mit reellen Messgr\"o{\ss}en zu tun.
|
|
|
|
\begin{exercise}[gaussianbins.m]
|
|
\tr{Draw 100 random data from a Gaussian distribution and plot
|
|
histograms with different bin sizes of the data.} {Ziehe 100
|
|
normalverteilte Zufallszahlen und erzeuge Histogramme mit
|
|
unterschiedlichen Klassenbreiten. Was f\"allt auf?}
|
|
\end{exercise}
|
|
|
|
\begin{figure}[t]
|
|
\includegraphics[width=1\textwidth]{pdfhistogram}
|
|
\caption{\label{pdfhistogramfig} \tr{Histograms of normally
|
|
distributed data with different bin sizes.}{Histogramme mit
|
|
verschiedenen Klassenbreiten eines Datensatzes von
|
|
normalverteilten Messwerten. Links: Die H\"ohe des absoluten
|
|
Histogramms h\"angt von der Klassenbreite ab. Rechts: Bei auf
|
|
das Integral normierten Histogrammen werden auch
|
|
unterschiedliche Klassenbreiten vergleichbar.}}
|
|
\end{figure}
|
|
|
|
Histogramme von reellen Messwerten m\"ussen auf das Integral 1 normiert werden, so dass
|
|
das Integral (nicht die Summe) \"uber das Histogramm eins ergibt. Das Integral
|
|
ist die Fl\"ache des Histogramms. Diese setzt sich zusammen aus der Fl\"ache der einzelnen
|
|
Histogrammbalken. Diese haben die H\"ohe $n_i$ und die Breite $\Delta x$. Die Gesamtfl\"ache
|
|
$A$ des Histogramms ist also
|
|
\[ A = \sum_{i=1}^M ( n_i \cdot \Delta x ) = \Delta x \sum_{i=1}^M n_i \]
|
|
und das normierte Histogramm hat die H\"ohe
|
|
\[ p(x_i) = \frac{n_i}{\Delta x \sum_{i=1}^M n_i} \]
|
|
Es muss also nicht nur durch die Summe, sondern auch durch die Breite $\Delta x$ der Klassen
|
|
geteilt werden.
|
|
|
|
$p(x_i)$ kann keine Wahrscheinlichkeit sein, da $p(x_i)$ nun eine
|
|
Einheit hat --- das Inverse der Einheit der Messgr\"o{\ss}e $x$. Man
|
|
spricht von einer Wahrscheinlichkeitsdichte.
|
|
|
|
\begin{figure}[t]
|
|
\includegraphics[width=1\textwidth]{pdfprobabilities}
|
|
\caption{\label{pdfprobabilitiesfig} Wahrscheinlichkeiten bei
|
|
einer Wahrscheinlichkeitsdichtefunktion.}
|
|
\end{figure}
|
|
|
|
\begin{exercise}[gaussianpdf.m]
|
|
\tr{Plot the Gaussian probability density}{Plotte die Gau{\ss}sche Wahrscheinlichkeitsdichte}
|
|
\[ p_g(x) = \frac{1}{\sqrt{2\pi\sigma^2}}e^{-\frac{(x-\mu)^2}{2\sigma^2}}\]
|
|
\tr{What does it mean?}{Was bedeutet die folgende Wahrscheinlichkeit?}
|
|
\[ P(x_1 < x < x_2) = \int\limits_{x_1}^{x_2} p(x) \, dx \]
|
|
\tr{How large is}{Wie gro{\ss} ist}
|
|
\[ \int\limits_{-\infty}^{+\infty} p(x) \, dx \; ?\]
|
|
\tr{Why?}{Warum?}
|
|
\end{exercise}
|
|
|
|
\begin{exercise}[boxwhisker.m]
|
|
\tr{Generate a $40 \times 10$ matrix of random numbers and
|
|
illustrate their distribution in a box-whisker plot
|
|
(\code{boxplot()} function). How to interpret the plot?}
|
|
{Erzeuge eine $40 \times 10$ Matrix
|
|
von Zufallszahlen und illustriere ihre Verteilungen in einem
|
|
Box-Whisker Plot (\code{boxplot()} Funktion, lies die Hilfe!). Wie ist der
|
|
Box-Whisker Plot zu interpretieren? Was hat es mit den Ausrei{\ss}ern auf sich?
|
|
Wie kann man erreichen, dass die Whisker den kleinsten und den gr\"o{\ss}ten
|
|
Datenwert anzeigen? Warum sind die unterschiedlichen Box-Whiskers nicht alle gleich,
|
|
obwohl sie aus derselben Verteilung gezogen worden sind?}
|
|
\end{exercise}
|
|
|
|
\begin{figure}[t]
|
|
\includegraphics[width=1\textwidth]{boxwhisker}
|
|
\caption{\label{boxwhiskerfig} Box-whisker plots illustrate distributions.}
|
|
\end{figure}
|
|
|
|
|
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
\subsection{Data types}
|
|
|
|
\subsubsection{Nominal scale}
|
|
\begin{itemize}
|
|
\item Binary
|
|
\begin{itemize}
|
|
\item ``yes/no'',
|
|
\item ``true/false'',
|
|
\item ``success/failure'', etc.
|
|
\end{itemize}
|
|
\item Categorical
|
|
\begin{itemize}
|
|
\item cell type (``rod/cone/horizontal cell/bipolar cell/ganglion cell''),
|
|
\item blood type (``A/B/AB/0''),
|
|
\item parts of speech (``noun/verb/preposition/article/\ldots''),
|
|
\item taxonomic groups (``Coleoptera/Lepidoptera/Diptera/Hymenoptera''), etc.
|
|
\end{itemize}
|
|
\item Each observation/measurement/sample is put into one category
|
|
\item There is no reasonable order among the categories.\\
|
|
example: [rods, cones] vs. [cones, rods]
|
|
\item Statistics: mode, i.e. the most common item
|
|
\end{itemize}
|
|
|
|
\subsubsection{Ordinal scale}
|
|
\begin{itemize}
|
|
\item Like nominal scale, but with an order
|
|
\item Examples: ranks, ratings
|
|
\begin{itemize}
|
|
\item ``bad/ok/good'',
|
|
\item ``cold/warm/hot'',
|
|
\item ``young/old'', etc.
|
|
\end{itemize}
|
|
\item {\bf But:} there is no reasonable measure of {\em distance}
|
|
between the classes
|
|
\item Statistics: mode, median
|
|
\end{itemize}
|
|
|
|
\subsubsection{Interval scale}
|
|
\begin{itemize}
|
|
\item Quantitative/metric values
|
|
\item Reasonable measure of distance between values, but no absolute zero
|
|
\item Examples:
|
|
\begin{itemize}
|
|
\item Temperature in $^\circ$C ($20^\circ$C is not twice as hot as $10^\circ$C)
|
|
\item Direction measured in degrees from magnetic or true north
|
|
\end{itemize}
|
|
\item Statistics:
|
|
\begin{itemize}
|
|
\item Central tendency: mode, median, arithmetic mean
|
|
\item Dispersion: range, standard deviation
|
|
\end{itemize}
|
|
\end{itemize}
|
|
|
|
\subsubsection{Absolute/ratio scale}
|
|
\begin{itemize}
|
|
\item Like interval scale, but with absolute origin/zero
|
|
\item Examples:
|
|
\begin{itemize}
|
|
\item Temperature in K (kelvin)
|
|
\item Length, mass, duration, electric charge, ...
|
|
\item Plane angle, etc.
|
|
\item Count (e.g. number of spikes in response to a stimulus)
|
|
\end{itemize}
|
|
\item Statistics:
|
|
\begin{itemize}
|
|
\item Central tendency: mode, median, arithmetic, geometric, harmonic mean
|
|
\item Dispersion: range, standard deviation
|
|
\item Coefficient of variation (ratio standard deviation/mean)
|
|
\item All other statistical measures
|
|
\end{itemize}
|
|
\end{itemize}
|
|
|
|
\subsubsection{Data types}
|
|
\begin{itemize}
|
|
\item Data type selects
|
|
\begin{itemize}
|
|
\item statistics
|
|
\item type of plots (bar graph versus x-y plot)
|
|
\item correct tests
|
|
\end{itemize}
|
|
\item Scales exhibit increasing information content from nominal
|
|
to absolute.\\
|
|
Conversion ``downwards'' is always possible
|
|
\item For example: size measured in meter (ratio scale) $\rightarrow$
|
|
categories ``small/medium/large'' (ordinal scale)
|
|
\end{itemize}
|
|
|
|
\subsubsection{Examples from neuroscience}
|
|
\begin{itemize}
|
|
\item {\bf absolute:}
|
|
\begin{itemize}
|
|
\item size of neuron/brain
|
|
\item length of axon
|
|
\item ion concentration
|
|
\item membrane potential
|
|
\item firing rate
|
|
\end{itemize}
|
|
|
|
\item {\bf interval:}
|
|
\begin{itemize}
|
|
\item edge orientation
|
|
\end{itemize}
|
|
|
|
\item {\bf ordinal:}
|
|
\begin{itemize}
|
|
\item stages of a disease
|
|
\item ratings
|
|
\end{itemize}
|
|
|
|
\item {\bf nominal:}
|
|
\begin{itemize}
|
|
\item cell type
|
|
\item odor
|
|
\item states of an ion channel
|
|
\end{itemize}
|
|
|
|
\end{itemize}
|
|
|
|
|
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
\chapter{\tr{Bootstrap Methods}{Bootstrap Methoden}}
|
|
|
|
Beim Bootstrap erzeugt man sich die Verteilung von Statistiken durch Resampling
|
|
aus der Stichprobe. Das hat mehrere Vorteile:
|
|
\begin{itemize}
|
|
\item Weniger Annahmen (z.B. muss eine Stichprobe nicht normalverteilt sein).
|
|
\item H\"ohere Genauigkeit als klassische Methoden.
|
|
\item Allgemeing\"ultigkeit: Bootstrap Methoden sind sich sehr
|
|
\"ahnlich f\"ur viele verschiedene Statistiken und ben\"otigen nicht
|
|
f\"ur jede Statistik eine andere Formel.
|
|
\end{itemize}
|
|
|
|
\begin{figure}[t]
|
|
\includegraphics[width=0.8\textwidth]{2012-10-29_16-26-05_771}\\[2ex]
|
|
\includegraphics[width=0.8\textwidth]{2012-10-29_16-41-39_523}\\[2ex]
|
|
\includegraphics[width=0.8\textwidth]{2012-10-29_16-29-35_312}
|
|
\caption{\tr{Why can we only measure a sample of the
|
|
population?}{Warum k\"onnen wir nur eine Stichprobe der
|
|
Grundgesamtheit messen?}}
|
|
\end{figure}
|
|
|
|
\begin{figure}[t]
|
|
\includegraphics[height=0.2\textheight]{srs1}\\[2ex]
|
|
\includegraphics[height=0.2\textheight]{srs2}\\[2ex]
|
|
\includegraphics[height=0.2\textheight]{srs3}
|
|
\caption{Bootstrap der Stichprobenverteilung (a) Von der
|
|
Grundgesamtheit (population) mit unbekanntem Parameter
|
|
(z.B. Mittelwert $\mu$) zieht man Stichproben (SRS: simple random
|
|
samples). Die Statistik (hier Bestimmung von $\bar x$) kann f\"ur
|
|
jede Stichprobe berechnet werden. Die erhaltenen Werte entstammen
|
|
der Stichprobenverteilung. Meistens wird aber nur eine Stichprobe
|
|
gezogen! (b) Mit bestimmten Annahmen und Theorien kann man auf
|
|
die Stichprobenverteilung schlie{\ss}en ohne sie gemessen zu
|
|
haben. (c) Alternativ k\"onnen aus der einen Stichprobe viele
|
|
Bootstrap-Stichproben generiert werden (resampling) und so
|
|
Eigenschaften der Stichprobenverteilung empirisch bestimmt
|
|
werden. Aus Hesterberg et al. 2003, Bootstrap Methods and
|
|
Permutation Tests}
|
|
\end{figure}
|
|
|
|
\section{Bootstrap des Standardfehlers}
|
|
|
|
Beim Bootstrap erzeugen wir durch resampling neue Stichproben und
|
|
benutzen diese um die Stichprobenverteilung einer Statistik zu
|
|
berechnen. Die Bootstrap Stichproben haben jeweils den gleichen Umfang
|
|
wie die urspr\"unglich gemessene Stichprobe und werden durch Ziehen
|
|
mit Zur\"ucklegen gewonnen. Jeder Wert der urspr\"unglichen Stichprobe
|
|
kann also einmal, mehrmals oder gar nicht in einer Bootstrap
|
|
Stichprobe vorkommen.
|
|
|
|
\begin{exercise}[bootstrapsem.m]
|
|
Ziehe 1000 normalverteilte Zufallszahlen und berechne deren Mittelwert,
|
|
Standardabweichung und Standardfehler ($\sigma/\sqrt{n}$).
|
|
|
|
Resample die Daten 1000 mal (Ziehen mit Zur\"ucklegen) und berechne jeweils
|
|
den Mittelwert.
|
|
|
|
Plotte ein Histogramm dieser Mittelwerte, sowie deren Mittelwert und
|
|
die Standardabweichung.
|
|
|
|
Was hat das mit dem Standardfehler zu tun?
|
|
\end{exercise}
|
|
|
|
\end{document}
|
|
|
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
\subsection{Statistics}
|
|
What is ``a statistic''? % dt. Sch\"atzfunktion
|
|
\begin{definition}[statistic]
|
|
A statistic (singular) is a single measure of some attribute of a
|
|
sample (e.g., its arithmetic mean value). It is calculated by
|
|
applying a function (statistical algorithm) to the values of the
|
|
items of the sample, which are known together as a set of data.
|
|
|
|
\source{http://en.wikipedia.org/wiki/Statistic}
|
|
\end{definition}
|