diff --git a/statistics-fabian/assignments/day1_001.pdf b/statistics-fabian/assignments/day1_001.pdf deleted file mode 100644 index 752a33b..0000000 Binary files a/statistics-fabian/assignments/day1_001.pdf and /dev/null differ diff --git a/statistics-fabian/assignments/day1_002.pdf b/statistics-fabian/assignments/day1_002.pdf deleted file mode 100644 index f923151..0000000 Binary files a/statistics-fabian/assignments/day1_002.pdf and /dev/null differ diff --git a/statistics-fabian/assignments/example001.csv b/statistics-fabian/assignments/example001.csv deleted file mode 100755 index 3f3d985..0000000 --- a/statistics-fabian/assignments/example001.csv +++ /dev/null @@ -1,43 +0,0 @@ -MAO,Diagnosis -6.8,I -4.1,I -7.3,I -14.2,I -18.8,I -9.9,I -7.4,I -11.9,I -5.2,I -7.8,I -7.8,I -8.7,I -12.7,I -14.5,I -10.7,I -8.4,I -9.7,I -10.6,I -7.8,II -4.4,II -11.4,II -3.1,II -4.3,II -10.1,II -1.5,II -7.4,II -5.2,II -10,II -3.7,II -5.5,II -8.5,II -7.7,II -6.8,II -3.1,II -6.4,III -10.8,III -1.1,III -2.9,III -4.5,III -5.8,III -9.4,III -6.8,III diff --git a/statistics-fabian/assignments/example002.csv b/statistics-fabian/assignments/example002.csv deleted file mode 100755 index dd1ba9c..0000000 --- a/statistics-fabian/assignments/example002.csv +++ /dev/null @@ -1,186 +0,0 @@ -Weight,Sex -1607,m -1157,m -1248,m -1310,m -1398,m -1237,m -1232,m -1343,m -1380,m -1274,m -1245,m -1286,m -1508,m -1105,m -1123,m -1198,m -1300,m -1249,m -1185,m -915,m -1345,m -1107,m -1357,m -1227,m -1205,m -1435,m -1289,m -1093,m -1211,m -1260,m -1193,m -1330,m -1130,m -1357,m -1193,m -1232,m -1321,m -1260,m -1380,m -1230,m -1136,m -1029,m -1223,m -1240,m -1264,m -1020,m -1415,m -1410,m -1275,m -1230,m -1085,m -1048,m -1181,m -1103,m -1165,m -1547,m -1173,m -1660,m -1307,m -1535,m -1315,m -1257,m -1424,m -1309,m -1170,m -1412,m -1270,m -1230,m -1233,m -1561,m -1193,m -1272,m -1355,m -1137,m -1354,m -1110,m -1265,m -1407,m -1227,m -1330,m -1222,m -1305,m -1475,m -1177,m -1337,m -1145,m -1070,m -1305,m -1085,m -1303,m -1390,m -1532,m -1238,m -1233,m -1280,m -1245,m -1459,m -1157,m -1302,m -1385,m -1310,m -1342,m -1303,m -1248,m -1115,m -1365,m -1227,m -1353,m -1125,f -1027,f -1112,f -983,f -1090,f -1247,f -1045,f -983,f -972,f -1045,f -937,f -1245,f -1200,f -1270,f -1200,f -1145,f -1090,f -1040,f -1343,f -1010,f -1095,f -1180,f -1168,f -1095,f -1040,f -1235,f -1050,f -1038,f -1046,f -1255,f -1228,f -1000,f -1225,f -1220,f -1085,f -1067,f -1006,f -1138,f -1175,f -1252,f -1037,f -958,f -1020,f -1068,f -1107,f -1317,f -952,f -1056,f -1203,f -1183,f -1392,f -1130,f -1284,f -996,f -1228,f -1087,f -1035,f -1170,f -1064,f -1250,f -1129,f -1088,f -1037,f -1117,f -1095,f -1027,f -1027,f -1190,f -1153,f -1037,f -1120,f -1212,f -1024,f -1135,f -1177,f -1096,f -1114,f diff --git a/statistics/code/bootstrapsem.m b/statistics/code/bootstrapsem.m new file mode 100644 index 0000000..b5a7cbe --- /dev/null +++ b/statistics/code/bootstrapsem.m @@ -0,0 +1,23 @@ +nsamples = 1000 +resample = 500 + + x = randn( nsamples, 1 ); +sem = std(x)/sqrt(nsamples); + +mu = zeros( resample, 1 ); +for i = 1:resample + % resample: + xr = x(randi(nsamples, nsamples, 1)); + % compute statistics on sample: + mu(i) = mean(xr); +end +bootsem = std( mu ); + +hold on +hist( x, 20 ); +hist( mu, 20 ); +hold off + +disp(['bootstrap standard error: ', num2str(bootsem)]); +disp(['standard error: ', num2str(sem)]); + diff --git a/statistics/code/bootstraptymus.m b/statistics/code/bootstraptymus.m new file mode 100644 index 0000000..e92a5b7 --- /dev/null +++ b/statistics/code/bootstraptymus.m @@ -0,0 +1,24 @@ +resample = 500 + +load( 'thymusglandweights.dat' ); +x = thymusglandweights; +nsamples = length( x ); +sem = std(x)/sqrt(nsamples); + +mu = zeros( resample, 1 ); +for i = 1:resample + % resample: + xr = x(randi(nsamples, nsamples, 1)); + % compute statistics on sample: + mu(i) = mean(xr); +end +bootsem = std( mu ); + +hold on +hist( x, 20 ); +hist( mu, 20 ); +hold off + +disp(['bootstrap standard error: ', num2str(bootsem)]); +disp(['standard error: ', num2str(sem)]); + diff --git a/statistics/code/boxwhisker.m b/statistics/code/boxwhisker.m new file mode 100644 index 0000000..65e8f24 --- /dev/null +++ b/statistics/code/boxwhisker.m @@ -0,0 +1,3 @@ +x = randn(40, 10); +boxplot(x, 'whisker', 100.0 ); + diff --git a/statistics/code/correlations.m b/statistics/code/correlations.m new file mode 100644 index 0000000..7816457 --- /dev/null +++ b/statistics/code/correlations.m @@ -0,0 +1,14 @@ +n = 1000 +x = randn( n, 1 ); +y = randn( n, 1 ) + 0.2*x; +r = corr(x,y) + + nsamples = 500; + rs = zeros( nsamples, 1 ); +for i = 1:nsamples + xs = x(randi(n,n,1)); +ys = x(randi(n,n,1)); + rs(i) = corr(xs,ys); +end + +hist( rs, 20 ) diff --git a/statistics/code/diehistograms.m b/statistics/code/diehistograms.m index 8af7d68..d04ca98 100644 --- a/statistics/code/diehistograms.m +++ b/statistics/code/diehistograms.m @@ -3,16 +3,16 @@ nrolls = [ 20, 100, 1000 ]; for i = [1:length(nrolls)] d = rollthedie( nrolls(i) ); % plain hist: - % hist( d ) + %hist( d ) % check bin counts of plain hist: % h = hist( d ) % force 6 bins: - % hist( d, 6 ) + %hist( d, 6 ) % set the right bin centers: - bins = 1:6; + %bins = 1:6; %hist( d, bins ) % normalize histogram and compare to expectation: diff --git a/statistics/code/gaussianbins.m b/statistics/code/gaussianbins.m index 5f3cc6d..effe1fd 100644 --- a/statistics/code/gaussianbins.m +++ b/statistics/code/gaussianbins.m @@ -1,6 +1,6 @@ -x = randn( 100, 1 ); -bins1 = -4:2:4; -bins2 = -4:0.5:4; +x = randn( 100, 1 ); % generate some data +bins1 = -4:2:4; % large bins +bins2 = -4:0.5:4; % small bins subplot( 1, 2, 1 ); hold on; hist( x, bins1 ); @@ -10,6 +10,7 @@ ylabel('Frequeny') hold off; subplot( 1, 2, 2 ); hold on; +% normalize to the rigtht bin size: hist( x, bins1, 1.0/(bins1(2)-bins1(1)) ); hist( x, bins2, 1.0/(bins2(2)-bins2(1)) ); xlabel('x') diff --git a/statistics/code/gaussianpdf.m b/statistics/code/gaussianpdf.m index 6c33dc6..259d6c9 100644 --- a/statistics/code/gaussianpdf.m +++ b/statistics/code/gaussianpdf.m @@ -1,22 +1,30 @@ % plot Gaussian pdf: -dx=0.1 +dx=0.1; x = [-4.0:dx:4.0]; p = exp(-0.5*x.^2)/sqrt(2.0*pi); hold on -plot(x,p, 'linewidth', 10 ) +plot(x, p, 'linewidth', 10) +% show area of integral: +area(x((x>=x1)&(x<=x2)), p((x>=x1)&(x<=x2)), 'FaceColor', 'r' ) +hold off % compute integral between x1 and x2: -x1=1.0 -x2=2.0 -P = sum(p((x>=x1)&(x=x1)&(x=x1)&(r=x1)&(r", relpos=(0.8,1.0), + connectionstyle="angle3,angleA=-110,angleB=60") ) +ax.annotate('1. quartile', + xy=(5.8, -0.7), xycoords='data', + xytext=(5.5, -3.5), textcoords='data', ha='right', + arrowprops=dict(arrowstyle="->", relpos=(0.5,1.0), + connectionstyle="angle3,angleA=30,angleB=70") ) +ax.annotate('3. quartile', + xy=(6.1, 0.6), xycoords='data', + xytext=(6.5, 3.0), textcoords='data', ha='left', + arrowprops=dict(arrowstyle="->", relpos=(0.0,0.0), + connectionstyle="angle3,angleA=30,angleB=70") ) +ax.annotate('minimum', + xy=(6.1, -2.3), xycoords='data', + xytext=(7.2, -3.3), textcoords='data', ha='left', + arrowprops=dict(arrowstyle="->", relpos=(0.0,0.5), + connectionstyle="angle3,angleA=10,angleB=100") ) +ax.annotate('maximum', + xy=(5.9, 2.8), xycoords='data', + xytext=(4.9, 3.5), textcoords='data', ha='right', + arrowprops=dict(arrowstyle="->", relpos=(1.0,0.5), + connectionstyle="angle3,angleA=0,angleB=120") ) +ax.boxplot( x, whis=100.0 ) +plt.tight_layout() +plt.savefig('boxwhisker.pdf') +plt.show() + diff --git a/statistics/lecture/boxwhiskerdata.npy b/statistics/lecture/boxwhiskerdata.npy new file mode 100644 index 0000000..6751afa Binary files /dev/null and b/statistics/lecture/boxwhiskerdata.npy differ diff --git a/statistics/lecture/correlation.py b/statistics/lecture/correlation.py new file mode 100644 index 0000000..db317c7 --- /dev/null +++ b/statistics/lecture/correlation.py @@ -0,0 +1,34 @@ +import numpy as np +import matplotlib.pyplot as plt + +plt.xkcd() +fig = plt.figure( figsize=(6,5) ) +n = 200 +for k, r in enumerate( [ 1.0, 0.6, 0.0, -0.9 ] ) : + print r + x = np.random.randn( n ) + y = r*x + np.sqrt(1.0-r*r)*np.random.randn( n ) + ax = fig.add_subplot( 2, 2, k+1 ) + ax.spines['right'].set_visible(False) + ax.spines['top'].set_visible(False) + ax.yaxis.set_ticks_position('left') + ax.xaxis.set_ticks_position('bottom') + ax.text( -2, 2.5, 'r=%.1f' % r ) + if k == 0 : + ax.text( 2.8, -2, 'positively\ncorrelated', ha='right' ) + elif k == 1 : + ax.text( 2.8, -2.5, 'weakly\ncorrelated', ha='right' ) + elif k == 2 : + ax.text( 2.8, -2.5, 'not\ncorrelated', ha='right' ) + elif k == 3 : + ax.text( -2.5, -2, 'negatively\ncorrelated', ha='left' ) + ax.set_xlabel('x') + ax.set_ylabel('y') + ax.set_xlim( -3.0, 3.0) + ax.set_ylim( -3.0, 3.0) + ax.scatter( x, y ) + +plt.tight_layout() +plt.savefig('correlation.pdf') +plt.show() + diff --git a/statistics/lecture/descriptivestatistics.tex b/statistics/lecture/descriptivestatistics.tex index 3ae32ac..ac99345 100644 --- a/statistics/lecture/descriptivestatistics.tex +++ b/statistics/lecture/descriptivestatistics.tex @@ -19,10 +19,141 @@ \usepackage[left=25mm,right=25mm,top=20mm,bottom=30mm]{geometry} \setcounter{tocdepth}{1} -%%%% graphics %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%%%% section style %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\usepackage[sf,bf,it,big,clearempty]{titlesec} +\setcounter{secnumdepth}{-1} + + +%%%%% units %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\usepackage[mediumspace,mediumqspace,Gray]{SIunits} % \ohm, \micro + + +%%%%% figures %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \usepackage{graphicx} \usepackage{xcolor} -\newcommand{\texpicture}[1]{{\sffamily\small\input{#1.tex}}} +\pagecolor{white} + +\newcommand{\ruler}{\par\noindent\setlength{\unitlength}{1mm}\begin{picture}(0,6)% + \put(0,4){\line(1,0){170}}% + \multiput(0,2)(10,0){18}{\line(0,1){4}}% + \multiput(0,3)(1,0){170}{\line(0,1){2}}% + \put(0,0){\makebox(0,0){{\tiny 0}}}% + \put(10,0){\makebox(0,0){{\tiny 1}}}% + \put(20,0){\makebox(0,0){{\tiny 2}}}% + \put(30,0){\makebox(0,0){{\tiny 3}}}% + \put(40,0){\makebox(0,0){{\tiny 4}}}% + \put(50,0){\makebox(0,0){{\tiny 5}}}% + \put(60,0){\makebox(0,0){{\tiny 6}}}% + \put(70,0){\makebox(0,0){{\tiny 7}}}% + \put(80,0){\makebox(0,0){{\tiny 8}}}% + \put(90,0){\makebox(0,0){{\tiny 9}}}% + \put(100,0){\makebox(0,0){{\tiny 10}}}% + \put(110,0){\makebox(0,0){{\tiny 11}}}% + \put(120,0){\makebox(0,0){{\tiny 12}}}% + \put(130,0){\makebox(0,0){{\tiny 13}}}% + \put(140,0){\makebox(0,0){{\tiny 14}}}% + \put(150,0){\makebox(0,0){{\tiny 15}}}% + \put(160,0){\makebox(0,0){{\tiny 16}}}% + \put(170,0){\makebox(0,0){{\tiny 17}}}% + \end{picture}\par} + +% figures: +\setlength{\fboxsep}{0pt} +\newcommand{\texpicture}[1]{{\sffamily\footnotesize\input{#1.tex}}} +%\newcommand{\texpicture}[1]{\fbox{\sffamily\footnotesize\input{#1.tex}}} +%\newcommand{\texpicture}[1]{\setlength{\fboxsep}{2mm}\fbox{#1}} +%\newcommand{\texpicture}[1]{} +\newcommand{\figlabel}[1]{\textsf{\textbf{\large \uppercase{#1}}}} + +% maximum number of floats: +\setcounter{topnumber}{2} +\setcounter{bottomnumber}{0} +\setcounter{totalnumber}{2} + +% float placement fractions: +\renewcommand{\textfraction}{0.2} +\renewcommand{\topfraction}{0.8} +\renewcommand{\bottomfraction}{0.0} +\renewcommand{\floatpagefraction}{0.5} + +% spacing for floats: +\setlength{\floatsep}{12pt plus 2pt minus 2pt} +\setlength{\textfloatsep}{20pt plus 4pt minus 2pt} +\setlength{\intextsep}{12pt plus 2pt minus 2pt} + +% spacing for a floating page: +\makeatletter + \setlength{\@fptop}{0pt} + \setlength{\@fpsep}{8pt plus 2.0fil} + \setlength{\@fpbot}{0pt plus 1.0fil} +\makeatother + +% rules for floats: +\newcommand{\topfigrule}{\vspace*{10pt}{\hrule height0.4pt}\vspace*{-10.4pt}} +\newcommand{\bottomfigrule}{\vspace*{-10.4pt}{\hrule height0.4pt}\vspace*{10pt}} + +% captions: +\usepackage[format=plain,singlelinecheck=off,labelfont=bf,font={small,sf}]{caption} + +% put caption on separate float: +\newcommand{\breakfloat}{\end{figure}\begin{figure}[t]} + +% references to panels of a figure within the caption: +\newcommand{\figitem}[1]{\textsf{\bfseries\uppercase{#1}}} +% references to figures: +\newcommand{\panel}[1]{\textsf{\uppercase{#1}}} +\newcommand{\fref}[1]{\textup{\ref{#1}}} +\newcommand{\subfref}[2]{\textup{\ref{#1}}\,\panel{#2}} +% references to figures in normal text: +\newcommand{\fig}{Fig.} +\newcommand{\Fig}{Figure} +\newcommand{\figs}{Figs.} +\newcommand{\Figs}{Figures} +\newcommand{\figref}[1]{\fig~\fref{#1}} +\newcommand{\Figref}[1]{\Fig~\fref{#1}} +\newcommand{\figsref}[1]{\figs~\fref{#1}} +\newcommand{\Figsref}[1]{\Figs~\fref{#1}} +\newcommand{\subfigref}[2]{\fig~\subfref{#1}{#2}} +\newcommand{\Subfigref}[2]{\Fig~\subfref{#1}{#2}} +\newcommand{\subfigsref}[2]{\figs~\subfref{#1}{#2}} +\newcommand{\Subfigsref}[2]{\Figs~\subfref{#1}{#2}} +% references to figures within bracketed text: +\newcommand{\figb}{Fig.} +\newcommand{\figsb}{Figs.} +\newcommand{\figrefb}[1]{\figb~\fref{#1}} +\newcommand{\figsrefb}[1]{\figsb~\fref{#1}} +\newcommand{\subfigrefb}[2]{\figb~\subfref{#1}{#2}} +\newcommand{\subfigsrefb}[2]{\figsb~\subfref{#1}{#2}} + +% references to tables: +\newcommand{\tref}[1]{\textup{\ref{#1}}} +% references to tables in normal text: +\newcommand{\tab}{Tab.} +\newcommand{\Tab}{Table} +\newcommand{\tabs}{Tabs.} +\newcommand{\Tabs}{Tables} +\newcommand{\tabref}[1]{\tab~\tref{#1}} +\newcommand{\Tabref}[1]{\Tab~\tref{#1}} +\newcommand{\tabsref}[1]{\tabs~\tref{#1}} +\newcommand{\Tabsref}[1]{\Tabs~\tref{#1}} +% references to tables within bracketed text: +\newcommand{\tabb}{Tab.} +\newcommand{\tabsb}{Tab.} +\newcommand{\tabrefb}[1]{\tabb~\tref{#1}} +\newcommand{\tabsrefb}[1]{\tabsb~\tref{#1}} + + +%%%%% equation references %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\newcommand{\eqref}[1]{(\ref{#1})} +\newcommand{\eqn}{Eq.} +\newcommand{\Eqn}{Eq.} +\newcommand{\eqns}{Eqs.} +\newcommand{\Eqns}{Eqs.} +\newcommand{\eqnref}[1]{\eqn~\eqref{#1}} +\newcommand{\Eqnref}[1]{\Eqn~\eqref{#1}} +\newcommand{\eqnsref}[1]{\eqns~\eqref{#1}} +\newcommand{\Eqnsref}[1]{\Eqns~\eqref{#1}} + %%%%% listings %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \usepackage{listings} @@ -74,14 +205,16 @@ \newenvironment{definition}[1][]{\medskip\noindent\textbf{Definition}\ifthenelse{\equal{#1}{}}{}{ #1}:\newline}% {\medskip} -%\newcommand{\showlisting}{yes} -\newcommand{\showlisting}{no} +\newcommand{\showlisting}{yes} +%\newcommand{\showlisting}{no} \newcounter{theexercise} \setcounter{theexercise}{1} \newenvironment{exercise}[1][]{\medskip\noindent\textbf{\tr{Exercise}{\"Ubung} \arabic{theexercise}:} \stepcounter{theexercise}\newline \newcommand{\exercisesource}{#1}}% {\ifthenelse{\equal{\exercisesource}{}}{}{\ifthenelse{\equal{\showlisting}{yes}}{\medskip\lstinputlisting{\exercisesource}}{}}\medskip} +\graphicspath{{figures/}} + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -154,7 +287,7 @@ below. In particular the script should test data vectors of different length.} {Schreibe ein Skript, das testet ob die \code{mymedian} Funktion wirklich die Zahl zur\"uckgibt, \"uber - der genausoviele Datenwerte liegen wie darunter. Das Skript sollte + der genauso viele Datenwerte liegen wie darunter. Das Skript sollte insbesondere verschieden lange Datenvektoren testen.} \end{exercise} @@ -246,7 +379,7 @@ $A$ des Histogramms ist also \[ A = \sum_{i=1}^N ( n_i \cdot \Delta x ) = \Delta x \sum_{i=1}^N n_i \] und das normierte Histogramm hat die H\"ohe \[ p(x_i) = \frac{n_i}{\Delta x \sum_{i=1}^N n_i} \] -Es muss also nicht nur durch die Summe, sondern auch durch die Breite der Klassen $\Delta x$ +Es muss also nicht nur durch die Summe, sondern auch durch die Breite $\Delta x$ der Klassen geteilt werden. $p(x_i)$ kann keine Wahrscheinlichkeit sein, da $p(x_i)$ nun eine @@ -258,17 +391,65 @@ spricht von einer Wahrscheinlichkeitsdichte. \caption{\label{pdfprobabilitiesfig} Wahrscheinlichkeiten bei einer Wahrscheinlichkeitsdichtefunktion.} \end{figure} - -\begin{exercise} + +\begin{exercise}[gaussianpdf.m] \tr{Plot the Gaussian probability density}{Plotte die Gauss'sche Wahrscheinlichkeitsdichte } - \[ p_g(x) = 1/\sqrt{2\pi\sigma^2}e^{-\frac{(x-\mu)^2}{2\sigma^2}}\] + \[ p_g(x) = \frac{1}{\sqrt{2\pi\sigma^2}}e^{-\frac{(x-\mu)^2}{2\sigma^2}}\] \tr{What does it mean?}{Was bedeutet die folgende Wahrscheinlichkeit?} - \[ P(x_1 < x < x2) = \int_{x_1}^{x_2} p(x) \, dx \] + \[ P(x_1 < x < x2) = \int\limits_{x_1}^{x_2} p(x) \, dx \] \tr{How large is}{Wie gro{\ss} ist} - \[ \int_{-\infty}^{+\infty} p(x) \, dx \; ?\] + \[ \int\limits_{-\infty}^{+\infty} p(x) \, dx \; ?\] \tr{Why?}{Warum?} \end{exercise} +\begin{exercise}[boxwhisker.m] + \tr{Generate eine $40 \times 10$ matrix of random numbers and + illustrate their distribution in a box-whicker plot + (\code{boxplot()} function). How to interpret the plot?} + {Erzeuge ein $40 \times 10$ Matrix + von Zufallszahlen und illustriere ihre Verteilungen in einem + Box-Whisker Plot (\code{boxplot()} Funktion, lies die Hilfe!). Wie ist der + Box-Whisker Plot zu interpretieren? Was hat es mit den Ausreissern auf sich? + Wie kann man erreichen, dass die Whisker den kleinsten und den gr\"o{\ss}ten + Datenwert anzeigen? Warum sind die unterschiedlichen Box-Whiskers nicht alle gleich, + obwohl sie aus der selben Verteilung gezogen worden sind?} +\end{exercise} + +\begin{figure}[t] + \includegraphics[width=1\textwidth]{boxwhisker} + \caption{\label{boxwhiskerfig} Box-whisker plots illustrate distributions.} +\end{figure} + + +\subsection{Korrelation} + +\begin{figure}[t] + \includegraphics[width=1\textwidth]{correlation} + \caption{\label{correlationfig} Korrelationen zwischen zwei Datens\"atzen $x$ und $y$.} +\end{figure} + +Bisher haben wir Eigenschaften einer einzelnen Me{\ss}gr\"o{\ss}e angeschaut. +Bei mehreren Me{\ss}gr\"o{\ss}en, kann nach Abh\"angigkeiten gefragt werden. +Der Korrelationskoeffizient +\[ r_{x,y} = \frac{Cov(x,y)}{\sigma_x \sigma_y} = \frac{\langle + (x-\langle x \rangle)(y-\langle y \rangle) \rangle}{\sqrt{\langle + (x-\langle x \rangle)^2} \rangle \sqrt{\langle (y-\langle y + \rangle)^2} \rangle} \] quantifiziert einfache lineare +Zusammenh\"ange. Perfekt korrelierte Variablen ergeben einen +Korrelationskoeffizienten von $+1$, antikorrelierte Daten einen +Korrelationskoeffizienten von $-1$ und nicht korrelierte Daten einen +Korrelationskoeffizienten nahe 0 (\figrefb{correlationfig}). + +\begin{figure}[t] + \includegraphics[width=1\textwidth]{nonlincorrelation} + \caption{\label{nonlincorrelationfig} Nichtlineare Zusammenh\"ange + werden durch den Korrelationskoeffizienten nicht erfasst! Sowohl + die quadratische Abh\"angigkeit (links) als auch eine + Rauschkorrelation (rechts), bei der die Streuung der $y$-Werte von + $x$ abh\"angen, ergeben Korrelationskeffizienten nahe Null. + $\xi$ sind normalverteilte Zufallszahlen.} +\end{figure} + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Data types} @@ -390,8 +571,72 @@ spricht von einer Wahrscheinlichkeitsdichte. \end{itemize} -\end{document} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\chapter{\tr{Bootstrap Methods}{Bootstrap Methoden}} + +Beim Bootstrap erzeugt man sich die Verteilung von Statistiken durch Resampling +aus der Stichprobe. Das hat mehrere Vorteile: +\begin{itemize} +\item Weniger Annahmen (z.B. muss eine Stichprobe nicht Normalverteilt sein). +\item H\"ohere Genauigkeit als klassische Methoden. +\item Allgemeing\"ultigkeit: Bootstrap Methoden sind sich sehr + \"ahnlich f\"ur viele verschiedene Statistiken und ben\"otigen nicht + f\"ur jede Statistik eine andere Formel. +\end{itemize} + +\begin{figure}[t] + \includegraphics[width=0.8\textwidth]{2012-10-29_16-26-05_771}\\[2ex] + \includegraphics[width=0.8\textwidth]{2012-10-29_16-41-39_523}\\[2ex] + \includegraphics[width=0.8\textwidth]{2012-10-29_16-29-35_312} + \caption{\tr{Why can we only measure a sample of the + population?}{Warum k\"onnen wir nur eine Stichprobe der + Grundgesamtheit messen?}} +\end{figure} + +\begin{figure}[t] + \includegraphics[height=0.2\textheight]{srs1}\\[2ex] + \includegraphics[height=0.2\textheight]{srs2}\\[2ex] + \includegraphics[height=0.2\textheight]{srs3} + \caption{Bootstrap der Stichprobenvertielung (a) Von der + Grundgesamtheit (population) mit unbekanntem Parameter + (z.B. Mittelwert $\mu$) zieht man Stichproben (SRS: simple random + samples). Die Statistik (hier Bestimmung von $\bar x$) kann f\"ur + jede Stichprobe berechnet werden. Die erhaltenen Werte entstammen + der Stichprobenverteilung. Meisten wird aber nur eine Stichprobe + gezogen! (b) Mit bestimmten Annahmen und Theorien kann man auf + die Stichprobenverteilung schlie{\ss}en ohne sie gemessen zu + haben. (c) Alternativ k\"onnen aus der einen Stichprobe viele + Bootstrap-Stichproben generiert werden (resampling) und so + Eigenschaften der Stichprobenverteilung empirisch bestimmt + werden. Aus Hesterberg et al. 2003, Bootstrap Methods and + Permuation Tests} +\end{figure} + +\section{Bootstrap des Standardfehlers} + +Beim Bootstrap erzeugen wir durch resampling neue Stichproben und +benutzen diese um die Stichprobenverteilung einer Statistik zu +berechnen. Die Bootstrap Stichproben haben jeweils den gleichen Umfang +wie die urspr\"unglich gemessene Stichprobe und werden durch Ziehen +mit Zur\"ucklegen gewonnen. Jeder Wert der urspr\"unglichen Stichprobe +kann also einmal, mehrmals oder gar nicht in einer Bootstrap +Stichprobe vorkommen. + +\begin{exercise}[bootstrapsem.m] + Ziehe 1000 normalverteilte Zufallszahlen und berechne deren Mittelwert, + Standardabweichung und Standardfehler ($\sigma/\sqrt{n}$). + Resample die Daten 1000 mal (Ziehen mit Zur\"ucklegen) und berechne jeweils + den Mittelwert. + + Plotte ein Histogramm dieser Mittelwerte, sowie deren Mittelwert und + die Standardabweichung. + + Was hat das mit dem Standardfehler zu tun? +\end{exercise} + +\end{document} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Statistics} diff --git a/statistics/lecture/figures/2012-10-29_16-26-05_771.jpg b/statistics/lecture/figures/2012-10-29_16-26-05_771.jpg new file mode 100755 index 0000000..a997cdd Binary files /dev/null and b/statistics/lecture/figures/2012-10-29_16-26-05_771.jpg differ diff --git a/statistics/lecture/figures/2012-10-29_16-29-35_312.jpg b/statistics/lecture/figures/2012-10-29_16-29-35_312.jpg new file mode 100755 index 0000000..9f8843c Binary files /dev/null and b/statistics/lecture/figures/2012-10-29_16-29-35_312.jpg differ diff --git a/statistics/lecture/figures/2012-10-29_16-41-39_523.jpg b/statistics/lecture/figures/2012-10-29_16-41-39_523.jpg new file mode 100755 index 0000000..88892ea Binary files /dev/null and b/statistics/lecture/figures/2012-10-29_16-41-39_523.jpg differ diff --git a/statistics/lecture/figures/srs1.png b/statistics/lecture/figures/srs1.png new file mode 100644 index 0000000..a365a56 Binary files /dev/null and b/statistics/lecture/figures/srs1.png differ diff --git a/statistics/lecture/figures/srs2.png b/statistics/lecture/figures/srs2.png new file mode 100644 index 0000000..e3360ea Binary files /dev/null and b/statistics/lecture/figures/srs2.png differ diff --git a/statistics/lecture/figures/srs3.png b/statistics/lecture/figures/srs3.png new file mode 100644 index 0000000..e1b9632 Binary files /dev/null and b/statistics/lecture/figures/srs3.png differ diff --git a/statistics/lecture/nonlincorrelation.py b/statistics/lecture/nonlincorrelation.py new file mode 100644 index 0000000..c0ca723 --- /dev/null +++ b/statistics/lecture/nonlincorrelation.py @@ -0,0 +1,42 @@ +import numpy as np +import matplotlib.pyplot as plt + +plt.xkcd() +fig = plt.figure( figsize=(6,3) ) +n = 200 +x = np.random.randn( n ) +y = np.random.randn( n ) + +z = x*x+0.2*y +r =np.corrcoef(x,z)[0,1] +ax = fig.add_subplot( 1, 2, 1 ) +ax.spines['right'].set_visible(False) +ax.spines['top'].set_visible(False) +ax.yaxis.set_ticks_position('left') +ax.xaxis.set_ticks_position('bottom') +ax.text( 0, 4.0, 'r=%.1f' % r, ha='center' ) +ax.text( 0, 5.5, r'$y = x^2+\xi/5$', ha='center' ) +ax.set_xlabel('x') +ax.set_ylabel('y') +ax.set_xlim( -3.0, 3.0) +ax.set_ylim( -0.5, 6.0) +ax.scatter( x, z ) + +z = 0.5*x*y +r =np.corrcoef(x,z)[0,1] +ax = fig.add_subplot( 1, 2, 2 ) +ax.spines['right'].set_visible(False) +ax.spines['top'].set_visible(False) +ax.yaxis.set_ticks_position('left') +ax.xaxis.set_ticks_position('bottom') +ax.text( 0, 1.5, 'r=%.1f' % r, ha='center' ) +ax.text( 0, 2.5, r'$y = x \cdot \xi/2$', ha='center' ) +ax.set_xlabel('x') +ax.set_ylabel('y') +ax.set_xlim( -3.0, 3.0) +ax.set_ylim( -3.0, 3.0) +ax.scatter( x, z ) + +plt.tight_layout() +plt.savefig('nonlincorrelation.pdf') +plt.show() diff --git a/statistics/material/Hesterberg2003.pdf b/statistics/material/Hesterberg2003.pdf new file mode 100644 index 0000000..0b50128 Binary files /dev/null and b/statistics/material/Hesterberg2003.pdf differ