improved on statistics

This commit is contained in:
Jan Benda 2017-11-24 19:34:16 +01:00
parent 35d1b908f3
commit f362788620
7 changed files with 144 additions and 44 deletions

View File

@ -27,8 +27,9 @@
\usepackage[makeindex]{splitidx} \usepackage[makeindex]{splitidx}
\makeindex \makeindex
\usepackage[totoc]{idxlayout} \usepackage[totoc]{idxlayout}
\newindex[Fachbegriffe]{term} \newindex[\tr{Glossary}{Fachbegriffe}]{term}
\newindex[Englische Fachbegriffe]{enterm} \newindex[Englische Fachbegriffe]{enterm}
\newindex[Deutsche Fachbegriffe]{determ}
\newindex[MATLAB Code]{mcode} \newindex[MATLAB Code]{mcode}
\newindex[Python Code]{pcode} \newindex[Python Code]{pcode}
@ -198,8 +199,8 @@
%%%%% english, german, code and file terms: %%%%%%%%%%%%%%% %%%%% english, german, code and file terms: %%%%%%%%%%%%%%%
\usepackage{ifthen} \usepackage{ifthen}
\newcommand{\enterm}[2][]{``#2''\ifthenelse{\equal{#1}{}}{\protect\sindex[enterm]{#2}}{\protect\sindex[enterm]{#1}}} \newcommand{\enterm}[2][]{\tr{\textit{#2}}{``#2''}\ifthenelse{\equal{#1}{}}{\tr{\protect\sindex[term]{#2}}{\protect\sindex[enterm]{#2}}}{\tr{\protect\sindex[term]{#1}}{\protect\sindex[enterm]{#1}}}}
\newcommand{\determ}[2][]{\textit{#2}\ifthenelse{\equal{#1}{}}{\protect\sindex[term]{#2}}{\protect\sindex[term]{#1}}} \newcommand{\determ}[2][]{\tr{``#2''}{\textit{#2}}\ifthenelse{\equal{#1}{}}{\tr{\protect\sindex[determ]{#2}}{\protect\sindex[term]{#2}}}{\tr{\protect\sindex[determ]{#1}}{\protect\sindex[term]{#1}}}}
\newcommand{\codeterm}[2][]{\textit{#2}\ifthenelse{\equal{#1}{}}{\protect\sindex[term]{#2}}{\protect\sindex[term]{#1}}} \newcommand{\codeterm}[2][]{\textit{#2}\ifthenelse{\equal{#1}{}}{\protect\sindex[term]{#2}}{\protect\sindex[term]{#1}}}
\newcommand{\file}[1]{\texttt{#1}} \newcommand{\file}[1]{\texttt{#1}}

View File

@ -92,7 +92,8 @@
%%%% indices: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%% indices: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\printindex[term] \printindex[term]
\printindex[enterm] \printindex[determ] % for english text
% \printindex[enterm] % for german text
%\setindexprenote{Some explanations.} %\setindexprenote{Some explanations.}
%\printindex[pcode] %\printindex[pcode]

View File

@ -1,14 +1,18 @@
n = 1000 n = 200;
x = randn( n, 1 ); corrs = [ 1.0, 0.6, 0.0, -0.9 ];
y = randn( n, 1 ) + 0.2*x; for k = [1:length(corrs)]
r = corr(x,y) r = corrs(k);
x = randn(n, 1);
nsamples = 500; y = r*x; % linear dependence of y on x
rs = zeros( nsamples, 1 ); % add noise to destroy perfect correlations:
for i = 1:nsamples y = y + sqrt(1.0-r*r)*randn(n, 1);
xs = x(randi(n,n,1)); % compute correlation coefficient of data:
ys = x(randi(n,n,1)); rho = corr(x, y);
rs(i) = corr(xs,ys); subplot(2, 2, k)
scatter( x, y )
text( -2, 2.5, sprintf('r=%.1f', rho) )
xlabel('x')
ylabel('y')
xlim([-3.0, 3.0])
ylim([-3.0, 3.0])
end end
hist( rs, 20 )

View File

@ -0,0 +1,54 @@
import numpy as np
import matplotlib.pyplot as plt
# Figure script: show one univariate data set in three different ways --
# as individual data points, as a bar with an error bar, and as a box
# plot -- and save the figure to 'displayunivariatedata.pdf'.

# fixed seed, so that the figure looks the same on every run:
rng = np.random.RandomState(981)
# 40 x 10 normally distributed random numbers shifted to a mean of 4.0:
x = rng.randn(40, 10) + 4.0
# only a single column of the random numbers is displayed:
data = x[:, 5]

plt.xkcd()
fig = plt.figure(figsize=(6, 3.4))
ax = fig.add_subplot(1, 1, 1)
# show the left and bottom axes only:
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')
ax.set_xlabel('Experiment')
ax.set_ylabel('x')
ax.set_ylim(0.0, 8.0)

# individual data points, randomly jittered along x within [0.5, 1.5):
ax.scatter(0.5 + rng.rand(len(data)), data, s=50)
# bar of width 1.0 at x=2 showing the mean, with the standard deviation
# (np.std() default, i.e. normalized by n) as error bar:
ax.bar([2.0], [np.mean(data)], 1.0, yerr=[np.std(data)],
       ecolor='k', capsize=0, error_kw={'elinewidth': 5})

# NOTE(review): the annotations below are disabled. Their y coordinates
# are partly negative and thus outside the y-limits (0, 8) set above, so
# they presumably stem from another figure -- TODO adapt or remove.
## ax.annotate('Median',
##             xy=(3.9, 0.0), xycoords='data',
##             xytext=(3.5, -2.7), textcoords='data', ha='right',
##             arrowprops=dict(arrowstyle="->", relpos=(0.8,1.0),
##             connectionstyle="angle3,angleA=-110,angleB=60") )
## ax.annotate('1. quartile',
##             xy=(5.8, -0.9), xycoords='data',
##             xytext=(5.5, -3.4), textcoords='data', ha='right',
##             arrowprops=dict(arrowstyle="->", relpos=(0.9,1.0),
##             connectionstyle="angle3,angleA=30,angleB=70") )
## ax.annotate('3. quartile',
##             xy=(6.1, 1.1), xycoords='data',
##             xytext=(6.5, 3.0), textcoords='data', ha='left',
##             arrowprops=dict(arrowstyle="->", relpos=(0.0,0.0),
##             connectionstyle="angle3,angleA=30,angleB=70") )
## ax.annotate('minimum',
##             xy=(6.1, -1.9), xycoords='data',
##             xytext=(7.2, -3.3), textcoords='data', ha='left',
##             arrowprops=dict(arrowstyle="->", relpos=(0.0,0.5),
##             connectionstyle="angle3,angleA=10,angleB=100") )
## ax.annotate('maximum',
##             xy=(5.9, 2.7), xycoords='data',
##             xytext=(4.9, 3.5), textcoords='data', ha='right',
##             arrowprops=dict(arrowstyle="->", relpos=(1.0,0.5),
##             connectionstyle="angle3,angleA=0,angleB=120") )

# box plot of the same data at x=4:
#ax.boxplot( x[:, 5], positions=[4.0], whis=100.0 )
#ax.boxplot( x[:, 5], positions=[4.0], widths=[1.0], whis=100.0, manage_xticks=False )
ax.boxplot(data, positions=[4.0], widths=[1.0], whis=100.0)

# boxplot() modifies the x-axis limits and ticks, so set them afterwards:
ax.set_xlim(0.0, 5.0)
# Set tick positions and tick labels in two separate calls: passing the
# labels as second positional argument to set_xticks() is only understood
# by matplotlib >= 3.5; older versions interpret it as the `minor` flag.
ax.set_xticks([1, 3, 5])
ax.set_xticklabels(['a', 'b', 'c'])
plt.tight_layout()
plt.savefig('displayunivariatedata.pdf')
#plt.show()

View File

@ -15,7 +15,7 @@ ax.spines['top'].set_visible(False)
ax.yaxis.set_ticks_position('left') ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom') ax.xaxis.set_ticks_position('bottom')
ax.text( 0, 4.0, 'r=%.1f' % r, ha='center' ) ax.text( 0, 4.0, 'r=%.1f' % r, ha='center' )
ax.text( 0, 5.5, r'$y = x^2+\xi/5$', ha='center' ) ax.text( 0, 6, r'$y = x^2+\xi/5$', ha='center' )
ax.set_xlabel('x') ax.set_xlabel('x')
ax.set_ylabel('y') ax.set_ylabel('y')
ax.set_xlim( -3.0, 3.0) ax.set_xlim( -3.0, 3.0)
@ -30,7 +30,7 @@ ax.spines['top'].set_visible(False)
ax.yaxis.set_ticks_position('left') ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom') ax.xaxis.set_ticks_position('bottom')
ax.text( 0, 1.5, 'r=%.1f' % r, ha='center' ) ax.text( 0, 1.5, 'r=%.1f' % r, ha='center' )
ax.text( 0, 2.5, r'$y = x \cdot \xi/2$', ha='center' ) ax.text( 0, 3, r'$y = x \cdot \xi/2$', ha='center' )
ax.set_xlabel('x') ax.set_xlabel('x')
ax.set_ylabel('y') ax.set_ylabel('y')
ax.set_xlim( -3.0, 3.0) ax.set_xlim( -3.0, 3.0)

View File

@ -5,7 +5,7 @@
Descriptive statistics characterizes data sets by means of a few measures. Descriptive statistics characterizes data sets by means of a few measures.
In addition to histograms that visualize the distribution of the data, In addition to histograms that visualize the distribution of the data,
the following measures are used for characterizing the data: the following measures are used for characterizing the univariate data:
\begin{description} \begin{description}
\item[Location, central tendency] (``Lagema{\ss}e''): \item[Location, central tendency] (``Lagema{\ss}e''):
arithmetic mean, median, mode. arithmetic mean, median, mode.
@ -13,10 +13,39 @@ the following measures are used for characterizing the data:
standard deviation, inter-quartile range,\linebreak coefficient of variation standard deviation, inter-quartile range,\linebreak coefficient of variation
(``Variationskoeffizient''). (``Variationskoeffizient'').
\item[Shape]: skewness (``Schiefe''), kurtosis (``W\"olbung''). \item[Shape]: skewness (``Schiefe''), kurtosis (``W\"olbung'').
\end{description}
For bivariate and multivariate data sets we can also analyse their
\begin{description}
\item[Dependence, association] (``Zusammenhangsma{\ss}e''): Pearson's correlation coefficient, \item[Dependence, association] (``Zusammenhangsma{\ss}e''): Pearson's correlation coefficient,
Spearman's rank correlation coefficient. Spearman's rank correlation coefficient.
\end{description} \end{description}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Mean, variance, and standard deviation}
The \enterm{arithmetic mean} is a measure of location. For $n$ data values
$x_i$ the arithmetic mean is computed by
\[ \bar x = \langle x \rangle = \frac{1}{n}\sum_{i=1}^n x_i \; . \]
The mean has the same unit as the data values.
The dispersion of the data values around the mean is quantified by
their \enterm{variance}
\[ \sigma^2_x = \langle (x-\langle x \rangle)^2 \rangle = \frac{1}{n}\sum_{i=1}^n (x_i - \bar x)^2 \; . \]
The unit of the variance is the unit of the data values squared.
Therefore, variances cannot be compared to the mean or the data values
themselves. In particular, variances cannot be used for plotting error
bars along with the mean.
The standard deviation
\[ \sigma_x = \sqrt{\sigma^2_x} \; , \]
however, has the same unit as the data values and can (and should) be
used to display the dispersion of the data together with their mean.
\begin{figure}[t]
\includegraphics[width=1\textwidth]{displayunivariatedata}
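\titlecaption{\label{displayunivariatefig} Display univariate
data.}{The same data set shown as individual data points (left),
as a bar indicating the mean with the standard deviation as error
bar (center), and as a box plot (right).}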
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Mode, median, quartile, etc.} \section{Mode, median, quartile, etc.}
@ -44,7 +73,6 @@ not smaller than the median (\figref{medianfig}).
\matlab{} provides the function \code{median()} for computing the median. \matlab{} provides the function \code{median()} for computing the median.
\newpage
\begin{exercise}{checkmymedian.m}{} \begin{exercise}{checkmymedian.m}{}
Write a script that tests whether your median function really Write a script that tests whether your median function really
returns a median above which are the same number of data than returns a median above which are the same number of data than
@ -242,36 +270,48 @@ $\Delta x$ der Klassen geteilt werden (\figref{pdfhistogramfig}).
\section{Correlations} \section{Correlations}
\begin{figure}[tp] Until now we described properties of univariate data sets. In
\includegraphics[width=1\textwidth]{correlation} bivariate or multivariate data sets where we have pairs or tuples of
\titlecaption{\label{correlationfig} Korrelationen zwischen Datenpaaren.}{} data values (e.g. the size and the weight of elephants) we want to analyze
\end{figure} dependencies between the variables.
Bisher haben wir Eigenschaften einer einzelnen Me{\ss}gr\"o{\ss}e The \enterm{correlation coefficient}
angeschaut. Bei mehreren Me{\ss}gr\"o{\ss}en, kann nach
Abh\"angigkeiten zwischen den beiden Gr\"o{\ss}en gefragt werden. Der
\determ[Korrelationskoeffizient]{Korrelations\-koeffizient}
\[ r_{x,y} = \frac{Cov(x,y)}{\sigma_x \sigma_y} = \frac{\langle \[ r_{x,y} = \frac{Cov(x,y)}{\sigma_x \sigma_y} = \frac{\langle
(x-\langle x \rangle)(y-\langle y \rangle) \rangle}{\sqrt{\langle (x-\langle x \rangle)(y-\langle y \rangle) \rangle}{\sqrt{\langle
(x-\langle x \rangle)^2 \rangle} \sqrt{\langle (y-\langle y (x-\langle x \rangle)^2 \rangle} \sqrt{\langle (y-\langle y
\rangle)^2 \rangle}} \] \rangle)^2 \rangle}} \]
quantifiziert einfache lineare Zusammenh\"ange \matlabfun{corr()}. Der quantifies linear relationships between two variables
Korrelationskoeffizient ist die \determ{Kovarianz} normiert durch die \matlabfun{corr()}. The correlation coefficient is the
Standardabweichungen. Perfekt korrelierte Variablen ergeben einen \determ{covariance} normalized by the standard deviations of the
Korrelationskoeffizienten von $+1$, antikorrelierte Daten einen single variables. Perfectly correlated variables result in a
Korrelationskoeffizienten von $-1$ und nicht korrelierte Daten einen correlation coefficient of $+1$, anti-correlated or negatively
Korrelationskoeffizienten nahe Null (\figrefb{correlationfig}). correlated data in a correlation coefficient of $-1$ and un-correlated
data in a correlation coefficient close to zero
(\figrefb{correlationfig}).
\begin{figure}[tp]
\includegraphics[width=1\textwidth]{correlation}
\titlecaption{\label{correlationfig} Korrelationen zwischen Datenpaaren.}{}
\end{figure}
\begin{exercise}{correlations.m}{}
Generate pairs of random numbers with four different correlations
(perfectly correlated, somewhat correlated, uncorrelated, negatively
correlated). Plot them into a scatter plot and compute their
correlation coefficient.
\end{exercise}
Nichtlineare Abh\"angigkeiten werden von dem Korrelationskoeffizienten Note that non-linear dependencies between two variables are
nur unzureichend oder \"uberhaupt nicht erfasst (\figref{nonlincorrelationfig}). insufficiently or not at all detected by the correlation coefficient
(\figref{nonlincorrelationfig}).
\begin{figure}[tp] \begin{figure}[tp]
\includegraphics[width=1\textwidth]{nonlincorrelation} \includegraphics[width=1\textwidth]{nonlincorrelation}
\titlecaption{\label{nonlincorrelationfig} Korrelationen bei \titlecaption{\label{nonlincorrelationfig} Correlations for
nichtlineare Zusammenh\"angen.}{Der Korrelationskoeffizienten non-linear dependencies.}{The correlation coefficient detects
erfasst nur lineare Zusammenh\"ange. Sowohl die quadratische linear dependencies only. Both the quadratic dependency (left) and
Abh\"angigkeit (links) als auch eine Rauschkorrelation (rechts), the noise correlation (right), where the dispersion of the
bei der die Streuung der $y$-Werte von $x$ abh\"angen, ergeben $y$-values depends on the $x$-value, result in correlation
Korrelationskeffizienten nahe Null. $\xi$ sind normalverteilte coefficients close to zero. $\xi$ denote normally distributed
Zufallszahlen.} random numbers.}
\end{figure} \end{figure}