improved on statistics

This commit is contained in:
Jan Benda 2017-11-24 19:34:16 +01:00
parent 35d1b908f3
commit f362788620
7 changed files with 144 additions and 44 deletions

View File

@ -27,8 +27,9 @@
\usepackage[makeindex]{splitidx} \usepackage[makeindex]{splitidx}
\makeindex \makeindex
\usepackage[totoc]{idxlayout} \usepackage[totoc]{idxlayout}
\newindex[Fachbegriffe]{term} \newindex[\tr{Glossary}{Fachbegriffe}]{term}
\newindex[Englische Fachbegriffe]{enterm} \newindex[Englische Fachbegriffe]{enterm}
\newindex[Deutsche Fachbegriffe]{determ}
\newindex[MATLAB Code]{mcode} \newindex[MATLAB Code]{mcode}
\newindex[Python Code]{pcode} \newindex[Python Code]{pcode}
@ -198,8 +199,8 @@
%%%%% english, german, code and file terms: %%%%%%%%%%%%%%% %%%%% english, german, code and file terms: %%%%%%%%%%%%%%%
\usepackage{ifthen} \usepackage{ifthen}
\newcommand{\enterm}[2][]{``#2''\ifthenelse{\equal{#1}{}}{\protect\sindex[enterm]{#2}}{\protect\sindex[enterm]{#1}}} \newcommand{\enterm}[2][]{\tr{\textit{#2}}{``#2''}\ifthenelse{\equal{#1}{}}{\tr{\protect\sindex[term]{#2}}{\protect\sindex[enterm]{#2}}}{\tr{\protect\sindex[term]{#1}}{\protect\sindex[enterm]{#1}}}}
\newcommand{\determ}[2][]{\textit{#2}\ifthenelse{\equal{#1}{}}{\protect\sindex[term]{#2}}{\protect\sindex[term]{#1}}} \newcommand{\determ}[2][]{\tr{``#2''}{\textit{#2}}\ifthenelse{\equal{#1}{}}{\tr{\protect\sindex[determ]{#2}}{\protect\sindex[term]{#2}}}{\tr{\protect\sindex[determ]{#1}}{\protect\sindex[term]{#1}}}}
\newcommand{\codeterm}[2][]{\textit{#2}\ifthenelse{\equal{#1}{}}{\protect\sindex[term]{#2}}{\protect\sindex[term]{#1}}} \newcommand{\codeterm}[2][]{\textit{#2}\ifthenelse{\equal{#1}{}}{\protect\sindex[term]{#2}}{\protect\sindex[term]{#1}}}
\newcommand{\file}[1]{\texttt{#1}} \newcommand{\file}[1]{\texttt{#1}}

View File

@ -92,7 +92,8 @@
%%%% indices: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%% indices: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\printindex[term] \printindex[term]
\printindex[enterm] \printindex[determ] % for english text
% \printindex[enterm] % for german text
%\setindexprenote{Some explanations.} %\setindexprenote{Some explanations.}
%\printindex[pcode] %\printindex[pcode]

View File

@ -1,14 +1,18 @@
n = 1000 n = 200;
x = randn( n, 1 ); corrs = [ 1.0, 0.6, 0.0, -0.9 ];
y = randn( n, 1 ) + 0.2*x; for k = [1:length(corrs)]
r = corr(x,y) r = corrs(k);
x = randn(n, 1);
nsamples = 500; y = r*x; % linear dependence of y on x
rs = zeros( nsamples, 1 ); % add noise to destroy perfect correlations:
for i = 1:nsamples y = y + sqrt(1.0-r*r)*randn(n, 1);
xs = x(randi(n,n,1)); % compute correlation coefficient of data:
ys = x(randi(n,n,1)); rho = corr(x, y);
rs(i) = corr(xs,ys); subplot(2, 2, k)
scatter( x, y )
text( -2, 2.5, sprintf('r=%.1f', rho) )
xlabel('x')
ylabel('y')
xlim([-3.0, 3.0])
ylim([-3.0, 3.0])
end end
hist( rs, 20 )

View File

@ -0,0 +1,54 @@
"""Display one univariate data set in three ways in a single figure.

The same data values are shown as jittered raw points, as a bar marking
the mean with the standard deviation as error bar, and as a box plot.
The figure is written to 'displayunivariatedata.pdf'.
"""
import numpy as np
import matplotlib.pyplot as plt

# Fixed seed so that the generated figure is reproducible.
rng = np.random.RandomState(981)
# Standard-normal random numbers shifted by 4.0.
x = rng.randn(40, 10) + 4.0
# Only this single column of the random matrix is displayed below.
data = x[:, 5]

plt.xkcd()
fig = plt.figure(figsize=(6, 3.4))
ax = fig.add_subplot(1, 1, 1)
# Show the left and bottom axes only.
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')
ax.set_xlabel('Experiment')
ax.set_ylabel('x')
ax.set_ylim(0.0, 8.0)

# Raw data values, spread by uniform random jitter over x = 0.5 ... 1.5
# so that individual points do not hide each other.
ax.scatter(0.5 + rng.rand(len(data)), data, s=50)

# Mean as a bar of width 1.0 at x = 2.0, standard deviation as error bar.
ax.bar([2.0], [np.mean(data)], 1.0, yerr=[np.std(data)],
       ecolor='k', capsize=0, error_kw={'elinewidth': 5})

# NOTE(review): the disabled annotations below point at negative y values,
# which lie outside the y-range (0 to 8) set above -- the coordinates need
# to be adapted before they are enabled.
## ax.annotate('Median',
## xy=(3.9, 0.0), xycoords='data',
## xytext=(3.5, -2.7), textcoords='data', ha='right',
## arrowprops=dict(arrowstyle="->", relpos=(0.8,1.0),
## connectionstyle="angle3,angleA=-110,angleB=60") )
## ax.annotate('1. quartile',
## xy=(5.8, -0.9), xycoords='data',
## xytext=(5.5, -3.4), textcoords='data', ha='right',
## arrowprops=dict(arrowstyle="->", relpos=(0.9,1.0),
## connectionstyle="angle3,angleA=30,angleB=70") )
## ax.annotate('3. quartile',
## xy=(6.1, 1.1), xycoords='data',
## xytext=(6.5, 3.0), textcoords='data', ha='left',
## arrowprops=dict(arrowstyle="->", relpos=(0.0,0.0),
## connectionstyle="angle3,angleA=30,angleB=70") )
## ax.annotate('minimum',
## xy=(6.1, -1.9), xycoords='data',
## xytext=(7.2, -3.3), textcoords='data', ha='left',
## arrowprops=dict(arrowstyle="->", relpos=(0.0,0.5),
## connectionstyle="angle3,angleA=10,angleB=100") )
## ax.annotate('maximum',
## xy=(5.9, 2.7), xycoords='data',
## xytext=(4.9, 3.5), textcoords='data', ha='right',
## arrowprops=dict(arrowstyle="->", relpos=(1.0,0.5),
## connectionstyle="angle3,angleA=0,angleB=120") )
#ax.boxplot( x[:, 5], positions=[4.0], whis=100.0 )
#ax.boxplot( x[:, 5], positions=[4.0], widths=[1.0], whis=100.0, manage_xticks=False )

# Box plot at x = 4.0 with the whisker reach set to 100.0.
ax.boxplot(data, positions=[4.0], widths=[1.0], whis=100.0)

# boxplot() adjusts the x-axis on its own, so limits and ticks are set last.
ax.set_xlim(0.0, 5.0)
# Set positions and labels in two separate calls: Matplotlib releases
# before 3.5 interpret a second positional argument of set_xticks() as
# the `minor` flag and would silently drop the labels.
ax.set_xticks([1, 3, 5])
ax.set_xticklabels(['a', 'b', 'c'])

plt.tight_layout()
plt.savefig('displayunivariatedata.pdf')
#plt.show()

View File

@ -15,7 +15,7 @@ ax.spines['top'].set_visible(False)
ax.yaxis.set_ticks_position('left') ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom') ax.xaxis.set_ticks_position('bottom')
ax.text( 0, 4.0, 'r=%.1f' % r, ha='center' ) ax.text( 0, 4.0, 'r=%.1f' % r, ha='center' )
ax.text( 0, 5.5, r'$y = x^2+\xi/5$', ha='center' ) ax.text( 0, 6, r'$y = x^2+\xi/5$', ha='center' )
ax.set_xlabel('x') ax.set_xlabel('x')
ax.set_ylabel('y') ax.set_ylabel('y')
ax.set_xlim( -3.0, 3.0) ax.set_xlim( -3.0, 3.0)
@ -30,7 +30,7 @@ ax.spines['top'].set_visible(False)
ax.yaxis.set_ticks_position('left') ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom') ax.xaxis.set_ticks_position('bottom')
ax.text( 0, 1.5, 'r=%.1f' % r, ha='center' ) ax.text( 0, 1.5, 'r=%.1f' % r, ha='center' )
ax.text( 0, 2.5, r'$y = x \cdot \xi/2$', ha='center' ) ax.text( 0, 3, r'$y = x \cdot \xi/2$', ha='center' )
ax.set_xlabel('x') ax.set_xlabel('x')
ax.set_ylabel('y') ax.set_ylabel('y')
ax.set_xlim( -3.0, 3.0) ax.set_xlim( -3.0, 3.0)

View File

@ -5,7 +5,7 @@
Descriptive statistics characterizes data sets by means of a few measures. Descriptive statistics characterizes data sets by means of a few measures.
In addition to histograms that visualize the distribution of the data, In addition to histograms that visualize the distribution of the data,
the following measures are used for characterizing the data: the following measures are used for characterizing the univariate data:
\begin{description} \begin{description}
\item[Location, central tendency] (``Lagema{\ss}e''): \item[Location, central tendency] (``Lagema{\ss}e''):
arithmetic mean, median, mode. arithmetic mean, median, mode.
@ -13,10 +13,39 @@ the following measures are used for characterizing the data:
standard deviation, inter-quartile range,\linebreak coefficient of variation standard deviation, inter-quartile range,\linebreak coefficient of variation
(``Variationskoeffizient''). (``Variationskoeffizient'').
\item[Shape]: skewness (``Schiefe''), kurtosis (``W\"olbung''). \item[Shape]: skewness (``Schiefe''), kurtosis (``W\"olbung'').
\end{description}
For bivariate and multivariate data sets we can also analyse their
\begin{description}
\item[Dependence, association] (``Zusammenhangsma{\ss}e''): Pearson's correlation coefficient, \item[Dependence, association] (``Zusammenhangsma{\ss}e''): Pearson's correlation coefficient,
Spearman's rank correlation coefficient. Spearman's rank correlation coefficient.
\end{description} \end{description}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Mean, variance, and standard deviation}
The \enterm{arithmetic mean} is a measure of location. For $n$ data values
$x_i$ the arithmetic mean is computed by
\[ \bar x = \langle x \rangle = \frac{1}{n}\sum_{i=1}^n x_i \; . \]
The mean has the same unit as the data values.
The dispersion of the data values around the mean is quantified by
their \enterm{variance}
\[ \sigma^2_x = \langle (x-\langle x \rangle)^2 \rangle = \frac{1}{n}\sum_{i=1}^n (x_i - \bar x)^2 \; . \]
The unit of the variance is the unit of the data values squared.
Therefore, variances cannot be compared to the mean or the data values
themselves. In particular, variances cannot be used for plotting error
bars along with the mean.
The standard deviation
\[ \sigma_x = \sqrt{\sigma^2_x} \; , \]
however, has the same unit as the data values and can (and should) be
used to display the dispersion of the data together with their mean.
\begin{figure}[t]
\includegraphics[width=1\textwidth]{displayunivariatedata}
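\titlecaption{\label{displayunivariatefig} Display univariate
data.}{Three ways to display the same data set: the individual
data values plotted with random horizontal jitter (left), a bar
marking the mean with the standard deviation as error bar
(center), and a box plot (right).}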
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Mode, median, quartile, etc.} \section{Mode, median, quartile, etc.}
@ -44,7 +73,6 @@ not smaller than the median (\figref{medianfig}).
\matlab{} provides the function \code{median()} for computing the median. \matlab{} provides the function \code{median()} for computing the median.
\newpage
\begin{exercise}{checkmymedian.m}{} \begin{exercise}{checkmymedian.m}{}
Write a script that tests whether your median function really Write a script that tests whether your median function really
returns a median above which are the same number of data than returns a median above which are the same number of data than
@ -242,36 +270,48 @@ $\Delta x$ der Klassen geteilt werden (\figref{pdfhistogramfig}).
\section{Correlations} \section{Correlations}
Until now we described properties of univariate data sets. In
bivariate or multivariate data sets where we have pairs or tuples of
data values (e.g. the size and the weight of elephants) we want to analyze
dependencies between the variables.
The \enterm{correlation coefficient}
\[ r_{x,y} = \frac{Cov(x,y)}{\sigma_x \sigma_y} = \frac{\langle
(x-\langle x \rangle)(y-\langle y \rangle) \rangle}{\sqrt{\langle
(x-\langle x \rangle)^2 \rangle} \sqrt{\langle (y-\langle y
\rangle)^2 \rangle}} \]
quantifies linear relationships between two variables
\matlabfun{corr()}. The correlation coefficient is the
\enterm{covariance} normalized by the standard deviations of the
single variables. Perfectly correlated variables result in a
correlation coefficient of $+1$, anti-correlated or negatively
correlated data in a correlation coefficient of $-1$ and un-correlated
data in a correlation coefficient close to zero
(\figrefb{correlationfig}).
\begin{figure}[tp] \begin{figure}[tp]
\includegraphics[width=1\textwidth]{correlation} \includegraphics[width=1\textwidth]{correlation}
\titlecaption{\label{correlationfig} Korrelationen zwischen Datenpaaren.}{} \titlecaption{\label{correlationfig} Korrelationen zwischen Datenpaaren.}{}
\end{figure} \end{figure}
Bisher haben wir Eigenschaften einer einzelnen Me{\ss}gr\"o{\ss}e \begin{exercise}{correlations.m}{}
angeschaut. Bei mehreren Me{\ss}gr\"o{\ss}en, kann nach Generate pairs of random numbers with four different correlations
Abh\"angigkeiten zwischen den beiden Gr\"o{\ss}en gefragt werden. Der (perfectly correlated, somehow correlated, uncorrelated, negatively
\determ[Korrelationskoeffizient]{Korrelations\-koeffizient} correlated). Plot them into a scatter plot and compute their
\[ r_{x,y} = \frac{Cov(x,y)}{\sigma_x \sigma_y} = \frac{\langle correlation coefficient.
(x-\langle x \rangle)(y-\langle y \rangle) \rangle}{\sqrt{\langle \end{exercise}
(x-\langle x \rangle)^2} \rangle \sqrt{\langle (y-\langle y
\rangle)^2} \rangle} \]
quantifiziert einfache lineare Zusammenh\"ange \matlabfun{corr()}. Der
Korrelationskoeffizient ist die \determ{Kovarianz} normiert durch die
Standardabweichungen. Perfekt korrelierte Variablen ergeben einen
Korrelationskoeffizienten von $+1$, antikorrelierte Daten einen
Korrelationskoeffizienten von $-1$ und nicht korrelierte Daten einen
Korrelationskoeffizienten nahe Null (\figrefb{correlationfig}).
Nichtlineare Abh\"angigkeiten werden von dem Korrelationskoeffizienten Note that non-linear dependencies between two variables are
nur unzureichend oder \"uberhaupt nicht erfasst (\figref{nonlincorrelationfig}). insufficiently or not at all detected by the correlation coefficient
(\figref{nonlincorrelationfig}).
\begin{figure}[tp] \begin{figure}[tp]
\includegraphics[width=1\textwidth]{nonlincorrelation} \includegraphics[width=1\textwidth]{nonlincorrelation}
\titlecaption{\label{nonlincorrelationfig} Korrelationen bei \titlecaption{\label{nonlincorrelationfig} Correlations for
nichtlineare Zusammenh\"angen.}{Der Korrelationskoeffizienten non-linear dependencies.}{The correlation coefficient detects
erfasst nur lineare Zusammenh\"ange. Sowohl die quadratische linear dependencies only. Both the quadratic dependency (left) and
Abh\"angigkeit (links) als auch eine Rauschkorrelation (rechts), the noise correlation (right), where the dispersal of the
bei der die Streuung der $y$-Werte von $x$ abh\"angen, ergeben $y$-values depends on the $x$-value, result in correlation
Korrelationskeffizienten nahe Null. $\xi$ sind normalverteilte coefficients close to zero. $\xi$ denote normally distributed
Zufallszahlen.} random numbers.}
\end{figure} \end{figure}