improved on statistics
This commit is contained in:
parent
35d1b908f3
commit
f362788620
Binary file not shown.
@ -27,8 +27,9 @@
|
|||||||
\usepackage[makeindex]{splitidx}
|
\usepackage[makeindex]{splitidx}
|
||||||
\makeindex
|
\makeindex
|
||||||
\usepackage[totoc]{idxlayout}
|
\usepackage[totoc]{idxlayout}
|
||||||
\newindex[Fachbegriffe]{term}
|
\newindex[\tr{Glossary}{Fachbegriffe}]{term}
|
||||||
\newindex[Englische Fachbegriffe]{enterm}
|
\newindex[Englische Fachbegriffe]{enterm}
|
||||||
|
\newindex[Deutsche Fachbegriffe]{determ}
|
||||||
\newindex[MATLAB Code]{mcode}
|
\newindex[MATLAB Code]{mcode}
|
||||||
\newindex[Python Code]{pcode}
|
\newindex[Python Code]{pcode}
|
||||||
|
|
||||||
@ -198,8 +199,8 @@
|
|||||||
|
|
||||||
%%%%% english, german, code and file terms: %%%%%%%%%%%%%%%
|
%%%%% english, german, code and file terms: %%%%%%%%%%%%%%%
|
||||||
\usepackage{ifthen}
|
\usepackage{ifthen}
|
||||||
\newcommand{\enterm}[2][]{``#2''\ifthenelse{\equal{#1}{}}{\protect\sindex[enterm]{#2}}{\protect\sindex[enterm]{#1}}}
|
\newcommand{\enterm}[2][]{\tr{\textit{#2}}{``#2''}\ifthenelse{\equal{#1}{}}{\tr{\protect\sindex[term]{#2}}{\protect\sindex[enterm]{#2}}}{\tr{\protect\sindex[term]{#1}}{\protect\sindex[enterm]{#1}}}}
|
||||||
\newcommand{\determ}[2][]{\textit{#2}\ifthenelse{\equal{#1}{}}{\protect\sindex[term]{#2}}{\protect\sindex[term]{#1}}}
|
\newcommand{\determ}[2][]{\tr{``#2''}{\textit{#2}}\ifthenelse{\equal{#1}{}}{\tr{\protect\sindex[determ]{#2}}{\protect\sindex[term]{#2}}}{\tr{\protect\sindex[determ]{#1}}{\protect\sindex[term]{#1}}}}
|
||||||
\newcommand{\codeterm}[2][]{\textit{#2}\ifthenelse{\equal{#1}{}}{\protect\sindex[term]{#2}}{\protect\sindex[term]{#1}}}
|
\newcommand{\codeterm}[2][]{\textit{#2}\ifthenelse{\equal{#1}{}}{\protect\sindex[term]{#2}}{\protect\sindex[term]{#1}}}
|
||||||
\newcommand{\file}[1]{\texttt{#1}}
|
\newcommand{\file}[1]{\texttt{#1}}
|
||||||
|
|
||||||
|
@ -92,7 +92,8 @@
|
|||||||
%%%% indices: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%% indices: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
\printindex[term]
|
\printindex[term]
|
||||||
|
|
||||||
\printindex[enterm]
|
\printindex[determ] % for english text
|
||||||
|
% \printindex[enterm] % for german text
|
||||||
|
|
||||||
%\setindexprenote{Some explanations.}
|
%\setindexprenote{Some explanations.}
|
||||||
%\printindex[pcode]
|
%\printindex[pcode]
|
||||||
|
@ -1,14 +1,18 @@
|
|||||||
n = 1000
|
n = 200;
|
||||||
x = randn( n, 1 );
|
corrs = [ 1.0, 0.6, 0.0, -0.9 ];
|
||||||
y = randn( n, 1 ) + 0.2*x;
|
for k = [1:length(corrs)]
|
||||||
r = corr(x,y)
|
r = corrs(k);
|
||||||
|
x = randn(n, 1);
|
||||||
nsamples = 500;
|
y = r*x; % linear dependence of y on x
|
||||||
rs = zeros( nsamples, 1 );
|
% add noise to destroy perfect correlations:
|
||||||
for i = 1:nsamples
|
y = y + sqrt(1.0-r*r)*randn(n, 1);
|
||||||
xs = x(randi(n,n,1));
|
% compute correlation coefficient of data:
|
||||||
ys = x(randi(n,n,1));
|
rho = corr(x, y);
|
||||||
rs(i) = corr(xs,ys);
|
subplot(2, 2, k)
|
||||||
|
scatter( x, y )
|
||||||
|
text( -2, 2.5, sprintf('r=%.1f', rho) )
|
||||||
|
xlabel('x')
|
||||||
|
ylabel('y')
|
||||||
|
xlim([-3.0, 3.0])
|
||||||
|
ylim([-3.0, 3.0])
|
||||||
end
|
end
|
||||||
|
|
||||||
hist( rs, 20 )
|
|
||||||
|
54
statistics/lecture/displayunivariatedata.py
Normal file
54
statistics/lecture/displayunivariatedata.py
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
rng = np.random.RandomState(981)
|
||||||
|
x = rng.randn(40, 10) + 4.0
|
||||||
|
|
||||||
|
plt.xkcd()
|
||||||
|
fig = plt.figure( figsize=(6,3.4) )
|
||||||
|
ax = fig.add_subplot(1, 1, 1)
|
||||||
|
ax.spines['right'].set_visible(False)
|
||||||
|
ax.spines['top'].set_visible(False)
|
||||||
|
ax.yaxis.set_ticks_position('left')
|
||||||
|
ax.xaxis.set_ticks_position('bottom')
|
||||||
|
ax.set_xlabel('Experiment')
|
||||||
|
ax.set_ylabel('x')
|
||||||
|
ax.set_ylim( 0.0, 8.0)
|
||||||
|
ax.scatter(0.5+rng.rand(len(x[:, 5])), x[:, 5], s=50)
|
||||||
|
ax.bar([2.0], [np.mean(x[:, 5])], 1.0, yerr=[np.std(x[:, 5])],
|
||||||
|
ecolor='k', capsize=0, error_kw={'elinewidth':5})
|
||||||
|
|
||||||
|
## ax.annotate('Median',
|
||||||
|
## xy=(3.9, 0.0), xycoords='data',
|
||||||
|
## xytext=(3.5, -2.7), textcoords='data', ha='right',
|
||||||
|
## arrowprops=dict(arrowstyle="->", relpos=(0.8,1.0),
|
||||||
|
## connectionstyle="angle3,angleA=-110,angleB=60") )
|
||||||
|
## ax.annotate('1. quartile',
|
||||||
|
## xy=(5.8, -0.9), xycoords='data',
|
||||||
|
## xytext=(5.5, -3.4), textcoords='data', ha='right',
|
||||||
|
## arrowprops=dict(arrowstyle="->", relpos=(0.9,1.0),
|
||||||
|
## connectionstyle="angle3,angleA=30,angleB=70") )
|
||||||
|
## ax.annotate('3. quartile',
|
||||||
|
## xy=(6.1, 1.1), xycoords='data',
|
||||||
|
## xytext=(6.5, 3.0), textcoords='data', ha='left',
|
||||||
|
## arrowprops=dict(arrowstyle="->", relpos=(0.0,0.0),
|
||||||
|
## connectionstyle="angle3,angleA=30,angleB=70") )
|
||||||
|
## ax.annotate('minimum',
|
||||||
|
## xy=(6.1, -1.9), xycoords='data',
|
||||||
|
## xytext=(7.2, -3.3), textcoords='data', ha='left',
|
||||||
|
## arrowprops=dict(arrowstyle="->", relpos=(0.0,0.5),
|
||||||
|
## connectionstyle="angle3,angleA=10,angleB=100") )
|
||||||
|
## ax.annotate('maximum',
|
||||||
|
## xy=(5.9, 2.7), xycoords='data',
|
||||||
|
## xytext=(4.9, 3.5), textcoords='data', ha='right',
|
||||||
|
## arrowprops=dict(arrowstyle="->", relpos=(1.0,0.5),
|
||||||
|
## connectionstyle="angle3,angleA=0,angleB=120") )
|
||||||
|
#ax.boxplot( x[:, 5], positions=[4.0], whis=100.0 )
|
||||||
|
#ax.boxplot( x[:, 5], positions=[4.0], widths=[1.0], whis=100.0, manage_xticks=False )
|
||||||
|
ax.boxplot( x[:, 5], positions=[4.0], widths=[1.0], whis=100.0 )
|
||||||
|
ax.set_xlim(0.0, 5.0)
|
||||||
|
ax.set_xticks([1, 3, 5], ['a', 'b', 'c'])
|
||||||
|
plt.tight_layout()
|
||||||
|
plt.savefig('displayunivariatedata.pdf')
|
||||||
|
#plt.show()
|
||||||
|
|
@ -15,7 +15,7 @@ ax.spines['top'].set_visible(False)
|
|||||||
ax.yaxis.set_ticks_position('left')
|
ax.yaxis.set_ticks_position('left')
|
||||||
ax.xaxis.set_ticks_position('bottom')
|
ax.xaxis.set_ticks_position('bottom')
|
||||||
ax.text( 0, 4.0, 'r=%.1f' % r, ha='center' )
|
ax.text( 0, 4.0, 'r=%.1f' % r, ha='center' )
|
||||||
ax.text( 0, 5.5, r'$y = x^2+\xi/5$', ha='center' )
|
ax.text( 0, 6, r'$y = x^2+\xi/5$', ha='center' )
|
||||||
ax.set_xlabel('x')
|
ax.set_xlabel('x')
|
||||||
ax.set_ylabel('y')
|
ax.set_ylabel('y')
|
||||||
ax.set_xlim( -3.0, 3.0)
|
ax.set_xlim( -3.0, 3.0)
|
||||||
@ -30,7 +30,7 @@ ax.spines['top'].set_visible(False)
|
|||||||
ax.yaxis.set_ticks_position('left')
|
ax.yaxis.set_ticks_position('left')
|
||||||
ax.xaxis.set_ticks_position('bottom')
|
ax.xaxis.set_ticks_position('bottom')
|
||||||
ax.text( 0, 1.5, 'r=%.1f' % r, ha='center' )
|
ax.text( 0, 1.5, 'r=%.1f' % r, ha='center' )
|
||||||
ax.text( 0, 2.5, r'$y = x \cdot \xi/2$', ha='center' )
|
ax.text( 0, 3, r'$y = x \cdot \xi/2$', ha='center' )
|
||||||
ax.set_xlabel('x')
|
ax.set_xlabel('x')
|
||||||
ax.set_ylabel('y')
|
ax.set_ylabel('y')
|
||||||
ax.set_xlim( -3.0, 3.0)
|
ax.set_xlim( -3.0, 3.0)
|
||||||
|
@ -5,7 +5,7 @@
|
|||||||
Descriptive statistics characterizes data sets by means of a few measures.
|
Descriptive statistics characterizes data sets by means of a few measures.
|
||||||
|
|
||||||
In addition to histograms that visualize the distribution of the data,
|
In addition to histograms that visualize the distribution of the data,
|
||||||
the following measures are used for characterizing the data:
|
the following measures are used for characterizing the univariate data:
|
||||||
\begin{description}
|
\begin{description}
|
||||||
\item[Location, central tendency] (``Lagema{\ss}e''):
|
\item[Location, central tendency] (``Lagema{\ss}e''):
|
||||||
arithmetic mean, median, mode.
|
arithmetic mean, median, mode.
|
||||||
@ -13,10 +13,39 @@ the following measures are used for characterizing the data:
|
|||||||
standard deviation, inter-quartile range,\linebreak coefficient of variation
|
standard deviation, inter-quartile range,\linebreak coefficient of variation
|
||||||
(``Variationskoeffizient'').
|
(``Variationskoeffizient'').
|
||||||
\item[Shape]: skewness (``Schiefe''), kurtosis (``W\"olbung'').
|
\item[Shape]: skewness (``Schiefe''), kurtosis (``W\"olbung'').
|
||||||
|
\end{description}
|
||||||
|
For bivariate and multivariate data sets we can also analyse their
|
||||||
|
\begin{description}
|
||||||
\item[Dependence, association] (``Zusammenhangsma{\ss}e''): Pearson's correlation coefficient,
|
\item[Dependence, association] (``Zusammenhangsma{\ss}e''): Pearson's correlation coefficient,
|
||||||
Spearman's rank correlation coefficient.
|
Spearman's rank correlation coefficient.
|
||||||
\end{description}
|
\end{description}
|
||||||
|
|
||||||
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
|
\section{Mean, variance, and standard deviation}
|
||||||
|
The \enterm{arithmetic mean} is a measure of location. For $n$ data values
|
||||||
|
$x_i$ the arithmetic mean is computed by
|
||||||
|
\[ \bar x = \langle x \rangle = \frac{1}{n}\sum_{i=1}^n x_i \; . \]
|
||||||
|
The mean has the same unit as the data values.
|
||||||
|
|
||||||
|
The dispersion of the data values around the mean is quantified by
|
||||||
|
their \enterm{variance}
|
||||||
|
\[ \sigma^2_x = \langle (x-\langle x \rangle)^2 \rangle = \frac{1}{n}\sum_{i=1}^n (x_i - \bar x)^2 \; . \]
|
||||||
|
The unit of the variance is the unit of the data values squared.
|
||||||
|
Therefore, variances cannot be compared to the mean or the data values
|
||||||
|
themselves. In particular, variances cannot be used for plotting error
|
||||||
|
bars along with the mean.
|
||||||
|
|
||||||
|
The standard deviation
|
||||||
|
\[ \sigma_x = \sqrt{\sigma^2_x} \; , \]
|
||||||
|
however, has the same unit as the data values and can (and should) be
|
||||||
|
used to display the dispersion of the data together with their mean.
|
||||||
|
|
||||||
|
\begin{figure}[t]
|
||||||
|
\includegraphics[width=1\textwidth]{displayunivariatedata}
|
||||||
|
\titlecaption{\label{displayunivariatefig} Display univariate
|
||||||
|
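data.}{The same data set shown as a scatter of the individual data values (left), as a bar marking the mean with an error bar indicating the standard deviation (center), and as a box-whisker plot (right).}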
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
\section{Mode, median, quartile, etc.}
|
\section{Mode, median, quartile, etc.}
|
||||||
|
|
||||||
@ -44,7 +73,6 @@ not smaller than the median (\figref{medianfig}).
|
|||||||
|
|
||||||
\matlab{} provides the function \code{median()} for computing the median.
|
\matlab{} provides the function \code{median()} for computing the median.
|
||||||
|
|
||||||
\newpage
|
|
||||||
\begin{exercise}{checkmymedian.m}{}
|
\begin{exercise}{checkmymedian.m}{}
|
||||||
Write a script that tests whether your median function really
|
Write a script that tests whether your median function really
|
||||||
returns a median above which are the same number of data as
|
returns a median above which are the same number of data as
|
||||||
@ -242,36 +270,48 @@ $\Delta x$ der Klassen geteilt werden (\figref{pdfhistogramfig}).
|
|||||||
|
|
||||||
\section{Correlations}
|
\section{Correlations}
|
||||||
|
|
||||||
\begin{figure}[tp]
|
Until now we described properties of univariate data sets. In
|
||||||
\includegraphics[width=1\textwidth]{correlation}
|
bivariate or multivariate data sets where we have pairs or tuples of
|
||||||
\titlecaption{\label{correlationfig} Korrelationen zwischen Datenpaaren.}{}
|
data values (e.g. the size and the weight of elephants) we want to analyze
|
||||||
\end{figure}
|
dependencies between the variables.
|
||||||
|
|
||||||
Bisher haben wir Eigenschaften einer einzelnen Me{\ss}gr\"o{\ss}e
|
The \enterm{correlation coefficient}
|
||||||
angeschaut. Bei mehreren Me{\ss}gr\"o{\ss}en, kann nach
|
|
||||||
Abh\"angigkeiten zwischen den beiden Gr\"o{\ss}en gefragt werden. Der
|
|
||||||
\determ[Korrelationskoeffizient]{Korrelations\-koeffizient}
|
|
||||||
\[ r_{x,y} = \frac{Cov(x,y)}{\sigma_x \sigma_y} = \frac{\langle
|
\[ r_{x,y} = \frac{Cov(x,y)}{\sigma_x \sigma_y} = \frac{\langle
|
||||||
(x-\langle x \rangle)(y-\langle y \rangle) \rangle}{\sqrt{\langle
|
(x-\langle x \rangle)(y-\langle y \rangle) \rangle}{\sqrt{\langle
|
||||||
(x-\langle x \rangle)^2 \rangle} \sqrt{\langle (y-\langle y
|
(x-\langle x \rangle)^2 \rangle} \sqrt{\langle (y-\langle y
|
||||||
\rangle)^2 \rangle}} \]
|
\rangle)^2 \rangle}} \]
|
||||||
quantifiziert einfache lineare Zusammenh\"ange \matlabfun{corr()}. Der
|
quantifies linear relationships between two variables
|
||||||
Korrelationskoeffizient ist die \determ{Kovarianz} normiert durch die
|
\matlabfun{corr()}. The correlation coefficient is the
|
||||||
Standardabweichungen. Perfekt korrelierte Variablen ergeben einen
|
\determ{covariance} normalized by the standard deviations of the
|
||||||
Korrelationskoeffizienten von $+1$, antikorrelierte Daten einen
|
single variables. Perfectly correlated variables result in a
|
||||||
Korrelationskoeffizienten von $-1$ und nicht korrelierte Daten einen
|
correlation coefficient of $+1$, anti-correlated or negatively
|
||||||
Korrelationskoeffizienten nahe Null (\figrefb{correlationfig}).
|
correlated data in a correlation coefficient of $-1$ and un-correlated
|
||||||
|
data in a correlation coefficient close to zero
|
||||||
|
(\figrefb{correlationfig}).
|
||||||
|
|
||||||
|
\begin{figure}[tp]
|
||||||
|
\includegraphics[width=1\textwidth]{correlation}
|
||||||
|
\titlecaption{\label{correlationfig} Correlations between pairs of data.}{}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
\begin{exercise}{correlations.m}{}
|
||||||
|
Generate pairs of random numbers with four different correlations
|
||||||
|
(perfectly correlated, somewhat correlated, uncorrelated, negatively
|
||||||
|
correlated). Plot them into a scatter plot and compute their
|
||||||
|
correlation coefficient.
|
||||||
|
\end{exercise}
|
||||||
|
|
||||||
Nichtlineare Abh\"angigkeiten werden von dem Korrelationskoeffizienten
|
Note that non-linear dependencies between two variables are
|
||||||
nur unzureichend oder \"uberhaupt nicht erfasst (\figref{nonlincorrelationfig}).
|
insufficiently or not at all detected by the correlation coefficient
|
||||||
|
(\figref{nonlincorrelationfig}).
|
||||||
|
|
||||||
\begin{figure}[tp]
|
\begin{figure}[tp]
|
||||||
\includegraphics[width=1\textwidth]{nonlincorrelation}
|
\includegraphics[width=1\textwidth]{nonlincorrelation}
|
||||||
\titlecaption{\label{nonlincorrelationfig} Korrelationen bei
|
\titlecaption{\label{nonlincorrelationfig} Correlations for
|
||||||
nichtlineare Zusammenh\"angen.}{Der Korrelationskoeffizienten
|
non-linear dependencies.}{The correlation coefficient detects
|
||||||
erfasst nur lineare Zusammenh\"ange. Sowohl die quadratische
|
linear dependencies only. Both the quadratic dependency (left) and
|
||||||
Abh\"angigkeit (links) als auch eine Rauschkorrelation (rechts),
|
the noise correlation (right), where the dispersion of the
|
||||||
bei der die Streuung der $y$-Werte von $x$ abh\"angen, ergeben
|
$y$-values depends on the $x$-value, result in correlation
|
||||||
Korrelationskeffizienten nahe Null. $\xi$ sind normalverteilte
|
coefficients close to zero. $\xi$ denote normally distributed
|
||||||
Zufallszahlen.}
|
random numbers.}
|
||||||
\end{figure}
|
\end{figure}
|
||||||
|
Reference in New Issue
Block a user