improved on statistics
This commit is contained in:
parent
35d1b908f3
commit
f362788620
Binary file not shown.
@ -27,8 +27,9 @@
|
||||
\usepackage[makeindex]{splitidx}
|
||||
\makeindex
|
||||
\usepackage[totoc]{idxlayout}
|
||||
\newindex[Fachbegriffe]{term}
|
||||
\newindex[\tr{Glossary}{Fachbegriffe}]{term}
|
||||
\newindex[Englische Fachbegriffe]{enterm}
|
||||
\newindex[Deutsche Fachbegriffe]{determ}
|
||||
\newindex[MATLAB Code]{mcode}
|
||||
\newindex[Python Code]{pcode}
|
||||
|
||||
@ -198,8 +199,8 @@
|
||||
|
||||
%%%%% english, german, code and file terms: %%%%%%%%%%%%%%%
|
||||
\usepackage{ifthen}
|
||||
\newcommand{\enterm}[2][]{``#2''\ifthenelse{\equal{#1}{}}{\protect\sindex[enterm]{#2}}{\protect\sindex[enterm]{#1}}}
|
||||
\newcommand{\determ}[2][]{\textit{#2}\ifthenelse{\equal{#1}{}}{\protect\sindex[term]{#2}}{\protect\sindex[term]{#1}}}
|
||||
\newcommand{\enterm}[2][]{\tr{\textit{#2}}{``#2''}\ifthenelse{\equal{#1}{}}{\tr{\protect\sindex[term]{#2}}{\protect\sindex[enterm]{#2}}}{\tr{\protect\sindex[term]{#1}}{\protect\sindex[enterm]{#1}}}}
|
||||
\newcommand{\determ}[2][]{\tr{``#2''}{\textit{#2}}\ifthenelse{\equal{#1}{}}{\tr{\protect\sindex[determ]{#2}}{\protect\sindex[term]{#2}}}{\tr{\protect\sindex[determ]{#1}}{\protect\sindex[term]{#1}}}}
|
||||
\newcommand{\codeterm}[2][]{\textit{#2}\ifthenelse{\equal{#1}{}}{\protect\sindex[term]{#2}}{\protect\sindex[term]{#1}}}
|
||||
\newcommand{\file}[1]{\texttt{#1}}
|
||||
|
||||
|
@ -92,7 +92,8 @@
|
||||
%%%% indices: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\printindex[term]
|
||||
|
||||
\printindex[enterm]
|
||||
\printindex[determ] % for english text
|
||||
% \printindex[enterm] % for german text
|
||||
|
||||
%\setindexprenote{Some explanations.}
|
||||
%\printindex[pcode]
|
||||
|
@ -1,14 +1,18 @@
|
||||
n = 1000
|
||||
x = randn( n, 1 );
|
||||
y = randn( n, 1 ) + 0.2*x;
|
||||
r = corr(x,y)
|
||||
|
||||
nsamples = 500;
|
||||
rs = zeros( nsamples, 1 );
|
||||
for i = 1:nsamples
|
||||
xs = x(randi(n,n,1));
|
||||
ys = x(randi(n,n,1));
|
||||
rs(i) = corr(xs,ys);
|
||||
n = 200;
|
||||
corrs = [ 1.0, 0.6, 0.0, -0.9 ];
|
||||
for k = [1:length(corrs)]
|
||||
r = corrs(k);
|
||||
x = randn(n, 1);
|
||||
y = r*x; % linear dependence of y on x
|
||||
% add noise to destroy perfect correlations:
|
||||
y = y + sqrt(1.0-r*r)*randn(n, 1);
|
||||
% compute correlation coefficient of data:
|
||||
rho = corr(x, y);
|
||||
subplot(2, 2, k)
|
||||
scatter( x, y )
|
||||
text( -2, 2.5, sprintf('r=%.1f', rho) )
|
||||
xlabel('x')
|
||||
ylabel('y')
|
||||
xlim([-3.0, 3.0])
|
||||
ylim([-3.0, 3.0])
|
||||
end
|
||||
|
||||
hist( rs, 20 )
|
||||
|
54
statistics/lecture/displayunivariatedata.py
Normal file
54
statistics/lecture/displayunivariatedata.py
Normal file
@ -0,0 +1,54 @@
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
rng = np.random.RandomState(981)
|
||||
x = rng.randn(40, 10) + 4.0
|
||||
|
||||
plt.xkcd()
|
||||
fig = plt.figure( figsize=(6,3.4) )
|
||||
ax = fig.add_subplot(1, 1, 1)
|
||||
ax.spines['right'].set_visible(False)
|
||||
ax.spines['top'].set_visible(False)
|
||||
ax.yaxis.set_ticks_position('left')
|
||||
ax.xaxis.set_ticks_position('bottom')
|
||||
ax.set_xlabel('Experiment')
|
||||
ax.set_ylabel('x')
|
||||
ax.set_ylim( 0.0, 8.0)
|
||||
ax.scatter(0.5+rng.rand(len(x[:, 5])), x[:, 5], s=50)
|
||||
ax.bar([2.0], [np.mean(x[:, 5])], 1.0, yerr=[np.std(x[:, 5])],
|
||||
ecolor='k', capsize=0, error_kw={'elinewidth':5})
|
||||
|
||||
## ax.annotate('Median',
|
||||
## xy=(3.9, 0.0), xycoords='data',
|
||||
## xytext=(3.5, -2.7), textcoords='data', ha='right',
|
||||
## arrowprops=dict(arrowstyle="->", relpos=(0.8,1.0),
|
||||
## connectionstyle="angle3,angleA=-110,angleB=60") )
|
||||
## ax.annotate('1. quartile',
|
||||
## xy=(5.8, -0.9), xycoords='data',
|
||||
## xytext=(5.5, -3.4), textcoords='data', ha='right',
|
||||
## arrowprops=dict(arrowstyle="->", relpos=(0.9,1.0),
|
||||
## connectionstyle="angle3,angleA=30,angleB=70") )
|
||||
## ax.annotate('3. quartile',
|
||||
## xy=(6.1, 1.1), xycoords='data',
|
||||
## xytext=(6.5, 3.0), textcoords='data', ha='left',
|
||||
## arrowprops=dict(arrowstyle="->", relpos=(0.0,0.0),
|
||||
## connectionstyle="angle3,angleA=30,angleB=70") )
|
||||
## ax.annotate('minimum',
|
||||
## xy=(6.1, -1.9), xycoords='data',
|
||||
## xytext=(7.2, -3.3), textcoords='data', ha='left',
|
||||
## arrowprops=dict(arrowstyle="->", relpos=(0.0,0.5),
|
||||
## connectionstyle="angle3,angleA=10,angleB=100") )
|
||||
## ax.annotate('maximum',
|
||||
## xy=(5.9, 2.7), xycoords='data',
|
||||
## xytext=(4.9, 3.5), textcoords='data', ha='right',
|
||||
## arrowprops=dict(arrowstyle="->", relpos=(1.0,0.5),
|
||||
## connectionstyle="angle3,angleA=0,angleB=120") )
|
||||
#ax.boxplot( x[:, 5], positions=[4.0], whis=100.0 )
|
||||
#ax.boxplot( x[:, 5], positions=[4.0], widths=[1.0], whis=100.0, manage_xticks=False )
|
||||
ax.boxplot( x[:, 5], positions=[4.0], widths=[1.0], whis=100.0 )
|
||||
ax.set_xlim(0.0, 5.0)
|
||||
ax.set_xticks([1, 3, 5], ['a', 'b', 'c'])
|
||||
plt.tight_layout()
|
||||
plt.savefig('displayunivariatedata.pdf')
|
||||
#plt.show()
|
||||
|
@ -15,7 +15,7 @@ ax.spines['top'].set_visible(False)
|
||||
ax.yaxis.set_ticks_position('left')
|
||||
ax.xaxis.set_ticks_position('bottom')
|
||||
ax.text( 0, 4.0, 'r=%.1f' % r, ha='center' )
|
||||
ax.text( 0, 5.5, r'$y = x^2+\xi/5$', ha='center' )
|
||||
ax.text( 0, 6, r'$y = x^2+\xi/5$', ha='center' )
|
||||
ax.set_xlabel('x')
|
||||
ax.set_ylabel('y')
|
||||
ax.set_xlim( -3.0, 3.0)
|
||||
@ -30,7 +30,7 @@ ax.spines['top'].set_visible(False)
|
||||
ax.yaxis.set_ticks_position('left')
|
||||
ax.xaxis.set_ticks_position('bottom')
|
||||
ax.text( 0, 1.5, 'r=%.1f' % r, ha='center' )
|
||||
ax.text( 0, 2.5, r'$y = x \cdot \xi/2$', ha='center' )
|
||||
ax.text( 0, 3, r'$y = x \cdot \xi/2$', ha='center' )
|
||||
ax.set_xlabel('x')
|
||||
ax.set_ylabel('y')
|
||||
ax.set_xlim( -3.0, 3.0)
|
||||
|
@ -5,7 +5,7 @@
|
||||
Descriptive statistics characterizes data sets by means of a few measures.
|
||||
|
||||
In addition to histograms that visualize the distribution of the data,
|
||||
the following measures are used for characterizing the data:
|
||||
the following measures are used for characterizing the univariate data:
|
||||
\begin{description}
|
||||
\item[Location, central tendency] (``Lagema{\ss}e''):
|
||||
arithmetic mean, median, mode.
|
||||
@ -13,10 +13,39 @@ the following measures are used for characterizing the data:
|
||||
standard deviation, inter-quartile range,\linebreak coefficient of variation
|
||||
(``Variationskoeffizient'').
|
||||
\item[Shape]: skewness (``Schiefe''), kurtosis (``W\"olbung'').
|
||||
\end{description}
|
||||
For bivariate and multivariate data sets we can also analyse their
|
||||
\begin{description}
|
||||
\item[Dependence, association] (``Zusammenhangsma{\ss}e''): Pearson's correlation coefficient,
|
||||
Spearman's rank correlation coefficient.
|
||||
\end{description}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\section{Mean, variance, and standard deviation}
|
||||
The \enterm{arithmetic mean} is a measure of location. For $n$ data values
|
||||
$x_i$ the arithmetic mean is computed by
|
||||
\[ \bar x = \langle x \rangle = \frac{1}{n}\sum_{i=1}^n x_i \; . \]
|
||||
The mean has the same unit as the data values.
|
||||
|
||||
The dispersion of the data values around the mean is quantified by
|
||||
their \enterm{variance}
|
||||
\[ \sigma^2_x = \langle (x-\langle x \rangle)^2 \rangle = \frac{1}{n}\sum_{i=1}^n (x_i - \bar x)^2 \; . \]
|
||||
The unit of the variance is the unit of the data values squared.
|
||||
Therefore, variances cannot be compared to the mean or the data values
|
||||
themselves. In particular, variances cannot be used for plotting error
|
||||
bars along with the mean.
|
||||
|
||||
The standard deviation
|
||||
\[ \sigma_x = \sqrt{\sigma^2_x} \; , \]
|
||||
however, has the same unit as the data values and can (and should) be
|
||||
used to display the dispersion of the data together with their mean.
|
||||
|
||||
\begin{figure}[t]
|
||||
\includegraphics[width=1\textwidth]{displayunivariatedata}
|
||||
\titlecaption{\label{displayunivariatefig} Display univariate
|
||||
data.}{Bla.}
|
||||
\end{figure}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\section{Mode, median, quartile, etc.}
|
||||
|
||||
@ -44,7 +73,6 @@ not smaller than the median (\figref{medianfig}).
|
||||
|
||||
\matlab{} provides the function \code{median()} for computing the median.
|
||||
|
||||
\newpage
|
||||
\begin{exercise}{checkmymedian.m}{}
|
||||
Write a script that tests whether your median function really
|
||||
returns a median above which are the same number of data than
|
||||
@ -242,36 +270,48 @@ $\Delta x$ der Klassen geteilt werden (\figref{pdfhistogramfig}).
|
||||
|
||||
\section{Correlations}
|
||||
|
||||
\begin{figure}[tp]
|
||||
\includegraphics[width=1\textwidth]{correlation}
|
||||
\titlecaption{\label{correlationfig} Korrelationen zwischen Datenpaaren.}{}
|
||||
\end{figure}
|
||||
Until now we described properties of univariate data sets. In
|
||||
bivariate or multivariate data sets where we have pairs or tuples of
|
||||
data values (e.g. the size and the weight of elephants) we want to analyze
|
||||
dependencies between the variables.
|
||||
|
||||
Bisher haben wir Eigenschaften einer einzelnen Me{\ss}gr\"o{\ss}e
|
||||
angeschaut. Bei mehreren Me{\ss}gr\"o{\ss}en, kann nach
|
||||
Abh\"angigkeiten zwischen den beiden Gr\"o{\ss}en gefragt werden. Der
|
||||
\determ[Korrelationskoeffizient]{Korrelations\-koeffizient}
|
||||
The \enterm{correlation coefficient}
|
||||
\[ r_{x,y} = \frac{Cov(x,y)}{\sigma_x \sigma_y} = \frac{\langle
|
||||
(x-\langle x \rangle)(y-\langle y \rangle) \rangle}{\sqrt{\langle
|
||||
(x-\langle x \rangle)^2 \rangle} \sqrt{\langle (y-\langle y
|
||||
\rangle)^2 \rangle}} \]
|
||||
quantifiziert einfache lineare Zusammenh\"ange \matlabfun{corr()}. Der
|
||||
Korrelationskoeffizient ist die \determ{Kovarianz} normiert durch die
|
||||
Standardabweichungen. Perfekt korrelierte Variablen ergeben einen
|
||||
Korrelationskoeffizienten von $+1$, antikorrelierte Daten einen
|
||||
Korrelationskoeffizienten von $-1$ und nicht korrelierte Daten einen
|
||||
Korrelationskoeffizienten nahe Null (\figrefb{correlationfig}).
|
||||
quantifies linear relationships between two variables
|
||||
\matlabfun{corr()}. The correlation coefficient is the
|
||||
\enterm{covariance} normalized by the standard deviations of the
|
||||
single variables. Perfectly correlated variables result in a
|
||||
correlation coefficient of $+1$, anti-correlated or negatively
|
||||
correlated data in a correlation coefficient of $-1$ and uncorrelated
|
||||
data in a correlation coefficient close to zero
|
||||
(\figrefb{correlationfig}).
|
||||
|
||||
\begin{figure}[tp]
|
||||
\includegraphics[width=1\textwidth]{correlation}
|
||||
\titlecaption{\label{correlationfig} Korrelationen zwischen Datenpaaren.}{}
|
||||
\end{figure}
|
||||
|
||||
\begin{exercise}{correlations.m}{}
|
||||
Generate pairs of random numbers with four different correlations
|
||||
(perfectly correlated, somewhat correlated, uncorrelated, negatively
|
||||
correlated). Plot them into a scatter plot and compute their
|
||||
correlation coefficient.
|
||||
\end{exercise}
|
||||
|
||||
Nichtlineare Abh\"angigkeiten werden von dem Korrelationskoeffizienten
|
||||
nur unzureichend oder \"uberhaupt nicht erfasst (\figref{nonlincorrelationfig}).
|
||||
Note that non-linear dependencies between two variables are
|
||||
insufficiently or not at all detected by the correlation coefficient
|
||||
(\figref{nonlincorrelationfig}).
|
||||
|
||||
\begin{figure}[tp]
|
||||
\includegraphics[width=1\textwidth]{nonlincorrelation}
|
||||
\titlecaption{\label{nonlincorrelationfig} Korrelationen bei
|
||||
nichtlineare Zusammenh\"angen.}{Der Korrelationskoeffizienten
|
||||
erfasst nur lineare Zusammenh\"ange. Sowohl die quadratische
|
||||
Abh\"angigkeit (links) als auch eine Rauschkorrelation (rechts),
|
||||
bei der die Streuung der $y$-Werte von $x$ abh\"angen, ergeben
|
||||
Korrelationskeffizienten nahe Null. $\xi$ sind normalverteilte
|
||||
Zufallszahlen.}
|
||||
\titlecaption{\label{nonlincorrelationfig} Correlations for
|
||||
non-linear dependencies.}{The correlation coefficient detects
|
||||
linear dependencies only. Both the quadratic dependency (left) and
|
||||
the noise correlation (right), where the dispersion of the
|
||||
$y$-values depends on the $x$-value, result in correlation
|
||||
coefficients close to zero. $\xi$ denote normally distributed
|
||||
random numbers.}
|
||||
\end{figure}
|
||||
|
Reference in New Issue
Block a user