diff --git a/programming/lectures/Makefile b/programming/lectures/Makefile new file mode 100644 index 0000000..dec2459 --- /dev/null +++ b/programming/lectures/Makefile @@ -0,0 +1,20 @@ +TEXFILES=$(wildcard *.tex) +TEXFILES=boolean_logical_indexing.tex control_structures.tex data_structures.tex plotting_spike_trains.tex programming_basics.tex scripts_functions.tex sta_stc.tex variables_datatypes.tex vectors_matrices.tex + +PDFFILES=$(TEXFILES:.tex=.pdf) + +pdf : $(PDFFILES) + +$(PDFFILES) : %.pdf : %.tex + pdflatex -interaction=scrollmode $< | tee /dev/stderr | fgrep -q "Rerun to get cross-references right" && pdflatex -interaction=scrollmode $< || true + +clean : + rm -f *~ $(TEXFILES:.tex=.aux) $(TEXFILES:.tex=.log) $(TEXFILES:.tex=.out) $(TEXFILES:.tex=.nav) $(TEXFILES:.tex=.snm) $(TEXFILES:.tex=.toc) $(TEXFILES:.tex=.vrb) + +cleanall : clean + rm -f $(PDFFILES) + +watch : + while true; do ! make -q pdf && make pdf; sleep 0.5; done + + diff --git a/statistics/code/checkmymedian.m b/statistics/code/checkmymedian.m new file mode 100644 index 0000000..5c90834 --- /dev/null +++ b/statistics/code/checkmymedian.m @@ -0,0 +1,12 @@ +% check whether the median returned by mymedian +% really separates a vector into two halfs +for i = 1:140 % loop over different length + for k = 1:10 % try several times + a = randn( i, 1 ); % generate some data + m = mymedian( a ) % compute median + if length( a(a>m) ) ~= length( a(a=x1)&(x=x1)&(r=q(1)) & (b=q(1)) & (b=q(2)) & (b=q(2)) & (b=q(3)), h(b>=q(3)), 'FaceColor', [0.5 0 0.5] ); +hold off; diff --git a/statistics/code/mymedian.m b/statistics/code/mymedian.m new file mode 100644 index 0000000..d2af4d0 --- /dev/null +++ b/statistics/code/mymedian.m @@ -0,0 +1,13 @@ +function m = mymedian( x ) +% returns the median of the vector x + xs = sort( x ); + if ( length( xs ) == 0 ) + m = NaN; + elseif ( rem( length( xs ), 2 ) == 0 ) + index = length( xs )/2; + m = (xs( index ) + xs( index+1 ))/2; + else + index = (length( xs ) + 1)/2; + m = xs( index ); + end +end diff --git a/statistics/code/quartiles.m b/statistics/code/quartiles.m index 9b38af1..3f9ebe5 100644 --- a/statistics/code/quartiles.m +++ b/statistics/code/quartiles.m @@ -1,25 +1,15 @@ -% generate data: -x = randn( 1, 100000 ); - -% histogram: -[h,b] = hist( x, 100 ); -% normalize: -bs = b(2)-b(1); -h = h/sum(h)/bs; - -% plot: -bar( b, h ); -xlabel( 'x' ); - -% median, quartile: -xs = sort( x ) -q = [ xs(length(xs)/4), xs(length(xs)/2), xs(3*length(xs)/4) ]; -%q = quantile( x, [0.25, 0.5, 0.75 ] ); - -% plot: -bar( b(b=q(1)) & (b=q(1)) & (b=q(2)) & (b=q(2)) & (b=q(3)), h(b>=q(3)), 'FaceColor', [0.5 0 0.5] ); -hold off; +function q = quartiles( x ) + % returns a vector with the first, second, and third quartile of the vector x + xs = sort( x ); + if ( length( xs ) == 0 ) + q = []; + elseif ( rem( length( xs ), 2 ) == 0 ) + index = length( xs )/2; + m = (xs( index ) + xs( index+1 ))/2; + q = [ round( xs(length(xs)/4) ), m, xs(round(3*length(xs)/4)) ]; + else + index = (length( xs ) + 1)/2; + m = xs( index ); + q = [ round( xs(length(xs)/4) ), m, xs(round(3*length(xs)/4)) ]; + end +end diff --git a/statistics/code/randomwalk.m b/statistics/code/randomwalk.m index a442159..a8b1334 100644 --- a/statistics/code/randomwalk.m +++ b/statistics/code/randomwalk.m @@ -1,4 +1,6 @@ function x = randomwalk(n,p) +% returns a random wolk with n steps and +% probability p for positive steps. r = rand(n,1); r(r=p) = +1.0; diff --git a/statistics/code/rollthedie.m b/statistics/code/rollthedie.m new file mode 100644 index 0000000..1842da8 --- /dev/null +++ b/statistics/code/rollthedie.m @@ -0,0 +1,4 @@ +function x = rollthedie( n ) +% return a vector with the result of rolling a die n times + x = randi( [1, 6], n, 1 ); +end diff --git a/statistics/lecture/Makefile b/statistics/lecture/Makefile new file mode 100644 index 0000000..b68ae15 --- /dev/null +++ b/statistics/lecture/Makefile @@ -0,0 +1,18 @@ +TEXFILES=$(wildcard *.tex) +PDFFILES=$(TEXFILES:.tex=.pdf) + +pdf : $(PDFFILES) + +$(PDFFILES) : %.pdf : %.tex + pdflatex -interaction=scrollmode $< | tee /dev/stderr | fgrep -q "Rerun to get cross-references right" && pdflatex -interaction=scrollmode $< || true + +clean : + rm -f *~ $(TEXFILES:.tex=.aux) $(TEXFILES:.tex=.log) $(TEXFILES:.tex=.out) $(TEXFILES:.tex=.nav) $(TEXFILES:.tex=.snm) $(TEXFILES:.tex=.toc) $(TEXFILES:.tex=.vrb) + +cleanall : clean + rm -f $(PDFFILES) + +watch : + while true; do ! make -q pdf && make pdf; sleep 0.5; done + + diff --git a/statistics/lecture/descriptivestatistics.tex b/statistics/lecture/descriptivestatistics.tex index c6e49f5..3ae32ac 100644 --- a/statistics/lecture/descriptivestatistics.tex +++ b/statistics/lecture/descriptivestatistics.tex @@ -1,43 +1,55 @@ -\documentclass{beamer} +\documentclass[12pt]{report} %%%%% title %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\title[]{Scientific Computing --- Descriptive Statistics} -\author[]{Jan Benda} -\institute[]{Neuroethology} -\date[]{WS 15/16} -\titlegraphic{\includegraphics[width=0.3\textwidth]{UT_WBMW_Rot_RGB}} - -%%%%% beamer %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\mode -{ - \usetheme{Singapore} - \setbeamercovered{opaque} - \usecolortheme{tuebingen} - \setbeamertemplate{navigation symbols}{} - \usefonttheme{default} - \useoutertheme{infolines} - % \useoutertheme{miniframes} -} +\title{\tr{Introduction to Scientific Computing}{Einf\"uhrung in die wissenschaftliche Datenverarbeitung}} +\author{Jan Benda\\Abteilung Neuroethologie\\[2ex]\includegraphics[width=0.3\textwidth]{UT_WBMW_Rot_RGB}} +\date{WS 15/16} + +%%%% language %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% \newcommand{\tr}[2]{#1} % en +% \usepackage[english]{babel} +\newcommand{\tr}[2]{#2} % de +\usepackage[german]{babel} -%\AtBeginSection[] -%{ -% \begin{frame} -% \begin{center} -% \Huge \insertsectionhead -% \end{center} -% \end{frame} -%} +%%%%% packages %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\usepackage{pslatex} % nice font for pdf file +\usepackage[breaklinks=true,bookmarks=true,bookmarksopen=true,pdfpagemode=UseNone,pdfstartview=FitH,colorlinks=true,citecolor=blue]{hyperref} -\setbeamertemplate{blocks}[rounded][shadow=true] +%%%% layout %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\usepackage[left=25mm,right=25mm,top=20mm,bottom=30mm]{geometry} \setcounter{tocdepth}{1} -%%%%% packages %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\usepackage[english]{babel} +%%%% graphics %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\usepackage{graphicx} +\usepackage{xcolor} +\newcommand{\texpicture}[1]{{\sffamily\small\input{#1.tex}}} + +%%%%% listings %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\usepackage{listings} +\lstset{ + inputpath=../code, + basicstyle=\ttfamily\footnotesize, + numbers=left, + showstringspaces=false, + language=Matlab, + commentstyle=\itshape\color{darkgray}, + keywordstyle=\color{blue}, + stringstyle=\color{green}, + backgroundcolor=\color{blue!10}, + breaklines=true, + breakautoindent=true, + columns=flexible, + frame=single, + caption={\protect\filename@parse{\lstname}\protect\filename@base}, + captionpos=t, + xleftmargin=1em, + xrightmargin=1em, + aboveskip=10pt +} + +%%%%% math stuff: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \usepackage{amsmath} \usepackage{bm} -\usepackage{pslatex} % nice font for pdf file -%\usepackage{multimedia} - \usepackage{dsfont} \newcommand{\naZ}{\mathds{N}} \newcommand{\gaZ}{\mathds{Z}} @@ -47,59 +59,45 @@ \newcommand{\reZpN}{\mathds{R^+_0}} \newcommand{\koZ}{\mathds{C}} -%%%% graphics %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\usepackage{graphicx} -\newcommand{\texpicture}[1]{{\sffamily\small\input{#1.tex}}} -%%%%% listings %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\usepackage{listings} -\lstset{ - basicstyle=\ttfamily, - numbers=left, - showstringspaces=false, - language=Matlab, - commentstyle=\itshape\color{darkgray}, - keywordstyle=\color{blue}, - stringstyle=\color{green}, - backgroundcolor=\color{blue!10}, - breaklines=true, - breakautoindent=true, - columns=flexible, - frame=single, - captionpos=b, - xleftmargin=1em, - xrightmargin=1em, - aboveskip=10pt - } - - +%%%%% structure: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\usepackage{ifthen} + +\newcommand{\code}[1]{\texttt{#1}} + +\newcommand{\source}[1]{ + \begin{flushright} + \color{gray}\scriptsize \url{#1} + \end{flushright} +} + +\newenvironment{definition}[1][]{\medskip\noindent\textbf{Definition}\ifthenelse{\equal{#1}{}}{}{ #1}:\newline}% + {\medskip} + +%\newcommand{\showlisting}{yes} +\newcommand{\showlisting}{no} +\newcounter{theexercise} +\setcounter{theexercise}{1} +\newenvironment{exercise}[1][]{\medskip\noindent\textbf{\tr{Exercise}{\"Ubung} + \arabic{theexercise}:} \stepcounter{theexercise}\newline \newcommand{\exercisesource}{#1}}% + {\ifthenelse{\equal{\exercisesource}{}}{}{\ifthenelse{\equal{\showlisting}{yes}}{\medskip\lstinputlisting{\exercisesource}}{}}\medskip} + + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{document} -\begin{frame}[plain] - \frametitle{} - \vspace{-1cm} - \titlepage % erzeugt Titelseite -\end{frame} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\begin{frame} - \frametitle{Content} - \tableofcontents -\end{frame} +\maketitle +%\tableofcontents %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\section{Descriptive statistics} - +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\chapter{\tr{Descriptive statistics}{Deskriptive Statistik}} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Statistics of ratio data} +\section{Statistics of real-valued data} -%------------------------------------------------------------- -\begin{frame} - \frametitle{Statistics of ratio data} \begin{itemize} \item Location, central tendency \begin{itemize} @@ -107,7 +105,6 @@ \item median \item mode \end{itemize} - \item Spread, dispersion \begin{itemize} \item variance @@ -116,163 +113,294 @@ \item coefficient of variation \item minimum, maximum \end{itemize} - \item Shape \begin{itemize} \item skewnees \item kurtosis \end{itemize} - \item Dependence \begin{itemize} \item Pearson correlation coefficient \item Spearman's rank correlation coefficient \end{itemize} - \end{itemize} -\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Median, Quartile, Percentile} + +\begin{figure}[t] + \includegraphics[width=1\textwidth]{median} + \caption{\label{medianfig} Median.} +\end{figure} + +\begin{definition}[\tr{median}{Median}] + \tr{Half of the observations $X=(x_1, x_2, \ldots, x_n)$ are + larger than the median and half of them are smaller than the + median.} {Der Median teilt eine Liste von Messwerten so in zwei + H\"alften, dass die eine H\"alfte der Daten nicht gr\"o{\ss}er + und die andere H\"alfte nicht kleiner als der Median ist.} +\end{definition} + +\begin{exercise}[mymedian.m] + \tr{Write a function that computes the median of a vector.} + {Schreibe eine Funktion, die den Median eines Vektors zur\"uckgibt.} +\end{exercise} + +\code{matlab} stellt die Funktion \code{median()} zur Berechnung des Medians bereit. + +\begin{exercise}[checkmymedian.m] + \tr{Write a script that tests whether your median function really + returns a median above which are the same number of data than + below. In particular the script should test data vectors of + different length.} {Schreibe ein Skript, das testet ob die + \code{mymedian} Funktion wirklich die Zahl zur\"uckgibt, \"uber + der genausoviele Datenwerte liegen wie darunter. Das Skript sollte + insbesondere verschieden lange Datenvektoren testen.} +\end{exercise} + +\begin{figure}[t] + \includegraphics[width=1\textwidth]{quartile} + \caption{\label{quartilefig} Median und Quartile.} +\end{figure} + +\begin{definition}[\tr{quartile}{Quartile}] + Die Quartile Q1, Q2 und Q3 unterteilen die Daten in vier gleich + gro{\ss}e Gruppen, die jeweils ein Viertel der Daten enthalten. + Das mittlere Quartil entspricht dem Median. +\end{definition} + +\begin{exercise}[quartiles.m] + \tr{Write a function that computes the first, second, and third quartile of a vector.} + {Schreibe eine Funktion, die das erste, zweite und dritte Quartil als Vektor zur\"uckgibt.} +\end{exercise} + +\subsection{Histogram} + +Histogramme z\"ahlen die H\"aufigkeit $n_i$ des Auftretens von +$N=\sum_{i=1}^M n_i$ Messwerten in $M$ Messbereichsklassen $i$ (Bins). +Die Klassen unterteilen den Wertebereich meist in angrenzende und +gleich gro{\ss}e Intervalle. Histogramme sch\"atzen die +Wahrscheinlichkeitsverteilung der Messwerte ab. + +\begin{exercise}[rollthedie.m] + \tr{Write a function that simulates rolling a die $n$ times.} + {Schreibe eine Funktion, die das $n$-malige W\"urfeln mit einem W\"urfel simuliert.} +\end{exercise} + +\begin{exercise}[diehistograms.m] + \tr{Plot histograms from rolling the die 20, 100, 1000 times. Use + the plain hist(x) function, force 6 bins via hist( x, 6 ), and set + meaningfull bins positions.} {Plotte Histogramme von 20, 100, und + 1000-mal w\"urfeln. Benutze \code{hist(x)}, erzwinge sechs Bins + mit \code{hist(x,6)}, und setze selbst sinnvolle Bins. Normiere + anschliessend das Histogram auf geeignete Weise.} +\end{exercise} + +\begin{figure}[t] + \includegraphics[width=1\textwidth]{diehistograms} + \caption{\label{diehistogramsfig} \tr{Histograms of rolling a die + 100 or 500 times. Left: plain histograms counting the frequency + of the six possible outcomes. Right: the same data normalized + to their sum.}{Histogramme des Ergebnisses von 100 oder 500 mal + W\"urfeln. Links: das absolute Histogramm z\"ahlt die Anzahl des + Auftretens jeder Augenzahl. Rechts: Normiert auf die Summe des + Histogramms werden die beiden Messungen vergleichbar.}} +\end{figure} + +Bei ganzzahligen Messdaten (z.B. die Augenzahl eines W\"urfels) +kann f\"ur jede auftretende Zahl eine Klasse definiert werden. +Damit die H\"ohe der Histogrammbalken unabh\"angig von der Anzahl der Messwerte wird, +normiert man das Histogram auf die Anzahl der Messwerte. +Die H\"ohe der Histogrammbalken gibt dann die Wahrscheinlichkeit $P(x_i)$ +des Auftretens der Gr\"o{\ss}e $x_i$ in der $i$-ten Klasse an +\[ P_i = \frac{n_i}{N} = \frac{n_i}{\sum_{i=1}^M n_i} \; . \] + + +\subsection{Probability density function} + +Meistens haben wir es jedoch mit reellen Messgr\"o{\ss}en zu tun. + +\begin{exercise}[gaussianbins.m] + \tr{Draw 100 random data from a Gaussian distribution and plot + histograms with different bin sizes of the data.} {Ziehe 100 + normalverteilte Zufallszahlen und erzeuge Histogramme mit + unterschiedlichen Klassenbreiten. Was f\"allt auf?} +\end{exercise} + +\begin{figure}[t] + \includegraphics[width=1\textwidth]{pdfhistogram} + \caption{\label{pdfhistogramfig} \tr{Histograms of normally + distributed data with different bin sizes.}{Histogramme mit + verschiednenen Klassenbreiten eines Datensatzes von + normalverteilten Messwerten. Links: Die H\"ohe des absoluten + Histogramms h\"angt von der Klassenbreite ab. Rechts: Bei auf + das Integral normierten Histogrammen werden auch + unterschiedliche Klassenbreiten vergleichbar.}} +\end{figure} + +Histogramme von reellen Messwerten m\"ussen auf das Integral 1 normiert werden, so dass +das Integral (nicht die Summe) \"uber das Histogramm eins ergibt. Das Integral +ist die Fl\"ache des Histograms. Diese setzt sich zusammen aus der Fl\"ache der einzelnen +Histogrammbalken. Diese haben die H\"ohe $n_i$ und die Breite $\Delta x$. Die Gesamtfl\"ache +$A$ des Histogramms ist also +\[ A = \sum_{i=1}^N ( n_i \cdot \Delta x ) = \Delta x \sum_{i=1}^N n_i \] +und das normierte Histogramm hat die H\"ohe +\[ p(x_i) = \frac{n_i}{\Delta x \sum_{i=1}^N n_i} \] +Es muss also nicht nur durch die Summe, sondern auch durch die Breite der Klassen $\Delta x$ +geteilt werden. + +$p(x_i)$ kann keine Wahrscheinlichkeit sein, da $p(x_i)$ nun eine +Einheit hat --- das Inverse der Einheit der Messgr\"osse $x$. Man +spricht von einer Wahrscheinlichkeitsdichte. + +\begin{figure}[t] + \includegraphics[width=1\textwidth]{pdfprobabilities} + \caption{\label{pdfprobabilitiesfig} Wahrscheinlichkeiten bei + einer Wahrscheinlichkeitsdichtefunktion.} +\end{figure} + +\begin{exercise} + \tr{Plot the Gaussian probability density}{Plotte die Gauss'sche Wahrscheinlichkeitsdichte } + \[ p_g(x) = 1/\sqrt{2\pi\sigma^2}e^{-\frac{(x-\mu)^2}{2\sigma^2}}\] + \tr{What does it mean?}{Was bedeutet die folgende Wahrscheinlichkeit?} + \[ P(x_1 < x < x2) = \int_{x_1}^{x_2} p(x) \, dx \] + \tr{How large is}{Wie gro{\ss} ist} + \[ \int_{-\infty}^{+\infty} p(x) \, dx \; ?\] + \tr{Why?}{Warum?} +\end{exercise} + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Data types} -%------------------------------------------------------------- -\begin{frame} - \frametitle{Data types: nominal scale} +\subsubsection{Nominal scale} +\begin{itemize} +\item Binary \begin{itemize} - \item Binary - \begin{itemize} - \item ``yes/no'', - \item ``true/false'', - \item ``success/failure'', etc. - \end{itemize} - \item Categorial - \begin{itemize} - \item cell type (``rod/cone/horizontal cell/bipolar cell/ganglion cell''), - \item blood type (``A/B/AB/0''), - \item parts of speech (``noun/veerb/preposition/article/...''), - \item taxonomic groups (``Coleoptera/Lepidoptera/Diptera/Hymenoptera''), etc. - \end{itemize} - \item Each observation/measurement/sample is put into one category - \item There is no reasonable order among the categories.\\ - example: [rods, cones] vs. [cones, rods] - \pause - \item Statistics: mode, i.e. the most common item + \item ``yes/no'', + \item ``true/false'', + \item ``success/failure'', etc. \end{itemize} -\end{frame} - -%------------------------------------------------------------- -\begin{frame} - \frametitle{Data types: ordinal scale} +\item Categorial \begin{itemize} - \item Like nominal scale, but with an order - \item Examples: ranks, ratings - \begin{itemize} - \item ``bad/ok/good'', - \item ``cold/warm/hot'', - \item ``young/old'', etc. - \end{itemize} - \item {\bf But:} there is no reasonable measure of {\em distance} - between the classes - \pause - \item Statistics: mode, median + \item cell type (``rod/cone/horizontal cell/bipolar cell/ganglion cell''), + \item blood type (``A/B/AB/0''), + \item parts of speech (``noun/veerb/preposition/article/...''), + \item taxonomic groups (``Coleoptera/Lepidoptera/Diptera/Hymenoptera''), etc. \end{itemize} -\end{frame} - -%------------------------------------------------------------- -\begin{frame} - \frametitle{Data types: interval scale} +\item Each observation/measurement/sample is put into one category +\item There is no reasonable order among the categories.\\ + example: [rods, cones] vs. [cones, rods] +\item Statistics: mode, i.e. the most common item +\end{itemize} + +\subsubsection{Ordinal scale} +\begin{itemize} +\item Like nominal scale, but with an order +\item Examples: ranks, ratings \begin{itemize} - \item Quantitative/metric values - \item Reasonable measure of distance between values, but no absolute zero - \item Examples: - \begin{itemize} - \item Temperature in $^\circ$C ($20^\circ$C is not twice as hot as $10^\circ$C) - \item Direction measured in degrees from magnetic or true north - \end{itemize} - \pause - \item Statistics: - \begin{itemize} - \item Central tendency: mode, median, arithmetic mean - \item Dispersion: range, standard deviation - \end{itemize} + \item ``bad/ok/good'', + \item ``cold/warm/hot'', + \item ``young/old'', etc. + \end{itemize} +\item {\bf But:} there is no reasonable measure of {\em distance} + between the classes +\item Statistics: mode, median +\end{itemize} + +\subsubsection{Interval scale} +\begin{itemize} +\item Quantitative/metric values +\item Reasonable measure of distance between values, but no absolute zero +\item Examples: + \begin{itemize} + \item Temperature in $^\circ$C ($20^\circ$C is not twice as hot as $10^\circ$C) + \item Direction measured in degrees from magnetic or true north + \end{itemize} +\item Statistics: + \begin{itemize} + \item Central tendency: mode, median, arithmetic mean + \item Dispersion: range, standard deviation \end{itemize} -\end{frame} +\end{itemize} -%------------------------------------------------------------- -\begin{frame} - \frametitle{Data types: absolute/ratio scale} +\subsubsection{Absolute/ratio scale} +\begin{itemize} +\item Like interval scale, but with absolute origin/zero +\item Examples: \begin{itemize} - \item Like interval scale, but with absolute origin/zero - \item Examples: - \begin{itemize} - \item Temperature in $^\circ$K - \item Length, mass, duration, electric charge, ... - \item Plane angle, etc. - \item Count (e.g. number of spikes in response to a stimulus) - \end{itemize} - \pause - \item Statistics: - \begin{itemize} - \item Central tendency: mode, median, arithmetic, geometric, harmonic mean - \item Dispersion: range, standard deviation - \item Coefficient of variation (ratio standard deviation/mean) - \item All other statistical measures - \end{itemize} + \item Temperature in $^\circ$K + \item Length, mass, duration, electric charge, ... + \item Plane angle, etc. + \item Count (e.g. number of spikes in response to a stimulus) + \end{itemize} +\item Statistics: + \begin{itemize} + \item Central tendency: mode, median, arithmetic, geometric, harmonic mean + \item Dispersion: range, standard deviation + \item Coefficient of variation (ratio standard deviation/mean) + \item All other statistical measures \end{itemize} -\end{frame} +\end{itemize} -%------------------------------------------------------------- -\begin{frame} - \frametitle{Data types} +\subsubsection{Data types} +\begin{itemize} +\item Data type selects \begin{itemize} - \item Data type selects - \begin{itemize} - \item statistics - \item type of plots (bar graph versus x-y plot) - \item correct tests - \end{itemize} - \item Scales exhibit increasing information content from nominal - to absolute.\\ - Conversion ,,downwards'' is always possible - \item For example: size measured in meter (ratio scale) $\rightarrow$ - categories ``small/medium/large'' (ordinal scale) + \item statistics + \item type of plots (bar graph versus x-y plot) + \item correct tests + \end{itemize} +\item Scales exhibit increasing information content from nominal + to absolute.\\ + Conversion ,,downwards'' is always possible +\item For example: size measured in meter (ratio scale) $\rightarrow$ + categories ``small/medium/large'' (ordinal scale) +\end{itemize} + +\subsubsection{Examples from neuroscience} +\begin{itemize} +\item {\bf absolute:} + \begin{itemize} + \item size of neuron/brain + \item length of axon + \item ion concentration + \item membrane potential + \item firing rate \end{itemize} -\end{frame} -%------------------------------------------------------------- -\begin{frame} - \frametitle{Examples from neuroscience} +\item {\bf interval:} \begin{itemize} + \item edge orientation + \end{itemize} - \item {\bf absolute:}\pause - \begin{itemize} - \item size of neuron/brain - \item length of axon - \item ion concentration - \item membrane potential - \item firing rate - \end{itemize} +\item {\bf ordinal:} + \begin{itemize} + \item stages of a disease + \item ratings + \end{itemize} - \item {\bf interval:}\pause - \begin{itemize} - \item edge orientation - \end{itemize} +\item {\bf nominal:} + \begin{itemize} + \item cell type + \item odor + \item states of an ion channel + \end{itemize} - \item {\bf ordinal:} \pause - \begin{itemize} - \item stages of a disease - \item ratings - \end{itemize} +\end{itemize} - \item {\bf nominal:}\pause - \begin{itemize} - \item cell type - \item odor - \item states of an ion channel - \end{itemize} - \end{itemize} -\end{frame} +\end{document} -\end{document} \ No newline at end of file +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Statistics} +What is "a statistic"? % dt. Sch\"atzfunktion +\begin{definition}[statistic] + A statistic (singular) is a single measure of some attribute of a + sample (e.g., its arithmetic mean value). It is calculated by + applying a function (statistical algorithm) to the values of the + items of the sample, which are known together as a set of data. + + \source{http://en.wikipedia.org/wiki/Statistic} +\end{definition} diff --git a/statistics/lecture/diehistograms.py b/statistics/lecture/diehistograms.py new file mode 100644 index 0000000..d5e0380 --- /dev/null +++ b/statistics/lecture/diehistograms.py @@ -0,0 +1,32 @@ +import numpy as np +import matplotlib.pyplot as plt + +# roll the die: +x1 = np.random.random_integers( 1, 6, 100 ) +x2 = np.random.random_integers( 1, 6, 500 ) +bins = np.arange(0.5, 7, 1.0) + +plt.xkcd() + +fig = plt.figure( figsize=(6,4) ) +ax = fig.add_subplot( 1, 2, 1 ) +ax.spines['right'].set_visible(False) +ax.spines['top'].set_visible(False) +ax.yaxis.set_ticks_position('left') +ax.xaxis.set_ticks_position('bottom') +ax.set_xlabel( 'x' ) +ax.set_ylabel( 'Frequency' ) +ax.hist([x2, x1], bins, color=['#FFCC00', '#FFFF66' ]) + +ax = fig.add_subplot( 1, 2, 2 ) +ax.spines['right'].set_visible(False) +ax.spines['top'].set_visible(False) +ax.yaxis.set_ticks_position('left') +ax.xaxis.set_ticks_position('bottom') +ax.set_xlabel( 'x' ) +ax.set_ylabel( 'Probability' ) +ax.hist([x2, x1], bins, normed=True, color=['#FFCC00', '#FFFF66' ]) +plt.tight_layout() +fig.savefig( 'diehistograms.pdf' ) +plt.show() + diff --git a/statistics/lecture/median.py b/statistics/lecture/median.py new file mode 100644 index 0000000..2bf420c --- /dev/null +++ b/statistics/lecture/median.py @@ -0,0 +1,33 @@ +import numpy as np +import matplotlib.pyplot as plt + +# normal distribution: +x = np.arange( -4.0, 4.0, 0.01 ) +g = np.exp(-0.5*x*x)/np.sqrt(2.0*np.pi) + +plt.xkcd() +fig = plt.figure( figsize=(6,4) ) +ax = fig.add_subplot( 1, 1, 1 ) +ax.spines['right'].set_visible(False) +ax.spines['top'].set_visible(False) +ax.yaxis.set_ticks_position('left') +ax.xaxis.set_ticks_position('bottom') +ax.set_xlabel( 'x' ) +ax.set_ylabel( 'Probability density p(x)' ) +ax.set_ylim( 0.0, 0.46 ) +ax.set_yticks( np.arange( 0.0, 0.45, 0.1 ) ) +ax.text(-1.0, 0.1, '50%', ha='center' ) +ax.text(+1.0, 0.1, '50%', ha='center' ) +ax.annotate('Median', + xy=(0.1, 0.3), xycoords='data', + xytext=(1.6, 0.35), textcoords='data', ha='left', + arrowprops=dict(arrowstyle="->", relpos=(0.0,0.5), + connectionstyle="angle3,angleA=10,angleB=40") ) +ax.fill_between( x[x<0], 0.0, g[x<0], color='#ffcc00' ) +ax.fill_between( x[x>0], 0.0, g[x>0], color='#99ff00' ) +ax.plot(x,g, 'b', lw=4) +ax.plot([0.0, 0.0], [0.0, 0.45], 'k', lw=2 ) +plt.tight_layout() +fig.savefig( 'median.pdf' ) +plt.show() + diff --git a/statistics/lecture/pdfhistogram.py b/statistics/lecture/pdfhistogram.py new file mode 100644 index 0000000..039b524 --- /dev/null +++ b/statistics/lecture/pdfhistogram.py @@ -0,0 +1,39 @@ +import numpy as np +import matplotlib.pyplot as plt + +# normal distribution: +x = np.arange( -4.0, 4.0, 0.01 ) +g = np.exp(-0.5*x*x)/np.sqrt(2.0*np.pi) +r = np.random.randn( 100 ) + +plt.xkcd() + +fig = plt.figure( figsize=(6,4) ) +ax = fig.add_subplot( 1, 2, 1 ) +ax.spines['right'].set_visible(False) +ax.spines['top'].set_visible(False) +ax.yaxis.set_ticks_position('left') +ax.xaxis.set_ticks_position('bottom') +ax.set_xlabel( 'x' ) +ax.set_ylabel( 'Frequency' ) +#ax.set_ylim( 0.0, 0.46 ) +#ax.set_yticks( np.arange( 0.0, 0.45, 0.1 ) ) +ax.hist(r, 5, color='#CC0000') +ax.hist(r, 20, color='#FFCC00') + +ax = fig.add_subplot( 1, 2, 2 ) +ax.spines['right'].set_visible(False) +ax.spines['top'].set_visible(False) +ax.yaxis.set_ticks_position('left') +ax.xaxis.set_ticks_position('bottom') +ax.set_xlabel( 'x' ) +ax.set_ylabel( 'Probability density p(x)' ) +#ax.set_ylim( 0.0, 0.46 ) +#ax.set_yticks( np.arange( 0.0, 0.45, 0.1 ) ) +ax.hist(r, 5, normed=True, color='#CC0000') +ax.hist(r, 20, normed=True, color='#FFCC00') + +plt.tight_layout() +fig.savefig( 'pdfhistogram.pdf' ) +plt.show() + diff --git a/statistics/lecture/pdfprobabilities.py b/statistics/lecture/pdfprobabilities.py new file mode 100644 index 0000000..6481da1 --- /dev/null +++ b/statistics/lecture/pdfprobabilities.py @@ -0,0 +1,36 @@ +import numpy as np +import matplotlib.pyplot as plt + +# normal distribution: +x = np.arange( -3.0, 5.0, 0.01 ) +g = np.exp(-0.5*x*x)/np.sqrt(2.0*np.pi) +x1=0.0 +x2=1.0 + +plt.xkcd() +fig = plt.figure( figsize=(6,4) ) +ax = fig.add_subplot( 1, 1, 1 ) +ax.spines['right'].set_visible(False) +ax.spines['top'].set_visible(False) +ax.yaxis.set_ticks_position('left') +ax.xaxis.set_ticks_position('bottom') +ax.set_xlabel( 'x' ) +ax.set_ylabel( 'Probability density p(x)' ) +ax.set_ylim( 0.0, 0.46 ) +ax.set_yticks( np.arange( 0.0, 0.45, 0.1 ) ) +ax.annotate('Gaussian', + xy=(-1.0, 0.28), xycoords='data', + xytext=(-2.5, 0.35), textcoords='data', ha='left', + arrowprops=dict(arrowstyle="->", relpos=(0.5,0.0), + connectionstyle="angle3,angleA=10,angleB=110") ) +ax.annotate('$P(0", relpos=(0.0,0.5), + connectionstyle="angle3,angleA=10,angleB=80") ) +ax.fill_between( x[(x>x1)&(xx1)&(x", relpos=(1.0,0.5), + connectionstyle="angle3,angleA=170,angleB=120") ) +ax.annotate('3. quartile', + xy=(0.75, 0.17), xycoords='data', + xytext=(1.7, 0.22), textcoords='data', ha='left', + arrowprops=dict(arrowstyle="->", relpos=(0.0,0.5), + connectionstyle="angle3,angleA=10,angleB=70") ) +ax.annotate('Median', + xy=(0.1, 0.3), xycoords='data', + xytext=(1.6, 0.35), textcoords='data', ha='left', + arrowprops=dict(arrowstyle="->", relpos=(0.0,0.5), + connectionstyle="angle3,angleA=10,angleB=40") ) +ax.fill_between( x[xq[0])&(xq[0])&(xq[1])&(xq[1])&(xq[2]], 0.0, g[x>q[2]], color='#ffff66' ) +ax.plot(x,g, 'b', lw=4) +ax.plot([0.0, 0.0], [0.0, 0.45], 'k', lw=2 ) +ax.plot([q[0], q[0]], [0.0, 0.4], 'k', lw=2 ) +ax.plot([q[2], q[2]], [0.0, 0.4], 'k', lw=2 ) +plt.tight_layout() +fig.savefig( 'quartile.pdf' ) +plt.show() +