\documentclass{beamer}

% ---- packages ----
\usepackage{xcolor}
\usepackage{listings}
\usepackage{pgf}
%\usepackage{pgf,pgfarrows,pgfnodes,pgfautomata,pgfheaps,pgfshade}
%\usepackage{multimedia}
\usepackage[latin1]{inputenc}
\usepackage{amsmath}
\usepackage{bm}
\usepackage[T1]{fontenc}
\usepackage{hyperref}
% ulem is used for \sout; normalem keeps \emph/\em as italics instead of underlining
\usepackage[normalem]{ulem}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% ---- theme settings (applied in presentation mode) ----
% NOTE(review): the mode specification was missing in the collapsed source;
% <presentation> is assumed -- confirm against the original file.
\mode<presentation>
{
  \usetheme{Singapore}
  \setbeamercovered{opaque}
  \usecolortheme{tuebingen}
  \setbeamertemplate{navigation symbols}{}
  \usefonttheme{default}
  \useoutertheme{infolines}
  % \useoutertheme{miniframes}
}

% ---- show a frame with the section title at the beginning of every section ----
\AtBeginSection[]
{
  \begin{frame}
    \begin{center}
      \Huge \insertsectionhead
    \end{center}
    % \frametitle{\insertsectionhead}
    % \tableofcontents[currentsection,hideothersubsections]
  \end{frame}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\setbeamertemplate{blocks}[rounded][shadow=true]

% ---- title page information ----
\title[]{Scientific Computing -- Statistics}
\author[Statistics]{Fabian Sinz\\Dept.~Neuroethology, University T\"ubingen\\
  Bernstein Center T\"ubingen}
\institute[Scientific Computing]{}
\date{11/27/2013}
%\logo{\pgfuseimage{logo}}
\subject{Lectures}

%%%%%%%%%% configuration for code
\lstset{
  basicstyle=\ttfamily,
  numbers=left,
  showstringspaces=false,
  language=Matlab,
  commentstyle=\itshape\color{darkgray},
  keywordstyle=\color{blue},
  stringstyle=\color{green},
  backgroundcolor=\color{blue!10},
  breaklines=true,
  breakautoindent=true,
  columns=flexible,
  frame=single,
  captionpos=b,
  xleftmargin=1em,
  xrightmargin=1em,
  aboveskip=10pt
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \mycite{<text>}: typesets <text> as a tiny, right-aligned, dark-gray citation.
% The trailing % signs keep the line ends from inserting spurious spaces.
\newcommand{\mycite}[1]{%
  \begin{flushright}%
    \tiny \color{black!80} #1%
  \end{flushright}%
}

\input{../latex/environments.tex}
\makeatother

\begin{document}

\begin{frame}
  \titlepage
\end{frame}

\begin{frame}
  \frametitle{plan}
  \setcounter{tocdepth}{1}
  \tableofcontents
\end{frame}

\begin{frame}
  \frametitle{information}
  \begin{itemize}
    \item Samuels, M. L., Wittmer, J. A., \& Schaffner, A. A. (2010). Statistics for the Life Sciences (4th ed., p. 668). Prentice Hall.
    \item Zar, J. H. (1999).
Biostatistical Analysis. (D. Lynch, Ed.) Prentice Hall New Jersey (4th ed., Vol. 4th, p. 663). Prentice Hall. doi:10.1037/0012764
    \item \url{http://stats.stackexchange.com}
  \end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% errorbars (error bar paper)
% confidence intervals (sources of error)
% plotting (the right plot for the right data, Dan plotting paper)
% statistical test structure (bootstrapping, resampling, permutation)
% Don'ts: repeated testing, exclude data points
% study design
% PCA

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section[Prelude]{Prelude}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% ----------------------------------------------------------
\begin{frame}
  \frametitle{my expectations for this course}
  \begin{itemize}
    \item interest and participation
    \item motivation to understand and question concepts
    \item high scientific standard
    \item intellectual honesty
    \item sincere cooperation
  \end{itemize}
\end{frame}

% ----------------------------------------------------------
\begin{frame}
  \frametitle{this week will be \dots}
  \only<1>{
    \framesubtitle{\dots\ no \sout{fun} piece of cake}
    \begin{center}
      \includegraphics[height=0.7\textheight]{figs/feeding.jpg}
    \end{center}
  }
  \only<2>{
    \framesubtitle{\dots\ no \sout{fun} piece of cake}
    \begin{center}
      \includegraphics[height=0.7\textheight]{figs/nacho-trainer.jpg}
    \end{center}
  }
  \only<3>{
    \framesubtitle{\dots\ no lecture (please!)}
    \begin{center}
      \includegraphics[height=0.7\textheight]{figs/soccer.jpg}
    \end{center}
  }
\end{frame}

% ----------------------------------------------------------
\begin{frame}
  \frametitle{What you should learn this week}
  \begin{itemize}
    \item What makes good plots?
    \item What is descriptive/inferential statistics?
    \item What is the general structure of a statistical test?
    \item What does a p-value mean?
    \item How can I build my own tests?
    \item How large should my $n$ be?
\item What is \emph{maximum likelihood} and why is it important?
  \end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section[descriptive statistics, errorbars, and plots]{Day 1 -- descriptive statistics, errorbars, and plots}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{types of data}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
  \frametitle{data scales}
  \framesubtitle{What data types are distinguished in statistics?}
  \Large \textbf{Why are data types important?}
  \pause
  \begin{itemize}
    \item selection of statistics
    \item selection of plots
    \item selection of correct tests
  \end{itemize}
\end{frame}

%-------------------------------------------------------------
\begin{frame}
  \frametitle{data scales}
  \framesubtitle{nominal/categorical scale}
  \begin{itemize}
    \item properties like cell type, experimental group (e.g.\ treatment 1, treatment 2, control)
    \item each observation/sample is put into one category
    \item there is no reasonable order among the categories
    \item example: [rods, cones] vs.
[cones, rods]
  \end{itemize}
\end{frame}

%-------------------------------------------------------------
\begin{frame}
  \frametitle{data scales}
  \framesubtitle{ordinal scale}
  \begin{itemize}
    \item like nominal scale, but there is an order
    \item \textbf{but:} there is no reasonable measure of \emph{distance} between the classes
    \item examples: ranks, ratings
  \end{itemize}
\end{frame}

%-------------------------------------------------------------
\begin{frame}
  \frametitle{data scales}
  \framesubtitle{interval scale}
  \begin{itemize}
    \item quantitative/metric values
    \item reasonable measure of distance between values but no absolute zero
    \item examples: temperature in $^\circ$C
  \end{itemize}
\end{frame}

%-------------------------------------------------------------
\begin{frame}
  \frametitle{data scales}
  \framesubtitle{absolute/ratio scale}
  \begin{itemize}
    \item like interval scale but with absolute zero
    \item example: temperature in K
  \end{itemize}
  \pause
  \begin{emphasize}{relationships between scales}
    \begin{itemize}
      \item scales exhibit increasing information content from nominal to absolute
      \item conversion ``downwards'' always possible
    \end{itemize}
  \end{emphasize}
\end{frame}

%-------------------------------------------------------------
\begin{frame}
  \frametitle{examples from neuroscience and psychology}
  \begin{itemize}
    \item \textbf{nominal:}\pause
      \begin{itemize}
        \item treatment group
        \item stimulus class
        \item cell type
      \end{itemize}
    \item \textbf{ordinal:} \pause
      \begin{itemize}
        \item ratings
        \item clinical stages of a disease
        \item states of an ion channel
      \end{itemize}
    \item \textbf{absolute/ratio:}\pause
      \begin{itemize}
        \item firing rate
        \item membrane potential
        \item ion concentration
      \end{itemize}
  \end{itemize}
\end{frame}

%-------------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{statistics}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%-------------------------------------------------------------
\begin{frame}
\frametitle{What is ``a statistic''?}
  \begin{definition}{statistic}
    A statistic (singular) is a single measure of some attribute of a
    sample (e.g., its arithmetic mean value). It is calculated by
    applying a function (statistical algorithm) to the values of the
    items of the sample, which are known together as a set of data.
    \source{http://en.wikipedia.org/wiki/Statistic}
  \end{definition}
\end{frame}

%-------------------------------------------------------------
\begin{frame}
  \frametitle{examples of test statistics}
  \begin{itemize}
    \item \textbf{nominal:}\pause
      \begin{itemize}
        \item count
        \item relative frequency/proportion
      \end{itemize}
    \item \textbf{ordinal:} \pause
      \begin{itemize}
        \item median
        \item quantile/percentile
        \item rank correlation
      \end{itemize}
    \item \textbf{absolute/ratio:}\pause
      \begin{itemize}
        \item mean
        \item variance/standard deviation
        \item Pearson correlation
      \end{itemize}
  \end{itemize}
\end{frame}

%-------------------------------------------------------------
\begin{frame}
  \frametitle{exercise}
  \begin{task}{Spearman rank correlation}
    \begin{enumerate}
      \item Use \texttt{randi} to generate two 100-dimensional vectors \texttt{x,y} of random integers between $0$ and $10$.
      \item Find out how to compute the Spearman rank correlation
        \[
          \rho = 1 - \frac{6 \sum d_i^2}{n(n^2 - 1)}
        \]
        with Matlab. Here, $d_i = \operatorname{rank}(x_i) - \operatorname{rank}(y_i)$ is the difference between the ranks of the $i$-th data points.
      \item Compute $\rho$ between $x$ and $y$, between $x$ and $y^2$, between $\log(x+1)$ and $y^2$.
      \item Compute the ``standard'' (Pearson) correlation coefficient between these values.
      \item What can you observe and why does it make sense?
\end{enumerate}
  \end{task}
\end{frame}

%-------------------------------------------------------------
% Solution to the Spearman exercise. The frame is [fragile] because it
% contains a listing, so its \end{frame} has to stay alone on its line.
% The integers are drawn from 0..10 as the exercise asks; the printed
% "ans" values are illustrative output of one random draw.
\begin{frame}[fragile]
  \frametitle{solution}
  \begin{solution}{Spearman rank correlation}
    \scriptsize
    \begin{lstlisting}
>>> x = randi([0, 10], 100, 1);
>>> y = randi([0, 10], 100, 1);
>>> corr(x,y,'type','Spearman')
ans = 0.1220
>>> corr(x,y.^2,'type','Spearman')
ans = 0.1220
>>> corr(x,y,'type','Pearson')
ans = 0.1074
>>> corr(x,y.^2,'type','Pearson')
ans = 0.0551
    \end{lstlisting}
    The rank correlation does not change under a strictly increasing
    transformation of the data. Therefore, it can be used for ordinal
    data. The Pearson correlation coefficient does not have that
    property.
  \end{solution}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{description of data and plotting}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{nominal scale}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{}
  \begin{center}
    \Huge What makes a good plot?
  \end{center}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{features of a good plot}
  A good plot
  \begin{itemize}
    \item helps the reader to clearly understand your point.\pause
    \item is not misleading and lets the reader judge the information on her own (different y-axis/length scales in two related plots, ``squeezing'' via log-plots). \pause
    \item contains information about the data (a comic might be illustrative, but does not contain information about the data).\pause
    \item adheres to the principle of \emph{ink minimization}.
\end{itemize}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{features of a good plot}
  \framesubtitle{design/organization}
  \begin{itemize}
    \item Is the display consistent with the model or hypothesis being tested?\pause
    \item Are there ``empty dimensions'' in the display that could be removed (a 3D pie chart for 2D categorical data, extraneous colors that do not encode meaningful information)?\pause
    \item Does the display provide an honest and transparent portrayal of the data (hiding, smoothing, modifying data points should be avoided or explicitly mentioned)?
  \end{itemize}
  \mycite{Allen et al.\ 2012, Neuron}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{features of a good plot}
  \framesubtitle{axes}
  \begin{itemize}
    \item Are axis scales defined as linear, log, or radial?\pause
    \item Does each axis label describe the variable and its units (use ``a.u.'' for arbitrary units)?\pause
    \item Are axes limits appropriate for the data (the graphic should not be bounded at zero if the data can take on both positive and negative values)?\pause
    \item Is the aspect ratio appropriate for the data (when x and y axes contrast the same variable under different conditions the graphic should be square)?
  \end{itemize}
  \mycite{Allen et al.\ 2012, Neuron}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{features of a good plot}
  \framesubtitle{color mapping}
  \begin{itemize}
    \item Is a color bar provided?\pause
    \item Is the color map sensible for the data type (does the data extend to both $\pm$, does it live in an interval, is it circular)?\pause
    \item Are contrasting colors consistent with a natural interpretation?
    \item Can features be discriminated when printed in grayscale?
    \item Has red/green contrast been avoided to accommodate common forms of colorblindness?
  \end{itemize}
  \mycite{Allen et al.
2012, Neuron}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{features of a good plot}
  \framesubtitle{uncertainty}
  \begin{itemize}
    \item Does the display indicate the uncertainty of estimated parameters?\pause
    \item Is the type of error surface appropriate for the data?
      \begin{itemize}
        \item Use standard deviations to describe variability in the population.\pause
        \item Use standard errors or confidence intervals to make inferences about parameters estimated from a sample.\pause
        \item Parametric confidence intervals should only be used if data meet the assumptions of the underlying model.\pause
      \end{itemize}
    \item Are the units of uncertainty defined (is it the standard error, is it a $95\%$ confidence interval)?
  \end{itemize}
  \mycite{Allen et al.\ 2012, Neuron}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{features of a good plot}
  \framesubtitle{annotation}
  \begin{itemize}
    \item Are all symbols defined, preferably by directly labeling objects?\pause
    \item Is the directionality of a contrast between conditions obvious?\pause
    \item Is the number of samples or independent experiments indicated?\pause
    \item Are statistical procedures and criteria for significance described?\pause
    \item Are uncommon abbreviations avoided or clearly defined?\pause
    \item Are abbreviations consistent with those used in the text?
  \end{itemize}
  \mycite{Allen et al.\ 2012, Neuron}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{suboptimal example}
  \begin{center}
    \includegraphics[width=.5\linewidth]{figs/nobelbad}
  \end{center}
  \mycite{Hafting et al.
2005, Nature}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{different axes}
\end{frame}

%-------------------------------------------------------------
\begin{frame}
  \frametitle{Bad bar plot}
  \begin{center}
    \includegraphics[width=.8\linewidth]{figs/badbarplot}
  \end{center}
  \source{www.enfovis.com}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{plotting nominal data}
  \framesubtitle{bar plot for count and relative frequency}
  \begin{center}
    \includegraphics[width=.8\linewidth]{figs/nominaldataplot}
  \end{center}
\end{frame}

%-------------------------------------------------------------
% Matlab code for the bar plot. The listing content starts at column 0
% because the listings settings in this file do not strip leading whitespace.
\begin{frame}[fragile]
  \frametitle{plotting nominal data}
  \framesubtitle{bar plot for count and relative frequency}
  \scriptsize
  \begin{lstlisting}
% plot
bar([1,2], [50, 90], 'facecolor', 'k')
% labels axes
ylabel('cell count')
xlabel('cell type')
% cosmetics
xlim([0.5,2.5])
ylim([0, 100])
box('off')
set(gca,'XTick',1:2,'XTickLabel',{'pyramidal','interneuron'},'FontSize',20)
% settings for saving the figure
set(gcf, 'PaperUnits', 'centimeters');
set(gcf, 'PaperSize', [11.7 9.0]);
set(gcf, 'PaperPosition',[0.0 0.0 11.7 9.0]);
  \end{lstlisting}
\end{frame}

%----------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{plotting nominal data}
  \framesubtitle{pie chart for count and relative frequency}
  \begin{center}
    \includegraphics[width=.8\linewidth]{figs/nominaldataplot2}
  \end{center}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{plotting nominal data}
  \framesubtitle{exercise}
  \begin{task}{pie chart}
    Plot the same data ($n_{\mathrm{py}}=50$, $n_{\mathrm{in}}=90$) as a pie chart in Matlab.
\end{task}
\end{frame}

%-------------------------------------------------------------
% Matlab code for the pie chart. The listing content starts at column 0
% because the listings settings in this file do not strip leading whitespace.
\begin{frame}[fragile]
  \frametitle{plotting nominal data}
  \framesubtitle{pie chart for relative frequency}
  \scriptsize
  \begin{lstlisting}
data = [50, 90];
h = pie(data, [1,0], {'pyramidal (n=50)', 'interneuron (n=90)'})
hText = findobj(h,'Type','text') % text object handles
set(h(1), 'FaceColor', [.2,.2,.2]);
set(h(2), 'Rotation', 45);
set(h(3), 'FaceColor', [.8,.8,.8]);
set(h(4), 'Rotation', 45);
title('cell count')
set(gca,'XTick',1:2,'XTickLabel',{'pyramidal', 'interneuron'})
box('off')
set(gcf, 'PaperUnits', 'centimeters');
set(gcf, 'PaperSize', [11.7 9.0]);
set(gcf, 'PaperPosition',[0.0 0.0 11.7 9.0]);
  \end{lstlisting}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{plotting interval/ratio/absolute data}
  \framesubtitle{histogram}
  \begin{center}
    \includegraphics[width=.8\linewidth]{figs/histogram}
  \end{center}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{plotting interval/ratio/absolute data}
  \framesubtitle{bad choice of bins}
  \begin{center}
    \includegraphics[width=.4\linewidth]{figs/histogrambad}
    \includegraphics[width=.4\linewidth]{figs/histogrambad2}
  \end{center}
  \begin{summary}{Rule of thumb}
    Choose the number of bins $b\approx n/20$.
\end{summary}
\end{frame}

%-------------------------------------------------------------
% Matlab code for the histogram. The listing content starts at column 0
% because the listings settings in this file do not strip leading whitespace.
\begin{frame}[fragile]
  \frametitle{plotting interval/ratio/absolute data}
  \framesubtitle{how to do it in Matlab}
  \scriptsize
  \begin{lstlisting}
x = randn(2000,1); % generate Gaussian data
hist(x, 50); % generate histogram
% set facecolor to gray
h = findobj(gca, 'Type','patch');
set(h(1), 'FaceColor',[.2,.2,.2], 'EdgeColor','w', 'linewidth',2)
% plot a white grid over it
h = gridxy([],get(gca,'ytick'),'color','w','linewidth',2)
uistack(h, 'top')
% cosmetics
box('off');
xlabel('Data')
ylabel('Count')
  \end{lstlisting}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{plotting interval/ratio/absolute data}
  \framesubtitle{other ways}
  There are other ways to plot a sample $x_1, \dots, x_n$ of
  interval/ratio/absolute scale data. For example:
  \begin{itemize}
    \item box plot
    \item bar plot
    \item smoothed histogram
    \item \dots
  \end{itemize}
  We will look at them while plotting mixed data in the following.
\end{frame}

\end{document}