% Beamer slide deck "Scientific Computing -- Statistics".
% Preamble: packages, theme, per-subsection outline frame, listings style,
% and the \mycite helper; followed by the title frame.
\documentclass{beamer}

\usepackage{xcolor}
\usepackage{listings}
\usepackage{pgf}
%\usepackage{pgf,pgfarrows,pgfnodes,pgfautomata,pgfheaps,pgfshade}
%\usepackage{multimedia}
\usepackage[latin1]{inputenc}
\usepackage{amsmath}
\usepackage{bm}
\usepackage[T1]{fontenc}
\usepackage{hyperref}
\usepackage{ulem} % provides \sout, used on the prelude slides

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Theme settings; \mode needs an explicit mode specification.
\mode<presentation>
{
  \usetheme{Singapore}
  \setbeamercovered{opaque}
  \usecolortheme{tuebingen}
  \setbeamertemplate{navigation symbols}{}
  \usefonttheme{default}
  \useoutertheme{infolines}
  % \useoutertheme{miniframes}
}

% Outline frame inserted at the start of every subsection: section name in
% large type plus a table of contents highlighting the current subsection.
\AtBeginSubsection[]
{
  \begin{frame}
    \begin{center}
      \Huge \insertsectionhead
    \end{center}
    \tableofcontents[
      currentsubsection,
      hideothersubsections,
      sectionstyle=show/hide,
      subsectionstyle=show/shaded,
    ]
    % \frametitle{\insertsectionhead}
  \end{frame}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5

\setbeamertemplate{blocks}[rounded][shadow=true]

\title[]{Scientific Computing -- Statistics}
\author[Statistics]{Fabian Sinz\\Dept. Neuroethology, University T\"ubingen\\ Bernstein Center T\"ubingen}
\institute[Scientific Computing]{}
\date{10/20/2014}
%\logo{\pgfuseimage{logo}}

\subject{Lectures}

%%%%%%%%%% configuration for code
\lstset{
  basicstyle=\ttfamily,
  numbers=left,
  showstringspaces=false,
  language=Matlab,
  commentstyle=\itshape\color{darkgray},
  keywordstyle=\color{blue},
  stringstyle=\color{green},
  backgroundcolor=\color{blue!10},
  breaklines=true,
  breakautoindent=true,
  columns=flexible,
  frame=single,
  captionpos=b,
  xleftmargin=1em,
  xrightmargin=1em,
  aboveskip=10pt
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% \mycite{<text>}: typesets <text> right-aligned, in tiny 80% black type.
% The trailing % signs keep line ends from adding spaces to the output.
\newcommand{\mycite}[1]{%
  \begin{flushright}%
    \tiny \color{black!80} #1%
  \end{flushright}%
}

% Project-local environment definitions; the slides also use task, solution,
% emphasize, summary and \source, which are presumably defined here -- TODO confirm.
\input{../latex/environments.tex}
\makeatother % NOTE(review): no matching \makeatletter is visible in this file

\begin{document}

\begin{frame}
  \titlepage
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% errorbars (error bar paper)
% confidence intervals (sources of error)
% plotting (the right plot for the right data, Dan plotting paper)
% statistical test structure
% (bootstrapping, resampling, permutation)
% Don'ts: repeated testing, exclude data points
% study design
% PCA

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section[Prelude]{Prelude}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% ----------------------------------------------------------
\begin{frame}
  \frametitle{my expectations for this course}
  \begin{itemize}
    \item interest and participation
    \item motivation to understand and question concepts
    \item high scientific standard
    \item intellectual honesty
    \item sincere cooperation
  \end{itemize}
\end{frame}

% ----------------------------------------------------------
\begin{frame}
  \frametitle{this week will be \dots}
  \only<1>{
    \framesubtitle{\dots{} no \sout{fun} piece of cake}
    \begin{center}
      \includegraphics[height=0.7\textheight]{figs/feeding.jpg}
    \end{center}
  }
  \only<2>{
    \framesubtitle{\dots{} no \sout{fun} piece of cake}
    \begin{center}
      \includegraphics[height=0.7\textheight]{figs/nacho-trainer.jpg}
    \end{center}
  }
  \only<3>{
    \framesubtitle{\dots{} no lecture (please!)}
    \begin{center}
      \includegraphics[height=0.7\textheight]{figs/soccer.jpg}
    \end{center}
  }
\end{frame}

% ----------------------------------------------------------
\begin{frame}
  \frametitle{What you should learn this week}
  \begin{itemize}
    \item What makes good plots?
    \item What is descriptive/inferential statistics?
    \item What is the general structure of a statistical test?
    \item What does a p-value mean?
    \item How can I build my own tests?
    \item How large should my $n$ be?
    \item What is \emph{maximum likelihood} and why is it important?
  \end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Day 1 -- descriptive statistics and plots}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{types of data}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
  \frametitle{data scales}
  \framesubtitle{What data types are distinguished in statistics?}
  \Large \textbf{Why are data types important?}
  \pause
  \begin{itemize}
    \item selection of statistics
    \item selection of plots
    \item selection of correct tests
  \end{itemize}
\end{frame}

%-------------------------------------------------------------
\begin{frame}
  \frametitle{data scales}
  \framesubtitle{nominal/categorical scale}
  \begin{itemize}
    \item properties like cell type, experimental group (e.g.\ treatment 1, treatment 2, control)
    \item each observation/sample is put into one category
    \item there is no reasonable order among the categories
    \item example: [rods, cones] vs.\ [cones, rods]
  \end{itemize}
\end{frame}

%-------------------------------------------------------------
\begin{frame}
  \frametitle{data scales}
  \framesubtitle{ordinal scale}
  \begin{itemize}
    \item like nominal scale, but there is an order
    \item \textbf{but:} there is no reasonable measure of \emph{distance} between the classes
    \item examples: ranks, ratings
  \end{itemize}
\end{frame}

%-------------------------------------------------------------
\begin{frame}
  \frametitle{data scales}
  \framesubtitle{interval scale}
  \begin{itemize}
    \item quantitative/metric values
    \item reasonable measure of distance between values but no absolute zero
    \item examples: temperature in $^\circ$C
  \end{itemize}
\end{frame}

%-------------------------------------------------------------
\begin{frame}
  \frametitle{data scales}
  \framesubtitle{absolute/ratio scale}
  \begin{itemize}
    \item like interval scale but with absolute zero
    \item example: temperature in K
  \end{itemize}
  \pause
  \begin{emphasize}{relationships between scales}
    \begin{itemize}
      \item
scales exhibit increasing information content from nominal to absolute
      \item conversion ``downwards'' is always possible
    \end{itemize}
  \end{emphasize}
\end{frame}

%-------------------------------------------------------------
\begin{frame}
  \frametitle{examples from neuroscience and psychology}
  \begin{itemize}
    \item \textbf{nominal:}\pause
      \begin{itemize}
        \item treatment group
        \item stimulus class
        \item cell type
      \end{itemize}
    \item \textbf{ordinal:} \pause
      \begin{itemize}
        \item ratings
        \item clinical stages of a disease
        \item states of an ion channel
      \end{itemize}
    \item \textbf{absolute/ratio:}\pause
      \begin{itemize}
        \item firing rate
        \item membrane potential
        \item ion concentration
      \end{itemize}
  \end{itemize}
\end{frame}

%-------------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{statistics}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%-------------------------------------------------------------
\begin{frame}
  \frametitle{What is ``a statistic''?}
  \begin{definition}{statistic}
    A statistic (singular) is a single measure of some attribute of a
    sample (e.g., its arithmetic mean value). It is calculated by
    applying a function (statistical algorithm) to the values of the
    items of the sample, which are known together as a set of data.
    \source{http://en.wikipedia.org/wiki/Statistic}
  \end{definition}
\end{frame}

%-------------------------------------------------------------
\begin{frame}
  \frametitle{examples of test statistics}
  \begin{itemize}
    \item \textbf{nominal:}\pause
      \begin{itemize}
        \item count
        \item relative frequency/proportion
      \end{itemize}
    \item \textbf{ordinal:} \pause
      \begin{itemize}
        \item median
        \item quantile/percentile
        \item rank correlation
      \end{itemize}
    \item \textbf{absolute/ratio:}\pause
      \begin{itemize}
        \item mean
        \item variance/standard deviation
        \item Pearson correlation
      \end{itemize}
  \end{itemize}
\end{frame}

%-------------------------------------------------------------
\begin{frame}
  \frametitle{exercise}
  \begin{task}{Spearman rank correlation}
    \begin{enumerate}
      \item Use \texttt{randi} to generate two vectors \texttt{x,y} with $100$
        random integers between $1$ and $10$ each.
      \item Find out how to compute the Spearman rank correlation
        \[\rho = 1 - \frac{6 \sum d_i^2}{n(n^2 - 1)}\]
        with Matlab. $d_i = \operatorname{rank}(x_i) - \operatorname{rank}(y_i)$
        is the difference between the ranks of the two values of the
        $i$-th data point.
      \item Compute $\rho$ between $x$ and $y$, between $x$ and $y^2$,
        between $\log(x+1)$ and $y^2$.
      \item Compute the ``standard'' (Pearson) correlation coefficient between these values.
      \item What can you observe and why does it make sense?
    \end{enumerate}
  \end{task}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{solution}
  \begin{solution}{Spearman rank correlation}
    \scriptsize
\begin{lstlisting}
>>> x = randi(10, 100, 1);
>>> y = randi(10, 100, 1);
>>> corr(x,y,'type','Spearman')
ans = 0.1220
>>> corr(x,y.^2,'type','Spearman')
ans = 0.1220
>>> corr(x,y,'type','Pearson')
ans = 0.1074
>>> corr(x,y.^2,'type','Pearson')
ans = 0.0551
\end{lstlisting}
    The rank correlation does not change under a monotone
    transformation of the data. Therefore, it can be used for ordinal
    data. The Pearson correlation coefficient does not have that property.
  \end{solution}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{what makes a good plot}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{}
  \begin{center}
    \Huge What makes a good plot?
  \end{center}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{features of a good plot}
  A good plot
  \begin{itemize}
    \item helps the reader to clearly understand your point.\pause
    \item is not misleading and lets the reader judge the information
      on her own (different y-axis/length scales in two related plots,
      ``squeezing'' via log-plots). \pause
    \item contains information about the data (a comic might be
      illustrative, but does not contain information about the data).\pause
    \item adheres to the principle of \emph{ink minimization}.
  \end{itemize}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{features of a good plot}
  \framesubtitle{design/organization}
  \begin{itemize}
    \item Is the display consistent with the model or hypothesis being tested?\pause
    \item Are there ``empty dimensions'' in the display that could be
      removed (A 3D pie chart for 2D categorical data, extraneous colors
      that do not encode meaningful information)?\pause
    \item Does the display provide an honest and transparent portrayal
      of the data (hiding, smoothing, modifying data points should be
      avoided or explicitly mentioned)?
  \end{itemize}
  \mycite{Allen et al. 2012, Neuron}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{features of a good plot}
  \framesubtitle{axes}
  \begin{itemize}
    \item Are axes scales defined as linear, log, or radial?\pause
    \item Does each axis label describe the variable and its units (use ``a.u.''
for arbitrary units)?\pause
    \item Are axes limits appropriate for the data (The graphic should
      not be bounded at zero if the data can take on both positive and
      negative values.)?\pause
    \item Is the aspect ratio appropriate for the data (When x and y
      axes contrast the same variable under different conditions the
      graphic should be square.)?
  \end{itemize}
  \mycite{Allen et al. 2012, Neuron}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{features of a good plot}
  \framesubtitle{color mapping}
  \begin{itemize}
    \item Is a color bar provided?\pause
    \item Is the color map sensible for the data type (does the data
      extend to both $\pm$, does it live in an interval, is it circular)?\pause
    \item Are contrasting colors consistent with a natural interpretation?
    \item Can features be discriminated when printed in grayscale?
    \item Has red/green contrast been avoided to accommodate common
      forms of colorblindness?
  \end{itemize}
  \mycite{Allen et al. 2012, Neuron}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{features of a good plot}
  \framesubtitle{uncertainty}
  \begin{itemize}
    \item Does the display indicate the uncertainty of estimated parameters?\pause
    \item Is the type of error surface appropriate for the data?
      \begin{itemize}
        \item Use standard deviations to describe variability in the population.\pause
        \item Use standard errors or confidence intervals to make
          inferences about parameters estimated from a sample.\pause
        \item Parametric confidence intervals should only be used if
          data meet the assumptions of the underlying model.\pause
      \end{itemize}
    \item Are the units of uncertainty defined (is it standard error,
      is it $95\%$ confidence interval)?
  \end{itemize}
  \mycite{Allen et al.
2012, Neuron}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{features of a good plot}
  \framesubtitle{annotation}
  \begin{itemize}
    \item Are all symbols defined, preferably by directly labeling objects?\pause
    \item Is the directionality of a contrast between conditions obvious?\pause
    \item Is the number of samples or independent experiments indicated?\pause
    \item Are statistical procedures and criteria for significance described?\pause
    \item Are uncommon abbreviations avoided or clearly defined?\pause
    \item Are abbreviations consistent with those used in the text?
  \end{itemize}
  \mycite{Allen et al. 2012, Neuron}
\end{frame}

\subsection{bad examples}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{suboptimal example}
  \begin{center}
    \includegraphics[width=.5\linewidth]{figs/nobelbad}
  \end{center}
  \mycite{Hafting et al. 2005, Nature}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{suboptimal example}
  \begin{center}
    \includegraphics[width=.5\linewidth]{figs/badbarright.png}
  \end{center}
  \source{http://en.wikipedia.org/wiki/Misleading\_graph}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{suboptimal example}
  \begin{center}
    \includegraphics[width=.4\linewidth]{figs/yaxisscalingleft.png}
    \hspace{.5cm}
    \includegraphics[width=.4\linewidth]{figs/yaxisscalingright.png}
  \end{center}
  \source{http://en.wikipedia.org/wiki/Misleading\_graph}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{suboptimal example}
  \begin{center}
    \includegraphics[width=.4\linewidth]{figs/badscatterleft.png}
    \hspace{.5cm}
    \includegraphics[width=.4\linewidth]{figs/badscatterright.png}
  \end{center}
  \source{http://en.wikipedia.org/wiki/Misleading\_graph}
\end{frame}
%-------------------------------------------------------------
\begin{frame}
  \frametitle{suboptimal example}
  \begin{center}
    \includegraphics[width=.8\linewidth]{figs/badbarplot}
  \end{center}
  \source{www.enfovis.com}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{plotting data}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{plotting nominal data}
  \framesubtitle{bar plot for count and relative frequency}
  \begin{center}
    \includegraphics[width=.8\linewidth]{figs/nominaldataplot}
  \end{center}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{plotting nominal data}
  \framesubtitle{bar plot for count and relative frequency}
  \scriptsize
\begin{lstlisting}
% plot
bar([1,2], [50, 90], 'facecolor', 'k')
% labels axes
ylabel('cell count')
xlabel('cell type')
% cosmetics
xlim([0.5,2.5])
ylim([0, 100])
box('off')
set(gca,'XTick',1:2,'XTickLabel',{'pyramidal','interneuron'},'FontSize',20)
% settings for saving the figure
set(gcf, 'PaperUnits', 'centimeters');
set(gcf, 'PaperSize', [11.7 9.0]);
set(gcf, 'PaperPosition',[0.0 0.0 11.7 9.0]);
\end{lstlisting}
\end{frame}

%----------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{plotting nominal data}
  \framesubtitle{pie chart for count and relative frequency}
  \begin{center}
    \includegraphics[width=.8\linewidth]{figs/nominaldataplot2}
  \end{center}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{plotting nominal data}
  \framesubtitle{exercise}
  \begin{task}{pie chart}
    Plot the same data ($n_{\mathrm{py}}=50$, $n_{\mathrm{in}}=90$) as a pie chart in Matlab.
  \end{task}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{plotting nominal data}
  \framesubtitle{pie chart for relative frequency}
  \scriptsize
\begin{lstlisting}
data = [50, 90];
h = pie(data, [1,0], {'pyramidal (n=50)', 'interneuron (n=90)'})
hText = findobj(h,'Type','text') % text object handles
set(h(1), 'FaceColor', [.2,.2,.2]);
set(h(2), 'Rotation', 45);
set(h(3), 'FaceColor', [.8,.8,.8]);
set(h(4), 'Rotation', 45);
title('cell count')
set(gca,'XTick',1:2,'XTickLabel',{'pyramidal', 'interneuron'})
box('off')
set(gcf, 'PaperUnits', 'centimeters');
set(gcf, 'PaperSize', [11.7 9.0]);
set(gcf, 'PaperPosition',[0.0 0.0 11.7 9.0]);
\end{lstlisting}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{plotting interval/ratio/absolute data}
  \framesubtitle{histogram}
  \begin{center}
    \includegraphics[width=.8\linewidth]{figs/histogram}
  \end{center}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{plotting interval/ratio/absolute data}
  \framesubtitle{bad choice of bins}
  \begin{center}
    \includegraphics[width=.4\linewidth]{figs/histogrambad}
    \includegraphics[width=.4\linewidth]{figs/histogrambad2}
  \end{center}
  \begin{summary}{Rule of thumb}
    Choose the number of bins $b \approx n/20$.
  \end{summary}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{plotting interval/ratio/absolute data}
  \framesubtitle{how to do it in Matlab}
  \scriptsize
\begin{lstlisting}
x = randn(2000,1); % generate Gaussian data
hist(x, 50); % generate histogram
% set facecolor to gray
h = findobj(gca, 'Type','patch');
set(h(1), 'FaceColor',[.2,.2,.2], 'EdgeColor','w', 'linewidth',2)
% plot a white grid over it
h = gridxy([],get(gca,'ytick'),'color','w','linewidth',2)
uistack(h, 'top')
% cosmetics
box('off');
xlabel('Data')
ylabel('Count')
\end{lstlisting}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{plotting interval/ratio/absolute data}
  \framesubtitle{bar plot}
  There are several ways to plot a sample $x_1, \dots, x_n$ of
  interval/ratio/absolute scale with a bar plot.
  \begin{center}
    \includegraphics[width=.6\linewidth]{figs/barplots.png}
  \end{center}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{plotting interval/ratio/absolute data}
  \framesubtitle{bar plot}
  \scriptsize
\begin{lstlisting}
% bar plot
x = rand(10,1);
gray = [.5,.5,.5];
bar(1, mean(x), 'EdgeColor','w','FaceColor', gray);
hold on
bar(2, mean(x), 'EdgeColor','w','FaceColor', gray);
plot(0*x + 2, x, 'ok');
bar(3, mean(x), 'EdgeColor','w','FaceColor', gray);
errorbar(3, mean(x), std(x), 'ok');
bar(4, mean(x), 'EdgeColor','w','FaceColor', gray);
errorbar(4, mean(x), std(x)/sqrt(length(x)), 'ok');
set(gca, 'xtick',[])
ylabel('uniformly distributed random data in [0,1]')
box('off')
title('different forms of bar plots')
hold off
\end{lstlisting}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{plotting interval/ratio/absolute data}
  \framesubtitle{bar plot and measure of central tendency and spread}
  \begin{itemize}
    \item A bar plot collapses real data onto a single
number and some measure of spread. This number is usually a
      \emph{measure of central tendency}, i.e.\ a typical/central value
      for the probability distribution of the data.\pause
    \item What measures of central tendency can you think of?\pause
      \begin{itemize}
        \item mean
        \item median
        \item geometric mean (the $n$th root of the product of the data values)
        \item weighted mean
        \item midrange (mean of the maximum and minimum values of a data set)
      \end{itemize}\pause
    \item Additionally, the bar plot is equipped with a measure of
      \emph{spread} or \emph{dispersion}. What measure of spread can you
      think of?\pause
      \begin{itemize}
        \item standard deviation
        \item range (maximum minus minimum of a dataset)
        \item inter-quartile range
      \end{itemize}
  \end{itemize}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{plotting interval/ratio/absolute data}
  \framesubtitle{measure of central tendency and spread}
  \Large
  \begin{center}
    \bfseries The part of statistics that summarizes data in a small
    number of values is called \emph{descriptive statistics}.
  \end{center}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{robust statistics}
  \begin{task}{When is a statistic called robust (leave-one-out)?}
    \begin{itemize}
      \item Generate an array with $20$ random numbers using \texttt{randn}.
      \item Compute $20$ means: the $i^{\text{th}}$ mean is computed from
        the data set \emph{without} the $i^{\text{th}}$ example.
      \item Repeat this with the median.
      \item Make a bar plot that depicts the means of the computed
        means and medians along with an appropriate measure of dispersion.
      \item What can you observe? Do you understand why?
    \end{itemize}
  \end{task}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{plotting interval/ratio/absolute data}
  \framesubtitle{boxplot}
  \begin{minipage}{1.0\linewidth}
    \begin{minipage}{0.5\linewidth}
      \begin{center}
        \includegraphics[width=\linewidth]{figs/boxplot.png}
      \end{center}
    \end{minipage}
    \begin{minipage}{0.5\linewidth}
      Who knows what the elements mean?\pause
      \begin{itemize}
        \item the box depicts the inter-quartile range
        \item the line denotes the median
        \item the whiskers denote the extreme values of the data that
          are not considered outliers
        \item outliers are plotted separately
      \end{itemize}
      \begin{task}{Outliers}
        \begin{itemize}
          \item Find out how an outlier is defined in a Matlab boxplot.
          \item Can you remove an outlier from the dataset?
        \end{itemize}
      \end{task}
    \end{minipage}
  \end{minipage}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{plotting interval/ratio/absolute data}
  \framesubtitle{violinplot}
  \begin{center}
    \includegraphics[width=.8\linewidth]{figs/violinplots.png}
  \end{center}
  \begin{itemize}
    \item Violinplots depict the distribution of the data by a smoothed histogram.
    \item Additional information (data points, median, inter-quartile
      range) is plotted inside.
  \end{itemize}
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{plotting combinations of scales}
  What could we use for a combination of categorical/nominal and
  interval/ratio/absolute?
  \pause
  \begin{center}
    \includegraphics[width=.5\linewidth]{figs/factorplot.png}
  \end{center}
  Each category is a single bar.
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \frametitle{plotting combinations of scales}
  What could we use for a combination of interval/ratio/absolute and
  interval/ratio/absolute, e.g.\ $(x_1, y_1), \dots, (x_n,y_n)$?
  \pause
  \begin{center}
    \includegraphics[width=.8\linewidth]{figs/paireddata.png}
  \end{center}
  Scatter plot or paired bar chart. Scatter plot can also be used for
  ordinal vs.\ ordinal data (why not the bar chart?).
\end{frame}

%-------------------------------------------------------------
\begin{frame}[fragile]
  \begin{center}
    \Huge That's it.
  \end{center}
\end{frame}

\end{document}