[bootstrap] expanding exercise with a permutation test for the difference of means

Jan Benda 2020-12-13 13:43:27 +01:00
parent 68784b0e23
commit c9dd1ffbe6
8 changed files with 152 additions and 31 deletions

View File

@ -10,7 +10,7 @@ function [bootsem, mu] = bootstrapmean(x, resample)
for i = 1:resample
% resample:
xr = x(randi(nsamples, nsamples, 1));
% compute statistics on sample:
% compute statistics of resampled sample:
mu(i) = mean(xr);
end
bootsem = std(mu);

View File

@ -1,17 +1,17 @@
%% (a) bootstrap:
nperm = 1000;
rb = zeros(nperm,1);
rb = zeros(nperm, 1);
for i=1:nperm
% indices for resampling the data:
inx = randi(length(x), length(x), 1);
% resampled data pairs:
xb=x(inx);
yb=y(inx);
xb = x(inx);
yb = y(inx);
rb(i) = corr(xb, yb);
end
%% (b) pdf of the correlation coefficients:
[hb,bb] = hist(rb, 20);
[hb, bb] = hist(rb, 20);
hb = hb/sum(hb)/(bb(2)-bb(1)); % normalization
%% (c) significance:
@ -25,8 +25,8 @@ end
%% plot:
hold on;
bar(b, h, 'facecolor', [0.5 0.5 0.5]);
bar(bb, hb, 'facecolor', 'b');
bar(b, h, 'facecolor', [0.5 0.5 0.5]); % permutation test
bar(bb, hb, 'facecolor', 'b'); % bootstrap
bar(bb(bb<=rbq), hb(bb<=rbq), 'facecolor', 'r');
plot([rd rd], [0 4], 'r', 'linewidth', 2);
xlim([-0.25 0.75])

View File

@ -1,4 +1,4 @@
%% (a) generate correlated data
%% (a) generate correlated data:
n = 1000;
a = 0.2;
x = randn(n, 1);
@ -20,19 +20,19 @@ hold off
%c = corrcoef(x, y); % returns correlation matrix
%rd = c(1, 2);
rd = corr(x, y);
fprintf('correlation coefficient = %.2f\n', rd );
fprintf('correlation coefficient = %.2f\n', rd);
%% (e) permutation:
nperm = 1000;
rs = zeros(nperm,1);
for i=1:nperm
xr=x(randperm(length(x))); % shuffle x
yr=y(randperm(length(y))); % shuffle y
rs = zeros(nperm, 1);
for i = 1:nperm
xr = x(randperm(length(x))); % shuffle x
yr = y(randperm(length(y))); % shuffle y
rs(i) = corr(xr, yr);
end
%% (g) pdf of the correlation coefficients:
[h,b] = hist(rs, 20);
[h, b] = hist(rs, 20);
h = h/sum(h)/(b(2)-b(1)); % normalization
%% (h) significance:
@ -49,7 +49,7 @@ subplot(1, 2, 2)
hold on;
bar(b, h, 'facecolor', 'b');
bar(b(b>=rq), h(b>=rq), 'facecolor', 'r');
plot( [rd rd], [0 4], 'r', 'linewidth', 2);
plot([rd rd], [0 4], 'r', 'linewidth', 2);
xlim([-0.25 0.25])
xlabel('Correlation coefficient');
ylabel('Probability density of H0');

View File

@ -15,7 +15,7 @@
\else
\newcommand{\stitle}{}
\fi
\header{{\bfseries\large Exercise 9\stitle}}{{\bfseries\large Bootstrap}}{{\bfseries\large December 9th, 2019}}
\header{{\bfseries\large Exercise 8\stitle}}{{\bfseries\large Resampling}}{{\bfseries\large December 14th, 2020}}
\firstpagefooter{Prof. Dr. Jan Benda}{Phone: 29 74573}{Email:
jan.benda@uni-tuebingen.de}
\runningfooter{}{\thepage}{}
@ -86,6 +86,9 @@ jan.benda@uni-tuebingen.de}
\begin{questions}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\question \qt{Read chapter 7 of the script on ``resampling methods''!}\vspace{-3ex}
\question \qt{Bootstrap the standard error of the mean}
We want to compute the standard error of the mean of a data set by
means of the bootstrap method and compare the result with the formula
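For orientation, a minimal usage sketch of the bootstrapmean() function from the first file in this commit; the data vector and the number of resamples are placeholders, and the result is compared with the textbook formula std(x)/sqrt(n):
x = randn(100, 1);                       % hypothetical data set
[bootsem, mu] = bootstrapmean(x, 500);   % bootstrap the standard error with 500 resamples
fprintf('bootstrapped s.e.m. = %.3f\n', bootsem);
fprintf('formula s.e.m.      = %.3f\n', std(x)/sqrt(length(x)));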
@ -149,10 +152,10 @@ normally distributed?
\continue
\question \qt{Permutation test} \label{permutationtest}
\question \qt{Permutation test of correlations} \label{correlationtest}
We want to compute the significance of a correlation by means of a permutation test.
\begin{parts}
\part \label{permutationtestdata} Generate 1000 correlated pairs
\part \label{correlationtestdata} Generate 1000 correlated pairs
$x$, $y$ of random numbers according to:
\begin{verbatim}
n = 1000
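For reference, a compact sketch of the shuffle-based permutation test this exercise works towards; it assumes correlated column vectors x and y as generated above and mirrors the solution script shown earlier in this commit:
nperm = 1000;
rs = zeros(nperm, 1);
for i = 1:nperm
    % shuffling x and y independently destroys any correlation between them:
    rs(i) = corr(x(randperm(length(x))), y(randperm(length(y))));
end
rq = quantile(rs, 0.95);   % 5% significance threshold from the null distribution
fprintf('r = %.2f, threshold = %.2f\n', corr(x, y), rq);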
@ -188,9 +191,13 @@ correlation coefficient of zero we can conclude that the correlation
coefficient of the data indeed quantifies correlated data.
We take the same data set that we have generated in exercise
\ref{permutationtest} (\ref{permutationtestdata}).
\ref{correlationtest} (\ref{correlationtestdata}).
\begin{parts}
\part Bootstrap 1000 times the correlation coefficient from the data.
\part Bootstrap the correlation coefficient 1000 times from the
data, i.e. generate bootstrap data by randomly resampling the
original data pairs with replacement. Use the \code{randi()}
function to generate random indices for selecting random samples
from the original data.
\part Compute and plot the probability density of these correlation
coefficients.
\part Is the correlation of the original data set significant?
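A minimal sketch of the randi()-based pair resampling asked for in the first part of this exercise (x and y are the data vectors from the previous exercise; the variable names are illustrative):
nboot = 1000;
rb = zeros(nboot, 1);
for i = 1:nboot
    inx = randi(length(x), length(x), 1);   % indices drawn with replacement
    rb(i) = corr(x(inx), y(inx));           % identical indices keep the (x,y) pairs intact
end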
@ -200,6 +207,43 @@ We take the same data set that we have generated in exercise
\includegraphics[width=1\textwidth]{correlationbootstrap}
\end{solution}
\continuepage
\question \qt{Permutation test of difference of means}
We want to use a permutation test to check whether two data sets
come from distributions that differ in their mean.
\begin{parts}
\part Generate two normally distributed data sets $x$ and $y$, each
containing $n=200$ samples. Let's assume the $x$ samples are
measurements of the membrane potential of a mammalian photoreceptor
in darkness with a mean of $-40$\,mV and a standard deviation of
1\,mV. The $y$ values are the membrane potentials measured under dim
illumination and come from a distribution with the same standard
deviation and a mean of $-40.5$\,mV. See section 5.2 ``Scaling and
shifting random numbers'' in the script.
\part Plot histograms of the $x$ and $y$ data in a single
plot. Choose appropriate bins.
\part Compute the means of $x$ and $y$ and their difference.
\part The null hypothesis is that the $x$ and $y$ data come from the
same distribution. How can you generate new samples $x_r$ and $y_r$
from the original data that come from the same distribution?
\part Do exactly this 1000 times and each time compute the
difference of the means of the two resampled samples.
\part Compute and plot the probability density of the resulting
differences, i.e. the distribution expected under the null hypothesis.
\part Is the difference of the means of the original data sets significant?
\part Repeat this procedure for $y$ samples whose mean is closer to
or further away from the mean of the $x$ data set. For this, put the
computations of the permutation test in a function and all the plotting
in another function.
\end{parts}
\begin{solution}
\lstinputlisting{meandiffpermutation.m}
%\lstinputlisting{meandiffplots.m}
\lstinputlisting{meandiffsignificance.m}
\includegraphics[width=1\textwidth]{meandiffsignificance}
\end{solution}
\end{questions}
\end{document}

View File

@ -0,0 +1,29 @@
function [md, ds, dq] = meandiffpermutation(x, y, nperm, alpha)
% Permutation test for difference of means of two independent samples.
%
% [md, ds, dq] = meandiffpermutation(x, y, nperm, alpha);
%
% Arguments:
% x: column vector with the samples of the x data set.
% y: column vector with the samples of the y data set.
% nperm: number of permutations run.
% alpha: significance level.
%
% Returns:
% md: difference of the means of x and y.
% ds: vector containing the differences of the means of the resampled data sets.
% dq: the (1 - alpha) quantile of ds, i.e. the critical difference of the means at significance level alpha.
md = mean(x) - mean(y); % measured difference
xy = [x; y]; % merge data sets
% permutations:
ds = zeros(nperm, 1);
for i = 1:nperm
xyr = xy(randperm(length(xy))); % shuffle xy
xr = xyr(1:length(x)); % random x sample
yr = xyr(length(x)+1:end); % random y sample
ds(i) = mean(xr) - mean(yr);
end
% significance:
dq = quantile(ds, 1.0 - alpha);
end
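A brief usage sketch of this function, addressing the last part of the exercise, where the test is repeated for y samples whose mean lies closer to or further from the x mean (the chosen means are hypothetical examples):
x = randn(200, 1) - 40.0;            % membrane potentials in darkness
for my = [-40.2 -40.5 -41.0]         % hypothetical means under dim illumination
    y = randn(200, 1) + my;
    [md, ds, dq] = meandiffpermutation(x, y, 1000, 0.05);
    fprintf('mean(y) = %.1f mV: md = %.2f mV, dq = %.2f mV, significant = %d\n', ...
            my, md, dq, md >= dq);
end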

View File

@ -0,0 +1,48 @@
%% (a) generate data:
n = 200;
mx = -40.0;
my = -40.5;
x = randn(n, 1) + mx;
y = randn(n, 1) + my;
%% (b) plot histograms:
subplot(1, 2, 1);
bmin = min([x; y]);
bmax = max([x; y]);
bins = bmin:(bmax-bmin)/20.0:bmax;
hist(x, bins, 'facecolor', 'b');
hold on
hist(y, bins, 'facecolor', 'r');
xlabel('x and y')
ylabel('counts')
hold off
% permutation test:
nperm = 1000;    % number of permutations
alpha = 0.05;    % significance level
[md, ds, dq] = meandiffpermutation(x, y, nperm, alpha);
%% (c) difference of means:
fprintf('difference of means = %.2fmV\n', md);
%% (f) pdf of the differences:
[h, b] = hist(ds, 20);
h = h/sum(h)/(b(2)-b(1)); % normalization
%% (g) significance:
fprintf('difference of means at 5%% significance = %.2fmV\n', dq);
if md >= dq
fprintf('--> difference of means %.2fmV is significant\n', md);
else
fprintf('--> %.2fmV is not a significant difference of means\n', md);
end
%% plot:
subplot(1, 2, 2)
bar(b, h, 'facecolor', 'b');
hold on;
bar(b(b>=dq), h(b>=dq), 'facecolor', 'r');
plot([md md], [0 4], 'r', 'linewidth', 2);
xlabel('Difference of means');
ylabel('Probability density of H0');
hold off;
savefigpdf(gcf, 'meandiffsignificance.pdf', 12, 6);

View File

@ -1,10 +1,10 @@
%% (a) generate random numbers:
n = 100000;
x=randn(n, 1);
x = randn(n, 1);
for nsamples=[3 5 10 50]
for nsamples = [3 5 10 50]
nsamples
%% compute mean, standard deviation and t:
% compute mean, standard deviation and t:
nmeans = 10000;
means = zeros(nmeans, 1);
sdevs = zeros(nmeans, 1);
@ -19,14 +19,14 @@ for nsamples=[3 5 10 50]
% Gaussian pdfs:
msdev = std(means);
tsdev = 1.0;
dxg=0.01;
dxg = 0.01;
xmax = 10.0;
xmin = -xmax;
xg = [xmin:dxg:xmax];
pm = exp(-0.5*(xg/msdev).^2)/sqrt(2.0*pi)/msdev;
pt = exp(-0.5*(xg/tsdev).^2)/sqrt(2.0*pi)/tsdev;
%% plots
% plots:
subplot(1, 2, 1)
bins = xmin:0.2:xmax;
[h,b] = hist(means, bins);
@ -35,7 +35,7 @@ for nsamples=[3 5 10 50]
hold on
plot(xg, pm, 'r', 'linewidth', 2)
title(sprintf('sample size = %d', nsamples));
xlim( [-3, 3] );
xlim([-3, 3]);
xlabel('Mean');
ylabel('pdf');
hold off;
@ -48,11 +48,11 @@ for nsamples=[3 5 10 50]
hold on
plot(xg, pt, 'r', 'linewidth', 2)
title(sprintf('sample size = %d', nsamples));
xlim( [-8, 8] );
xlim([-8, 8]);
xlabel('Student-t');
ylabel('pdf');
hold off;
savefigpdf(gcf, sprintf('tdistribution-n%02d.pdf', nsamples), 14, 5);
pause( 3.0 )
pause(3.0)
end