[bootstrap] expanding exercise by difference of mean permutation test

2020-12-13 13:43:27 +01:00 · 2020-12-13 13:43:27 +01:00 · c9dd1ffbe6
commit c9dd1ffbe6
parent 68784b0e23
8 changed files with 152 additions and 31 deletions
--- a/bootstrap/exercises/bootstrapmean.m
+++ b/bootstrap/exercises/bootstrapmean.m
@ -10,7 +10,7 @@ function [bootsem, mu] = bootstrapmean(x, resample)
    for i = 1:resample
        % resample:
        xr = x(randi(nsamples, nsamples, 1));
-        % compute statistics on sample:
+        % compute statistics of resampled sample:
        mu(i) = mean(xr);
    end
    bootsem = std(mu);
--- a/bootstrap/exercises/correlationbootstrap.m
+++ b/bootstrap/exercises/correlationbootstrap.m
@ -1,18 +1,18 @@
 %% (a) bootstrap:
 nperm = 1000;
-rb = zeros(nperm,1);
+rb = zeros(nperm, 1);
 for i=1:nperm
    % indices for resampling the data:
    inx = randi(length(x), length(x), 1);
    % resampled data pairs:
-    xb=x(inx);
+    xb = x(inx);
-    yb=y(inx);
+    yb = y(inx);
    rb(i) = corr(xb, yb);
 end
 %% (b) pdf of the correlation coefficients:
-[hb,bb] = hist(rb, 20);
+[hb, bb] = hist(rb, 20);
-hb = hb/sum(hb)/(bb(2)-bb(1));  % normalization
+hb = hb/sum(hb)/(bb(2)-bb(1));      % normalization
 %% (c) significance:
 rbq = quantile(rb, 0.05);
@ -25,8 +25,8 @@ end
 %% plot:
 hold on;
-bar(b, h, 'facecolor', [0.5 0.5 0.5]);
+bar(b, h, 'facecolor', [0.5 0.5 0.5]);           % permuation test
-bar(bb, hb, 'facecolor', 'b');
+bar(bb, hb, 'facecolor', 'b');                   % bootstrap
 bar(bb(bb<=rbq), hb(bb<=rbq), 'facecolor', 'r');
 plot([rd rd], [0 4], 'r', 'linewidth', 2);
 xlim([-0.25 0.75])
--- a/bootstrap/exercises/correlationsignificance.m
+++ b/bootstrap/exercises/correlationsignificance.m
@ -1,4 +1,4 @@
-%% (a) generate correlated data
+%% (a) generate correlated data:
 n = 1000;
 a = 0.2;
 x = randn(n, 1);
@ -8,7 +8,7 @@ y = randn(n, 1) + a*x;
 subplot(1, 2, 1);
 plot(x, a*x, 'r', 'linewidth', 3);
 hold on
-%scatter(x, y );   % either scatter ...
+%scatter(x, y );                    % either scatter ...
 plot(x, y, 'o', 'markersize', 2 );  % ... or plot - same plot.
 xlim([-4 4])
 ylim([-4 4])
@ -20,20 +20,20 @@ hold off
 %c = corrcoef(x, y);  % returns correlation matrix
 %rd = c(1, 2);
 rd = corr(x, y);
-fprintf('correlation coefficient = %.2f\n', rd );
+fprintf('correlation coefficient = %.2f\n', rd);
 %% (e) permutation:
 nperm = 1000;
-rs = zeros(nperm,1);
+rs = zeros(nperm, 1);
-for i=1:nperm
+for i = 1:nperm
-    xr=x(randperm(length(x)));  % shuffle x
+    xr = x(randperm(length(x)));      % shuffle x
-    yr=y(randperm(length(y)));  % shuffle y
+    yr = y(randperm(length(y)));      % shuffle y
    rs(i) = corr(xr, yr);
 end
 %% (g) pdf of the correlation coefficients:
-[h,b] = hist(rs, 20);
+[h, b] = hist(rs, 20);
-h = h/sum(h)/(b(2)-b(1));  % normalization
+h = h/sum(h)/(b(2)-b(1));           % normalization
 %% (h) significance:
 rq = quantile(rs, 0.95);
@ -49,7 +49,7 @@ subplot(1, 2, 2)
 hold on;
 bar(b, h, 'facecolor', 'b');
 bar(b(b>=rq), h(b>=rq), 'facecolor', 'r');
-plot( [rd rd], [0 4], 'r', 'linewidth', 2);
+plot([rd rd], [0 4], 'r', 'linewidth', 2);
 xlim([-0.25 0.25])
 xlabel('Correlation coefficient');
 ylabel('Probability density of H0');
--- a/bootstrap/exercises/exercises01.tex
+++ b/bootstrap/exercises/exercises01.tex
@ -15,7 +15,7 @@
 \else
 \newcommand{\stitle}{}
 \fi
-\header{{\bfseries\large Exercise 9\stitle}}{{\bfseries\large Bootstrap}}{{\bfseries\large December 9th, 2019}}
+\header{{\bfseries\large Exercise 8\stitle}}{{\bfseries\large Resampling}}{{\bfseries\large December 14th, 2020}}
 \firstpagefooter{Prof. Dr. Jan Benda}{Phone: 29 74573}{Email:
 jan.benda@uni-tuebingen.de}
 \runningfooter{}{\thepage}{}
@ -86,6 +86,9 @@ jan.benda@uni-tuebingen.de}
 \begin{questions}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \question \qt{Read chapter 7 of the script on ``resampling methods''!}\vspace{-3ex}
 \question \qt{Bootstrap the standard error of the mean}
 We want to compute the standard error of the mean of a data set by
 means of the bootstrap method and compare the result with the formula
@ -149,10 +152,10 @@ normally distributed?
 \continue
-\question \qt{Permutation test} \label{permutationtest}
+\question \qt{Permutation test of correlations} \label{correlationtest}
 We want to compute the significance of a correlation by means of a permutation test.
 \begin{parts}
-  \part \label{permutationtestdata} Generate 1000 correlated pairs
+  \part \label{correlationtestdata} Generate 1000 correlated pairs
  $x$, $y$ of random numbers according to:
 \begin{verbatim}
 n = 1000
@ -188,9 +191,13 @@ correlation coefficient of zero we can conclude that the correlation
 coefficient of the data indeed quantifies correlated data.
 We take the same data set that we have generated in exercise
-\ref{permutationtest} (\ref{permutationtestdata}).
+\ref{correlationtest} (\ref{correlationtestdata}).
 \begin{parts}
-  \part Bootstrap 1000 times the correlation coefficient from the data.
+  \part Bootstrap 1000 times the correlation coefficient from the
  data, i.e.  generate bootstrap data by randomly resampling the
  original data pairs with replacement. Use the \code{randi()}
  function for generating random indices that you can use to select a
  random sample from the original data.
  \part Compute and plot the probability density of these correlation
  coefficients.
  \part Is the correlation of the original data set significant?
@ -200,6 +207,43 @@ We take the same data set that we have generated in exercise
  \includegraphics[width=1\textwidth]{correlationbootstrap}
 \end{solution}
 \continuepage 
 \question \qt{Permutation test of difference of means}
 We want to test whether two data sets come from distributions that
 differ in their mean by means of a permutation test.
 \begin{parts}
  \part Generate two normally distributed data sets $x$ and $y$
  containing each $n=200$ samples. Let's assume the $x$ samples are
  measurements of the membrane potential of a mammalian photoreceptor
  in darkness with a mean of $-40$\,mV and a standard deviation of
  1\,mV. The $y$ values are the membrane potentials measured under dim
  illumination and come from a distribution with the same standard
  deviation and a mean of $-40.5$\,mV. See section 5.2 ``Scaling and
  shifting random numbers'' in the script.
  \part Plot histograms of the $x$ and $y$ data in a single
  plot. Choose appropriate bins.
  \part Compute the means of $x$ and $y$ and their difference.
  \part The null hypothesis is that the $x$ and $y$ data come from the
  same distribution. How can you generate new samples $x_r$ and $y_r$
  from the original data that come from the same distribution?
  \part Do exactly this 1000 times and compute each time the
  difference of the means of the two resampled samples.
  \part Compute and plot the probability density of the resulting
  distribution of the null hypothesis.
  \part Is the difference of the means of the original data sets significant?
  \part Repeat this procedure for $y$ samples that are closer or
  further apart from the mean of the $x$ data set. For this put the
  computations of the permuation test in a function and all the plotting
  in another function.
 \end{parts}
 \begin{solution}
  \lstinputlisting{meandiffpermutation.m}
  %\lstinputlisting{meandiffplots.m}
  \lstinputlisting{meandiffsignificance.m}
  \includegraphics[width=1\textwidth]{meandiffsignificance}
 \end{solution}
 \end{questions}
 \end{document}
--- a/bootstrap/exercises/meandiffpermutation.m
+++ b/bootstrap/exercises/meandiffpermutation.m
@ -0,0 +1,29 @@
 function [md, ds, dq] = meandiffpermutation(x, y, nperm, alpha)
 % Permutation test for difference of means of two independent samples.
 %
 % [md, ds] = meandiffpermutation(x, y, nperm, alpha);
 %
 % Arguments:
 %   x: vector with the samples of the x data set.
 %   y: vector with the samples of the y data set.
 %   nperm: number of permutations run.
 %   alpha: significance level.
 %
 % Returns:
 %   md: difference of the means 
 %   ds: vector containing the differences of the means of the resampled data sets
 %   dq: difference of the means at a significance of alpha.
  md = mean(x) - mean(y);             % measured difference
  xy = [x; y];                        % merge data sets
  % permutations:
  ds = zeros(nperm, 1);
  for i = 1:nperm
      xyr = xy(randperm(length(xy))); % shuffle xy
      xr = xyr(1:length(x));          % random x sample
      yr = xyr(length(x)+1:end);      % random y sample
      ds(i) = mean(xr) - mean(yr);
  end
  % significance:
  dq = quantile(ds, 1.0 - alpha);
 end
--- a/bootstrap/exercises/meandiffsignificance.m
+++ b/bootstrap/exercises/meandiffsignificance.m
@ -0,0 +1,48 @@
 %% (a) generate data:
 n = 200;
 mx = -40.0;
 my = -40.5;
 x = randn(n, 1) + mx;
 y = randn(n, 1) + my;
 %% (b) plot histograms:
 subplot(1, 2, 1);
 bmin = min([x; y]);
 bmax = max([x; y]);
 bins = bmin:(bmax-bmin)/20.0:bmax;
 hist(x, bins, 'facecolor', 'b');
 hold on
 hist(y, bins, 'facecolor', 'r');
 xlabel('x and y')
 ylabel('counts')
 hold off
 % permutation test:
 [md, ds, dq] = meandiffpermutation(x, y, nperm, alpha);
 %% (c) difference of means:
 fprintf('difference of means = %.2fmV\n', md);
 %% (f) pdf of the differences:
 [h, b] = hist(ds, 20);
 h = h/sum(h)/(b(2)-b(1));           % normalization
 %% (g) significance:
 fprintf('difference of means at 5%% significance = %.2fmV\n', dq);
 if md >= dq
    fprintf('--> difference of means %.2fmV is significant\n', md);
 else
    fprintf('--> %.2fmV is not a significant difference of means\n', md);
 end
 %% plot:
 subplot(1, 2, 2)
 bar(b, h, 'facecolor', 'b');
 hold on;
 bar(b(b>=dq), h(b>=dq), 'facecolor', 'r');
 plot([md md], [0 4], 'r', 'linewidth', 2);
 xlabel('Difference of means');
 ylabel('Probability density of H0');
 hold off;
 savefigpdf(gcf, 'meandiffsignificance.pdf', 12, 6);
--- a/bootstrap/exercises/tdistribution.m
+++ b/bootstrap/exercises/tdistribution.m
@ -1,10 +1,10 @@
 %% (a) generate random numbers:
 n = 100000;
-x=randn(n, 1);
+x = randn(n, 1);
-for nsamples=[3 5 10 50]
+for nsamples = [3 5 10 50]
    nsamples
-    %% compute mean, standard deviation and t:
+    % compute mean, standard deviation and t:
    nmeans = 10000;
    means = zeros(nmeans, 1);
    sdevs = zeros(nmeans, 1);
@ -19,14 +19,14 @@ for nsamples=[3 5 10 50]
    % Gaussian pdfs:
    msdev = std(means);
    tsdev = 1.0;
-    dxg=0.01;
+    dxg = 0.01;
    xmax = 10.0;
    xmin = -xmax;
    xg = [xmin:dxg:xmax];
    pm = exp(-0.5*(xg/msdev).^2)/sqrt(2.0*pi)/msdev;
    pt = exp(-0.5*(xg/tsdev).^2)/sqrt(2.0*pi)/tsdev;
-    %% plots
+    % plots:
    subplot(1, 2, 1)
    bins = xmin:0.2:xmax;
    [h,b] = hist(means, bins);
@ -35,7 +35,7 @@ for nsamples=[3 5 10 50]
    hold on
    plot(xg, pm, 'r', 'linewidth', 2)
    title(sprintf('sample size = %d', nsamples));
-    xlim( [-3, 3] );
+    xlim([-3, 3]);
    xlabel('Mean');
    ylabel('pdf');
    hold off;
@ -48,11 +48,11 @@ for nsamples=[3 5 10 50]
    hold on
    plot(xg, pt, 'r', 'linewidth', 2)
    title(sprintf('sample size = %d', nsamples));
-    xlim( [-8, 8] );
+    xlim([-8, 8]);
    xlabel('Student-t');
    ylabel('pdf');
    hold off;
    savefigpdf(gcf, sprintf('tdistribution-n%02d.pdf', nsamples), 14, 5);
-    pause( 3.0 )
+    pause(3.0)
 end
--- a/regression/exercises/fitting.tex
+++ b/regression/exercises/fitting.tex