[bootstrap] expanding exercise with a permutation test for the difference of means

Jan Benda 2020-12-13 13:43:27 +01:00
parent 68784b0e23
commit c9dd1ffbe6
8 changed files with 152 additions and 31 deletions

View File

@ -10,7 +10,7 @@ function [bootsem, mu] = bootstrapmean(x, resample)
for i = 1:resample
% resample:
xr = x(randi(nsamples, nsamples, 1));
% compute statistics on sample:
% compute statistics of resampled sample:
mu(i) = mean(xr);
end
bootsem = std(mu);

View File

@ -1,17 +1,17 @@
%% (a) bootstrap:
nperm = 1000;
rb = zeros(nperm,1);
rb = zeros(nperm, 1);
for i=1:nperm
% indices for resampling the data:
inx = randi(length(x), length(x), 1);
% resampled data pairs:
xb=x(inx);
yb=y(inx);
xb = x(inx);
yb = y(inx);
rb(i) = corr(xb, yb);
end
%% (b) pdf of the correlation coefficients:
[hb,bb] = hist(rb, 20);
[hb, bb] = hist(rb, 20);
hb = hb/sum(hb)/(bb(2)-bb(1)); % normalization
%% (c) significance:
@ -25,8 +25,8 @@ end
%% plot:
hold on;
bar(b, h, 'facecolor', [0.5 0.5 0.5]);
bar(bb, hb, 'facecolor', 'b');
bar(b, h, 'facecolor', [0.5 0.5 0.5]); % permutation test
bar(bb, hb, 'facecolor', 'b'); % bootstrap
bar(bb(bb<=rbq), hb(bb<=rbq), 'facecolor', 'r');
plot([rd rd], [0 4], 'r', 'linewidth', 2);
xlim([-0.25 0.75])

View File

@ -1,4 +1,4 @@
%% (a) generate correlated data
%% (a) generate correlated data:
n = 1000;
a = 0.2;
x = randn(n, 1);
@ -20,19 +20,19 @@ hold off
%c = corrcoef(x, y); % returns correlation matrix
%rd = c(1, 2);
rd = corr(x, y);
fprintf('correlation coefficient = %.2f\n', rd );
fprintf('correlation coefficient = %.2f\n', rd);
%% (e) permutation:
nperm = 1000;
rs = zeros(nperm,1);
for i=1:nperm
xr=x(randperm(length(x))); % shuffle x
yr=y(randperm(length(y))); % shuffle y
rs = zeros(nperm, 1);
for i = 1:nperm
xr = x(randperm(length(x))); % shuffle x
yr = y(randperm(length(y))); % shuffle y
rs(i) = corr(xr, yr);
end
%% (g) pdf of the correlation coefficients:
[h,b] = hist(rs, 20);
[h, b] = hist(rs, 20);
h = h/sum(h)/(b(2)-b(1)); % normalization
%% (h) significance:
@ -49,7 +49,7 @@ subplot(1, 2, 2)
hold on;
bar(b, h, 'facecolor', 'b');
bar(b(b>=rq), h(b>=rq), 'facecolor', 'r');
plot( [rd rd], [0 4], 'r', 'linewidth', 2);
plot([rd rd], [0 4], 'r', 'linewidth', 2);
xlim([-0.25 0.25])
xlabel('Correlation coefficient');
ylabel('Probability density of H0');

View File

@ -15,7 +15,7 @@
\else
\newcommand{\stitle}{}
\fi
\header{{\bfseries\large Exercise 9\stitle}}{{\bfseries\large Bootstrap}}{{\bfseries\large December 9th, 2019}}
\header{{\bfseries\large Exercise 8\stitle}}{{\bfseries\large Resampling}}{{\bfseries\large December 14th, 2020}}
\firstpagefooter{Prof. Dr. Jan Benda}{Phone: 29 74573}{Email:
jan.benda@uni-tuebingen.de}
\runningfooter{}{\thepage}{}
@ -86,6 +86,9 @@ jan.benda@uni-tuebingen.de}
\begin{questions}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\question \qt{Read chapter 7 of the script on ``resampling methods''!}\vspace{-3ex}
\question \qt{Bootstrap the standard error of the mean}
We want to compute the standard error of the mean of a data set by
means of the bootstrap method and compare the result with the formula
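For orientation, a minimal usage sketch of the bootstrapmean() function from the first file in this commit; the data vector and the number of resamples are placeholders, and the result is compared with the textbook formula std(x)/sqrt(n):
x = randn(100, 1);                       % hypothetical data set
[bootsem, mu] = bootstrapmean(x, 500);   % bootstrap the standard error with 500 resamples
fprintf('bootstrapped s.e.m. = %.3f\n', bootsem);
fprintf('formula s.e.m.      = %.3f\n', std(x)/sqrt(length(x)));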
@ -149,10 +152,10 @@ normally distributed?
\continue
\question \qt{Permutation test} \label{permutationtest}
\question \qt{Permutation test of correlations} \label{correlationtest}
We want to compute the significance of a correlation by means of a permutation test.
\begin{parts}
\part \label{permutationtestdata} Generate 1000 correlated pairs
\part \label{correlationtestdata} Generate 1000 correlated pairs
$x$, $y$ of random numbers according to:
\begin{verbatim}
n = 1000
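For reference, a compact sketch of the shuffle-based permutation test this exercise works towards; it assumes correlated column vectors x and y as generated above and mirrors the solution script shown earlier in this commit:
nperm = 1000;
rs = zeros(nperm, 1);
for i = 1:nperm
    % shuffling x and y independently destroys any correlation between them:
    rs(i) = corr(x(randperm(length(x))), y(randperm(length(y))));
end
rq = quantile(rs, 0.95);   % 5% significance threshold from the null distribution
fprintf('r = %.2f, threshold = %.2f\n', corr(x, y), rq);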
@ -188,9 +191,13 @@ correlation coefficient of zero we can conclude that the correlation
coefficient of the data indeed quantifies correlated data.
We take the same data set that we have generated in exercise
\ref{permutationtest} (\ref{permutationtestdata}).
\ref{correlationtest} (\ref{correlationtestdata}).
\begin{parts}
\part Bootstrap 1000 times the correlation coefficient from the data.
\part Bootstrap the correlation coefficient 1000 times from the
data, i.e. generate bootstrap data by randomly resampling the
original data pairs with replacement. Use the \code{randi()}
function to generate random indices for selecting random samples
from the original data.
\part Compute and plot the probability density of these correlation
coefficients.
\part Is the correlation of the original data set significant?
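A minimal sketch of the randi()-based pair resampling asked for in the first part of this exercise (x and y are the data vectors from the previous exercise; the variable names are illustrative):
nboot = 1000;
rb = zeros(nboot, 1);
for i = 1:nboot
    inx = randi(length(x), length(x), 1);   % indices drawn with replacement
    rb(i) = corr(x(inx), y(inx));           % identical indices keep the (x,y) pairs intact
end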
@ -200,6 +207,43 @@ We take the same data set that we have generated in exercise
\includegraphics[width=1\textwidth]{correlationbootstrap}
\end{solution}
\continuepage
\question \qt{Permutation test of difference of means}
We want to use a permutation test to check whether two data sets
come from distributions that differ in their mean.
\begin{parts}
\part Generate two normally distributed data sets $x$ and $y$, each
containing $n=200$ samples. Let's assume the $x$ samples are
measurements of the membrane potential of a mammalian photoreceptor
in darkness with a mean of $-40$\,mV and a standard deviation of
1\,mV. The $y$ values are the membrane potentials measured under dim
illumination and come from a distribution with the same standard
deviation and a mean of $-40.5$\,mV. See section 5.2 ``Scaling and
shifting random numbers'' in the script.
\part Plot histograms of the $x$ and $y$ data in a single
plot. Choose appropriate bins.
\part Compute the means of $x$ and $y$ and their difference.
\part The null hypothesis is that the $x$ and $y$ data come from the
same distribution. How can you generate new samples $x_r$ and $y_r$
from the original data that come from the same distribution?
\part Do exactly this 1000 times and each time compute the
difference of the means of the two resampled samples.
\part Compute and plot the probability density of the resulting
differences, i.e. the distribution expected under the null hypothesis.
\part Is the difference of the means of the original data sets significant?
\part Repeat this procedure for $y$ samples whose mean is closer to
or further away from the mean of the $x$ data set. For this, put the
computations of the permutation test in a function and all the plotting
in another function.
\end{parts}
\begin{solution}
\lstinputlisting{meandiffpermutation.m}
%\lstinputlisting{meandiffplots.m}
\lstinputlisting{meandiffsignificance.m}
\includegraphics[width=1\textwidth]{meandiffsignificance}
\end{solution}
\end{questions}
\end{document}

View File

@ -0,0 +1,29 @@
function [md, ds, dq] = meandiffpermutation(x, y, nperm, alpha)
% Permutation test for difference of means of two independent samples.
%
% [md, ds, dq] = meandiffpermutation(x, y, nperm, alpha);
%
% Arguments:
% x: column vector with the samples of the x data set.
% y: column vector with the samples of the y data set.
% nperm: number of permutations run.
% alpha: significance level.
%
% Returns:
% md: difference of the means of x and y.
% ds: vector containing the differences of the means of the resampled data sets.
% dq: the (1 - alpha) quantile of ds, i.e. the critical difference of the means at significance level alpha.
md = mean(x) - mean(y); % measured difference
xy = [x; y]; % merge data sets
% permutations:
ds = zeros(nperm, 1);
for i = 1:nperm
xyr = xy(randperm(length(xy))); % shuffle xy
xr = xyr(1:length(x)); % random x sample
yr = xyr(length(x)+1:end); % random y sample
ds(i) = mean(xr) - mean(yr);
end
% significance:
dq = quantile(ds, 1.0 - alpha);
end
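A brief usage sketch of this function, addressing the last part of the exercise, where the test is repeated for y samples whose mean lies closer to or further from the x mean (the chosen means are hypothetical examples):
x = randn(200, 1) - 40.0;            % membrane potentials in darkness
for my = [-40.2 -40.5 -41.0]         % hypothetical means under dim illumination
    y = randn(200, 1) + my;
    [md, ds, dq] = meandiffpermutation(x, y, 1000, 0.05);
    fprintf('mean(y) = %.1f mV: md = %.2f mV, dq = %.2f mV, significant = %d\n', ...
            my, md, dq, md >= dq);
end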

View File

@ -0,0 +1,48 @@
%% (a) generate data:
n = 200;
mx = -40.0;
my = -40.5;
x = randn(n, 1) + mx;
y = randn(n, 1) + my;
%% (b) plot histograms:
subplot(1, 2, 1);
bmin = min([x; y]);
bmax = max([x; y]);
bins = bmin:(bmax-bmin)/20.0:bmax;
hist(x, bins, 'facecolor', 'b');
hold on
hist(y, bins, 'facecolor', 'r');
xlabel('x and y')
ylabel('counts')
hold off
% permutation test:
nperm = 1000;    % number of permutations
alpha = 0.05;    % significance level
[md, ds, dq] = meandiffpermutation(x, y, nperm, alpha);
%% (c) difference of means:
fprintf('difference of means = %.2fmV\n', md);
%% (f) pdf of the differences:
[h, b] = hist(ds, 20);
h = h/sum(h)/(b(2)-b(1)); % normalization
%% (g) significance:
fprintf('difference of means at 5%% significance = %.2fmV\n', dq);
if md >= dq
fprintf('--> difference of means %.2fmV is significant\n', md);
else
fprintf('--> %.2fmV is not a significant difference of means\n', md);
end
%% plot:
subplot(1, 2, 2)
bar(b, h, 'facecolor', 'b');
hold on;
bar(b(b>=dq), h(b>=dq), 'facecolor', 'r');
plot([md md], [0 4], 'r', 'linewidth', 2);
xlabel('Difference of means');
ylabel('Probability density of H0');
hold off;
savefigpdf(gcf, 'meandiffsignificance.pdf', 12, 6);

View File

@ -1,10 +1,10 @@
%% (a) generate random numbers:
n = 100000;
x=randn(n, 1);
x = randn(n, 1);
for nsamples=[3 5 10 50]
for nsamples = [3 5 10 50]
nsamples
%% compute mean, standard deviation and t:
% compute mean, standard deviation and t:
nmeans = 10000;
means = zeros(nmeans, 1);
sdevs = zeros(nmeans, 1);
@ -19,14 +19,14 @@ for nsamples=[3 5 10 50]
% Gaussian pdfs:
msdev = std(means);
tsdev = 1.0;
dxg=0.01;
dxg = 0.01;
xmax = 10.0;
xmin = -xmax;
xg = [xmin:dxg:xmax];
pm = exp(-0.5*(xg/msdev).^2)/sqrt(2.0*pi)/msdev;
pt = exp(-0.5*(xg/tsdev).^2)/sqrt(2.0*pi)/tsdev;
%% plots
% plots:
subplot(1, 2, 1)
bins = xmin:0.2:xmax;
[h,b] = hist(means, bins);
@ -35,7 +35,7 @@ for nsamples=[3 5 10 50]
hold on
plot(xg, pm, 'r', 'linewidth', 2)
title(sprintf('sample size = %d', nsamples));
xlim( [-3, 3] );
xlim([-3, 3]);
xlabel('Mean');
ylabel('pdf');
hold off;
@ -48,11 +48,11 @@ for nsamples=[3 5 10 50]
hold on
plot(xg, pt, 'r', 'linewidth', 2)
title(sprintf('sample size = %d', nsamples));
xlim( [-8, 8] );
xlim([-8, 8]);
xlabel('Student-t');
ylabel('pdf');
hold off;
savefigpdf(gcf, sprintf('tdistribution-n%02d.pdf', nsamples), 14, 5);
pause( 3.0 )
pause(3.0)
end