[bootstrap] updated text and exercises

This commit is contained in:
2019-12-03 08:57:40 +01:00
parent bfad4ac339
commit 006fa998cc
11 changed files with 72 additions and 69 deletions

View File

@@ -1,11 +1,11 @@
function [bootsem, mu] = bootstrapmean( x, resample )
function [bootsem, mu] = bootstrapmean(x, resample)
% computes the standard error of the mean by bootstrapping the data
% x: vector with data
% resample: number of resamplings
% returns:
% bootsem: the standard error of the mean
% mu: the bootstrapped means as a vector
mu = zeros( resample, 1 );
mu = zeros(resample, 1);
nsamples = length(x);
for i = 1:resample
% resample:
@@ -13,5 +13,5 @@ function [bootsem, mu] = bootstrapmean( x, resample )
% compute statistics on sample:
mu(i) = mean(xr);
end
bootsem = std( mu );
bootsem = std(mu);
end

View File

@@ -1,36 +1,36 @@
%% (b) load the data:
load( 'thymusglandweights.dat' );
load('thymusglandweights.dat');
nsamples = 80;
x = thymusglandweights(1:nsamples);
%% (c) mean, sem and hist:
sem = std(x)/sqrt(nsamples);
fprintf( 'Mean of the data set = %.2fmg\n', mean(x) );
fprintf( 'SEM of the data set = %.2fmg\n', sem );
fprintf('Mean of the data set = %.2fmg\n', mean(x));
fprintf('SEM of the data set = %.2fmg\n', sem);
hist(x,20)
xlabel('x')
ylabel('count')
savefigpdf( gcf, 'bootstraptymus-datahist.pdf', 6, 5 );
pause( 2.0 )
savefigpdf(gcf, 'bootstraptymus-datahist.pdf', 6, 5);
pause(2.0)
%% (d) bootstrap the mean:
resample = 500;
[bootsem, mu] = bootstrapmean( x, resample );
hist( mu, 20 );
[bootsem, mu] = bootstrapmean(x, resample);
hist(mu, 20);
xlabel('mean(x)')
ylabel('count')
savefigpdf( gcf, 'bootstraptymus-meanhist.pdf', 6, 5 );
fprintf( ' bootstrap standard error: %.3f\n', bootsem );
fprintf( 'theoretical standard error: %.3f\n', sem );
savefigpdf(gcf, 'bootstraptymus-meanhist.pdf', 6, 5);
fprintf(' bootstrap standard error: %.3f\n', bootsem);
fprintf('theoretical standard error: %.3f\n', sem);
%% (e) confidence interval:
q = quantile(mu, [0.025, 0.975]);
fprintf( '95%% confidence interval of the mean from %.2fmg to %.2fmg\n', q(1), q(2) );
pause( 2.0 )
fprintf('95%% confidence interval of the mean from %.2fmg to %.2fmg\n', q(1), q(2));
pause(2.0)
%% (f): dependence on sample size:
nsamplesrange = 10:10:1000;
bootsems = zeros( length(nsamplesrange),1);
bootsems = zeros(length(nsamplesrange), 1);
for n=1:length(nsamplesrange)
nsamples = nsamplesrange(n);
% [bootsems(n), mu] = bootstrapmean(x, resample);
@@ -43,5 +43,5 @@ hold off
xlabel('sample size')
ylabel('SEM')
legend('bootsrap', 'theory')
savefigpdf( gcf, 'bootstraptymus-samples.pdf', 6, 5 );
savefigpdf(gcf, 'bootstraptymus-samples.pdf', 6, 5);

View File

@@ -11,12 +11,12 @@ for i=1:nperm
end
%% (b) pdf of the correlation coefficients:
[hb,bb] = hist(rb, 20 );
[hb,bb] = hist(rb, 20);
hb = hb/sum(hb)/(bb(2)-bb(1)); % normalization
%% (c) significance:
rbq = quantile(rb, 0.05);
fprintf('correlation coefficient at 5%% significance = %.2f\n', rbq );
fprintf('correlation coefficient at 5%% significance = %.2f\n', rbq);
if rbq > 0.0
fprintf('--> correlation r=%.2f is significant\n', rd);
else
@@ -28,10 +28,10 @@ hold on;
bar(b, h, 'facecolor', [0.5 0.5 0.5]);
bar(bb, hb, 'facecolor', 'b');
bar(bb(bb<=rbq), hb(bb<=rbq), 'facecolor', 'r');
plot( [rd rd], [0 4], 'r', 'linewidth', 2 );
plot([rd rd], [0 4], 'r', 'linewidth', 2);
xlim([-0.25 0.75])
xlabel('Correlation coefficient');
ylabel('Probability density');
hold off;
savefigpdf( gcf, 'correlationbootstrap.pdf', 12, 6 );
savefigpdf(gcf, 'correlationbootstrap.pdf', 12, 6);

View File

@@ -6,7 +6,7 @@ y = randn(n, 1) + a*x;
%% (b) scatter plot:
subplot(1, 2, 1);
plot(x, a*x, 'r', 'linewidth', 3 );
plot(x, a*x, 'r', 'linewidth', 3);
hold on
%scatter(x, y ); % either scatter ...
plot(x, y, 'o', 'markersize', 2 ); % ... or plot - same plot.
@@ -32,12 +32,12 @@ for i=1:nperm
end
%% (g) pdf of the correlation coefficients:
[h,b] = hist(rs, 20 );
[h,b] = hist(rs, 20);
h = h/sum(h)/(b(2)-b(1)); % normalization
%% (h) significance:
rq = quantile(rs, 0.95);
fprintf('correlation coefficient at 5%% significance = %.2f\n', rq );
fprintf('correlation coefficient at 5%% significance = %.2f\n', rq);
if rd >= rq
fprintf('--> correlation r=%.2f is significant\n', rd);
else
@@ -49,10 +49,10 @@ subplot(1, 2, 2)
hold on;
bar(b, h, 'facecolor', 'b');
bar(b(b>=rq), h(b>=rq), 'facecolor', 'r');
plot( [rd rd], [0 4], 'r', 'linewidth', 2 );
plot( [rd rd], [0 4], 'r', 'linewidth', 2);
xlim([-0.25 0.25])
xlabel('Correlation coefficient');
ylabel('Probability density of H0');
hold off;
savefigpdf( gcf, 'correlationsignificance.pdf', 12, 6 );
savefigpdf(gcf, 'correlationsignificance.pdf', 12, 6);

View File

@@ -15,7 +15,7 @@
\else
\newcommand{\stitle}{}
\fi
\header{{\bfseries\large Exercise 9\stitle}}{{\bfseries\large Bootstrap}}{{\bfseries\large November 20th, 2018}}
\header{{\bfseries\large Exercise 9\stitle}}{{\bfseries\large Bootstrap}}{{\bfseries\large December 9th, 2019}}
\firstpagefooter{Prof. Dr. Jan Benda}{Phone: 29 74573}{Email:
jan.benda@uni-tuebingen.de}
\runningfooter{}{\thepage}{}
@@ -86,7 +86,7 @@ jan.benda@uni-tuebingen.de}
\begin{questions}
\question \qt{Bootstrap of the standard error of the mean}
\question \qt{Bootstrap the standard error of the mean}
We want to compute the standard error of the mean of a data set by
means of the bootstrap method and compare the result with the formula
``standard deviation divided by the square-root of $n$''.
@@ -118,24 +118,25 @@ means of the bootstrap method and compare the result with the formula
\end{solution}
\question \qt{Student t-distribution}
The distribution of Student's t, $t=\bar x/(\sigma_x/\sqrt{m})$, the
estimated mean of a data set divided by the estimated standard error
of the mean, is not a normal distribution but a Student-t distribution.
We want to compute the Student-t distribution and compare it with the
normal distribution.
\question \qt{Student t-distribution}
The distribution of Student's t, $t=\bar x/(\sigma_x/\sqrt{n})$, the
estimated mean $\bar x$ of a data set of size $n$ divided by the
estimated standard error of the mean $\sigma_x/\sqrt{n}$, where
$\sigma_x$ is the estimated standard deviation, is not a normal
distribution but a Student-t distribution. We want to compute the
Student-t distribution and compare it with the normal distribution.
\begin{parts}
\part Generate 100000 normally distributed random numbers.
\part Draw from these data 1000 samples of size $n=3$, 5, 10, and 50.
\part Compute the mean $\bar x$ of the samples and plot the
\part Draw from these data 1000 samples of size $n=3$, 5, 10, and
50. For each sample size $n$ ...
\part ... compute the mean $\bar x$ of the samples and plot the
probability density of these means.
\part Compare the resulting probability densities with corresponding
\part ... compare the resulting probability densities with corresponding
normal distributions.
\part Compute in addition $t=\bar x/(\sigma_x/\sqrt{n})$ (standard
deviation of the samples $\sigma_x$) and compare their distribution
with the normal distribution with standard deviation of one. Is $t$
normally distributed? Under which conditions is $t$ normally
distributed?
\part ... compute Student's $t=\bar x/(\sigma_x/\sqrt{n})$ and compare its
distribution with the normal distribution with standard deviation of
one. Is $t$ normally distributed? Under which conditions is $t$
normally distributed?
\end{parts}
\newsolutionpage
\begin{solution}
@@ -167,16 +168,16 @@ y = randn(n, 1) + a*x;
\part Compute and plot the probability density of these correlation
coefficients.
\part Is the correlation of the original data set significant?
\part What does significance of the correlation mean?
\part Vary the sample size \code{n} and compute in the same way the
significance of the correlation.
\part What does ``significance of the correlation'' mean?
% \part Vary the sample size \code{n} and compute in the same way the
% significance of the correlation.
\end{parts}
\begin{solution}
\lstinputlisting{correlationsignificance.m}
\includegraphics[width=1\textwidth]{correlationsignificance}
\end{solution}
\question \qt{Bootstrap of the correlation coefficient}
\question \qt{Bootstrap the correlation coefficient}
The permutation test generates the distribution of the correlation
coefficient under the null hypothesis of uncorrelated data, and we check
whether the correlation coefficient of the data differs significantly
from this
@@ -184,7 +185,7 @@ distribution. Alternatively we can bootstrap the data while keeping
the pairs and determine the confidence interval of the correlation
coefficient of the data. If this confidence interval does not include a
correlation coefficient of zero, we can conclude that the correlation
coefficient of the data quantifies indeed a correlated data.
coefficient of the data indeed quantifies correlated data.
We take the same data set that we have generated in exercise
\ref{permutationtest} (\ref{permutationtestdata}).

View File

@@ -6,9 +6,9 @@ for nsamples=[3 5 10 50]
nsamples
%% compute mean, standard deviation and t:
nmeans = 10000;
means = zeros( nmeans, 1 );
sdevs = zeros( nmeans, 1 );
students = zeros( nmeans, 1 );
means = zeros(nmeans, 1);
sdevs = zeros(nmeans, 1);
students = zeros(nmeans, 1 );
for i=1:nmeans
sample = x(randi(n, nsamples, 1));
means(i) = mean(sample);
@@ -34,7 +34,7 @@ for nsamples=[3 5 10 50]
bar(b, h, 'facecolor', 'b', 'edgecolor', 'b')
hold on
plot(xg, pm, 'r', 'linewidth', 2)
title( sprintf('sample size = %d', nsamples) );
title(sprintf('sample size = %d', nsamples));
xlim( [-3, 3] );
xlabel('Mean');
ylabel('pdf');
@@ -47,12 +47,12 @@ for nsamples=[3 5 10 50]
bar(b, h, 'facecolor', 'b', 'edgecolor', 'b')
hold on
plot(xg, pt, 'r', 'linewidth', 2)
title( sprintf('sample size = %d', nsamples) );
title(sprintf('sample size = %d', nsamples));
xlim( [-8, 8] );
xlabel('Student-t');
ylabel('pdf');
hold off;
savefigpdf( gcf, sprintf('tdistribution-n%02d.pdf', nsamples), 14, 5 );
savefigpdf(gcf, sprintf('tdistribution-n%02d.pdf', nsamples), 14, 5);
pause( 3.0 )
end