[bootstrap] updated text and exercises
This commit is contained in:
@@ -1,11 +1,11 @@
|
||||
function [bootsem, mu] = bootstrapmean( x, resample )
|
||||
function [bootsem, mu] = bootstrapmean(x, resample)
|
||||
% computes the standard error of the mean by bootstrapping the data
|
||||
% x: vector with data
|
||||
% resample: number of resamplings
|
||||
% returns:
|
||||
% bootsem: the standard error of the mean
|
||||
% mu: the bootstrapped means as a vector
|
||||
mu = zeros( resample, 1 );
|
||||
mu = zeros(resample, 1);
|
||||
nsamples = length(x);
|
||||
for i = 1:resample
|
||||
% resample:
|
||||
@@ -13,5 +13,5 @@ function [bootsem, mu] = bootstrapmean( x, resample )
|
||||
% compute statistics on sample:
|
||||
mu(i) = mean(xr);
|
||||
end
|
||||
bootsem = std( mu );
|
||||
bootsem = std(mu);
|
||||
end
|
||||
|
||||
@@ -1,36 +1,36 @@
|
||||
%% (b) load the data:
|
||||
load( 'thymusglandweights.dat' );
|
||||
load('thymusglandweights.dat');
|
||||
nsamples = 80;
|
||||
x = thymusglandweights(1:nsamples);
|
||||
|
||||
%% (c) mean, sem and hist:
|
||||
sem = std(x)/sqrt(nsamples);
|
||||
fprintf( 'Mean of the data set = %.2fmg\n', mean(x) );
|
||||
fprintf( 'SEM of the data set = %.2fmg\n', sem );
|
||||
fprintf('Mean of the data set = %.2fmg\n', mean(x));
|
||||
fprintf('SEM of the data set = %.2fmg\n', sem);
|
||||
hist(x,20)
|
||||
xlabel('x')
|
||||
ylabel('count')
|
||||
savefigpdf( gcf, 'bootstraptymus-datahist.pdf', 6, 5 );
|
||||
pause( 2.0 )
|
||||
savefigpdf(gcf, 'bootstraptymus-datahist.pdf', 6, 5);
|
||||
pause(2.0)
|
||||
|
||||
%% (d) bootstrap the mean:
|
||||
resample = 500;
|
||||
[bootsem, mu] = bootstrapmean( x, resample );
|
||||
hist( mu, 20 );
|
||||
[bootsem, mu] = bootstrapmean(x, resample);
|
||||
hist(mu, 20);
|
||||
xlabel('mean(x)')
|
||||
ylabel('count')
|
||||
savefigpdf( gcf, 'bootstraptymus-meanhist.pdf', 6, 5 );
|
||||
fprintf( ' bootstrap standard error: %.3f\n', bootsem );
|
||||
fprintf( 'theoretical standard error: %.3f\n', sem );
|
||||
savefigpdf(gcf, 'bootstraptymus-meanhist.pdf', 6, 5);
|
||||
fprintf(' bootstrap standard error: %.3f\n', bootsem);
|
||||
fprintf('theoretical standard error: %.3f\n', sem);
|
||||
|
||||
%% (e) confidence interval:
|
||||
q = quantile(mu, [0.025, 0.975]);
|
||||
fprintf( '95%% confidence interval of the mean from %.2fmg to %.2fmg\n', q(1), q(2) );
|
||||
pause( 2.0 )
|
||||
fprintf('95%% confidence interval of the mean from %.2fmg to %.2fmg\n', q(1), q(2));
|
||||
pause(2.0)
|
||||
|
||||
%% (f): dependence on sample size:
|
||||
nsamplesrange = 10:10:1000;
|
||||
bootsems = zeros( length(nsamplesrange),1);
|
||||
bootsems = zeros(length(nsamplesrange), 1);
|
||||
for n=1:length(nsamplesrange)
|
||||
nsamples = nsamplesrange(n);
|
||||
% [bootsems(n), mu] = bootstrapmean(x, resample);
|
||||
@@ -43,5 +43,5 @@ hold off
|
||||
xlabel('sample size')
|
||||
ylabel('SEM')
|
||||
% label the two plotted SEM curves (fixed misspelled 'bootsrap' label)
legend('bootstrap', 'theory')
|
||||
savefigpdf( gcf, 'bootstraptymus-samples.pdf', 6, 5 );
|
||||
savefigpdf(gcf, 'bootstraptymus-samples.pdf', 6, 5);
|
||||
|
||||
|
||||
@@ -11,12 +11,12 @@ for i=1:nperm
|
||||
end
|
||||
|
||||
%% (b) pdf of the correlation coefficients:
|
||||
[hb,bb] = hist(rb, 20 );
|
||||
[hb,bb] = hist(rb, 20);
|
||||
hb = hb/sum(hb)/(bb(2)-bb(1)); % normalization
|
||||
|
||||
%% (c) significance:
|
||||
rbq = quantile(rb, 0.05);
|
||||
fprintf('correlation coefficient at 5%% significance = %.2f\n', rbq );
|
||||
fprintf('correlation coefficient at 5%% significance = %.2f\n', rbq);
|
||||
if rbq > 0.0
|
||||
fprintf('--> correlation r=%.2f is significant\n', rd);
|
||||
else
|
||||
@@ -28,10 +28,10 @@ hold on;
|
||||
bar(b, h, 'facecolor', [0.5 0.5 0.5]);
|
||||
bar(bb, hb, 'facecolor', 'b');
|
||||
bar(bb(bb<=rbq), hb(bb<=rbq), 'facecolor', 'r');
|
||||
plot( [rd rd], [0 4], 'r', 'linewidth', 2 );
|
||||
plot([rd rd], [0 4], 'r', 'linewidth', 2);
|
||||
xlim([-0.25 0.75])
|
||||
xlabel('Correlation coefficient');
|
||||
ylabel('Probability density');
|
||||
hold off;
|
||||
|
||||
savefigpdf( gcf, 'correlationbootstrap.pdf', 12, 6 );
|
||||
savefigpdf(gcf, 'correlationbootstrap.pdf', 12, 6);
|
||||
|
||||
@@ -6,7 +6,7 @@ y = randn(n, 1) + a*x;
|
||||
|
||||
%% (b) scatter plot:
|
||||
subplot(1, 2, 1);
|
||||
plot(x, a*x, 'r', 'linewidth', 3 );
|
||||
plot(x, a*x, 'r', 'linewidth', 3);
|
||||
hold on
|
||||
%scatter(x, y ); % either scatter ...
|
||||
plot(x, y, 'o', 'markersize', 2 ); % ... or plot - same plot.
|
||||
@@ -32,12 +32,12 @@ for i=1:nperm
|
||||
end
|
||||
|
||||
%% (g) pdf of the correlation coefficients:
|
||||
[h,b] = hist(rs, 20 );
|
||||
[h,b] = hist(rs, 20);
|
||||
h = h/sum(h)/(b(2)-b(1)); % normalization
|
||||
|
||||
%% (h) significance:
|
||||
rq = quantile(rs, 0.95);
|
||||
fprintf('correlation coefficient at 5%% significance = %.2f\n', rq );
|
||||
fprintf('correlation coefficient at 5%% significance = %.2f\n', rq);
|
||||
if rd >= rq
|
||||
fprintf('--> correlation r=%.2f is significant\n', rd);
|
||||
else
|
||||
@@ -49,10 +49,10 @@ subplot(1, 2, 2)
|
||||
hold on;
|
||||
bar(b, h, 'facecolor', 'b');
|
||||
bar(b(b>=rq), h(b>=rq), 'facecolor', 'r');
|
||||
plot( [rd rd], [0 4], 'r', 'linewidth', 2 );
|
||||
% vertical line marking the correlation coefficient rd of the data
plot([rd rd], [0 4], 'r', 'linewidth', 2);
|
||||
xlim([-0.25 0.25])
|
||||
xlabel('Correlation coefficient');
|
||||
ylabel('Probability density of H0');
|
||||
hold off;
|
||||
|
||||
savefigpdf( gcf, 'correlationsignificance.pdf', 12, 6 );
|
||||
savefigpdf(gcf, 'correlationsignificance.pdf', 12, 6);
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
\else
|
||||
\newcommand{\stitle}{}
|
||||
\fi
|
||||
\header{{\bfseries\large Exercise 9\stitle}}{{\bfseries\large Bootstrap}}{{\bfseries\large November 20th, 2018}}
|
||||
\header{{\bfseries\large Exercise 9\stitle}}{{\bfseries\large Bootstrap}}{{\bfseries\large December 9th, 2019}}
|
||||
\firstpagefooter{Prof. Dr. Jan Benda}{Phone: 29 74573}{Email:
|
||||
jan.benda@uni-tuebingen.de}
|
||||
\runningfooter{}{\thepage}{}
|
||||
@@ -86,7 +86,7 @@ jan.benda@uni-tuebingen.de}
|
||||
|
||||
\begin{questions}
|
||||
|
||||
\question \qt{Bootstrap of the standard error of the mean}
|
||||
\question \qt{Bootstrap the standard error of the mean}
|
||||
We want to compute the standard error of the mean of a data set by
|
||||
means of the bootstrap method and compare the result with the formula
|
||||
``standard deviation divided by the square-root of $n$''.
|
||||
@@ -118,24 +118,25 @@ means of the bootstrap method and compare the result with the formula
|
||||
\end{solution}
|
||||
|
||||
|
||||
\question \qt{Student t-distribution}
|
||||
The distribution of Student's t, $t=\bar x/(\sigma_x/\sqrt{m})$, the
|
||||
estimated mean of a data set divided by the estimated standard error
|
||||
of the mean, is not a normal distribution but a Student-t distribution.
|
||||
We want to compute the Student-t distribution and compare it with the
|
||||
normal distribution.
|
||||
\question \qt{Student t-distribution}
|
||||
The distribution of Student's t, $t=\bar x/(\sigma_x/\sqrt{n})$, the
|
||||
estimated mean $\bar x$ of a data set of size $n$ divided by the
|
||||
estimated standard error of the mean $\sigma_x/\sqrt{n}$, where
|
||||
$\sigma_x$ is the estimated standard deviation, is not a normal
|
||||
distribution but a Student-t distribution. We want to compute the
|
||||
Student-t distribution and compare it with the normal distribution.
|
||||
\begin{parts}
|
||||
\part Generate 100000 normally distributed random numbers.
|
||||
\part Draw from these data 1000 samples of size $n=3$, 5, 10, and 50.
|
||||
\part Compute the mean $\bar x$ of the samples and plot the
|
||||
\part Draw from these data 1000 samples of size $n=3$, 5, 10, and
|
||||
50. For each sample size $n$ ...
|
||||
\part ... compute the mean $\bar x$ of the samples and plot the
|
||||
probability density of these means.
|
||||
\part Compare the resulting probability densities with corresponding
|
||||
\part ... compare the resulting probability densities with the corresponding
|
||||
normal distributions.
|
||||
\part Compute in addition $t=\bar x/(\sigma_x/\sqrt{n})$ (standard
|
||||
deviation of the samples $\sigma_x$) and compare their distribution
|
||||
with the normal distribution with standard deviation of one. Is $t$
|
||||
normally distributed? Under which conditions is $t$ normally
|
||||
distributed?
|
||||
\part ... compute Student's $t=\bar x/(\sigma_x/\sqrt{n})$ and compare its
|
||||
distribution with the normal distribution with standard deviation of
|
||||
one. Is $t$ normally distributed? Under which conditions is $t$
|
||||
normally distributed?
|
||||
\end{parts}
|
||||
\newsolutionpage
|
||||
\begin{solution}
|
||||
@@ -167,16 +168,16 @@ y = randn(n, 1) + a*x;
|
||||
\part Compute and plot the probability density of these correlation
|
||||
coefficients.
|
||||
\part Is the correlation of the original data set significant?
|
||||
\part What does significance of the correlation mean?
|
||||
\part Vary the sample size \code{n} and compute in the same way the
|
||||
significance of the correlation.
|
||||
\part What does ``significance of the correlation'' mean?
|
||||
% \part Vary the sample size \code{n} and compute in the same way the
|
||||
% significance of the correlation.
|
||||
\end{parts}
|
||||
\begin{solution}
|
||||
\lstinputlisting{correlationsignificance.m}
|
||||
\includegraphics[width=1\textwidth]{correlationsignificance}
|
||||
\end{solution}
|
||||
|
||||
\question \qt{Bootstrap of the correlation coefficient}
|
||||
\question \qt{Bootstrap the correlation coefficient}
|
||||
The permutation test generates the distribution of the null hypothesis
|
||||
of uncorrelated data and we check whether the correlation coefficient
|
||||
of the data differs significantly from this
|
||||
@@ -184,7 +185,7 @@ distribution. Alternatively we can bootstrap the data while keeping
|
||||
the pairs and determine the confidence interval of the correlation
|
||||
coefficient of the data. If this differs significantly from a
|
||||
correlation coefficient of zero we can conclude that the correlation
|
||||
coefficient of the data quantifies indeed a correlated data.
|
||||
coefficient of the data indeed quantifies correlated data.
|
||||
|
||||
We take the same data set that we have generated in exercise
|
||||
\ref{permutationtest} (\ref{permutationtestdata}).
|
||||
|
||||
@@ -6,9 +6,9 @@ for nsamples=[3 5 10 50]
|
||||
nsamples
|
||||
%% compute mean, standard deviation and t:
|
||||
nmeans = 10000;
|
||||
means = zeros( nmeans, 1 );
|
||||
sdevs = zeros( nmeans, 1 );
|
||||
students = zeros( nmeans, 1 );
|
||||
means = zeros(nmeans, 1);
|
||||
sdevs = zeros(nmeans, 1);
|
||||
% preallocate one entry per drawn sample, like means and sdevs
students = zeros(nmeans, 1);
|
||||
for i=1:nmeans
|
||||
sample = x(randi(n, nsamples, 1));
|
||||
means(i) = mean(sample);
|
||||
@@ -34,7 +34,7 @@ for nsamples=[3 5 10 50]
|
||||
bar(b, h, 'facecolor', 'b', 'edgecolor', 'b')
|
||||
hold on
|
||||
plot(xg, pm, 'r', 'linewidth', 2)
|
||||
title( sprintf('sample size = %d', nsamples) );
|
||||
title(sprintf('sample size = %d', nsamples));
|
||||
xlim( [-3, 3] );
|
||||
xlabel('Mean');
|
||||
ylabel('pdf');
|
||||
@@ -47,12 +47,12 @@ for nsamples=[3 5 10 50]
|
||||
bar(b, h, 'facecolor', 'b', 'edgecolor', 'b')
|
||||
hold on
|
||||
plot(xg, pt, 'r', 'linewidth', 2)
|
||||
title( sprintf('sample size = %d', nsamples) );
|
||||
title(sprintf('sample size = %d', nsamples));
|
||||
xlim( [-8, 8] );
|
||||
xlabel('Student-t');
|
||||
ylabel('pdf');
|
||||
hold off;
|
||||
|
||||
savefigpdf( gcf, sprintf('tdistribution-n%02d.pdf', nsamples), 14, 5 );
|
||||
savefigpdf(gcf, sprintf('tdistribution-n%02d.pdf', nsamples), 14, 5);
|
||||
pause( 3.0 )
|
||||
end
|
||||
|
||||
Reference in New Issue
Block a user