[bootstrap] updated text and exercises

2019-12-03 08:57:40 +01:00
parent bfad4ac339
commit 006fa998cc
11 changed files with 72 additions and 69 deletions
--- a/bootstrap/exercises/bootstrapmean.m
+++ b/bootstrap/exercises/bootstrapmean.m
@@ -1,11 +1,11 @@
-function [bootsem, mu] = bootstrapmean( x, resample )
+function [bootsem, mu] = bootstrapmean(x, resample)
 % computes standard error by bootstrapping the data
 % x: vector with data
 % resample: number of resamplings
 % returns:
 % bootsem: the standard error of the mean
 % mu: the bootstrapped means as a vector
-    mu = zeros( resample, 1 );
+    mu = zeros(resample, 1);
    nsamples = length(x);
    for i = 1:resample
        % resample:
@@ -13,5 +13,5 @@ function [bootsem, mu] = bootstrapmean( x, resample )
        % compute statistics on sample:
        mu(i) = mean(xr);
    end
-    bootsem = std( mu );
+    bootsem = std(mu);
 end
--- a/bootstrap/exercises/bootstraptymus.m
+++ b/bootstrap/exercises/bootstraptymus.m
@@ -1,36 +1,36 @@
 %% (b) load the data:
-load( 'thymusglandweights.dat' );
+load('thymusglandweights.dat');
 nsamples = 80;
 x = thymusglandweights(1:nsamples);

 %% (c) mean, sem and hist:
 sem = std(x)/sqrt(nsamples);
-fprintf( 'Mean of the data set = %.2fmg\n', mean(x) );
-fprintf( 'SEM of the data set = %.2fmg\n', sem );
+fprintf('Mean of the data set = %.2fmg\n', mean(x));
+fprintf('SEM of the data set = %.2fmg\n', sem);
 hist(x,20)
 xlabel('x')
 ylabel('count')
-savefigpdf( gcf, 'bootstraptymus-datahist.pdf', 6, 5 );
-pause( 2.0 )
+savefigpdf(gcf, 'bootstraptymus-datahist.pdf', 6, 5);
+pause(2.0)

 %% (d) bootstrap the mean:
 resample = 500;
-[bootsem, mu] = bootstrapmean( x, resample );
-hist( mu, 20 );
+[bootsem, mu] = bootstrapmean(x, resample);
+hist(mu, 20);
 xlabel('mean(x)')
 ylabel('count')
-savefigpdf( gcf, 'bootstraptymus-meanhist.pdf', 6, 5 );
-fprintf( '  bootstrap standard error: %.3f\n', bootsem );
-fprintf( 'theoretical standard error: %.3f\n', sem );
+savefigpdf(gcf, 'bootstraptymus-meanhist.pdf', 6, 5);
+fprintf('  bootstrap standard error: %.3f\n', bootsem);
+fprintf('theoretical standard error: %.3f\n', sem);

 %% (e) confidence interval:
 q = quantile(mu, [0.025, 0.975]);
-fprintf( '95%% confidence interval of the mean from %.2fmg to %.2fmg\n', q(1), q(2) );
-pause( 2.0 )
+fprintf('95%% confidence interval of the mean from %.2fmg to %.2fmg\n', q(1), q(2));
+pause(2.0)

 %% (f): dependence on sample size:
 nsamplesrange = 10:10:1000;
-bootsems = zeros( length(nsamplesrange),1);
+bootsems = zeros(length(nsamplesrange), 1);
 for n=1:length(nsamplesrange)
    nsamples = nsamplesrange(n);
    % [bootsems(n), mu] = bootstrapmean(x, resample);
@@ -43,5 +43,5 @@ hold off
 xlabel('sample size')
 ylabel('SEM')
 legend('bootsrap', 'theory')
-savefigpdf( gcf, 'bootstraptymus-samples.pdf', 6, 5 );
+savefigpdf(gcf, 'bootstraptymus-samples.pdf', 6, 5);

--- a/bootstrap/exercises/correlationbootstrap.m
+++ b/bootstrap/exercises/correlationbootstrap.m
@@ -11,12 +11,12 @@ for i=1:nperm
 end

 %% (b) pdf of the correlation coefficients:
-[hb,bb] = hist(rb, 20 );
+[hb,bb] = hist(rb, 20);
 hb = hb/sum(hb)/(bb(2)-bb(1));  % normalization

 %% (c) significance:
 rbq = quantile(rb, 0.05);
-fprintf('correlation coefficient at 5%% significance = %.2f\n', rbq );
+fprintf('correlation coefficient at 5%% significance = %.2f\n', rbq);
 if rbq > 0.0
    fprintf('--> correlation r=%.2f is significant\n', rd);
 else
@@ -28,10 +28,10 @@ hold on;
 bar(b, h, 'facecolor', [0.5 0.5 0.5]);
 bar(bb, hb, 'facecolor', 'b');
 bar(bb(bb<=rbq), hb(bb<=rbq), 'facecolor', 'r');
-plot( [rd rd], [0 4], 'r', 'linewidth', 2 );
+plot([rd rd], [0 4], 'r', 'linewidth', 2);
 xlim([-0.25 0.75])
 xlabel('Correlation coefficient');
 ylabel('Probability density');
 hold off;

-savefigpdf( gcf, 'correlationbootstrap.pdf', 12, 6 );
+savefigpdf(gcf, 'correlationbootstrap.pdf', 12, 6);
--- a/bootstrap/exercises/correlationsignificance.m
+++ b/bootstrap/exercises/correlationsignificance.m
@@ -6,7 +6,7 @@ y = randn(n, 1) + a*x;

 %% (b) scatter plot:
 subplot(1, 2, 1);
-plot(x, a*x, 'r', 'linewidth', 3 );
+plot(x, a*x, 'r', 'linewidth', 3);
 hold on
 %scatter(x, y );   % either scatter ...
 plot(x, y, 'o', 'markersize', 2 );  % ... or plot - same plot.
@@ -32,12 +32,12 @@ for i=1:nperm
 end

 %% (g) pdf of the correlation coefficients:
-[h,b] = hist(rs, 20 );
+[h,b] = hist(rs, 20);
 h = h/sum(h)/(b(2)-b(1));  % normalization

 %% (h) significance:
 rq = quantile(rs, 0.95);
-fprintf('correlation coefficient at 5%% significance = %.2f\n', rq );
+fprintf('correlation coefficient at 5%% significance = %.2f\n', rq);
 if rd >= rq
    fprintf('--> correlation r=%.2f is significant\n', rd);
 else
@@ -49,10 +49,10 @@ subplot(1, 2, 2)
 hold on;
 bar(b, h, 'facecolor', 'b');
 bar(b(b>=rq), h(b>=rq), 'facecolor', 'r');
-plot( [rd rd], [0 4], 'r', 'linewidth', 2 );
+plot( [rd rd], [0 4], 'r', 'linewidth', 2);
 xlim([-0.25 0.25])
 xlabel('Correlation coefficient');
 ylabel('Probability density of H0');
 hold off;

-savefigpdf( gcf, 'correlationsignificance.pdf', 12, 6 );
+savefigpdf(gcf, 'correlationsignificance.pdf', 12, 6);
--- a/bootstrap/exercises/exercises01.tex
+++ b/bootstrap/exercises/exercises01.tex
@@ -15,7 +15,7 @@
 \else
 \newcommand{\stitle}{}
 \fi
-\header{{\bfseries\large Exercise 9\stitle}}{{\bfseries\large Bootstrap}}{{\bfseries\large November 20th, 2018}}
+\header{{\bfseries\large Exercise 9\stitle}}{{\bfseries\large Bootstrap}}{{\bfseries\large December 9th, 2019}}
 \firstpagefooter{Prof. Dr. Jan Benda}{Phone: 29 74573}{Email:
 jan.benda@uni-tuebingen.de}
 \runningfooter{}{\thepage}{}
@@ -86,7 +86,7 @@ jan.benda@uni-tuebingen.de}

 \begin{questions}

-\question \qt{Bootstrap of the standard error of the mean}
+\question \qt{Bootstrap the standard error of the mean}
 We want to compute the standard error of the mean of a data set by
 means of the bootstrap method and compare the result with the formula
 ``standard deviation divided by the square-root of $n$''.
@@ -118,24 +118,25 @@ means of the bootstrap method and compare the result with the formula
 \end{solution}


-\question \qt{Student t-distribution} 
-The distribution of Student's t, $t=\bar x/(\sigma_x/\sqrt{m})$, the
-estimated mean of a data set divided by the estimated standard error
-of the mean, is not a normal distribution but a Student-t distribution.
-We want to compute the Student-t distribution and compare it with the 
-normal distribution.
+\question \qt{Student t-distribution}
+Student's t, $t=\bar x/(\sigma_x/\sqrt{n})$, is the estimated mean
+$\bar x$ of a data set of size $n$ divided by the estimated standard
+error of the mean $\sigma_x/\sqrt{n}$, where $\sigma_x$ is the
+estimated standard deviation. Its distribution is not a normal
+distribution but a Student-t distribution. We want to compute the
+Student-t distribution and compare it with the normal distribution.
 \begin{parts}
 \part Generate 100000 normally distributed random numbers.
-\part Draw from these data 1000 samples of size $n=3$, 5, 10, and 50.
-\part Compute the mean $\bar x$ of the samples and plot the
+\part Draw from these data 1000 samples of size $n=3$, 5, 10, and
+50. For each sample size $n$ ...
+\part ... compute the mean $\bar x$ of the samples and plot the
 probability density of these means.
-\part Compare the resulting probability densities with corresponding
+\part ... compare the resulting probability densities with corresponding
 normal distributions.
-\part Compute in addition $t=\bar x/(\sigma_x/\sqrt{n})$ (standard
-deviation of the samples $\sigma_x$) and compare their distribution
-with the normal distribution with standard deviation of one. Is $t$
-normally distributed? Under which conditions is $t$ normally
-distributed?
+\part ... compute Student's $t=\bar x/(\sigma_x/\sqrt{n})$ and compare its
+distribution with the normal distribution with a standard deviation of
+one. Is $t$ normally distributed? Under which conditions is $t$
+normally distributed?
 \end{parts}
 \newsolutionpage
 \begin{solution}
@@ -167,16 +168,16 @@ y = randn(n, 1) + a*x;
  \part Compute and plot the probability density of these correlation
  coefficients.
  \part Is the correlation of the original data set significant?
-  \part What does significance of the correlation mean?
-  \part Vary the sample size \code{n} and compute in the same way the
-  significance of the correlation.
+  \part What does ``significance of the correlation'' mean?
+%  \part Vary the sample size \code{n} and compute in the same way the
+%  significance of the correlation.
 \end{parts}
 \begin{solution}
  \lstinputlisting{correlationsignificance.m}
  \includegraphics[width=1\textwidth]{correlationsignificance}
 \end{solution}

-\question \qt{Bootstrap of the correlation coefficient} 
+\question \qt{Bootstrap the correlation coefficient} 
 The permutation test generates the distribution of the null hypothesis
 of uncorrelated data and we check whether the correlation coefficient
 of the data differs significantly from this
@@ -184,7 +185,7 @@ distribution. Alternatively we can bootstrap the data while keeping
 the pairs and determine the confidence interval of the correlation
 coefficient of the data. If this differs significantly from a
 correlation coefficient of zero we can conclude that the correlation
-coefficient of the data quantifies indeed a correlated data.
+coefficient of the data indeed quantifies correlated data.

 We take the same data set that we have generated in exercise
 \ref{permutationtest} (\ref{permutationtestdata}).
--- a/bootstrap/exercises/tdistribution.m
+++ b/bootstrap/exercises/tdistribution.m
@@ -6,9 +6,9 @@ for nsamples=[3 5 10 50]
    nsamples
    %% compute mean, standard deviation and t:
    nmeans = 10000;
-    means = zeros( nmeans, 1 );
-    sdevs = zeros( nmeans, 1 );
-    students = zeros( nmeans, 1 );
+    means = zeros(nmeans, 1);
+    sdevs = zeros(nmeans, 1);
+    students = zeros(nmeans, 1 );
    for i=1:nmeans
        sample = x(randi(n, nsamples, 1));
        means(i) = mean(sample);
@@ -34,7 +34,7 @@ for nsamples=[3 5 10 50]
    bar(b, h, 'facecolor', 'b', 'edgecolor', 'b')
    hold on
    plot(xg, pm, 'r', 'linewidth', 2)
-    title( sprintf('sample size = %d', nsamples) );
+    title(sprintf('sample size = %d', nsamples));
    xlim( [-3, 3] );
    xlabel('Mean');
    ylabel('pdf');
@@ -47,12 +47,12 @@ for nsamples=[3 5 10 50]
    bar(b, h, 'facecolor', 'b', 'edgecolor', 'b')
    hold on
    plot(xg, pt, 'r', 'linewidth', 2)
-    title( sprintf('sample size = %d', nsamples) );
+    title(sprintf('sample size = %d', nsamples));
    xlim( [-8, 8] );
    xlabel('Student-t');
    ylabel('pdf');
    hold off;
    
-    savefigpdf( gcf, sprintf('tdistribution-n%02d.pdf', nsamples), 14, 5 );
+    savefigpdf(gcf, sprintf('tdistribution-n%02d.pdf', nsamples), 14, 5);
    pause( 3.0 )
 end