From 849f1d8f0d6135d1f9a87b18832c24333435e128 Mon Sep 17 00:00:00 2001 From: Jan Benda Date: Mon, 11 Jan 2021 22:00:25 +0100 Subject: [PATCH] [logisitc] added code on AUC interpretation --- logistic/code/logistic.py | 156 +++++++++++++++++++++++++++++++++ scientificcomputing-script.tex | 24 +++-- 2 files changed, 174 insertions(+), 6 deletions(-) create mode 100644 logistic/code/logistic.py diff --git a/logistic/code/logistic.py b/logistic/code/logistic.py new file mode 100644 index 0000000..18d500c --- /dev/null +++ b/logistic/code/logistic.py @@ -0,0 +1,156 @@ +import numpy as np +import matplotlib.pyplot as plt +from scipy.stats import ttest_ind, mannwhitneyu + + +def auc(n, dx, uniform=False, plot=False): + # loser: + if uniform: + x0 = np.random.rand(n) + else: + x0 = np.random.randn(n)*0.3 + y0 = np.zeros(len(x0)) + # winner: + if uniform: + x1 = np.random.rand(n) + dx + else: + x1 = np.random.randn(n)*0.3 + dx + y1 = np.ones(len(x1)) + + # combine into a single table: + data = np.zeros((len(x0) + len(y0), 2)) + data[:len(x0),0] = x0 + data[:len(x0),1] = y0 + data[len(x0):,0] = x1 + data[len(x0):,1] = y1 + + # fraction of overlapping data values: + si = np.argsort(data[:,0]) + i0 = np.argmax(data[si,1] != data[si[0],1]) + i1 = len(data) - 1 - np.argmax(data[si[::-1],1] != data[si[-1],1]) + overlap = (i1-i0+1)/len(data) + + # Cohen's d: + m0 = np.mean(data[data[:,1] < 0.5,0]) + v0 = np.var(data[data[:,1] < 0.5,0]) + m1 = np.mean(data[data[:,1] > 0.5,0]) + v1 = np.var(data[data[:,1] > 0.5,0]) + cohensd = (m1 - m0)/np.sqrt(0.5*(v0+v1)) + + # t-test: + ttest, p = ttest_ind(data[data[:,1] > 0.5,0], data[data[:,1] < 0.5,0]) + + # Mann-Whitney U: + mannu, p = mannwhitneyu(data[data[:,1] < 0.5,0], data[data[:,1] > 0.5,0]) + + # ROC: + thresh = np.arange(np.min(data[:,0])-0.1, np.max(data[:,0])+0.2, 0.01) + true_pos = np.zeros(len(thresh)) + false_pos = np.zeros(len(thresh)) + for k in range(len(thresh)): + true_pos[k] = np.sum(data[data[:,0] > thresh[k],1] > 0.5)/np.sum(data[:,1] > 0.5) + false_pos[k] = np.sum(data[data[:,0] > thresh[k],1] < 0.5)/np.sum(data[:,1] < 0.5) + + # AUC: + droc = 0.001 + xroc = np.arange(0.0, 1.0+droc, droc) + yroc = np.interp(xroc, false_pos[::-1], true_pos[::-1]) + auc = np.sum(yroc)*droc + + if plot: + fig = plt.figure() + ax = fig.add_subplot(211) + ax.axvline(data[si[i0],0], color='k') + ax.axvline(data[si[i1],0], color='k', lw=2) + ax.plot(data[:,0], data[:,1], 'o') + ax.plot(data[data[:,1] < 0.5,0], np.zeros(len(data[data[:,1] < 0.5,0]))-0.5, 'or') + ax.plot(data[data[:,1] > 0.5,0], np.zeros(len(data[data[:,1] > 0.5,0]))-0.5, 'og') + ax.text(0.5*(data[si[i0],0]+data[si[i1],0]), 0.65, 'overlap=%.0f%%' % (100.0*overlap), ha='center') + ax.text(0.5*(data[si[i0],0]+data[si[i1],0]), 0.35, "Cohen's d=%.2f" % cohensd, ha='center') + ax.set_xlabel('x') + ax.set_yticks([0, 1]) + ax.set_yticklabels(['Lose', 'Win']) + if uniform: + ax.set_title('Uniformly distributed data') + else: + ax.set_title('Normally distributed data') + + ax = fig.add_subplot(223) + ax.plot(thresh, true_pos, '-og', label='TP') + ax.plot(thresh, false_pos, '-or', label='FP') + ax.legend() + ax.set_xlabel('threshold') + + ax = fig.add_subplot(224) + ax.plot(false_pos, true_pos, '-o') + ax.fill_between(xroc, yroc) + ax.text(0.5, 0.5, 'AUC=%.0f%%' % (100.0*auc)) + ax.set_xlabel('FP') + ax.set_ylabel('TP') + fig.tight_layout() + plt.show() + + return auc, overlap, cohensd, ttest, mannu + + +# demo: +auc(20, 0.5, True, True) +auc(20, 0.5, False, True) + +# AUC versus overlap: +n = 100 +aucs_uni = [] +overlaps_uni = [] +cohensd_uni = [] +ttest_uni = [] +mannu_uni = [] +aucs_norm = [] +overlaps_norm = [] +cohensd_norm = [] +ttest_norm = [] +mannu_norm = [] +for frac in np.arange(-1.5, 1.5, 0.02): + a, o, d, t, u = auc(n, frac, True, False) + aucs_uni.append(a) + overlaps_uni.append(o) + cohensd_uni.append(d) + ttest_uni.append(t) + mannu_uni.append(u) + a, o, d, t, u = auc(n, frac, False, False) + aucs_norm.append(a) + overlaps_norm.append(o) + cohensd_norm.append(d) + ttest_norm.append(t) + mannu_norm.append(u) + +fig, axs = plt.subplots(2, 2) +ax = axs[0, 0] +ax.plot([0.0, 1.0, 0.0], [0.0, 0.5, 1.0], 'k') +ax.plot(overlaps_uni, aucs_uni, 'o', label='uniform pdfs') +ax.plot(overlaps_norm, aucs_norm, 'o', label='normal pdfs') +ax.set_ylim(0, 1) +ax.set_xlabel('fraction of overlapping data') +ax.set_ylabel('AUC') +ax.legend(loc='center left') +ax = axs[0, 1] +ax.plot(cohensd_uni, aucs_uni, 'o', label='uniform pdfs') +ax.plot(cohensd_norm, aucs_norm, 'o', label='normal pdfs') +ax.set_ylim(0, 1) +ax.set_xlabel("Cohen's d") +ax.set_ylabel('AUC') +#ax.legend(loc='center left') +ax = axs[1, 1] +ax.plot(ttest_uni, aucs_uni, 'o', label='uniform pdfs') +ax.plot(ttest_norm, aucs_norm, 'o', label='normal pdfs') +ax.set_ylim(0, 1) +ax.set_xlabel("Student t") +ax.set_ylabel('AUC') +ax = axs[1, 0] +ax.plot(mannu_uni, aucs_uni, 'o', label='uniform pdfs') +ax.plot(mannu_norm, aucs_norm, 'o', label='normal pdfs') +ax.set_ylim(0, 1) +ax.set_xlabel("Mann-Whitney U") +ax.set_ylabel('AUC') +fig.savefig('aucoverlap.pdf') +plt.show() + diff --git a/scientificcomputing-script.tex b/scientificcomputing-script.tex index 27b369c..13ca240 100644 --- a/scientificcomputing-script.tex +++ b/scientificcomputing-script.tex @@ -56,6 +56,7 @@ % to closed source license models), FAIR % * knowledge is freedom ... % * do not consume what companies offer you but know what you want and implement it +% * no screenshots on how to use a stupid GUI program % * python as a modern and popular programming language in modern science % (i.e. machine learning, others?) with a strong community support. @@ -128,22 +129,33 @@ % add chapters on linear algebra, PCA, clustering, see linearalgebra/ +% add chapter on logistic regression, see logistic/ + % add chapter on simple machine learning, perceptron %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %\part{Tools} % add chapter on Markup and LaTeX -> see latex/ -% with an introduction on mark-up in general, an outlook to markdown, html, xml, jason, yaml -% make notes, data, and texts understandable by computers! +% - if you know LaTeX you also know how to better use word/libreoffice +% - with an introduction on mark-up in general, an outlook to markdown, html, xml, jason, yaml +% - make notes, data, and texts understandable by computers! +% - easily generate markup from python scripts via mako . + +% bash basics: +% - calling a (python) script/program with arguments +% - files: ls, cp, mkdir, find, etc. +% - text file manipulations: grep, diff, sed, awk +% - pipes, for loop, if, test % Makefile + % distributed computing (ssh and grid engines) -% + % see replicability/ : -% version control (git and github) only put source files into repository! -% data handling (structure, storage (rsync), backup, data bases) -% data annotation, meta data, FAIR +% - version control (git and github) only put source files into repository! +% - data handling (structure, storage (rsync), backup, data bases) +% - data annotation, meta data, FAIR %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%