--- title: 'GWAS Practicals 2' author: "Matti Pirinen, University of Helsinki" date: "15-Jan-2019" urlcolor: blue output: html_document: default --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) set.seed(29) ``` 1. Check empirically that by simulating repeatedly datasets like in lecture example below, the proportion of effect estimates that are larger in absolute value than the estimate of the example data sets corresponds to the P-value computed for the example data set. If you don't get exactly the same value, change $n$ to 1000 insted of 100. ```{r, include = FALSE, fig.height=4} #Block 1 n = 100 f = 0.3 #MAF x = rbinom(n,2,f) #example genotypes for n individuals y = rnorm(n) #outcome that is independent of x lm.fit = lm( y ~ x ) par( mfrow = c(1,2) ) #draw 2 panels on the grid with 1 row and 2 cols #1st on t-statistic's scale x.grid = seq(-3, 3, 0.05) #we need this to define the plotting region plot(x.grid, dt(x.grid, df = n-2), lty = 2, lwd = 2, t = "l", xlab = expression( hat(beta)/SE ), ylab = "density", main="NULL DISTR of t") #null distr. of t-stat. t.stat = summary(lm.fit)$coeff[2,3] #t-statistic: Estimate/SE points(t.stat, 0, pch = 19, cex = 1.5, col = "red") segments(t.stat*c(1,-1), c(0,0), t.stat*c(1,-1), rep( dt( t.stat, df = n-2), 2 ), col = "red") text(2, 0.25, paste0("P=",signif(summary(lm.fit)$coeff[2,4],3)), col = "red") #2nd on t^2 statitstic's scale x.grid = seq(0, 10, 0.05) #we need this to define the plotting region plot(x.grid, dchisq( x.grid, df = 1 ), lty = 2, lwd = 2, t = "l", xlab = expression(( hat(beta)/SE)^2 ), ylab = "density", main = "NULL DISTR of t^2") #null distribution of t^2-stat. t2.stat = summary(lm.fit)$coeff[2,3]^2 #t^2-statistic: (Estimate/SE)^2 points(t2.stat, 0, pch = 19, cex = 1.5, col = "red") segments(t2.stat, 0, t2.stat, dchisq(t2.stat, df = 1), col = "red") text(2.5, 0.25, paste0("P=", signif(summary(lm.fit)$coeff[2,4],3)), col = "red") legend("topright", pch = 19, col = "red", leg = "observed" ) z = summary(lm.fit)$coeff[2,3] #t-statistic also called z-score under Normal approximation pnorm(-abs(z), 0, 1, lower = T) + pnorm(abs(z), 0, 1, lower = F) #P-value from N(0,1): left + right tail pchisq(z^2, df = 1, lower = F) #P-value from chi-square is the upper tail ``` #### Distribution of P-values ```{r, include = FALSE, fig.height=4} #Block 2 set.seed(39) n = 100 #individuals p = 1000 #variants measured on each individual f = 0.4 #MAF is assumed the same for all variants; doesn't actually matter here X = matrix(rbinom(n*p, 2, f), nrow = n, ncol = p) #just random genotypes y = rnorm(n) #phenotype that is not associated with any of genotypes #apply lm to each column of X separately and collect results for genotype (row 2 of coeff) lm.res = apply(X, 2 , function(x) summary(lm(y ~ x))$coeff[2,]) #result has 4 rows: beta, SE, t-stat and pval pval = lm.res[4,] #pick pvalues par(mfrow=c(1,2)) plot(density(lm.res[3,]), sub = "", xlab = "t-stat", main = "", lwd = 2) #should be t with n-2 df x.seq = seq(-4,4,0.1) #x-coordinates for plotting lines(x.seq, dt(x.seq, df = n-2), col = "blue", lty = 2) #t distribution in blue lines(x.seq, dnorm(x.seq), col = "red", lty = 3) #normal distribution in red hist(pval, breaks = 10, xlab = "P-value", main = "", col = "limegreen") #should be uniformly distributed par(mfrow=c(1,2)) #Let's make qqplots for t-stats and for P-values qqnorm(lm.res[3,]) #t with ~100 df should be close to normal (see densities above), hence qqnorm qqline(lm.res[3,], col = "red") #For P-values, we want to compare to the Uniform(0,1) distribution: #We use ppoints(p) to get #p equally spaced values in (0,1) to represent quantiles of Uniform(0,1). #we take -log10 transformation to see the small P-values particularly well qqplot(-log10(ppoints(p)),-log10(pval), xlab = "theoretical", ylab = "obs'd", main = "Q-Q Plot for -log10 Pval") abline(0, 1, col = "red") ``` 1. Show the distributions of -log10-Pvalues of the null variants and of the variants that have effects. Are there overlap? Is this a realistic example considering GWAS? ```{r, include = FALSE, fig.height=4} #Block 3 set.seed(49) n = 1000 #individuals p = 1000 #genotypes measured on each individual m = 50 #number of variants that have an effect: they are x_1,...,x_m. f = 0.4 #MAF b = 0.5 #effect size of variants that have an effect X = matrix(rbinom(n*p, 2, f), nrow = n, ncol = p) #just random genotypes at SNPs y = X[,1:m] %*% rep(b,m) + rnorm(n) #phenotype that is associated with x_1,...,x_m #apply lm to each column of X separately lm.res = apply(X, 2 , function(x) summary(lm(y ~ x))$coeff[2,]) #has 4 rows: beta, SE, t-stat and pval pval = lm.res[4,] par(mfrow=c(1,2)) plot(density(lm.res[3,]), sub = "", xlab = "t-stat", main = "", lwd = 2) #under null is t with n-2 df lines(seq(-4,4,0.1), dnorm(seq(-4,4,0.1)), col = "red", lty = 3) #normal distribution in red hist(pval, breaks = 10, xlab = "P-value", main = "", col="skyblue") #under null is uniformly distributed par(mfrow = c(1,2)) #Let's make qqplots for t-stats and for P-values qqnorm(lm.res[3,]) qqline(lm.res[3,], col = "red") qqplot(-log10(ppoints(p)), -log10(pval), xlab = "theoretical", ylab = "obs'd", main = "Q-Q Plot for -log10 Pval") abline(0, 1, col = "red") ```