---
title: 'GWAS Practicals 2'
author: "Matti Pirinen, University of Helsinki"
date: "15-Jan-2019"
urlcolor: blue
output:
  html_document: default
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
set.seed(29)
```

1. Check empirically that by simulating repeatedly datasets like in lecture example below,
the proportion of effect estimates that are larger in absolute value than the
estimate of the example data sets corresponds to the P-value computed
for the example data set. If you don't get exactly the same
value, change $n$ to 1000 insted of 100.

```{r, include = FALSE, fig.height=4}
#Block 1

n = 100
f = 0.3 #MAF
x = rbinom(n,2,f) #example genotypes for n individuals
y = rnorm(n) #outcome that is independent of x
lm.fit = lm( y ~ x )
par( mfrow = c(1,2) ) #draw 2 panels on the grid with 1 row and 2 cols
#1st on t-statistic's scale
x.grid = seq(-3, 3, 0.05) #we need this to define the plotting region
plot(x.grid, dt(x.grid, df = n-2), lty = 2, lwd = 2, t = "l",
     xlab = expression( hat(beta)/SE ), ylab = "density", main="NULL DISTR of t") #null distr. of t-stat.
t.stat = summary(lm.fit)$coeff[2,3] #t-statistic: Estimate/SE
points(t.stat, 0, pch = 19, cex = 1.5, col = "red")
segments(t.stat*c(1,-1), c(0,0), t.stat*c(1,-1), rep( dt( t.stat, df = n-2), 2 ), col = "red")
text(2, 0.25, paste0("P=",signif(summary(lm.fit)$coeff[2,4],3)), col = "red")

#2nd on t^2 statitstic's scale
x.grid = seq(0, 10, 0.05) #we need this to define the plotting region
plot(x.grid, dchisq( x.grid, df = 1 ), lty = 2, lwd = 2, t = "l",
     xlab = expression(( hat(beta)/SE)^2 ), ylab = "density", main = "NULL DISTR of t^2") #null distribution of t^2-stat.
t2.stat = summary(lm.fit)$coeff[2,3]^2 #t^2-statistic: (Estimate/SE)^2
points(t2.stat, 0, pch = 19, cex = 1.5, col = "red")
segments(t2.stat, 0, t2.stat, dchisq(t2.stat, df = 1), col = "red")
text(2.5, 0.25, paste0("P=", signif(summary(lm.fit)$coeff[2,4],3)), col = "red")
legend("topright", pch = 19, col = "red", leg = "observed" )

z = summary(lm.fit)$coeff[2,3] #t-statistic also called z-score under Normal approximation
pnorm(-abs(z), 0, 1, lower = T) + pnorm(abs(z), 0, 1, lower = F) #P-value from N(0,1): left + right tail
pchisq(z^2, df = 1, lower = F) #P-value from chi-square is the upper tail 

```


#### Distribution of P-values 


```{r, include = FALSE, fig.height=4}
#Block 2

set.seed(39)
n = 100 #individuals
p = 1000 #variants measured on each individual
f = 0.4 #MAF is assumed the same for all variants; doesn't actually matter here
X = matrix(rbinom(n*p, 2, f), nrow = n, ncol = p) #just random genotypes 
y = rnorm(n) #phenotype that is not associated with any of genotypes

#apply lm to each column of X separately and collect results for genotype (row 2 of coeff)
lm.res = apply(X, 2 , function(x) summary(lm(y ~ x))$coeff[2,])
#result has 4 rows: beta, SE, t-stat and pval
pval = lm.res[4,] #pick pvalues

par(mfrow=c(1,2))
plot(density(lm.res[3,]), sub = "", xlab = "t-stat", main = "", lwd = 2) #should be t with n-2 df
x.seq = seq(-4,4,0.1) #x-coordinates for plotting
lines(x.seq, dt(x.seq, df = n-2), col = "blue", lty = 2) #t distribution in blue
lines(x.seq, dnorm(x.seq), col = "red", lty = 3) #normal distribution in red
hist(pval, breaks = 10, xlab = "P-value", main = "", col = "limegreen") #should be uniformly distributed
par(mfrow=c(1,2)) #Let's make qqplots for t-stats and for P-values
qqnorm(lm.res[3,]) #t with ~100 df should be close to normal (see densities above), hence qqnorm
qqline(lm.res[3,], col = "red")  

#For P-values, we want to compare to the Uniform(0,1) distribution:
#We use ppoints(p) to get  
#p equally spaced values in (0,1) to represent quantiles of Uniform(0,1).
#we take -log10 transformation to see the small P-values particularly well 
qqplot(-log10(ppoints(p)),-log10(pval), xlab = "theoretical", 
       ylab = "obs'd", main = "Q-Q Plot for -log10 Pval") 
abline(0, 1, col = "red")
```

1. Show the distributions of -log10-Pvalues of the null variants
and of the variants that have effects. Are there overlap?
Is this a realistic example considering GWAS?


```{r, include = FALSE, fig.height=4}
#Block 3
set.seed(49)
n = 1000 #individuals
p = 1000 #genotypes measured on each individual
m = 50 #number of variants that have an effect: they are x_1,...,x_m.
f = 0.4 #MAF
b = 0.5 #effect size of variants that have an effect
X = matrix(rbinom(n*p, 2, f), nrow = n, ncol = p) #just random genotypes at SNPs
y = X[,1:m] %*% rep(b,m) + rnorm(n) #phenotype that is associated with x_1,...,x_m

#apply lm to each column of X separately 
lm.res = apply(X, 2 , function(x) summary(lm(y ~ x))$coeff[2,])
#has 4 rows: beta, SE, t-stat and pval
pval = lm.res[4,] 

par(mfrow=c(1,2))
plot(density(lm.res[3,]), sub = "", xlab = "t-stat", main = "", lwd = 2) #under null is t with n-2 df
lines(seq(-4,4,0.1), dnorm(seq(-4,4,0.1)), col = "red", lty = 3) #normal distribution in red
hist(pval, breaks = 10, xlab = "P-value", main = "", col="skyblue") #under null is uniformly distributed
par(mfrow = c(1,2)) #Let's make qqplots for t-stats and for P-values
qqnorm(lm.res[3,])
qqline(lm.res[3,], col = "red") 
qqplot(-log10(ppoints(p)), -log10(pval), xlab = "theoretical", ylab = "obs'd",
       main = "Q-Q Plot for -log10 Pval") 
abline(0, 1, col = "red")
```