#Examples accompanying paper #Pirinen M, Donnelly P, Spencer C (2012) #"Including known covariates can reduce power to detect genetic effects in case-control studies." #Nat Genet ?? #Written by #Matti Pirinen, February 2012 #matti.pirinen@iki.fi # Three functions for the end user, provided in the file "log_regression_covariate_functions.R" # For the main computations: # binary.covariate, for a binary covariate # gaussian.covariate, for a gaussian covariate # For Checking: # analytic.var.ratio.normal, for testing the variance calculations in the gaussian case (see Supp Fig 7 below) ##################################################################################################### # binary.covariate(K,freq.G,or.G,freq.X,or.X,ncases,ncontrols,population.controls=FALSE) # #INPUT # #K, the (target) prevalence of the disease #freq.G, frequency of risk allele in general population #or.G, odds-ratio for each copy of the risk allele #freq.X, frequency of the risk variant of binary exposure #or.X, odds-ratio of X #ncases, number of cases in the case-control sample (needed for standard error computation) #ncontrols, number of controls in the case-control sample (needed for standard error computation) #population.controls, logical, if TRUE then controls have general population frequencies, otherwise # controls have proper control frequencies # # # #OUTPUT #K, the prevalence of the disease (will be close to target prevalence given as input) #ncases, number of cases for which the standard error was computed #ncontrols, number of controls for which the standard error was computed #case.freq 2x3 matrix of case frequencies, element i,j is for X=i-1, G=j-1 #control.freq 2x3 matrix of control frequencies, element i,j is for X=i-1, G=j-1 #population.controls, if TRUE, then control.freq are population frequencies #marg.or.G marginal odds-ratio for each copy of risk allele (i.e. effect of G omitting X from the model) #joint.or.G odds-ratio for each copy of risk allele when adjusted for covariate X #marg.or.X marginal odds-ratio for copy of risk exposure X=1 (i.e. effect of X omitting G from the model) #joint.or.X odds-ratio for risk exposure X=1 when adjusted for genotype G #marg.se.G standard error of marg.or.G #joint.se.G standard error of joint.or.G #NCP.GX.G ratio of the non-centrality parameters for X-adjusted G to marginal G (called SSM in the paper) # ######################################################################################################### ##################################################################################################### # gaussian.covariate(K,freq.G,or.G,or.X,ncases,ncontrols,nsimulations=1000, # population.controls=FALSE,x.lower=-6,x.upper=6,print.plots=FALSE) # #INPUT # #K, the (target) prevalence of the disease #freq.G, frequency of risk allele in general population #or.G, odds-ratio for each copy of the risk allele #or.X, odds-ratio per each std deviation of the continuous covariate X #ncases, number of cases in the case-control sample #ncontrols, number of controls in the case-control sample #nsimulations, numer of data sets simulated to get the parameter estimates #population.controls, logical, if TRUE then controls have general population frequencies, otherwise # controls have proper control frequencies #x.lower, the lower limit of the integration interval for X, on log-odds scale #x.upper, the upper limit of the integration interval for X, on log-odds scale #print.plots, logical, if true prints the densities of X in each of the six phenotype x genotype group, # and compares cdfs of those distributions to gaussians with the same mean and std # # #OUTPUT #K, the prevalence of the disease (will be close to target prevalence given as input) #ncases, number of cases for which the standard error was computed #ncontrols, number of controls for which the standard error was computed #sample.a, the value of the base-line parameter 'a' in the case-control sample #case.G.freq, the genotype frequencies in cases (0,1,2) #case.X.means, the means of X in cases for each genotype group (0,1,2) #case.X.ses, the standard deviations of X in cases for each genotype group (0,1,2) #control.G.freq, the genotype frequencies in controls (0,1,2) #control.X.means, the means of X in controls for each genotype group (0,1,2) #contrl.X.ses, the standard deviations of X in controls for each genotype group (0,1,2) #population.controls, if TRUE, then controls are from the general population #marg.or.X marginal odds-ratio of X per each stdev (i.e. the effect of X omitting G from the model) #joint.or.X odds-ratio of X per each stdev when adjusted for genotype G #marg.or.G marginal odds-ratio for each copy of risk allele (i.e. effect of G omitting X from the model) #joint.or.G odds-ratio for each copy of risk allele when adjusted for covariate X #marg.se.G standard error of marg.or.G #joint.se.G standard error of joint.or.G #NCP.GX.G ratio of the non-centrality parameters for X-adjusted G to marginal G (called SSM in the paper) # ######################################################################################################### #### ### ### Table 1 and 2 in the Main paper: ### Compute statistics for the 4 diseases mentioned in the paper ### #### #read in the functions source("log_regression_covariate_functions.R") freq.G=0.3 #risk allele frequency or.G=1.20; #genetic odds ratio # # Psoriasis with HLA-C as binary covariate X # K=0.01;freq.X=0.24;or.X=6.4;ncases=1689;ncontrols=5175; binary.covariate(K,freq.G,or.G,freq.X,or.X,ncases=2000,ncontrols=2000,population.controls=TRUE)->res res$NCP.GX.G #SSM in Table 1 of the Main paper binary.covariate(K,freq.G,or.G,freq.X,or.X,ncases,ncontrols,population.controls=TRUE)->res (res$marg.se.G/res$joint.se.G)^2 #var ratio reported in Table 2 # #Ankylosing Spondylitis with HLA-B27 as binary covariate X # K=0.0025;freq.X=0.08;or.X=49;ncases=1788;ncontrols=4812; binary.covariate(K,freq.G,or.G,freq.X,or.X,ncases=2000,ncontrols=2000,population.controls=TRUE)->res res$NCP.GX.G #SSM in Table 1 of the Main paper binary.covariate(K,freq.G,or.G,freq.X,or.X,ncases,ncontrols,population.controls=TRUE)->res (res$marg.se.G/res$joint.se.G)^2 #var ratio reported in Table 2 # #Multiple sclerosis with female as binary covariate X # K=0.001;freq.X=0.5;or.X=2.3;ncases=1854;ncontrols=5175; binary.covariate(K,freq.G,or.G,freq.X,or.X,ncases=2000,ncontrols=2000,population.controls=TRUE)->res res$NCP.GX.G #SSM in Table 1 of the Main paper binary.covariate(K,freq.G,or.G,freq.X,or.X,ncases,ncontrols,population.controls=TRUE)->res (res$marg.se.G/res$joint.se.G)^2 #var ratio reported in Table 2 # #Migraine with female as binary covariate X # K=0.2;freq.X=0.5;or.X=4;ncases=2000;ncontrols=2000; binary.covariate(K,freq.G,or.G,freq.X,or.X,ncases,ncontrols,population.controls=FALSE)->res res$NCP.GX.G #SSM in Table 1 of the Main paper #### ### ### Plot Panel d. of Figure 1 in the Main paper ### #### source("log_regression_covariate_functions.R") Ks=c(0.001,0.005,seq(0.01,0.5,by=0.01)) or.Xs=c(2,5,10,50) freq.G=0.3 freq.X=0.5 or.G=1.2 ncases=2000 ncontrols=2000 nsets=length(Ks)*length(or.Xs) stats=data.frame(matrix(NA,nrow=nsets,ncol=8)) names(stats)=c("K","marg.or.G","joint.or.G","marg.or.X","joint.or.X","marg.se.G","joint.se.G","NCP.GX.G") set=1 for(or.X in or.Xs){ for(K in Ks){ binary.covariate(K,freq.G,or.G,freq.X,or.X,ncases,ncontrols)->res stats[set,]=res[c("K","marg.or.G","joint.or.G","marg.or.X","joint.or.X","marg.se.G","joint.se.G","NCP.GX.G")] set=set+1 } } ltys=c(2,3,4,5) lwd=1.8 ticks=c(0.001,0.01,0.05,0.1,0.2,0.5) cex.axis=1.5 cex.lab=1.8 cex.main=2 par(mar=c(5,6,3,2),xaxs="i",yaxs="i") cex.legend=1.8 cex.text=1.3 plot(c(0,0),xlim=c(log10(9e-4),log10(0.6)),ylim=c(0.6,1.6),col="white",ylab=paste("SSM"),xlab="K",xaxt="n",main="d",cex.lab=cex.lab,cex.axis=cex.axis,cex.main=cex.main) axis(1,at=log10(ticks),labels=ticks,cex.lab=cex.lab,cex.axis=cex.axis) i=1 for(or.X in or.Xs){ ind=which(abs(stats[,"joint.or.X"]-or.X)<0.5) y=stats[ind,"NCP.GX.G"] lines(log10(stats[ind,"K"]),y,lty=ltys[i],lwd=lwd) i=i+1 } abline(h=1.0,lty=1) legend("topleft",legend=or.Xs,lty=ltys,cex=cex.legend,lwd=lwd) abline(v=log10(ticks),col = "lightgray", lty = "dotted") abline(h=c(0.8,1.0,1.2,1.4),col = "lightgray", lty = "dotted") ###### ###### Panel d of Main Figure 1 ENDS. ###### ###### ###### ###### PLOTTING SUPPLEMENTARY FIGURES 1,2,3,4,5,6 and 7 ###### ###### ###### ###### Supp Figures 1,2,3 ###### source("log_regression_covariate_functions.R") Ks=c(0.001,0.005,seq(0.01,0.09,by=0.01),seq(0.1,0.95,by=0.05)) or.Xs=c(2,5,10,50) freq.G=0.3 or.G=1.2 ncases=2000 ncontrols=2000 nsets=length(Ks)*length(or.Xs) for(freq.X in c(0.2,0.5,0.8)){ stats=data.frame(matrix(NA,nrow=nsets,ncol=8)) names(stats)=c("K","marg.or.G","joint.or.G","marg.or.X","joint.or.X","marg.se.G","joint.se.G","NCP.GX.G") set=1 for(or.X in or.Xs){ for(K in Ks){ binary.covariate(K,freq.G,or.G,freq.X,or.X,ncases,ncontrols,FALSE)->res stats[set,]=res[c("K","marg.or.G","joint.or.G","marg.or.X","joint.or.X","marg.se.G","joint.se.G","NCP.GX.G")] set=set+1 } } #postscript(paste("fg0.3_fx",freq.X,"_orG_1.2_S2000_R2000.eps",sep=""),width=15,height=10,pointsize=10) par(mfrow=c(2,2)) par(cex.axis=1.3,cex.lab=1.3,mar=c(5,5,4,2)+0.1) ltys=c(2,3,4,5) lwd=1.5 ticks=c(0.001,0.01,0.05,0.1,0.2,0.5,1) plot(c(0,0),xlim=c(log10(9e-4),log10(1.1)),ylim=c(0.4,1),col="white",ylab=paste("b*/b"),xaxt="n",xlab="",main="A. Magnitudes") axis(1,at=log10(ticks),labels=ticks) i=1 for(or.X in or.Xs){ ind=((i-1)*length(Ks)+1):(i*length(Ks)) y=log(stats[ind,"marg.or.G"])/log(stats[ind,"joint.or.G"]) lines(log10(stats[ind,"K"]),y,lty=ltys[i],lwd=lwd) i=i+1 } legend("bottomleft",legend=or.Xs,lty=ltys,cex=1.3,lwd=lwd) plot(c(0,0),xlim=c(log10(9e-4),log10(1.1)),ylim=c(0.4,1),col="white",ylab=expression(paste("var(",hat(b),"*) / var(",hat(b),")")),xaxt="n",xlab="",main="B. Variances") axis(1,at=log10(ticks),labels=ticks) i=1 for(or.X in or.Xs){ ind=((i-1)*length(Ks)+1):(i*length(Ks)) y=(stats[ind,"marg.se.G"]/stats[ind,"joint.se.G"])^2 lines(log10(stats[ind,"K"]),y,lty=ltys[i],lwd=lwd) i=i+1 } plot(c(0,0),xlim=c(log10(9e-4),log10(1.1)),ylim=c(0.5,2.5),col="white",ylab=paste("SSM"),xlab="K",xaxt="n",main="C. SSMs, proper controls") axis(1,at=log10(ticks),labels=ticks) i=1 for(or.X in or.Xs){ ind=((i-1)*length(Ks)+1):(i*length(Ks)) y=stats[ind,"NCP.GX.G"] lines(log10(stats[ind,"K"]),y,lty=ltys[i],lwd=lwd) i=i+1 } abline(h=1.0,lty=1) legend("topleft",legend=or.Xs,lty=ltys,cex=1.3,lwd=lwd) #compute SSMs for population controls stats=data.frame(matrix(NA,nrow=nsets,ncol=8)) names(stats)=c("K","marg.or.G","joint.or.G","marg.or.X","joint.or.X","marg.se.G","joint.se.G","NCP.GX.G") set=1 for(or.X in or.Xs){ for(K in Ks){ binary.covariate(K,freq.G,or.G,freq.X,or.X,ncases,ncontrols,TRUE)->res stats[set,]=res[c("K","marg.or.G","joint.or.G","marg.or.X","joint.or.X","marg.se.G","joint.se.G","NCP.GX.G")] set=set+1 } } plot(c(0,0),xlim=c(log10(9e-4),log10(1.1)),ylim=c(0.5,2.5),col="white",ylab=paste("SSM"),xlab="K",xaxt="n",main="D. SSMs, population controls") axis(1,at=log10(ticks),labels=ticks) i=1 for(or.X in or.Xs){ ind=((i-1)*length(Ks)+1):(i*length(Ks)) y=stats[ind,"NCP.GX.G"] lines(log10(stats[ind,"K"]),y,lty=ltys[i],lwd=lwd) i=i+1 } abline(h=1.0,lty=1) #dev.off() } ###### ###### Supp Fig 1,2,3 ENDS ###### ###### ###### Supplementary Figure 4. ###### source("log_regression_covariate_functions.R") Ks=c(0.001,0.005,seq(0.01,0.09,by=0.01),seq(0.1,0.95,by=0.05)) or.Xs=c(2,5,10,50) or.Gs=c(1.2,1.5,2.0) fXs=c(0.05,0.25,0.5,0.75,0.95) fGs=c(0.05,0.25,0.5,0.75,0.95) nsets=length(Ks)*length(or.Xs)*length(or.Gs)*length(fXs)*length(fGs) #Runs this set once and stores in a file for further use #This takes a while... stats=data.frame(matrix(NA,nrow=nsets,ncol=10)) names(stats)=c("fX","fG","K","marg.or.G","joint.or.G","marg.or.X","joint.or.X","marg.se.G","joint.se.G","NCP.GX.G") ncases=2000 ncontrols=2000 set=0 for(fX in fXs){ for(fG in fGs){ print(paste("fX=",fX,"fG=",fG)) for(or.G in or.Gs){ for(or.X in or.Xs){ for(K in Ks){ set=set+1 binary.covariate(K,fG,or.G,fX,or.X,ncases,ncontrols,FALSE)->res stats[set,]=c(fX,fG,res[c("K","marg.or.G","joint.or.G","marg.or.X","marg.or.X","marg.se.G","marg.se.G","NCP.GX.G")]) } } } } } #use this to save the results save(stats,file="supp_figure4.res",ascii=FALSE) #use this to read in the results from the file load("supp_figure4.res") Ks=c(0.001,0.005,seq(0.01,0.09,by=0.01),seq(0.1,0.95,by=0.05)) or.Xs=c(2,5,10,50) or.Gs=c(1.2) fXs=c(0.05,0.25,0.5,0.75,0.95) fGs=c(0.05,0.5,0.95) nsets=length(Ks)*length(or.Xs)*length(or.Gs)*length(fXs)*length(fGs) ltys=c(2,3,4,5) lwd=1.5 set=1 switching=rep(NA,nsets/length(Ks)) panel=0 #postscript("supp_figure4.eps",width=8,height=19,pointsize=10) par(cex.axis=1.3,cex.lab=1.3,mar=c(1.5,1.5,1,1)+0.1,oma=c(3,3,1,1)) par(mfrow=c(5,3)) for(fX in fXs){ for(fG in fGs){ for(or.G in c(1.2)){ panel=panel+1 if(panel>12) {xlab="K"} else {xlab="";} if(panel%%3==1) ylab="SSM" else ylab="" plot(c(0,0),xlim=c(log10(9e-4),log10(1.1)),ylim=c(0.5,2.5),col="white",ylab=ylab,xlab=xlab,xaxt="n",yaxt="n",main="") if(panel>12) axis(1,at=log10(c(0.001,0.01,0.05,0.1,0.2,0.5,1.0)),labels=c(0.001,0.01,0.05,0.1,0.2,0.5,1.0)) else axis(1,at=log10(c(0.001,0.01,0.05,0.1,0.2,0.5,1.0)),labels=FALSE) if(panel%%3==1) axis(2,at=c(0.5,1.0,2.0),labels=c(0.5,1.0,2.0)) else axis(2,at=c(0.5,1.0,2.0),labels=FALSE) abline(h=1.0,lty=1) i=1 for(or.X in or.Xs){ ind=which(abs(stats[,"joint.or.X"]-or.X)<1.0 & abs(stats[,"joint.or.G"]-or.G)<0.02 & stats[,"fX"]==fX & stats[,"fG"]==fG) y=stats[ind,"NCP.GX.G"] switching[set]=Ks[max(which(y<1 & Ks<0.5))] lines(log10(stats[ind,"K"]),y,lty=ltys[i],lwd=lwd) i=i+1 set=set+1 } text(log10(0.003),2.0,paste("frq X=",fX,"\nfrq G=",fG,sep=""),cex=1.4) if(panel==13) legend("topright",legend=or.Xs,lty=ltys,cex=1.0,lwd=lwd) } } } title(xlab="K",outer=TRUE,line=1) title(ylab="SSM",outer=TRUE,line=1) #dev.off() #minimum of the thresholds at which model M is still more powerful than model M* min(switching) ###### ###### Supplementary Figure 4. ENDS ###### ###### ###### Supplementary Figure 5. ###### source("log_regression_covariate_functions.R") freq.G=0.3; or.G=1.2; K=0.01; or.X=50; ncases=5000 ncontrols=5000 #postscript("covariate_barplots.eps",width=6,height=6/3,paper="special",pointsize=10) i=1 par(mfrow=c(1,3)) for(freq.X in c(0.1,0.5,0.9)){ binary.covariate(K,freq.G,or.G,freq.X,or.X,ncases,ncontrols,FALSE)->res barplot(matrix(c(sum(res$case.freq[2,]),sum(res$case.freq[1,]),sum(res$control.freq[2,]) ,sum(res$control.freq[1,])),ncol=2,byrow=FALSE),names.arg=c("cases","controls"),main=paste ("frq X=",freq.X,sep=""),sub=paste("var ratio=",signif((res$marg.se.G/res$joint.se.G)^2,3)),yaxt="n ",cex.sub=1.2,cex.main=1.2,cex.lab=1.2,cex.names=1.2) if(i==1) axis(2) i=i+1 } #dev.off() ###### ###### Supplementary Figure 5. ENDS ###### ###### ###### Supplementary Figure 6. ###### source("log_regression_covariate_functions.R") freq.G=0.3 or.G=1.2 freq.Xs=c(0.2,0.5,0.8) or.X=10 ncases=2000 Ks=c(0.001,0.01,0.05,0.2,0.5) mult=1:20 stats=matrix(NA,nrow=length(mult),ncol=length(Ks)*length(freq.Xs)) i=1 for(freq.X in freq.Xs){ for(K in Ks){ print(c(K,freq.X)) for(m in mult){ binary.covariate(K,freq.G,or.G,freq.X,or.X,ncases,m*ncases,FALSE)->res stats[m,i]=res$NCP.GX.G } i=i+1 } } lwd=1.2 ltys=c(2,3,4,5,6) #postscript("control_case_ratio.eps",width=6,height=2.5,paper="special",pointsize=9) par(mfrow=c(1,3),mar=c(5,4,3,1)+0.1) j=0 for(freq.X in freq.Xs){ if(j==0){ylab="SSM";}else{ylab="";par(mar=c(5,4,3,1)+0.1)} plot(c(0,0),xlim=c(1,mult[length(mult)]),ylim=c(0.6,1.8),col="white",ylab=ylab,xlab="|controls| / |cases|",main=paste("frq X=",freq.X,sep=""),xaxt="n",cex.sub=1.2,cex.main=1.2,cex.lab=1.2,cex.axis=1.2) axis(1,at=c(1,5,10,15,20),labels=c(1,5,10,15,20),cex=1.2) i=1 for(K in Ks){ lines(mult,stats[,j*length(Ks)+i],lty=ltys[i],lwd=lwd) i=i+1 } abline(h=1.0,lty=1) j=j+1 } legend("topleft",legend=rev(Ks),lty=rev(ltys),cex=1.1,lwd=lwd) #dev.off() ###### ###### Supplementary Figure 6. ENDS ###### ###### ###### Supplementary Figure 7. ###### source("log_regression_covariate_functions.R") nsimulations=1000 ncases=1000 ncontrols=1000 Ks=c(0.001,0.005,0.01,0.025,0.05,0.075,seq(0.1,0.95,by=0.05)) or.Xs=c(2,4,6) fGs=c(0.05,0.5,0.95) or.Gs=c(1.2) #run this set once and save the results in a file #This takes a while... nsets=length(Ks)*length(or.Xs)*length(fGs)*length(or.Gs) set=1 stats=data.frame(matrix(NA,nrow=nsets,ncol=10)) names(stats)=c("fG","or.G","or.X","K","marg.or.G","joint.or.G","marg.se.G","joint.se.G","NCP.GX.G","analytic.var.ratio") for(fG in fGs){ print(paste("fG=",fG,sep="")) for(or.G in or.Gs){ for(or.X in or.Xs){ for(K in Ks){ gaussian.covariate(K,fG,or.G,or.X,ncases,ncontrols,nsimulations)->res v=analytic.var.ratio.normal(a=res$sample.a,c=log(or.X),res$ncases/(res$ncases+res$ncontrols),res$case.G.freq,res$case.X.means,res$case.X.ses,res$control.G.freq,res$control.X.means,res$control.X.ses) stats[set,]=c(fG,or.G,or.X,res[c("K","marg.or.G","joint.or.G","marg.se.G","joint.se.G","NCP.GX.G")],v) set=set+1 } } } } #Use this to save the results for further use save(stats,file="supp_figure7.res",ascii=FALSE) #And this to load them load("supp_figure7.res") or.Xs=c(2,4,6) fG=0.5 or.G=1.2 cex.legend=1.5 cex.text=1.2 ltys=c(2,3,4) lwd=1.5 #postscript("continuous_SSMs.eps",width=10,height=10/2,paper="special",pointsize=10) par(cex.axis=1.3,cex.lab=1.3) par(mfrow=c(1,2)) plot(c(0,0),xlim=c(log10(9e-4),log10(1.1)),ylim=c(0.5,1.5),col="white",ylab="SSM",xlab="K",main="",xaxt="n",yaxt="n") axis(1,at=log10(c(0.001,0.01,0.05,0.1,0.2,0.5,1)),labels=c(0.001,0.01,0.05,0.1,0.2,0.5,1)) axis(2,at=c(0.6,0.8,1.0,1.2,1.4),labels=c(0.6,0.8,1.0,1.2,1.4)) abline(h=1.0,lty=1) i=1 for(or.X in or.Xs){ ind=which(stats[,"fG"]==fG & stats[,"or.G"]==or.G & stats[,"or.X"]==or.X) y=stats[ind,"NCP.GX.G"] lines(log10(stats[ind,"K"]),y,lty=ltys[i],lwd=lwd) i=i+1 } text(log10(0.2),0.6,paste("frq G=",fG,sep=""),cex=cex.text) legend("topleft",legend=or.Xs,lty=ltys,cex=cex.legend,lwd=lwd) #check how analytic computations of var ratio match simulations plot(stats[,"marg.se.G"]/stats[,"joint.se.G"],sqrt(stats[,"analytic.var.ratio"]),xlab="SE ratio, simulations",ylab="SE ratio, analytic") abline(0,1) #dev.off() ###### ###### Supplementary Figure 7. ENDS ######