#This file contains functions to assess the behavior of
#logistic regression models for case-control GWAS when a covariate is available.
#The file 'log_regression_covariate_examples.R' explains how to use these functions.


#For theoretical details, see
#"Including known covariates can reduce power to detect genetic effects in case-control studies."
#By Matti Pirinen, Peter Donnelly and Chris Spencer
#Nat Genet (2012)

#Written by
#Matti Pirinen, February 2012
#matti.pirinen@iki.fi




#
##
###
######## BINARY COVARIATE #########
###
##
#


model.1.loglkhood<-function(par,p,q,phi){
  #negative log-likelihood function for logistic regression with
  #base-line parameter (par[1]) and
  #additive effect for genotype (par[2])
  #loglikelihood is evaluated at the expected counts
  #in cases defined by 'p' and in controls defined by 'q'
  #and with the case proportion 'phi'
  
  
  #INPUT
  #par vector of length 2 giving the argument for the log-likelihood
  #p genotype frequencies in cases (vector of length 3)
  #q genotype frequencies in controls (vector of length 3)
  #phi proportion of cases in the sample
  
  #OUTPUT
  #- log-likelihood at the given argument 
  N=100000 #assuming sample size N=100000 as only the shape matters here, KEEP SAME AS IN gradient function below
  stopifnot(abs(1-sum(p))<1e-9 & abs(1-sum(q))<1e-9 )
  stopifnot(phi>0 & phi <1)
  stopifnot(length(par)==2)
  A=par[1]+c(0,1,2)*par[2] #log-odds for the three genotypes
  -N*(phi*sum(p*log(1/(1+exp(-A))))+(1-phi)*sum(q*log(1/(1+exp(A)))))
}

model.1.gradient<-function(par,p,q,phi){
  #gradient of the negative loglkhood function
  #INPUT see loglkhood
  #OUTPUT two dimensional gradient vector
  N=100000 #KEEP SAME AS IN the loglkhood function above
  stopifnot(abs(1-sum(p))<1e-9 & abs(1-sum(q))<1e-9 )
  stopifnot(phi>0 & phi <1)
  stopifnot(length(par)==2)
  A=par[1]+c(0,1,2)*par[2]
  -N*c(sum((phi*p-(1-phi)*q*exp(A))/(1+exp(A))),
     sum(c(0,1,2)*(phi*p-(1-phi)*q*exp(A))/(1+exp(A))))
}


model.2.loglkhood<-function(par,p,q,phi){
  #negative log-likelihood function for logistic regression with
  #base-line parameter (par[1]) and
  #additive effect for genotype (par[2]) and
  #additive effect for binary covariate (par[3])
  #loglikelihood is evaluated at the expected counts
  #in cases defined by 2x3 matrix 'p' and in controls defined by 2x3 matrix 'q'
  #and with the case proportion 'phi'
  
  
  #INPUT
  #par vector of length 3 giving the argument for the log-likelihood
  #p genotype x covariate frequencies in cases (matrix of dimension 2x3)
  #q genotype x covariate frequencies in controls (matrix of dimension 2x3)
  #phi proportion of cases in the sample
  
  #OUTPUT
  #- log-likelihood at the given argument 
  N=100000 #assuming sample size N=100000 as only the shape matters here, KEEP SAME AS IN gradient function below
  stopifnot(sum(dim(p)==c(2,3))==2)
  stopifnot(sum(dim(q)==c(2,3))==2)
  stopifnot(abs(1-sum(p))<1e-9 & abs(1-sum(q))<1e-9 )
  stopifnot(phi>0 & phi <1)
  A=par[1]+matrix(c(0,par[2],2*par[2],par[3],par[2]+par[3],2*par[2]+par[3]),byrow=TRUE,nrow=2)
  -N*(phi*sum(p*log(1/(1+exp(-A))))+(1-phi)*sum(q*log(1/(1+exp(A)))))
}

model.2.gradient<-function(par,p,q,phi){
  #gradient of the loglkhood function
  #INPUT see loglkhood
  #OUTPUT two dimensional gradient vector
  N=100000 #keep same as in loglkhood function above
  stopifnot(sum(dim(p)==c(2,3))==2)
  stopifnot(sum(dim(q)==c(2,3))==2)
  stopifnot(abs(1-sum(p))<1e-9 & abs(1-sum(q))<1e-9 )
  stopifnot(phi>0 & phi <1)
  A=par[1]+matrix(c(0,par[2],2*par[2],par[3],par[2]+par[3],2*par[2]+par[3]),byrow=TRUE,nrow=2)
  G=matrix(c(0,1,2,0,1,2),byrow=TRUE,nrow=2)
  X=matrix(c(0,0,0,1,1,1),byrow=TRUE,nrow=2)
  -N*c(sum((phi*p-(1-phi)*q*exp(A))/(1+exp(A))),
       sum(G*(phi*p-(1-phi)*q*exp(A))/(1+exp(A))),
       sum(X*(phi*p-(1-phi)*q*exp(A))/(1+exp(A))))
}



binary.covariate<-function(K,freq.G,or.G,freq.X,or.X,ncases,ncontrols,population.controls=FALSE){
  #Matti Pirinen February 2012

  #Finds population parameters for a given set of disease parameters.

  #INPUT
  #K, the (target) prevalence of the disease
  #freq.G, frequency of risk allele in general population
  #or.G, odds-ratio for each copy of the risk allele 
  #freq.X, frequency of the risk variant of binary exposure
  #or.X, odds-ratio of X
  #ncases, number of cases in the case-control sample (needed for variance computation)
  #ncontrols, number of controls in the case-control sample (needed for variance computation)
  #population.controls, boolean, if TRUE then controls have general population frequencies, otherwise
  #                                           controls have proper control frequencies
  
  #OUTPUT
  #K, the prevalence of the disease (will be close to target prevalence given as input)
  #ncases, number of cases for which the standard error was computed
  #ncontrols, number of controls for which the standard error was computed
  #pop.a, logistic regression base-line parameter in the population (NOT in the case-control sample!) 
  #case.freq 2x3 matrix of case frequencies, element i,j is for X=i-1, G=j-1
  #control.freq 2x3 matrix of control frequencies, element i,j is for X=i-1, G=j-1
  #population.controls, if TRUE, then control.freq are population frequencies
  #marg.or.G marginal odds-ratio for each copy of risk allele (i.e. effect of G omitting X from the model)
  #joint.or.G odds-ratio for each copy of risk allele when adjusted for covariate X
  #marg.or.X marginal odds-ratio for copy of risk exposure X=1 (i.e. effect of X omitting G from the model)
  #joint.or.X odds-ratio for risk exposure X=1 when adjusted for genotype G
  #marg.se.G standard error of marg.or.G
  #joint.se.G standard error of joint.or.G
  #NCP.GX.G ratio of non-centrality parameters for X-adjusted G to marginal G (called SSM in the paper)

  stopifnot(K>0 & K<1)
  stopifnot(freq.G>0 & freq.G<1)
  stopifnot(freq.X>0 & freq.X<1)
  stopifnot(or.G>0 & or.X>0)
  stopifnot(ncases>0 & ncontrols>0)
  stopifnot(is.logical(population.controls))
  
  fx=freq.X
  fg=freq.G
  #there are six classes of individuals (X=0,1 x G=0,1,2)
  #the frequencies of classes (0,0),(0,1),(0,2),(1,0),(1,1),(1,2) in the general population are
  pop.freq=c((1-fx)*(1-fg)^2,(1-fx)*2*fg*(1-fg),(1-fx)*fg^2,fx*(1-fg)^2,fx*2*fg*(1-fg),fx*fg^2)
  #the additive contribution to the log-odds of having the disease for these classes are
  a=c(0,log(or.G),2*log(or.G),log(or.X),log(or.X)+log(or.G),log(or.X)+2*log(or.G))

  #choose base-line log-odds alpha in such a way that the population prevalence of the disease is approx. K
  alpha.found=0;iter=1;
  alpha=seq(-30,10,by=0.001)
  while(!alpha.found & iter<100){
    K.table=rep(NA,length(alpha))
    for(i in 1:length(alpha)){K.table[i]=sum(1/(1+exp(-(alpha[i]+a)))*pop.freq)} #this is prevalence for alpha[i]
    i=which.min(abs(K-K.table))
    if(abs(K-K.table[i])<0.01*K){alpha.found=1;K=K.table[i];alpha=alpha[i];} #you can choose relative error tolerance, here 0.01
    if(!alpha.found){
      if(i==1) alpha=seq(-30-iter*40,10-iter*40,by=0.001)
      if(i==length(alpha)) alpha=seq(-30+iter*40,10+iter*40,by=0.001)
      if(i>1 && i<length(alpha)) alpha=seq(alpha[i-1],alpha[i+1],length.out=10000)
    }
    iter=iter+1;
  }
  stopifnot(alpha.found==1)
  #Note K is not anymore exactly the K given as parameter to this function, but close to it
  #print(paste("Found a=",alpha," corresponding to K=",K,sep=""))
  
  p=matrix(1/(1+exp(-(alpha+a)))*pop.freq/K,ncol=3,byrow=TRUE) #case frequencies as 2x3 matrix
  q=matrix(1/(1+exp(alpha+a))*pop.freq/(1-K),ncol=3,byrow=TRUE) #control frequencies as 2x3 matrix
  pop.freq=matrix(pop.freq,ncol=3,byrow=TRUE) #change pop.freq to 2x3 matrix
  #check the frequencies
  stopifnot(sum(abs(pop.freq-K*p-(1-K)*q))<1e-9) #general population is a mixture of cases and controls with prevalence K
  stopifnot(abs(sum(p)-1)<1e-9 & abs(sum(q)-1)<1e-9) #p and q sum to 1
   
  #each copy of risk allele increases odds by or.G:
  stopifnot(sum(abs(or.G-c(p[1,2]/p[1,1]*q[1,1]/q[1,2],p[1,3]/p[1,2]*q[1,2]/q[1,3],p[2,2]/p[2,1]*q[2,1]/q[2,2],p[2,3]/p[2,2]*q[2,2]/q[2,3])))<1e-9)
  #covariate X increases odds by or.X:
  stopifnot(sum(abs(or.X-c(p[2,1]/p[1,1]*q[1,1]/q[2,1],p[2,2]/p[1,2]*q[1,2]/q[2,2],p[2,3]/p[1,3]*q[1,3]/q[2,3])))<1e-9)

  N=ncases+ncontrols
  #phi = proportion of cases
  phi=ncases/N

  if(population.controls){q=pop.freq;}#use population freqs for controls instead of proper control freqs
  
  #joint odds-ratio for G and X as maximum likelihood estimate from the corresponding logistic regression model
  #these will be the same as given as parameters IF proper controls are used i.e. population.controls=FALSE
  optim(c(alpha,log(or.G),log(or.X)),fn=model.2.loglkhood,gr=model.2.gradient,p,q,phi,method='BFGS')->optim.res
  stopifnot(optim.res$convergence==0)
  joint.or.G=exp(optim.res$par[2])
  joint.or.X=exp(optim.res$par[3])
  
  #marginal odds-ratio for X
  marg.or.X=sum(p[2,])/sum(q[2,])*sum(q[1,])/sum(p[1,]) 
 
  #marginal odds-ratio for G as maximum likelihood estimate from the corresponding logistic regression model:
  optim(c(alpha,log(or.G)),fn=model.1.loglkhood,gr=model.1.gradient,colSums(p),colSums(q),phi,method='BFGS')->optim.res
  stopifnot(optim.res$convergence==0)
  marg.or.G=exp(optim.res$par[2])

  #phi.0 = proportion of cases given X=0
  phi.0=sum(p[1,])*phi/(sum(q[1,])*(1-phi)+sum(p[1,])*phi)
  #phi.1 = proportion of cases given X=1
  phi.1=sum(p[2,])*phi/(sum(q[2,])*(1-phi)+sum(p[2,])*phi)

  #assuming small genetic effects (see Genet Epidemiol 35:278-290)
  #asymptotic variance is 1/(N*var(genotype)*phi*(1-phi)) when omitting X 

  t=phi*p+(1-phi)*q #frequencies in the case-control sample
  var.geno=sum(t[,2])+4*sum(t[,3])-(sum(t[,2])+2*sum(t[,3]))^2
  var.G=1/(N*var.geno*phi*(1-phi))	

  #and 0.5*harmonic mean of the subgroup variances when stratifying by X

  N.0=ncases*sum(p[1,])+ncontrols*sum(q[1,])
  N.1=N-N.0
  var.geno.0=(t[1,2]+4*t[1,3])/sum(t[1,])-((t[1,2]+2*t[1,3])/sum(t[1,]))^2
  var.geno.1=(t[2,2]+4*t[2,3])/sum(t[2,])-((t[2,2]+2*t[2,3])/sum(t[2,]))^2
  var.GX=1/(N.0*phi.0*(1-phi.0)*var.geno.0+N.1*phi.1*(1-phi.1)*var.geno.1)
 
  NCP.ratio=log(joint.or.G)^2/var.GX/log(marg.or.G)^2*var.G

  return(list(K=K,ncases=ncases,ncontrols=ncontrols,pop.a=alpha,case.freq=p,control.freq=q,population.controls=population.controls,
              marg.or.G=marg.or.G,joint.or.G=joint.or.G,marg.or.X=marg.or.X,joint.or.X=joint.or.X,marg.se.G=sqrt(var.G),joint.se.G=sqrt(var.GX),NCP.GX.G=NCP.ratio))
}









#
##
###
######### CONTINUOUS COVARIATE
###
##
#







K.integrand<-function(x,a,b,c,fg){
  #evaluates the following function at point x
  #sum_g=0,1,2 {exp(a+b*g+c*x)/(1+exp(a+b*g+c*x))}*pi(g)*dnorm(x),
  #where pi=((1-fg)^2,2*fg*(1-fg),fg^2) is the vector of genotype frequencies in H-W equilibrium
  
  #INPUT
  #x, point at which function is evaluated
  #a,b,c parameters of the logistic regression model
  #fg, frequency of allele 1
  stopifnot(fg>0  & fg<1)
  return(((1-fg)^2/(1+exp(-a-c*x))+2*fg*(1-fg)/(1+exp(-a-b-c*x))+fg^2/(1+exp(-a-2*b-c*x)))*dnorm(x))
}

G.integrand<-function(x,a,b,c,g,case){
  #evaluates
  #dnorm(x)*exp(a+b*g+c*x)/(1+exp(a+b*g+c*x)), if case==1
  #dnorm(x)/(1+exp(a+b*g+c*x)), if case==0
  stopifnot(case==0 || case==1)
  return( ( 1-case+(-1)^(1-case)/(1+exp(-a-b*g-c*x)) )*dnorm(x) )
}

Gx.integrand<-function(x,a,b,c,g,case){
  #as G.integrand but mutiplied by x
  return( ( 1-case+(-1)^(1-case)/(1+exp(-a-b*g-c*x)) )*dnorm(x)*x )}

Gx2.integrand<-function(x,a,b,c,g,case){
  #as G.integrand but mutiplied by x^2
  return( ( 1-case+(-1)^(1-case)/(1+exp(-a-b*g-c*x)) )*dnorm(x)*x^2 )}





gaussian.covariate<-function(K,freq.G,or.G,or.X,ncases,ncontrols,nsimulations=1000,population.controls=FALSE,x.lower=-6,x.upper=6,print.plots=FALSE){

#Matti Pirinen, February 2012

  
#INPUT
#
#K, the (target) prevalence of the disease
#freq.G, frequency of risk allele in general population
#or.G, odds-ratio for each copy of the risk allele 
#or.X, odds-ratio per each std deviation of the continuous covariate X
#ncases, number of cases in the case-control sample 
#ncontrols, number of controls in the case-control sample 
#nsimulations, numer of data sets simulated to get the parameter estimates
#population.controls, boolean, if TRUE then controls have general population frequencies, otherwise
#                                           controls have proper control frequencies
#x.lower, the lower limit of the integration interval for X, on log-odds scale
#x.upper, the upper limit of the integration interval for X, on log-odds scale
#print.plots, boolean, if true prints the densities of X in each of the six phenotype x genotype group,
#                      and compares cdfs of those distributions to gaussians with the same mean and std 
# 
#
#OUTPUT
#K, the prevalence of the disease (will be close to target prevalence given as input)
#ncases, number of cases for which the standard error was computed
#ncontrols, number of controls for which the standard error was computed
#sample.a, the value of the base-line parameter 'a' in the case-control sample
#case.G.freq, the genotype frequencies in cases (0,1,2)
#case.X.means, the means of X in cases for each genotype group (0,1,2)
#case.X.ses, the standard deviations of X in cases for each genotype group (0,1,2)
#control.G.freq, the genotype frequencies in controls (0,1,2)
#control.X.means, the means of X in controls for each genotype group (0,1,2)
#contrl.X.ses, the standard deviations of X in controls for each genotype group (0,1,2)
#population.controls, if TRUE, then controls are from the general population 
#marg.or.X marginal odds-ratio of X per each stdev (i.e. the effect of X omitting G from the model)
#joint.or.X odds-ratio of X per each stdev when adjusted for genotype G
#marg.or.G marginal odds-ratio for each copy of risk allele (i.e. effect of G omitting X from the model)
#joint.or.G odds-ratio for each copy of risk allele when adjusted for covariate X
#marg.se.G standard error of marg.or.G
#joint.se.G standard error of joint.or.G
#NCP.GX.G ratio of the non-centrality parameters for X-adjusted G to marginal G (called SSM in the paper)
#

  stopifnot(K>0 & K<1)
  stopifnot(freq.G>0 & freq.G<1)
  stopifnot(or.G>0 & or.X>0)
  stopifnot(ncases>0 & ncontrols>0)
  stopifnot(nsimulations>0)
  stopifnot(x.lower<x.upper)
  stopifnot(is.logical(population.controls))
  stopifnot(is.logical(print.plots))

  fg=freq.G

  #choose base-line log-odds alpha in such a way that population prevalence of the disease is approx. K
  alpha.found=0;iter=1;
  alpha=seq(-30,10,by=0.001)
  while(!alpha.found & iter<10){
    K.table=rep(NA,length(alpha))
    for(i in 1:length(alpha)){
      K.table[i]=integrate(K.integrand,lower=x.lower,upper=x.upper,a=alpha[i],b=log(or.G),c=log(or.X),fg=fg)$value} #this is prevalence for alpha[i]
    i=which.min(abs(K-K.table))
    if(abs(K-K.table[i])<0.01*K){alpha.found=1;K=K.table[i];alpha=alpha[i];} #you can choose relative error tolerance, here 0.01
    if(!alpha.found){
      if(i==1) alpha=seq(-30-iter*40,10-iter*40,by=0.001)
      if(i==length(alpha)) alpha=seq(-30+iter*40,10+iter*40,by=0.001)
      if(i>1 && i<length(alpha)) alpha=seq(alpha[i-1],alpha[i+1],length.out=10000)
    }
    iter=iter+1;
  }
  stopifnot(alpha.found==1)
  print(paste("Found a=",alpha," corresponding to K=",K,sep=""))

  g.freqs=c((1-fg)^2,2*fg*(1-fg),fg^2)
  p=rep(0,3) #case genotype freqs
  q=rep(0,3) #control genotype freqs
  for(g in 0:2){
    p[g+1]=g.freqs[g+1]/K*integrate(G.integrand,lower=x.lower,upper=x.upper,a=alpha,b=log(or.G),c=log(or.X),g=g,case=1)$value;
    q[g+1]=g.freqs[g+1]/(1-K)*integrate(G.integrand,lower=x.lower,upper=x.upper,a=alpha,b=log(or.G),c=log(or.X),g=g,case=0)$value;
  }
  #print(paste(sum(p),sum(q)))
  stopifnot(abs(sum(p)-1)<1e-4 & abs(sum(q)-1)<1e-4) #p and q sum to 1
  stopifnot(sum(abs(g.freqs-K*p-(1-K)*q))<1e-4) #general population is mixture of cases and controls with prevalence K

  x=seq(x.lower,x.upper,length.out=10000)
  means=matrix(0,ncol=3,nrow=2)
  ses=means

  if(print.plots) par(mfcol=c(2,3))

  for(g in 0:2){#calculates means and sds of the X distributions in all six classes (case/control(0,1) x genotype(0,1,2))

    #distribution of X in cases
    mu=integrate(Gx.integrand,lower=x.lower,upper=x.upper,a=alpha,b=log(or.G),c=log(or.X),g=g,case=1)$value*g.freqs[g+1]/K/p[g+1]
    s=sqrt(integrate(Gx2.integrand,lower=x.lower,upper=x.upper,a=alpha,b=log(or.G),c=log(or.X),g=g,case=1)$value*g.freqs[g+1]/K/p[g+1]-mu^2)
    means[1,g+1]=mu
    ses[1,g+1]=s

    if(print.plots) plot(x,G.integrand(x,alpha,log(or.G),log(or.X),g,1)*g.freqs[g+1]/K/p[g+1],t="l",ylab="density",main=paste("cases G=",g,sep=""),xlab="",sub=paste("mean=",signif(mu,4)," SE=",signif(s,4),sep=""))

    #distribution of X in controls
    mu=integrate(Gx.integrand,lower=x.lower,upper=x.upper,a=alpha,b=log(or.G),c=log(or.X),g=g,case=0)$value*g.freqs[g+1]/(1-K)/q[g+1]
    s=sqrt(integrate(Gx2.integrand,lower=x.lower,upper=x.upper,a=alpha,b=log(or.G),c=log(or.X),g=g,case=0)$value*g.freqs[g+1]/(1-K)/q[g+1]-mu^2)
    means[2,g+1]=mu
    ses[2,g+1]=s

    if(print.plots) plot(x,G.integrand(x,alpha,log(or.G),log(or.X),g,0)*g.freqs[g+1]/(1-K)/q[g+1],t="l",ylab="density",main=paste("controls G=",g,sep=""),sub=paste("mean=",signif(mu,4)," SE=",signif(s,4),sep=""))
  }

  #plots cumulative distribution functions of X in each genotype class
  if(print.plots){
    cdf.1=rep(0,length(x))
    cdf.0=cdf.1
    case.distr=list()
    control.distr=list()
    for(g in 0:2){
      for(i in 1:length(x)){
        cdf.1[i]=g.freqs[g+1]/K/p[g+1]*integrate(G.integrand,lower=x.lower,upper=x[i],a=alpha,b=log(or.G),c=log(or.X),g=g,case=1)$value
        cdf.0[i]=g.freqs[g+1]/(1-K)/q[g+1]*integrate(G.integrand,lower=x.lower,upper=x[i],a=alpha,b=log(or.G),c=log(or.X),g=g,case=0)$value
      }
      case.distr[[g+1]]=cdf.1
      control.distr[[g+1]]=cdf.0
    }

    x11()
    par(mfcol=c(2,3))
    for(g in 0:2){
      plot(pnorm(x,mean=means[1,g+1],sd=ses[1,g+1]),case.distr[[g+1]],pch="+",xlab="p norm",ylab="p empirical",main=paste("cases G=",g,sep=""))
      abline(0,1)
      plot(pnorm(x,mean=means[2,g+1],sd=ses[2,g+1]),control.distr[[g+1]],pch="+",xlab="p norm",ylab="p empirical",main=paste("controls G=",g,sep=""))
      abline(0,1)
    }
  }

  if(population.controls){q=g.freqs;means[2,]=0.0;ses[2,]=1.0;}

  #simulate data and analyse by glm
  y=c(rep(1,ncases),rep(0,ncontrols))
  g=rep(NA,ncases+ncontrols)
  x=g
  res=matrix(0,ncol=7,nrow=nsimulations)
  for(iter in 1:nsimulations){
    g[1:ncases]=sample(c(0,1,2),prob=p,replace=TRUE,size=ncases)
    for(i in 0:2){
      x[y==1 & g==i]=rnorm(sum(y==1 & g==i),mean=means[1,i+1],sd=ses[1,i+1])}
    g[(ncases+1):(ncases+ncontrols)]=sample(c(0,1,2),prob=q,replace=TRUE,size=ncontrols)
    for(i in 0:2){
      x[y==0 & g==i]=rnorm(sum(y==0 & g==i),mean=means[2,i+1],sd=ses[2,i+1])}
    summary(glm(y~g,family="binomial"))$coefficients[2,c(1,2)]->res[iter,1:2]
    glm(y~g+x,family="binomial")->glm.fit
    summary(glm.fit)$coefficients[3,1]->res[iter,3]
    summary(glm.fit)$coefficients[2,c(1,2)]->res[iter,4:5]
    glm(y~x,family="binomial")->glm.fit
    summary(glm.fit)$coefficients[2,1]->res[iter,6]
    summary(glm.fit)$coefficients[1,1]->res[iter,7] #a parameter in case-control sample
  }

  return(list(K=K,ncases=ncases,ncontrols=ncontrols,sample.a=median(res[,7]),case.G.freq=p,case.X.means=means[1,],case.X.ses=ses[1,],control.G.freq=q,control.X.means=means[2,],control.X.ses=ses[2,],population.controls=population.controls,marg.or.X=exp(mean(res[,6])),joint.or.X=exp(mean(res[,3])),marg.or.G=exp(mean(res[,1])),joint.or.G=exp(mean(res[,4])),marg.se.G=median(res[,2]),joint.se.G=median(res[,5]),NCP.GX.G=(mean(res[,4])/median(res[,5])*median(res[,2])/mean(res[,1]))^2))
}


phi.1.minus.phi.integrand<-function(x,a,c,mean,sd){
  #evaluates phi(x)*(1-phi(x))*theta(x), when theta is dnorm, see subsection "General case" in SoM 
  return( exp(a+c*x)/(1+exp(a+c*x))^2 *dnorm(x,mean=mean,sd=sd) )
}

analytic.var.ratio.normal<-function(a,c,phi,case.G.freq,case.X.means,case.X.ses,control.G.freq,control.X.means,control.X.ses,x.lower=-6,x.upper=6){

 #evaluates formula (10) in Online Methods for a single continuous covariate which has a normal distribution
 #in all six groups of individuals

 stopifnot(length(case.G.freq)==3)
 stopifnot(length(case.X.means)==3)
 stopifnot(length(case.X.ses)==3)
 stopifnot(abs(sum(case.G.freq)-1)<1e-4)
 stopifnot(length(control.G.freq)==3)
 stopifnot(length(control.X.means)==3)
 stopifnot(length(control.X.ses)==3)
 stopifnot(abs(sum(control.G.freq)-1)<1e-4)
 stopifnot(x.lower<x.upper)
 stopifnot(phi>0 & phi<1)

 v=0.0
 for(g in 1:3){
  v=v+(1-phi)*control.G.freq[g]*integrate(phi.1.minus.phi.integrand,lower=x.lower,upper=x.upper,a=a,c=c,mean=control.X.means[g],sd=control.X.ses[g])$value
 }	
 for(g in 1:3){
  v=v+phi*case.G.freq[g]*integrate(phi.1.minus.phi.integrand,lower=x.lower,upper=x.upper,a=a,c=c,mean=case.X.means[g],sd=case.X.ses[g])$value
 }	

 return(v/phi/(1-phi))
}