###############################################################################
#                                                                             #
# Chapter 3 - BINOMIAL MODELS                                                 #
# One-way binomial models                                                     #
#                                                                             #
# R. Labouriau                                                                #
# Last revision: Spring 2023.                                                  #
###############################################################################

###############################################################################
# Preparation                                                                 #
###############################################################################

# Looking what we have available (dataframes, vectors, constants, matrices, 
# results of analyses, etc)

ls()

# Cleaning the work space (everything that is there will be removed!,
#                          do not execute that if you want to keep
#                          some of the things you have there, 
#                          there is no way to undo this operation!)

# rm(list = ls())

# ls()

# Loading the datasets required

load("~/BSA/DataFramesStatisticalModelling.RData")

# Here ""~/BSA" is (the Mac address) were I stored the file 
# "DataFramesStatisticalModelling.RData", please replace that by the address 
# of the folder where YOU stored this file in your computer.

# Looking again what we have available AFTER having loaded the material of the
# course

ls()

# Setting the place where the output (graphs) wil be kept
setwd("~/BSA")

###############################################################################
# Drawing some figures for the review                                         #
###############################################################################

#
# Figure illustrating many tosses of a fair coin repeated three times
#

# pdf("12-Biostatistics-L-03-Fig02.pdf")
x <- rbinom(10,1, 0.5)
plot(10, mean(x), xlim=c(10,2000), ylim=c(0.1,0.9), xlab="Number of trials", ylab="Relative frequency", cex=0.5)
nt <- 10
for(i in 1:1990){
  x <- c(x, rbinom(1,1,0.5))
  nt <- c(nt, i+10)
  points(i, mean(x), cex=0.5)
}
abline(h=0.5)

x <- rbinom(10,1, 0.5)
nt <- 10
for(i in 1:1990){
  x <- c(x, rbinom(1,1,0.5))
  nt <- c(nt, i+10)
  points(i, mean(x), col="red", cex=0.5)
}

x <- rbinom(10,1, 0.5)
nt <- 10
for(i in 1:1990){
  x <- c(x, rbinom(1,1,0.5))
  nt <- c(nt, i+10)
  points(i, mean(x), col="blue", cex=0.5)
}
# dev.off()

###############################################################################

# pdf("12-Biostatistics-L-03-Fig03.pdf")
x <- rbinom(10,1, 0.75)
plot(10, mean(x10), xlim=c(10,2000), ylim=c(0,1), xlab="Number of trials", ylab="Relative frequency", cex=0.5)
nt <- 10
for(i in 1:1990){
  x <- c(x, rbinom(1,1,0.75))
  nt <- c(nt, i+10)
  points(i, mean(x), cex=0.5)
}

x <- rbinom(10,1, 0.75)
nt <- 10
for(i in 1:1990){
  x <- c(x, rbinom(1,1,0.75))
  nt <- c(nt, i+10)
  points(i, mean(x), col="red", cex=0.5)
}

x <- rbinom(10,1, 0.75)
nt <- 10
for(i in 1:1990){
  x <- c(x, rbinom(1,1,0.75))
  nt <- c(nt, i+10)
  points(i, mean(x), col="blue", cex=0.5)
}
# dev.off()

###############################################################################

# pdf("12-Biostatistics-L-03-Fig04.pdf")
x <- rbinom(10,1, 0.20)
plot(10, mean(x10), xlim=c(10,2000), ylim=c(0,1), xlab="Number of trials", ylab="Relative frequency", cex=0.5)
nt <- 10
for(i in 1:1990){
  x <- c(x, rbinom(1,1,0.20))
  nt <- c(nt, i+10)
  points(i, mean(x), cex=0.5)
}

x <- rbinom(10,1, 0.20)
nt <- 10
for(i in 1:1990){
  x <- c(x, rbinom(1,1,0.20))
  nt <- c(nt, i+10)
  points(i, mean(x), col="red", cex=0.5)
}

x <- rbinom(10,1, 0.20)
nt <- 10
for(i in 1:1990){
  x <- c(x, rbinom(1,1,0.20))
  nt <- c(nt, i+10)
  points(i, mean(x), col="blue", cex=0.5)
}
# dev.off()

###############################################################################

#
# Figure illustrating a unpredictable behaviour obtained from a predictable
# sequence, we iterate the function f(x) = 2 x^2 - 1
#

f <- function(x){return(2*x^2-1)}

Y <- numeric(250)
Y[1] <- 0.50001
for(i in 1:249){
  Y[i+1] <- f(Y[i])
}
# pdf(file="Figure-Module-2-Day3-01.pdf")
plot(Y , type='b', cex=0.5, pch=19, xlab="", ylab="")
# dev.off()

###############################################################################

# 
# Figure illustrating the law of large numbers
# 

# Here we set the simulation parameters.
n.rep <- 10000             # Number of simulations
n.trials <- 20             # Number of binomial trials (or size)
prob.success <- 1/2        # Parameter describing the probability of success in 
# each binary trial
x <- rbinom(n=n.rep, size=n.trials, prob=prob.success)
x
mean(x)
n.trials*prob.success
mean(x) - (n.trials*prob.success)

# Studying the behaviour of the mean for an increasing number of replications
X <- numeric(1000)
N.rep <- 1:1000 + 3 ; N.rep
for(i in 1:1000){
  bb <- rbinom(n=N.rep[i], size=n.trials, prob=prob.success)
  X[i] <- mean(bb)
}
# pdf("~/Dropbox/Courses/12-StatisticalModelling/Lectures/FigureLectureDay3-00.pdf")
plot(N.rep, X, ylab='Sample mean', xlab='Number of repetitions')
abline(h= n.trials*prob.success, col='red', lwd=3)
# dev.off()

###############################################################################

#
# Figure illustrating the central limit theorem
#

# Binomial distribution
n.rep <- 1000
n.observations <- 10000
for(i in 1:n.rep){
  x <- rbinom(n=n.observations, size=5, prob=0.5)
  X[i] <- (sqrt(n.observations)*(mean(x) - 5/2)) / sqrt(0.5*(1-0.5)*5)
}

# pdf("~/Dropbox/Courses/12-StatisticalModelling/Lectures/FigureLectureDay3-00B.pdf")
qqnorm(X); qqline(X)
# dev.off()

###############################################################################

# 
# Figure illustrating the failure of the central limit theorem for the Cauchy
# distribution (i.e. the ratio of two independent standard normal distributions)
#

n.rep <- 10000
X <- numeric(n.rep)
n.observations <- 1000
for(i in 1:n.rep){
  y <- rnorm(n.observations)
  z <- rnorm(n.observations)
  x <- y/z
  X[i] <- (sqrt(n.observations)*(mean(x) - 0.5)) / sqrt(var(x))
}

# pdf(file="Figure-Module-2-Day3-02.pdf")
qqnorm(X); qqline(X)
# dev.off()

###############################################################################


###############################################################################
#                                                                             #
# Analysis of the simple example of the seed germination,                     #
# first only with only one watering level and four repetitions.               #
#                                                                             #
###############################################################################

# We insert the data by hand (there are only 4 observations)

Box        <- c(  1,   2,   3,   4) ; Box
Germinated <- c( 22,  25,  27,  23) ; Germinated
N          <- c(100, 100, 100, 100) ; N

# We calculate next the maximum likelihood estimate (MLE), which is just
# the relative frequency, i.e. the proportion.

MLE <- sum(Germinated) / sum(N)
MLE

# Now we calculate the MLE using an alternative way, which is much more
# complex, but will be general and appliable in much more compplex
# situations. We will fit a model a simple binomial model with common 
# germination probability the four among boxes.
# Remark: This is not a reasonable method to calculate a proportion!!!

# Constructing the response matrix for defining the binomial model.
# The matrix has two columns, the first containing the number of successes
# (i.e. germinated seeds) and the second column containing the number of
# failures (i.e. the number of seeds that did not germinate)

print(Germinated)
print(N)
print(N-Germinated)

response <- cbind(Germinated, N-Germinated)
print(response)

# We "fit" the binomial model and storing all the results in an object
# called "TheModel"

TheModel <- glm(response ~ 1, family=binomial(link="identity"))

# The object called "TheModel" keeps many results of the current model
# and the inference made. Let us loock at its contents:

str(TheModel)

# Here is another way to look at the object "TheModel"
ls.str(pat="TheModel")

print(TheModel)

# We extract the value of the estimate from the object "TheModel"

coefficients(TheModel)

# Now, we compare with the MLE calculated by hand.

MLE

# We can also obtain more information on the model inferred by using the
# function "summary"

summary(TheModel)

# Cleaning up
rm(TheModel, response, N, Box, Germinated)

###############################################################################
#                                                                             #
# Studying a model with one probability of germination for each watering      #
# level (treatment)                                                           #
#                                                                             #
###############################################################################

# Data is contained in the dataframe "Ch3.germinationLevel1_5".
# Next we examine the contents of this dataframe (which was one of the data
# frames contained in the object "DataFramesStatisticalModelling.RData" loaded
# in the begining of this program)
# 

str(Ch3.germinationLevel1_5) 

# print(Ch3.germinationLevel1_5)

# We attach the dataframe for comfortably working with it. That is,
# we copy all the variables (vectors) of the data-frame Ch3.germinationLevel1_5
# to the R workspace.
attach(Ch3.germinationLevel1_5)

# We examine the three variables that were attached
Box
H2O
Germ

# We create next a variable containing the number of seeds per box

N <- rep(100, length(Box))
N

# We construct a factor classifying the observations according to the water
# levels used

Water <- factor(H2O)

# We check if the variables "H2O" and "Water" are factors

is.factor(H2O)
is.factor(Water)

# We calculate the number of germinated seeds per water level

tapply(X=Germ, INDEX=Water, FUN=sum)

# equivalently

tapply(Germ, Water, sum)

#  We calculate the number of seeds per water level

tapply(N, Water, sum)
 
# Calculating the proportions of germinated seeds per water level
 
 tapply(Germ, Water, sum)/  tapply(N, Water, sum)
 
 # A more elegant calculation for the proportion is
 
 tapply(Germ, Water, mean)/100
 
 # We can also keep this calculations the vector (variable) prop
 
 prop <- tapply(Germ, Water, sum)/  tapply(N, Water, sum)

 prop
 
 max(prop)
 min(prop)
 range(prop)
 
# We plot the proportions per watering level
 
 plot(1:5, prop, ylim=c(0,1), cex=2, pch=19, col="blue",
      ylab="Proportion of germinated", xlab="Watering level")
      
# We fit a one-way binomial model

# We create the response, a matrix with the numb. of success in the first
# column and the number of failures in the second column
 
resp <- cbind(Germ, N-Germ)
print(resp)

OneWayModel <- glm(resp ~ Water + 0, family=binomial(link="identity"))

coefficients(OneWayModel)

# Or equivalently in short
coef(OneWayModel)

summary(OneWayModel)

# Now lets compare the MLE calculated by hand (proportions)
prop <- tapply(Germ, Water, sum)/  tapply(N, Water, sum) ; prop

# and the MLE inferred with the general procedure for making inference 
# with one-way binomial models

coef(OneWayModel)


# Remark: this is a rather complicated way to calculate simple proportions!!!

# Another way to fit the model which has better properties, among others
# gives more numerical stability. The idea is to use a function of the
# probabilities of germination insted of the probabilities to characterize
# the model. A probability, say p,  is transformed to log(p/(1-p)), which is
# called the logits or lodds. Note that the ratio p/(1-p) is calleds the odds
# of an event. The larger the odds, the more probable is the event. The odds
# take positive values, if p = 0 then the odds are 0, for p = 1/2 the odds are
# 0 and as the value of p increases to 1, the odds increses indefinitely.
# Taking the logarithm of the odds yields the lodds or logits. The larger the
# lodds the more probable is the event. The lodds take values in the whole
# real line. When p = 1/2 the lodds are equal to zero, when p is less than
# 1/2 the lodds are negative, when p approachs 0 the lodds decreases
# indefinitely, when p is larger than 1/2 the lodds are positive and the lodds
# increase indefinitely when p increases to 1.

# Here we define a function that transforms a probabilities into lodds.

logit <- function(p){
  return(log(p/(1-p)))
}

# Next we test some values
logit(1/2)
logit(0.1)
logit(0.001)
logit(0.0001)

logit(0.99)
logit(0.999)
logit(0.9999)

# Next we plot the lodds

# pdf(file="Figure-Module-2-Day3-10.pdf")
pp <- seq(from=0.001, to=0.999, by=0.001)
plot(pp,logit(pp), type='l', lwd=2,
     xlab='Probabilities', ylab='Lodds')
abline(h=0)
abline(v=0.5)   
# dev.off()

# We can also transform lodds in probabilities by the formula
# exp(lodds)/(1+exp(lodds)) .
# Here I define a function that transforms probabilities into lodds,
# called the inverse of the logit.

ilogit <- function(lodds){
 return( exp(lodds)/(1+exp(lodds)) )
}

# Next I calculate some values

ilogit(0)
ilogit(1)
ilogit(5)
ilogit(10)
ilogit(-1)
ilogit(-5)
ilogit(-10)

# Now I plot the inverse of the logit

# pdf(file="Figure-Module-2-Day3-11.pdf")
ll <- seq(from=-6, to=6, by=0.001)
plot(ll,ilogit(ll), type='l', lwd=2, ylim=c(0,1),
     xlab='Lodds', ylab='Probability')
abline(h = c(0,1), lty = 2)
# dev.off()

# To fit a model using the lodds instead of probabilities we have to change the
# parameter "family" of the function glm. So instead of using
# "family=binomial(link="identity")" we use
# "family=binomial(link="logit")".
# Here I fit a model with one different lodd for each watering level

fit2 <- glm(resp ~ Water + 0, family=binomial(link="logit"))
 
coefficients(fit2)
  
# Please observe the difference between the call and the results for the
# model defined with the probabilities instead of the lodds

fit1 <- glm(resp ~ Water + 0, family=binomial(link="identity"))
coefficients(fit1)
  
# Here I check whether the results of the two models are equivalent
# by transforming the  coefficients obtained with each model and checking
# whether I obtain the same result.

# First I transform the lodds obtained with fit2 into probabilities
# and compare with the probabilities estimated with fit1

coefficients(fit2)
ilogit(coefficients(fit2))
coefficients(fit1)
  
# Now I do the test in the opposite direction

coefficients(fit1)
logit(coefficients(fit1))
coefficients(fit2)

# (obvious) Conclusion: The results of the models fit1 and fit2
# are equivalent. The function ilogit converts lodds into probabilities
# and the function logit converts probabilities in lodds.
  
# ---------------------------------------------------------------------------- #
# Yet another parametrisation (using a reference)                              #
# ---------------------------------------------------------------------------- #
  
fit2 <- glm(resp ~ Water + 0, family=binomial(link="logit"))
fit3 <- glm(resp ~ Water    , family=binomial(link="logit"))
  
coef(fit2); coef(fit3)
  
summary(fit3)

# ---------------------------------------------------------------------------- #
# Testing the effect of watering via lik.ratio test                            #
# ---------------------------------------------------------------------------- #

# First I define a large model containing one parameter per watering level
  
Large.model   <- glm(resp ~ Water , family=binomial(link="identity"))
  
# Next I define a reduced model containing only one parameter which is
# common to all the levels of watering.
  
Reduced.model <- glm(resp ~ 1, family=binomial(link="identity"))

summary(Reduced.model)

# Here I examine the deviance of the two models and calculate the
# likelihood ratio statistic (Lambda) using the difference of deviances
# or by extracting the log-likelihood from the model:
  
logLik(Large.model)
logLik(Reduced.model)
  
Lambda <- 2*(as.numeric(logLik(Large.model))- as.numeric(logLik(Reduced.model))) 
Lambda
  
# I extract then the number of parameters
  
length(coef(Large.model))
length(coef(Reduced.model))

df.test <- length(coef(Large.model)) -  length(coef(Reduced.model))
 
df.test
 
Lambda/df.test
 
# I calculate now the p-value

pchisq(Lambda, df=df.test, lower.tail=F)
 
# Conclusion: Reject the null hypothesis, thus there are differences in the
#         probability of germination among the different levels of watering
 
# Here I perform the same test but using a logit model.
# Observe that we get the same results!

# These calculations can be done also using the deviance

deviance(Large.model)
deviance(Reduced.model)

Lambda <-  deviance(Reduced.model) - deviance(Large.model)
Lambda

# Note that we obtain the same value of Lambda as in the previous calculations.
 
Large.model   <- glm(resp ~ Water , family=binomial(link="logit"))
Reduced.model <- glm(resp ~ 1, family=binomial(link="logit"))
deviance(Large.model)
deviance(Reduced.model)
Lambda <-  deviance(Reduced.model) - deviance(Large.model)
Lambda
length(coef(Large.model))
length(coef(Reduced.model))
df.test <- length(coef(Large.model)) -  length(coef(Reduced.model))
df.test
Lambda/df.test
pchisq(Lambda, df=df.test, lower.tail=F)
 
 
# Alternative quicker ways for testing

anova(Reduced.model, Large.model, test="Chisq")

anova(Large.model, Reduced.model, test="Chisq")
 
# ---------------------------------------------------------------------------- #
# Testing homogeneity via likelihood ratio test                                #
# ---------------------------------------------------------------------------- #

# I will define a saturated model, which will be a model attributing one
# different probability to each observation (box). This model will be
# the "Large" model, which will be compared to the "reduced" model
# given by the model wich assigns one probability to each watering level.
 
# For defining the saturated model I construct a factor (i.e. a classification
# variable)
 
Box<-factor(Box)
Saturated.model <- glm(resp ~ Box , family=binomial(link="identity"))
deviance(Saturated.model)
# Why is the deviance of the Saturated.model so small?
 
Watering.model <- glm(resp ~ Water , family=binomial(link="identity"))
deviance(Watering.model)

Lambda <- deviance(Watering.model)
 
# Calculating the number of parameters and df
length(coef(Watering.model))
length(coef(Saturated.model))
deg.freedom <- length(coef(Saturated.model)) -  length(coef(Watering.model))
deg.freedom
  
# Calculating the p-value
 
pchisq(Lambda, df=deg.freedom, lower.tail=F)
  
# Conclusion: reject hypothesis of homogeneity
 
# Alternative (quicker way to perform the test)
 
anova(Watering.model, Saturated.model, test="Chisq")

################################################################################

# ---------------------------------------------------------------------------- #
# Identifying the box(es) causing the inhomogeneity                            #
# ---------------------------------------------------------------------------- #
  
# Displaying the fitted values, reisuals and deviance-residuasl

res.dev <- residuals(Watering.model,'deviance')
res.dev
fitted.v <- Watering.model$fitted.values
fitted.v
  
# Displaying a table
  
cbind(Box, Water, Germ, fitted.v, res.dev)

# Eliminating the observations with large residuals to test
# whether they are causing the inhomogeneity
res.dev <- residuals(Watering.model,'deviance')
res.dev

# calculating the absolute value of the deviance residuals
abs(res.dev)
 
Germ

# Eliminating the observations with large residuals from the vector Germinated
Germ[abs(res.dev) < 2]
 
# Using the same technique with the response matrix
resp
resp [abs(res.dev) < 2, ]
 
# Fitting the necessary models and testing homogeneity
Saturated.model_R <- glm(resp[abs(res.dev) < 2, ] ~ Box[abs(res.dev) < 2] ,
                                              family=binomial(link="identity"))
Watering.model_R <- glm(resp[abs(res.dev) < 2, ] ~ Water[abs(res.dev) < 2] ,
   family=binomial(link="identity"))

# performing the homogeneity test

anova(Watering.model_R, Saturated.model_R , test="Chisq")
  
 # Conclusion: the three boxes were causing the inhomogeneity

################################################################################
# ---------------------------------------------------------------------------- #
# Calculating a Wald confidence interval                                       #
# ---------------------------------------------------------------------------- #
################################################################################

# Here I extract some basic information from the object fit1
# which contains the resuts of the model based on the probabilities
 
Watering.model_R <- glm(resp[abs(res.dev) < 2, ] ~ Water[abs(res.dev) < 2] + 0, 
                        family = binomial(link="identity"))
summary(Watering.model_R)

# Here I extract a Wald confidence interval with 0.95 coverage
# (based on an approximation of the distribution of the parameters estimates
#  by the normal distribution)

confint.default(Watering.model_R)

# Here I extract a profile likelihood confidence interval with 0.95 coverage
# (based on some properties of the likelihood function)

confint(Watering.model_R)

################################################################################
# OPTIONAL READING:
# Details on the construction of the confidence interval
#
################################################################################

# Here I find the variances and covariances of the parameters in the model
vcov(Watering.model_R)

# Here I extract only the variances of the parameters
diag(vcov(Watering.model_R))

# Now I calculate the standard errors of the parameters (i.e. the square
# root of the variances) and keep it in the vector called se
# (check with  third column of the report on the coefficients printed
# by the summary(fit1) )

se <-  sqrt(diag(vcov(Watering.model_R))) ; se

# Next I put the parameters in a vector and calculate the lower and the upper
# limits of the Wald conf. iterv.

Parameters <- coef(Watering.model_R) ; Parameters
# Note that "coef()" is a synonimous of "coefficients()"

# Here I calculate the quantile for the normal distribution
qnorm(0.975)

# Here I calculate 
Lower.limitN <- Parameters - qnorm(0.975)* se
Upper.limitN <- Parameters + qnorm(0.975)* se

Lower.limitN ; Upper.limitN

confint.default(Watering.model)

Lower.limit <- Parameters - qt(0.975, df=20)* se
Upper.limit <- Parameters + qt(0.975, df=20)* se

Lower.limit; Upper.limit

# Here I put everything togheter and report the results.

cbind(Parameters, Lower.limit, Upper.limit, Lower.limitN , Upper.limitN,
      confint.default(Watering.model))

# ---------------------------------------------------------------------------- #
# Making a function for calculating the Wald CI                                #
# (equivalent to the function confint.default)
# ---------------------------------------------------------------------------- #

Wald.Conf.Int <- function(fit, coverage=0.95){
	se <-  sqrt(diag(vcov(fit)))
	Parameters <- coef(fit)
	z <-  qnorm(1-((1-coverage)/2))
	Lower.limit <- Parameters - z*se
	Upper.limit <- Parameters + z*se
	CI <- cbind(Parameters, Lower.limit, Upper.limit)
	return(CI)
}

Wald.Conf.Int(Watering.model_R)
Wald.Conf.Int(Watering.model_R, coverage=0.90)
Wald.Conf.Int(Watering.model_R, coverage=0.99)

# ---------------------------------------------------------------------------- #
#
# IMPROOVING THE CONFIDENCE INTERVAL:
# The confidence interval constructed above (known as the Wald CI)
# is based on the fact that the estimates of the parameters are 
# approximately normal distributed. This approximation may (and often is) rather
# rougth. A confidence interval with better properties is based in the
# (profile) likelihood function. I will not explain here how we construct those
# intervals and the general principles involved in the construction of those
# confidence intervals. However, it is easy to calculate them using the
# function confint, as I demonstrate below:

confint(Watering.model_R)

# Recomendation: use the profile-likelihood based confidence interval above.

Nboots <- 999        # Number of (parametric) bootstrap simulations
Npar <- 5            # Number of parameters
BootParameters <- matrix(data = NA, nrow = Nboots, ncol = Npar)
set.seed(1234)
for(b in 1:Nboots){
  BootResponses <- as.matrix(simulate(Watering.model_R))
  BootModel <- glm(BootResponses ~ Water[abs(res.dev) < 2] + 0, 
                   family = binomial(link="identity"))
  BootParameters[b, ] <- coef(BootModel)
}

LowerBoot <- UpperBoot <- numeric(Npar)
for(i in 1:Npar){
  LowerBoot[i] <- quantile(BootParameters[ ,i], probs = 0.025)
  UpperBoot[i] <- quantile(BootParameters[ ,i], probs = 0.975)
}

cbind(LowerBoot, UpperBoot, confint(Watering.model_R))

# These calculations takes a long time if Nboots is large.
# Here are the results using Nboots = 99999
# 
# LowerBoot UpperBoot     2.5 %    97.5 % 
# Water[abs(res.dev) < 2]1     0.200 0.2850000 0.2022274 0.2860616
# Water[abs(res.dev) < 2]2     0.360 0.4733333 0.3617652 0.4729877
# Water[abs(res.dev) < 2]3     0.625 0.7550000 0.6237665 0.7513940
# Water[abs(res.dev) < 2]4     0.740 0.8200000 0.7376943 0.8187307
# Water[abs(res.dev) < 2]5     0.685 0.7700000 0.6825058 0.7695872

detach(Ch3.germinationLevel1_5)

################################################################################
#
#  The end!
#
################################################################################