############################################################################### # # # Chapter 3 - BINOMIAL MODELS # # One-way binomial models # # # # R. Labouriau # # Last revision: Spring 2023. # ############################################################################### ############################################################################### # Preparation # ############################################################################### # Looking what we have available (dataframes, vectors, constants, matrices, # results of analyses, etc) ls() # Cleaning the work space (everything that is there will be removed!, # do not execute that if you want to keep # some of the things you have there, # there is no way to undo this operation!) # rm(list = ls()) # ls() # Loading the datasets required load("~/BSA/DataFramesStatisticalModelling.RData") # Here ""~/BSA" is (the Mac address) were I stored the file # "DataFramesStatisticalModelling.RData", please replace that by the address # of the folder where YOU stored this file in your computer. # Looking again what we have available AFTER having loaded the material of the # course ls() # Setting the place where the output (graphs) wil be kept setwd("~/BSA") ############################################################################### # Drawing some figures for the review # ############################################################################### # # Figure illustrating many tosses of a fair coin repeated three times # # pdf("12-Biostatistics-L-03-Fig02.pdf") x <- rbinom(10,1, 0.5) plot(10, mean(x), xlim=c(10,2000), ylim=c(0.1,0.9), xlab="Number of trials", ylab="Relative frequency", cex=0.5) nt <- 10 for(i in 1:1990){ x <- c(x, rbinom(1,1,0.5)) nt <- c(nt, i+10) points(i, mean(x), cex=0.5) } abline(h=0.5) x <- rbinom(10,1, 0.5) nt <- 10 for(i in 1:1990){ x <- c(x, rbinom(1,1,0.5)) nt <- c(nt, i+10) points(i, mean(x), col="red", cex=0.5) } x <- rbinom(10,1, 0.5) nt <- 10 for(i in 1:1990){ x <- c(x, rbinom(1,1,0.5)) nt <- c(nt, i+10) points(i, mean(x), col="blue", cex=0.5) } # dev.off() ############################################################################### # pdf("12-Biostatistics-L-03-Fig03.pdf") x <- rbinom(10,1, 0.75) plot(10, mean(x10), xlim=c(10,2000), ylim=c(0,1), xlab="Number of trials", ylab="Relative frequency", cex=0.5) nt <- 10 for(i in 1:1990){ x <- c(x, rbinom(1,1,0.75)) nt <- c(nt, i+10) points(i, mean(x), cex=0.5) } x <- rbinom(10,1, 0.75) nt <- 10 for(i in 1:1990){ x <- c(x, rbinom(1,1,0.75)) nt <- c(nt, i+10) points(i, mean(x), col="red", cex=0.5) } x <- rbinom(10,1, 0.75) nt <- 10 for(i in 1:1990){ x <- c(x, rbinom(1,1,0.75)) nt <- c(nt, i+10) points(i, mean(x), col="blue", cex=0.5) } # dev.off() ############################################################################### # pdf("12-Biostatistics-L-03-Fig04.pdf") x <- rbinom(10,1, 0.20) plot(10, mean(x10), xlim=c(10,2000), ylim=c(0,1), xlab="Number of trials", ylab="Relative frequency", cex=0.5) nt <- 10 for(i in 1:1990){ x <- c(x, rbinom(1,1,0.20)) nt <- c(nt, i+10) points(i, mean(x), cex=0.5) } x <- rbinom(10,1, 0.20) nt <- 10 for(i in 1:1990){ x <- c(x, rbinom(1,1,0.20)) nt <- c(nt, i+10) points(i, mean(x), col="red", cex=0.5) } x <- rbinom(10,1, 0.20) nt <- 10 for(i in 1:1990){ x <- c(x, rbinom(1,1,0.20)) nt <- c(nt, i+10) points(i, mean(x), col="blue", cex=0.5) } # dev.off() ############################################################################### # # Figure illustrating a unpredictable behaviour obtained from a predictable # sequence, we iterate the function f(x) = 2 x^2 - 1 # f <- function(x){return(2*x^2-1)} Y <- numeric(250) Y[1] <- 0.50001 for(i in 1:249){ Y[i+1] <- f(Y[i]) } # pdf(file="Figure-Module-2-Day3-01.pdf") plot(Y , type='b', cex=0.5, pch=19, xlab="", ylab="") # dev.off() ############################################################################### # # Figure illustrating the law of large numbers # # Here we set the simulation parameters. n.rep <- 10000 # Number of simulations n.trials <- 20 # Number of binomial trials (or size) prob.success <- 1/2 # Parameter describing the probability of success in # each binary trial x <- rbinom(n=n.rep, size=n.trials, prob=prob.success) x mean(x) n.trials*prob.success mean(x) - (n.trials*prob.success) # Studying the behaviour of the mean for an increasing number of replications X <- numeric(1000) N.rep <- 1:1000 + 3 ; N.rep for(i in 1:1000){ bb <- rbinom(n=N.rep[i], size=n.trials, prob=prob.success) X[i] <- mean(bb) } # pdf("~/Dropbox/Courses/12-StatisticalModelling/Lectures/FigureLectureDay3-00.pdf") plot(N.rep, X, ylab='Sample mean', xlab='Number of repetitions') abline(h= n.trials*prob.success, col='red', lwd=3) # dev.off() ############################################################################### # # Figure illustrating the central limit theorem # # Binomial distribution n.rep <- 1000 n.observations <- 10000 for(i in 1:n.rep){ x <- rbinom(n=n.observations, size=5, prob=0.5) X[i] <- (sqrt(n.observations)*(mean(x) - 5/2)) / sqrt(0.5*(1-0.5)*5) } # pdf("~/Dropbox/Courses/12-StatisticalModelling/Lectures/FigureLectureDay3-00B.pdf") qqnorm(X); qqline(X) # dev.off() ############################################################################### # # Figure illustrating the failure of the central limit theorem for the Cauchy # distribution (i.e. the ratio of two independent standard normal distributions) # n.rep <- 10000 X <- numeric(n.rep) n.observations <- 1000 for(i in 1:n.rep){ y <- rnorm(n.observations) z <- rnorm(n.observations) x <- y/z X[i] <- (sqrt(n.observations)*(mean(x) - 0.5)) / sqrt(var(x)) } # pdf(file="Figure-Module-2-Day3-02.pdf") qqnorm(X); qqline(X) # dev.off() ############################################################################### ############################################################################### # # # Analysis of the simple example of the seed germination, # # first only with only one watering level and four repetitions. # # # ############################################################################### # We insert the data by hand (there are only 4 observations) Box <- c( 1, 2, 3, 4) ; Box Germinated <- c( 22, 25, 27, 23) ; Germinated N <- c(100, 100, 100, 100) ; N # We calculate next the maximum likelihood estimate (MLE), which is just # the relative frequency, i.e. the proportion. MLE <- sum(Germinated) / sum(N) MLE # Now we calculate the MLE using an alternative way, which is much more # complex, but will be general and appliable in much more compplex # situations. We will fit a model a simple binomial model with common # germination probability the four among boxes. # Remark: This is not a reasonable method to calculate a proportion!!! # Constructing the response matrix for defining the binomial model. # The matrix has two columns, the first containing the number of successes # (i.e. germinated seeds) and the second column containing the number of # failures (i.e. the number of seeds that did not germinate) print(Germinated) print(N) print(N-Germinated) response <- cbind(Germinated, N-Germinated) print(response) # We "fit" the binomial model and storing all the results in an object # called "TheModel" TheModel <- glm(response ~ 1, family=binomial(link="identity")) # The object called "TheModel" keeps many results of the current model # and the inference made. Let us loock at its contents: str(TheModel) # Here is another way to look at the object "TheModel" ls.str(pat="TheModel") print(TheModel) # We extract the value of the estimate from the object "TheModel" coefficients(TheModel) # Now, we compare with the MLE calculated by hand. MLE # We can also obtain more information on the model inferred by using the # function "summary" summary(TheModel) # Cleaning up rm(TheModel, response, N, Box, Germinated) ############################################################################### # # # Studying a model with one probability of germination for each watering # # level (treatment) # # # ############################################################################### # Data is contained in the dataframe "Ch3.germinationLevel1_5". # Next we examine the contents of this dataframe (which was one of the data # frames contained in the object "DataFramesStatisticalModelling.RData" loaded # in the begining of this program) # str(Ch3.germinationLevel1_5) # print(Ch3.germinationLevel1_5) # We attach the dataframe for comfortably working with it. That is, # we copy all the variables (vectors) of the data-frame Ch3.germinationLevel1_5 # to the R workspace. attach(Ch3.germinationLevel1_5) # We examine the three variables that were attached Box H2O Germ # We create next a variable containing the number of seeds per box N <- rep(100, length(Box)) N # We construct a factor classifying the observations according to the water # levels used Water <- factor(H2O) # We check if the variables "H2O" and "Water" are factors is.factor(H2O) is.factor(Water) # We calculate the number of germinated seeds per water level tapply(X=Germ, INDEX=Water, FUN=sum) # equivalently tapply(Germ, Water, sum) # We calculate the number of seeds per water level tapply(N, Water, sum) # Calculating the proportions of germinated seeds per water level tapply(Germ, Water, sum)/ tapply(N, Water, sum) # A more elegant calculation for the proportion is tapply(Germ, Water, mean)/100 # We can also keep this calculations the vector (variable) prop prop <- tapply(Germ, Water, sum)/ tapply(N, Water, sum) prop max(prop) min(prop) range(prop) # We plot the proportions per watering level plot(1:5, prop, ylim=c(0,1), cex=2, pch=19, col="blue", ylab="Proportion of germinated", xlab="Watering level") # We fit a one-way binomial model # We create the response, a matrix with the numb. of success in the first # column and the number of failures in the second column resp <- cbind(Germ, N-Germ) print(resp) OneWayModel <- glm(resp ~ Water + 0, family=binomial(link="identity")) coefficients(OneWayModel) # Or equivalently in short coef(OneWayModel) summary(OneWayModel) # Now lets compare the MLE calculated by hand (proportions) prop <- tapply(Germ, Water, sum)/ tapply(N, Water, sum) ; prop # and the MLE inferred with the general procedure for making inference # with one-way binomial models coef(OneWayModel) # Remark: this is a rather complicated way to calculate simple proportions!!! # Another way to fit the model which has better properties, among others # gives more numerical stability. The idea is to use a function of the # probabilities of germination insted of the probabilities to characterize # the model. A probability, say p, is transformed to log(p/(1-p)), which is # called the logits or lodds. Note that the ratio p/(1-p) is calleds the odds # of an event. The larger the odds, the more probable is the event. The odds # take positive values, if p = 0 then the odds are 0, for p = 1/2 the odds are # 0 and as the value of p increases to 1, the odds increses indefinitely. # Taking the logarithm of the odds yields the lodds or logits. The larger the # lodds the more probable is the event. The lodds take values in the whole # real line. When p = 1/2 the lodds are equal to zero, when p is less than # 1/2 the lodds are negative, when p approachs 0 the lodds decreases # indefinitely, when p is larger than 1/2 the lodds are positive and the lodds # increase indefinitely when p increases to 1. # Here we define a function that transforms a probabilities into lodds. logit <- function(p){ return(log(p/(1-p))) } # Next we test some values logit(1/2) logit(0.1) logit(0.001) logit(0.0001) logit(0.99) logit(0.999) logit(0.9999) # Next we plot the lodds # pdf(file="Figure-Module-2-Day3-10.pdf") pp <- seq(from=0.001, to=0.999, by=0.001) plot(pp,logit(pp), type='l', lwd=2, xlab='Probabilities', ylab='Lodds') abline(h=0) abline(v=0.5) # dev.off() # We can also transform lodds in probabilities by the formula # exp(lodds)/(1+exp(lodds)) . # Here I define a function that transforms probabilities into lodds, # called the inverse of the logit. ilogit <- function(lodds){ return( exp(lodds)/(1+exp(lodds)) ) } # Next I calculate some values ilogit(0) ilogit(1) ilogit(5) ilogit(10) ilogit(-1) ilogit(-5) ilogit(-10) # Now I plot the inverse of the logit # pdf(file="Figure-Module-2-Day3-11.pdf") ll <- seq(from=-6, to=6, by=0.001) plot(ll,ilogit(ll), type='l', lwd=2, ylim=c(0,1), xlab='Lodds', ylab='Probability') abline(h = c(0,1), lty = 2) # dev.off() # To fit a model using the lodds instead of probabilities we have to change the # parameter "family" of the function glm. So instead of using # "family=binomial(link="identity")" we use # "family=binomial(link="logit")". # Here I fit a model with one different lodd for each watering level fit2 <- glm(resp ~ Water + 0, family=binomial(link="logit")) coefficients(fit2) # Please observe the difference between the call and the results for the # model defined with the probabilities instead of the lodds fit1 <- glm(resp ~ Water + 0, family=binomial(link="identity")) coefficients(fit1) # Here I check whether the results of the two models are equivalent # by transforming the coefficients obtained with each model and checking # whether I obtain the same result. # First I transform the lodds obtained with fit2 into probabilities # and compare with the probabilities estimated with fit1 coefficients(fit2) ilogit(coefficients(fit2)) coefficients(fit1) # Now I do the test in the opposite direction coefficients(fit1) logit(coefficients(fit1)) coefficients(fit2) # (obvious) Conclusion: The results of the models fit1 and fit2 # are equivalent. The function ilogit converts lodds into probabilities # and the function logit converts probabilities in lodds. # ---------------------------------------------------------------------------- # # Yet another parametrisation (using a reference) # # ---------------------------------------------------------------------------- # fit2 <- glm(resp ~ Water + 0, family=binomial(link="logit")) fit3 <- glm(resp ~ Water , family=binomial(link="logit")) coef(fit2); coef(fit3) summary(fit3) # ---------------------------------------------------------------------------- # # Testing the effect of watering via lik.ratio test # # ---------------------------------------------------------------------------- # # First I define a large model containing one parameter per watering level Large.model <- glm(resp ~ Water , family=binomial(link="identity")) # Next I define a reduced model containing only one parameter which is # common to all the levels of watering. Reduced.model <- glm(resp ~ 1, family=binomial(link="identity")) summary(Reduced.model) # Here I examine the deviance of the two models and calculate the # likelihood ratio statistic (Lambda) using the difference of deviances # or by extracting the log-likelihood from the model: logLik(Large.model) logLik(Reduced.model) Lambda <- 2*(as.numeric(logLik(Large.model))- as.numeric(logLik(Reduced.model))) Lambda # I extract then the number of parameters length(coef(Large.model)) length(coef(Reduced.model)) df.test <- length(coef(Large.model)) - length(coef(Reduced.model)) df.test Lambda/df.test # I calculate now the p-value pchisq(Lambda, df=df.test, lower.tail=F) # Conclusion: Reject the null hypothesis, thus there are differences in the # probability of germination among the different levels of watering # Here I perform the same test but using a logit model. # Observe that we get the same results! # These calculations can be done also using the deviance deviance(Large.model) deviance(Reduced.model) Lambda <- deviance(Reduced.model) - deviance(Large.model) Lambda # Note that we obtain the same value of Lambda as in the previous calculations. Large.model <- glm(resp ~ Water , family=binomial(link="logit")) Reduced.model <- glm(resp ~ 1, family=binomial(link="logit")) deviance(Large.model) deviance(Reduced.model) Lambda <- deviance(Reduced.model) - deviance(Large.model) Lambda length(coef(Large.model)) length(coef(Reduced.model)) df.test <- length(coef(Large.model)) - length(coef(Reduced.model)) df.test Lambda/df.test pchisq(Lambda, df=df.test, lower.tail=F) # Alternative quicker ways for testing anova(Reduced.model, Large.model, test="Chisq") anova(Large.model, Reduced.model, test="Chisq") # ---------------------------------------------------------------------------- # # Testing homogeneity via likelihood ratio test # # ---------------------------------------------------------------------------- # # I will define a saturated model, which will be a model attributing one # different probability to each observation (box). This model will be # the "Large" model, which will be compared to the "reduced" model # given by the model wich assigns one probability to each watering level. # For defining the saturated model I construct a factor (i.e. a classification # variable) Box<-factor(Box) Saturated.model <- glm(resp ~ Box , family=binomial(link="identity")) deviance(Saturated.model) # Why is the deviance of the Saturated.model so small? Watering.model <- glm(resp ~ Water , family=binomial(link="identity")) deviance(Watering.model) Lambda <- deviance(Watering.model) # Calculating the number of parameters and df length(coef(Watering.model)) length(coef(Saturated.model)) deg.freedom <- length(coef(Saturated.model)) - length(coef(Watering.model)) deg.freedom # Calculating the p-value pchisq(Lambda, df=deg.freedom, lower.tail=F) # Conclusion: reject hypothesis of homogeneity # Alternative (quicker way to perform the test) anova(Watering.model, Saturated.model, test="Chisq") ################################################################################ # ---------------------------------------------------------------------------- # # Identifying the box(es) causing the inhomogeneity # # ---------------------------------------------------------------------------- # # Displaying the fitted values, reisuals and deviance-residuasl res.dev <- residuals(Watering.model,'deviance') res.dev fitted.v <- Watering.model$fitted.values fitted.v # Displaying a table cbind(Box, Water, Germ, fitted.v, res.dev) # Eliminating the observations with large residuals to test # whether they are causing the inhomogeneity res.dev <- residuals(Watering.model,'deviance') res.dev # calculating the absolute value of the deviance residuals abs(res.dev) Germ # Eliminating the observations with large residuals from the vector Germinated Germ[abs(res.dev) < 2] # Using the same technique with the response matrix resp resp [abs(res.dev) < 2, ] # Fitting the necessary models and testing homogeneity Saturated.model_R <- glm(resp[abs(res.dev) < 2, ] ~ Box[abs(res.dev) < 2] , family=binomial(link="identity")) Watering.model_R <- glm(resp[abs(res.dev) < 2, ] ~ Water[abs(res.dev) < 2] , family=binomial(link="identity")) # performing the homogeneity test anova(Watering.model_R, Saturated.model_R , test="Chisq") # Conclusion: the three boxes were causing the inhomogeneity ################################################################################ # ---------------------------------------------------------------------------- # # Calculating a Wald confidence interval # # ---------------------------------------------------------------------------- # ################################################################################ # Here I extract some basic information from the object fit1 # which contains the resuts of the model based on the probabilities Watering.model_R <- glm(resp[abs(res.dev) < 2, ] ~ Water[abs(res.dev) < 2] + 0, family = binomial(link="identity")) summary(Watering.model_R) # Here I extract a Wald confidence interval with 0.95 coverage # (based on an approximation of the distribution of the parameters estimates # by the normal distribution) confint.default(Watering.model_R) # Here I extract a profile likelihood confidence interval with 0.95 coverage # (based on some properties of the likelihood function) confint(Watering.model_R) ################################################################################ # OPTIONAL READING: # Details on the construction of the confidence interval # ################################################################################ # Here I find the variances and covariances of the parameters in the model vcov(Watering.model_R) # Here I extract only the variances of the parameters diag(vcov(Watering.model_R)) # Now I calculate the standard errors of the parameters (i.e. the square # root of the variances) and keep it in the vector called se # (check with third column of the report on the coefficients printed # by the summary(fit1) ) se <- sqrt(diag(vcov(Watering.model_R))) ; se # Next I put the parameters in a vector and calculate the lower and the upper # limits of the Wald conf. iterv. Parameters <- coef(Watering.model_R) ; Parameters # Note that "coef()" is a synonimous of "coefficients()" # Here I calculate the quantile for the normal distribution qnorm(0.975) # Here I calculate Lower.limitN <- Parameters - qnorm(0.975)* se Upper.limitN <- Parameters + qnorm(0.975)* se Lower.limitN ; Upper.limitN confint.default(Watering.model) Lower.limit <- Parameters - qt(0.975, df=20)* se Upper.limit <- Parameters + qt(0.975, df=20)* se Lower.limit; Upper.limit # Here I put everything togheter and report the results. cbind(Parameters, Lower.limit, Upper.limit, Lower.limitN , Upper.limitN, confint.default(Watering.model)) # ---------------------------------------------------------------------------- # # Making a function for calculating the Wald CI # # (equivalent to the function confint.default) # ---------------------------------------------------------------------------- # Wald.Conf.Int <- function(fit, coverage=0.95){ se <- sqrt(diag(vcov(fit))) Parameters <- coef(fit) z <- qnorm(1-((1-coverage)/2)) Lower.limit <- Parameters - z*se Upper.limit <- Parameters + z*se CI <- cbind(Parameters, Lower.limit, Upper.limit) return(CI) } Wald.Conf.Int(Watering.model_R) Wald.Conf.Int(Watering.model_R, coverage=0.90) Wald.Conf.Int(Watering.model_R, coverage=0.99) # ---------------------------------------------------------------------------- # # # IMPROOVING THE CONFIDENCE INTERVAL: # The confidence interval constructed above (known as the Wald CI) # is based on the fact that the estimates of the parameters are # approximately normal distributed. This approximation may (and often is) rather # rougth. A confidence interval with better properties is based in the # (profile) likelihood function. I will not explain here how we construct those # intervals and the general principles involved in the construction of those # confidence intervals. However, it is easy to calculate them using the # function confint, as I demonstrate below: confint(Watering.model_R) # Recomendation: use the profile-likelihood based confidence interval above. Nboots <- 999 # Number of (parametric) bootstrap simulations Npar <- 5 # Number of parameters BootParameters <- matrix(data = NA, nrow = Nboots, ncol = Npar) set.seed(1234) for(b in 1:Nboots){ BootResponses <- as.matrix(simulate(Watering.model_R)) BootModel <- glm(BootResponses ~ Water[abs(res.dev) < 2] + 0, family = binomial(link="identity")) BootParameters[b, ] <- coef(BootModel) } LowerBoot <- UpperBoot <- numeric(Npar) for(i in 1:Npar){ LowerBoot[i] <- quantile(BootParameters[ ,i], probs = 0.025) UpperBoot[i] <- quantile(BootParameters[ ,i], probs = 0.975) } cbind(LowerBoot, UpperBoot, confint(Watering.model_R)) # These calculations takes a long time if Nboots is large. # Here are the results using Nboots = 99999 # # LowerBoot UpperBoot 2.5 % 97.5 % # Water[abs(res.dev) < 2]1 0.200 0.2850000 0.2022274 0.2860616 # Water[abs(res.dev) < 2]2 0.360 0.4733333 0.3617652 0.4729877 # Water[abs(res.dev) < 2]3 0.625 0.7550000 0.6237665 0.7513940 # Water[abs(res.dev) < 2]4 0.740 0.8200000 0.7376943 0.8187307 # Water[abs(res.dev) < 2]5 0.685 0.7700000 0.6825058 0.7695872 detach(Ch3.germinationLevel1_5) ################################################################################ # # The end! # ################################################################################