############################################################################### # # # Chapter 3 - BINOMIAL MODELS # # One-way binomial models # # # # R. Labouriau # # Last revision/edition: Spring 2023. # ############################################################################### ############################################################################### # Preparation # ############################################################################### # Setting the place where the output (graphs) wil be kept setwd("~/BSA") load("DataFramesStatisticalModelling.RData") ################################################################################ # Example Radamachera # # Logistic regression # ################################################################################ str(Ch3.Radamachera.simple) attach(Ch3.Radamachera.simple) # Ploting the proportions # pdf(file="Figure-Ch3-61.pdf") plot(dose, abscised/nplants, cex=1.5 , xlab="Dose", ylab="Proportion abscised", pch=19, col='blue') prop <- tapply(abscised, factor(dose),sum)/ tapply(nplants, factor(dose),sum) prop points(levels(factor(dose)), prop, type='b', lty=2, lwd=1.5, pch=19, cex=1.5, col='black') # dev.off() # Inserting a random perturbation in the graph to see superposed points # pdf(file="Figure-Ch3-62.pdf") n <- length(dose) plot(dose+0.03*(runif(n)-0.5), abscised/nplants, cex=1.5 , xlab="Dose (+noise)", ylab="Proportion abscised", pch=19, col='blue') prop <- tapply(abscised, factor(dose),sum)/ tapply(nplants, factor(dose),sum) prop points(levels(factor(dose)), prop, type='b', lty=2, lwd=1.5, pch=19, col='black') # dev.off() # Ploting the logit functions and the logit transformed proportions Logit <- function(p){ return(log(p/(1-p))) } ILogit <- function(lodds){ return(exp(lodds)/(1+exp(lodds))) } # Ploting the logistic function and the inverse logistic function # pdf(file="Figure-Ch3-63.pdf") grid <- seq(from=0.005, to=0.995, by=0.0001) plot(grid, Logit(grid), type='l', lwd=3, xlab='Probability (p)', ylab='Lodds (logit(p))') # dev.off() # pdf(file="Figure-Ch3-64.pdf") grid <- seq(from=-4.59, to=4.459, by=0.0001) plot(grid, ILogit(grid), type='l', lwd=3, xlab='lodds ', ylab='Probability') abline(v=0) abline(h=1/2) # dev.off() # pdf(file="Figure-Ch3-65.pdf") ' plot(dose, Logit(abscised/nplants), cex=1.5 , xlab="Dose", ylab="Logit transformed proportion abscised", pch=19, col='blue') points(levels(factor(dose)), Logit(prop), type='b', lty=2, lwd=1.5, pch=19, col='black') cc <- coef(lm(Logit(prop) ~ as.numeric(levels(factor(dose))))) abline(a=cc[1], b=cc[2]) # dev.off() # Inserting a random perturbation in the graph to see superposed points # pdf(file="Figure-Ch3-66.pdf") n<-length(dose) plot(dose+0.03*(runif(n)-0.5), Logit(abscised/nplants), cex=1.5 , xlab="Dose (+noise)", ylab="Logit transformed proportion abscised", pch=19, col='blue') points(levels(factor(dose)), Logit(prop), type='b', lty=2, lwd=1.5, pch=19, col='black') cc <- coef(lm(Logit(prop) ~ as.numeric(levels(factor(dose))))) abline(a=cc[1], b=cc[2]) # dev.off() # Fitting a logit model resp<-cbind(abscised, nplants-abscised) fit1 <- glm( resp ~ dose, family=binomial(link="logit")) fitted.coef <- coef(fit1); fitted.coef summary(fit1) # Drawing the logistic adjusted logistic curve grid.points <- seq(from=min(dose), to=max(dose), by=0.001) eta <- fitted.coef[1] + ( fitted.coef[2] * grid.points) pred.prob <- ILogit(eta); pred.prob # pdf(file="Figure-Ch3-67.pdf") n<-length(dose) plot(dose+0.03*(runif(n)-0.5), abscised/nplants, cex=1.5 , xlab="Dose (+noise)", ylab="Proportion abscised", pch=19, col='blue') points(grid.points, pred.prob, type='l', lwd=2, col='red') # dev.off() # Fitting a free-curve model. The variable dose becomes a factor. resp<-cbind(abscised, nplants-abscised) fit1 <- glm( resp ~ dose, family=binomial(link="logit")) fit2 <- glm( resp ~ 0+factor(dose), family=binomial(link="logit")) coef(fit1) deviance(fit1) summary(fit2) fit2n <- glm( resp ~ factor(dose), family=binomial(link="logit")) summary(fit2n) coef(fit2) deviance(fit2) # Testing homogeneity deviance(fit2) length(dose) # 70 observations length(coef(fit2)) # 7 parameters in the free-curve model pchisq(deviance(fit2), df=70-7, lower.tail=F) deviance(fit2)/63 # Uffa! passed the homogeneity test. # Testing the assumed (logistic) regression curve anova(fit1, fit2, test="Chisq") # Again passed the test! # Ploting the observed proportions and the predicted for the two models # to get an intuition of what is happening. grid.points <- seq(from=min(dose), to=max(dose), by=0.0001) eta <- fitted.coef[1] + ( fitted.coef[2] * grid.points) pred.prob <- ILogit(eta) n<-length(dose) # pdf(file="Figure-Ch3-68.pdf") plot(dose+0.03*(runif(n)-0.5), abscised/nplants, cex=1.5 , xlab="Dose", ylab="Proportion abscised", pch=19, col='blue') points(grid.points, pred.prob, type='l', lwd=3, col='red') free.lodds <- ILogit(coef(fit2)) points(levels(factor(dose)), free.lodds , type='p', cex=2, col='yellow', pch=19) # dev.off() # Testing the homogeneity assumption and the appropriatness of the regression # curve SIMULTANEOUSLY! deviance(fit1) length(dose) # 70 observations length(coef(fit1)) # 2 parameters in the free-curve model pchisq(deviance(fit1), df=70-2, lower.tail=F) deviance(fit2)/68 # Again passed the two tests simultaneously! fit3 <- glm( resp ~ 1, family=binomial(link="logit")) deviance(fit1) deviance(fit2) deviance(fit3) anova(fit3, fit1, test="Chisq") detach(Ch3.Radamachera.simple) ################################################################################ # Example Radamachera # # Multiple Logistic regression # ################################################################################ # Reading the data str(Ch3.Radamachera.multiple) attach(Ch3.Radamachera.multiple) # Ploting the proportions # pdf("Figure-Ch3-69.pdf") dose1 <- dose[variety==1] abscised1 <- abscised[variety==1] nplants1 <- nplants[variety==1] dose2 <- dose[variety==2] abscised2 <- abscised[variety==2] nplants2 <- nplants[variety==2] n <- length(dose1) plot(dose1+0.03*(runif(n)-0.5), abscised1/nplants1, cex=1.5 , xlab="Dose", ylab="Proportion abscised", pch=19, col='blue', ylim=range(abscised/nplants)) prop <-tapply(abscised1, factor(dose1),sum)/ tapply(nplants1, factor(dose1),sum) prop points(levels(factor(dose1)), prop, type='b', lty=2, lwd=2.5, pch=19, col='blue') n <- length(dose2) points(dose2+0.03*(runif(n)-0.5), abscised2/nplants2, cex=1.5 , xlab="Dose", ylab="Proportion abscised", pch=19, col='red') prop <- tapply(abscised2, factor(dose2),sum)/ tapply(nplants2, factor(dose2),sum) prop points(levels(factor(dose2)), prop, type='b', lty=2, lwd=2.5, pch=19, col='red') # dev.off() # Plotting the logit transformed proportions # pdf("Figure-Ch3-70.pdf") Logit <- function(p){ return(log(p/(1-p))) } ILogit <- function(lodds){ return(exp(lodds)/(1+exp(lodds))) } plot(dose1+0.03*(runif(n)-0.5), Logit(abscised1/nplants1), cex=1.5 , xlab="Dose", ylab="Logistic transformed proportion abscised", pch=19, col='blue', ylim=range(Logit(abscised/nplants))) prop <-tapply(abscised1, factor(dose1),sum)/ tapply(nplants1, factor(dose1),sum) prop points(levels(factor(dose1)), Logit(prop), type='b', lty=2, lwd=2.5, pch=19, col='blue') n <- length(dose2) points(dose2+0.03*(runif(n)-0.5), Logit(abscised2/nplants2), cex=1.5 , xlab="Dose", ylab="Proportion abscised", pch=19, col='red') prop <- tapply(abscised2, factor(dose2),sum)/ tapply(nplants2, factor(dose2),sum) prop points(levels(factor(dose2)), Logit(prop), type='b', lty=2, lwd=2.5, pch=19, col='red') # dev.off() # Fitting (simultaneously) a separate logistic regression for each variety response <- cbind(abscised , nplants-abscised) diff.slopes <- glm(response ~ factor(variety) * dose, family=binomial(link='logit') ) summary(diff.slopes) # Different parametrization diff.slopes <- glm(response ~ 0 + factor(variety) * dose , family=binomial(link='logit') ) summary(diff.slopes) # Yet another parametrization diff.slopes <- glm(response ~ 0 + factor(variety) * dose - dose, family=binomial(link='logit') ) summary(diff.slopes) # Fitting a parallel model. That is different intercepts and common slope # in the lodds scale parallel <- glm(response ~ factor(variety) + dose, family=binomial(link='logit') ) summary(parallel) # Using another parametrisation parallel <- glm(response ~ 0 + factor(variety) + dose, family=binomial(link='logit') ) summary(parallel) # Fitting a free curve model, attributing one lodds (or equiv. probability) # for each combination of variety and dose free.curve <- glm(response ~ 0 + factor(variety) * factor(dose), family=binomial(link='logit') ) summary(free.curve) # Testing the homogeneity assumption deviance(free.curve) length(dose) # 140 observations length(coef(free.curve)) # 14 parameters pchisq(deviance(free.curve), df=140-4, lower.tail=F) # Uffa! assed this test! # Testing the apropriattness of the logistic curve used anova(diff.slopes, free.curve, test="Chisq") # Passes this test also! # Joint test of homogeneity and appropriatness of the logistic curve # I fit the saturated model, just for fun. Obs saturated <- glm(response ~ 0 + factor(Obs), family=binomial(link='logit') ) deviance(saturated) anova(diff.slopes, saturated, test="Chisq") # Testing additivity, i.e. parallelism of the two curves in the lodds scale anova(parallel, diff.slopes, test="Chisq") # Ploting the observed proportions and the predicted for the two models # to get an intuition of what is happening. # pdf("Figure-Ch3-71.pdf") dose1 <- dose[variety==1] abscised1 <- abscised[variety==1] nplants1 <- nplants[variety==1] dose2 <- dose[variety==2] abscised2 <- abscised[variety==2] nplants2 <- nplants[variety==2] n <- length(dose1) plot(dose1+0.03*(runif(n)-0.5), abscised1/nplants1, cex=1.5,xlab="Dose(+noise)", ylab="Proportion abscised", pch=19, col='blue', ylim=range(abscised/nplants)) n <- length(dose2) points(dose2+0.03*(runif(n)-0.5), abscised2/nplants2, cex=1.5 , xlab="Dose", ylab="Proportion abscised", pch=19, col='red') fitted.coef <- coef(parallel) ; fitted.coef grid.points <- seq(from=min(dose), to=max(dose), by=0.0001); eta1 <- fitted.coef[1] + ( fitted.coef[3] * grid.points) eta2 <- fitted.coef[2] + ( fitted.coef[3] * grid.points) pred.prob1 <- ILogit(eta1); pred.prob1[1:10] pred.prob2 <- ILogit(eta2); pred.prob2 [1:10] points(grid.points, pred.prob1, type='l', lwd=3, col='blue') points(grid.points, pred.prob2, type='l', lwd=3, col='red') free.curve <- glm(response ~ 0 + factor(dose) : factor(variety), family=binomial(link='logit') ) coef(free.curve) free.lodds <- ILogit(coef(free.curve)) ; free.lodds points(levels(factor(dose)), free.lodds[1:7] , type='p', cex=2.6, col='blue') points(levels(factor(dose)), free.lodds[1:7] , type='p', cex=3 , col='blue') points(levels(factor(dose)), free.lodds[1:7] , type='p', cex=2.0, col='yellow', pch=19) points(levels(factor(dose)), free.lodds[8:14] , type='p', cex=2.6, col='red') points(levels(factor(dose)), free.lodds[8:14] , type='p', cex=3 , col='red') points(levels(factor(dose)), free.lodds[8:14] , type='p', cex=2.0, col='yellow', pch=19) # dev.off() ################################################################################ # # Analysis of the Bulls' fertility example # ################################################################################ FiguresOut <- "~/Dropbox/Courses/Basic Statistical Analysis in Life and Environmental Sciences/LectureNotes/" str(BullSeminalViabilityH) attach(BullSeminalViabilityH) table(Dose) summary(N) ( Proportions <- tapply(Alive, Dose, sum)/ tapply(N, Dose, sum) ) # pdf(paste(FiguresOut,"Figure-Ch3-Continuous-01.pdf", sep="")) plot(1:6, Proportions[1:6], type="b", pch=19, col="blue", ylim=c(0,1), lwd=3, xlab="Dose", ylab="Proportion of motile cells" ) # dev.off() Logit <- function(p) return(log(p/(1-p))) # pdf(paste(FiguresOut,"Figure-Ch3-Continuous-02.pdf", sep="")) plot(1:6, Logit(Proportions[1:6]), type="b", pch=19, col="blue", ylim=range(Logit(Proportions)), xlab="Dose", ylab="Log-odds of motile cells" ) # dev.off() resp <- cbind(Alive, N-Alive) resp[1:5, ] fit1 <- glm(resp ~ factor(Dose)+0, family=binomial(link="logit")) summary(fit1) # pdf(paste(FiguresOut,"Figure-Ch3-Continuous-03.pdf", sep="")) plot(fitted(fit1) , residuals(fit1,type="response"), ylab="Raw-residuals", xlab="Fitted values") abline(h=0); abline(v=0.5) # dev.off() # pdf(paste(FiguresOut,"Figure-Ch3-Continuous-04.pdf", sep="")) plot(fitted(fit1) , residuals(fit1,type="pearson"), ylab="Pearson-residuals", xlab="Fitted values") abline(h=0); abline(v=0.5) # dev.off() deviance(fit1) n.observations <- length(N) n.observations n.parameters.fit1 <- length(coef(fit1)) n.parameters.fit1 n.df.fit1 <- n.observations - n.parameters.fit1 n.df.fit1 # Testing the homogeneity of the means pchisq(deviance(fit1), df=n.df.fit1, lower.tail=F) fit2 <- glm(resp ~ Dose, family=binomial) # pdf(paste(FiguresOut,"Figure-Ch3-Continuous-05.pdf", sep="")) plot(factor(Dose) , residuals(fit2,type="response"), ylab="Raw-residuals", xlab="Dose", col="lightblue") abline(h=0) lines(lowess(x=Dose, y=residuals(fit2,type="response"),f = 1/2), lwd=6, col="red") points(Dose, residuals(fit2,type="response")) # dev.off() summary(fit2) # Performing a test anova(fit2, fit1, test="Chisq") # Fitting a third model fit3 <- glm(resp ~ 1, family=binomial) anova(fit3, fit2, test="Chisq") coef(fit2) confint(fit2) detach(BullSeminalViabilityH) rm(fit1, fit2, fit3, n.observations, n.parameters.fit1, n.df.fit1) ################################################################################ # # Two simultaneous regressions # ################################################################################ str(BullSeminalViabilityALL) attach(BullSeminalViabilityALL) # Exploratory analysis mean(N) table(Breed, Dose) Proportions <- tapply(Alive, list(Dose, Breed), sum)/ tapply(N, list(Dose, Breed), sum) Proportions # pdf(paste(FiguresOut,"Figure-Ch3-Continuous-06.pdf", sep="")) plot(1:6, Proportions[1:6], type="b", pch=19, col="blue", ylim=c(0,1), xlab="Dose", ylab="Proportion of motile cells", lwd=3 ) points(1:6, Proportions[7:12], type="b", pch=19, col="red", lwd=3) legend(x="topright", legend=c("H","J"), col=c("blue","red"), lwd=3, bty="n") # dev.off() Logit <- function(p) return(log(p/(1-p))) # pdf(paste(FiguresOut,"Figure-Ch3-Continuous-07.pdf", sep="")) plot(1:6, Logit(Proportions[1:6]), type="b", pch=19, col="blue", ylim=range(Logit(Proportions)), xlab="Dose", ylab="Log-odds of motile cells" ) points(1:6, Logit(Proportions[7:12]), type="b", pch=19, col="red") legend(x="topright", legend=c("H","J"), col=c("blue","red"), lwd=3, bty="n") # dev.off() # Fitting the first model resp <- cbind(Alive, N-Alive) resp[1:5, ] fit1 <- glm(resp ~ Breed*factor(Dose), family=binomial(link="logit")) summary(fit1) # pdf(paste(FiguresOut,"Figure-Ch3-Continuous-08.pdf", sep="")) plot(fitted(fit1) , residuals(fit1,type="response"), ylab="Raw-residuals", xlab="Fitted values") abline(h=0); abline(v=0.5) # dev.off() # pdf(paste(FiguresOut,"Figure-Ch3-Continuous-09.pdf", sep="")) plot(fitted(fit1) , residuals(fit1,type="pearson"), ylab="Pearson-residuals", xlab="Fitted values") abline(h=0); abline(v=0.5) # dev.off() deviance(fit1) n.observations <- length(N) n.observations n.parameters.fit1 <- length(coef(fit1)) n.parameters.fit1 n.df.fit1 <- n.observations - n.parameters.fit1 n.df.fit1 # Performing a test pchisq(deviance(fit1), df=n.df.fit1, lower.tail=F) # Fitting the second model with two logistic curves # using the direct parametrisation fit2 <- glm(resp ~ 0 + Breed - Dose + Breed:Dose, family=binomial) summary(fit2) # Fitting the second model with two logistic curves # using the standard (corner point) parametrisation fit2 <- glm(resp ~ Breed + Dose + Breed:Dose, family=binomial) summary(fit2) # Performing a test anova(fit2, fit1, test="Chisq") # Fitting an additive model (direct parametrisation) fit3 <- glm(resp ~ 0 + Breed + Dose, family=binomial) summary(fit3) # Fitting an additive model (corner point parametrisation) fit3 <- glm(resp ~ Breed + Dose, family=binomial) summary(fit3) # Performing a test anova(fit3, fit2, test="Chisq") # Fitting a fourth model fit4 <- glm(resp ~ Breed, family=binomial) summary(fit4) # Performing a test anova(fit4, fit3, test="Chisq") # Fitting a fifth model fit5 <- glm(resp ~ Dose, family=binomial) summary(fit5) # Performing a test anova(fit5, fit3, test="Chisq") # Fitting a sixth model fit6 <- glm(resp ~ 1, family=binomial) summary(fit6) # Performing a test anova(fit6, fit3, test="Chisq") # Performing another test anova(fit6, fit1, test="Chisq") # Summing up deviance(fit3) n.observations <- length(N) n.observations n.parameters.fit3 <- length(coef(fit3)) n.parameters.fit3 n.df.fit3 <- n.observations - n.parameters.fit3 n.df.fit3 # This gives the p-value of a joint test! pchisq(deviance(fit3), df=n.df.fit3, lower.tail=F) summary(fit3) # pdf(paste(FiguresOut,"Figure-Ch3-Continuous-10.pdf", sep="")) plot(1:6, Proportions[1:6], pch=19, col="blue", ylim=c(0,1), cex=1.5, xlab="Dose", ylab="" ) points(1:6, Proportions[7:12], pch=19, col="red", cex=1.5) coef(fit3) grid <- seq(from=1, to=6, by=0.1) responseH <- coef(fit3)[1] + coef(fit3)[3]*grid responseJ <- (coef(fit3)[1] + coef(fit3)[2]) + coef(fit3)[3]*grid iLogit <- function(L) return(exp(L)/(1+exp(L))) points(grid, iLogit(responseH), type="l", col="blue", lwd=3) points(grid, iLogit(responseJ), type="l", col="red", lwd=3) legend(x="topright", legend=c("H","J"), col=c("blue","red"), lwd=3, bty="n") # dev.off() detach(BullSeminalViabilityALL) rm(grid, responseH, responseJ, n.observations, n.parameters.fit1, Proportions, n.parameters.fit3, n.df.fit1, n.df.fit3, fit1, fit2, fit3, fit4, fit5) ################################################################################ # # THE END! # ################################################################################ # That is all. Bye, bye ....