################################################################################ # # # Suggestions for the analysis of the six questions of the exam of the course # Basic Statistical Analysis. # 2024 # # Remark: The codes below are just suggestions for the analyses related to the # examen. You are welcome to modify these analyses, add more analyses # incorporating further aspects or use your own codes. # You do not need to incorporate all the results produced here in your # report. # ################################################################################ # # ################################################################################ # rm(list = ls()) # This command will remove everything previously placed in the # workspace (not possible to recover latter!) ################################################################################ # ## Installing some useful packages: # # ## A function to check the presence of required packages # InsPack <- function (pack){ # if (!pack %in% installed.packages()) { # print(paste("installing", pack)) # install.packages(pack) # } # else print(paste(pack, "already installed. :-)")) # } # ## Installing some usefull (standard) packages # InsPack("postHoc") # InsPack("igraph") # InsPack("lme4") # InsPack("nlme") # InsPack("multcomp") # InsPack("lsmeans") # InsPack("permute") # InsPack("MASS") # ################################################################################ setwd("~/BSA") # This is where I placed the required files. # Modify that to point to the place in your computer where # the file "ExamBSA2024.RData" is placed. load(file="ExamBSA2024.RData") ################################################################################ # Question 1 - Binomial two-ways # ################################################################################ D <- Exam2024.01 str(D) table(D$TempReg, D$Incecticide) table(D$N) Proportions <- tapply(D$NumberDead, list(D$Incecticide, D$TempReg), sum)/ tapply(D$N, list(D$Incecticide, D$TempReg), sum) Proportions plot(1:6, Proportions[1:6], type="b", pch=19, col="blue", ylim=c(0,1), xlab="Incecticide", ylab="Proportion Dead" ) points(1:6, Proportions[7:12], type="b", pch=19, col="red") Logit <- function(p) return(log(p/(1-p))) plot(1:6, Logit(Proportions[1:6]), type="b", pch=19, col="blue", ylim=range(Logit(Proportions)), xlab="Herbicide", ylab="Log-odds of alive" ) points(1:6, Logit(Proportions[7:12]), type="b", pch=19, col="red") # Fitting the first model is.factor(D$TempReg) is.factor(D$Incecticide) resp <- cbind(D$NumberDead, D$N-D$NumberDead) resp[1:5, ] fit1 <- glm(resp ~ Incecticide:TempReg + 0, family=binomial(link="logit"), data = D) summary(fit1) plot(fitted(fit1) , residuals(fit1,type="response"), ylab="Raw-residuals", xlab="Fitted values") plot(fitted(fit1) , residuals(fit1,type="pearson"), ylab="Pearson-residuals", xlab="Fitted values") abline(h=0); abline(v=0.5) Nobs <- length(D$N); Perturbation <- runif(Nobs, min=-0.02, max=0.02) plot(fitted(fit1)+Perturbation, residuals(fit1,type="response"), ylab="Raw-residuals", xlab="Fitted values + Noise") abline(h=0); abline(v=0.5) plot(fitted(fit1)+Perturbation , residuals(fit1,type="pearson"), ylab="Pearson-residuals", xlab="Fitted values + Noise") abline(h=0); abline(v=0.5) deviance(fit1) ( n.observations <- length(D$N) ) n.parameters.fit1 <- length(coef(fit1)) n.parameters.fit1 n.df.fit1 <- n.observations - n.parameters.fit1 n.df.fit1 pchisq(deviance(fit1), df=n.df.fit1, lower.tail=F) fit2 <- glm(resp ~ Incecticide + TempReg, family=binomial(link="logit"), data = D) summary(fit2) anova(fit2, fit1, test="Chisq") fit3 <- glm(resp ~ Incecticide + 0, family=binomial(link="logit"), data = D) summary(fit3) anova(fit3, fit2, test="Chisq") fit4 <- glm(resp ~ TempReg + 0, family=binomial(link="logit"), data = D) summary(fit4) anova(fit4, fit2, test="Chisq") summary(fit2) fit20 <- glm(resp ~ 0 + Incecticide + TempReg, family=binomial(link="logit"), data = D) summary(fit20) deviance(fit2) deviance(fit20) plot(fitted(fit2), fitted(fit20)) abline(0,1) confint(fit20) library(postHoc) TT <- posthoc(Model = fit20, EffectIndices = 1:6, EffectLabels = levels(D$Isecticide)) print(TT) barplot(TT) lines(TT) plot(TT) ################################################################################ # Question 2 - Binomial regression ################################################################################ str(Exam2024.02) D <- Exam2024.02 Size <- 150 D$Kelvin <- D$Temperature + 273.15 ( Nobs <- length(D$Temperature) ) table(D$Temperature, D$Treatment) # A plot for illustrating the (non-linear) relationship between the # proportion and the temperature plot(D$Temperature[D$Treatment == "Scarified"], D$Germinated[D$Treatment == "Scarified"]/Size, col="red", pch=19, ylim=c(0,1), ylab="Proportion of germinated seeds", xlab="Temperature") points(D$Temperature[D$Treatment != "Scarified"], D$Germinated[D$Treatment != "Scarified"]/Size, col="blue", pch=19) lines(lowess(x = D$Temperature[D$Treatment == "Scarified"], y = D$Germinated[D$Treatment == "Scarified"]/Size), lwd=5, col="red") lines(lowess(x = D$Temperature[D$Treatment != "Scarified"], y = D$Germinated[D$Treatment != "Scarified"]/Size), lwd=5, col="blue") legend(x="bottomright", legend=c("Scarified", "NotScarified"), pch=19, col=c("red", "blue"), bty="n") # A plot for empirically checking the adjustment of an exponential law plot(D$Temperature[D$Treatment == "Scarified"], log(D$Germinated[D$Treatment == "Scarified"]/Size), col="red", pch=19, ylim = c(-4.5,0), ylab="Logarithm of the proportion of germinated seeds", xlab="Temperature") points(D$Temperature[D$Treatment !="Scarified"], log(D$Germinated[D$Treatment != "Scarified"]/Size), col="blue", pch=19) legend(x="bottomright", legend=c("Scarified", "NotScarified"), pch=19, col=c("red", "blue"), bty="n") lines(lowess(x = D$Temperature[D$Treatment == "Scarified"], y = log(D$Germinated[D$Treatment == "Scarified"]/Size), f=4/5), lwd=5, col="red") lines(lowess(x = D$Temperature[D$Treatment != "Scarified"], y = log(D$Germinated[D$Treatment != "Scarified"]/Size), f=4/5), lwd=5, col="blue") # A plot for empirically checking the adjustment of the Arrhenius equation plot((1/D$Kelvin[D$Treatment == "Scarified"]), log(D$Germinated[D$Treatment == "Scarified"]/Size), col="red", pch=19, ylim = c(-4.5,0), # ylim=c(-4, max(log(D$Luminecent/20))), ylab="Logarithm of the proportion of germinated seeds", xlab="1/Absolute Temperature" ) lines(lowess(x = 1/D$Kelvin[D$Treatment == "Scarified"], y = log(D$Germinated[D$Treatment == "Scarified"]/Size)), lwd=5, col="red") points((1/D$Kelvin[D$Treatment != "Scarified"]), log(D$Germinated[D$Treatment != "Scarified"]/Size), col="blue", pch=19) lines(lowess(x = 1/D$Kelvin[D$Treatment != "Scarified"], y = log(D$Germinated[D$Treatment != "Scarified"]/Size)), lwd=5, col="blue") legend(x="bottomleft", legend=c("Scarified", "NotScarified"), pch=19, col=c("red", "blue"), bty="n") # Fitting a first model and performing some basic residual analysis fit1 <- glm(cbind(Germinated, Size-Germinated) ~ factor(Temperature)*Treatment, family=binomial(link="logit"), data = D) Fitted <- fitted(fit1) par(mfrow = c(2,1)) RawResiduals <- residuals(fit1, "response") plot(Fitted+runif(Nobs,min=-.01,max=.01), RawResiduals, pch=19) abline(h=0) PearsonResiduals <- residuals(fit1, "pearson") plot(Fitted+runif(Nobs,min=-.01,max=.01), PearsonResiduals, pch=19) abline(h=0) par(mfrow = c(1,1)) # Performing a test deviance(fit1) ( DF <- length(D$Temperature) - length(coef(fit1))) pchisq(deviance(fit1), df=DF, lower.tail=F) # Fitting another model and making some residual analysis fit2 <- glm(cbind(Germinated, Size-Germinated) ~ Temperature*Treatment, family=binomial(link="log"), data = D) RawResiduals <- residuals(fit2, "response") plot(D$Temperature[D$Treatment == "Scarified"], RawResiduals[D$Treatment == "Scarified"], pch=19, col="red", cex=0.7, xlab="Temperature ", ylab="Raw Residuals") abline(h=0) lines(lowess(x=D$Temperature[D$Treatment == "Scarified"], y=RawResiduals[D$Treatment == "Scarified"], f = 1/3), lwd=5, col="red") points(D$Temperature[D$Treatment != "Scarified"] +runif(Nobs/2,min=-0.2,max=0), RawResiduals[D$Treatment != "Scarified"], pch=19, col="blue", cex=0.7) lines(lowess(x=D$Temperature[D$Treatment != "Scarified"], y=RawResiduals[D$Treatment != "Scarified"], f = 1/3), lwd=5, col="blue") # Performing a test anova(fit2,fit1, test="Chisq") # Fitting even another empirical model and making some residual analysis fit2b <- glm(cbind(Germinated, Size-Germinated) ~ log(Temperature)*Treatment, family=binomial(link="logit"), data = D) RawResiduals <- residuals(fit2b, "response") plot(D$Temperature[D$Treatment == "Scarified"], RawResiduals[D$Treatment == "Scarified"], pch=19, col="red", cex=0.7, xlab="1/ Absolute Temperature (plus noise)", ylab="Raw Residuals", ylim=range(RawResiduals)) abline(h=0) lines(lowess(x=D$Temperature[D$Treatment == "Scarified"], y=RawResiduals[D$Treatment == "Scarified"]), lwd=5, col="red") points(D$Temperature[D$Treatment != "Scarified"], RawResiduals[D$Treatment != "Scarified"], pch=19, col="blue", cex=0.7 ) lines(lowess(x=D$Temperature[D$Treatment != "Scarified"], y=RawResiduals[D$Treatment != "Scarified"]), lwd=5, col="blue") anova(fit2b, fit1, test = "Chisq") # Adjusting another model (based on the Arrhenius equation) D$iTemp <- 1/D$Kelvin fit3 <- glm(cbind(Germinated, Size-Germinated) ~ Treatment:iTemp + Treatment + 0, family=binomial(link="log"), data = D) RawResiduals <- residuals(fit3, "response") plot(D$iTemp[D$Treatment == "Scarified"], RawResiduals[D$Treatment == "Scarified"], pch=19, col="red", cex=0.7, xlab="1/ Absolute Temperature (plus noise)", ylab="Raw Residuals", ylim=range(RawResiduals)) abline(h=0) lines(lowess(x=D$iTemp[D$Treatment == "Scarified"], y=RawResiduals[D$Treatment == "Scarified"]), lwd=5, col="red") points(D$iTemp[D$Treatment != "Scarified"], RawResiduals[D$Treatment != "Scarified"], pch=19, col="blue", cex=0.7 ) lines(lowess(x=D$iTemp[D$Treatment != "Scarified"], y=RawResiduals[D$Treatment != "Scarified"]), lwd=5, col="blue") # Performing a test anova(fit3,fit1, test="Chisq") summary(fit3) # Fitting a model fit4 <- glm(cbind(Germinated, Size-Germinated) ~ iTemp + Treatment + 0, family=binomial(link="log"), data = D) summary(fit4) # Performing a test anova(fit4, fit3, test="Chisq") # Fitting a model fit5 <- glm(cbind(Germinated, Size-Germinated) ~ iTemp:Treatment , family=binomial(link="log"), data = D) summary(fit5) # Performing a test anova(fit5, fit1, test="Chisq") summary(fit3) cbind(coef(fit3)[c(1,2)], confint(fit3)[c(1,2), ] ) R <- 1.9872036/1000 # Universal gas constant in kcal⋅K−1⋅mol−1 cbind(coef(fit3)[c(3,4)], confint(fit3)[c(3,4), ] )*(-R) # Do we have protein transconformation involved in the signaling process # occurred in the testa of the seed (which is removed when scarifying)? ################################################################################ # Question 3 - Poisson one-way ################################################################################ str(Exam2023.03) D <- Exam2023.03 table(D$Era) # Studying the relation between the means and the variances ( Means <- tapply(D$NumberPGrains, D$Era, mean) ) ( Variances <- tapply(D$NumberPGrains, D$Era, var) ) plot(Means, Variances, pch=19) abline(0,1) summary(lm(Variances ~ Means )) abline(coef(lm(Variances ~ Means )), lty=2) coef(lm(Variances ~ Means )) ### Simulating to get intuition par(mfrow=c(3,3)) plot(Means, Variances, pch=19, main="Observed Data") abline(0,1) summary(lm(Variances ~ Means )) abline(coef(lm(Variances ~ Means )), lty=2) X <- numeric(length(D$NumberPGrains)) for (j in 1:8){ i <- 0 for (r in 1:20){ for(k in 1:6){ i <- i + 1 X[i] <- rpois(1,lambda=Means[k]) } } MeansSim <- tapply(X, D$Era, mean) VariancesSim <- tapply(X, D$Era, var) plot(MeansSim, VariancesSim, pch=19, main="Simulated Data", xlab="Means", ylab="Variances") abline(0,1) abline(coef(lm(Variances ~ Means )), lty=2) } par(mfrow=c(1,1)) ### # Exploratory analysis plot(D$Era, D$NumberPGrains, col="lightblue") # Fitting a first model fit1 <- glm(NumberPGrains ~ Era, family=poisson(link="identity"), data = D) summary(fit1) deviance(fit1) ( n.observations <- length(D$NumberPGrains) ) ( n.parameters.fit1 <- length(coef(fit1)) ) ( n.df.fit1 <- n.observations - n.parameters.fit1 ) # Performing a test pchisq(deviance(fit1), df=n.df.fit1, lower.tail=F) # Making some basic residual analysis plot(fitted(fit1)+ runif(n.observations, min=-0.3, max=0.3) , residuals(fit1,type="response"), ylab="Raw-residuals", xlab="Fitted values", pch=19) abline(h=0) plot(fitted(fit1)+ runif(n.observations, min=-0.3, max=0.3) , residuals(fit1,type="pearson"), ylab="Pearson-residuals", xlab="Fitted values", pch=19) abline(h=0); # Fitting a second model fit2 <- glm(NumberPGrains ~ 1, family=poisson(link="identity"), data = D) summary(fit2) # Performing a test anova(fit2, fit1, test="Chisq") # Summing up fit1b <- glm(NumberPGrains ~ Era + 0 , family=poisson(link="identity"), data = D) deviance(fit1b) deviance(fit1) summary(fit1b) cbind(coef(fit1b, confint(fit1b)) ) library(postHoc) TT <- posthoc(Model=fit1b, EffectLabels = levels(D$Era), digits = 2) plot(TT) barplot(TT) lines(TT) print(TT) ################################################################################ # Question 4 - - Poisson regression ################################################################################ str(Exam2024.04) D <- Exam2024.04 table(D$gSoil, D$Species) ( Nobs <- length(D$Counts) ) ( Means <- tapply(D$Counts, list(D$gSoil, D$Species), mean) ) Means <- as.numeric(Means) ( Variances <- tapply(D$Counts, list(D$gSoil, D$Species), var) ) Variances <- as.numeric(Variances) plot(Means, Variances) abline(0,1) abline(coef(lm(Variances~Means)), lty=2) plot(D$gSoil, D$Counts, pch=19 , col=c(rep("red",100), rep("blue",100))) plot(D$gSoil + runif(Nobs, min=-0.00002, max=0.00002), D$Counts, col=c(rep("red",100), rep("blue",100))) # Fiting a first model and performing some basic residual analyses fit0 <- glm(Counts ~ factor(gSoil)*Species, family=poisson(link="identity"), data = D) plot(fitted(fit0), residuals(fit0, "response")) abline(h=0) plot(fitted(fit0), residuals(fit0, "pearson")) abline(h=0) Noise <- runif(length(D$Counts), min = -0.3, max = 0.3) plot(fitted(fit0)+Noise, residuals(fit0, "response")) abline(h=0) plot(fitted(fit0)+Noise, residuals(fit0, "pearson")) abline(h=0) deviance(fit0) ( DF <- Nobs - length(coef(fit0)) ) deviance(fit0)/DF # Performing a test pchisq(deviance(fit0), DF, lower.tail=F) # Fiting another model and performing some basic residual analyses and tests D$gSoil2 <- D$gSoil**2 fit1 <- glm(Counts ~ gSoil:Species + gSoil2:Species - 1, family=poisson(link="identity"), data = D) summary(fit1) confint(fit1) anova(fit1, fit0, test="Chisq") plot(D$gSoil, residuals(fit1, "response")) abline(h=0) lines( lowess(x=D$gSoil, y=residuals(fit1, "response")), lwd=2, col="blue" ) summary(fit1) # Fiting another model and performing some basic residual analyses and tests fit2 <- glm(Counts ~ gSoil*Species , family=poisson(link="identity"), data = D) # Alternatively (optional) # fit2 <- glm(Counts ~ gSoil:Species + Species + 0, family=poisson(link="identity")) summary(fit2) confint(fit2) anova(fit2, fit0, test="Chisq") plot(D$gSoil, residuals(fit2, "response")) abline(h=0) lines( lowess(x=D$gSoil, y=residuals(fit2, "response")), lwd=2, col="blue" ) # Fiting another model and performing some basic residual analyses and tests fit3 <- glm(Counts ~ gSoil:Species - 1, family=poisson(link="identity"), data = D) summary(fit3) confint(fit3) anova(fit3, fit1, test="Chisq") plot(D$gSoil, residuals(fit3, "response")) abline(h=0) lines( lowess(x=D$gSoil, y=residuals(fit3, "response")), lwd=2, col="blue" ) # Fiting another model fit4 <- glm(Counts ~ gSoil:Species + gSoil2 - 1, family=poisson(link="identity"), data = D) summary(fit4) confint(fit4) anova(fit4, fit3, test="Chisq") summary(fit4) # Fiting another model fit5 <- glm(Counts ~ gSoil + gSoil2:Species - 1, family=poisson(link="identity"), data = D) summary(fit5) anova(fit5, fit1, test="Chisq") rm(Nobs, Means, Variances, fit1, fit2, fit3, fit4) ################################################################################ # Question 5 - Normal two-ways ################################################################################ str(Exam2024.05) D <- Exam2024.05 # Some exploratory analyses table(D$Treat, D$Cultivar) ( Means <- tapply(D$NO3, list(D$Treat, D$Cultivar), mean) ) plot(D$Cultivar, D$NO3, col=D$Treat) points(1:4, Means[1,], col=1, type="b", pch=19) points(1:4, Means[2,], col=2, type="b", pch=19) points(1:4, Means[3,], col=3, type="b", pch=19) # Fitting a first model D$Cultivar <- factor(D$Cultivar) D$Treat <- factor(D$Treat) fit1 <- glm (NO3 ~ Cultivar + Treat + Cultivar:Treat, family=gaussian(link="identity"), data = D) summary(fit1) # Some residual analysis qqnorm(residuals(fit1,type="response")) qqline(residuals(fit1,type="response")) shapiro.test(residuals(fit1,type="response")) plot(fitted(fit1) , residuals(fit1,type="response"), ylab="Raw-residuals", xlab="Fitted values") abline(h=0); bartlett.test(residuals(fit1,type="response") ~ interaction(D$Cultivar,D$Treat) ) plot(factor(fitted(fit1)) , residuals(fit1,type="response"), ylab="Raw-residuals", xlab="Fitted values") abline(h=0); # Fitting a second model fit2 <- glm (NO3 ~ Cultivar + Treat, family=gaussian(link="identity"), data = D) summary(fit2) anova(fit2, fit1, test="F") # Fitting a third model fit3 <- glm (NO3 ~ Cultivar, family=gaussian(link="identity"), data = D) summary(fit3) anova(fit3, fit2, test="F") # Fitting a fourth model fit4 <- glm (NO3 ~ Treat, family=gaussian(link="identity"), data = D) summary(fit4) anova(fit4, fit2, test="F") confint(fit4) fit20 <- glm (NO3 ~ Cultivar + Treat + 0 , family=gaussian(link="identity"), data = D) summary(fit20) library(postHoc) TT <- posthoc(Model = fit20, EffectIndices = 1:4, digits = 2) print(TT) barplot(TT) lines(TT) plot(TT) fit20b <- glm (NO3 ~ Treat + Cultivar + 0 , family=gaussian(link="identity"), data = D) summary(fit20b) TT <- posthoc(Model = fit20b, EffectIndices = 1:3, digits = 2) print(TT) barplot(TT) lines(TT) plot(TT) ################################################################################ # Question 6 - Normal regression ################################################################################ str(Exam2024.06) D <- Exam2024.06 table(D$Months) plot(D$Months, D$PPM) plot(D$Months, log(D$PPM)) # Fitting a first model fit0 <- glm(PPM ~ factor(Months) + 0, family=gaussian(link="log"), data = D) summary(fit0) # Some residual analysis qqnorm(residuals(fit0,type="response")) qqline(residuals(fit0,type="response")) # Testing normality shapiro.test(residuals(fit0,type="response")) plot(fitted(fit0) , residuals(fit0,type="response"), ylab="Raw-residuals", xlab="Fitted values") abline(h=0); # Testing variance homogeneity bartlett.test(PPM ~ factor(Months), data = D) plot(factor(fitted(fit0)) , residuals(fit0,type="response"), ylab="Raw-residuals", xlab="Fitted values") abline(h=0); # Another model fit1 <- glm(PPM ~ Months, family=gaussian(link="log"), data = D) summary(fit1) anova(fit0, fit1, test="F") # Some residual analysis qqnorm(residuals(fit1,type="response")) qqline(residuals(fit1,type="response")) plot(fitted(fit1) , residuals(fit1,type="response"), ylab="Raw-residuals", xlab="Fitted values") abline(h=0); plot(D$Months , residuals(fit1,type="response"), ylab="Raw-residuals", xlab="Months") abline(h=0); lines(lowess(x=D$Months, y=residuals(fit1,type="response") ), lwd=4, col="blue" ) cbind(coef(fit1), confint(fit1)) # OPTIONAL challenger # Another common, but problematic solution, is to logarithmically transform # the responses and fit a linear regression. # Observe the lack of adequacy of this model. fitLog <- lm(log(PPM) ~ Months, data = D) qqnorm(residuals(fitLog,type="response")) qqline(residuals(fitLog,type="response")) plot(fitted(fitLog), residuals(fitLog,type="response")) abline(h = 0) summary(fitLog) # Compare ... coef(fitLog) coef(fit1) plot(fitted(fit1), exp(fitted(fitLog))) abline(0,1) table( fitted(fit1) == exp(fitted(fitLog)) ) summary(fitted(fit1) - exp(fitted(fitLog))) # END OF THE OPTIONAL challenger ################################################################################ # # THE END # ################################################################################