################################################################################
#
# 
# Suggestions for the analysis of the six questions of the exam of the course
# Basic Statistical Analysis.
# 2024   
#
# Remark: The codes below are just suggestions for the analyses related to the
#         examen. You are welcome to modify these analyses, add more analyses
#         incorporating further aspects or use your own codes.
#         You do not need to incorporate all the results produced here in your
#         report.
#
################################################################################
#
#
################################################################################

# rm(list = ls()) # This command will remove everything previously placed in the
# workspace (not possible to recover latter!)

################################################################################

# ## Installing some useful packages:
#  
# ## A function to check the presence of required packages 
# InsPack <- function (pack){
#   if (!pack %in% installed.packages()) {
#   print(paste("installing", pack))
#   install.packages(pack)
#  }
#  else print(paste(pack, "already installed. :-)"))
# }
#   
## Installing some usefull (standard) packages
  # InsPack("postHoc")
  # InsPack("igraph")
  # InsPack("lme4")
  # InsPack("nlme")
  # InsPack("multcomp")
  # InsPack("lsmeans")
  # InsPack("permute")
  # InsPack("MASS")
# 
################################################################################

setwd("~/BSA") # This is where I placed the required files.
# Modify that to point to the place in your computer where 
# the file "ExamBSA2024.RData" is placed.

load(file="ExamBSA2024.RData")


################################################################################
# Question 1 - Binomial two-ways                                               #
################################################################################

D <- Exam2024.01
str(D)

table(D$TempReg, D$Incecticide)

table(D$N)
Proportions <- tapply(D$NumberDead, list(D$Incecticide, D$TempReg), sum)/
  tapply(D$N, list(D$Incecticide, D$TempReg), sum)

Proportions

plot(1:6, Proportions[1:6], type="b", pch=19, col="blue", ylim=c(0,1),
     xlab="Incecticide", ylab="Proportion Dead"
)
points(1:6, Proportions[7:12], type="b", pch=19, col="red")

Logit <- function(p) return(log(p/(1-p)))

plot(1:6, Logit(Proportions[1:6]), type="b", pch=19, col="blue", 
     ylim=range(Logit(Proportions)),
     xlab="Herbicide", ylab="Log-odds of alive"
)
points(1:6, Logit(Proportions[7:12]), type="b", pch=19, col="red")

# Fitting the first model
is.factor(D$TempReg)
is.factor(D$Incecticide)

resp <- cbind(D$NumberDead, D$N-D$NumberDead)

resp[1:5, ]

fit1 <- glm(resp ~ Incecticide:TempReg + 0, 
            family=binomial(link="logit"), data = D) 

summary(fit1)

plot(fitted(fit1) , residuals(fit1,type="response"),		 
     ylab="Raw-residuals", xlab="Fitted values")

plot(fitted(fit1) , residuals(fit1,type="pearson"),		 
     ylab="Pearson-residuals", xlab="Fitted values")
abline(h=0); abline(v=0.5)

Nobs <- length(D$N); Perturbation <- runif(Nobs, min=-0.02, max=0.02)
plot(fitted(fit1)+Perturbation, residuals(fit1,type="response"),		 
     ylab="Raw-residuals", xlab="Fitted values + Noise")
abline(h=0); abline(v=0.5)

plot(fitted(fit1)+Perturbation , residuals(fit1,type="pearson"),		 
     ylab="Pearson-residuals", xlab="Fitted values + Noise")
abline(h=0); abline(v=0.5)

deviance(fit1)
( n.observations <- length(D$N) )

n.parameters.fit1 <- length(coef(fit1))
n.parameters.fit1

n.df.fit1 <- n.observations - n.parameters.fit1
n.df.fit1

pchisq(deviance(fit1), df=n.df.fit1, lower.tail=F)

fit2 <- glm(resp ~ Incecticide + TempReg, family=binomial(link="logit"), data = D)

summary(fit2)

anova(fit2, fit1, test="Chisq")

fit3 <- glm(resp ~ Incecticide + 0, family=binomial(link="logit"), data = D)

summary(fit3)

anova(fit3, fit2, test="Chisq")

fit4 <- glm(resp ~ TempReg + 0, family=binomial(link="logit"), data = D)

summary(fit4)

anova(fit4, fit2, test="Chisq")

summary(fit2)

fit20 <- glm(resp ~ 0 + Incecticide + TempReg, family=binomial(link="logit"), data = D)

summary(fit20)

deviance(fit2)

deviance(fit20)

plot(fitted(fit2), fitted(fit20))
abline(0,1)

confint(fit20)

library(postHoc)

TT <- posthoc(Model = fit20, EffectIndices = 1:6, 
              EffectLabels = levels(D$Isecticide))
print(TT)
barplot(TT)
lines(TT)
plot(TT)


################################################################################
# Question 2 - Binomial regression                                            
################################################################################

str(Exam2024.02)
D <- Exam2024.02
Size <- 150
D$Kelvin <- D$Temperature + 273.15

( Nobs <- length(D$Temperature) )

table(D$Temperature, D$Treatment)

# A plot for illustrating the (non-linear) relationship between the
# proportion and the temperature
plot(D$Temperature[D$Treatment == "Scarified"],  
     D$Germinated[D$Treatment == "Scarified"]/Size, 
     col="red", pch=19, ylim=c(0,1),
     ylab="Proportion of germinated seeds",
     xlab="Temperature")
points(D$Temperature[D$Treatment != "Scarified"], 
       D$Germinated[D$Treatment != "Scarified"]/Size, col="blue", pch=19)
lines(lowess(x = D$Temperature[D$Treatment == "Scarified"], 
             y = D$Germinated[D$Treatment == "Scarified"]/Size), 
      lwd=5, col="red")
lines(lowess(x = D$Temperature[D$Treatment != "Scarified"], 
             y = D$Germinated[D$Treatment != "Scarified"]/Size), 
      lwd=5, col="blue")

legend(x="bottomright", legend=c("Scarified", "NotScarified"), 
       pch=19, col=c("red", "blue"), bty="n")

# A plot for empirically checking the adjustment of an exponential law
plot(D$Temperature[D$Treatment == "Scarified"], 
     log(D$Germinated[D$Treatment == "Scarified"]/Size), 
     col="red", pch=19, ylim = c(-4.5,0),
     ylab="Logarithm of the proportion of germinated seeds",
     xlab="Temperature")

points(D$Temperature[D$Treatment !="Scarified"],
       log(D$Germinated[D$Treatment != "Scarified"]/Size), 
       col="blue", pch=19)
legend(x="bottomright", legend=c("Scarified", "NotScarified"), 
       pch=19, col=c("red", "blue"), bty="n")
lines(lowess(x = D$Temperature[D$Treatment == "Scarified"], 
             y = log(D$Germinated[D$Treatment == "Scarified"]/Size), f=4/5), 
      lwd=5, col="red")
lines(lowess(x = D$Temperature[D$Treatment != "Scarified"], 
             y = log(D$Germinated[D$Treatment != "Scarified"]/Size), f=4/5),
      lwd=5, col="blue")

# A plot for empirically checking the adjustment of the Arrhenius equation
plot((1/D$Kelvin[D$Treatment == "Scarified"]),  
     log(D$Germinated[D$Treatment == "Scarified"]/Size), 
     col="red", pch=19, ylim = c(-4.5,0),
     # ylim=c(-4, max(log(D$Luminecent/20))),
     ylab="Logarithm of the proportion of germinated seeds",
     xlab="1/Absolute Temperature"
)
lines(lowess(x = 1/D$Kelvin[D$Treatment == "Scarified"], 
             y = log(D$Germinated[D$Treatment == "Scarified"]/Size)), 
      lwd=5, col="red")
points((1/D$Kelvin[D$Treatment != "Scarified"]), 
       log(D$Germinated[D$Treatment != "Scarified"]/Size), 
       col="blue", pch=19)
lines(lowess(x = 1/D$Kelvin[D$Treatment != "Scarified"], 
             y = log(D$Germinated[D$Treatment != "Scarified"]/Size)), 
      lwd=5, col="blue")

legend(x="bottomleft", legend=c("Scarified", "NotScarified"),
       pch=19, col=c("red", "blue"), bty="n")

# Fitting a first model and performing some basic residual analysis

fit1 <- glm(cbind(Germinated, Size-Germinated) ~ factor(Temperature)*Treatment, 
            family=binomial(link="logit"), data = D)

Fitted <- fitted(fit1)
par(mfrow = c(2,1))
RawResiduals <- residuals(fit1, "response")
plot(Fitted+runif(Nobs,min=-.01,max=.01), RawResiduals, pch=19)
abline(h=0)

PearsonResiduals <- residuals(fit1, "pearson")
plot(Fitted+runif(Nobs,min=-.01,max=.01), PearsonResiduals, pch=19)
abline(h=0)
par(mfrow = c(1,1))

# Performing a test
deviance(fit1)
( DF <- length(D$Temperature) - length(coef(fit1)))
pchisq(deviance(fit1), df=DF, lower.tail=F)

# Fitting another model and making some residual analysis
fit2 <- glm(cbind(Germinated, Size-Germinated) ~  Temperature*Treatment, 
            family=binomial(link="log"), data = D)
RawResiduals <- residuals(fit2, "response")

plot(D$Temperature[D$Treatment == "Scarified"], 
     RawResiduals[D$Treatment == "Scarified"], pch=19, col="red", cex=0.7,
     xlab="Temperature ", ylab="Raw Residuals") 
abline(h=0)

lines(lowess(x=D$Temperature[D$Treatment == "Scarified"], 
             y=RawResiduals[D$Treatment == "Scarified"], f = 1/3),
      lwd=5, col="red")
points(D$Temperature[D$Treatment != "Scarified"] +runif(Nobs/2,min=-0.2,max=0), 
       RawResiduals[D$Treatment != "Scarified"], pch=19, col="blue", cex=0.7)
lines(lowess(x=D$Temperature[D$Treatment != "Scarified"], 
             y=RawResiduals[D$Treatment != "Scarified"], f = 1/3), 
      lwd=5, col="blue")

# Performing a test
anova(fit2,fit1, test="Chisq")

# Fitting even another empirical model and making some residual analysis
fit2b <- glm(cbind(Germinated, Size-Germinated) ~ log(Temperature)*Treatment, 
             family=binomial(link="logit"), data = D)
RawResiduals <- residuals(fit2b, "response")
plot(D$Temperature[D$Treatment == "Scarified"], 
     RawResiduals[D$Treatment == "Scarified"], pch=19, col="red", cex=0.7,
     xlab="1/ Absolute Temperature (plus noise)", ylab="Raw Residuals",
     ylim=range(RawResiduals))
abline(h=0)
lines(lowess(x=D$Temperature[D$Treatment == "Scarified"], 
             y=RawResiduals[D$Treatment == "Scarified"]), 
      lwd=5, col="red")
points(D$Temperature[D$Treatment != "Scarified"], 
       RawResiduals[D$Treatment != "Scarified"],
       pch=19, col="blue", cex=0.7
)
lines(lowess(x=D$Temperature[D$Treatment != "Scarified"], 
             y=RawResiduals[D$Treatment != "Scarified"]), 
      lwd=5, col="blue")

anova(fit2b, fit1, test = "Chisq")

# Adjusting another model (based on the Arrhenius equation)
D$iTemp <- 1/D$Kelvin
fit3 <- glm(cbind(Germinated, Size-Germinated) ~ Treatment:iTemp + Treatment + 0, 
            family=binomial(link="log"), data = D)
RawResiduals <- residuals(fit3, "response")

plot(D$iTemp[D$Treatment == "Scarified"], 
     RawResiduals[D$Treatment == "Scarified"], pch=19, col="red", cex=0.7,
     xlab="1/ Absolute Temperature (plus noise)", ylab="Raw Residuals",
     ylim=range(RawResiduals))
abline(h=0)
lines(lowess(x=D$iTemp[D$Treatment == "Scarified"], 
             y=RawResiduals[D$Treatment == "Scarified"]), 
      lwd=5, col="red")
points(D$iTemp[D$Treatment != "Scarified"], 
       RawResiduals[D$Treatment != "Scarified"],
       pch=19, col="blue", cex=0.7
)
lines(lowess(x=D$iTemp[D$Treatment != "Scarified"], 
             y=RawResiduals[D$Treatment != "Scarified"]), 
      lwd=5, col="blue")

# Performing a test
anova(fit3,fit1, test="Chisq")

summary(fit3)

# Fitting a model
fit4 <- glm(cbind(Germinated, Size-Germinated) ~ iTemp + Treatment  + 0, 
            family=binomial(link="log"), data = D) 

summary(fit4)

# Performing a test
anova(fit4, fit3, test="Chisq")

# Fitting a model
fit5 <- glm(cbind(Germinated, Size-Germinated) ~ iTemp:Treatment , 
            family=binomial(link="log"), data = D)

summary(fit5)

# Performing a test
anova(fit5, fit1, test="Chisq")

summary(fit3)

cbind(coef(fit3)[c(1,2)], confint(fit3)[c(1,2), ] )
R <- 1.9872036/1000  # Universal gas constant in kcal⋅K−1⋅mol−1
cbind(coef(fit3)[c(3,4)], confint(fit3)[c(3,4), ] )*(-R)

# Do we have protein transconformation involved in the signaling process
# occurred in the testa of the seed (which is removed when scarifying)?

################################################################################
# Question 3 - Poisson one-way
################################################################################

str(Exam2023.03)

D <- Exam2023.03

table(D$Era)

# Studying the relation between the means and the variances
( Means <- tapply(D$NumberPGrains, D$Era, mean) )
( Variances <- tapply(D$NumberPGrains, D$Era, var) )

plot(Means, Variances, pch=19)
abline(0,1)
summary(lm(Variances ~ Means ))
abline(coef(lm(Variances ~ Means )), lty=2)
coef(lm(Variances ~ Means ))

### Simulating to get intuition
par(mfrow=c(3,3))
plot(Means, Variances, pch=19, main="Observed Data")
abline(0,1)
summary(lm(Variances ~ Means ))
abline(coef(lm(Variances ~ Means )), lty=2)

X <- numeric(length(D$NumberPGrains))
for (j in 1:8){ 
  i <- 0
  for (r in 1:20){
    for(k in 1:6){
      i <- i + 1
      X[i] <- rpois(1,lambda=Means[k])
    }
  }
  MeansSim <- tapply(X, D$Era, mean)
  VariancesSim <- tapply(X, D$Era, var)
  plot(MeansSim, VariancesSim, pch=19, main="Simulated Data",
       xlab="Means", ylab="Variances")
  abline(0,1)
  abline(coef(lm(Variances ~ Means )), lty=2)
}

par(mfrow=c(1,1))

###

# Exploratory analysis

plot(D$Era, D$NumberPGrains, col="lightblue")

# Fitting  a first model

fit1 <- glm(NumberPGrains ~ Era, family=poisson(link="identity"), data = D)

summary(fit1)

deviance(fit1)

( n.observations <- length(D$NumberPGrains) )

( n.parameters.fit1 <- length(coef(fit1)) )

( n.df.fit1 <- n.observations - n.parameters.fit1 )

# Performing a test
pchisq(deviance(fit1), df=n.df.fit1, lower.tail=F)

# Making some basic residual analysis
plot(fitted(fit1)+ runif(n.observations, min=-0.3, max=0.3)
     , residuals(fit1,type="response"),		 
     ylab="Raw-residuals", xlab="Fitted values", pch=19)
abline(h=0)

plot(fitted(fit1)+ runif(n.observations, min=-0.3, max=0.3) , 
     residuals(fit1,type="pearson"),		 
     ylab="Pearson-residuals", xlab="Fitted values", pch=19)
abline(h=0); 

# Fitting a second model 

fit2 <- glm(NumberPGrains ~ 1, family=poisson(link="identity"), data = D)

summary(fit2)

# Performing a test
anova(fit2, fit1, test="Chisq")

# Summing up
fit1b <- glm(NumberPGrains ~ Era + 0 , 
             family=poisson(link="identity"), data = D)

deviance(fit1b)

deviance(fit1)

summary(fit1b)

cbind(coef(fit1b, confint(fit1b)) )

library(postHoc)
TT <- posthoc(Model=fit1b, EffectLabels = levels(D$Era), digits = 2)

plot(TT)
barplot(TT)
lines(TT)
print(TT)

################################################################################
# Question 4 - - Poisson regression
################################################################################

str(Exam2024.04)

D <- Exam2024.04

table(D$gSoil, D$Species)

( Nobs <- length(D$Counts) )

( Means <- tapply(D$Counts, list(D$gSoil, D$Species), mean) )
Means <- as.numeric(Means)
( Variances <- tapply(D$Counts, list(D$gSoil, D$Species), var) )
Variances <- as.numeric(Variances)

plot(Means, Variances)
abline(0,1)
abline(coef(lm(Variances~Means)), lty=2)

plot(D$gSoil, D$Counts, pch=19 , col=c(rep("red",100), rep("blue",100)))

plot(D$gSoil + runif(Nobs, min=-0.00002, max=0.00002), D$Counts,
     col=c(rep("red",100), rep("blue",100)))

# Fiting a first model and performing some basic residual analyses
fit0 <- glm(Counts ~ factor(gSoil)*Species, 
            family=poisson(link="identity"), data = D)

plot(fitted(fit0), residuals(fit0, "response"))
abline(h=0)

plot(fitted(fit0), residuals(fit0, "pearson"))
abline(h=0)

Noise <- runif(length(D$Counts), min = -0.3, max = 0.3)
plot(fitted(fit0)+Noise, residuals(fit0, "response"))
abline(h=0)

plot(fitted(fit0)+Noise, residuals(fit0, "pearson"))
abline(h=0)

deviance(fit0)

( DF <- Nobs - length(coef(fit0)) )

deviance(fit0)/DF

# Performing a test
pchisq(deviance(fit0), DF, lower.tail=F)

# Fiting another model and performing some basic residual analyses and tests
D$gSoil2 <- D$gSoil**2
fit1 <- glm(Counts ~ gSoil:Species + gSoil2:Species - 1, 
            family=poisson(link="identity"), data = D)

summary(fit1)

confint(fit1)

anova(fit1, fit0, test="Chisq")

plot(D$gSoil, residuals(fit1, "response"))
abline(h=0)
lines(
  lowess(x=D$gSoil, y=residuals(fit1, "response")),
  lwd=2, col="blue"
)

summary(fit1)

# Fiting another model and performing some basic residual analyses and tests
fit2 <- glm(Counts ~ gSoil*Species  , 
            family=poisson(link="identity"), data = D)

# Alternatively (optional) 
# fit2 <- glm(Counts ~ gSoil:Species  + Species + 0, family=poisson(link="identity"))

summary(fit2)

confint(fit2)

anova(fit2, fit0, test="Chisq")

plot(D$gSoil, residuals(fit2, "response"))
abline(h=0)
lines(
  lowess(x=D$gSoil, y=residuals(fit2, "response")),
  lwd=2, col="blue"
)

# Fiting another model and performing some basic residual analyses and tests

fit3 <- glm(Counts ~ gSoil:Species - 1, 
            family=poisson(link="identity"), data = D)

summary(fit3)

confint(fit3)

anova(fit3, fit1, test="Chisq")

plot(D$gSoil, residuals(fit3, "response"))
abline(h=0)
lines(
  lowess(x=D$gSoil, y=residuals(fit3, "response")),
  lwd=2, col="blue"
)

# Fiting another model 
fit4 <- glm(Counts ~ gSoil:Species + gSoil2 - 1, 
            family=poisson(link="identity"), data = D)

summary(fit4)

confint(fit4)

anova(fit4, fit3, test="Chisq")


summary(fit4)

# Fiting another model 
fit5 <- glm(Counts ~ gSoil + gSoil2:Species - 1, 
            family=poisson(link="identity"), data = D)

summary(fit5)

anova(fit5, fit1, test="Chisq")

rm(Nobs, Means, Variances, fit1, fit2, fit3, fit4)

################################################################################
# Question 5 - Normal two-ways
################################################################################

str(Exam2024.05)

D <- Exam2024.05

# Some exploratory analyses

table(D$Treat, D$Cultivar)

( Means <- tapply(D$NO3, list(D$Treat, D$Cultivar), mean) )


plot(D$Cultivar, D$NO3, col=D$Treat)
points(1:4, Means[1,], col=1, type="b", pch=19)
points(1:4, Means[2,], col=2, type="b", pch=19)
points(1:4, Means[3,], col=3, type="b", pch=19)

# Fitting a first model

D$Cultivar <- factor(D$Cultivar)
D$Treat <- factor(D$Treat)

fit1 <- glm (NO3 ~ Cultivar + Treat + Cultivar:Treat, 
             family=gaussian(link="identity"), data = D)

summary(fit1)

# Some residual analysis

qqnorm(residuals(fit1,type="response"))
qqline(residuals(fit1,type="response"))

shapiro.test(residuals(fit1,type="response"))

plot(fitted(fit1) , residuals(fit1,type="response"),		 
     ylab="Raw-residuals", xlab="Fitted values")
abline(h=0);

bartlett.test(residuals(fit1,type="response") ~ 
                interaction(D$Cultivar,D$Treat) )

plot(factor(fitted(fit1)) , residuals(fit1,type="response"),		 
     ylab="Raw-residuals", xlab="Fitted values")
abline(h=0);

# Fitting a second model 

fit2 <- glm (NO3 ~ Cultivar + Treat, family=gaussian(link="identity"), data = D)

summary(fit2)

anova(fit2, fit1, test="F")

# Fitting a third model 

fit3 <- glm (NO3 ~ Cultivar, family=gaussian(link="identity"), data = D)

summary(fit3)

anova(fit3, fit2, test="F")

# Fitting a fourth model 

fit4 <- glm (NO3 ~ Treat, family=gaussian(link="identity"), data = D)

summary(fit4)

anova(fit4, fit2, test="F")

confint(fit4)

fit20 <- glm (NO3 ~ Cultivar + Treat + 0 , 
              family=gaussian(link="identity"), data = D)

summary(fit20)

library(postHoc)

TT <- posthoc(Model = fit20, EffectIndices = 1:4, digits = 2)
print(TT)
barplot(TT)
lines(TT)
plot(TT)

fit20b <- glm (NO3 ~  Treat + Cultivar + 0 , 
               family=gaussian(link="identity"), data = D)

summary(fit20b)

TT <- posthoc(Model = fit20b, EffectIndices = 1:3, digits = 2)
print(TT)
barplot(TT)
lines(TT)
plot(TT)

################################################################################
# Question 6 - Normal regression
################################################################################

str(Exam2024.06)

D <- Exam2024.06

table(D$Months)

plot(D$Months, D$PPM)

plot(D$Months, log(D$PPM))

# Fitting a first model

fit0 <- glm(PPM ~ factor(Months) + 0, family=gaussian(link="log"), data = D)

summary(fit0)

# Some residual analysis

qqnorm(residuals(fit0,type="response"))
qqline(residuals(fit0,type="response"))

# Testing normality
shapiro.test(residuals(fit0,type="response"))

plot(fitted(fit0) , residuals(fit0,type="response"),		 
     ylab="Raw-residuals", xlab="Fitted values")
abline(h=0);

# Testing variance homogeneity
bartlett.test(PPM ~ factor(Months), data = D)

plot(factor(fitted(fit0)) , residuals(fit0,type="response"),		 
     ylab="Raw-residuals", xlab="Fitted values")
abline(h=0);

# Another model

fit1 <- glm(PPM ~ Months, family=gaussian(link="log"), data = D)

summary(fit1)

anova(fit0, fit1, test="F")

# Some residual analysis

qqnorm(residuals(fit1,type="response"))
qqline(residuals(fit1,type="response"))

plot(fitted(fit1) , residuals(fit1,type="response"),		 
     ylab="Raw-residuals", xlab="Fitted values")
abline(h=0);

plot(D$Months , residuals(fit1,type="response"),		 
     ylab="Raw-residuals", xlab="Months")
abline(h=0);
lines(lowess(x=D$Months, 
             y=residuals(fit1,type="response")
), 
lwd=4, col="blue"
)

cbind(coef(fit1), confint(fit1))

# OPTIONAL challenger
# Another common, but problematic solution, is to logarithmically transform
# the responses and fit a linear regression.         
# Observe the lack of adequacy of this model.

fitLog <- lm(log(PPM) ~ Months, data = D)

qqnorm(residuals(fitLog,type="response"))
qqline(residuals(fitLog,type="response"))

plot(fitted(fitLog), residuals(fitLog,type="response"))
abline(h = 0)

summary(fitLog)

# Compare ...
coef(fitLog)
coef(fit1)

plot(fitted(fit1), exp(fitted(fitLog)))
abline(0,1)

table( fitted(fit1) == exp(fitted(fitLog)) )

summary(fitted(fit1) - exp(fitted(fitLog)))

# END OF THE OPTIONAL challenger

################################################################################
#
#                            THE END
#
################################################################################