################################################################################
#
# Solution to exercise 4.2
#
# Last revised: Spring 2019
#
# Copyright © 2018 by Rodrigo Labouriau
################################################################################

load("DataFramesStatisticalModelling.RData")

str(Ex4.2)

D <- Ex4.2

str(D)

Nobs <- length(D$n.weeds)

################################################################################
# a - Plot the numbers of weed plants for each observation (vertical axis)
#     against the doses (horizontal axis).
################################################################################

plot(D$Nitr+runif(n=100,min=0, max=5), D$n.weeds, xlab="Nitrogen", ylab="N.weeds")

# This calculates the sample mean of n.weeds for each value of Nitr.
(  Means <- tapply(D$n.weeds, D$Nitr  ,mean)  )

points(c(0,50,100,150,200), Means, pch=19, col="red", type="b")

################################################################################
# b - Estimate the mean and the variance for each combination of the doses
# used and plot the variances against the means to verify whether the
# variance is approximately equal to the mean (as expected from a
#   Poisson model)
################################################################################

# This calculates the sample mean of n.weeds for each value of Nitr.
(  Means     <- tapply(D$n.weeds, D$Nitr  ,mean)  )

# This calculates the sample variances of n.weeds for each value of Nitr.
(  Variances <- tapply(D$n.weeds, D$Nitr  ,var)  )

rbind(Means, Variances)

plot( Means, Variances)
abline(0,1)

################################################################################
# c - Fit a one-way Poisson model (with Nitr entering as a factor) and
# compare that model with a model containing a fixed intensity
# parameter (i.e. containing only the intercept)
################################################################################

# Here a fit a Poisson one-way classification model
oneWay <- glm(n.weeds~factor(Nitr)+0,family=poisson(link="identity"), data=D)
# Note that the model assumes one intensity parameter (expectation) for each
# level of nitrogen used
summary(oneWay)

# Here a fit a Poisson model assuming the same expectation for each observation
constant <- glm(n.weeds~1 ,family=poisson(link="identity"), data=D)
summary(constant)

# Here I use a likelihood ratio test to compare the models 
anova(constant, oneWay, test="Chisq")

# What we conclude? The expected numbers of weeds per nitrogen level are not
# all the same!

################################################################################
# d - Fit a Poisson regression model (with Nitr entering as a continuous
# explanatory variable) and compare that model with the previous
# model. Use this comparison to discuss the linearity of the response
# to the dose (at least in this range)
################################################################################

# Here I fit a Poisson linear regression model (assumes that the expected
# number of weeds increase linearly with the amount of nitrogen supplied)
linear<-glm(n.weeds~Nitr,family=poisson(link="identity"), data=D)
summary(linear)
anova(linear,oneWay,test="Chisq")

################################################################################
# Additional topic (optional)
# Below I will make some basic model control and some simulation ...
################################################################################

# Testing homogeneity

deviance(oneWay)
Nobs <- length(D$n.weeds)      # The number of observations 
Npar <- length(coef(oneWay))   # The number of parameters in the oneWay model
pchisq(deviance(oneWay),df=Nobs-Npar,lower.tail = FALSE)
# Conclusion: no reasons to suspect of lack of homogeneity!

# Examining the residuals

# Here I calculate a vector with what is predicted (the fitted values) by
# the model for each observation
Fitted <- fitted(oneWay)

# Here I calculate the raw residuals, i.e. the difference between what was
# observed and what is predicted by the model.
Raw.Residuals <- residuals(oneWay, "response")

# We should see a pattern of an increasing dispersion of the residuals as the
# fitted values increase (a thrombone form) in the graph below.
# This is due to the fact that the variance inreases with the means,
# if the data is Poisson distributed ..
plot(Fitted, Raw.Residuals)

# The Pearson residuals calculated below (i.e. the raw residuals divided by
# square root of the variance that we would have if he data were Poisson
# distributed)
Pearson.Residuals <- residuals(oneWay, "pearson")

# If the data is Poisson distributed, then the pattern of increasing dispersion
# observed in the previous plot should disapear ... and it does disapear ...
plot(Fitted, Pearson.Residuals)

################################################################################

# The points in the plot displayed in item b above (estimates of the variance
# against estimates of the means) should be disposed in, approximately, a 
# straight line, if the data is Poisson distributed (under the Poisson 
# distribution the variance is equal to the mean).
# Now, the question is whether the plot displayed there is convincing.
# The points are not exactly disposed along a straight line (crossing the 
# origin) ... but, of course, a certain deviation from the ideal straigt line
# should be expected due to the effect of random variation. 
# I will simulate the data below and check how much we could expect these
# deviations to be in a perfect scenario, i.e. in a situation where we know
# that the data is Poisson distributed (because we simulate so)

# Here I simulate the counts of number of weeds following a Poisson
# distribution with expectations given by the estimated expectations
# for each level of nitrogen (estimated using the model oneWay). 
# The same numbers of replicates are used, i.e. we simulate a result
# that follow the same probability law as the probability law assumed
# in the model oneWay defined above. I keep these simulated counts
# in the vector Simulated.n.weeds in the data-frame D.
D$Simulated.n.weeds <- simulate(oneWay)[[1]]

D$Simulated.n.weeds

# Next, I estimate the means and the variances for the SIMULATED counts
# and display a plot of those variances against those means ...
(  Simulated.Means     <- tapply(D$Simulated.n.weeds, D$Nitr  ,mean) )
(  Simulated.Variances <- tapply(D$Simulated.n.weeds, D$Nitr  ,var)  )
plot( Simulated.Means, Simulated.Variances)
abline(0,1)

# Here I will produce several simulations and display the plots of the 
# variances against the means for different simulations toghether, so
# you can compare ...

par(mfrow=c(3,3))
plot(Means, Variances, main="Original Data",
  pch=19, col="red")
abline(0,1)
# I plot below hte results of 8 simulations
for(i in 1:8){
  D$Simulated.n.weeds <- simulate(oneWay)[[1]]
  Simulated.Means     <- tapply(D$Simulated.n.weeds, D$Nitr  ,mean)
  Simulated.Variances <- tapply(D$Simulated.n.weeds, D$Nitr  ,var)
  plot(Simulated.Means, Simulated.Variances, main="Simulated Data")
  abline(0,1)
  
}

par(mfrow=c(1,1))

################################ THE END #######################################