################################################################################ # # # Module 3 - Poisson models Day 5 # # # # R. Labouriau # # # # Revisions/editions: Spring 2020. # ################################################################################ ################################################################################ # # # Loading all the relevant data to be used here # # # ################################################################################ load("~/BaSALES/DataFramesStatisticalModelling.RData") library(testModelDistribution) # Setting the place where the output (graphs) will be kept # setwd("~/BaSALES/Temp/") ################################################################################ # Defining a function to draw Poisson QQ-plots # ################################################################################ qqpois <- function(x, lambda=mean(x), main=" ", qqline=T){ ox <- x[order(x)] emp <- ecdf(ox)(ox) teor <- ppois(ox, lambda=lambda) plot(teor,emp, xlab="Poisson Theoretical Quantiles", ylab="Sample Quantiles", main=main,ylim=c(0,1), cex=1.5, pch=19, col='blue') if(qqline){ lines(c(min(emp),max(emp)), c(min(emp),max(emp)), col="black", lwd=2) } } ################################################################################ # Draws related to the Poisson distribution # # # ################################################################################ # Plotting the density grid.p <- 1:40 poisson2 <- dpois(grid.p , lambda=2) poisson5 <- dpois(grid.p , lambda=5) poisson10 <- dpois(grid.p , lambda=10) poisson20 <- dpois(grid.p , lambda=20) # pdf('Figure-Ch4-04.pdf') par(mfrow=c(2,2), pty="s") plot(grid.p, poisson2, type='h', lwd=3,col="blue", xlab='',ylab='',main="2") plot(grid.p, poisson5 ,type='h', lwd=3,col="blue", xlab='',ylab='',main="5") plot(grid.p, poisson10 ,type='h', lwd=3,col="blue", xlab='',ylab='',main="10") plot(grid.p, poisson20 ,type='h', lwd=3,col="blue", xlab='',ylab='',main="20") # dev.off() # Plotting simulated data # pdf('Figure-Ch4-05.pdf') par(mfrow=c(2,2), pty="s") hist(rpois(1000, lambda=2), main='2', col='lightblue', xlab='') hist(rpois(1000, lambda=5), main='5', col='lightblue', xlab='') hist(rpois(1000, lambda=10), main='10', col='lightblue', xlab='') hist(rpois(1000, lambda=20), main='20', col='lightblue', xlab='') # dev.off() # Testing the relationship between mean and variance PoissoN <- rpois(100, lambda=0.5) # pdf('Figure-Ch4-05-b.pdf') par(mfrow=c(1,1), pty="s") plot(mean(PoissoN), var(PoissoN), cex=0.9, col='blue', pch=19, xlab='Mean', ylab='Variance', ylim=c(0,25), xlim=c(0,16)) abline(a=0, b=1) for(i in 1:100){ for(L in 1:15){ PoissoN <- rpois(100, lambda=L) points(mean(PoissoN), var(PoissoN), cex=0.9, col='blue', pch=19) } } abline(a=0, b=1, lwd=2) # dev.off() ################################################################################ # Example: # # Counts of alpha particles, Rutherford and Geiger (1910). The probability # # variations in the distribution of alpha-particles. Philosophical Magazine, # # Series 6, 20, 698-704. # ################################################################################ # Inserting the data Freq <- c(57, 203, 383, 525, 532, 408, 273, 139, 45, 27, 10, 4, 0, 1, 1) Counts <- 0:14 n.counts <- sum(Freq);n.counts plot(Counts, Freq, type='h', lwd=5, xlab='Counts', ylab='Frequency') # pdf('D:/Rodrigo/Courses/2010-Intro2StatModels/Lectures/Figure-Ch4-00A0.pdf') plot(Counts, Freq, type='h', lwd=5, xlab='Counts', ylab='Frequency') # dev.off() lambda <- sum(Freq*Counts)/sum(Freq); lambda # 3.871549 Expected <- dpois(Counts, lambda=lambda) * n.counts points(Counts+0.13, Expected , type='h', lwd=5, col='blue') legend(9,200, legend=c('Observed', 'Expected'), col=c('black','blue'), lwd=4, bty='n') # Reconstructing the individual counts from the frequency of counts indvidual.counts <- numeric(n.counts) observation <- 1 for(i in 1:length(Freq)){ count <- Counts[i] for(j in 1:Freq[i]){ indvidual.counts[observation] <- count observation <- observation + 1 } } mean(indvidual.counts) # 3.877778 var(indvidual.counts) # 3.743967 # Drawing the QQ-plot qqpois (indvidual.counts) # Verifying what happens if a wrong lambda is used lambda qqpois(indvidual.counts, lambda=2.8) qqpois(indvidual.counts, lambda=4.8) # Outputing the graph # pdf('Figure-Ch4-00C.pdf') par(mfrow=c(1,2), pty='s') qqpois(indvidual.counts, lambda=2.8, main='Lambda = 2.8') qqpois(indvidual.counts, lambda=4.8, main='Lambda = 4.8') # dev.off() par(mfrow=c(1,1), pty='s') ################################################################################ M <- glm(indvidual.counts ~ 1, family = poisson(link = "identity")) summary(M) TT <- testPoissonModel(M) summary(TT) summary(TT, Nrows = 20) ( Pvalue <- TT$Pvalue ) pdf('Figure-Ch4-05Z-Rutherford.pdf', width = 20) par(mfrow = c(1,2)) plot(TT, Main = "Original Data", Sub = paste("P-value for adjustment to a Poisson distribution = ", Pvalue), Nsim = 10000 ) qqpois(indvidual.counts) dev.off() par(mfrow = c(1,1)) # Generating a figure for analysing the Rutherford-Geiger example pdf('Figure-Ch4-05Z-Simulated.pdf', width = 20) par(mfrow = c(1,2)) Y <- rpois(n = length(indvidual.counts) ,lambda = lambda) M <- glm(Y ~ 1, family = poisson(link = "identity")) TT <- testPoissonModel(M) ( Pvalue <- TT$Pvalue ) plot(TT, Main = "Simulated", Sub = paste("P-value for adjustment to a Poisson distribution = ", Pvalue), Nsim = 10000 ) # qqpois(indvidual.counts) dev.off() par(mfrow = c(1,1)) ################################################################################ # Poisson regression model # # Soil fungi # ################################################################################ # Reading and summarizing the data attach(data.fungi) str(data.fungi) # pdf('Figure-Ch4-11.pdf') par(mfrow=c(1,1), pty='s') plot(gsoil, Penicillium, xlab='Amount of soil (g)', ylab='CFU (Penicillium)', pch=19, col='yellow', cex=1.8 ) # dev.off() # Fitting a free-curve model free.curve <- glm(Penicillium ~ factor(gsoil) , family=poisson(link="log") ) ( Values <- as.numeric(levels(factor(Penicillium))) ) table(Penicillium) GG <- testPoissonModel(Model = free.curve, SupportIndex = Values) summary(GG) # Testing homogeneity deviance(free.curve) length(gsoil) # 20 observations and 4 parameters in the free curve model pchisq(deviance(free.curve), df=16, lower.tail=F) # Fitting a linear model through the origin linear0 <- glm(Penicillium ~ 0 + gsoil , family=poisson(link="identity") ) coef(linear0) # Fitting a linear model (not necessarily through the origin) linear <- glm(Penicillium ~ gsoil , family=poisson(link="identity") ) coef(linear) coef(linear0) # Plotting to get an idea of what is happening # pdf('Figure-Ch4-12.pdf') par(mfrow=c(1,1), pty='s') plot(gsoil, Penicillium, xlab='Amount of soil (g)', ylab='CFU (Penicillium)', pch=19, col='yellow', cex=1.8, xlim=c(0.000025, max(gsoil)), ylim=c(0.00005,max(Penicillium)) ) abline(a=0, b=coef(linear0), col='blue', lwd=3) abline(a=coef(linear[1]), b=coef(linear[2]), col='red', lwd=3) abline(h=0) # dev.off() # Testing linearity anova(linear, free.curve, test="Chisq") # Testing whether the regression line crosses the origin anova(linear0, linear, test="Chisq") # Fitting a parabolic regression gsoil2 <- gsoil*gsoil parabola <- glm(Penicillium ~ 0 + gsoil + gsoil2 , family=poisson(link="identity") ) anova(linear0, parabola, test="Chisq") coef(parabola) coef(linear0) # Plotting the models # pdf('Figure-Ch4-14.pdf') plot(gsoil, Penicillium, xlab='Amount of soil (g)', ylab='CFU (Penicillium)', pch=19, col='yellow', cex=2.5, xlim=c(0, max(gsoil)), ylim=c(0,max(Penicillium)) ) grid.p <- seq(from=0, to=max(gsoil), by=0.000001) linear0.p <- grid.p*coef(linear0) points(grid.p, linear0.p, type='l', lwd=3, col='blue') expected <- grid.p*coef(parabola)[1] + grid.p*grid.p*coef(parabola)[2] points(grid.p, expected, type='l', lwd=3, col='red') linear.term <- grid.p*coef(parabola)[1] points(grid.p, linear.term, type='l', lwd=3, lty=2, col='red') abline(h=0); abline(v=0) legend(2.5E-04,20, legend=c("Plating model", "Parabolic model", "Linear term of the parabolic model"), col=c("blue","red","red"), lty=c(1,1,2), lwd=3, bty="n" ) free.curve <- glm(Penicillium ~ factor(gsoil) , family=poisson(link="log") ) parabola <- glm(Penicillium ~ 0 + gsoil + gsoil2 , family=poisson(link="identity") ) linear <- glm(Penicillium ~ gsoil , family=poisson(link="identity") ) anova(parabola, free.curve, test="Chisq") anova(linear, free.curve, test="Chisq") annova(linear0, parabola, test="Chisq") # ---------------------------------------------------------------------------- # # Summing up # # ---------------------------------------------------------------------------- # free.curve <- glm(Penicillium ~ factor(gsoil) , family=poisson(link="identity") ) # Or equivalently: # free.curve <- glm(Penicillium ~ factor(gsoil) , family=poisson(link="log") ) deviance(free.curve) gsoil2 <- gsoil*gsoil parabola <- glm(Penicillium ~ 0 + gsoil + gsoil2, family=poisson(link="identity") ) deviance(parabola) linear <- glm(Penicillium ~ gsoil , family=poisson(link="identity") ) deviance(linear) linear0 <- glm(Penicillium ~ 0 + gsoil , family=poisson(link="identity") ) deviance(linear0) detach(data.fungi) # ---------------------------------------------------------------------------- # # Repeating the analysis using the extended data # # ---------------------------------------------------------------------------- # attach(data.fungi.Extended) gsoil2 <- gsoil*gsoil free.curve <- glm(Penicillium ~ factor(gsoil) , family=poisson(link="log") ) parabola <- glm(Penicillium ~ 0 + gsoil + gsoil2 , family=poisson(link="identity")) linear <- glm(Penicillium ~ gsoil , family=poisson(link="identity") ) linear0 <- glm(Penicillium ~ 0 + gsoil , family=poisson(link="identity") ) deviance(free.curve) pchisq(deviance(free.curve), df=24, lower.tail=F) deviance(parabola) anova(parabola, free.curve, test="Chisq") deviance(linear) anova(linear, free.curve, test="Chisq") deviance(linear0) anova(linear0, parabola, test="Chisq") detach(data.fungi.Extended) # ---------------------------------------------------------------------------- # # Modelling the competition inside the Petri dish # # ---------------------------------------------------------------------------- # # pdf('Figure-Ch4-15.pdf') plot(Penicillium, other, ylab="Other fungi", xlab="Penicillium", cex=1.5 ,col="yellow", pch=19, xlim=c(0, max(Penicillium)), ylim=c(0,max(other)) ) cc <- lm(other ~ 0 + Penicillium) abline(a=cc[1], b=cc[2], lwd=2, col="red") # dev.off() # Fitting a bynomial model for the probability of being a Penicillium CFU resp <- cbind(Penicillium, other) binom <- glm(resp ~ factor(gsoil), family=binomial) Null <- glm(resp ~ 1, family=binomial) anova(Null, binom, test="Chisq") summary(binom) # The proportion of Penicillium does not increase with the ammount of soil # added. The competition is intra-specific. # Fitting a Poisson model with the total number of other fungi as an # explanatory variable gsoil2 <- gsoil*gsoil compet.corr <- glm(Penicillium ~ 0 + other + gsoil + gsoil2 , family=poisson(link="identity") ) anova(compet.corr, test="Chisq") anova(parabola, compet.corr, test="Chisq") summary(compet.corr) no.compet <- glm(Penicillium ~ 0 + other + gsoil , family=poisson(link="identity") ) anova(no.compet, compet.corr, test="Chisq") detach(data.fungi) # ---------------------------------------------------------------------------- # # # # THE END! # # # # ---------------------------------------------------------------------------- #