################################################################################
#      
#
# Tutorial-14                                                                  
#                                                                              
# The Poisson distribution and the law of rare events -                        
#                                                                              
# R. Labouriau                                                                 
# Last revision: Spring 2020
#
# Copyright © 2018 by Rodrigo Labouriau
################################################################################

# We will play here with the Poisson law of rare events, a classic result from
# the probability theory which essentially says that the distribution of the 
# number of occurrences of a rare event (i.e. an event with low probability)
# when observed many times is approximately Poisson distributed.
#
# We start by introducing some tools to simulate and reconize a 
# Poisson distribution.

# We define below a function that draws a qq-plot for the Poisson distribution,
# that is, a plot of the theoretical quantiles that one would get from a Poisson
# distributions against the observed sample quantiles. If the distribution of a 
# vector of observations is (approximately or exactly) Poisson distributed, then
# qq-plot must be linear; departures from the Poisson distribution would generate
# non-linear plots.
# 
qqpois <- function(x, lambda=mean(x), main=" ", 
                   qqline=T, qqlineAll=T, Local=F, cex=1.5, pch=19){
  ox <- x[order(x)]
  emp <- ecdf(ox)(ox)
  teor <- ppois(ox, lambda=lambda)
  if(Local){
    plot(teor,emp, xlab="Poisson Theoretical Quantiles", ylab="Sample Quantiles",
         main=main,ylim=c(min(emp),max(emp)), xlim=c(min(teor),max(teor)), cex=cex, pch=pch, col='blue')
  }
  else{
    plot(teor,emp, xlab="Poisson Theoretical Quantiles", ylab="Sample Quantiles",
         main=main,ylim=c(0,1), xlim=c(0,1), cex=cex, pch=pch, col='blue')    
  }
  if(qqline){
    lines(c(min(emp),max(emp)), c(min(emp),max(emp)), col="black", lwd=2)
  }
  if(qqlineAll){
    abline(0,1, col="red")
  }
}

# We define below a function that draws a histogram of the (vector of) data and
# supperposes a (red) vertical line besides the bar of the histogram
# with the height equal to the theoretically expected numbers of observations 
# with that many counts for a sample os a Poisson distribution with a given
# intensity parameter lambda and the same number of observations. 
# If not specified, the lambda is estimated by the sample mean (i.e. the mle).

hpois <- function(x, lambda=mean(x), main=" ", xlab=" "){
  br <- seq(from=min(x)-1, to=max(x)+1, by=0.5) 
  hist(x, breaks=br, freq=T,main=main, ylab="", xlab=xlab, col="lightblue", xlim=c(min(x)-1,max(x)+1))  
  points((min(x):(max(x)+1))+0.2, dpois(min(x):(max(x)+1), lambda=lambda)*length(x), type="h", lwd=5, col="red")
}

# Generating a (pseudo) random variable Poisson distributed (with expectation 10)

Y <- rpois(1000, lambda=4)

# Drawing a Poisson Q-Q plot
qqpois(Y)

# Drawing a normal Q-Q plot for a simulated Poisson r.v.
qqnorm(Y); qqline(Y)

# Drawing several Poisson Q-Q plots (with different assumed expectations)
par(mfrow=c(2,2))
qqpois(Y, lambda=1, main="Lambda=1")
qqpois(Y, lambda=4, main="Lambda=4")
qqpois(Y, main="Estimated lambda")
qqpois(Y, lambda=15, main="Lambda=15")
par(mfrow=c(1,1))

# Drawing several histograms and probability functions
# (with different assumed expectations)
par(mfrow=c(2,2))
hpois(Y, lambda=1, main="Lambda=1")
hpois(Y, lambda=4, main="Lambda=4")
hpois(Y, main="Estimated lambda")
hpois(Y, lambda=15, main="Lambda=15")
par(mfrow=c(1,1))

################################################################################
#
# Testing for a Poisson distribution:
#  Here I present a function that performs a test for adjustment for a Poisson
#  distribution.
# The idea behind the test is to calculate the frequency of the observed counts
# and compare it with the frequency that one might theoretically expect 
# from a Poisson distributed random variable.
# A Pearson chi-square test of goodness of fit is then applied.
################################################################################

testPois <- function(x, lambda=mean(x), returnList=F, YatesCorrection=T){
  br <- seq(from=min(x)-1, to=max(x), by=1)
  Oo <-  hist(x, breaks=br, freq=T, xlim=c(min(x)-1,max(x)+1), plot=F, warn.unused=F)
  Obs <- Oo$counts 
  ObsRange <- min(x):max(x)
  Exp <- dpois( ObsRange, lambda=lambda)*length(x) 
  if (YatesCorrection) {Desv <- (abs(Exp - Obs)-0.5)^2/ Exp }
   else                {Desv <- (Exp - Obs)^2/ Exp }
  Chisq <- sum(Desv)
  df <- length(ObsRange)-1
  pvalue <- pchisq(Chisq, df=df, lower.tail=F)
  Results <- list(pvalue, df, Chisq)
  if (returnList) {return(Results)}
  else{return(pvalue)}
} 

# Here I simulate some Poisson distributed observations and make some tests

x <- rpois(100,lambda=5)
mean(x); var(x)
hpois(x)
qqpois(x)
testPois(x)

qqnorm(x)
qqline(x)

set.seed(123)
x <- rpois(1000,lambda=3)
mean(x); var(x)
hpois(x)
qqpois(x)
testPois(x)

# Here I study the adecquacy of a Poisson distribution in a simulated data and then
# I introduce an outlier and test again (we should then reject the hypothesis of 
# adequacy of the Poisson distribution)
x <- rpois(100,lambda=5)
mean(x); var(x)
hpois(x)
qqpois(x)
testPois(x)

# Inserting the outlier and testing again
x[100] <- 20
mean(x); var(x)
hpois(x)
qqpois(x)
testPois(x)

# Next, we check whether the test is working in simulated data

Ntests <- 1000
SampleSize <- 1000
Pvalues <- numeric(Ntests)
for (i in 1:Ntests){
  x <- rpois(SampleSize,lambda=5)
  Pvalues[i] <- testPois(x)
}

# Displaying basic descritive measures of the simulated p-values
summary(Pvalues)

# Calculating the proportion of rejection when working with a 
# significance level of 1%
mean(Pvalues<0.01)

# Repeating the same procedure with a smaller sample size
Ntests <- 1000
SampleSize <- 100
Pvalues <- numeric(Ntests)
for (i in 1:Ntests){
  x <- rpois(SampleSize,lambda=5)
  Pvalues[i] <- testPois(x)
}

summary(Pvalues)
mean(Pvalues<0.01)

# Repeating the same procedure with an even smaller sample size
Ntests <- 1000
SampleSize <- 10
Pvalues <- numeric(Ntests)
for (i in 1:Ntests){
  x <- rpois(SampleSize,lambda=5)
  Pvalues[i] <- testPois(x)
}

summary(Pvalues)
mean(Pvalues<0.01)

# Now, I vary the sample size systematically to see the pattern

Grid <- seq(from=5, to=1000, by=20)
RejectionProportion <- numeric(length(Grid))
for (j in 1:length(Grid)){
  Ntests <- 1000
  SampleSize <- Grid[j]
  Pvalues <- numeric(Ntests)
  for (i in 1:Ntests){
    x <- rpois(SampleSize,lambda=10)
    Pvalues[i] <- testPois(x)
  } 
  RejectionProportion[j] <- mean(Pvalues<0.05) 
}

plot(Grid, RejectionProportion)                         

################
# Now I we study the power the probability of rejection when an outlier is 
# artificially introduced
################

SampleSize <- 500
lambda <- 9
x <- rpois(SampleSize,lambda=lambda)
testPois(x)
# Inserting an outlier one standard deviation form the expected value
x[1] <- 12
testPois(x)
# Inserting an outlier two standard deviations form the expected value
x[1] <- 12
testPois(x)
hpois(x)
qqpois(x)
# Inserting an outlier two standard deviations form the expected value in 10% of the data
NObsOut <- ceiling(SampleSize/10); NObsOut 
x[1:NObsOut] <- 15
hpois(x)
qqpois(x)
testPois(x)

# Making several of those tests
Ntests <- 1000
SampleSize <- 500
Pvalues <- numeric(Ntests)
for (i in 1:Ntests){
  x <- rpois(SampleSize,lambda=lambda)
  NObsOut <- ceiling(SampleSize/10)
  x[1:NObsOut] <- 15
  Pvalues[i] <- testPois(x)
}

summary(Pvalues)
mean(Pvalues>0.01)

# Reducing drastically the sample size
Ntests <- 1000
SampleSize <- 10
Pvalues <- numeric(Ntests)
for (i in 1:Ntests){
  x <- rpois(SampleSize,lambda=lambda)
  NObsOut <- ceiling(SampleSize/10)
  x[1:NObsOut] <- 15
  Pvalues[i] <- testPois(x)
}

summary(Pvalues)
mean(Pvalues>0.01)

# Repeating the procedure to several sample sizes

Grid <- seq(from=10, to=200, by=10)
RejectionProportion <- numeric(length(Grid))
Ntests <- 10000
for (j in 1:length(Grid)){
  SampleSize <- Grid[j]
  Pvalues <- numeric(Ntests)
  for (i in 1:Ntests){
    x <- rpois(SampleSize,lambda=lambda)
    NObsOut <- ceiling(SampleSize/10)
    x[1:NObsOut] <- 15
    Pvalues[i] <- testPois(x)
  }
  RejectionProportion[j] <- mean(Pvalues<0.05) 
}

plot(Grid, RejectionProportion, xlab="Sample Size", 
     ylab="Proportion of Rejection", type="b")


################################################################################
#  Study of the law of rare numbers:
#  The law of rare numbers essentially says that a binomial distribution is 
#  well approximated by a Poisson distribution when the probability of success
#  p is small and the number of trials (also called size, i.e. the number of 
#  binary trials) is large. Formally the law of rare numbers says that when 
#  p decreases and the size increases in such a way that p*size remains 
#  constant, the probability function of the corresponding binomial 
#  distribution approaches the probability function of a Poisson distribution. 
#  In fact, this Poisson distribution has intensity parameter equal to the 
#  product p*size.
# 
#  In order to demonstrate that we simulate binomial distributed random 
#  variables, say 1000 observations, with varying p and size. We will start 
#  with p=1/2, which is a relatively high values and the size 5, i.e. the 
#  binomial distribution would describe the number of sucesses of 5 binary 
#  trials (say, through a coin 5 times an count the number of heads) each with 
#  probability 1/2 of success. Next, we decrease p by dividing it by 2 and 
#  increase the size by multiplying it by 2, so that the product of p and 
#  the size is constant.
################################################################################

Size <- 5
Prob <- 0.5
par(mfrow=c(1,2))
X <- rbinom(10000, size=Size, prob=Prob)
hpois(X, main=paste("B(",Size, ",", Prob,")"))
qqpois(X)

par(mfrow=c(1,2))
Y <- rpois(10000, lambda=Size*Prob)
hpois(Y, main=paste("Po(lambda=",Size,"*", Prob,")"))
qqpois(Y)

############
Size <- 5*2
Prob <- 0.5/2
par(mfrow=c(1,2))
X <- rbinom(10000, size=Size, prob=Prob)
hpois(X, main=paste("B(",Size, ",", Prob,")"))
qqpois(X)

par(mfrow=c(1,2))
Y <- rpois(10000, lambda=Size*Prob)
hpois(Y, main=paste("Po(lambda=",Size,"*", Prob,")"))
qqpois(Y)

############
Size <- 5*4
Prob <- 0.5/4
par(mfrow=c(1,2))
X <- rbinom(10000, size=Size, prob=Prob)
hpois(X, main=paste("B(",Size, ",", Prob,")"))
qqpois(X)

par(mfrow=c(1,2))
Y <- rpois(10000, lambda=Size*Prob)
hpois(Y, main=paste("Po(lambda=",Size,"*", Prob,")"))
qqpois(Y)

############
Size <- 5*8
Prob <- 0.5/8
par(mfrow=c(1,2))
X <- rbinom(10000, size=Size, prob=Prob)
hpois(X, main=paste("B(",Size, ",", Prob,")"))
qqpois(X)

par(mfrow=c(1,2))
Y <- rpois(10000, lambda=Size*Prob)
hpois(Y, main=paste("Po(lambda=",Size,"*", Prob,")"))
qqpois(Y)

############
Size <- 5*16
Prob <- 0.5/16
par(mfrow=c(1,2))
X <- rbinom(10000, size=Size, prob=Prob)
hpois(X, main=paste("B(",Size, ",", Prob,")"))
qqpois(X)

par(mfrow=c(1,2))
Y <- rpois(10000, lambda=Size*Prob)
hpois(Y, main=paste("Po(lambda=",Size,"*", Prob,")"))
qqpois(Y)

################################
# Cleaning up.
rm(X,Y)

#                              THE END!