################################################################################ # # # Tutorial-14 # # The Poisson distribution and the law of rare events - # # R. Labouriau # Last revision: Spring 2020 # # Copyright © 2018 by Rodrigo Labouriau ################################################################################ # We will play here with the Poisson law of rare events, a classic result from # the probability theory which essentially says that the distribution of the # number of occurrences of a rare event (i.e. an event with low probability) # when observed many times is approximately Poisson distributed. # # We start by introducing some tools to simulate and reconize a # Poisson distribution. # We define below a function that draws a qq-plot for the Poisson distribution, # that is, a plot of the theoretical quantiles that one would get from a Poisson # distributions against the observed sample quantiles. If the distribution of a # vector of observations is (approximately or exactly) Poisson distributed, then # qq-plot must be linear; departures from the Poisson distribution would generate # non-linear plots. # qqpois <- function(x, lambda=mean(x), main=" ", qqline=T, qqlineAll=T, Local=F, cex=1.5, pch=19){ ox <- x[order(x)] emp <- ecdf(ox)(ox) teor <- ppois(ox, lambda=lambda) if(Local){ plot(teor,emp, xlab="Poisson Theoretical Quantiles", ylab="Sample Quantiles", main=main,ylim=c(min(emp),max(emp)), xlim=c(min(teor),max(teor)), cex=cex, pch=pch, col='blue') } else{ plot(teor,emp, xlab="Poisson Theoretical Quantiles", ylab="Sample Quantiles", main=main,ylim=c(0,1), xlim=c(0,1), cex=cex, pch=pch, col='blue') } if(qqline){ lines(c(min(emp),max(emp)), c(min(emp),max(emp)), col="black", lwd=2) } if(qqlineAll){ abline(0,1, col="red") } } # We define below a function that draws a histogram of the (vector of) data and # supperposes a (red) vertical line besides the bar of the histogram # with the height equal to the theoretically expected numbers of observations # with that many counts for a sample os a Poisson distribution with a given # intensity parameter lambda and the same number of observations. # If not specified, the lambda is estimated by the sample mean (i.e. the mle). hpois <- function(x, lambda=mean(x), main=" ", xlab=" "){ br <- seq(from=min(x)-1, to=max(x)+1, by=0.5) hist(x, breaks=br, freq=T,main=main, ylab="", xlab=xlab, col="lightblue", xlim=c(min(x)-1,max(x)+1)) points((min(x):(max(x)+1))+0.2, dpois(min(x):(max(x)+1), lambda=lambda)*length(x), type="h", lwd=5, col="red") } # Generating a (pseudo) random variable Poisson distributed (with expectation 10) Y <- rpois(1000, lambda=4) # Drawing a Poisson Q-Q plot qqpois(Y) # Drawing a normal Q-Q plot for a simulated Poisson r.v. qqnorm(Y); qqline(Y) # Drawing several Poisson Q-Q plots (with different assumed expectations) par(mfrow=c(2,2)) qqpois(Y, lambda=1, main="Lambda=1") qqpois(Y, lambda=4, main="Lambda=4") qqpois(Y, main="Estimated lambda") qqpois(Y, lambda=15, main="Lambda=15") par(mfrow=c(1,1)) # Drawing several histograms and probability functions # (with different assumed expectations) par(mfrow=c(2,2)) hpois(Y, lambda=1, main="Lambda=1") hpois(Y, lambda=4, main="Lambda=4") hpois(Y, main="Estimated lambda") hpois(Y, lambda=15, main="Lambda=15") par(mfrow=c(1,1)) ################################################################################ # # Testing for a Poisson distribution: # Here I present a function that performs a test for adjustment for a Poisson # distribution. # The idea behind the test is to calculate the frequency of the observed counts # and compare it with the frequency that one might theoretically expect # from a Poisson distributed random variable. # A Pearson chi-square test of goodness of fit is then applied. ################################################################################ testPois <- function(x, lambda=mean(x), returnList=F, YatesCorrection=T){ br <- seq(from=min(x)-1, to=max(x), by=1) Oo <- hist(x, breaks=br, freq=T, xlim=c(min(x)-1,max(x)+1), plot=F, warn.unused=F) Obs <- Oo$counts ObsRange <- min(x):max(x) Exp <- dpois( ObsRange, lambda=lambda)*length(x) if (YatesCorrection) {Desv <- (abs(Exp - Obs)-0.5)^2/ Exp } else {Desv <- (Exp - Obs)^2/ Exp } Chisq <- sum(Desv) df <- length(ObsRange)-1 pvalue <- pchisq(Chisq, df=df, lower.tail=F) Results <- list(pvalue, df, Chisq) if (returnList) {return(Results)} else{return(pvalue)} } # Here I simulate some Poisson distributed observations and make some tests x <- rpois(100,lambda=5) mean(x); var(x) hpois(x) qqpois(x) testPois(x) qqnorm(x) qqline(x) set.seed(123) x <- rpois(1000,lambda=3) mean(x); var(x) hpois(x) qqpois(x) testPois(x) # Here I study the adecquacy of a Poisson distribution in a simulated data and then # I introduce an outlier and test again (we should then reject the hypothesis of # adequacy of the Poisson distribution) x <- rpois(100,lambda=5) mean(x); var(x) hpois(x) qqpois(x) testPois(x) # Inserting the outlier and testing again x[100] <- 20 mean(x); var(x) hpois(x) qqpois(x) testPois(x) # Next, we check whether the test is working in simulated data Ntests <- 1000 SampleSize <- 1000 Pvalues <- numeric(Ntests) for (i in 1:Ntests){ x <- rpois(SampleSize,lambda=5) Pvalues[i] <- testPois(x) } # Displaying basic descritive measures of the simulated p-values summary(Pvalues) # Calculating the proportion of rejection when working with a # significance level of 1% mean(Pvalues<0.01) # Repeating the same procedure with a smaller sample size Ntests <- 1000 SampleSize <- 100 Pvalues <- numeric(Ntests) for (i in 1:Ntests){ x <- rpois(SampleSize,lambda=5) Pvalues[i] <- testPois(x) } summary(Pvalues) mean(Pvalues<0.01) # Repeating the same procedure with an even smaller sample size Ntests <- 1000 SampleSize <- 10 Pvalues <- numeric(Ntests) for (i in 1:Ntests){ x <- rpois(SampleSize,lambda=5) Pvalues[i] <- testPois(x) } summary(Pvalues) mean(Pvalues<0.01) # Now, I vary the sample size systematically to see the pattern Grid <- seq(from=5, to=1000, by=20) RejectionProportion <- numeric(length(Grid)) for (j in 1:length(Grid)){ Ntests <- 1000 SampleSize <- Grid[j] Pvalues <- numeric(Ntests) for (i in 1:Ntests){ x <- rpois(SampleSize,lambda=10) Pvalues[i] <- testPois(x) } RejectionProportion[j] <- mean(Pvalues<0.05) } plot(Grid, RejectionProportion) ################ # Now I we study the power the probability of rejection when an outlier is # artificially introduced ################ SampleSize <- 500 lambda <- 9 x <- rpois(SampleSize,lambda=lambda) testPois(x) # Inserting an outlier one standard deviation form the expected value x[1] <- 12 testPois(x) # Inserting an outlier two standard deviations form the expected value x[1] <- 12 testPois(x) hpois(x) qqpois(x) # Inserting an outlier two standard deviations form the expected value in 10% of the data NObsOut <- ceiling(SampleSize/10); NObsOut x[1:NObsOut] <- 15 hpois(x) qqpois(x) testPois(x) # Making several of those tests Ntests <- 1000 SampleSize <- 500 Pvalues <- numeric(Ntests) for (i in 1:Ntests){ x <- rpois(SampleSize,lambda=lambda) NObsOut <- ceiling(SampleSize/10) x[1:NObsOut] <- 15 Pvalues[i] <- testPois(x) } summary(Pvalues) mean(Pvalues>0.01) # Reducing drastically the sample size Ntests <- 1000 SampleSize <- 10 Pvalues <- numeric(Ntests) for (i in 1:Ntests){ x <- rpois(SampleSize,lambda=lambda) NObsOut <- ceiling(SampleSize/10) x[1:NObsOut] <- 15 Pvalues[i] <- testPois(x) } summary(Pvalues) mean(Pvalues>0.01) # Repeating the procedure to several sample sizes Grid <- seq(from=10, to=200, by=10) RejectionProportion <- numeric(length(Grid)) Ntests <- 10000 for (j in 1:length(Grid)){ SampleSize <- Grid[j] Pvalues <- numeric(Ntests) for (i in 1:Ntests){ x <- rpois(SampleSize,lambda=lambda) NObsOut <- ceiling(SampleSize/10) x[1:NObsOut] <- 15 Pvalues[i] <- testPois(x) } RejectionProportion[j] <- mean(Pvalues<0.05) } plot(Grid, RejectionProportion, xlab="Sample Size", ylab="Proportion of Rejection", type="b") ################################################################################ # Study of the law of rare numbers: # The law of rare numbers essentially says that a binomial distribution is # well approximated by a Poisson distribution when the probability of success # p is small and the number of trials (also called size, i.e. the number of # binary trials) is large. Formally the law of rare numbers says that when # p decreases and the size increases in such a way that p*size remains # constant, the probability function of the corresponding binomial # distribution approaches the probability function of a Poisson distribution. # In fact, this Poisson distribution has intensity parameter equal to the # product p*size. # # In order to demonstrate that we simulate binomial distributed random # variables, say 1000 observations, with varying p and size. We will start # with p=1/2, which is a relatively high values and the size 5, i.e. the # binomial distribution would describe the number of sucesses of 5 binary # trials (say, through a coin 5 times an count the number of heads) each with # probability 1/2 of success. Next, we decrease p by dividing it by 2 and # increase the size by multiplying it by 2, so that the product of p and # the size is constant. ################################################################################ Size <- 5 Prob <- 0.5 par(mfrow=c(1,2)) X <- rbinom(10000, size=Size, prob=Prob) hpois(X, main=paste("B(",Size, ",", Prob,")")) qqpois(X) par(mfrow=c(1,2)) Y <- rpois(10000, lambda=Size*Prob) hpois(Y, main=paste("Po(lambda=",Size,"*", Prob,")")) qqpois(Y) ############ Size <- 5*2 Prob <- 0.5/2 par(mfrow=c(1,2)) X <- rbinom(10000, size=Size, prob=Prob) hpois(X, main=paste("B(",Size, ",", Prob,")")) qqpois(X) par(mfrow=c(1,2)) Y <- rpois(10000, lambda=Size*Prob) hpois(Y, main=paste("Po(lambda=",Size,"*", Prob,")")) qqpois(Y) ############ Size <- 5*4 Prob <- 0.5/4 par(mfrow=c(1,2)) X <- rbinom(10000, size=Size, prob=Prob) hpois(X, main=paste("B(",Size, ",", Prob,")")) qqpois(X) par(mfrow=c(1,2)) Y <- rpois(10000, lambda=Size*Prob) hpois(Y, main=paste("Po(lambda=",Size,"*", Prob,")")) qqpois(Y) ############ Size <- 5*8 Prob <- 0.5/8 par(mfrow=c(1,2)) X <- rbinom(10000, size=Size, prob=Prob) hpois(X, main=paste("B(",Size, ",", Prob,")")) qqpois(X) par(mfrow=c(1,2)) Y <- rpois(10000, lambda=Size*Prob) hpois(Y, main=paste("Po(lambda=",Size,"*", Prob,")")) qqpois(Y) ############ Size <- 5*16 Prob <- 0.5/16 par(mfrow=c(1,2)) X <- rbinom(10000, size=Size, prob=Prob) hpois(X, main=paste("B(",Size, ",", Prob,")")) qqpois(X) par(mfrow=c(1,2)) Y <- rpois(10000, lambda=Size*Prob) hpois(Y, main=paste("Po(lambda=",Size,"*", Prob,")")) qqpois(Y) ################################ # Cleaning up. rm(X,Y) # THE END!