################################################################################
#
# Stat-Tutorial-08-CI.R
# The Confidence interval based on the normal and the t-distribution
#
# By R. Labouriau
# Last revised: Fall 2020
################################################################################
# Copyright © 2019 by Rodrigo Labouriau
#
# This tutorial illustrates how confidence intervals (CIs) for the mean
# (i.e. expectation) based on independent and identically normally distributed
# observations are constructed. We will consider two situations:
# 1) when the variance is known, so the CI will be based on the normal
#    distribution, and
# 2) the more realistic situation where the variance is not known and has
#    to be estimated from the data; in this case the CIs will be based on
#    the t-distribution.
#
# The following notions will be illustrated in this tutorial:
# - CIs and coverage of CIs
# - The construction of CIs using the normal distribution
# - The t-distribution and the necessity of using CIs based on the
#   t-distribution when the variance is not known.
#
################################################################################

################################################################################
# SCENARIO:
#
# When reading and executing the code, keep in mind the following fictitious
# practical scenario:
# An experimenter has performed a series of measurements, say in the lab
# (you might think of the field as well, or home ...), all under the same
# conditions, or approximately under the same conditions. She (or he) wants to
# estimate, based on the values that she observed, the range of variation
# of a typical measurement of this type (performed under similar conditions).
# More precisely, she wants to find an interval that will contain the expected
# value (or the theoretical mean) of the result of that experiment with a high
# probability, say 95%.
# That is, she wants to establish a lower and an upper limit such that the
# probability of the theoretical mean being larger than the lower limit and
# smaller than the upper limit is large, say larger than (or equal to) 95%.
# We call this interval a
# CONFIDENCE INTERVAL (CI) WITH COVERAGE 95%.
# This is a rough definition of a CI (please consult a good book of statistics
# to get a precise definition), but for the illustration purposes here, this
# notion suffices.
#
################################################################################

##
################################################################################
#
# PRELIMINARIES
#
# Here I set some basic characteristics of the scenario:
# She (that is, the experimenter) has observed N, say N=20, values (you may
# change these values if you want), which she calls the observations.
# She (and we) believe that the observations are independent (roughly said,
# the value of one observation does not affect the value of any other
# observation).
# The values of the observations vary typically around mu, say mu=10,
# which we call the (theoretical) mean (or expectation).
# Normally, we do not know the value of the expectation, but we estimate it
# using the data. A reasonable estimate of the expectation is the sample mean
# (i.e. the sum of all the observed values divided by the number of
# observations). But here we will simulate the scenario in the computer and
# therefore we will be in the privileged position that we know the true
# value of the expectation, say 10, but she (the experimenter) does not know
# that (so we are in a divine position). We will generate the data using a
# normal distribution with variance sigma2, say sigma2=1, and somehow pass the
# data to her. In the first instance we tell her what the variance is, so the
# variance is known to her (later we will change this scenario and will not
# inform her of the value of the variance).
# So, from the point of view of the experimenter, neither the distribution
# nor the expectation is known; she only has at hand N (say 20) observations
# and the information that the variance is sigma2. She assumes that the data
# is normally distributed, the observations are independent and the variance
# is known to be sigma2. Based on these conditions she will calculate a CI.
#
# We will simulate the scenario, i.e. perform an in silico experiment many
# times and observe the results.
#
################################################################################

# Setting the number of observations
N <- 20

# Setting the theoretical mean (expectation), which is known to us, but
# unknown to the person who would have performed the experiment
mu <- 10

# Setting the known variance
sigma2 <- 1

# Calculating the standard deviation (i.e. the square root of the variance)
sigma <- sqrt(sigma2)

# Simulating the observations, i.e. performing the in silico experiment
# (20 observations with expectation 10 and variance 1)
Observations <- rnorm(n = N, mean = mu, sd = sigma)

# Examining the results
Observations

# Describing the results
summary(Observations)
# freq = FALSE draws a density-scale histogram (bar areas sum to 1).
# Note: FALSE is spelled out — the shorthand F is an ordinary variable that
# can be reassigned, so relying on it is unsafe.
hist(Observations, col = "lightblue", freq = FALSE)

# Estimating the expectation by the SAMPLE mean
( EstimateOfTheMean <- mean(Observations) )

# Running another experiment, i.e. simulating the observations again.
Observations <- rnorm(n = N, mean = mu, sd = sigma)
( EstimateOfTheMean <- mean(Observations) )

# Running a third experiment
Observations <- rnorm(n = N, mean = mu, sd = sigma)
( EstimateOfTheMean <- mean(Observations) )

##
################################################################################
#
# Regularity of the estimates of the (theoretical) mean.
#
# Next, we will simulate the "experiment" several times and calculate each
# time an estimate of the (theoretical) mean. We will then use these values to
# demonstrate that there is a certain regularity in the estimates of the mean
# (as we should expect intuitively).
#
################################################################################

# Running 5 experiments and keeping the results of the estimation in a vector
# (one entry per experiment), which we call "EstimatesOfTheMean"
EstimatesOfTheMean <- numeric(5)
# The vector is initialized with zeroes in all the entries; we will fill it
# with estimates obtained from independent "executions" of the experiment next
EstimatesOfTheMean

# Here we simulate the experiment, or "execute" the experiment if you prefer,
# 5 times and keep the results in the vector "EstimatesOfTheMean"
for (i in seq_len(5)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  EstimatesOfTheMean[i] <- mean(Observations)
}
EstimatesOfTheMean
hist(EstimatesOfTheMean)

# Running a large number of experiments and keeping the results of the
# estimation in a vector (one entry per experiment). Here we simulate the
# experiment 43,600 times, which in our analogy would be equivalent to
# arranging that roughly every student of Aarhus University executes the
# experiment one time and sends the result to us.
Nexperiments <- 43600
# Preallocate the full result vector: writing past the end of the length-5
# vector inside the loop would силently extend (and copy) it on every
# iteration, which is quadratic-cost growth.
EstimatesOfTheMean <- numeric(Nexperiments)
for (i in seq_len(Nexperiments)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  EstimatesOfTheMean[i] <- mean(Observations)
}

# Here we display the first 20 results
EstimatesOfTheMean[1:20]

# Next we draw a histogram representing the (simulated) results of having
# executed the experiments 43,600 times!
hist(EstimatesOfTheMean, col = "lightblue")
# Here we draw a reference line at the actual mean
abline(v = mu, lwd = 3, col = "red")

# Now, we calculate the 2.5% and the 97.5% quantiles of the estimates of the
# (theoretical) mean.
# That is, we calculate the value, say q, such that 2.5% of the estimates of
# the mean based on the results of the experiments are smaller than q, and
# the value Q such that only 2.5% of the estimates are larger than Q (i.e.,
# q and Q are the 2.5% and the 97.5% quantiles of the estimates,
# respectively)
quantile(EstimatesOfTheMean, probs = c(0.025, 0.975))

##
# Repeating the sequence of experiments once more, i.e. abusing the students
# of Aarhus University and asking them to perform the experiment once more
# and send the results to us (just to be sure about the result)
for (i in seq_len(Nexperiments)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  EstimatesOfTheMean[i] <- mean(Observations)
}
round(quantile(EstimatesOfTheMean, probs = c(0.025, 0.975)), digits = 2)
round(quantile(EstimatesOfTheMean, probs = c(0.025, 0.975)), digits = 10)

# Repeating the sequence of experiments again
for (i in seq_len(Nexperiments)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  EstimatesOfTheMean[i] <- mean(Observations)
}
round(quantile(EstimatesOfTheMean, probs = c(0.025, 0.975)), digits = 2)
round(quantile(EstimatesOfTheMean, probs = c(0.025, 0.975)), digits = 10)

##
################################################################################
#
# Construction of empirical CIs
#
################################################################################
#
# Remark:
# Observe the regularity of the results above!
# The idea of CIs is to exploit the fact that although the estimates of the
# mean are random quantities (they don't give the same result when the
# experiment is repeated and they do not follow a completely predictable
# pattern), they still present some regularity. Observe the histogram of the
# results of the 43,600 experiments (one for each student).
hist(EstimatesOfTheMean, col = "lightblue")
# Note that the estimates are symmetrically distributed.
# The idea used to construct an empirical CI is to take advantage of the fact
# that the estimates are symmetrically distributed (they are in fact normally
# distributed). We will use a brute-force method, namely trying many
# possibilities until we get the CI with coverage 95% (i.e. with the
# probability of including mu=10 equal to 0.95).
#
################################################################################

# Suppose that the experimenter runs the experiment once and obtains
( Observations <- rnorm(n = N, mean = mu, sd = sigma) )

# She estimates the expectation by the sample mean of the observations
( EstimateOfTheMean <- mean(Observations) )

# She then proposes lower and upper limits by allowing deviations of 0.2
# units below and 0.2 units above the estimate of the mean:
( LowerLimit <- EstimateOfTheMean - 0.2 )
( UpperLimit <- EstimateOfTheMean + 0.2 )

# Check whether the "real" value of the theoretical mean lies inside the
# proposed CI
LowerLimit < mu & mu < UpperLimit

# Now she is satisfied: she has a CI for the theoretical mean. But is this a
# reasonable CI? Is the probability that it contains the true value of the
# theoretical mean equal to 0.95?
# She cannot know that (unless she uses some theory, but she does not want to
# do that at this stage), and therefore she asks us to use our good
# connections with the students of Aarhus University (and our almost divine
# power of in fact knowing the true value of the expectation, namely mu=10)
# to verify whether this procedure for calculating the CI is reasonable.
# We agreed and performed the experiment 43,600 times and asked the students
# to calculate the CI with the procedure proposed by the experimenter;
# (although protesting a bit) the students performed the experiments and
# calculated the CIs.
# The inclusion of mu in the CIs will be kept in this vector
IsMuInCI <- logical(Nexperiments)

# Here we "perform" 43,600 experiments, calculate the CIs, and register
# whether each CI contains mu (the "real" value of the expectation)
for (i in seq_len(Nexperiments)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  EstimateOfTheMean <- mean(Observations)
  LowerLimit <- EstimateOfTheMean - 0.2
  UpperLimit <- EstimateOfTheMean + 0.2
  IsMuInCI[i] <- LowerLimit < mu & mu < UpperLimit
}
# Now, we calculate the proportion of times the CI includes the "real" value
# of the expectation (theoretical mean): the empirical coverage
mean(IsMuInCI)

##
# Well, this proportion is (probably) very low, so we should make the CIs
# larger. After seeing the results of the experiments, she decided to update
# the procedure to construct CIs, now allowing deviations of 0.5 units below
# and above the estimate of the mean. The students were convinced to run the
# experiments with the new procedure for calculating CIs again. Let us see
# what happened.
for (i in seq_len(Nexperiments)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  EstimateOfTheMean <- mean(Observations)
  LowerLimit <- EstimateOfTheMean - 0.5
  UpperLimit <- EstimateOfTheMean + 0.5
  IsMuInCI[i] <- LowerLimit < mu & mu < UpperLimit
}
mean(IsMuInCI)

##
# Looking at the proportions of CIs containing the "true" value of the
# expectation, the experimenter realized that now the CIs were too wide, so
# the proportion of inclusion of the "real" value of the expectation was
# larger than 0.95. As she wanted to find the smallest CI with coverage 0.95,
# she decided to make the CIs slightly smaller. She decided to try deviations
# of 0.3 units, although many students started to protest! Let us see what
# happened when we collected all the CIs calculated with the new procedure by
# the poor students.
for (i in seq_len(Nexperiments)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  EstimateOfTheMean <- mean(Observations)
  LowerLimit <- EstimateOfTheMean - 0.3
  UpperLimit <- EstimateOfTheMean + 0.3
  IsMuInCI[i] <- LowerLimit < mu & mu < UpperLimit
}
mean(IsMuInCI)

# "Now the CIs are too short, let me make them wider" said the experimenter,
# starting to have some fun with the play, but at this point the students got
# tired of this abuse and refused to run the experiments!

##
################################################################################
#
# A theoretical CI for the expectation (theoretical mean)
#
# or
#
# "THERE IS NOTHING MORE PRACTICAL THAN A GOOD THEORY"
#
################################################################################
#
# After some discussion, a student of statistics suggested that they should
# use some theory to find the method to calculate CIs ... , but, how?
#
# The student (of statistics) answered with a mysterious expression on his
# face:
#
# "Use deviations of size 0.4382612703."
#
# The other students were reticent, but performed the 43,600 experiments
# (for the sake of the development of science) with the new procedure to
# calculate CIs. Look at the results.
for (i in seq_len(Nexperiments)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  EstimateOfTheMean <- mean(Observations)
  # 0.4382612703 = qnorm(0.975) * sigma / sqrt(N) for N = 20, sigma = 1
  # (explained further below)
  LowerLimit <- EstimateOfTheMean - 0.4382612703
  UpperLimit <- EstimateOfTheMean + 0.4382612703
  IsMuInCI[i] <- LowerLimit < mu & mu < UpperLimit
}
mean(IsMuInCI)
round(mean(IsMuInCI), digits = 2)
# Very close to 0.95 this time. The students got enthusiastic and ran the
# experiment once more, just to see that it was not just luck.
for (i in seq_len(Nexperiments)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  EstimateOfTheMean <- mean(Observations)
  LowerLimit <- EstimateOfTheMean - 0.4382612703
  UpperLimit <- EstimateOfTheMean + 0.4382612703
  IsMuInCI[i] <- LowerLimit < mu & mu < UpperLimit
}
mean(IsMuInCI)
round(mean(IsMuInCI), digits = 2)

# Indeed close; they accepted to run one more round of experiments, this time
# performing 2 experiments each, just to be really sure.
# Preallocate at the new, doubled length: assigning past the end of the old
# length-Nexperiments vector would extend (and copy) it on every iteration.
IsMuInCI <- logical(2 * Nexperiments)
for (i in seq_len(2 * Nexperiments)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  EstimateOfTheMean <- mean(Observations)
  LowerLimit <- EstimateOfTheMean - 0.4382612703
  UpperLimit <- EstimateOfTheMean + 0.4382612703
  IsMuInCI[i] <- LowerLimit < mu & mu < UpperLimit
}
mean(IsMuInCI)
round(mean(IsMuInCI), digits = 2)

##
# Amazing, but how was this constant 0.4382612703 calculated? Well, here is
# the answer of the student of statistics:
# "Note that the estimates of the expectation are normally distributed", said
# the student. Indeed, examining some of the previous results, he drew the
# following normal QQ plot.
qqnorm(EstimatesOfTheMean); qqline(EstimatesOfTheMean)
# "It should not be a surprise that the estimates of the expectation are
# normally distributed", continued the student, "since the observations are
# normally distributed. Indeed, the estimates of the expectation are just
# sample means, and sample means of normally distributed observations are
# also normally distributed. So there is no surprise that the estimates of
# the expectation were normally distributed", added the student of
# statistics. The other students agreed. Enthusiastic, the student of
# statistics continued: "Now, using some theory of probability (he had just
# taken a course on that), I claim that it can be proved that the expectation
# of the sample mean (note of the editor: he meant the estimates of the
# theoretical mean) is the 'real' expectation and the variance is sigma2/N".
# "I then used some more theory to see that the deviation that makes the CI
# have a coverage of 0.95 is given by 1.959963985*sigma/sqrt(20)". Someone
# asked "Where did this constant 1.959963985 come from?". The student of
# statistics answered: "1.959963985 is a number such that the probability of
# a standard normally distributed random variable ..."; "What?" another
# student interrupted him; "... the standard normal distribution is a normal
# distribution with mean 0 and variance 1, which is used as a reference, e.g.
# in old tables ..." The student of statistics continued "... the constant
# 1.959963985 is such that if a quantity is distributed according to the
# standard normal distribution, then the probability of this quantity being
# larger than 1.959963985 is 2.5% AND the probability of this quantity being
# less than -1.959963985 is 2.5%". He added that here he used an
# approximation, but one that is good enough; he calculated this in R by
# writing "qnorm(0.975)". Another student asked "but you in fact used
# 1.959963985*sigma/sqrt(20) in your calculations; why did you multiply by
# sigma/sqrt(20)?". The student of statistics answered: "Well, remember that
# I said that the sample means (i.e. the estimates of the expectation) are
# normally distributed WITH VARIANCE sigma2/N. Multiplying by sigma/sqrt(20)
# scales the constant to the standard deviation of the estimates of the
# expectation, so I could use the constant 1.959963985, say 1.96 or roughly
# 2, to calculate the probability of the estimates being larger than a
# certain upper limit (2.5%), or smaller than a lower limit (2.5%); the
# coverage probability of the CI is then 95% ....". Here are the
# calculations.
print(qnorm(0.975), digits = 10)
print(qnorm(0.975) / sqrt(20), digits = 10)

##
# Someone asked the student of statistics (first year) "You said a lot of
# interesting things and the method worked, but COULD YOU PLEASE SUMMARIZE".
# The student of statistics answered:
#
# "When you want to calculate a CI with 95% coverage for the expectation,
# then use the interval
#
#   ( sample mean - 1.96*sigma/sqrt(N) , sample mean + 1.96*sigma/sqrt(N) )
#
# This method for calculating the CI works when the observations are:
# - Independent
# - Normally distributed WITH VARIANCE KNOWN TO BE sigma2."
#
# Someone asked "What should I do if I want a CI with another coverage?
# Say I want a coverage of 0.99, just to be really sure that the true
# value of the parameter is in the CI."
# The student of statistics answered "That is one of the advantages of using
# a good theory: we can often adapt the results to a new situation. In this
# case, just change the constant 1.959963985 to 2.575829304 (roughly 2.58).
# You might consult a table for that, or type "print(qnorm(0.995),digits=10)"
# in R!" (he had just installed the free package on his computer!).
# The students ran the 43,600 experiments two times to check that.
print(qnorm(0.995) * sigma / sqrt(20), digits = 10)
# The 0.99-coverage deviation is computed instead of hard-coding the literal
# 0.5759729421 (identical value for N = 20, sigma = 1), so the check still
# holds if N or sigma are changed above.
Deviation99 <- qnorm(0.995) * sigma / sqrt(N)
# Preallocate for the 2*Nexperiments runs (avoids growing the vector in the
# loop when its previous length was smaller)
IsMuInCI <- logical(2 * Nexperiments)
for (i in seq_len(2 * Nexperiments)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  EstimateOfTheMean <- mean(Observations)
  LowerLimit <- EstimateOfTheMean - Deviation99
  UpperLimit <- EstimateOfTheMean + Deviation99
  IsMuInCI[i] <- LowerLimit < mu & mu < UpperLimit
}
mean(IsMuInCI)
round(mean(IsMuInCI), digits = 2)
# Amazing, it seems to work!

##
################################################################################
#
# Please, try different values of mu, N and sigma.
#
################################################################################

################################################################################
#
# CI when the variance (or the standard deviation) is NOT known
#
################################################################################
#
# Some of the other students asked "What if the variance is not known?"
# Another student, enthusiastic after the recent success, said
# "I have an idea.
# Just estimate the variance using the sample variance, i.e. the sum of the
# squared deviations divided by N-1. Then use this estimate instead of the
# known sigma2. No one will realize that we made this estimation and we just
# use the same procedure as always", and he even dared to say "Nothing more
# practical than the quick-and-dirty method!!!!".
# The student of statistics was a bit reluctant, but finally agreed to try
# the quick-and-dirty method. They ran a new set of experiments.

# Here is one experiment
( Observations <- rnorm(n = N, mean = mu, sd = sigma) )

# Old procedure (requires knowing the variance)
( LowerLimit <- mean(Observations) - qnorm(p = 0.975) * sigma / sqrt(N) )
( UpperLimit <- mean(Observations) + qnorm(p = 0.975) * sigma / sqrt(N) )

# New procedure (quick-and-dirty: plug in an estimate of the variance)
( LowerLimit <- mean(Observations) - qnorm(p = 0.975) * sd(Observations) / sqrt(N) )
( UpperLimit <- mean(Observations) + qnorm(p = 0.975) * sd(Observations) / sqrt(N) )
( IsMuInTheCI <- LowerLimit < mu & mu < UpperLimit )

##
# Here are the 43,600 experiments
IsMuInTheCI <- logical(Nexperiments)
for (i in seq_len(Nexperiments)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  LowerLimit <- mean(Observations) - qnorm(p = 0.975) * sd(Observations) / sqrt(N)
  UpperLimit <- mean(Observations) + qnorm(p = 0.975) * sd(Observations) / sqrt(N)
  IsMuInTheCI[i] <- LowerLimit < mu & mu < UpperLimit
}
mean(IsMuInTheCI)

# "It is not working, we got a coverage smaller than 0.95" shouted one of the
# students. They decided to perform more experiments.
IsMuInTheCI <- logical(Nexperiments * 2)
for (i in seq_len(Nexperiments * 2)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  LowerLimit <- mean(Observations) - qnorm(p = 0.975) * sd(Observations) / sqrt(N)
  UpperLimit <- mean(Observations) + qnorm(p = 0.975) * sd(Observations) / sqrt(N)
  IsMuInTheCI[i] <- LowerLimit < mu & mu < UpperLimit
}
mean(IsMuInTheCI)
# "Definitely not working!"
##
################################################################################
#
# An adjusted procedure - the t-distribution
#
################################################################################
#
# Well, said the student of statistics, a colleague student of statistics who
# worked in a brewery explained to me that indeed we should not expect this
# method to work properly (the colleague was William Sealy Gosset). Indeed, if
# we replace the unknown variance by its estimate, we introduce a new source
# of error, since the method assumes the variance to be perfectly known and we
# use an imperfect estimate. "What should we then do?" shouted a couple of
# other students, "I knew that by the end of the day this thing of theory
# would not work!". Well, replied the student of statistics, we should improve
# the theory! After a couple of beers my colleague came to the conclusion that
# we should use a t-distribution instead of a normal distribution; this
# distribution is calculated to compensate for the imprecision of using an
# estimate of the variance instead of the variance itself. The student drew
# some graphs (the density functions); most of the students did not follow him
# at this point, but they believed that the new method would work....
# Drawing the two densities
f <- function(x) dnorm(x, mean = mu, sd = sigma)
plot(f, xlim = c(7, 13), lwd = 4)
# t-density with N-1 degrees of freedom, shifted so it is centred at mu
f <- function(x) dt(x - mu, df = N - 1)
plot(f, xlim = c(7, 13), lwd = 4, col = "red", add = TRUE)
legend(x = "topright",
       legend = c("Normal distribution", "t-distribution"),
       lwd = 4, col = c("black", "red"), bty = "n")

# Comparing the 0.975 quantiles of the normal and the t-distribution
# (the t quantile is larger, so the t-based CI is wider)
qnorm(p = 0.975)
qt(p = 0.975, df = N - 1)

# Generating a new experiment
( Observations <- rnorm(n = N, mean = mu, sd = sigma) )

# New procedure (normal quantile with the estimated standard deviation)
( LowerLimit <- mean(Observations) - qnorm(p = 0.975) * sd(Observations) / sqrt(N) )
( UpperLimit <- mean(Observations) + qnorm(p = 0.975) * sd(Observations) / sqrt(N) )

# Adjusted new procedure (t quantile with the estimated standard deviation)
( LowerLimit <- mean(Observations) - qt(p = 0.975, df = N - 1) * sd(Observations) / sqrt(N) )
( UpperLimit <- mean(Observations) + qt(p = 0.975, df = N - 1) * sd(Observations) / sqrt(N) )
( IsMuInTheCI <- LowerLimit < mu & mu < UpperLimit )

# Testing the CI constructed with the t-distribution
IsMuInTheCI <- logical(Nexperiments)
for (i in seq_len(Nexperiments)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  LowerLimit <- mean(Observations) - qt(p = 0.975, df = N - 1) * sd(Observations) / sqrt(N)
  UpperLimit <- mean(Observations) + qt(p = 0.975, df = N - 1) * sd(Observations) / sqrt(N)
  IsMuInTheCI[i] <- LowerLimit < mu & mu < UpperLimit
}
mean(IsMuInTheCI)

# Once more, with even more experiments
IsMuInTheCI <- logical(Nexperiments * 2)
for (i in seq_len(Nexperiments * 2)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  LowerLimit <- mean(Observations) - qt(p = 0.975, df = N - 1) * sd(Observations) / sqrt(N)
  UpperLimit <- mean(Observations) + qt(p = 0.975, df = N - 1) * sd(Observations) / sqrt(N)
  IsMuInTheCI[i] <- LowerLimit < mu & mu < UpperLimit
}
mean(IsMuInTheCI)

################################################################################
# THE END!