################################################################################
#
# Stat-Tutorial-08-CI.R
# The Confidence interval based on the normal and the t-distribution
#
# By R. Labouriau
# Last revised: Fall 2020
################################################################################
# Copyright © 2019 by Rodrigo Labouriau
#
# This tutorial illustrates how confidence intervals (CIs) for the mean
# (i.e. expectation) based on independent and identically normally distributed
# observations are constructed. We will consider two situations:
# 1) when the variance is known, so the CI will be based on the normal
#    distribution, and
# 2) the more realistic situation where the variance is not known and has
#    to be estimated from the data; in this case the CIs will be based on
#    the t-distribution.
#
# The following notions will be illustrated in this tutorial:
# - CIs and coverage of CIs
# - The construction of CIs using the normal distribution
# - The t-distribution and the necessity of using CIs based on the
#   t-distribution when the variance is not known.
#
################################################################################

################################################################################
# SCENARIO:
#
# When reading and executing the code, keep in mind the following fictitious
# practical scenario:
# An experimenter has performed a series of measurements, say in the lab
# (you might think of the field as well, or home ...), all under the same
# conditions, or approximately under the same conditions. She (or he) wants to
# estimate, based on the values that she observed, the range of variation
# of a typical measurement of this type (performed under similar conditions).
# More precisely, she wants to find an interval that will contain the expected
# value (or the theoretical mean) of the result of that experiment with a high
# probability, say 95%.
# That is, she wants to establish a lower and an upper limit such that the
# probability of the theoretical mean being larger than the lower limit and
# smaller than the upper limit is large, say larger than (or equal to) 95%.
# We call this interval a
# CONFIDENCE INTERVAL (CI) WITH COVERAGE 95%.
# This is a rough definition of a CI (please consult a good book of statistics
# to get a precise definition), but for the illustration purposes here, this
# notion suffices.
#
################################################################################

##
################################################################################
#
# PRELIMINARIES
#
# Here I set some basic characteristics of the scenario:
# She (that is, the experimenter) has observed N, say N=20, values (you may
# change these values if you want), which she calls the observations.
# She (and we) believe that the observations are independent (roughly said,
# the value of one observation does not affect the value of any other
# observation).
# The values of the observations vary typically around mu, say mu=10,
# which we call the (theoretical) mean (or expectation).
# Normally, we do not know the value of the expectation, but we estimate it
# using the data. A reasonable estimate of the expectation is the sample mean
# (i.e. the sum of all the observed values divided by the number of
# observations). But here we will simulate the scenario in the computer and
# therefore we will be in the privileged position that we know the true
# value of the expectation, say 10, but she (the experimenter) does not know
# that (so we are in a divine position). We will generate the data using a
# normal distribution with variance sigma2, say sigma2=1, and somehow pass the
# data to her. In the first instance we tell her what the variance is, so the
# variance is known to her (later we will change this scenario and will not
# inform her of the value of the variance).
# So, from the point of view of the experimenter, neither the distribution
# nor the expectation is known; she only has at hand N (say 20) observations
# and the information that the variance is sigma2. She assumes that the data
# is normally distributed, the observations are independent and the variance
# is known to be sigma2. Based on these conditions she will calculate a CI.
#
# We will simulate the scenario, i.e. perform an in silico experiment many
# times and observe the results.
#
################################################################################

# Setting the number of observations
N <- 20

# Setting the theoretical mean (expectation), which is known to us, but
# unknown to the person who would have performed the experiment
mu <- 10

# Setting the known variance
sigma2 <- 1

# Calculating the standard deviation (i.e. the square root of the variance)
sigma <- sqrt(sigma2)

# Simulating the observations, i.e. performing the in silico experiment
# (20 observations with expectation 10 and variance 1)
Observations <- rnorm(n = N, mean = mu, sd = sigma)

# Examining the results
Observations

# Describing the results
summary(Observations)
# freq = FALSE draws a density-scale histogram (bar areas sum to 1).
# Note: FALSE is spelled out — the shorthand F is an ordinary variable that
# can be reassigned, so relying on it is unsafe.
hist(Observations, col = "lightblue", freq = FALSE)

# Estimating the expectation by the SAMPLE mean
( EstimateOfTheMean <- mean(Observations) )

# Running another experiment, i.e. simulating the observations again.
Observations <- rnorm(n = N, mean = mu, sd = sigma)
( EstimateOfTheMean <- mean(Observations) )

# Running a third experiment
Observations <- rnorm(n = N, mean = mu, sd = sigma)
( EstimateOfTheMean <- mean(Observations) )

##
################################################################################
#
# Regularity of the estimates of the (theoretical) mean.
#
# Next, we will simulate the "experiment" several times and calculate each
# time an estimate of the (theoretical) mean. We will then use these values to
# demonstrate that there is a certain regularity in the estimates of the mean
# (as we should expect intuitively).
#
################################################################################

# Running 5 experiments and keeping the results of the estimation in a vector
# (one entry per experiment), which we call "EstimatesOfTheMean"
EstimatesOfTheMean <- numeric(5)
# The vector is initialized with zeroes in all the entries; we will fill it
# with estimates obtained from independent "executions" of the experiment next
EstimatesOfTheMean

# Here we simulate the experiment, or "execute" the experiment if you prefer,
# 5 times and keep the results in the vector "EstimatesOfTheMean"
for (i in seq_len(5)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  EstimatesOfTheMean[i] <- mean(Observations)
}
EstimatesOfTheMean
hist(EstimatesOfTheMean)

# Running a large number of experiments and keeping the results of the
# estimation in a vector (one entry per experiment). Here we simulate the
# experiment 43,600 times, which in our analogy would be equivalent to
# arranging that roughly every student of Aarhus University executes the
# experiment one time and sends the result to us.
Nexperiments <- 43600
# Preallocate the full result vector: writing past the end of the length-5
# vector inside the loop would силently extend (and copy) it on every
# iteration, which is quadratic-cost growth.
EstimatesOfTheMean <- numeric(Nexperiments)
for (i in seq_len(Nexperiments)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  EstimatesOfTheMean[i] <- mean(Observations)
}

# Here we display the first 20 results
EstimatesOfTheMean[1:20]

# Next we draw a histogram representing the (simulated) results of having
# executed the experiments 43,600 times!
hist(EstimatesOfTheMean, col = "lightblue")
# Here we draw a reference line at the actual mean
abline(v = mu, lwd = 3, col = "red")

# Now, we calculate the 2.5% and the 97.5% quantiles of the estimates of the
# (theoretical) mean.
# That is, we calculate the value, say q, such that 2.5% of the estimates of
# the mean based on the results of the experiments are smaller than q, and
# the value Q such that only 2.5% of the estimates are larger than Q (i.e.,
# q and Q are the 2.5% and the 97.5% quantiles of the estimates,
# respectively)
quantile(EstimatesOfTheMean, probs = c(0.025, 0.975))

##
# Repeating the sequence of experiments once more, i.e. abusing the students
# of Aarhus University and asking them to perform the experiment once more
# and send the results to us (just to be sure about the result)
for (i in seq_len(Nexperiments)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  EstimatesOfTheMean[i] <- mean(Observations)
}
round(quantile(EstimatesOfTheMean, probs = c(0.025, 0.975)), digits = 2)
round(quantile(EstimatesOfTheMean, probs = c(0.025, 0.975)), digits = 10)

# Repeating the sequence of experiments again
for (i in seq_len(Nexperiments)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  EstimatesOfTheMean[i] <- mean(Observations)
}
round(quantile(EstimatesOfTheMean, probs = c(0.025, 0.975)), digits = 2)
round(quantile(EstimatesOfTheMean, probs = c(0.025, 0.975)), digits = 10)

##
################################################################################
#
# Construction of empirical CIs
#
################################################################################
#
# Remark:
# Observe the regularity of the results above!
# The idea of CIs is to exploit the fact that although the estimates of the
# mean are random quantities (they don't give the same result when the
# experiment is repeated and they do not follow a completely predictable
# pattern), they still present some regularity. Observe the histogram of the
# results of the 43,600 experiments (one for each student).
hist(EstimatesOfTheMean, col = "lightblue")
# Note that the estimates are symmetrically distributed.
# The idea used to construct an empirical CI is to take advantage of the fact
# that the estimates are symmetrically distributed (they are in fact normally
# distributed). We will use a brute-force method, namely trying many
# possibilities until we get the CI with coverage 95% (i.e. with the
# probability of including mu=10 equal to 0.95).
#
################################################################################

# Suppose that the experimenter runs the experiment once and obtains
( Observations <- rnorm(n = N, mean = mu, sd = sigma) )

# She estimates the expectation by the sample mean of the observations
( EstimateOfTheMean <- mean(Observations) )

# She then proposes lower and upper limits by allowing deviations of 0.2
# units below and 0.2 units above the estimate of the mean:
( LowerLimit <- EstimateOfTheMean - 0.2 )
( UpperLimit <- EstimateOfTheMean + 0.2 )

# Check whether the "real" value of the theoretical mean lies inside the
# proposed CI
LowerLimit < mu & mu < UpperLimit

# Now she is satisfied: she has a CI for the theoretical mean. But is this a
# reasonable CI? Is the probability that it contains the true value of the
# theoretical mean equal to 0.95?
# She cannot know that (unless she uses some theory, but she does not want to
# do that at this stage), and therefore she asks us to use our good
# connections with the students of Aarhus University (and our almost divine
# power of in fact knowing the true value of the expectation, namely mu=10)
# to verify whether this procedure for calculating the CI is reasonable.
# We agreed and performed the experiment 43,600 times and asked the students
# to calculate the CI with the procedure proposed by the experimenter;
# (although protesting a bit) the students performed the experiments and
# calculated the CIs.
# The inclusion of mu in the CIs will be kept in this vector
IsMuInCI <- logical(Nexperiments)

# Here we "perform" 43,600 experiments, calculate the CIs, and register
# whether each CI contains mu (the "real" value of the expectation)
for (i in seq_len(Nexperiments)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  EstimateOfTheMean <- mean(Observations)
  LowerLimit <- EstimateOfTheMean - 0.2
  UpperLimit <- EstimateOfTheMean + 0.2
  IsMuInCI[i] <- LowerLimit < mu & mu < UpperLimit
}
# Now, we calculate the proportion of times the CI includes the "real" value
# of the expectation (theoretical mean): the empirical coverage
mean(IsMuInCI)

##
# Well, this proportion is (probably) very low, so we should make the CIs
# larger. After seeing the results of the experiments, she decided to update
# the procedure to construct CIs, now allowing deviations of 0.5 units below
# and above the estimate of the mean. The students were convinced to run the
# experiments with the new procedure for calculating CIs again. Let us see
# what happened.
for (i in seq_len(Nexperiments)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  EstimateOfTheMean <- mean(Observations)
  LowerLimit <- EstimateOfTheMean - 0.5
  UpperLimit <- EstimateOfTheMean + 0.5
  IsMuInCI[i] <- LowerLimit < mu & mu < UpperLimit
}
mean(IsMuInCI)

##
# Looking at the proportions of CIs containing the "true" value of the
# expectation, the experimenter realized that now the CIs were too wide, so
# the proportion of inclusion of the "real" value of the expectation was
# larger than 0.95. As she wanted to find the smallest CI with coverage 0.95,
# she decided to make the CIs slightly smaller. She decided to try deviations
# of 0.3 units, although many students started to protest! Let us see what
# happened when we collected all the CIs calculated with the new procedure by
# the poor students.
for (i in seq_len(Nexperiments)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  EstimateOfTheMean <- mean(Observations)
  LowerLimit <- EstimateOfTheMean - 0.3
  UpperLimit <- EstimateOfTheMean + 0.3
  IsMuInCI[i] <- LowerLimit < mu & mu < UpperLimit
}
mean(IsMuInCI)

# "Now the CIs are too short, let me make them wider" said the experimenter,
# starting to have some fun with the play, but at this point the students got
# tired of this abuse and refused to run the experiments!

##
################################################################################
#
# A theoretical CI for the expectation (theoretical mean)
#
# or
#
# "THERE IS NOTHING MORE PRACTICAL THAN A GOOD THEORY"
#
################################################################################
#
# After some discussion, a student of statistics suggested that they should
# use some theory to find the method to calculate CIs ... , but, how?
#
# The student (of statistics) answered with a mysterious expression on his
# face:
#
# "Use deviations of size 0.4382612703."
#
# The other students were reticent, but performed the 43,600 experiments
# (for the sake of the development of science) with the new procedure to
# calculate CIs. Look at the results.
for (i in seq_len(Nexperiments)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  EstimateOfTheMean <- mean(Observations)
  # 0.4382612703 = qnorm(0.975) * sigma / sqrt(N) for N = 20, sigma = 1
  # (explained further below)
  LowerLimit <- EstimateOfTheMean - 0.4382612703
  UpperLimit <- EstimateOfTheMean + 0.4382612703
  IsMuInCI[i] <- LowerLimit < mu & mu < UpperLimit
}
mean(IsMuInCI)
round(mean(IsMuInCI), digits = 2)
# Very close to 0.95 this time. The students got enthusiastic and ran the
# experiment once more, just to see that it was not just luck.
for (i in seq_len(Nexperiments)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  EstimateOfTheMean <- mean(Observations)
  LowerLimit <- EstimateOfTheMean - 0.4382612703
  UpperLimit <- EstimateOfTheMean + 0.4382612703
  IsMuInCI[i] <- LowerLimit < mu & mu < UpperLimit
}
mean(IsMuInCI)
round(mean(IsMuInCI), digits = 2)

# Indeed close; they accepted to run one more round of experiments, this time
# performing 2 experiments each, just to be really sure.
# Preallocate at the new, doubled length: assigning past the end of the old
# length-Nexperiments vector would extend (and copy) it on every iteration.
IsMuInCI <- logical(2 * Nexperiments)
for (i in seq_len(2 * Nexperiments)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  EstimateOfTheMean <- mean(Observations)
  LowerLimit <- EstimateOfTheMean - 0.4382612703
  UpperLimit <- EstimateOfTheMean + 0.4382612703
  IsMuInCI[i] <- LowerLimit < mu & mu < UpperLimit
}
mean(IsMuInCI)
round(mean(IsMuInCI), digits = 2)

##
# Amazing, but how was this constant 0.4382612703 calculated? Well, here is
# the answer of the student of statistics:
# "Note that the estimates of the expectation are normally distributed", said
# the student. Indeed, examining some of the previous results, he drew the
# following normal QQ plot.
qqnorm(EstimatesOfTheMean); qqline(EstimatesOfTheMean)
# "It should not be a surprise that the estimates of the expectation are
# normally distributed", continued the student, "since the observations are
# normally distributed. Indeed, the estimates of the expectation are just
# sample means, and sample means of normally distributed observations are
# also normally distributed. So there is no surprise that the estimates of
# the expectation were normally distributed", added the student of
# statistics. The other students agreed. Enthusiastic, the student of
# statistics continued: "Now, using some theory of probability (he had just
# taken a course on that), I claim that it can be proved that the expectation
# of the sample mean (note of the editor: he meant the estimates of the
# theoretical mean) is the 'real' expectation and the variance is sigma2/N".
# "I then used some more theory to see that the deviation that makes the CI
# have a coverage of 0.95 is given by 1.959963985*sigma/sqrt(20)". Someone
# asked "Where did this constant 1.959963985 come from?". The student of
# statistics answered: "1.959963985 is a number such that the probability of
# a standard normally distributed random variable ..."; "What?" another
# student interrupted him; "... the standard normal distribution is a normal
# distribution with mean 0 and variance 1, which is used as a reference, e.g.
# in old tables ..." The student of statistics continued "... the constant
# 1.959963985 is such that if a quantity is distributed according to the
# standard normal distribution, then the probability of this quantity being
# larger than 1.959963985 is 2.5% AND the probability of this quantity being
# less than -1.959963985 is 2.5%". He added that here he used an
# approximation, but one that is good enough; he calculated this in R by
# writing "qnorm(0.975)". Another student asked "but you in fact used
# 1.959963985*sigma/sqrt(20) in your calculations; why did you multiply by
# sigma/sqrt(20)?". The student of statistics answered: "Well, remember that
# I said that the sample means (i.e. the estimates of the expectation) are
# normally distributed WITH VARIANCE sigma2/N. Multiplying by sigma/sqrt(20)
# scales the constant to the standard deviation of the estimates of the
# expectation, so I could use the constant 1.959963985, say 1.96 or roughly
# 2, to calculate the probability of the estimates being larger than a
# certain upper limit (2.5%), or smaller than a lower limit (2.5%); the
# coverage probability of the CI is then 95% ....". Here are the
# calculations.
print(qnorm(0.975), digits = 10)
print(qnorm(0.975) / sqrt(20), digits = 10)

##
# Someone asked the student of statistics (first year) "You said a lot of
# interesting things and the method worked, but COULD YOU PLEASE SUMMARIZE".
# The student of statistics answered:
#
# "When you want to calculate a CI with 95% coverage for the expectation,
# then use the interval
#
#   ( sample mean - 1.96*sigma/sqrt(N) , sample mean + 1.96*sigma/sqrt(N) )
#
# This method for calculating the CI works when the observations are:
# - Independent
# - Normally distributed WITH VARIANCE KNOWN TO BE sigma2."
#
# Someone asked "What should I do if I want a CI with another coverage?
# Say I want a coverage of 0.99, just to be really sure that the true
# value of the parameter is in the CI."
# The student of statistics answered "That is one of the advantages of using
# a good theory: we can often adapt the results to a new situation. In this
# case, just change the constant 1.959963985 to 2.575829304 (roughly 2.58).
# You might consult a table for that, or type "print(qnorm(0.995),digits=10)"
# in R!" (he had just installed the free package on his computer!).
# The students ran the 43,600 experiments two times to check that.
print(qnorm(0.995) * sigma / sqrt(20), digits = 10)
# The 0.99-coverage deviation is computed instead of hard-coding the literal
# 0.5759729421 (identical value for N = 20, sigma = 1), so the check still
# holds if N or sigma are changed above.
Deviation99 <- qnorm(0.995) * sigma / sqrt(N)
# Preallocate for the 2*Nexperiments runs (avoids growing the vector in the
# loop when its previous length was smaller)
IsMuInCI <- logical(2 * Nexperiments)
for (i in seq_len(2 * Nexperiments)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  EstimateOfTheMean <- mean(Observations)
  LowerLimit <- EstimateOfTheMean - Deviation99
  UpperLimit <- EstimateOfTheMean + Deviation99
  IsMuInCI[i] <- LowerLimit < mu & mu < UpperLimit
}
mean(IsMuInCI)
round(mean(IsMuInCI), digits = 2)
# Amazing, it seems to work!

##
################################################################################
#
# Please, try different values of mu, N and sigma.
#
################################################################################

################################################################################
#
# CI when the variance (or the standard deviation) is NOT known
#
################################################################################
#
# Some of the other students asked "What if the variance is not known?"
# Another student, enthusiastic after the recent success, said
# "I have an idea.
# Just estimate the variance using the sample variance, i.e. the sum of the
# squared deviations divided by N-1. Then use this estimate instead of the
# known sigma2. No one will realize that we made this estimation and we just
# use the same procedure as always", and he even dared to say "Nothing more
# practical than the quick-and-dirty method!!!!".
# The student of statistics was a bit reluctant, but finally agreed to try
# the quick-and-dirty method. They ran a new set of experiments.

# Here is one experiment
( Observations <- rnorm(n = N, mean = mu, sd = sigma) )

# Old procedure (requires knowing the variance)
( LowerLimit <- mean(Observations) - qnorm(p = 0.975) * sigma / sqrt(N) )
( UpperLimit <- mean(Observations) + qnorm(p = 0.975) * sigma / sqrt(N) )

# New procedure (quick-and-dirty: plug in an estimate of the variance)
( LowerLimit <- mean(Observations) - qnorm(p = 0.975) * sd(Observations) / sqrt(N) )
( UpperLimit <- mean(Observations) + qnorm(p = 0.975) * sd(Observations) / sqrt(N) )
( IsMuInTheCI <- LowerLimit < mu & mu < UpperLimit )

##
# Here are the 43,600 experiments
IsMuInTheCI <- logical(Nexperiments)
for (i in seq_len(Nexperiments)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  LowerLimit <- mean(Observations) - qnorm(p = 0.975) * sd(Observations) / sqrt(N)
  UpperLimit <- mean(Observations) + qnorm(p = 0.975) * sd(Observations) / sqrt(N)
  IsMuInTheCI[i] <- LowerLimit < mu & mu < UpperLimit
}
mean(IsMuInTheCI)

# "It is not working, we got a coverage smaller than 0.95" shouted one of the
# students. They decided to perform more experiments.
IsMuInTheCI <- logical(Nexperiments * 2)
for (i in seq_len(Nexperiments * 2)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  LowerLimit <- mean(Observations) - qnorm(p = 0.975) * sd(Observations) / sqrt(N)
  UpperLimit <- mean(Observations) + qnorm(p = 0.975) * sd(Observations) / sqrt(N)
  IsMuInTheCI[i] <- LowerLimit < mu & mu < UpperLimit
}
mean(IsMuInTheCI)
# "Definitely not working!"
##
################################################################################
#
# An adjusted procedure - the t-distribution
#
################################################################################
#
# Well, said the student of statistics, a colleague student of statistics who
# worked in a brewery explained to me that indeed we should not expect this
# method to work properly (the colleague was William Sealy Gosset). Indeed, if
# we replace the unknown variance by its estimate, we introduce a new source
# of error, since the method assumes the variance to be perfectly known and we
# use an imperfect estimate. "What should we then do?" shouted a couple of
# other students, "I knew that by the end of the day this thing of theory
# would not work!". Well, replied the student of statistics, we should improve
# the theory! After a couple of beers my colleague came to the conclusion that
# we should use a t-distribution instead of a normal distribution; this
# distribution is calculated to compensate for the imprecision of using an
# estimate of the variance instead of the variance itself. The student drew
# some graphs (the density functions); most of the students did not follow him
# at this point, but they believed that the new method would work....
# Drawing the two densities
f <- function(x) dnorm(x, mean = mu, sd = sigma)
plot(f, xlim = c(7, 13), lwd = 4)
# t-density with N-1 degrees of freedom, shifted so it is centred at mu
f <- function(x) dt(x - mu, df = N - 1)
plot(f, xlim = c(7, 13), lwd = 4, col = "red", add = TRUE)
legend(x = "topright",
       legend = c("Normal distribution", "t-distribution"),
       lwd = 4, col = c("black", "red"), bty = "n")

# Comparing the 0.975 quantiles of the normal and the t-distribution
# (the t quantile is larger, so the t-based CI is wider)
qnorm(p = 0.975)
qt(p = 0.975, df = N - 1)

# Generating a new experiment
( Observations <- rnorm(n = N, mean = mu, sd = sigma) )

# New procedure (normal quantile with the estimated standard deviation)
( LowerLimit <- mean(Observations) - qnorm(p = 0.975) * sd(Observations) / sqrt(N) )
( UpperLimit <- mean(Observations) + qnorm(p = 0.975) * sd(Observations) / sqrt(N) )

# Adjusted new procedure (t quantile with the estimated standard deviation)
( LowerLimit <- mean(Observations) - qt(p = 0.975, df = N - 1) * sd(Observations) / sqrt(N) )
( UpperLimit <- mean(Observations) + qt(p = 0.975, df = N - 1) * sd(Observations) / sqrt(N) )
( IsMuInTheCI <- LowerLimit < mu & mu < UpperLimit )

# Testing the CI constructed with the t-distribution
IsMuInTheCI <- logical(Nexperiments)
for (i in seq_len(Nexperiments)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  LowerLimit <- mean(Observations) - qt(p = 0.975, df = N - 1) * sd(Observations) / sqrt(N)
  UpperLimit <- mean(Observations) + qt(p = 0.975, df = N - 1) * sd(Observations) / sqrt(N)
  IsMuInTheCI[i] <- LowerLimit < mu & mu < UpperLimit
}
mean(IsMuInTheCI)

# Once more, with even more experiments
IsMuInTheCI <- logical(Nexperiments * 2)
for (i in seq_len(Nexperiments * 2)) {
  Observations <- rnorm(n = N, mean = mu, sd = sigma)
  LowerLimit <- mean(Observations) - qt(p = 0.975, df = N - 1) * sd(Observations) / sqrt(N)
  UpperLimit <- mean(Observations) + qt(p = 0.975, df = N - 1) * sd(Observations) / sqrt(N)
  IsMuInTheCI[i] <- LowerLimit < mu & mu < UpperLimit
}
mean(IsMuInTheCI)

################################################################################
# THE END!