################################################################################
#
# Stat-Tutorial-04-Normal distribution
#
# R. Labouriau
#
# Last revised: Fall 2020
#
# In this tutorial you will work with basic notions related to the normal
# distribution (and some related distributions).
#
# We will cover the following:
# 1) The density of the normal distribution
# 2) Simulating the normal distribution
# 3) Checking whether some observations are well described by the normal
#    distribution
# 4) Estimating the mean of iid normal observations with constant variance
# 5) Calculating probabilities with normal distributions
# 6) A bit on the t-distribution
#
################################################################################
# Copyright © 2019 by Rodrigo Labouriau
################################################################################
#
# 1) The density of the normal distribution
#
################################################################################

# The probability density of a normal distribution is calculated using
# the function dnorm. The density depends on two parameters:
# the mean and the standard deviation (or equivalently the variance).
# To calculate the value of the density of the normal with mean, say 0, and
# standard deviation 1 (called the standard normal distribution) at the point
# zero we write:
dnorm(x=0, mean = 0, sd = 1)
# or equivalently
dnorm(0,0,1)

# If you don't specify the mean and the standard deviation, R takes the
# defaults, which are mean = 0 and sd = 1.
dnorm(0)

# Now I calculate the value of the density of the standard normal
# distribution at several values
dnorm(1)
dnorm(c(0,1))
seq(from=-3, to=3, by=0.1)
dnorm(seq(from=-3, to=3, by=0.1))
# Displaying in a "table"
cbind(seq(from=-3, to=3, by=0.1), dnorm(seq(from=-3, to=3, by=0.1)))

# Now I plot the density of the standard normal distribution
plot(dnorm, xlim=c(-3,3))
# Improving the plot
plot(dnorm, xlim=c(-3,3), lwd=3,
     ylab="Density of the standard normal distribution", xlab="")

# Now I calculate the density of the normal distribution with a different
# mean, say 10, and standard deviation 1, at the point, say, 9
dnorm(x=9, mean=10, sd=1)
# Here I plot the density
f <- function(x) dnorm(x, mean=10, sd=1)
plot(f, xlim=c(7,13), lwd=3,
     ylab="Density of the normal distribution (mean=10)", xlab="")

# Question: Why do the following two calculations give the same result?
dnorm(x=10, mean=10, sd=1) ; dnorm(x=0, mean=0, sd=1)
# Observe also the following pairs of calculations:
dnorm(x=9, mean=10, sd=1) ; dnorm(x=-1, mean=0, sd=1)
dnorm(x=12, mean=10, sd=1) ; dnorm(x=2, mean=0, sd=1)
# And observe this also:
dnorm(x=9, mean=10, sd=1) ; dnorm(x=-1, mean=0, sd=1)
dnorm(x=11, mean=10, sd=1) ; dnorm(x=1, mean=0, sd=1)
# Can you explain what happens in those calculations?
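# As a hint for the question above, here is a small numerical check:
# the normal density depends on x only through the standardized distance
# (x - mean)/sd, so shifting x and the mean by the same amount does not
# change the value of the density. (all.equal compares the two vectors up
# to a small numerical tolerance.)
x <- seq(from=-3, to=3, by=0.5)
all.equal(dnorm(x + 10, mean=10, sd=1), dnorm(x, mean=0, sd=1))
# More generally, dnorm(x, mean=m, sd=s) equals dnorm((x - m)/s)/s:
all.equal(dnorm(x, mean=2, sd=3), dnorm((x - 2)/3)/3)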
# I plot the density of the standard normal distribution and the
# density of the normal distribution with mean 10 and standard deviation 1
# in the same plot (with different colors)
# First I plot the standard normal distribution
plot(dnorm, xlim=c(-3,13), lwd=3, xlab="", ylab="Density",
     main="Two normal distributions with different means and same variance")
# then I superpose the other density with another color
f <- function(x) dnorm(x, mean=10, sd=1)
plot(f, xlim=c(-3,13), lwd=3, col="red", add=TRUE)

# Now I plot several densities with the mean varying in a range
# between 0 and 10
plot(dnorm, xlim=c(-3,13), lwd=3, xlab="", ylab="Density")
for(i in 1:9){
  f <- function(x) dnorm(x, mean=i, sd=1)
  plot(f, xlim=c(-3,13), lwd=2, lty=2, col=i, add=TRUE)
}
f <- function(x) dnorm(x, mean=10, sd=1)
plot(f, xlim=c(-3,13), lwd=3, col="red", add=TRUE)

# Now I will keep the mean fixed and change the variance (or equivalently
# the standard deviation)
# Please, run the code up to the next mark and observe the plot produced
plot(dnorm, xlim=c(-5,5), lwd=3, xlab="", ylab="Density")
# Observe the plot and then run the next two lines
# I will add a plot of the density of the normal distribution with mean zero
# and s.d. 2
f <- function(x) dnorm(x, mean=0, sd=2)
plot(f, xlim=c(-5,5), lwd=3, col="blue", add=TRUE)
# Now I add the density with s.d. 3
f <- function(x) dnorm(x, mean=0, sd=3)
plot(f, xlim=c(-5,5), lwd=3, col="red", add=TRUE)
# Now I add a sequence of densities with increasing s.d. (or variances)
for(i in seq(from=1.1, to=2.9, by=0.1) ){
  f <- function(x) dnorm(x, mean=0, sd=i)
  plot(f, xlim=c(-5,5), lwd=2, lty=2, add=TRUE)
}
# Here I superpose the densities with s.d. 2 and 3 again to get a better plot
f <- function(x) dnorm(x, mean=0, sd=2)
plot(f, xlim=c(-5,5), lwd=3, col="blue", add=TRUE)
f <- function(x) dnorm(x, mean=0, sd=3)
plot(f, xlim=c(-5,5), lwd=3, col="red", add=TRUE)

################################################################################
#
# 2) Simulating the normal distribution
#
################################################################################

# Here I produce a vector with 10000 simulated values of a standard
# normal distribution (mean 0 and variance 1)
# and I print the first 10 values of this vector
X <- rnorm(n=10000, mean=0, sd=1)
X[1:10]
# Here I draw a histogram of those values
hist(X)
# Now, I improve the quality of the histogram
hist(X, col="lightblue", main="Simulated Standard Normal Distribution" )
# Now, I change the scale of the histogram so the vertical
# axis displays the relative frequency
hist(X, col="lightblue", freq=FALSE,
     main="Simulated Standard Normal Distribution" )
# Here I superpose the graph of the density of the standard normal
# distribution on the histogram.
# Do we have a good agreement between the simulated data and the
# theoretical density?
plot(dnorm, xlim=c(-4,4), lwd=2, add=TRUE)
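# A quick check of the simulation: the sample mean and the sample standard
# deviation of the 10000 simulated values should be close to the theoretical
# values 0 and 1 (they will not match exactly, since the sample is random).
mean(X) ; sd(X)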
# Now I superpose the graph of the density with a different mean.
# Which density fits the data best?
f <- function(x) dnorm(x, mean=1, sd=1)
plot(f, xlim=c(-4,4), lwd=3, col="red", add=TRUE)

# Now, I redraw the histogram and superpose densities with different
# standard deviations
hist(X, col="lightblue", freq=FALSE,
     main="Simulated Standard Normal Distribution" )
plot(dnorm, xlim=c(-4,4), lwd=2, add=TRUE)
f <- function(x) dnorm(x, mean=0, sd=2)
plot(f, xlim=c(-4,4), lwd=3, col="red", add=TRUE)
f <- function(x) dnorm(x, mean=0, sd=1/2)
plot(f, xlim=c(-4,4), lwd=3, col="blue", add=TRUE)

################################################################################
#
# 3) Checking whether some observations are well described by the normal
#    distribution
#
################################################################################

# There are several ways to verify whether some values are well described by
# the normal distribution (or, in short, are normally distributed).
# One (informal) way is to draw a histogram and compare it with the density,
# as we did above. Sometimes it is difficult to judge whether a histogram
# fits the plot of a density. An alternative way is to draw a normal
# QQ-plot, i.e. a plot of the observed quantiles (the ones we have from the
# observations, also called sample quantiles) against the theoretical
# quantiles (i.e. the ones we would expect if the data were normally
# distributed). Clearly, if the observations are normally distributed we
# should obtain a plot with the points (one for each observation) lying
# along a straight line. Departures from the straight line indicate
# departures from normality.
# This plot can be drawn using the functions qqnorm and qqline.
# Observe the code below.
X <- rnorm(1000)
qqnorm(X)
qqline(X)

X <- rnorm(100)
qqnorm(X); qqline(X)

X <- rnorm(100)
qqnorm(X); qqline(X)

X <- rnorm(100, mean=10, sd=1.5)
qqnorm(X); qqline(X)

# Now, let us see what happens if the data is not normally distributed.
# I simulate something uniformly distributed, draw a histogram superposed
# with the density of the normal distribution with the same mean and same
# variance. Then I draw a normal QQ-plot.
X <- runif(1000)
par(mfrow=c(2,1)) # This is to display the next two plots one over the other
hist(X, col="lightblue", freq=FALSE, ylim=c(0,1.8), xlim=c(-0.3,1.3))
f <- function(x) dnorm(x, mean=mean(X), sd=sd(X))
plot(f, xlim=c(-0.3,1.3), lwd=3, add=TRUE)
abline(h=0)
qqnorm(X); qqline(X)
par(mfrow=c(1,1))

# Here I simulate normally distributed observations and
# change one of the values to a very high value (an "outlier")
X <- rnorm(100)
par(mfrow=c(2,1))
qqnorm(X, main="Original Observations"); qqline(X)
X[1] <- 10
qqnorm(X, main="Observations with one (artificial) outlier"); qqline(X)
par(mfrow=c(1,1))
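# As a side remark, a normal QQ-plot can also be constructed "by hand":
# sort the observations (these are the sample quantiles) and plot them
# against quantiles of the standard normal distribution. The minimal sketch
# below, using the base R functions qnorm and ppoints, should reproduce,
# up to plotting details, what qqnorm does.
X <- rnorm(100)
TheoreticalQuantiles <- qnorm(ppoints(length(X)))
plot(TheoreticalQuantiles, sort(X),
     xlab="Theoretical Quantiles", ylab="Sample Quantiles")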
# Here I simulate two normal distributions, X and Y, with different means and
# put them together. The result should not be normally distributed, as you
# will see.
par(mfrow=c(2,1))
X <- rnorm(n=100, mean=10)
hist(X, col="lightblue")
qqnorm(X); qqline(X)
Y <- rnorm(n=100, mean=15)
hist(Y, col="lightblue")
qqnorm(Y); qqline(Y)
# Plotting X and Y in the same scale for comparison
hist(X, col="lightblue", xlim=c(5,20))
hist(Y, col="red", xlim=c(5,20))
# Here I put the two vectors in one single vector, called Z,
# and verify the normality
Z <- c(X, Y)
hist(Z)
qqnorm(Z); qqline(Z)
# Here I display the two normal distributions with different colors
par(mfrow=c(2,1))
hist(X, col="lightblue", xlim=c(5,20))
hist(Y, col="red", xlim=c(5,20), add=TRUE)
VectorWithCollors <- c( rep("lightblue",100), rep("red",100) )
VectorWithCollors
qqnorm(Z, col=VectorWithCollors)
qqline(Z)
par(mfrow=c(1,1))

# Exercise: Please describe a situation where the scenario simulated above
# could happen.

# Note that the differences between the means are not always so large.
# Observe the following code and graphs.
X <- rnorm(n=100, mean=10)
Y <- rnorm(n=100, mean=12)
Z <- c(X,Y)
par(mfrow=c(2,1))
hist(X, col="blue", xlim=range(Z), breaks=20)
hist(Y, col="red", xlim=range(Z), breaks=20, add=TRUE)
VectorWithCollors <- c( rep("blue",100), rep("red",100) )
qqnorm(Z, col=VectorWithCollors); qqline(Z)
par(mfrow=c(1,1))

################################################################################
# Here I introduce a formal test for normality: the Shapiro-Wilk test.
X <- rnorm(100)
shapiro.test(X)
# Please run the code above several times and observe the p-values obtained
# (of course they vary, since the simulated data also vary).
# Observe also the following:
X <- runif(100)
shapiro.test(X)
# Here I insert an artificial outlier:
X <- rnorm(100)
shapiro.test(X)
X[1] <- 10
shapiro.test(X)

# Please note that the detection power of the test is not good when the
# number of observations is small. Observe the results below:
X <- runif(100)
shapiro.test(X)
X <- runif(10)
shapiro.test(X)
X <- runif(5)
shapiro.test(X)
X <- runif(3)
shapiro.test(X)

# Here I perform the test several times and keep the results
# in a vector called Pvalues. First I show how to extract only the p-values
# from the output of the function shapiro.test
X <- runif(10)
shapiro.test(X)
shapiro.test(X)$p.value
# Now I perform the test several times and keep the results (the p-values)
# to understand how the test behaves.
Pvalues <- numeric(1000)
for(i in 1:1000){
  X <- runif(3)
  Pvalues[i] <- shapiro.test(X)$p.value
}
# Displaying the first 10 p-values
Pvalues[1:10]
# Calculating the minimum and the maximum of the 1000 p-values
range(Pvalues)
# Calculating the proportion of p-values that are smaller than 0.05
# (in which case one would typically declare that statistically significant
# departures from normality were detected)
mean(Pvalues < 0.05)
# As you see, when we use only 3 observations the test detects the departure
# from the normal distribution in only a small proportion of the simulations.
# That is not surprising; we don't have very much information on the
# distribution when working with so few observations.
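# For comparison, when the data really are normally distributed the p-values
# of the test are approximately uniformly distributed, so at the 0.05 level
# we should (wrongly) reject normality in roughly 5% of the simulations,
# whatever the number of observations. A minimal sketch:
Pvalues <- numeric(1000)
for(i in 1:1000){
  X <- rnorm(10)
  Pvalues[i] <- shapiro.test(X)$p.value
}
mean(Pvalues < 0.05)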
# The performance of the test improves when the number of observations
# increases. Observe the following results.

# Using 10 observations
Pvalues <- numeric(1000)
for(i in 1:1000){
  X <- runif(10)
  Pvalues[i] <- shapiro.test(X)$p.value
}
mean(Pvalues < 0.05)

# Using 20 observations
Pvalues <- numeric(1000)
for(i in 1:1000){
  X <- runif(20)
  Pvalues[i] <- shapiro.test(X)$p.value
}
mean(Pvalues < 0.05)

# Using 50 observations
Pvalues <- numeric(1000)
for(i in 1:1000){
  X <- runif(50)
  Pvalues[i] <- shapiro.test(X)$p.value
}
mean(Pvalues < 0.05)

################################################################################
#
# 4) Estimating the mean of iid normal observations with constant variance
#
################################################################################

# Suppose that you have some results of an experiment.
# Assume that the data are normally distributed (we can check this assumption),
# and that the observations are independent and all have the same expectation
# and the same variance. The problem is then to estimate the expectation based
# on the data we have at hand.
# It is possible to show that the sample mean (i.e. the sum of the observations
# divided by the number of observations) is a good estimate of the expectation.
# Here we will work a bit with this estimate and see that it is in fact
# a random quantity and has a known distribution.

# First I simulate some observations. This corresponds to running an experiment
# and registering the results. The difference is that here we know what we
# simulated; for instance, we know that the simulated values behave as if
# normally distributed, and we know the mean and the variance (we are in an
# almost divine situation).
# Here I simulate (imagine that I performed the experiment)
X <- rnorm(n=100, mean=10, sd=1)
# Here I calculate the estimate
mean(X)
# Here I simulate again and get a slightly different result.
# Please, repeat the process several times.
X <- rnorm(n=100, mean=10, sd=1)
mean(X)

# Next, I will simulate several times, say 1000 times, and keep the results in
# a vector called Results
Results <- numeric(1000)
for(r in 1:1000){
  X <- rnorm(n=100, mean=10, sd=1)
  Results[r] <- mean(X)
}
# Displaying the first 10 results.
# Each of these numbers corresponds to the estimate that we would have
# obtained if we had used the estimator (the sample mean) based on 100
# observations.
Results[1:10]
hist(Results, col="lightblue")
# Note that according to the basic theory, the estimates should be normally
# distributed (I am not proving that, but believe me please). Here we
# verify that.
qqnorm(Results); qqline(Results)
shapiro.test(Results)
# According to the theory the estimates should have expected value equal to the
# expected value of each observation. This is confirmed by the fact that
# the mean of the results is close to 10 (the value that we simulated).
mean(Results)
# According to the theory, the variance of the estimates (the values in the
# vector Results) should be equal to 1/100. To see that,
# Var(sum(X)/100) = Var(sum(X))/100^2
#                 = 100 * Var(one observation) / 100^2 = 1/100.
var(Results)
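# A quick check of the calculation above: the standard deviation of the
# estimates (the standard error of the mean) should then be
# sd of one observation / sqrt(n) = 1/sqrt(100) = 0.1.
sd(Results) ; 1/sqrt(100)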
# Now I repeat the same procedure but using samples of 10 observations
# instead of 100. We will get similar results, apart from the fact that the
# variance of the estimates should now be 1/10 (Why?)
Results <- numeric(1000)
for(r in 1:1000){
  X <- rnorm(n=10, mean=10, sd=1) # Here I changed the number of replicates
  Results[r] <- mean(X)
}
qqnorm(Results); qqline(Results)
shapiro.test(Results)
mean(Results)
var(Results)

################################################################################
#
# 5) Calculating probabilities with normal distributions
#
################################################################################

# Here we address the question of how to calculate the probability that
# a normally distributed random quantity assumes some values.
# I will illustrate this with some key examples. Please observe the
# code below.

# Suppose that X ~ N(0,1). What is the probability of observing
# a negative value?
pnorm(q=0, mean=0, sd=1)
# Or equivalently
pnorm(0)
# What is the probability of X <= -1.96?
pnorm(-1.96)
# What is the probability of X > 1.96?
1 - pnorm(1.96)
# or equivalently
pnorm(1.96, lower.tail=FALSE)

# Suppose that Y ~ N(10,2), i.e. mean 10 and variance 2 (so the standard
# deviation is sqrt(2)). What is the probability of Y > 11?
pnorm(11, mean=10, sd=sqrt(2), lower.tail=FALSE)
# Now, I test this result by simulating several times, say 1,000,000 times,
# and checking whether we get approximately the theoretical probability
# calculated here.
# Simulating one million values
Y <- rnorm(n=1000000, mean=10, sd=sqrt(2))
# Counting how many times Y exceeds 11
sum(Y > 11)
# Calculating the proportion of this event
sum(Y > 11)/1000000
# Alternatively
mean(Y > 11)
pnorm(11, mean=10, sd=sqrt(2), lower.tail=FALSE)
round( pnorm(11, mean=10, sd=sqrt(2), lower.tail=FALSE), digits=6)
# Exercise: what is the probability that Y takes values between 9 and 11?
# Test your result by simulating.

################################################################################
#
# 6) A bit on the t-distribution
#
################################################################################

# Sometimes we need to know the value of the variance to calculate some
# quantities (making tests and calculating confidence intervals) related
# to statistical models. One is then tempted to just replace the unknown
# variance by an estimate of the variance. If we do that, we introduce some
# additional uncertainty, and the resulting distribution becomes a
# t-distribution. The t-distribution tends to present large values more
# often than the normal distribution. I will show this by plotting the
# densities of several t-distributions together with the normal density,
# and by simulating.
# The t-distribution depends on a parameter called the degrees of freedom (df)
# (an integer > 0). Large values of the df (say > 30) yield a distribution
# that resembles the normal distribution very much. The function dt calculates
# the density of the t-distribution.

# I will plot first the density of a standard normal distribution
plot(dnorm, xlim=c(-4,4), lwd=2)
f <- function(x) dt(x, df=10)
plot(f, xlim=c(-4,4), lwd=2, col="red", lty=2, add=TRUE)
f <- function(x) dt(x, df=5)
plot(f, xlim=c(-4,4), lwd=2, col="red", lty=3, add=TRUE)
f <- function(x) dt(x, df=3)
plot(f, xlim=c(-4,4), lwd=2, col="red", add=TRUE)
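# The heavier tails of the t-distribution can also be seen directly from
# tail probabilities, computed with pt (the analogue of pnorm for the
# t-distribution): the probability of exceeding 3 is much larger under the
# t-distribution with 3 degrees of freedom than under the standard normal
# distribution.
pt(3, df=3, lower.tail=FALSE)
pnorm(3, lower.tail=FALSE)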
# Now I will simulate the t-distribution to demonstrate that we get
# more extreme values than what we would expect from a normal distribution.
# Please, execute the code below several times and observe the results.
X <- rt(n=1000, df=3)
hist(X, freq=FALSE, ylim=c(0,0.3), breaks=100, col="blue")
f <- function(x) dnorm(x, mean=mean(X), sd=sd(X))
plot(f, xlim=c(-10,10), lwd=3, col="red", add=TRUE)

par(mfrow=c(3,3))
for(i in 1:9){
  X <- rt(n=1000, df=3)
  hist(X, freq=FALSE, ylim=c(0,0.3), breaks=100, col="blue")
  f <- function(x) dnorm(x, mean=mean(X), sd=sd(X))
  plot(f, xlim=c(-10,10), lwd=3, col="red", add=TRUE)
}
par(mfrow=c(1,1))

par(mfrow=c(3,3))
for(i in 1:9){
  X <- rt(n=1000, df=3)
  qqnorm(X); qqline(X)
}
par(mfrow=c(1,1))

# Here I increase the number of degrees of freedom to 5
par(mfrow=c(3,3))
for(i in 1:9){
  X <- rt(n=1000, df=5)
  hist(X, freq=FALSE, ylim=c(0,0.3), breaks=100, col="blue")
  f <- function(x) dnorm(x, mean=mean(X), sd=sd(X))
  plot(f, xlim=c(-10,10), lwd=3, col="red", add=TRUE)
}
par(mfrow=c(1,1))

par(mfrow=c(3,3))
for(i in 1:9){
  X <- rt(n=1000, df=5)
  qqnorm(X); qqline(X)
}
par(mfrow=c(1,1))

# Here I increase the number of degrees of freedom to 30; now the
# results are close to the normal distribution.
par(mfrow=c(3,3))
for(i in 1:9){
  X <- rt(n=1000, df=30)
  hist(X, freq=FALSE, ylim=c(0,0.9), breaks=100, col="blue")
  f <- function(x) dnorm(x, mean=mean(X), sd=sd(X))
  plot(f, xlim=c(-10,10), lwd=3, col="red", add=TRUE)
}
par(mfrow=c(1,1))

par(mfrow=c(3,3))
for(i in 1:9){
  X <- rt(n=1000, df=30)
  qqnorm(X); qqline(X)
}
par(mfrow=c(1,1))

# THE END!