################################################################################
#
# Stat-Tutorial-04-Normal distribution
#
# R. Labouriau
#
# Last revised: Fall 2020
#
# In this tutorial you will work with basic notions related to the normal
# distribution (and some related distributions).
#
# We will cover the following:
# 1) The density of the normal distribution
# 2) Simulating the normal distribution
# 3) Checking whether some observations are well described by the normal
#    distribution
# 4) Estimating the mean of iid normal observations with constant variance
# 5) Calculating probabilities with normal distributions
# 6) A bit on the t-distribution
#
################################################################################
# Copyright © 2019 by Rodrigo Labouriau
################################################################################
#
# 1) The density of the normal distribution
#
################################################################################

# The probability density of a normal distribution is calculated using
# the function dnorm. The density depends on two parameters:
# the mean and the standard deviation (or equivalently the variance).
# To calculate the value of the density of the normal with mean, say 0, and
# standard deviation 1 (called the standard normal distribution) at the point
# zero we write:
dnorm(x=0, mean = 0, sd = 1)
# or equivalently
dnorm(0,0,1)

# If you don't specify the mean and the standard deviation, R takes the
# defaults, which are mean = 0 and sd = 1.
dnorm(0)

# Now I calculate the value of the density of the standard normal
# distribution at several values
dnorm(1)
dnorm(c(0,1))
seq(from=-3, to=3, by=0.1)
dnorm(seq(from=-3, to=3, by=0.1))
# Displaying in a "table"
cbind(seq(from=-3, to=3, by=0.1), dnorm(seq(from=-3, to=3, by=0.1)))

# Now I plot the density of the standard normal distribution
plot(dnorm, xlim=c(-3,3))
# Improving the plot
plot(dnorm, xlim=c(-3,3), lwd=3,
     ylab="Density of the standard normal distribution", xlab="")

# Now I calculate the density of the normal distribution with a different
# mean, say 10, and standard deviation 1, at the point, say, 9
dnorm(x=9, mean=10, sd=1)
# Here I plot the density
f <- function(x) dnorm(x, mean=10, sd=1)
plot(f, xlim=c(7,13), lwd=3,
     ylab="Density of the normal distribution (mean=10)", xlab="")

# Question: Why do the following two calculations give the same result?
dnorm(x=10, mean=10, sd=1) ; dnorm(x=0, mean=0, sd=1)
# Observe also the following pairs of calculations:
dnorm(x=9, mean=10, sd=1) ; dnorm(x=-1, mean=0, sd=1)
dnorm(x=12, mean=10, sd=1) ; dnorm(x=2, mean=0, sd=1)
# And observe this also:
dnorm(x=9, mean=10, sd=1) ; dnorm(x=-1, mean=0, sd=1)
dnorm(x=11, mean=10, sd=1) ; dnorm(x=1, mean=0, sd=1)
# Can you explain what happens in those calculations?
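# As a hint for the question above, here is a small numerical check:
# the normal density depends on x only through the standardized distance
# (x - mean)/sd, so shifting x and the mean by the same amount does not
# change the value of the density. (all.equal compares the two vectors up
# to a small numerical tolerance.)
x <- seq(from=-3, to=3, by=0.5)
all.equal(dnorm(x + 10, mean=10, sd=1), dnorm(x, mean=0, sd=1))
# More generally, dnorm(x, mean=m, sd=s) equals dnorm((x - m)/s)/s:
all.equal(dnorm(x, mean=2, sd=3), dnorm((x - 2)/3)/3)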
# I plot the density of the standard normal distribution and the
# density of the normal distribution with mean 10 and standard deviation 1
# in the same plot (with different colors)
# First I plot the standard normal distribution
plot(dnorm, xlim=c(-3,13), lwd=3, xlab="", ylab="Density",
     main="Two normal distributions with different means and same variance")
# then I superpose the other density with another color
f <- function(x) dnorm(x, mean=10, sd=1)
plot(f, xlim=c(-3,13), lwd=3, col="red", add=TRUE)

# Now I plot several densities with the mean varying in a range
# between 0 and 10
plot(dnorm, xlim=c(-3,13), lwd=3, xlab="", ylab="Density")
for(i in 1:9){
  f <- function(x) dnorm(x, mean=i, sd=1)
  plot(f, xlim=c(-3,13), lwd=2, lty=2, col=i, add=TRUE)
}
f <- function(x) dnorm(x, mean=10, sd=1)
plot(f, xlim=c(-3,13), lwd=3, col="red", add=TRUE)

# Now I will keep the mean fixed and change the variance (or equivalently
# the standard deviation)
# Please, run the code up to the next mark and observe the plot produced
plot(dnorm, xlim=c(-5,5), lwd=3, xlab="", ylab="Density")
# Observe the plot and then run the next two lines
# I will add a plot of the density of the normal distribution with mean zero
# and s.d. 2
f <- function(x) dnorm(x, mean=0, sd=2)
plot(f, xlim=c(-5,5), lwd=3, col="blue", add=TRUE)
# Now I add the density with s.d. 3
f <- function(x) dnorm(x, mean=0, sd=3)
plot(f, xlim=c(-5,5), lwd=3, col="red", add=TRUE)
# Now I add a sequence of densities with increasing s.d. (or variances)
for(i in seq(from=1.1, to=2.9, by=0.1) ){
  f <- function(x) dnorm(x, mean=0, sd=i)
  plot(f, xlim=c(-5,5), lwd=2, lty=2, add=TRUE)
}
# Here I superpose the densities with s.d. 2 and 3 again to get a better plot
f <- function(x) dnorm(x, mean=0, sd=2)
plot(f, xlim=c(-5,5), lwd=3, col="blue", add=TRUE)
f <- function(x) dnorm(x, mean=0, sd=3)
plot(f, xlim=c(-5,5), lwd=3, col="red", add=TRUE)

################################################################################
#
# 2) Simulating the normal distribution
#
################################################################################

# Here I produce a vector with 10000 simulated values of a standard
# normal distribution (mean 0 and variance 1)
# and I print the first 10 values of this vector
X <- rnorm(n=10000, mean=0, sd=1)
X[1:10]
# Here I draw a histogram of those values
hist(X)
# Now, I improve the quality of the histogram
hist(X, col="lightblue", main="Simulated Standard Normal Distribution" )
# Now, I change the scale of the histogram so the vertical
# axis displays the relative frequency
hist(X, col="lightblue", freq=FALSE,
     main="Simulated Standard Normal Distribution" )
# Here I superpose the graph of the density of the standard normal
# distribution on the histogram.
# Do we have a good agreement between the simulated data and the
# theoretical density?
plot(dnorm, xlim=c(-4,4), lwd=2, add=TRUE)
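# A quick check of the simulation: the sample mean and the sample standard
# deviation of the 10000 simulated values should be close to the theoretical
# values 0 and 1 (they will not match exactly, since the sample is random).
mean(X) ; sd(X)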
# Now I superpose the graph of the density with a different mean.
# Which density fits the data best?
f <- function(x) dnorm(x, mean=1, sd=1)
plot(f, xlim=c(-4,4), lwd=3, col="red", add=TRUE)

# Now, I redraw the histogram and superpose densities with different
# standard deviations
hist(X, col="lightblue", freq=FALSE,
     main="Simulated Standard Normal Distribution" )
plot(dnorm, xlim=c(-4,4), lwd=2, add=TRUE)
f <- function(x) dnorm(x, mean=0, sd=2)
plot(f, xlim=c(-4,4), lwd=3, col="red", add=TRUE)
f <- function(x) dnorm(x, mean=0, sd=1/2)
plot(f, xlim=c(-4,4), lwd=3, col="blue", add=TRUE)

################################################################################
#
# 3) Checking whether some observations are well described by the normal
#    distribution
#
################################################################################

# There are several ways to verify whether some values are well described by
# the normal distribution (or, in short, are normally distributed).
# One (informal) way is to draw a histogram and compare it with the density,
# as we did above. Sometimes it is difficult to judge whether a histogram
# fits the plot of a density. An alternative way is to draw a normal
# QQ-plot, i.e. a plot of the observed quantiles (the ones we have from the
# observations, also called sample quantiles) against the theoretical
# quantiles (i.e. the ones we would expect if the data were normally
# distributed). Clearly, if the observations are normally distributed we
# should obtain a plot with the points (one for each observation) lying
# along a straight line. Departures from the straight line indicate
# departures from normality.
# This plot can be drawn using the functions qqnorm and qqline.
# Observe the code below.
X <- rnorm(1000)
qqnorm(X)
qqline(X)

X <- rnorm(100)
qqnorm(X); qqline(X)

X <- rnorm(100)
qqnorm(X); qqline(X)

X <- rnorm(100, mean=10, sd=1.5)
qqnorm(X); qqline(X)

# Now, let us see what happens if the data is not normally distributed.
# I simulate something uniformly distributed, draw a histogram superposed
# with the density of the normal distribution with the same mean and same
# variance. Then I draw a normal QQ-plot.
X <- runif(1000)
par(mfrow=c(2,1)) # This is to display the next two plots one over the other
hist(X, col="lightblue", freq=FALSE, ylim=c(0,1.8), xlim=c(-0.3,1.3))
f <- function(x) dnorm(x, mean=mean(X), sd=sd(X))
plot(f, xlim=c(-0.3,1.3), lwd=3, add=TRUE)
abline(h=0)
qqnorm(X); qqline(X)
par(mfrow=c(1,1))

# Here I simulate normally distributed observations and
# change one of the values to a very high value (an "outlier")
X <- rnorm(100)
par(mfrow=c(2,1))
qqnorm(X, main="Original Observations"); qqline(X)
X[1] <- 10
qqnorm(X, main="Observations with one (artificial) outlier"); qqline(X)
par(mfrow=c(1,1))
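# As a side remark, a normal QQ-plot can also be constructed "by hand":
# sort the observations (these are the sample quantiles) and plot them
# against quantiles of the standard normal distribution. The minimal sketch
# below, using the base R functions qnorm and ppoints, should reproduce,
# up to plotting details, what qqnorm does.
X <- rnorm(100)
TheoreticalQuantiles <- qnorm(ppoints(length(X)))
plot(TheoreticalQuantiles, sort(X),
     xlab="Theoretical Quantiles", ylab="Sample Quantiles")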
# Here I simulate two normal distributions, X and Y, with different means and
# put them together. The result should not be normally distributed, as you
# will see.
par(mfrow=c(2,1))
X <- rnorm(n=100, mean=10)
hist(X, col="lightblue")
qqnorm(X); qqline(X)
Y <- rnorm(n=100, mean=15)
hist(Y, col="lightblue")
qqnorm(Y); qqline(Y)
# Plotting X and Y in the same scale for comparison
hist(X, col="lightblue", xlim=c(5,20))
hist(Y, col="red", xlim=c(5,20))
# Here I put the two vectors in one single vector, called Z,
# and verify the normality
Z <- c(X, Y)
hist(Z)
qqnorm(Z); qqline(Z)
# Here I display the two normal distributions with different colors
par(mfrow=c(2,1))
hist(X, col="lightblue", xlim=c(5,20))
hist(Y, col="red", xlim=c(5,20), add=TRUE)
VectorWithCollors <- c( rep("lightblue",100), rep("red",100) )
VectorWithCollors
qqnorm(Z, col=VectorWithCollors)
qqline(Z)
par(mfrow=c(1,1))

# Exercise: Please describe a situation where the scenario simulated above
# could happen.

# Note that the differences between the means are not always so large.
# Observe the following code and graphs.
X <- rnorm(n=100, mean=10)
Y <- rnorm(n=100, mean=12)
Z <- c(X,Y)
par(mfrow=c(2,1))
hist(X, col="blue", xlim=range(Z), breaks=20)
hist(Y, col="red", xlim=range(Z), breaks=20, add=TRUE)
VectorWithCollors <- c( rep("blue",100), rep("red",100) )
qqnorm(Z, col=VectorWithCollors); qqline(Z)
par(mfrow=c(1,1))

################################################################################
# Here I introduce a formal test for normality: the Shapiro-Wilk test.
X <- rnorm(100)
shapiro.test(X)
# Please run the code above several times and observe the p-values obtained
# (of course they vary, since the simulated data also vary).
# Observe also the following:
X <- runif(100)
shapiro.test(X)
# Here I insert an artificial outlier:
X <- rnorm(100)
shapiro.test(X)
X[1] <- 10
shapiro.test(X)

# Please note that the detection power of the test is not good when the
# number of observations is small. Observe the results below:
X <- runif(100)
shapiro.test(X)
X <- runif(10)
shapiro.test(X)
X <- runif(5)
shapiro.test(X)
X <- runif(3)
shapiro.test(X)

# Here I perform the test several times and keep the results
# in a vector called Pvalues. First I show how to extract only the p-values
# from the output of the function shapiro.test
X <- runif(10)
shapiro.test(X)
shapiro.test(X)$p.value
# Now I perform the test several times and keep the results (the p-values)
# to understand how the test behaves.
Pvalues <- numeric(1000)
for(i in 1:1000){
  X <- runif(3)
  Pvalues[i] <- shapiro.test(X)$p.value
}
# Displaying the first 10 p-values
Pvalues[1:10]
# Calculating the minimum and the maximum of the 1000 p-values
range(Pvalues)
# Calculating the proportion of p-values that are smaller than 0.05
# (in which case one would typically declare that statistically significant
# departures from normality were detected)
mean(Pvalues < 0.05)
# As you see, when we use only 3 observations the test detects the departure
# from the normal distribution in only a small proportion of the simulations.
# That is not surprising; we don't have very much information on the
# distribution when working with so few observations.
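# For comparison, when the data really are normally distributed the p-values
# of the test are approximately uniformly distributed, so at the 0.05 level
# we should (wrongly) reject normality in roughly 5% of the simulations,
# whatever the number of observations. A minimal sketch:
Pvalues <- numeric(1000)
for(i in 1:1000){
  X <- rnorm(10)
  Pvalues[i] <- shapiro.test(X)$p.value
}
mean(Pvalues < 0.05)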
# The performance of the test improves when the number of observations
# increases. Observe the following results.

# Using 10 observations
Pvalues <- numeric(1000)
for(i in 1:1000){
  X <- runif(10)
  Pvalues[i] <- shapiro.test(X)$p.value
}
mean(Pvalues < 0.05)

# Using 20 observations
Pvalues <- numeric(1000)
for(i in 1:1000){
  X <- runif(20)
  Pvalues[i] <- shapiro.test(X)$p.value
}
mean(Pvalues < 0.05)

# Using 50 observations
Pvalues <- numeric(1000)
for(i in 1:1000){
  X <- runif(50)
  Pvalues[i] <- shapiro.test(X)$p.value
}
mean(Pvalues < 0.05)

################################################################################
#
# 4) Estimating the mean of iid normal observations with constant variance
#
################################################################################

# Suppose that you have some results of an experiment.
# Assume that the data are normally distributed (we can check this assumption),
# and that the observations are independent and all have the same expectation
# and the same variance. The problem is then to estimate the expectation based
# on the data we have at hand.
# It is possible to show that the sample mean (i.e. the sum of the observations
# divided by the number of observations) is a good estimate of the expectation.
# Here we will work a bit with this estimate and see that it is in fact
# a random quantity and has a known distribution.

# First I simulate some observations. This corresponds to running an experiment
# and registering the results. The difference is that here we know what we
# simulated; for instance, we know that the simulated values behave as if
# normally distributed, and we know the mean and the variance (we are in an
# almost divine situation).
# Here I simulate (imagine that I performed the experiment)
X <- rnorm(n=100, mean=10, sd=1)
# Here I calculate the estimate
mean(X)
# Here I simulate again and get a slightly different result.
# Please, repeat the process several times.
X <- rnorm(n=100, mean=10, sd=1)
mean(X)

# Next, I will simulate several times, say 1000 times, and keep the results in
# a vector called Results
Results <- numeric(1000)
for(r in 1:1000){
  X <- rnorm(n=100, mean=10, sd=1)
  Results[r] <- mean(X)
}
# Displaying the first 10 results.
# Each of these numbers corresponds to the estimate that we would have
# obtained if we had used the estimator (the sample mean) based on 100
# observations.
Results[1:10]
hist(Results, col="lightblue")
# Note that according to the basic theory, the estimates should be normally
# distributed (I am not proving that, but believe me please). Here we
# verify that.
qqnorm(Results); qqline(Results)
shapiro.test(Results)
# According to the theory the estimates should have expected value equal to the
# expected value of each observation. This is confirmed by the fact that
# the mean of the results is close to 10 (the value that we simulated).
mean(Results)
# According to the theory, the variance of the estimates (the values in the
# vector Results) should be equal to 1/100. To see that,
# Var(sum(X)/100) = Var(sum(X))/100^2
#                 = 100 * Var(one observation) / 100^2 = 1/100.
var(Results)
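# A quick check of the calculation above: the standard deviation of the
# estimates (the standard error of the mean) should then be
# sd of one observation / sqrt(n) = 1/sqrt(100) = 0.1.
sd(Results) ; 1/sqrt(100)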
# Now I repeat the same procedure but using samples of 10 observations
# instead of 100. We will get similar results, apart from the fact that the
# variance of the estimates should now be 1/10 (Why?)
Results <- numeric(1000)
for(r in 1:1000){
  X <- rnorm(n=10, mean=10, sd=1) # Here I changed the number of replicates
  Results[r] <- mean(X)
}
qqnorm(Results); qqline(Results)
shapiro.test(Results)
mean(Results)
var(Results)

################################################################################
#
# 5) Calculating probabilities with normal distributions
#
################################################################################

# Here we address the question of how to calculate the probability that
# a normally distributed random quantity assumes some values.
# I will illustrate this with some key examples. Please observe the
# code below.

# Suppose that X ~ N(0,1). What is the probability of observing
# a negative value?
pnorm(q=0, mean=0, sd=1)
# Or equivalently
pnorm(0)
# What is the probability of X <= -1.96?
pnorm(-1.96)
# What is the probability of X > 1.96?
1 - pnorm(1.96)
# or equivalently
pnorm(1.96, lower.tail=FALSE)

# Suppose that Y ~ N(10,2), i.e. mean 10 and variance 2 (so the standard
# deviation is sqrt(2)). What is the probability of Y > 11?
pnorm(11, mean=10, sd=sqrt(2), lower.tail=FALSE)
# Now, I test this result by simulating several times, say 1,000,000 times,
# and checking whether we get approximately the theoretical probability
# calculated here.
# Simulating one million values
Y <- rnorm(n=1000000, mean=10, sd=sqrt(2))
# Counting how many times Y exceeds 11
sum(Y > 11)
# Calculating the proportion of this event
sum(Y > 11)/1000000
# Alternatively
mean(Y > 11)
pnorm(11, mean=10, sd=sqrt(2), lower.tail=FALSE)
round( pnorm(11, mean=10, sd=sqrt(2), lower.tail=FALSE), digits=6)
# Exercise: what is the probability that Y takes values between 9 and 11?
# Test your result by simulating.

################################################################################
#
# 6) A bit on the t-distribution
#
################################################################################

# Sometimes we need to know the value of the variance to calculate some
# quantities (making tests and calculating confidence intervals) related
# to statistical models. One is then tempted to just replace the unknown
# variance by an estimate of the variance. If we do that, we introduce some
# additional uncertainty, and the resulting distribution becomes a
# t-distribution. The t-distribution tends to present large values more
# often than the normal distribution. I will show this by plotting the
# densities of several t-distributions together with the normal density,
# and by simulating.
# The t-distribution depends on a parameter called the degrees of freedom (df)
# (an integer > 0). Large values of the df (say > 30) yield a distribution
# that resembles the normal distribution very much. The function dt calculates
# the density of the t-distribution.

# I will plot first the density of a standard normal distribution
plot(dnorm, xlim=c(-4,4), lwd=2)
f <- function(x) dt(x, df=10)
plot(f, xlim=c(-4,4), lwd=2, col="red", lty=2, add=TRUE)
f <- function(x) dt(x, df=5)
plot(f, xlim=c(-4,4), lwd=2, col="red", lty=3, add=TRUE)
f <- function(x) dt(x, df=3)
plot(f, xlim=c(-4,4), lwd=2, col="red", add=TRUE)
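# The heavier tails of the t-distribution can also be seen directly from
# tail probabilities, computed with pt (the analogue of pnorm for the
# t-distribution): the probability of exceeding 3 is much larger under the
# t-distribution with 3 degrees of freedom than under the standard normal
# distribution.
pt(3, df=3, lower.tail=FALSE)
pnorm(3, lower.tail=FALSE)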
# Now I will simulate the t-distribution to demonstrate that we get
# more extreme values than what we would expect from a normal distribution.
# Please, execute the code below several times and observe the results.
X <- rt(n=1000, df=3)
hist(X, freq=FALSE, ylim=c(0,0.3), breaks=100, col="blue")
f <- function(x) dnorm(x, mean=mean(X), sd=sd(X))
plot(f, xlim=c(-10,10), lwd=3, col="red", add=TRUE)

par(mfrow=c(3,3))
for(i in 1:9){
  X <- rt(n=1000, df=3)
  hist(X, freq=FALSE, ylim=c(0,0.3), breaks=100, col="blue")
  f <- function(x) dnorm(x, mean=mean(X), sd=sd(X))
  plot(f, xlim=c(-10,10), lwd=3, col="red", add=TRUE)
}
par(mfrow=c(1,1))

par(mfrow=c(3,3))
for(i in 1:9){
  X <- rt(n=1000, df=3)
  qqnorm(X); qqline(X)
}
par(mfrow=c(1,1))

# Here I increase the number of degrees of freedom to 5
par(mfrow=c(3,3))
for(i in 1:9){
  X <- rt(n=1000, df=5)
  hist(X, freq=FALSE, ylim=c(0,0.3), breaks=100, col="blue")
  f <- function(x) dnorm(x, mean=mean(X), sd=sd(X))
  plot(f, xlim=c(-10,10), lwd=3, col="red", add=TRUE)
}
par(mfrow=c(1,1))

par(mfrow=c(3,3))
for(i in 1:9){
  X <- rt(n=1000, df=5)
  qqnorm(X); qqline(X)
}
par(mfrow=c(1,1))

# Here I increase the number of degrees of freedom to 30; now the
# results are close to the normal distribution.
par(mfrow=c(3,3))
for(i in 1:9){
  X <- rt(n=1000, df=30)
  hist(X, freq=FALSE, ylim=c(0,0.9), breaks=100, col="blue")
  f <- function(x) dnorm(x, mean=mean(X), sd=sd(X))
  plot(f, xlim=c(-10,10), lwd=3, col="red", add=TRUE)
}
par(mfrow=c(1,1))

par(mfrow=c(3,3))
for(i in 1:9){
  X <- rt(n=1000, df=30)
  qqnorm(X); qqline(X)
}
par(mfrow=c(1,1))

# THE END!