################################################################################
#
# Stat-Tutorial-09-z-test.R 
# The z-test (bilateral)
#
# By R. Labouriau
# Last revised: Fall 2020
################################################################################
# Copyright © 2019 by Rodrigo Labouriau
#
# This tutorial illustrates how the classic z-test for testing whether the
# theoretical mean is equal to a given value, using a sample of independent and
# identically distributed normally distributed with fixed and KNOWN variance.
# 
# The following basic notions of hypotheses tests will be illustrated:
#   - Level of significance (and error of type I, i.e. wrongly rejecting H0)
#   - Power function (and error of type II, i.e. wrongly accepting H0 )
#   - How the power is influenced by the level of significance choosen and the
#     number of observations
#   - The p-value.
#
################################################################################

################################################################################
# PRELIMINAIRES
#
################################################################################

# Here I set some parameters to simulate (you might change it):

# mu will be the true value of the theoretical mean (expectation)
mu <- 10   

# sigma2 will be the variance (assumed to be known)
sigma2 <- 1    

# sigma will be the standard deviation
sigma <- sqrt(sigma2) 

# N will be the number of (iid, i.e. independent and identically
# distributed) observations
N <- 100       

# The function below returns the density of the normal distribution. I will use
# that to plot the density
Density <- function(x) dnorm(x,mean=mu, sd=sigma)
plot(Density, xlim=c(mu-3.5*sigma , mu+3.5*sigma))

# I improve the plot of the density
plot(Density, 
     xlim=c(mu-3.5*sigma , mu+3.5*sigma),
     lwd=4
)

# Here I simulate N independent observations of a normal distribution 
# with mean mu and  variance sigma2 (or standard deviation sigma).
X <- rnorm(N, mean=mu, sd=sigma)

# I draw a histogram of the simulated data using the frequencies 
hist(X, col="lightblue")

# I draw a histogram of the simulated data using the frequencies 
hist(X, col="lightblue", freq=F)

# Now I draw the histogram using relative frequencies
hist(X, col="lightblue", freq=F, ylim=c(0,0.5),
     main=paste(N, "simulated N(", mu, ",", sigma2, ")", "observations")
)
# and add vertical lines at the value of mu and the sample mean (interruped line)
abline(v=mu, col="red", lwd=4)
abline(v=mean(X), col="red", lwd=3, lty=2)
# and here I draw the density over the graph of the histogram
plot(Density, 
     xlim=c(mu-3.5*sigma , mu+3.5*sigma),
     lwd=4,
     add=T
)

################################################################################
#
# Constructing a test (the z-test)
#
################################################################################
#
# We will consider the following system of hypotheses:
#   H0: mu  = 10   (the null hypothesis)
#   HA: mu != 10   (the alternative hypothesis)
#
# Here is a first rejection rule (a naive and very strict):
# "Reject H0 when the sample mean is smaller than 9.95 or larger than 10.05"
#
################################################################################

# The sample mean is
mean(X)

# The logic variable below takes the vale "TRUE" if we reject H0 based on the
# rule in play or "FALSE" otherwise.
Reject <- 9.95 > mean(X) | mean(X) > 10.05
Reject

# Here I illustrate 
hist(X, col="lightblue", freq=F, 
     main=paste(N, "simulated N(", mu, ",", sigma2, ")", "observations"))
abline(v=c(9.95, 10.05), col="red", lwd=4, lty=3)
abline(v=mean(X), col="blue", lwd=5, lty=1)
# plot(Density, xlim=c(mu-3.5*sigma , mu+3.5*sigma),lwd=2, col="grey", add=T)
legend("topright", legend=c("Sample mean","Limits rejection region"),
       col=c("blue", "red"), lwd=c(5,4), lty=c(1,3), bty="n"
       )

# Here I repeat this procedure 4 times (generating a new X every time)
par(mfrow=c(2,2))
for(i in 1:4){
  X <- rnorm(N, mean=mu, sd=sigma)
  hist(X, col="lightblue", freq=F, main="")
  abline(v=c(9.95, 10.05), col="red", lwd=4, lty=3)
  abline(v=mean(X), col="blue", lwd=5, lty=1)
  plot(Density, xlim=c(mu-3.5*sigma , mu+3.5*sigma),lwd=2, col="grey", add=T)
}
par(mfrow=c(1,1))

# Next, I repeat the procedure 100,000 times and keep track of the results
# in a logical vector (called "Results")

Ntests <- 100000
Results <- logical(Ntests)
for(i in 1:Ntests){
  X <- rnorm(N, mean=mu, sd=sigma)
  Reject <- 9.95 > mean(X) | mean(X) > 10.05
  Results[i] <- Reject
}

# This displays the results of this "in silico experiment
table(Results)
# and here I calculate the proportion of rejection of H0
# (detail: When we make calculations with the logial vector, R converts the 
# entries of the logical vector into 0 when FALSE and 1 when TRUE, so that 
# when we calculate the mean of this vector we get the proportion of TRUEs)
mean(Results)

# Next, we build a less strict rejection rule.
# Second rejection rule (a less strict):
# "Reject H0 when the sample mean is smaller than 9.9 or larger than 10.1"

mean(X)
Reject <- mean(X) < 9.9 | mean(X) > 10.1
Reject

hist(X, col="lightblue", freq=F, main=paste(N, "simulated N(", mu, ",", sigma2, ")", "observations"))
abline(v=c(9.9, 10.1), col="red", lwd=4, lty=3)
abline(v=mean(X), col="blue", lwd=5, lty=1)
# plot(Density, xlim=c(mu-3.5*sigma , mu+3.5*sigma),lwd=2, col="grey", add=T)
legend("topleft", legend=c("Sample mean","Limits of the rejection region"),
       col=c("blue", "red"), lwd=c(5,4), lty=c(1,3), bty="n"
)

# We perform again the test 100,000 times and keep track of the results. We 
# keep track also of the sample means obtained in each of the 100,000 
# simulations (so we will be able to change the rejection rule and see what
# happens, without having to simulate 100,000 tests again)
Ntests <- 100000
Results <- logical(Ntests)
Means <- numeric(Ntests)
for(i in 1:Ntests){
  X <- rnorm(N, mean=mu, sd=sigma)
  Means[i] <- mean(X)
  Reject <- mean(X) < 9.9 | mean(X) > 10.1
  Results[i] <- Reject
}

# Here are the results of the 100,000 tests
table(Results)
mean(Results)

# Since we kept the sample means in the vector "Means" we might calculate
# the proportion of rejection as below
mean(Means < 9.9 | Means > 10.1)

# Alternatively,
delta <- 0.1
mean(Means < mu-delta | Means > mu+delta)

# Changing the rejection rule

delta <- 0.15
mean(Means < mu-delta | Means > mu+delta)

# I will estimate the probability of rejection for several rejection rules.
# I express the rejection rules in terms of how many standard deviations 
# (i.e. sigmas) divided by the suqre roots of the number of observations
# that we admit the sample mean to depart from the hypothised theoretical
# mean we admit, before reject the null hypothesis. I will denote this 
# number of standard deviations by "k".
k <- 1
delta <- k*sigma/sqrt(N); delta
mean(Means < mu-delta | Means > mu+delta)

k <- 1
mean(Means < mu-k*sigma/sqrt(N) | Means > mu+k*sigma/sqrt(N))

k <- 1.5
mean(Means < mu-k*sigma/sqrt(N) | Means > mu+k*sigma/sqrt(N))

k <- 1.96
mean(Means < mu-k*sigma/sqrt(N) | Means > mu+k*sigma/sqrt(N))

# Next, I will calculate the probability of rejecting H0 when varying k
# in a range of values and then plot the probability of rejecting H0
# as a function of k.
Ks <- seq(from=0.5, to=3, by=0.01)

( Ks <- seq(from=0.5, to=3, by=0.01) )

length(Ks)

alpha <- numeric(length(Ks))
for(i in 1:length(Ks)){
  k <- Ks[i]
  alpha[i] <- mean( Means < mu-k*sigma/sqrt(N) | Means > mu+k*sigma/sqrt(N) )
}

plot(Ks, alpha, type="l", lwd=3, 
     xlab="Distance from mu in standard deviations of the sample mean")
abline(h=c(0.1, 0.05, 0.01), lty=c(2,3,4))

plot(Ks, alpha, type="l", lwd=3, 
     xlab="Distance from mu in standard deviations of the sample mean")
segments(x0=0, y0=0.05, x1=1.96, y1=0.05, lty=2)
segments(x0=1.96, y0=0, x1=1.96, y1=0.05, lty=2)
abline(h=0)

################################################################################
# Here are some theoretical considerations:
# If X ~ N(mu, sigma2), then the sample mean ~ N(mu, sigma2/N)

# Then (sample mean - mu) / sqrt(sigma2/N) ~ N (0 , 1) i.e.

#  sqrt(N) (sample mean - mu) / sigma ~ N (0 , 1)

# P(sqrt(N) (sample mean - mu) / sigma <= 1.96) = Phi (1.96) approx 0.975

pnorm(-1.96)

qnorm(0.025)

#  P(sqrt(N) (sample mean - mu) / sigma > 1.96) approx 0.025

# But sqrt(N) (sample mean - mu) / sigma > 1.96
# <=> sample mean  > 1.96 * sigma/sqrt(N)
# and
# P(sample mean  > 1.96 * sigma/sqrt(N) ) approx 0.025

# By symmetry,
# P(sample mean  < -1.96 * sigma/sqrt(N) ) approx 0.025
# 
# Therefore, if X ~ N(mu, sigma2) and we use k=1.96
# P(reject H0) approx 0.025 + 0.025 = 0.05 = 5%

# Analogously, if we want a test of level 1% we use k such that
# the probability of a standard normal distributed random variable
# be larger than k is 0.5%, i.e. 
qnorm(0.005, lower.tail=F)
# 2.575829

################################################################################
# Studying the power of the test
################################################################################
# 
# In hypothesis tests there are two possible errors we might make:
# 
# Type I error: reject the null hypothesis when the null hypothesis is cirrect
#
# or
#
# Type II error: accept the null hypothesis when the null
#                huypothesis is wrong
#
# Next we will use a fixed rejection rule, but change the value of mu
# i.e. departure from the null hypothesis H0: E(X) = 10

k <- 1.96
muH0 <- 10
mu <- 10.01
sigma2 <- 1
sigma <- sqrt(sigma2)
N <- 100

X <- rnorm(N, mean=mu, sd=sigma)
( Reject <- mean(X) < muH0-k*sigma/sqrt(N) | mean(X) > muH0+k*sigma/sqrt(N) )

Ntests <- 10000
Results <- logical(Ntests)
for(i in 1:Ntests){
  X <- rnorm(N, mean=mu, sd=sigma)
  Reject <- mean(X) < muH0-k*sigma/sqrt(N) | mean(X) > muH0+k*sigma/sqrt(N)
  Results[i] <- Reject
}

mean(Results)

mu <- 10.05
Ntests <- 10000
Results <- logical(Ntests)
for(i in 1:Ntests){
  X <- rnorm(N, mean=mu, sd=sigma)
  Reject <- mean(X) < muH0-k*sigma/sqrt(N) | mean(X) > muH0+k*sigma/sqrt(N)
  Results[i] <- Reject
}
mean(Results)

mu <- 10.1
Ntests <- 1000
Results <- logical(Ntests)
for(i in 1:Ntests){
  X <- rnorm(N, mean=mu, sd=sigma)
  Reject <- mean(X) < muH0-k*sigma/sqrt(N) | mean(X) > muH0+k*sigma/sqrt(N)
  Results[i] <- Reject
}
mean(Results)

( Mus <- seq(from=muH0-5*sigma/sqrt(N), to=muH0+5*sigma/sqrt(N), by=1/100) )
length(Mus)
Ntests <- 10000
Beta <- numeric(length(Mus))
for(j in 1:length(Mus)){
  for(i in 1:Ntests){
    X <- rnorm(N, mean=Mus[j], sd=sigma)
    Reject <- mean(X) < muH0-k*sigma/sqrt(N) | mean(X) > muH0+k*sigma/sqrt(N)
    Results[i] <- Reject
  }
  Beta[j] <- mean(Results) 
}

# Ploting the power function
plot(Mus, Beta, 
     type="l",
     lwd=4,
     main="Power function",
     ylab=expression(beta),
     xlab=expression(mu)
)
abline(h=0.05, lty=2)
abline(v=muH0)

Beta0.05 <- Beta

# Power function varying the significance level
qnorm(c(0.05, 0.025, 0.005), lower.tail=F)

k <- 1.644854 # Setting the significance level to 10%
Mus <- seq(from=muH0-5*sigma/sqrt(N), to=muH0+5*sigma/sqrt(N), by=1/100) 
Ntests <- 10000
Beta <- numeric(length(Mus))
for(j in 1:length(Mus)){
  for(i in 1:Ntests){
    X <- rnorm(N, mean=Mus[j], sd=sigma)
    Reject <- mean(X) < muH0-k*sigma/sqrt(N) | mean(X) > muH0+k*sigma/sqrt(N)
    Results[i] <- Reject
  }
  Beta[j] <- mean(Results) 
}

Beta0.10 <- Beta

plot(Mus, Beta0.10, 
     type="l",
     lwd=4,
     main="Power function",
     ylab=expression(beta),
     xlab=expression(mu)
)


############

k <- 2.575829 # Setting the significance level to 1%
Mus <- seq(from=muH0-5*sigma/sqrt(N), to=muH0+5*sigma/sqrt(N), by=1/100) 
Ntests <- 10000
Beta <- numeric(length(Mus))
for(j in 1:length(Mus)){
  for(i in 1:Ntests){
    X <- rnorm(N, mean=Mus[j], sd=sigma)
    Reject <- mean(X) < muH0-k*sigma/sqrt(N) | mean(X) > muH0+k*sigma/sqrt(N)
    Results[i] <- Reject
  }
  Beta[j] <- mean(Results) 
}

Beta0.01 <- Beta

#####
# pdf("~/Desktop/Plot.pdf")
plot(Mus, Beta0.05, 
     type="l",
     lwd=10,
     main="Power function",
     ylab=expression(beta),
     xlab=expression(mu),
     ylim=c(0,1),
     col="gray"
)
points(Mus, Beta0.10, 
       type="l",
       lwd=10,
       col="yellow"
)
points(Mus, Beta0.01, 
       type="l",
       lwd=10,
       col="lightblue"
)
segments(x0=0, y0=0.05, x1=muH0, y1=0.05, lty=2)
segments(x0=0, y0=0.10, x1=muH0, y1=0.10, lty=2, col="orange")
segments(x0=0, y0=0.01, x1=muH0, y1=0.01, lty=2, col="blue")
segments(x0=0, y0=0, x1=muH0, y1=0)
abline(v=muH0)
legend("bottomright", 
       legend=c("1% significance", "5% significance", "10% significance"),
       lwd=15, 
       col=c("lightblue","gray","yellow"),
       bty="n"
)
# dev.off()

# ###########


# Theoretical calculations of the power functions
#
# Reject H0 when mean
#
# "Reject H0 <=> SM < muH0 - k*sigma/sqrt(N) or SM > muH0 + k*sigma/sqrt(N)"
#
# where SM is the sample mean, E is the actual value of the E(X)
# P/E (SM < muH0 - k*sigma/sqrt(N)) = P/E (SM -E < muH0 - k*sigma/sqrt(N) -E)
#     =  P/E ((sqrt(N)/sigma)*(SM -E) < (sqrt(N)/sigma)* (muH0 - k*sigma/sqrt(N) -E) )
#     = Phi(sqrt(N)/sigma)* (muH0 - k*sigma/sqrt(N) -E)
#     = Phi(sqrt(N)/sigma)* (muH0 - E) - k),
#
# where P/E is the probability assuming the E(X)=E and Phi is the accumulated
# distribution function of a standard normal distribution.
#
# Analougous calculations yields,
# 
# P/E (SM > muH0 + k*sigma/sqrt(N)) = 1 - Phi(sqrt(N)/sigma)* (muH0 - E) + k),
#
# therefore,
#
# P/E(rejecting H0) = 1 - Phi(sqrt(N)/sigma)* (muH0 - E) + k) + Phi(sqrt(N)/sigma)* (muH0 - E) - k)
#
# which can be calculated using the function bellow.
#

Power <- function(E) {
  z <- sqrt(N)/sigma *(muH0-E)
  1-pnorm(z+k)+pnorm(z-k)
}

# Theoretical power at a significance level of 5%
k <- 1.96
muH0 <- 10
sigma2 <- 1
sigma <- sqrt(sigma2)
N <- 100
Power(Mus)
points(Mus, Power(Mus), type="l", lwd=2)

# Theoretical power at a significance level of 1%
k <- 2.575829
muH0 <- 10
sigma2 <- 1
sigma <- sqrt(sigma2)
N <- 100
points(Mus, Power(Mus), type="l", lwd=2, col="blue")

# Theoretical power at a significance level of 10%
k <- 1.644854
muH0 <- 10
sigma2 <- 1
sigma <- sqrt(sigma2)
N <- 100
points(Mus, Power(Mus), type="l", lwd=2, col="red")

# pdf("~/Desktop/Plot.pdf")
plot(Mus, Beta0.05, 
     type="l",
     lwd=10,
     main="Power function",
     ylab=expression(beta),
     xlab=expression(mu),
     ylim=c(0,1),
     col="gray"
)
points(Mus, Beta0.10, 
       type="l",
       lwd=10,
       col="yellow"
)
points(Mus, Beta0.01, 
       type="l",
       lwd=10,
       col="lightblue"
)
segments(x0=0, y0=0.05, x1=muH0, y1=0.05, lty=2)
segments(x0=0, y0=0.10, x1=muH0, y1=0.10, lty=2, col="orange")
segments(x0=0, y0=0.01, x1=muH0, y1=0.01, lty=2, col="blue")
segments(x0=0, y0=0, x1=muH0, y1=0)
abline(v=muH0)

k <- 1.96
muH0 <- 10
sigma2 <- 1
sigma <- sqrt(sigma2)
N <- 100
points(Mus, Power(Mus), type="l", lwd=2)

# Theoretical power at a significance level of 1%
k <- 2.575829
muH0 <- 10
sigma2 <- 1
sigma <- sqrt(sigma2)
N <- 100
points(Mus, Power(Mus), type="l", lwd=2, col="blue")

# Theoretical power at a significance level of 10%
k <- 1.644854
muH0 <- 10
sigma2 <- 1
sigma <- sqrt(sigma2)
N <- 100
points(Mus, Power(Mus), type="l", lwd=2, col="red")


legend("bottomright", cex=0.75,
       legend=c(
                "1% significance (theoretical)", 
                "5% significance (theoretical)", 
                "10% significance (theoretical)",
                "1% significance (Simulated)", 
                "5% significance (Simulated)", 
                "10% significance (Simulated)"
               ),
       lwd=c(2,2,2,15,15,15), 
       col=c("blue", "black", "red", "lightblue","gray","yellow"),
       bty="n"
)
# dev.off()

##################################
### Now we vary N, setting alpha =5%
##################################

N <- 100
( Mus <- seq(from=muH0-3.4*sigma/sqrt(N), to=muH0+3.4*sigma/sqrt(N), by=1/100) ) 
k <- 1.96
muH0 <- 10
sigma2 <- 1
sigma <- sqrt(sigma2)
N <- 5
plot(Mus, Power(Mus), 
     type="l", 
     lwd=5,      
     main="Power function",
     ylab=expression(beta),
     xlab=expression(mu),
     ylim=c(0,1),
     col="yellow"
)
abline(v=muH0)
segments(x0=0, y0=0.05, x1=muH0, y1=0.05, lty=2)
segments(x0=0, y0=0, x1=muH0, y1=0)

N <- 10
points(Mus, Power(Mus), 
       type="l", 
       lwd=5,      
       main="Power function",
       col="orange"
)

N <- 20
points(Mus, Power(Mus), 
       type="l", 
       lwd=5,      
       main="Power function",
       col="red"
)

N <- 50
points(Mus, Power(Mus), 
     type="l", 
     lwd=5,      
     main="Power function",
     col="blue"
)

N <- 100
points(Mus, Power(Mus), 
       type="l", 
       lwd=5,      
       main="Power function",
       col="black"
)

N <- 1000
points(Mus, Power(Mus), 
       type="l", 
       lwd=5,      
       main="Power function",
       col="gray"
)

text(10.3, 0.98, "N=1000", col="gray", cex=0.8)
text(10.3, 0.73, "N=100", col="black", cex=0.8)
text(10.3, 0.46, "N=50", col="blue", cex=0.8)
text(10.3, 0.22, "N=20", col="red", cex=0.8)
text(10.3, 0.13, "N=10", col="orange", cex=0.8)
text(10.3, 0.08, "N=5", col="yellow", cex=0.8)

################################################################################
#                                                                              #
# The notion of p-value                                                        #
#                                                                              #
################################################################################
muH0 <- 10
sigma2 <- 1
sigma <- sqrt(sigma2)
N <- 100
# Have some samples, say X as below
X <- rnorm(N, mean=muH0, sd=sigma)

# For 5% signnificance
k <- 1.96
Reject <- mean(X) < muH0-k*sigma/sqrt(N) | mean(X) > muH0+k*sigma/sqrt(N)
Reject

# Now we vary k
mean(X)

Ks <- seq(from=0.01, to=2, by=0.0001)
length(Ks)
Results <- logical(length(Ks))
for(i in 1:length(Ks)){
  k <- Ks[i]
  Results[i] <- mean(X) < muH0-k*sigma/sqrt(N) | mean(X) > muH0+k*sigma/sqrt(N)
}

Results[1]
as.numeric(Results[1])

Results[length(Ks)]
as.numeric(Results[length(Ks)])

plot(1:length(Ks), Results, type="l", lwd=5)

which.min(Results)

Results[which.min(Results)-1]
Results[which.min(Results)]

# This is the "last" k for which we still reject H0
K <- Ks[which.min(Results)-1]; K
# that is, the less strict value that still rejects (well, we are making a 
# search for some of the possible values of k, the values contained in the 
# vector Ks). Now, what is the probability of wrongly rejecting H0 when
# we define the rejection rule defined with THIS (last rejecting) value of k?
# Let us call K this (special) value of k (i.e. the largest value of k that
# defines a rejection rule that rejects H0).
# That is, what is the significance level obtained when using K?
# Well, 
# P/H0 (reject H0) = 
# = P/H0(mean(X) < muH0-K*sigma/sqrt(N) | mean(X) > muH0+K*sigma/sqrt(N)) 
# = P/H0(mean(X) < muH0-K*sigma/sqrt(N)) + P/H0(mean(X) > muH0+K*sigma/sqrt(N))
# = P/H0( sqrt(N)*(mean(X) - muH0)/sigma < -K) +
#   P/H0( sqrt(N)*(mean(X) - muH0)/sigma > +K)
# = Phi (-K) + 1 - Phi(K) = 2*Phi(-K)
#
# Therefore, the the significance level obtained when using K is,
2*pnorm(-K)
# This value is (approximately) the p-value (the exact p-value is obtained using
# the exact K, not an approximation as we did above).

# Consider K such mean(X) = muH0-K*sigma/sqrt(N) . Solving for K we obtain
# K = sqrt(N)* (mean(X) - muH0)/sigma
K <- sqrt(N)* (mean(X) - muH0)/sigma
K
# and then the p-value is
2*pnorm(-K)

#####################

# THE END