################################################################################
#
#   Stat-Tutorial-11-AnscombeQuartet
#
#   Presents four pairs of variables that produce the same linear regression,
#   with a relatively high coefficient of determination (r2), BUT exhibit very
#   different relationships (some definitely not linear), as is apparent from
#   inspection of the scatter-plots.
#
#   Data presented by the statistician Anscombe.
#   Ref. Anscombe, F. J. (1973). "Graphs in Statistical Analysis".
#        American Statistician. 27 (1): 17–21.
#
#   The data is known as the "Anscombe Quartet".
#
#   R. Labouriau
#   Last revision: Spring 2020
#
#   Copyright © 2019 by Rodrigo Labouriau
################################################################################

M <- matrix(c(10, 10, 10,  8,  8.04, 9.14,  7.46,  6.58,
               8,  8,  8,  8,  6.95, 8.14,  6.77,  5.76,
              13, 13, 13,  8,  7.58, 8.74, 12.74,  7.71,
               9,  9,  9,  8,  8.81, 8.77,  7.11,  8.84,
              11, 11, 11,  8,  8.33, 9.26,  7.81,  8.47,
              14, 14, 14,  8,  9.96, 8.10,  8.84,  7.04,
               6,  6,  6,  8,  7.24, 6.13,  6.08,  5.25,
               4,  4,  4, 19,  4.26, 3.10,  5.39, 12.50,
              12, 12, 12,  8, 10.84, 9.13,  8.15,  5.56,
               7,  7,  7,  8,  4.82, 7.26,  6.42,  7.91,
               5,  5,  5,  8,  5.68, 4.74,  5.73,  6.89),
            byrow = TRUE, ncol = 8)
colnames(M) <- c("x1", "x2", "x3", "x4", "y1", "y2", "y3", "y4")
A <- as.data.frame(M)   # the column names of M carry over to A
str(A)
print(A)

# Here we perform four different linear regressions: all of them produce the
# same coefficient of determination (r2), namely about 0.67, which would
# indicate, according to some authors, a reasonable adjustment of the
# regression line; but have a look at the plots of those equally good
# regressions ... (a compact side-by-side comparison of the four fits is
# sketched at the end of this part)
M1 <- lm(y1 ~ x1, data = A)
summary(M1)

M2 <- lm(y2 ~ x2, data = A)
summary(M2)

M3 <- lm(y3 ~ x3, data = A)
summary(M3)

M4 <- lm(y4 ~ x4, data = A)
summary(M4)

# This function calculates the coefficient of determination (r2) of a
# regression of the response X on the covariate Y
r2Calc <- function(X, Y, digits = 4){
  return(round(summary(lm(X ~ Y))$r.squared, digits = digits))
}

# Here we make some interesting plots of the "EQUALLY good" regressions ...
Opar <- par(mfrow = c(2, 2))

( Cor <- round(cor(x = A$x1, y = A$y1), digits = 4) )
( r2  <- r2Calc(A$y1, A$x1) )
plot(A$x1, A$y1, xlab = "X", ylab = "Y", pch = 19,
     sub = paste("Pearson corr. =", Cor, " r-squared =", r2))
abline(M1, lwd = 2)

( Cor <- round(cor(x = A$x2, y = A$y2), digits = 4) )
( r2  <- r2Calc(A$y2, A$x2) )
plot(A$x2, A$y2, xlab = "X", ylab = "Y", pch = 19,
     sub = paste("Pearson corr. =", Cor, " r-squared =", r2))
abline(M2, lwd = 2)

( Cor <- round(cor(x = A$x3, y = A$y3), digits = 4) )
( r2  <- r2Calc(A$y3, A$x3) )
plot(A$x3, A$y3, xlab = "X", ylab = "Y", pch = 19,
     sub = paste("Pearson corr. =", Cor, " r-squared =", r2))
abline(M3, lwd = 2)

( Cor <- round(cor(x = A$x4, y = A$y4), digits = 4) )
( r2  <- r2Calc(A$y4, A$x4) )
plot(A$x4, A$y4, xlab = "X", ylab = "Y", pch = 19,
     sub = paste("Pearson corr. =", Cor, " r-squared =", r2))
abline(M4, lwd = 2)

par(Opar)

################################################################################
# Here is another example of a linear regression classified as very good when
# using the coefficient of determination as criterion. Indeed, an r2 of 0.99!
X <- rep(1, 100)
set.seed(1324)
Y <- rnorm(100, mean = 1, sd = 0.09)
X <- c(X, 10)   # appending a single extreme point at (10, 10)
Y <- c(Y, 10)
plot(X, Y)
summary(lm(Y ~ X))
abline(lm(Y ~ X))
r2Calc(Y, X)
# plot(X, residuals(lm(Y ~ X)))
################################################################################
# THE END!
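
################################################################################
# Addendum: a compact comparison of the four fits above (a minimal sketch,
# assuming the models M1, ..., M4 are still in the workspace). Putting the
# estimated coefficients and r2 values side by side makes the point of the
# quartet explicit: all four regressions give (essentially) the same fitted
# line, intercept close to 3 and slope close to 0.5, with r2 close to 0.67.
Models <- list(M1, M2, M3, M4)
( Coefs <- sapply(Models, coef) )   # one column per model: intercept and slope
( R2s <- sapply(Models, function(m) summary(m)$r.squared) )
round(rbind(Coefs, r.squared = R2s), digits = 4)
# For a simple linear regression, r2 is exactly the squared Pearson
# correlation, which is why Cor^2 matches r2 in the plots above:
all.equal(summary(M1)$r.squared, cor(A$x1, A$y1)^2)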
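
################################################################################
# A minimal sketch of why the r2 of 0.99 in the last example above is
# misleading: the hat values (leverages, from the base-R function hatvalues)
# show that the fit is driven almost entirely by the single added point
# at (10, 10).
OutlierFit <- lm(Y ~ X)            # X and Y as constructed in the last example
which.max(hatvalues(OutlierFit))   # observation 101, the added point
max(hatvalues(OutlierFit))         # leverage close to 1: one point dominates
plot(hatvalues(OutlierFit), type = "h",
     xlab = "Observation", ylab = "Leverage (hat value)")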
################################################################################
#
#   Optional part:
#   Here I will modify the data of the second regression a bit
#
################################################################################

# Here I re-run the second regression
X <- A$x2
Y <- A$y2
plot(X, Y)
cor.test(X, Y)
LinRegr <- lm(Y ~ X)
# summary(LinRegr)
r2Calc(Y, X)
abline(LinRegr)
( CC <- coef(LinRegr) )

# Making some residual analysis
Residuals <- residuals(LinRegr)
scatter.smooth(X, Residuals)
abline(h = 0, lty = 2)

# Constructing new derived data with a coefficient of determination = 0.9
K <- 0.471
RescaledRes <- Residuals * K
plot(X, RescaledRes)
abline(h = 0, lty = 2)
Yr <- CC[1] + CC[2]*X + RescaledRes
NewRegression <- lm(Yr ~ X)
plot(X, Yr)
r2Calc(Yr, X)
summary(NewRegression)
abline(NewRegression)
ResidualsNR <- residuals(NewRegression)
scatter.smooth(X, ResidualsNR)
abline(h = 0, lty = 2)

################################################################################
# THE (second) END!!
################################################################################
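
################################################################################
# Addendum to the optional part: a minimal sketch of one way to arrive at a
# constant such as K = 0.471 (a reconstruction, assuming the usual
# sums-of-squares decomposition, not a derivation given in the tutorial).
# Rescaling the residuals by K leaves the regression sum of squares (SSreg)
# unchanged and turns the residual sum of squares (SSres) into K^2 * SSres, so
#     r2_new = SSreg / (SSreg + K^2 * SSres),
# and a target value r2_new = Target is reached by
#     K = sqrt( SSreg * (1 - Target) / (Target * SSres) ).
SSreg <- sum((fitted(LinRegr) - mean(Y))^2)
SSres <- sum(residuals(LinRegr)^2)
Target <- 0.9
( Ksolved <- sqrt(SSreg * (1 - Target) / (Target * SSres)) )   # about 0.471
r2Calc(CC[1] + CC[2]*X + Ksolved * Residuals, X)               # indeed 0.9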