################################################################################
#
#   Stat-Tutorial-11-AnscombeQuartet
#
#   Presents four pairs of variables that produce the same linear regression,
#   with a relatively high coefficient of determination (r2), BUT exhibit very
#   different relationships (some definitely not linear), as is apparent from
#   inspection of the scatter-plots.
#
#   Data presented by the statistician Anscombe.
#   Ref. Anscombe, F. J. (1973). "Graphs in Statistical Analysis".
#        American Statistician. 27 (1): 17–21.
#
#   The data is known as the "Anscombe Quartet".
#
#   R. Labouriau
#   Last revision: Spring 2020
#
#   Copyright © 2019 by Rodrigo Labouriau
################################################################################

M <- matrix(c(10, 10, 10,  8,  8.04, 9.14,  7.46,  6.58,
               8,  8,  8,  8,  6.95, 8.14,  6.77,  5.76,
              13, 13, 13,  8,  7.58, 8.74, 12.74,  7.71,
               9,  9,  9,  8,  8.81, 8.77,  7.11,  8.84,
              11, 11, 11,  8,  8.33, 9.26,  7.81,  8.47,
              14, 14, 14,  8,  9.96, 8.10,  8.84,  7.04,
               6,  6,  6,  8,  7.24, 6.13,  6.08,  5.25,
               4,  4,  4, 19,  4.26, 3.10,  5.39, 12.50,
              12, 12, 12,  8, 10.84, 9.13,  8.15,  5.56,
               7,  7,  7,  8,  4.82, 7.26,  6.42,  7.91,
               5,  5,  5,  8,  5.68, 4.74,  5.73,  6.89),
            byrow = TRUE, ncol = 8)
colnames(M) <- c("x1", "x2", "x3", "x4", "y1", "y2", "y3", "y4")
A <- as.data.frame(M)   # the column names of M carry over to A
str(A)
print(A)

# Here we perform four different linear regressions: all of them produce the
# same coefficient of determination (r2), namely about 0.67, which would
# indicate, according to some authors, a reasonable adjustment of the
# regression line; but have a look at the plots of those equally good
# regressions ... (a compact side-by-side comparison of the four fits is
# sketched at the end of this part)
M1 <- lm(y1 ~ x1, data = A)
summary(M1)

M2 <- lm(y2 ~ x2, data = A)
summary(M2)

M3 <- lm(y3 ~ x3, data = A)
summary(M3)

M4 <- lm(y4 ~ x4, data = A)
summary(M4)

# This function calculates the coefficient of determination (r2) of a
# regression of the response X on the covariate Y
r2Calc <- function(X, Y, digits = 4){
  return(round(summary(lm(X ~ Y))$r.squared, digits = digits))
}

# Here we make some interesting plots of the "EQUALLY good" regressions ...
Opar <- par(mfrow = c(2, 2))

( Cor <- round(cor(x = A$x1, y = A$y1), digits = 4) )
( r2  <- r2Calc(A$y1, A$x1) )
plot(A$x1, A$y1, xlab = "X", ylab = "Y", pch = 19,
     sub = paste("Pearson corr. =", Cor, " r-squared =", r2))
abline(M1, lwd = 2)

( Cor <- round(cor(x = A$x2, y = A$y2), digits = 4) )
( r2  <- r2Calc(A$y2, A$x2) )
plot(A$x2, A$y2, xlab = "X", ylab = "Y", pch = 19,
     sub = paste("Pearson corr. =", Cor, " r-squared =", r2))
abline(M2, lwd = 2)

( Cor <- round(cor(x = A$x3, y = A$y3), digits = 4) )
( r2  <- r2Calc(A$y3, A$x3) )
plot(A$x3, A$y3, xlab = "X", ylab = "Y", pch = 19,
     sub = paste("Pearson corr. =", Cor, " r-squared =", r2))
abline(M3, lwd = 2)

( Cor <- round(cor(x = A$x4, y = A$y4), digits = 4) )
( r2  <- r2Calc(A$y4, A$x4) )
plot(A$x4, A$y4, xlab = "X", ylab = "Y", pch = 19,
     sub = paste("Pearson corr. =", Cor, " r-squared =", r2))
abline(M4, lwd = 2)

par(Opar)

################################################################################
# Here is another example of a linear regression classified as very good when
# using the coefficient of determination as criterion. Indeed, an r2 of 0.99!
X <- rep(1, 100)
set.seed(1324)
Y <- rnorm(100, mean = 1, sd = 0.09)
X <- c(X, 10)   # appending a single extreme point at (10, 10)
Y <- c(Y, 10)
plot(X, Y)
summary(lm(Y ~ X))
abline(lm(Y ~ X))
r2Calc(Y, X)
# plot(X, residuals(lm(Y ~ X)))
################################################################################
# THE END!
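
################################################################################
# Addendum: a compact comparison of the four fits above (a minimal sketch,
# assuming the models M1, ..., M4 are still in the workspace). Putting the
# estimated coefficients and r2 values side by side makes the point of the
# quartet explicit: all four regressions give (essentially) the same fitted
# line, intercept close to 3 and slope close to 0.5, with r2 close to 0.67.
Models <- list(M1, M2, M3, M4)
( Coefs <- sapply(Models, coef) )   # one column per model: intercept and slope
( R2s <- sapply(Models, function(m) summary(m)$r.squared) )
round(rbind(Coefs, r.squared = R2s), digits = 4)
# For a simple linear regression, r2 is exactly the squared Pearson
# correlation, which is why Cor^2 matches r2 in the plots above:
all.equal(summary(M1)$r.squared, cor(A$x1, A$y1)^2)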
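
################################################################################
# A minimal sketch of why the r2 of 0.99 in the last example above is
# misleading: the hat values (leverages, from the base-R function hatvalues)
# show that the fit is driven almost entirely by the single added point
# at (10, 10).
OutlierFit <- lm(Y ~ X)            # X and Y as constructed in the last example
which.max(hatvalues(OutlierFit))   # observation 101, the added point
max(hatvalues(OutlierFit))         # leverage close to 1: one point dominates
plot(hatvalues(OutlierFit), type = "h",
     xlab = "Observation", ylab = "Leverage (hat value)")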
################################################################################
#
#   Optional part:
#   Here I will modify the data of the second regression a bit
#
################################################################################

# Here I re-run the second regression
X <- A$x2
Y <- A$y2
plot(X, Y)
cor.test(X, Y)
LinRegr <- lm(Y ~ X)
# summary(LinRegr)
r2Calc(Y, X)
abline(LinRegr)
( CC <- coef(LinRegr) )

# Making some residual analysis
Residuals <- residuals(LinRegr)
scatter.smooth(X, Residuals)
abline(h = 0, lty = 2)

# Constructing new derived data with a coefficient of determination = 0.9
K <- 0.471
RescaledRes <- Residuals * K
plot(X, RescaledRes)
abline(h = 0, lty = 2)
Yr <- CC[1] + CC[2]*X + RescaledRes
NewRegression <- lm(Yr ~ X)
plot(X, Yr)
r2Calc(Yr, X)
summary(NewRegression)
abline(NewRegression)
ResidualsNR <- residuals(NewRegression)
scatter.smooth(X, ResidualsNR)
abline(h = 0, lty = 2)

################################################################################
# THE (second) END!!
################################################################################
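
################################################################################
# Addendum to the optional part: a minimal sketch of one way to arrive at a
# constant such as K = 0.471 (a reconstruction, assuming the usual
# sums-of-squares decomposition, not a derivation given in the tutorial).
# Rescaling the residuals by K leaves the regression sum of squares (SSreg)
# unchanged and turns the residual sum of squares (SSres) into K^2 * SSres, so
#     r2_new = SSreg / (SSreg + K^2 * SSres),
# and a target value r2_new = Target is reached by
#     K = sqrt( SSreg * (1 - Target) / (Target * SSres) ).
SSreg <- sum((fitted(LinRegr) - mean(Y))^2)
SSres <- sum(residuals(LinRegr)^2)
Target <- 0.9
( Ksolved <- sqrt(SSreg * (1 - Target) / (Target * SSres)) )   # about 0.471
r2Calc(CC[1] + CC[2]*X + Ksolved * Residuals, X)               # indeed 0.9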