###########################################################################
### Statistical methods and data analysis in developmental psychology
### A.Y. 2021/2022
### prof. Antonio Calcagni' (antonio.calcagni@unipd.it)
###########################################################################


### CONTENTS ###########################################
## (A) Data
## (B) Model building
## (C) Diagnostics
## (D) Correcting heteroscedasticity
########################################################


# Set environment ---------------------------------------------------------
## Start from a clean workspace (removes every object in the global environment).
rm(list = ls())

## The lab folder below is specific to the instructor's machine. We switch to it only when it
## exists, so that the script can also be run from a local copy of the lab folder
## (the rest of the script reads 'utilities.R' and 'data/data5.Rda' relative to the working directory).
local({
  lab_dir <- "/home/antonio/MEGA/Lavoro_sync/Didattica/2021_2022/smda/labs/"
  if (dir.exists(lab_dir)) {
    setwd(lab_dir)
  } else {
    message("Lab folder not found; using the current working directory: ", getwd())
  }
})

## Course helper functions (presumably where influential_plot() and posterior_pcheck_Normal(),
## used in the diagnostics section, are defined -- TODO confirm).
source("utilities.R")


# (A) Data ----------------------------------------------------------------
## Data refer to n=150 participants who were recruited to evaluate whether a new school program increased cognitive performance
## w.r.t. a standard school program. The dataset contains the following variables:
#group: Binary variable indicating whether a participant belongs to the experimental program ("S") or to the control group ("C")
#persTest1: Total score on a personality test
## The goal is to predict 'cogscore' as a function of the other variables.

## NOTE: the outcome analysed in this lab is 'cogscore'; the earlier remark about predicting rts under sleep deprivation w.r.t. personality measures belongs to a different dataset.
## Load the dataset; the .Rda file is expected to provide the data frame 'datax'.
load("data/data5.Rda")

## Number of rows (sample size) and number of columns minus one
## (presumably the number of predictors, i.e. all columns but the outcome).
n <- NROW(datax)
J <- NCOL(datax) - 1

## Relevel the categorical variable 'group' so that the control group ("C") is the reference level.
datax$group <- relevel(datax$group, ref = "C")


# (B) Model building ------------------------------------------------------
## We have two strategies in this case: (i) use a top-down approach through which the
## model building is performed from a theoretical viewpoint (e.g., we may ask the designers of the experiment)
## or (ii) use a statistical procedure which will select the best model according to a given criterion (e.g., minimum-AIC).
## Let's consider a mixed strategy, where a first model (baseline) is defined by the theory underlying the experimental design
## which will be in turn improved by a statistical procedure. In particular, we can start with a first additive model 
## and verify later whether high-order terms (e.g., interactions) will improve the overall fit.
## Baseline (additive) model: 'cogscore' as a function of 'group' and 'persTest1'.
mod0 <- lm(formula = cogscore ~ group + persTest1, data = datax)

## Extended model: the baseline plus the 'group:persTest1' interaction.
## The argument of update() is named 'formula.' (with a trailing dot); it is spelled out in full
## so that the call does not rely on partial argument matching.
mod1 <- update(object = mod0, formula. = ~ . + group:persTest1)

## F-test for the two nested models (H0: the interaction term does not improve the fit).
anova(mod0, mod1, test = "F")
summary(mod1)
# The adjusted R2 index of the model shows an overall satisfactory fit.
# Since the model contains the interaction 'group:persTest1' (default treatment contrasts, reference level: C),
# the coefficients of 'group' and 'persTest1' are conditional effects:
# - Participants in the experimental group (S) show higher levels of 'cogscore' w.r.t. participants in the control group (C: Intercept)
#   (b=0.261, SE=1.234); this is the S-C difference when 'persTest1' is equal to zero.
# - The outcome 'cogscore' decreased as a function of the predictor 'persTest1' (b=-2.358, SE=0.149); this is the slope in the control group (C).
# - The interaction term 'group:persTest1' indicates that the relation between 'cogscore' and 'persTest1' differs between the two groups,
#   with group:S showing a less strong change than group:C (check the sign of the interaction coefficient in the summary above).

## Effect displays (package 'effects').
## NOTE: 'group' and 'persTest1' are both involved in the interaction, so the first two displays
## average over the other predictor; the third display is the one that matches the fitted model.
plot(effects::effect(mod = mod1, term = "group"))
plot(effects::effect(mod = mod1, term = "persTest1"))
plot(effects::effect(mod = mod1, term = "group:persTest1"))


# (C) Diagnostics ---------------------------------------------------------
## Each diagnostic is drawn on its own graphics device. dev.new() opens the default device of the
## current platform (x11() is not available everywhere, e.g. on macOS without XQuartz);
## noRStudioGD = TRUE asks for a separate device instead of the RStudio plot pane.

## Normality of residuals (which = 2: Q-Q plot) and homoscedasticity (which = 3: scale-location plot),
## drawn side by side.
dev.new(noRStudioGD = TRUE)
par(mfrow = c(1, 2))
plot(mod1, which = c(2, 3), bty = "n")

## Analysis of leverage points
dev.new(noRStudioGD = TRUE)
car::leveragePlots(mod1)

## Influence plot 1 (package 'car')
dev.new(noRStudioGD = TRUE)
car::influencePlot(mod1, bty = "n")

## Influence plot 2 (helper function not defined in this file; presumably from 'utilities.R')
dev.new(noRStudioGD = TRUE)
influential_plot(fitted_model = mod1)

## Posterior predictive check under the Normal linear model
## (helper function not defined in this file; presumably from 'utilities.R')
dev.new(noRStudioGD = TRUE)
posterior_pcheck_Normal(fitted_model = mod1)

# Overall, the fitted model has some problems w.r.t. normality of residuals (see the tails of the qq-plot) but,
# most importantly, it shows non-constant variance (heteroscedasticity is present). As a consequence, the influence plots
# also indicate that some observations are currently influencing the fit of the model (even though this behavior
# should be interpreted in light of the heteroscedasticity). The posterior-check plot shows that the predictive distributions do not completely
# resemble the observed one (indeed, the Normal linear model does not take into account non-constant variance).

## Heteroscedasticity:
## (dev.new() opens the default graphics device of the platform; noRStudioGD = TRUE asks for a
## separate device instead of the RStudio plot pane.)
dev.new(noRStudioGD = TRUE)
plot(performance::check_heteroscedasticity(mod1)) # similar to: plot(mod1, which = 3)
# A non-random pattern is clearly visible in the plot.
## We can run the Breusch-Pagan test under the null hypothesis H0 that there is homoscedasticity:
lmtest::bptest(mod1)
# Of course, the test rejects H0: Heteroscedasticity (non-constant variance) is present in the data.
## We may also plot the Pearson residuals as a function of the predictors.
## The predictors are taken from the model frame of 'mod1' (i.e., the rows actually used in the fit),
## so that they always have the same length and order as the residuals.
res_pearson <- residuals(mod1, type = "pearson")
mod1_frame <- model.frame(mod1)
dev.new(noRStudioGD = TRUE)
par(mfrow = c(1, 2))
plot(res_pearson ~ mod1_frame$group, frame = FALSE, xlab = "group", ylab = "Pearson residuals")
plot(mod1_frame$persTest1, res_pearson, bty = "n", xlab = "persTest1", ylab = "Pearson residuals")
# We can notice that residuals are higher in group:C and, most interestingly, they show a (quasi non-linear) pattern w.r.t. the predictor 'persTest1'



# (D) Correct heteroscedasticity ------------------------------------------
## Heteroscedasticity can be corrected using a number of strategies, e.g.:
## 1. Transform the outcome variable (see: Fox, par. 12.2). For instance, for
## always positive outcomes, the Box-Cox transformation may be used -- see: caret::BoxCoxTrans().
## 2. Computing the standard errors (SEs) of the estimated regression coefficients using robust estimation techniques.
## 3. Using Generalized Least Squares (GLS) methods where the variance term is modeled according to a specific submodel (e.g., exponential)

## Robust estimators ('HC1') using lm_robust() from the 'estimatr' library
library(estimatr)
## The robust fit is stored under its own name ('mod1_hc1'): the name 'mod1b' is also assigned to one of
## the GLS fits further down in this script, which would otherwise overwrite the robust model.
mod1_hc1 <- estimatr::lm_robust(formula = cogscore ~ group * persTest1, data = datax, se_type = "HC1")
summary(mod1) # for the sake of comparison (classical standard errors)
summary(mod1_hc1) # same model formula, heteroscedasticity-consistent (HC1) standard errors

## GLS: Modeling the pattern of non-constant variance via gls() and the variance functions
## ('varFunc' classes, e.g. varExp(), varConstProp(), varPower()) from the 'nlme' library
library(nlme)
mod1a <- nlme::gls(model = cogscore ~ group * persTest1, data = datax, weights = varExp())
mod1b <- nlme::gls(model = cogscore ~ group * persTest1, data = datax, weights = varConstProp())
#mod1c=nlme::gls(model = cogscore~group*persTest1,data=datax,weights = varPower())
## Compare the two variance models (the mean structure is the same): the lower the AIC, the better.
AIC(mod1a, mod1b)
# The varExp() model of the variance seems to be the best choice in this case.
summary(mod1a)
## Pearson (standardized) residuals vs. fitted values for the selected GLS model, on a new graphics
## device (dev.new() opens the default device of the platform; noRStudioGD = TRUE asks for a
## separate device instead of the RStudio plot pane).
dev.new(noRStudioGD = TRUE)
plot(fitted(mod1a), residuals(mod1a, type = "pearson"), bty = "n",
     xlab = "Fitted values", ylab = "Pearson residuals")
abline(h = 0, lty = 2)



