###########################################################################
### Statistical methods and data analysis in developmental psychology
### A.Y. 2021/2022
### prof. Antonio Calcagni' (antonio.calcagni@unipd.it)
###########################################################################


### CONTENTS ###########################################
## (A) A simple linear model from scratch
### (A.1) Data
### (A.2) Model definition and estimation
### (A.3) Testing all the coefficients
### (A.4) Testing subset of coefficients (submodels)
### (A.5) Confidence intervals for mu
### (A.6) Prediction intervals for mu
########################################################


# Set environment ---------------------------------------------------------
# Start from an empty workspace so that objects left over from a previous
# session cannot interfere with this lab.
rm(list = ls())
# The lab folder below is machine-specific. Move into it only when it exists;
# otherwise keep the current working directory instead of stopping with
# "cannot change working directory". The relative path "data/data1.Rda" used
# later in this script is resolved against whichever directory is current.
# local() keeps the helper variable out of the (just cleaned) workspace.
local({
  lab_dir <- "/home/antonio/MEGA/Lavoro_sync/Didattica/2021_2022/smda/labs/"
  if (dir.exists(lab_dir)) {
    setwd(lab_dir)
  } else {
    message("Lab folder not found; keeping working directory: ", getwd())
  }
})



# (A) A simple linear model from scratch ----------------------------------

## (A.1) Data
## Reload the dataset already used in 'lab1.R'. The file is expected to
## provide the data frame 'datax', which is used throughout this lab.
load(file = "data/data1.Rda")
n <- NROW(datax)      # number of rows (observations)
J <- NCOL(datax) - 1  # number of columns minus one; used below as the number of predictors

## (A.2) Model definition and estimation
## The model is fitted with lm() rather than by hand, to keep things simple.
defMod <- eai ~ extrav + social  # additive model (no interaction term)
estMod <- lm(defMod, data = datax)
summary(estMod)                 # coefficient table and overall fit
confint(estMod, level = 0.95)   # 95% confidence intervals for the coefficients

## In the previous laboratory we have seen how to test regression coefficients 
## individually. Now we proceed by seeing how to use the omnibus test and how 
## to test subsets of coefficients.

## (A.3) Testing all the coefficients (omnibus test)
## In this case, do not forget that H0: beta=0 (all coeffs are zero except the intercept)
R2 <- summary(estMod)$r.squared
## The degrees of freedom are read from the fitted model itself rather than from
## the shape of the dataset: they stay correct even if lm() dropped incomplete rows
## or if the dataset contains columns that are not predictors of 'estMod'.
dfNum <- estMod$rank - attr(terms(estMod), "intercept") # number of tested slopes (J)
dfDen <- df.residual(estMod)                            # residual df (n-J-1)
W <- (R2 / (1 - R2)) * (dfDen / dfNum) # see slide 35, Module B
## p-value of the statistic under H0. The upper tail is requested directly:
## computing 1-pf(...) would round to exactly 0 for very large values of W.
pW <- pf(q = W, df1 = dfNum, df2 = dfDen, lower.tail = FALSE)
# In this case, H0 can be rejected. We recall that H0 implies the 'null model', i.e. the model with the intercept only.
# The same result is reported at the bottom of the output given by the summary() function.
# As we will see later, the omnibus test contrasts the 'null model' against the 'current model' as follows:
estMod0 <- lm(eai ~ 1, data = datax) # null model
anova(estMod0, estMod, test = "F")   # the anova() function implements the omnibus F test
# We notice that the result is the same (note that the current model has fewer residual degrees of freedom, as it has more parameters to be estimated)

## (A.4) Testing subsets of coefficients (submodels)
## With the same rationale (see slides 37-39, Module B), we can test submodels with different subsets of parameters.
## In the current example, we can define all the possible submodels given the predictors in the dataset (excluding interactions):
estMod <- lm(eai ~ ., data = datax) # full model: every other column of 'datax' enters as a predictor
## As the submodels above are nested, the incremental F-test can be run via the add1() function:
add1(object = estMod0, scope = estMod, test = "F", data = datax)
## where estMod0 is the null model whereas estMod is the full model.
# Each row of the output table is the starting-point model to which one variable has been added (direction: forward).
# The variable with the highest F-value (i.e., 'extrav') has to be added to the starting-point model ('estMod0'):
estMod0 <- update(object = estMod0, formula. = . ~ . + extrav) # new starting-point model
## Then the forward procedure can be run again:
add1(object = estMod0, scope = estMod, test = "F", data = datax)
# The results suggest that the other variable 'social' needs to be added as well. Then, the final model
# is the same as that defined by 'estMod', which is the best model w.r.t. the variables in the dataset.
## This result can also be achieved by means of AIC/BIC indices, although their results do not always agree with those
## provided by the incremental F-test. To this purpose, we can use the function 'compare_performance' from the 'performance' library.
estMod0 <- lm(eai ~ 1, data = datax) # null model (reset: it was updated during the forward procedure)
estMod1 <- lm(eai ~ extrav, data = datax)
estMod2 <- lm(eai ~ social, data = datax)
performance::compare_performance(estMod0, estMod1, estMod2, estMod, metrics = "all", bayesfactor = FALSE)
## The function computes several indices, including AIC/BIC, R2/R2adj, RMSE (Root Mean Square Error: the lower, the better), and Sigma which corresponds to the parameter sigma_y of the model.
# The model that reaches the minimum AIC (and, similarly, the highest R2 adj and the lowest RMSE) is the full model 'estMod', which includes all the variables.
## Note: the AIC index can also be computed using the function AIC().
## With several models AIC() returns a data frame with one row per model; it is printed
## as is, because wrapping it in c() would turn it into a plain list and drop the model labels.
print(AIC(estMod0, estMod1, estMod2, estMod))
# From now on, we can select 'estMod' as final model and use it for further analyses:
summary(estMod)

## (A.5) Confidence intervals for mu (slides 40-41, Module B)
## With the final model selected, we compute the confidence intervals for the mean of the model (mu).
## No 'newdata' is supplied, so the intervals refer to the data used to fit the model.
out <- predict(estMod, interval = "confidence", level = 0.95, se.fit = TRUE)
## Because se.fit = TRUE, predict() returns a list of four elements:
## $fit: a matrix with the estimated mean (y_hat) and its lower and upper bounds
## $se.fit: the standard error of the estimated mean
## $df and $residual.scale: the degrees of freedom of the model (needed for the reference t-value) and the estimated sigma_y
## The estimated intervals can be displayed with the partial regression plots (see lab1.R) or with the 'effects' library:
plot(effects::allEffects(estMod))
## Here the confidence bounds are drawn partially, one panel for each regressor in the model.

## (A.6) Prediction intervals for mu
## Lastly, the fitted model can be used to predict the outcome variable ('eai') for new values of the predictors.
## interval = "prediction" asks predict() for the bounds of a future observation at those values.
newdata <- data.frame(extrav = 37.87, social = 7.65) # e.g.: observation with higher 'extrav' score and lower 'social' score
out <- predict(estMod, newdata = newdata, interval = "prediction", level = 0.95, se.fit = TRUE)
print(out)

