###########################################################################
### Statistical methods and data analysis in developmental psychology
### A.Y. 2021/2022
### prof. Antonio Calcagni' (antonio.calcagni@unipd.it)
###########################################################################


### CONTENTS ###########################################
## (A) A simple linear model from scratch
### (A.1) Data
### (A.2) Descriptive statistics
### (A.3) Model definition
### (A.4) Parameter estimation: Using optim()
### (A.5) Parameter estimation: Using ML solutions
### (A.6) Model evaluation and inference

## (B) Linear models via the lm() function
### (B.1) Model definition and parameter estimation
### (B.2) Computing CIs
### (B.3) Partial regression plots
########################################################


# Set environment ---------------------------------------------------------
rm(list=ls()) #start from a clean workspace
# The lab folder below is machine-specific. We move there only when it exists, so that the
# script does not stop with an error on other computers. Note that the data are loaded
# later through a path relative to the working directory.
lab_dir <- "/home/antonio/MEGA/Lavoro_sync/Didattica/2021_2022/smda/labs/"
if (dir.exists(lab_dir)) {
  setwd(lab_dir)
} else {
  message("Folder '", lab_dir, "' not found: please set the working directory to your own lab folder ",
          "(current working directory: ", getwd(), ")")
}


# (A) A simple linear model from scratch ----------------------------------

## (A.1) Data 
## Consider the data stored in 'data1.Rda'. They refer to n=50 participants practicing sports and J=3 variables
## about personality, quality of life, and sport/exercise addiction. In particular:
## 'eai' is the total score from Exercise Addiction Inventory (EAI), a short screening tool consisting of six questions based upon 
## six general components of addiction. In general, a score greater than 24 indicates exercise addiction;
## 'extrav' is the score of the subscale Extraversion from NEO PI-R personality inventory;
## 'social' is the subscale Sociality from Short-Form 36 (SF-36), a questionnaire used to assess health-related quality of life. The subscale codifies the social functioning.
## The goal here is to predict the exercise addiction score (eai) as a function of extraversion and social functioning.
load("data/data1.Rda")

## (A.2) Descriptive statistics
## A first look at the data: first rows, structure, numerical summaries, and some plots.
head(datax)
str(datax)
# All the variables are continuous in this case. describe() from the 'psych' library returns the usual summary statistics:
psych::describe(datax)

## Histograms of the three variables, side by side in a new graphics window.
## Note: x11() creates a new window for the plots. It works for Linux and Windows systems. For Mac users, it should be replaced by quartz()
x11()
par(mfrow = c(1, 3))
invisible(lapply(c("eai", "extrav", "social"), function(v) {
  hist(datax[[v]], xlab = v, main = "")
}))
# The outcome 'eai' shows a symmetric distribution.

## Scatterplots of the outcome 'eai' against each predictor, in a second window.
x11()
par(mfrow = c(1, 2))
invisible(lapply(c("extrav", "social"), function(v) {
  plot(datax[[v]], datax$eai, main = "", xlab = v, ylab = "eai", bty = "n")
}))


## (A.3) Model definition
## We want to predict 'eai' as a function of 'extrav' and 'social' by means of a Normal linear model. 
## The scatterplots drawn at the end of Sec. (A.2) suggest that the outcome varies linearly as a function of the predictors.
## Then, a Normal linear model can be adequately defined:
## eai = b0 + extrav*b1 + social*b2 + e
## e ~ N(0,sigma_y)
## eai ~ N(mu,sigmay), with mu = b0 + extrav*b1 + social*b2 

# Design matrix of the additive model. It is first assembled by hand (a column of ones for the
# intercept plus the two predictors) and then rebuilt via model.matrix(), which overwrites the
# hand-made version and is the one used from here on.
X <- cbind(1, datax$extrav, datax$social)
X <- model.matrix(~ extrav + social, data = datax)
head(X)
n <- nrow(X)     #sample size
J <- ncol(X) - 1 #number of predictors (the intercept column is not counted)
y <- datax$eai   #outcome

## (A.4) Parameter estimation: Using optim() on the log-likelihood of the model
## Such a model is identified if the parameters {b0,b1,b2,sigma_y} can adequately be recovered given the sample of data y (i.e., 'eai').
## To do so, we need to estimate the model parameter by solving the Maximum Likelihood problem defined in slide 10 (Module B).

## First, we can define the log-likelihood function as follows:
## Log-likelihood of the Normal linear model.
##   pars: array of parameters, i.e. the NCOL(X) regression coefficients followed by sigma_y
##   y:    observed outcome
##   X:    design matrix (intercept column included)
## It returns the sum of the Normal log-densities of y, with mean X%*%b and standard deviation sigma_y.
loglikel <- function(pars, y, X) {
  n_coef <- NCOL(X)
  beta <- pars[1:n_coef]       #regression coefficients come first
  sd_y <- pars[n_coef + 1]     #sigma_y is stored right after them
  lin_pred <- X %*% beta       #conditional mean of the outcome
  sum(dnorm(y, mean = lin_pred, sd = sd_y, log = TRUE))
}

## Next, we can recover 'pars' by maximizing the log-likelihood of the model numerically via optim() function.
## Starting points and bounds are built from J, so the call adapts to the number of predictors
## (with J=2 the starting points are c(0,1,1,0.5)).
## Note: sigma_y must be strictly positive. At sigma_y=0 the log-density is not finite, and the
## L-BFGS-B method requires finite values of the function, hence the small positive lower bound.
sigma_lower <- sqrt(.Machine$double.eps)
res <- optim(par = c(0, rep(1, J), 0.5), #starting points for the numerical routine function
             fn = function(x) loglikel(x, y, X), #function to be maximized/minimized (note: 'x' is currently the array of parameters 'pars')
             method = "L-BFGS-B", #numerical method used for solving the optimization problem
             lower = c(rep(-Inf, J + 1), sigma_lower), #lower bounds: betas are free, sigma_y > 0
             upper = rep(Inf, J + 2), #upper bounds: none
             control = list(fnscale = -1, trace = 3)) #fnscale<0: maximization; fnscale>0: minimization; trace = verbose

# optim() returns convergence=0 when the routine converged: check it before using the estimates.
if (res$convergence != 0) {
  warning("optim() did not converge (code ", res$convergence, "): ", res$message, call. = FALSE)
}

# The estimated parameters are as follows:
b_est <- res$par[1:(J + 1)] #estimated beta
sigmay_est <- res$par[J + 2] #estimated sigma_y


## (A.5) Parameter estimation: Using ML solutions (slide 13, Module B)
## The Normal linear model has closed-form ML solutions: we compute them and put them next to
## the optim() based estimates to verify that the two approaches agree.
XtX_inv <- solve(t(X) %*% X)
b_est_2 <- XtX_inv %*% t(X) %*% y
e_ml <- y - X %*% b_est_2 #residuals at the ML solution
sigmay_est_2 <- sqrt(1 / n * t(e_ml) %*% e_ml)
# Comparison table: one row per parameter, one column per estimation method
A <- cbind(c(b_est, sigmay_est), c(b_est_2, sigmay_est_2))
dimnames(A) <- list(c("b0", "b1", "b2", "sigma_y"), c("optim", "ML sol"))
print(A)
# We can notice that they are the same (regardless of the last decimals).
# The estimates can be interpreted as follows:
# b0=-1.353 (intercept)
# b1=1.866 as 'extrav' increases, 'eai' increases as well
# b2=-0.793 as 'social' increases, 'eai' decreases
# The exercise addiction score is positively associated to extraversion and negatively associated to social functioning.

## (A.6) Model evaluation and inference (testing individual coefficients)
## Raw and adjusted R2 indices, following the formulas in slides 25-26 (Module B):
## R2 is one minus the ratio between the residual and the total sums of squares,
## the adjusted version corrects it for the number of predictors J.
y_hat <- X %*% b_est # Fitted response variable
y_mean <- matrix(1, n, 1) * mean(y) # n x 1 column filled with the sample mean of y
res_dev <- y - y_hat  # deviations from the fitted values
tot_dev <- y - y_mean # deviations from the mean
R2 <- 1 - (t(res_dev) %*% res_dev) / (t(tot_dev) %*% tot_dev)
R2_adj <- 1 - (1 - R2) * ((n - 1) / (n - J - 1))
# The R2 of the model (and the adjusted version too) is large, which shows that the linear model fits the data adequately.

## Testing the coefficients individually can be performed using t-statistic (slides 30-31, Module B)
## We fix H0:beta=0, H1:beta!=0 (two-side), alpha=0.05
alpha <- 0.05
df_res <- n - J - 1 #residual degrees of freedom
## Note: the t-statistic follows a t distribution with n-J-1 df only if the standard errors are based on the
## unbiased estimate of the residual variance, RSS/(n-J-1). The ML estimate sigmay_est^2 = RSS/n is
## biased downward and would give standard errors that are too small (and different from those of lm()).
sigma2_unb <- sum((y - X %*% b_est)^2) / df_res #unbiased estimate of the residual variance
sigma_beta <- sqrt(diag(solve(t(X) %*% X)) * sigma2_unb) #standard errors of the regression coefficients
ts <- b_est / sigma_beta #t-statistics
## The reference values under H0 for the t-statistic is:
t0 <- qt(1 - alpha / 2, df_res)
print(cbind(ts, t0))
# The test is two-sided: H0 is rejected at fixed alpha=0.05 for those coefficients whose t-statistic
# is larger than the reference t0 in absolute value, i.e. |ts| > t0 (note that the t-statistic of a negative coefficient is negative).
## Alternatively, we can compute the observed significance level (p-value) for all the regression coefficients:
ps <- 2 * pt(-abs(ts), df_res) #two-sided p-values (pt() is vectorized, no loop needed)
print(cbind(ts, t0, ps))

## CIs for the regression coeffs (alpha=0.05)
CI <- cbind(b_est - t0 * sigma_beta,
            b_est + t0 * sigma_beta)
print(CI)


# (B) Linear models via the lm() function ----------------------------------

## (B.1) Model definition and parameter estimation
## The well-known lm() function of the (default) 'stats' library defines and fits a Normal linear model directly.
## The model is first written as a formula object, which is then passed to lm() together with the data:
defMod <- eai ~ extrav + social #additive model
estMod <- lm(formula = defMod, data = datax)
summary(estMod) #coefficient table and fit statistics of the estimated model

# The output can be interpreted as follows:
# Residuals: statistics for the residuals of the model -- we will see it later
# Coeffs: extrav (b1), social (b2), Intercept (b0)
## Estimate: regression coefficients
## Std. Error: standard errors of regression coefficients
## t value: statistic (in this case: t-statistic) used to make inference on betas under H0: beta_j=0
## Pr(>|t|): probability associated to the t value under H0 (p-value)
## Residual standard error: estimate of sigma_y of the Normal linear model (computed with n-J-1 degrees of freedom)
## degrees of freedom: n-J-1 with J being the number of predictors (in this case, J=2) -- we will see it later
## F-statistic: statistic associated to the R2 of the model (omnibus test) -- we will see it later

# Overall, the interpretation of the estimated regression coefficients follows that given in Sec. (A)

## (B.2) Computing CIs
# 95% CIs for the regression coefficients, obtained through the confint() generic applied to the fitted 'lm' object:
confint(estMod, level = 0.95)

## (B.3) Partial regression plots
## Partial regression plots (see slides 23 and 48, Module B), drawn with avPlots() from the 'car' library.
car::avPlots(model = estMod, id = FALSE, main = "")






