# Set environment ---------------------------------------------------------
rm(list=ls())
setwd("/home/antonio/MEGA/Lavoro_sync/Didattica/2021_2022/glms/")
source("utilities.R")


# (A) Data ----------------------------------------------------------------
## Data refer to a study of 'whodas', the WHO Disability Assessment Schedule (total score) which has been used 
## to provide a measure of disability for Huntington neuropsychiatric disorder. Higher scores indicating higher disability.  
## The current sample consists of n=100 statistical units (patients) measured over M=15 occasions (trials). The total number of
## observations is equal to n*M=1500. The dataset contains the following variables:
## 'group': boolean variable indicating patients with lower disorder (LD) or higher disorder (HD)
## 'xenazine': VMAT2 inhibitors dose for each m=1,..,M trial
## 'sbj': the variable indicating the patient's iid (i.e., the grouping variable)
## The goal is to predict 'whodas' as a function of age and group by considering as much as possible the inner-patient variability.
load("data4.Rda")
head(datax)
str(datax)

n = length(unique(datax$sbj))
M = length(datax$sbj==1)
J = NCOL(datax)-1


# (A) Exploratory analyses ------------------------------------------------
## Some graphical representations for each individual trajectory
lattice::xyplot(whodas~xenazine|as.factor(sbj),group=group,data=datax,type="b",pch=20) #overall plot using lattice

colfunc = colorRampPalette(c("blue", "orange")); cols = colfunc(n) #keep the first n colors

x11(); par(mfrow=c(1,2))
# group=LD
iid = as.numeric(unique(datax$sbj[datax$group=="LD"]))
i=iid[1]; plot(datax$xenazine[datax$sbj==i],datax$whodas[datax$sbj==i],pch=20,bty="n",xlab="xenazine",ylab="whodas",col=cols[i],ylim=c(20,250),main="LD")
for(i in iid[-1]){points(datax$xenazine[datax$sbj==i],datax$whodas[datax$sbj==i],pch=20,col=cols[i])}
# group=HD
iid = as.numeric(unique(datax$sbj[datax$group=="HD"]))
i=iid[1]; plot(datax$xenazine[datax$sbj==i],datax$whodas[datax$sbj==i],pch=20,bty="n",xlab="xenazine",ylab="whodas",col=cols[i],ylim=c(20,250),main="HD")
for(i in iid[-1]){points(datax$xenazine[datax$sbj==i],datax$whodas[datax$sbj==i],pch=20,col=cols[i])}

## Graphical representation for aggregated data (via mean function)
datax_agg = aggregate(datax$whodas,list(datax$sbj,datax$group),mean) #aggregating whodas -- more info via ?aggregate 
datax_agg$xenazine = aggregate(datax$xenazine,list(datax$sbj,datax$group),mean)$x #aggregating xenazine
head(datax_agg); colnames(datax_agg) = c("sbj","group","whodas","xenazine")

boxplot(datax_agg$whodas~datax_agg$group,frame=FALSE,xlab="",ylab="whodas")
plot(datax_agg$xenazine[datax_agg$group=="LD"],datax_agg$whodas[datax_agg$group=="LD"],bty="n",pch=20,col="blue",ylim=c(20,250),xlab="xenazine",ylab="whodas")
points(datax_agg$xenazine[datax_agg$group=="HD"],datax_agg$whodas[datax_agg$group=="HD"],col="orange",pch=20)



# (A) Models --------------------------------------------------------------
library(lmerTest); library(lme4)

mod1 = lmer(formula = whodas~group+xenazine+(1|sbj),data=datax)
## Note: The syntax (1|sbj) allows for adding the random effect codified through the variable 'sbj'.
## Generaly, the syntax for random-effects linear Normal models is the same as for fixed-effects linear Normal models.
## The lmer syntax allows for specifying more complex mixed-effects models as well. Further info are available at:
## https://cran.r-project.org/web/packages/lme4/vignettes/lmer.pdf
## For the purpose of this course, only the simple random-effect model (i.e., intercept model) will be used.
summary(mod1)
## The output of this model is the same as for the Normal linear model except that the summary now contains the
## 'Random effects' section with the variance/std.dev. of the random component 'sbj'. Note that 'df' and 'Pr(>|t|)' 
## are computed using the Sattherwhite approximation provided by the library 'lmerTest'.
## Confidence intervals for the model parameters can also be computed via 'profile likelihood':
lme4::confint.merMod(object = mod1,level = 0.95,method = "profile")
## or 'bootstrap' method:
lme4::confint.merMod(object = mod1,level = 0.95,method = "boot")

## The overall fit of the model can be evaluated by means of pseudo-R2 indices, e.g.:
performance::r2(model = mod1)
## The 'Marginal R2' index considers only the variance of the fixed effects and indicates how much of the model's variance is 
## explained by the fixed effects part only. The 'Conditional R2' index takes both the fixed and random effects into account and 
## indicates how much of the model's variance is explained by the complete model (random+fixed effects).
## Similarly, the ICC index can be easily computed as follows:
performance::icc(model = mod1) #look at the adjusted coeff
## which is the same as:
out = insight::get_variance(x = mod1) #it helps extracting the variance components from the fitted model
var_fixed = out$var.residual
var_random = out$var.random
mod1_icc = var_random/(var_fixed+var_random)
print(mod1_icc)

## We may evaluate whether adding/removing a term from the model:
mod2 = lmer(formula = whodas~group+xenazine+group:xenazine+(1|sbj),data=datax)
anova(mod1,mod2,test="Chisq")
## The anova(..,test="Chisq") function implements the LR test via ML method (see slide 16, Module C)
# In this case, we can see how 'mod2' should be chosen over 'mod1' (AIC is also lowest for this model).
## In a similar way, the performance of the two submodels can be assessed via graphical posterior predictive check as follows:
x11();par(mfrow=c(1,2))
posterior_pcheck_Normal(fitted_model = mod1,M = 100,titlep = "mod1")
posterior_pcheck_Normal(fitted_model = mod2,M = 100,titlep = "mod2")

## Plotting results:
## More info at: https://www.r-bloggers.com/2014/08/plotting-mixed-effects-model-results-with-effects-package/
x11();plot(effects::allEffects(mod2))
x11();lattice::dotplot(ranef(mod2)) #random-effect components
## Note that random-effect estimates can be accessed trhough the ranef() function.

# (A) Diagnostics ---------------------------------------------------------
## Once the model has been chosen, we can check the diagnostics.

## Normality of residuals
x11();plot(performance::check_normality(mod2))

## Homoscedasticity
x11();plot(performance::check_heteroscedasticity(mod2))

## Influence analysis
## Further info: https://journal.r-project.org/archive/2012-2/RJournal_2012-2_Nieuwenhuis~et~al.pdf
out=influence.ME::influence(mod2,group = "sbj") #main routine for the influence analysis from the 'influence.ME' library
## To get info on the output: ?influence.ME::influence
## Plotting the output:
x11();influence.ME::plot.estex(x = out,which = "cook",sort = FALSE) #Cook's distance for the units
x11();influence.ME::plot.estex(x = out,which = "dfbetas") #Diff-beta statistics (which units would drastically change the estimates)

## Using the library 'ggResidpanel' we can compare the two competing models in terms diagnostics:
x11();ggResidpanel::resid_compare(models = list(mod1,mod2))
## From: https://goodekat.github.io/ggResidpanel/articles/introduction.html
## Residual Plot (upper left): This is a plot of the residuals versus predictive values from the model to assess the linearity and constant variance assumptions. The curving trend seen in the penguin_model plot suggests a violation of the linearity assumption, and there appears to be a violation of the constant variance assumption as well since the variance of the residuals is getting larger as the predicted values increase.
## Normal Quantile Plot (upper right): Also known as a qq-plot, this plot allows us to assess the normality assumption. There appears to be a deviation from normality in the upper end of the residuals from the penguin_model, but this is not as much of a concern as linearity and constant variance issues.
## Histogram (lower right): This is a histogram of the residuals with a overlaid normal density curve with mean and standard deviation computed from the residuals. It provides an additional way to check the normality assumption. This plot makes it clear that there is a slight right skew in the residuals from the penguin_model.
## Index Plot (lower left): This is a plot of the residuals versus the observation numbers. It can help to find patterns related to the way that the data has been ordered, which may provide insights into additional trends in the data that have not been accounted for in the model. There is no obvious trend in the penguin_model index plot.



# (B) Data ----------------------------------------------------------------
rm(list = ls()); graphics.off()
load(file = "Ohio.RData")
str(Ohio); head(Ohio)
# The dataset refers to a subset of the six-city study (n=2148), a longitudinal study of the health effects of
# air pollution on children. The variables are as follows:
# resp: an indicator of wheeze status (1=yes, 0=no)
# id: a numeric vector for subject id
# age: a (rescaled) numeric vector of age: {-2,-1,0,1,2} <-> {7,8,9,10} (0 corresponds to 9 years)
# smoke: an indicator of maternal smoking at the first year of the study

Ohio$resp = factor(x = Ohio$resp,levels = c(0,1),labels = c("no wheeze","wheeze"))
Ohio$smoke = factor(x = Ohio$smoke,levels = c(0,1),labels = c("no smoke","smoke"))

# First exploratory analyses
p_tab = prop.table(table(Ohio$resp,Ohio$age,Ohio$smoke),margin = c(2,3))
print(p_tab)

p_tab[1,,] #proportions by fixing the first margin (no wheeze)
p_tab[2,,] #proportions by fixing the first margin (wheeze)

# plot proportions of nosmoke/smoke for subjects showing wheeze as a function of age
agex = 7:10
plot(agex,p_tab[2,,][,1],bty="n",type="b",pch=1,lty=1,ylim=c(0.10,0.25),xlab="age",ylab="prop wheeze") #no smoke
points(agex,p_tab[2,,][,2],bty="n",type="b",pch=2,lty=2) #no smoke
legend("topright",legend=c("no smoke","smoke"),bty="n",pch=c(1,2),lty=c(1,2))
# The proportion of wheeze is higher in the group with smoke=1 (regardless of age), with higher proportions at age 8.


# (B) Models --------------------------------------------------------------
Ohio$smoke = relevel(Ohio$smoke,ref = "no smoke")
Ohio$resp = relevel(Ohio$resp,ref = "no wheeze")

# We may try defining and fitting a glm for repeated measures.
mod1 = lme4::glmer(data = Ohio,formula = resp~age+smoke+(1|id),family=binomial)
mod2 = lme4::glmer(data = Ohio,formula = resp~age*smoke+(1|id),family=binomial)
anova(mod1,mod2)

# ICC coefficient: 
# the proportion of the variance explained by the grouping structure in the population. 
# This index goes from 0, if the grouping conveys no information, to 1, if all observations in a group are identical.
# It is calculated by dividing the random effect variance by the total variance, i.e. the sum of the random effect variance and the residual variance.
performance::icc(model = mod1)

# Plot of the model
plot(effects::allEffects(mod1))

summary(mod1)
# For a fixed subject, wheeze decreases as a function of age and the proportion does not significantly differ between smoke and no-smoke groups.










