# Set environment ---------------------------------------------------------
# Empty the workspace, then make the course folder the working directory so
# that relative "data/..." paths resolve against it.
# NOTE(review): the absolute path is specific to one machine -- adjust it (or
# launch R from the course folder) before running the script elsewhere.
rm(list = ls())
setwd("/home/antonio/MEGA/Lavoro_sync/Didattica/2020_2021/GLMs/")


# Case study 1: Data ------------------------------------------------------
# The data are stored in R's native format (.RData / .Rda), so load() restores
# them directly into the current R environment; str() then displays the
# structure of the 'Neonati' object.
load("data/Neonati.RData")
str(object = Neonati)
# They refer to n=32 newborns and their mothers with the following variables:
# peso (weight): continuous variable about the weight at birth
# durata (duration): integer variable about the duration of the gestation in weeks
# fumo (smoke): binary categorical variable for the smoking habit of the mother (1/F: smoker; 2/NF: non-smoker)

# The goal here is to define a Normal linear model for 'peso' in terms of 'durata' and 'fumo' as follows:
# Y_i ~ Normal(mu_i,sigma_i)
# mu_i = E[Y_i] = beta0 + beta1*x1_i + beta2*x2_i + beta3*(x1_i*x2_i) (mean component)
# Y_i := peso; x1_i := durata; x2_i := fumo; x1_i*x2_i := interaction 
# sigma_i = sigma (variance component: it is the same over the n=32 observations)
# Note that, as x2 is a categorical variable with two levels, beta2 represents the difference in expected weight at birth between newborns with smoker and non-smoker mothers. Because the model includes the interaction, this is the difference at durata = 0, while beta3 is the difference between the two groups in the slope of durata.



# Case study 1: Descriptive analyses --------------------------------------
# Numerical summary of every variable in the data set.
summary(Neonati)

# Three side-by-side panels: marginal and conditional relations between weight
# at birth ('peso'), gestation length ('durata') and smoking habit ('fumo').
par(mfrow = c(1, 3))

# Panel A: weight against gestation length (marginal bivariate plot).
plot(peso ~ durata, data = Neonati, bty = "n",
     ylab = "weight", xlab = "duration", main = "A")

# Panel B: distribution of weight by smoking habit of the mother.
boxplot(peso ~ fumo, data = Neonati, frame = FALSE,
        ylab = "weight", xlab = "smoke", main = "B")

# Panel C: weight against gestation length, split by smoking habit.
# The empty frame (type = "n") is built from ALL observations so that the axes
# cover both groups; sizing the plot on the smokers alone would clip every
# non-smoker point falling outside the smokers' range.
plot(peso ~ durata, data = Neonati, type = "n", bty = "n",
     ylab = "weight", xlab = "duration", main = "C")
points(peso ~ durata, data = Neonati, subset = fumo == "F",
       col = "red", pch = 10)
points(peso ~ durata, data = Neonati, subset = fumo == "NF",
       col = "blue", pch = 10)
legend("topleft", legend = c("F (smoker)", "NF (non smoker)"),
       col = c("red", "blue"), pch = 10, bty = "n")

# Panel (A): Weight at birth increases with the duration of gestation (in weeks), as expected
# Panel (B): Newborns with non-smoker mothers show higher weight at birth than those with smoker mothers
# Panel (C): As duration increases, weight at birth for newborns with non-smoker mothers increases more than weight for newborns with smoker mothers



# Case study 1: Models ----------------------------------------------------
# Largest model: weight at birth as a function of gestation length, smoking
# habit and their interaction ('durata * fumo' expands to
# durata + fumo + durata:fumo).
mod1 <- lm(peso ~ durata * fumo, data = Neonati)
summary(mod1)
# Residuals: statistics for the residuals of the model (we will see them later)
# Coeffs: durata (beta1), fumoF (beta2), Intercept (beta0: reference level for fumo=NF), durata*fumoF (beta3)
## Estimate: betas of the model
## Std. Error: standard errors of betas
## t value: statistic (in this case: t statistic) used to make inference on betas under H0: beta_j=0
## Pr(>|t|): probability associated to the t value under H0 (pvalue)
## Residual standard error: sigma of the model (see above)
## degrees of freedom: n-p with p being the number of estimated coefficients, intercept included (in this case, p=4)
## F-statistic: statistic associated to the R2 of the model (omnibus test)

# The R2 of the model is large, which shows that the linear model fits the data adequately.
# The tests on the coefficients give no evidence that weight at birth varies as a function of smoke. Similarly, there is no evidence of an interaction effect of smoke*duration on weight at birth.

# mod1 is the largest model containing all the high-order effects given one categorical variable ('fumo') and one continuous variable ('durata'). 
# The inferential tests on betas suggest that some of them may be removed from the model, as they do not add any information when explaining the response variable 'peso'.
# One can get the same result for the interaction term by comparing the models with and without it through a partial F-test on the two nested models, as implemented by the anova() function:
# Additive model without the durata:fumo interaction; it is nested in mod1.
mod0 <- lm(peso ~ durata + fumo, data = Neonati)
# F-test comparing the two nested fits.
anova(mod0, mod1, test = "F")
# by means of which we can conclude that mod1 does not improve the smaller model mod0.
# Similarly:
AIC(mod0, mod1)
# Once the final model has been chosen, one can:
## compute the confidence intervals of betas:
# (the confint() generic is called and S3 dispatch selects the method for 'lm'
# objects, instead of invoking the method confint.lm() by name)
confint(mod0, level = 0.95)

## graphical model checking:
# 2 x 2 layout holding diagnostic plots 1 to 4 of plot() for the fitted model
# mod0 (described one by one in the notes that follow).
par(mfrow = c(2, 2))
plot(mod0, which = seq_len(4))
# Adjusted from Crawley, The R Book, p.464:
# The first graph (top left) shows residuals on the y axis against fitted values on the x axis. Ideally, as here, the points should look like the sky at night. It is a major problem if the scatter increases as the fitted values get bigger.
# The next plot (top right) shows the normal qqnorm plot (p. 406) which should be a straight line if the errors are normally distributed. Again, the present example looks fine. If the pattern were S-shaped or banana-shaped, we would need to fit a different model to the data.
# The third plot (bottom left) is a repeat of the first, but on a different scale; it shows the square root of the standardized residuals (where all the values are positive) against the fitted values. If there was a problem, such as the variance increasing with the mean, then the points would be distributed inside a triangular shape, with
# the scatter of the residuals increasing as the fitted values increase. The red line would then show a pronounced upward trend. 
# The fourth and final plot (bottom right) shows Cook’s distances for each of the observed values of the response variable. The point of this plot is to highlight those y values that have the biggest effect on the parameter estimates (high influence).

## plot the fitted linear model:
par(mfrow = c(1, 1))

# Coefficients of mod0 in model order: intercept, slope of 'durata', shift for
# the non-reference level of 'fumo'.
# NOTE(review): the colours used below assume that NF is the reference level of
# 'fumo' (i.e. the third coefficient is 'fumoF'), as stated in the notes on
# summary(mod1) -- confirm with levels(Neonati$fumo).
mod0_coef <- unname(coef(mod0))

# All observations together with the fitted line of the reference group.
# Intercept and slope are passed explicitly: abline(mod0) draws the same line,
# but only because it silently keeps the first two of the three coefficients,
# and it emits a warning while doing so.
plot(peso ~ durata, data = Neonati, bty = "n",
     ylab = "weight", xlab = "duration")
abline(a = mod0_coef[1], b = mod0_coef[2], col = 2, lwd = 2)

# Observations and fitted lines by smoking habit (red = F, blue = NF).
# The empty frame is built from all observations so that the axes cover both
# groups instead of the smokers only.
plot(peso ~ durata, data = Neonati, type = "n", bty = "n",
     ylab = "weight", xlab = "duration")
points(peso ~ durata, data = Neonati, subset = fumo == "F",
       col = "red", pch = 10)
points(peso ~ durata, data = Neonati, subset = fumo == "NF",
       col = "blue", pch = 10)
# The additive model gives two parallel lines: same slope, intercepts that
# differ by the 'fumo' coefficient.
abline(a = mod0_coef[1], b = mod0_coef[2], col = "blue", lwd = 2)
abline(a = mod0_coef[1] + mod0_coef[3], b = mod0_coef[2], col = "red", lwd = 2)

## make predictions based on the chosen model:
# For instance, we may be interested in predicting the weight at birth of a newborn with a gestation of 41 weeks for both smoker and non-smoker mothers.
# The new data must provide the predictors of mod0 ('fumo', 'durata') under the same names as in 'Neonati'.
topredict <- data.frame(fumo = c("F", "NF"), durata = rep(41, 2))
print(topredict)
# predict() dispatches to the method for 'lm' objects; the interval type is
# spelled out in full ("prediction") instead of relying on partial matching of
# the abbreviation "predict".
predict(mod0, newdata = topredict, interval = "prediction", level = 0.95)

# Again, predicting the weight at birth of a newborn with a shorter gestation (29 weeks) and a non-smoker mother:
# NOTE(review): if 29 weeks lies outside the observed range of 'durata', this
# prediction is an extrapolation -- check against summary(Neonati).
topredict <- data.frame(fumo = "NF", durata = 29)
predict(mod0, newdata = topredict, interval = "prediction", level = 0.95)


# Case study 2: Data ------------------------------------------------------
# Remove the objects left over from case study 1, then load the new data set.
rm(list = ls())
load("data/Clotting.RData")
# Structure and first rows of 'Clotting'.
str(Clotting)
head(Clotting)
# Make "uno" the reference (first) level of the factor 'lotto'.
Clotting$lotto <- relevel(Clotting$lotto, ref = "uno")
# They refer to n=18 observations about mean blood clotting times in seconds for nine percentage concentrations of normal plasma and two lots of clotting agent.
# Variables:
# tempo (time): discrete variable for the clotting time (in seconds)
# u: plasma concentration (in percentage)
# lotto (lot): categorical variable with two levels, 'uno' (one) and 'due' (two)

# The goal here is to define and fit a linear model for clotting time as a function of log(u) and lot.

# Case study 2: Descriptive analyses --------------------------------------
# Numerical summary of every variable in the data set.
summary(Clotting)

# Clotting time against log plasma concentration, one colour per lot
# (col 1 = "uno", col 2 = "due").
# All observations go through a single plot() call so that the axis limits
# cover both lots: sizing the axes on lot "uno" alone and adding lot "due"
# with points() clips every "due" value outside that range. It also avoids
# passing xlab/ylab to points(), where they are not graphical parameters and
# only produce warnings. The x axis is labelled log(u) because that is what is
# plotted.
lot_col <- c(uno = 1, due = 2)[as.character(Clotting$lotto)]
plot(log(Clotting$u), Clotting$tempo, bty = "n",
     xlab = "log(u)", ylab = "time", pch = 20, lwd = 2, col = lot_col)
legend("topright", legend = c("lot 1", "lot 2"), bty = "n", pch = 20,
       col = c(1, 2))

# We observe that clotting time is a non-linear function of (log) plasma concentration for both groups. 
# In order to define and fit a Normal linear model, we may try to transform the response variable so as to make its relation with the 
# explanatory variables linear. To this end we try two types of transformation, i.e. logarithm (A) and reciprocal (B)
# Two side-by-side panels, one per candidate transformation of the response.
par(mfrow = c(1, 2))

# One colour per observation according to its lot (col 1 = "uno",
# col 2 = "due"). Each panel is drawn with a single plot() call over all the
# observations, so the axis limits cover both lots (no clipping of lot "due")
# and no xlab/ylab is passed to points(), which would only produce warnings.
# Axis labels name the quantities actually plotted.
lot_col <- c(uno = 1, due = 2)[as.character(Clotting$lotto)]

#(A) logarithm of clotting time against log(u)
plot(log(Clotting$u), log(Clotting$tempo), bty = "n",
     xlab = "log(u)", ylab = "log(time)", pch = 20, lwd = 2, col = lot_col,
     main = "A")
legend("topright", legend = c("lot 1", "lot 2"), bty = "n", pch = 20,
       col = c(1, 2))

#(B) reciprocal of clotting time against log(u)
plot(log(Clotting$u), 1 / Clotting$tempo, bty = "n",
     xlab = "log(u)", ylab = "1/time", pch = 20, lwd = 2, col = lot_col,
     main = "B")
legend("topleft", legend = c("lot 1", "lot 2"), bty = "n", pch = 20,
       col = c(1, 2))

# Both transformations seem quite reasonable and reduce the amount of non-linearity between log(u) and time. 
# We can now proceed to define and fit two Normal linear models, one for each transformation. 
# Please be aware that the coefficients must be interpreted on the transformed scale of the response variable,
# e.g., logarithm of clotting time or reciprocal of clotting time.

# Case study 2: Models ----------------------------------------------------
# Additive Normal linear models for the two candidate transformations of the
# response, both with log(u) and lot as explanatory variables.
mod0_log <- lm(log(tempo) ~ I(log(u)) + lotto, data = Clotting)
summary(mod0_log)

mod0_rec <- lm(1 / tempo ~ I(log(u)) + lotto, data = Clotting)
summary(mod0_rec)

# Raw AICs. The two models have different responses (log(tempo) and 1/tempo),
# so these values come from likelihoods on different scales and must NOT be
# ranked against each other; they are printed for reference only.
AIC(mod0_log, mod0_rec)

# Comparable AICs: both likelihoods are expressed on the scale of 'tempo' by
# adding the log-Jacobian of each transformation z = g(y):
#   z = log(y): log|dz/dy| = -log(y)    =>  AIC_y = AIC_z + 2 * sum(log(y))
#   z = 1/y   : log|dz/dy| = -2*log(y)  =>  AIC_y = AIC_z + 4 * sum(log(y))
# The smaller adjusted value indicates the better-supported transformation.
sum_log_tempo <- sum(log(Clotting$tempo))
c(mod0_log = AIC(mod0_log) + 2 * sum_log_tempo,
  mod0_rec = AIC(mod0_rec) + 4 * sum_log_tempo)

# Diagnostic plots 1 to 4 for the log-response model, on a 2 x 2 layout.
par(mfrow = c(2, 2))
plot(mod0_log, which = seq_len(4))

# Same four diagnostic plots for the reciprocal-response model.
par(mfrow = c(2, 2))
plot(mod0_rec, which = seq_len(4))

# It seems the reciprocal transformation allows for a better fit of the Normal linear model. We will keep it in the next analyses.
# The reciprocal model is extended with the interaction between log(u) and lot
# ('I(log(u)) * lotto' expands to both main effects plus their interaction).
mod1_rec <- lm(1 / tempo ~ I(log(u)) * lotto, data = Clotting)
summary(mod1_rec)

# F-test of the additive model against the interaction model (nested fits).
anova(mod0_rec, mod1_rec, test = "F")

# Single panel: reciprocal clotting time against log(u) with the fitted lines
# of the interaction model mod1_rec, one per lot.
par(mfrow = c(1, 1))

# One colour per observation according to its lot (col 1 = "uno",
# col 2 = "due"). A single plot() call over all the observations makes the
# axis limits cover both lots (sizing them on lot "uno" alone clips lot "due")
# and avoids passing xlab/ylab to points(), which only produces warnings.
# Axis labels name the quantities actually plotted.
lot_col <- c(uno = 1, due = 2)[as.character(Clotting$lotto)]
plot(log(Clotting$u), 1 / Clotting$tempo, bty = "n",
     xlab = "log(u)", ylab = "1/time", pch = 20, lwd = 2, col = lot_col)
legend("topleft", legend = c("lot 1", "lot 2"), bty = "n", pch = 20,
       col = c(1, 2))

# Coefficients in model order: intercept, slope of log(u), lot shift,
# log(u):lot interaction. "uno" is the reference level of 'lotto' (set with
# relevel() when the data were loaded), so the first two coefficients give the
# line of lot "uno"; adding the last two gives intercept and slope of lot "due".
rec_coef <- unname(coef(mod1_rec))
abline(a = rec_coef[1], b = rec_coef[2], col = 1, lwd = 2, lty = 2)
abline(a = rec_coef[1] + rec_coef[3], b = rec_coef[2] + rec_coef[4],
       col = 2, lwd = 2, lty = 2)





