# Set environment ---------------------------------------------------------
rm(list=ls())
setwd("/home/antonio/MEGA/Lavoro_sync/Didattica/2021_2022/glms/")

## function to ungroup datasets in grouped style
ungroupData = function(mod_fitted=NULL,datax=NULL,y=NULL,n=NULL){
  data_long = c()
  for(i in 1:NROW(datax)){
    data_long = rbind(data_long,
                      cbind(c(rep(1,datax[i,y]),rep(0,datax[i,n]-datax[i,y])),
                            rep(mod_fitted$fitted.values[i],datax[i,n])))
  }
  data_long = data.frame(data_long,row.names = NULL); names(data_long) = c("y","yfitted")
  return(data_long)
}


# Case study 1: Data ------------------------------------------------------
load(file = "data/Beetles10.RData")
str(Beetles10); head(Beetles10)
# They refer to n=481 observations referring to beetles which died following a 5-hour exposure to gaseous carbon disulphide.
# Variables:
# logdose10: continuous variable about concentration of carbon disulphide (mg. per litre) in log scale
# ucciso (killed): integer variable about beetles' death (0: survived, 1: dead)

# The goal here is to define a linear model for 'ucciso' as a function of log.dose10. As the response variabile is dicothomous
# (it represents samples from a Binomial random variable), we need to use a Binomial linear model. Here, the term linear refers
# to the so-called linear predictor of the mean Eta_i = Xb. Note also that in the standard Binomial linear model, mean and variance are both
# function of the same explanatory variables and they do not account for eventual overdispersion (a condition where VAR[Y_i]>E[Y_i]).

# In the case of Beetles10, data are not grouped. This means that the response variable Y_i is represented as a collection of zero and one:
# Y_i ~ Bin(1,pi_i)
# with pi_i = g^-1(Xb), g^-1 is the inverse link function (e.g., logistic function)
# Similarly, the Binomial linear model works for grouped data as well (i.e., for each observation the total number of cases are reported).
# For instance:
X=table(Beetles10)
Beetles = data.frame(as.numeric(rownames(X)),X[,1]+X[,2],X[,2],row.names = NULL)
colnames(Beetles) = c("logdose","m","killed")
print(Beetles) # 'm' is the total number of beetles 
# In this case, the Binomial linear model is:
# m_i*Y_i ~ Bin(m_i,p_i)
# with pi_i being defined as above.

# This is to say that we can use the same GLM for both grouped and ungrouped dichotomous responses. We will use the grouped one (i.e., Beetles)
# for the next analyses.
# The response variable in the case of grouped data is as follows:
Beetles$y = Beetles$killed/Beetles$m #proportions


# Case study 1: Models ----------------------------------------------------
mod1_logit = glm(data=Beetles,formula = y~logdose,family = binomial(link=logit),weights = m) 
summary(mod1_logit)
# As we are dealing with grouped data, the total number of cases 'm' has to be used as weights for the model.
# This is the same as
# mod1_logit = glm(data=Beetles10,formula = ucciso~log.dose10,family = binomial(link=logit))
# for the case of ungrouped data (weights=1 and it can be omitted).
# Similarly, the syntax produces the same results as:
# mod1_logit = glm(data=Beetles,formula = cbind(killed,m-killed)~logdose,family = binomial(link=logit)) 

# We may try using different link functions
mod1_probit = glm(data=Beetles,formula = y~logdose,family = binomial(link=probit),weights = m) 
AIC(mod1_logit,mod1_probit)
# Overall, the probit choice seems working slightly better then standard logit.
# In addition, the probit link gets smaller residual variance then the logit one.

plot(Beetles$logdose,Beetles$y,bty="n",xlab="logdose",ylab="y",pch=20,lwd=3)
lines(Beetles$logdose,mod1_probit$fitted.values,lty=2,col="gray",lwd=2)
lines(Beetles$logdose,mod1_logit$fitted.values,lty=4,col="gray",lwd=2)
legend("topleft",legend=c("logit","probit"),bty="n",lty=c(4,2))

# Alternatively:
# plot(effects::allEffects(mod1_probit)) 

# Interpretation of the coefficients should be made according to the link function scale:
mod1_probit$coefficients
# beta = 19.73 is the estimated amount by which the log odds of y would increase if logdose were one unit higher. 
# The log odds of y when logdose is 0 is just Intercept. In this case, increasing the logdose, increases the probability to kill beetles.
# Useful discuss: https://stats.stackexchange.com/questions/34636/interpretation-of-simple-predictions-to-odds-ratios-in-logistic-regression/34638#34638

# Some useful machineries (for the logit case):
# Do not forget that pi_i = g^-1(eta_i), eta_i = b0 + logdose*b1, whereas eta_i = g(pi)
# where in general g(z):=exp(z)/(1+exp(z)) whereas g^-1(u):=log(u/(1-u))
# Indeed,
eta=cbind(1,Beetles[,1])%*%mod1_logit$coefficients
print(eta)
# which is the same as
predict(mod1_logit,type = "link")
# Instead, 
mu=cbind(1,Beetles[,1])%*%mod1_logit$coefficients
probs=exp(mu)/(1+exp(mu))
# which is the same as
predict(mod1_logit,type = "response")
# or
log(probs/(1-probs))

# The overall fit of the model can be evaluated in terms of AUC of the ROC curve.
# First, we need to get the long-version of the aggregated dataset Beetles (ROC curve requires dis-aggregated data)


# Beetles_long = c() #it will contains two columns, one for the observed response and the second one for the fitted response data
# for(i in 1:NROW(Beetles)){
#   Beetles_long = rbind(Beetles_long,
#                        cbind(c(rep(1,Beetles$killed[i]),rep(0,Beetles$m[i]-Beetles$killed[i])),
#                              rep(mod1_probit$fitted.values[i],Beetles$m[i])))
# }
# Beetles_long = data.frame(Beetles_long,row.names = NULL); names(Beetles_long) = c("y","yfitted")

Beetles_long = ungroupData(mod_fitted = mod1_probit,datax = Beetles,y = "killed",n = "m")

mod1_probit_roc = pROC::roc(Beetles_long$y,Beetles_long$yfitted)
plot(mod1_probit_roc,print.auc=TRUE)
# The AUC indicates that mod1_probit accurately resembles observed data in terms of predicted data:
# predictions resembles observations.

# Residual analysis
par(mfrow=c(2,2))
plot(mod1_probit,which=1:4)

binomTools::halfnorm(mod1_logit,resType = "pearson",env = TRUE) 
# halfnormal plot: All the residuals are well inside the simulated envelope.

# Given a certain logdose of carbon disulphide, what is the probability for a beetle to die?
## predicting on link scale
linpred = predict(object = mod1_probit,newdata = data.frame(logdose=1.8002),se.fit = TRUE,type="link")
print(linpred)

## CI for the prediction on link scale
linpred_ci = linpred$fit + c(-1,+1)*qnorm(p=0.975)*linpred$se.fit

## CI for the probability for a beetle to die given logdose
prob_ci = mod1_probit$family$linkinv(linpred_ci)


# Case study 2: Data ------------------------------------------------------
rm(list=ls()); graphics.off()
admission = read.csv("admission.csv",header=TRUE)
head(admission)
summary(admission)
# The dataset refers to n=400 students for the admissione to the UCLA University
## A researcher is interested in assessing to define and fit a linear model which includes:
## gre (gre: Graduate Record Exam scores),
## gpa (GPA: Grade Point Average) 
## rank (prestige of the undergraduate institution, effect admission into graduate school)
## as predictors of the outcome variable admit.

admission$rank = factor(admission$rank,levels=1:4,labels=1:4)
summary(admission)

# Case study 2: Models ------------------------------------------------------
mod1 = glm(admit~gre+gpa+rank, family=binomial(link = "logit"), data=admission)
#alternatively: mod1 = glm(admit~.,family="binomial",data=admission)
#Note: The default link function for the binomial family is the logit. Therefore, we can omit 
# (link = "logit") from the formula above.

#Diagnostics
## Linearity
par(mfrow=c(2,2)); plot(mod1, pch=19,cex=0.1)
car::residualPlots(mod1, type = "deviance", pch=20, smooth=list(col="red"))

## Outliers, leverage, Cook's distance
car::influenceIndexPlot(mod1,vars=c("Studentized", "hat", "Cook"), id=list(n=c(4)))
car::outlierTest(mod1) # Testing outliers

## Is the observation i=198 influential w.r.t. the parameter estimates?
mod2 =update(mod1,subset=-c(198))
car::compareCoefs(mod1,mod2)

mod3 = update(mod1,subset=-c(156))
car::compareCoefs(mod1,mod3)

#Interpretation
## The code below shows how the p-values for the Wald test are computed.
# Wald test for testing the association between admit and each explanatory variable:
# H_0: beta_j=0 vs. H_1: beta_j != 0
# names(summary(mod1))
# summary(mod1)$coefficients
# beta.est <- summary(mod1)$coefficients[,1]
# se.est <- summary(mod1)$coefficients[,2]
# z.values <- beta.est/se.est
# p.values <- 2*pnorm(abs(z.values),lower.tail=FALSE)
# data.frame(beta.est,se.est,z.values,p.values)
summary(mod1)

## Odds ratios and Wald CIs
results = cbind(coefs=mod1$coefficients, OR = exp(coef(mod1)), exp(confint.default(mod1)))
exp(summary(mod1)$coefficients[,1]-qnorm(0.975)*summary(mod1)$coefficients[,2])
exp(summary(mod1)$coefficients[,1]+qnorm(0.975)*summary(mod1)$coefficients[,2])

results = cbind(coefs=mod1$coefficients, OR = exp(coef(mod1)), exp(confint(mod1)))
print(results)

## Percentage change
100*(exp(coef(mod1))-1)

## Some plots
library(ggeffects)
plot(ggpredict(mod1,"gpa"))
plot(ggpredict(mod1,c("gpa","rank")))
plot(ggpredict(mod1,c("gpa","gre")))
plot(ggpredict(mod1,c("gpa","gre","rank")))

#Model selection
mod.empty = glm(admit~1,family="binomial",data=admission)
anova(mod.empty,mod1,test="Chisq")

mod.fin = step(mod.empty, direction="forward",scope=formula(mod1))
print(mod.fin)
summary(mod.fin)







