# Set environment ---------------------------------------------------------
# Start from an empty workspace (removes every object in the global environment).
rm(list = ls())
# Move to the course folder only when it exists on this machine; otherwise keep
# the current working directory so the script does not abort on setwd().
course_dir <- "/home/antonio/MEGA/Lavoro_sync/Didattica/2021_2022/glms/"
if (dir.exists(course_dir)) {
  setwd(course_dir)
} else {
  message("Course directory not found; using the current working directory: ", getwd())
}


# Data --------------------------------------------------------------------
# Load the workspace file; the calls below rely on it providing the
# `rejections` data frame, which is then previewed and summarised.
load(file = "data3.Rdata")
head(x = rejections)
summary(object = rejections)
# Dataset description (the description refers to the file as data_exam.Rdata,
# while this script loads data3.Rdata):
# the data were collected from n=296 students enrolled in the University of Padova
# and contain some information about exams related to the course PSP6075525.
# The dataset contains the following variables:
# • ID student: personal student ID
# • gender: categorical variable with two levels (1: female, 2: male)
# • bonus: numeric variable, total amount of extra points students have received over the exams
# • total_errors: numeric variable, number of total errors students have accumulated over the exams
# • rejections: numeric variable, number of final mark rejections students have made over the exams
# • year: categorical variable with two levels (1: 2020, 2: 2021)
# • exam_session: categorical variable indicating when exams have been taken (1: summer, 2: winter)
str(object = rejections)

# Exploratory analyses ----------------------------------------------------
# Draw the four exploratory plots on a 2x2 grid. The previous graphical
# settings are saved and restored afterwards so that the grid layout does not
# leak into base-graphics plots drawn later in the session.
old_par <- par(mfrow = c(2, 2))
# Distribution of rejections by gender, year and exam session
boxplot(rejections ~ gender, data = rejections, frame = FALSE)
boxplot(rejections ~ year, data = rejections, frame = FALSE)
boxplot(rejections ~ exam_session, data = rejections, frame = FALSE)
# Frequency of each observed number of rejections
barplot(table(rejections$rejections), bty = "n")
par(old_par)

# Total errors summed within each gender x exam_session cell ...
xtabs(
  total_errors ~ gender + exam_session,
  data = rejections
)
# ... and the same table restricted to the rows with zero rejections
xtabs(
  total_errors ~ gender + exam_session,
  data = rejections,
  subset = rejections == 0
)


# Models ------------------------------------------------------------------
# Full Poisson regression of rejections on year, exam session, total errors
# and bonus points.
out <- glm(
  formula = rejections ~ year + exam_session + total_errors + bonus,
  family = "poisson",
  data = rejections
)
summary(object = out)

## Improving the model by removing all those variables which do not
## effectively contribute to explain the outcome variable (if any):
## backward selection goes from the full model towards the simplest one.
step(object = out, direction = "backward")

## Using the final model (the full model without `year`)
out <- glm(
  formula = rejections ~ exam_session + total_errors + bonus,
  family = "poisson",
  data = rejections
)
# Effect plots for the predictors of the final model
sjPlot::plot_model(model = out, type = "eff")

# alternatively, the same kind of display via ggeffects:
library(ggeffects)
plot(
  ggpredict(out, c("exam_session", "total_errors", "bonus"))
)

# Intercept-only Poisson model used as the baseline for the pseudo-R2
null <- glm(formula = rejections ~ 1, data = rejections, family = "poisson")
# McFadden pseudo-R2 index: 1 - logLik(model) / logLik(null).
# as.numeric() drops the "logLik" class and its attributes, which would
# otherwise be carried over into R2 and make it print as a log-likelihood.
R2 <- 1 - as.numeric(logLik(out)) / as.numeric(logLik(null))

## Evaluating whether students who have taken the advantage of extra bonus during the summer session show higher rejections.
# The final model is extended with the exam_session x bonus interaction.
out <- glm(
  formula = rejections ~ exam_session + total_errors + bonus + exam_session:bonus,
  data = rejections,
  family = "poisson"
)
plot(effects::effect(mod = out, term = "exam_session:bonus"))
# The confidence interval is computed once and reused on both scales, instead
# of running confint() twice for the same interval.
# Row 5 is taken to be the exam_session:bonus coefficient (after the intercept
# and the three main effects) -- assumes each term contributes one coefficient.
ci_interaction <- confint(object = out, level = 1 - 0.05)[5, ]
ci_interaction       # linear predictor (log) scale
exp(ci_interaction)  # exponentiated scale

# There is no evidence that students who have taken the exams during the summer session along with extra bonus show higher levels of rejections
# ( β̂ session:bonus = −0.0161, σ̂ β session:bonus = 0.1334, z β session:bonus = −0.12). The 95% confidence
# interval for β session:bonus is [−0.268, 0.256] (on the linear predictor scale), which includes zero. On the
# scale of expected rejections (i.e., after exponentiation), the same approximate confidence interval is [0.764, 1.292], which indicates that
# the multiplicative change in the expected number of rejections ranges from 0.764 to 1.292 (an interval that includes 1, i.e., no change).



# Checking for overdispersion ---------------------------------------------
# Extra variation in the response can be accommodated by fitting the Poisson
# linear model via a quasi-likelihood approach; the summary is then inspected
# for the dispersion parameter.
out <- glm(
  formula = rejections ~ exam_session + total_errors + bonus,
  family = "quasipoisson",
  data = rejections
)
summary(object = out)
# In this case, the estimated dispersion parameter φ = 0.541 is smaller than 1, so there is no evidence that
# rejections is affected by overdispersion (a value below 1 points, if anything, towards underdispersion).




