###########################################################################
### Statistical methods and data analysis in developmental psychology
### A.Y. 2021/2022
### prof. Paolo Girardi (paolo.girardi@unipd.it)
###########################################################################

#### POISSON REGRESSION ####

### CONTENTS ###########################################
## (A) A simple linear model from scratch
### (A.1) Data
### (A.2) Descriptive statistics
### (A.3) Model definition and estimation using GLM
### (A.4) Model diagnostic
### (A.5) Parameter interpretation
### (A.6) Model selection
### (A.7) Model with Overdispersion
########################################################
# Y_i ~ Poisson (mi_i)
# is a GLM regression with g() is a link function
# E[Y_i]=g(mi_i)=  Beta0+BetaX_i
# g() = log()
# estimate Beta0 and Beta coeff that are connected with expected mean

# Set environment ---------------------------------------------------------
rm(list=ls())
setwd("~/Dropbox/SMDA/Parte B/labs/data")
#******************************************************************************************************


#-----------------------------------------------------------------------------------------------------
# Setting directory and loading packages
#-----------------------------------------------------------------------------------------------------
## (A.1) Data 
# Importing the data and check
# The data set absences.csv contains information on the number of absences of high school students in a school. 
# The variables in the data set are:
# gender: gender ("male,"female")
# math: the score on a math test (range: 0 - 100)
# nabs: number absences in last school year (in days)
# prog: the type of instructional program in which the student is enrolled ("General", "Academic", "Vocational")

# The school director would like to understand the attendance behaviour of the students and 
# relate it to the characteristics of the students and the study program they chose described 
# by the variables above  

#-----------------------------------------------------------------------------------------------------
# Import data and descriptive statistics
#-----------------------------------------------------------------------------------------------------
absence <- read.csv("absences.csv")
head(absence)
absence$gender<-factor(absence$gender)
absence$prog<-factor(absence$prog)
summary(absence)

### (A.2) Descriptive statistics
library(ggplot2)
# Histograms showing the distribution of the variables
ggplot(gather(absence[,2:3]), aes(value)) + 
  geom_histogram(bins=8,color="black", fill="grey70") + 
  facet_wrap(~key, scales = 'free') +
  theme_bw()

ggplot(gather(absence[,c(1,4)]), aes(value)) + 
  geom_histogram(color="black", fill="grey70", stat="count") + 
  facet_wrap(~key, scales = 'free') +
  theme_bw()


# Dependence of Y on the explanatory variables
# Math score
ggplot(absence, aes(x=math,y=nabs))+ geom_point()+
  theme_bw()

# Program
ggplot(absence, aes(nabs, fill = prog)) + geom_histogram(aes(y=..density..),color="black",binwidth = 1) + 
  facet_grid(cols = vars(prog), scales = "fixed")+ theme(legend.position="bottom")

# Gender
ggplot(absence, aes(nabs, fill=gender)) + geom_histogram(aes(y=..density..),color="black",binwidth = 1) + 
  facet_grid(cols = vars(gender), scales = "fixed")+ theme(legend.position="bottom")

# Given the descriptives, what would you expect the sign of the coefficients of the estimated PRM to be?
# Take three minutes to think about this and record your expectation below

# gender ref. female: 
# program ref. academic: 
# math:

# We will check at the end

### (A.3) Model definition and estimation using GLM
#-----------------------------------------------------------------------------------------------------
# Model estimation
#-----------------------------------------------------------------------------------------------------
# The code below estimates a PRM using the glm function. 
# The function requires the specification of the dependent and explanatory variables 
# using the usual formula:
#          dep ~ explanatory variables separated by +

modPoi1 <- glm(nabs ~ math + prog + gender, family="poisson", data=absence)
summary(modPoi1)
library(sjPlot)
tab_model(modPoi1)


### (A.4) Model diagnostic
#-----------------------------------------------------------------------------------------------------
# Model diagnostics
#-----------------------------------------------------------------------------------------------------
# A better way to visualize the diagnostics
# Linearity
library(car)
residualPlots(modPoi1, type = "deviance", pch=20, smooth=list(col="red"))

# Outliers, leverage, Cook's distance
influenceIndexPlot(modPoi1,vars=c("Studentized", "hat", "Cook"), id=list(n=c(4)))

plot(modPoi1)

#-----------------------------------------------------------------------------------------------------
# Model fit
#-----------------------------------------------------------------------------------------------------
# Test the goodness of fit of the model
modPoi2 <-  glm(nabs ~ 1, family="poisson", data=absence)
anova(modPoi2, modPoi1, test="Chisq") # Deviance G test 

### (A.5) Parameter interpretation

#-----------------------------------------------------------------------------------------------------
# Parameter interpretation
#-----------------------------------------------------------------------------------------------------
# Factor and percentage change
# IRR= INCIDENCE RATE RATIOS
# to install: install.packages("sjPlot")
library(sjPlot)
tab_model(modPoi1)

resPoi <- round(data.frame(summary(modPoi1)$coefficients, IRR=exp(modPoi1$coefficients), 
                           Per_change=(exp(modPoi1$coefficients)-1)*100),digits=3)
resPoi


### (A.6) Model selection
# Presence of interaction
add1(modPoi1,~.^2,test="Chisq")
modPoi3 <- glm(nabs ~ math * prog + gender, family="poisson", data=absence)
summary(modPoi3)

add1(modPoi3,~.^2,test="Chisq")
#ok only interaction math and prog

#no three wasy
### (A.7) Model with Overdispersion
#-----------------------------------------------------------------------------------------------------
# Have we forgotten anything? 
#-----------------------------------------------------------------------------------------------------
# Overdispersion/Underdispersion!
with(absence, tapply(nabs, prog, function(x) {
  sprintf("M (SD) = %1.2f (%1.2f)", mean(x), sd(x))
}))
with(absence, tapply(nabs, gender, function(x) {
  sprintf("M (SD) = %1.2f (%1.2f)", mean(x), sd(x))
}))

modPoi4 <-  glm(nabs ~ ., family="quasipoisson", data=absence)
summary(modPoi4)
plot(modPoi4)


