###########################################################################
### Statistical methods and data analysis in developmental psychology
### A.Y. 2021/2022
### prof. Paolo Girardi (paolo.girardi@unipd.it)
###########################################################################

#### Logistic Regression ####

### CONTENTS ###########################################
## (A) A simple linear model from scratch
### (A.1) Data
### (A.2) Descriptive statistics
### (A.3) Model definition and estimation using GLM
### (A.4) Model diagnostic
### (A.5) Parameter interpretation
### (A.6) Hypothesis testing
### (A.7) Model selection

## (B) Logistic model for grouped data (contingency tables)
### (B.1) Estimation
### (B.2) Interpretation
########################################################


# Set environment ---------------------------------------------------------
rm(list=ls())
setwd("~/Dropbox/SMDA/Parte B/labs/data")
#******************************************************************************************************


#-----------------------------------------------------------------------------------------------------
# Setting directory and loading packages
#-----------------------------------------------------------------------------------------------------
## (A.1) Data 
## Consider the data stored in 'admission.csv'.
# They refer to n=400 students for the admissione to the UCLA University
## A researcher is interested in how variables, such as 
## gre (gre: Graduate Record Exam scores),
## gpa (GPA: Grade Point Average) 
## rank: prestige of the undergraduate institution, effect admission into graduate school. 
## The response variable, admit/don’t admit, is a binary variable.

# Importing the data and check
admission <- read.csv("admission.csv",header=TRUE)
head(admission)
summary(admission)


# Recoding the variable rank as a factor
admission$rank <- factor(admission$rank,levels=1:4,labels=1:4)
summary(admission)

### (A.2) Descriptive statistics
#-----------------------------------------------------------------------------------------------------
# Some descriptive statistics
#-----------------------------------------------------------------------------------------------------
# Histograms showing the distribution of the variables
histData <- gather(admission, key=key, value=value)
histData$value <- as.integer(histData$value)
library(ggplot2)
plot1= ggplot(histData, aes(value)) +
  geom_histogram(bins = 10, color= "black", fill="grey70") +
  facet_wrap(~key, scales = "free_x", nrow = 2, ncol = 2) +
  theme_bw()
plot1
# Scatter matrix
plot(admission)

# Proportions of admitted by gpa, gre and rank
gpaCat <- cut(admission$gpa,c(seq(4,6,0.2)), labels=FALSE)
prop.admit.gpa <- tapply(admission$admit,gpaCat,mean)

greCat <- cut(admission$gre,c(seq(260,960,50)), labels=FALSE)
prop.admit.gre <- tapply(admission$admit,greCat,mean) 

prop.admit.rank <-tapply(admission$admit,admission$rank,mean)
par(mfrow=c(2,2))
plot(prop.admit.rank,pch=19,xlab="rank")
plot(seq(4.1,6,0.2),prop.admit.gpa,pch=19,xlab="gpa")
plot(seq(275,935,50), prop.admit.gre,pch=19,xlab="gre")
plot(prop.admit.rank,pch=19,xlab="rank")
par(mfrow=c(1,1))


### (A.3) Model definition and estimation
#-----------------------------------------------------------------------------------------------------
# Model definition and estimation
#-----------------------------------------------------------------------------------------------------
# The code below estimates a logistic regression model using the 
# glm (generalized linear model) function. 
# This function is used to fit generalized linear models and requires the specification 
# of the
# - dependent and explanatory variables using the usual formula
#   dependent variable ~ explanatory variables separated by +
# - description of the error distribution using the "family" argument
#   For the logistic model family = binomial(link = "logit")
mod1 <- glm(admit~gre+gpa+rank, family=binomial(link = "logit"), data=admission[,1:4])

# When a model includes all the other variables in the data frame
# we can avoid to list all the variables by using
mod1 <- glm(admit~.,family="binomial",data=admission)
# The default link function for the binomial family is the logit. Therefore, we can omit
# (link = "logit") from the formula above and use the upper commas.

### (A.4) Model diagnostic
#-----------------------------------------------------------------------------------------------------
# Model diagnostics
#-----------------------------------------------------------------------------------------------------
# Standard way not too helpful
par(mfrow=c(2,2))
plot(mod1, pch=19,cex=0.1)
par(mfrow=c(1,1))

# A better way to visualize the diagnostics
# Linearity
# install.packages("car") if not yet installed
library(car)
residualPlots(mod1, type = "deviance", pch=20, smooth=list(col="red"))

# Outliers, leverage, Cook's distance
influenceIndexPlot(mod1,vars=c("Studentized", "hat", "Cook"), id=list(n=c(4)))
outlierTest(mod1) # Testing outliers
CookThreshold <- 5/400*qchisq(0.05,1,lower.tail=FALSE) # Cook?s distance threshold for GLM
CookThreshold

# Is 198 really influential?
mod2 <-update(mod1,subset=-c(198))
compareCoefs(mod1,mod2)

mod3 <-update(mod1,subset=-c(156))
compareCoefs(mod1,mod3)

### (A.5) Parameter interpretation
#-----------------------------------------------------------------------------------------------------
# Parameter interpretation (inference)
#-----------------------------------------------------------------------------------------------------
# The commented code shows how the p-values of the Wald test are computed 
# Wald test for testing the association between admit and each explanatory variable:
# H_0: beta_j=0 vs. H_1: beta_j != 0
# names(summary(mod1))
# summary(mod1)$coefficients
# beta.est <- summary(mod1)$coefficients[,1]
# se.est <- summary(mod1)$coefficients[,2]
# z.values <- beta.est/se.est
# p.values <- 2*pnorm(abs(z.values),lower.tail=FALSE)
# data.frame(beta.est,se.est,z.values,p.values)

summary(mod1)

# Odds ratios and Wald CIs
results <- cbind(coefs=mod1$coefficients, OR = exp(coef(mod1)), exp(confint.default(mod1)))
exp(summary(mod1)$coefficients[,1]-qnorm(0.975)*summary(mod1)$coefficients[,2])
exp(summary(mod1)$coefficients[,1]+qnorm(0.975)*summary(mod1)$coefficients[,2])

# Odds ratios and profile-likelihood CIs
results <- cbind(coefs=mod1$coefficients, OR = exp(coef(mod1)), exp(confint(mod1)))
results

# Percentage change
100*(exp(coef(mod1))-1)


library(ggeffects)
plot(ggpredict(mod1,"gpa"))
plot(ggpredict(mod1,c("gpa","rank")))
plot(ggpredict(mod1,c("gpa","gre")))
plot(ggpredict(mod1,c("gpa","gre","rank")))


### (A.6) Hypothesis testing
#-----------------------------------------------------------------------------------------------------
# Hypothesis testing
#-----------------------------------------------------------------------------------------------------
# More than one parameter:
# Model fit (overall test): H_0: beta_1=...=beta_p=0
mod.empty <- glm(admit~1,family="binomial",data=admission)
anova(mod.empty,mod1,test="Chisq")

# Computing the test by hand
# G.value <- with(mod1, null.deviance - deviance)
# G.value 
# df.G <- with(mod1, df.null - df.residual)
# df.G
# pvalue.G <- pchisq(G.value,df.G,lower.tail=FALSE)
# pvalue.G
# quantile.G <- qchisq(0.05,df.G) 
# quantile.G


# Subset of parameters
# E.g. H_0 = beta_{r2}=beta_{r3}=beta_{r4}
mod.red <- glm(admit~gre+gpa, family="binomial", data=admission)
anova(mod.red,mod1,test="Chisq")

# By hand
# G.value <- mod.red$deviance - mod1$deviance
# G.value 
# df.G <- with(mod1, df.null - df.residual)-with(mod.red, df.null - df.residual)
# df.G
# pvalue.G <- pchisq(G.value,df.G,lower.tail=FALSE)
# pvalue.G
# quantile.G <- qchisq(0.05,df.G) 
# quantile.G

### (A.7) Model selection
#-----------------------------------------------------------------------------------------------------
# Model selection
#-----------------------------------------------------------------------------------------------------
# Forward selection: start from the model with only the intercept:
mod.fin <- step(mod.empty, direction="forward",
                scope=formula(mod1))
mod.fin 
summary(mod.fin)

### (A.8) Probit model
#-----------------------------------------------------------------------------------------------------
# Probit model
#-----------------------------------------------------------------------------------------------------
mod2 <- glm(admit~gre+gpa+rank,family=binomial(link = "probit"),data=admission[,1:4])
summary(mod2)
library(sjPlot)
tab_model(mod1, mod2)

# no difference in terms of p-value, a different interpretation


### (B) Logistic model for grouped data (contingency tables)
## the dataset cointains the number passengers who died or survived during the Titanic disaster
## Data are regrouped by
## Economic.status = Crew, 1st, 2nd, 3rd class
## Age.group = A >25 B <25 years
## Gender= W=Women, M=Male

### (B.1) Model estimation
#-----------------------------------------------------------------------------------------------------
# Grouped data
#-----------------------------------------------------------------------------------------------------
titanic <- read.csv("titanic.csv",header=TRUE)
head(titanic)

modTitanic <- glm(cbind(Survived,Died)~.,data=titanic,family="binomial")
step(modTitanic,direction="both")
# the full model is ok
# checking the presence of interaction
add1(modTitanic,~.^2)
modTitanic2 <- glm(cbind(Survived,Died)~Economic.status*Gender+ Age.group,data=titanic,family="binomial")
add1(modTitanic2,~.^2)
modTitanic3 <- glm(cbind(Survived,Died)~Economic.status*Gender+ Economic.status*Age.group,data=titanic,family="binomial")
###the model is uncorrect because 
# we have not Crew members on the age.group C
#we try the other interaction

### (B.2) Model intepretation

results <- cbind(coefs=modTitanic2$coefficients, OR = exp(coef(modTitanic2)), exp(confint(modTitanic2)))
results

############## other 
# re-estimate with over-dispersion.
modTitanic2_over <- glm(cbind(Survived,Died)~Economic.status*Gender+ Age.group,data=titanic,family="quasibinomial")
summary(modTitanic2_over)