###########################################################################
### Statistical methods and data analysis in developmental psychology
### A.Y. 2021/2022
### prof. Paolo Girardi (paolo.girardi@unipd.it)
###########################################################################


### CONTENTS ###########################################
## (A) A logistic regression model from scratch
### (A.1) Data
### (A.2) Model definition and estimation using GLM
### (A.4) Model diagnostic
### (A.5) Parameter interpretation
### (A.6) Hypothesis testing
### (A.7) Model selection
########################################################


# Set environment ---------------------------------------------------------
# Empty the workspace, then move to the folder used for the relative file
# names below (e.g. the read.csv() call on 'teddy_child.csv').
rm(list = ls())
setwd("~/Dropbox/SMDA/Parte B/labs/data")
#******************************************************************************************************


#-----------------------------------------------------------------------------------------------------
# Data description and import
#-----------------------------------------------------------------------------------------------------
## (A.1) Data 
## Consider the data stored in 'teddy_child.csv'.
## They refer to n=379 mothers who participated in the TEDDY Child study.
## We have many variables, specifically
## ID: identification number of the mother
## Children: number of children
## Fam_status: Married or Single
## Education: Middle school, High School, Degree or higher
## Occupation: Part-time, Full-time and Unemployed
## Age: Age in years
## Smoking_status: Yes, No, Ex-smoker
## Alcool_status:  Yes, No, Ex-drinker
## Parental_stress: Parental Stress
## Alabama_involvment: Alabama family involvement
## MSPSS_total: MSPSS total score (Perceived social support)
## LTE: Number of life-threatening events
## Partner_cohesion: Score of partner cohesion
## Family_cohesion: Score of family cohesion

# The dependent variable is:
## Depression_pp: Yes, No.
## The response is a binary variable.

# Import the data and take a first look at it

teddy <- read.csv("teddy_child.csv", header = TRUE)
# teddy <- read.csv(file.choose(), header = TRUE)  # pick the file interactively
head(teddy)
summary(teddy)
str(teddy)

# Recode the character variables as factors in a single step.
# Smoking_status gets "No" as its reference level; Children is stored as
# a numeric variable.
teddy <- transform(
  teddy,
  Fam_status     = factor(Fam_status),
  Education      = factor(Education),
  Occupation     = factor(Occupation),
  Smoking_status = relevel(factor(Smoking_status), ref = "No"),
  Alcool_status  = factor(Alcool_status),
  Depression_pp  = factor(Depression_pp),
  Children       = as.numeric(Children)
)
summary(teddy)

### (A.2) Descriptive statistics
#-----------------------------------------------------------------------------------------------------
# Some descriptive statistics
#-----------------------------------------------------------------------------------------------------
# One large descriptive table, built with the gtsummary package
# install.packages("gtsummary")
library(gtsummary)
## see the package help or the online references
# The first column (ID) is dropped, the table is split by Depression_pp and
# Children is summarised as a continuous variable.
table <- teddy[, -1] %>%
  tbl_summary(
    by = Depression_pp,
    type = list(Children ~ "continuous")
  ) %>%
  add_p(pvalue_fun = ~style_pvalue(.x, digits = 2)) %>%
  add_overall() %>%
  add_n() %>%
  modify_header(label ~ "**Characteristics**") %>%
  modify_spanning_header(c("stat_1", "stat_2") ~ "**Depression**") %>%
  modify_footnote(all_stat_cols() ~ "Median (IQR) or Frequency (%)") %>%
  bold_labels()

table

# We noted a marginal influence of age, parental stress and alcohol status (p=0.077).
# Alcool_status is therefore recoded into two categories (No / Yes).
# The new labels are assigned by position: the first two levels printed below
# become "No", the third becomes "Yes".
levels(teddy$Alcool_status)
levels(teddy$Alcool_status) <- c("No", "No", "Yes")
# Chi-squared test of association between the recoded variable and depression
with(teddy, chisq.test(table(Alcool_status, Depression_pp)))

### (A.2) Model definition and estimation
#-----------------------------------------------------------------------------------------------------
# Model definition and estimation
#-----------------------------------------------------------------------------------------------------
# The code below estimates a logistic regression model using the
# glm (generalized linear model) function.
# This function is used to fit generalized linear models and requires the specification
# of the
# - dependent and explanatory variables using the usual formula
#   dependent variable ~ explanatory variables separated by +
# - description of the error distribution using the "family" argument
#   For the logistic model family = binomial(link = "logit")
mod1 <- glm(Depression_pp ~ Parental_stress,
            family = binomial(link = "logit"), data = teddy)
summary(mod1)
# Odds ratio for Parental_stress, read from the fitted model instead of being
# typed in by hand, so it stays correct if the data or the model change.
exp(coef(mod1)[["Parental_stress"]])
# exp(beta) is an odds ratio: with the estimate reported for this lab
# (beta = 0.03601, exp(beta) = 1.0367) the odds of post partum depression
# increase by about 3.67% for each 1 point increase in parental stress.

# Test the inclusion of Age
add1(mod1, scope = ~ Parental_stress + Age, test = "Chisq") ## Deviance Chisq-test
# p-value > 0.05
# Test the inclusion of Alcool_status
add1(mod1, scope = ~ Parental_stress + Alcool_status, test = "Chisq") ## Deviance Chisq-test
# p-value > 0.05

## Other covariates?
# Automatic selection with step(), starting from the model that uses every
# column except the first one (ID) as a covariate.
mod_full <- glm(
  Depression_pp ~ .,
  family = binomial(link = "logit"),
  data = teddy[, -1]
)
# step() selects by AIC by default
mod_sel <- step(mod_full)
#library(bestglm) #glm
#Xy<-teddy[,c(2,11:15,9)]
#fit_best<-bestglm(Xy,family=binomial(link = "logit"),IC="AIC")

summary(mod_sel)
library(sjPlot)
tab_model(mod_sel)

# The final model contains 4 variables.
# Check for two-way interactions among the selected terms.
add1(mod_sel, scope = ~ .^2, test = "Chisq")
# no significant interaction

# Test for quadratic terms: each one is added to the selected model and
# compared with it through a deviance (Chisq) test.
mod_sel_q1 <- update(mod_sel, . ~ . + I(Parental_stress^2))
anova(mod_sel, mod_sel_q1, test = "Chisq") # p=0.27

mod_sel_q2 <- update(mod_sel, . ~ . + I(Family_cohesion^2))
anova(mod_sel, mod_sel_q2, test = "Chisq") # p=0.97


### (A.3) Model diagnostic
#-----------------------------------------------------------------------------------------------------
# Model diagnostics
#-----------------------------------------------------------------------------------------------------
# The standard diagnostic plots are not very helpful for a binary response
par(mfrow = c(2, 2))
plot(mod_sel, pch = 19, cex = 0.1)
par(mfrow = c(1, 1))

# A better way to visualize the diagnostics
# Linearity
library(car)
residualPlots(mod_sel, type = "deviance", pch = 20, smooth = list(col = "red"))

# Outliers, leverage, Cook's distance
# The original call passed id = teddy$ID, i.e. an unnamed vector.
# NOTE(review): car >= 3.0 expects `id` to be TRUE/FALSE or a named list and
# appears to ignore an unnamed vector with a warning - confirm against the
# installed version. The two most extreme points are flagged explicitly here.
# Points keep their default labels, which are presumably the row names that
# the subset = -... refits below rely on.
influenceIndexPlot(mod_sel, vars = c("Cook"), id = list(method = "y", n = 2))
outlierTest(mod_sel) # Testing outliers
# Cook's distance threshold for GLM: 4 / (N - k - 1), with N the number of
# observations used in the fit and k the number of estimated slopes
# (coefficients excluding the intercept). Both are taken from the model;
# the original script hard-coded 4 / (379 - 6 - 1).
CookThreshold <- 4 / (nobs(mod_sel) - (length(coef(mod_sel)) - 1) - 1)
CookThreshold

# Is 31 really influential? Refit without row 31 and compare the coefficients
mod_sel2 <- update(mod_sel, subset = -31)
compareCoefs(mod_sel, mod_sel2) # smoking status changes
# Is 327 really influential? Refit without rows 327 and 31 and compare again
mod_sel3 <- update(mod_sel, subset = -c(327, 31))
compareCoefs(mod_sel, mod_sel3) # smoking status changes

### (A.4) Parameter interpretation
#-----------------------------------------------------------------------------------------------------
# Parameter interpretation (inference)
#-----------------------------------------------------------------------------------------------------

summary(mod_sel)

# Odds ratios and Wald CIs
results <- cbind(
  coefs = coef(mod_sel),
  OR = exp(coef(mod_sel)),
  exp(confint.default(mod_sel))
)
# Show the Wald table now: `results` is overwritten by the profile-likelihood
# table below, so without this line the Wald version was never displayed.
results
# The same Wald limits computed by hand: exp(estimate -/+ z * std. error)
coef_table <- summary(mod_sel)$coefficients
exp(coef_table[, 1] - qnorm(0.975) * coef_table[, 2])
exp(coef_table[, 1] + qnorm(0.975) * coef_table[, 2])

# Odds ratios and profile-likelihood CIs
results <- cbind(
  coef = coef(mod_sel),
  OR = exp(coef(mod_sel)),
  exp(confint(mod_sel))
)
results

# Prediction ability
# Fitted probabilities, computed once and reused below
pred_prob <- predict(mod_sel, type = "response")
pred_prob
hist(pred_prob, main = "Predicted probabilities", xlab = "Predicted probability")
# Confusion matrix - Prediction matrix
# The observed response is taken from the model frame, so it always has the
# same length as the fitted probabilities, even if rows with missing values
# were dropped when the model was fitted.
observed <- model.response(model.frame(mod_sel))
table(Predicted = pred_prob > 0.5, Observed = observed)
