###########################################################################
### PSQ4106864 DATAVIS
### A.Y. 2024/2025
### prof. Antonio Calcagni' (antonio.calcagni@unipd.it)
###########################################################################


### CONTENTS ###########################################
## (A) Visualizing continuous data (univariate II)
########################################################



# Set environment ---------------------------------------------------------
rm(list=ls())
setwd("/home/antonio/MEGA/Lavoro_sync/Didattica/2024_2025/datavis/")
source("labs/add_legend.R")


## Data import
datax <- read.csv(file = "datasets/avia_par_it.tsv",header = TRUE,sep = ",",na.strings = ": ")
str(datax)
head(datax)

# Note: The flight is coded using ICAO standard, so:
# 'IT_LIBC_IT_LIME' indicates the route from Crotone (LIBC, IT) to Bergamo (LIME, IT)
# 'IT_LIBC_DE_EDDN' indicates the route from Crotone (LIBC, IT) to Nuremberg (EDDN, DE)
# To find your ICAO airport: https://airportcodes.aero/search

# To simplify the dataset encoding, let split the first variable into four different variables:
out <- strsplit(x = datax$airp_pr.time,split = "_") #it returns a list with each element being a vector of four elements
out <- matrix(unlist(out),ncol = 4,byrow = TRUE) #unlist and rearrange the output as a matrix with four columns
# Now, let's create the final dataframe:
datay <- data.frame(out,datax[,-1])
str(datay)
colnames(datay)[1:4] <- c("FROM","ICAO_1","TO","ICAO_2") 

# Remove the symbol 'X' in front of each variable name (it is not needed)
nms <- gsub(x = colnames(datay),pattern = "X",replacement = "")
print(nms)
colnames(datay) <- nms
str(nms)



# (A) Visualizing continuous data -----------------------------------------

## (A.1) Quarterly time series
## The time series here represent variations over consecutive months (three). 
## To plot the related data, first we need to detect those variables (columns of the dataset) of the form YYYYQx with x in {1,2,3,4}.
## For the sake of simplicity, consider five consecutive years:
jjd1 <- grep(x = colnames(datay),pattern = "2000Q")
print(datay[1,jjd1]) 
#Note that months are ordered in decreasing order! So:
jjd1 <- sort(jjd1,decreasing = TRUE)
jjd2 <- sort(grep(x = colnames(datay),pattern = "2001Q"),decreasing = TRUE)
jjd3 <- sort(grep(x = colnames(datay),pattern = "2002Q"),decreasing = TRUE)
jjd4 <- sort(grep(x = colnames(datay),pattern = "2003Q"),decreasing = TRUE)
jjd5 <- sort(grep(x = colnames(datay),pattern = "2004Q"),decreasing = TRUE)
X <- datay[,c(jjd1,jjd2,jjd3,jjd4,jjd5)] #subsetting the original dataset (it contains only numeric values) column-wise

## Consider flights from LIRF (Fiumicino) to LFPG (Paris, Charles de Gaulle)
iid <- which(datay$ICAO_1=="LIRF" & datay$ICAO_2=="LFPG")
Y <- X[iid,] #subsetting row-wise

J <- NCOL(Y) #number of values to plot
ymin <- min(Y,na.rm=TRUE)
ymax <- max(Y,na.rm=TRUE)
yaxis <- seq(ymin,ymax,length=13)
cls <- c("#D2691E","#66CD00","#8B6508","#EEC900")
xlab <- colnames(Y)
iid <- grep(x = xlab,pattern = "Q1") #first period/year
iid2 <- setdiff(1:J,iid)
xlab[iid2] <- ""
xlab <- gsub(x = xlab,pattern = "Q1",replacement = "") #last refinements
print(xlab)

x11(bg = "azure2")
plot(0,0,bty="n",xlim=c(1,J),ylim=c(ymin,ymax),xlab="",ylab="",axes=FALSE)
axis(side = 1,at = 1:J,labels = xlab)
axis(side = 2,at = yaxis,labels = round(yaxis,2))
abline(v = iid,lwd=4,col="white")
# first series
points(1:J,Y[1,],pch=20,col=cls[1],type="b",lwd=1.5)
# second series
points(1:J,Y[2,],pch=20,col=cls[2],type="b",lwd=1.5)
# third series
points(1:J,Y[3,],pch=20,col=cls[3],type="b",lwd=1.5)
add_legend("topleft",fill = cls[1:3],legend = c("Route 1","Route 2","Route 3"),border = FALSE,bty = "n",ncol = 1)


## Alternatively, one can be also interested in plotting three different routes (by fixing the same years):
iid1 <- which(datay$ICAO_1=="LIRF" & datay$ICAO_2=="LFPG") #from Fiumicino to Paris CDG
iid2 <- which(datay$ICAO_1=="LIRF" & datay$ICAO_2=="EDDM") #from Fiumicino to Munchen
iid3 <- which(datay$ICAO_1=="LIRF" & datay$ICAO_2=="EGLL") #from Fiumicino to London Heathrow
iid4 <- which(datay$ICAO_1=="LIRF" & datay$ICAO_2=="EBBR") #from Fiumicino to Brussels (Zaventem)

Y <- rbind( #sum the number of flights column-wise for each destination
  apply(X[iid1,],2,sum,na.rm=TRUE),
  apply(X[iid2,],2,sum,na.rm=TRUE),
  apply(X[iid3,],2,sum,na.rm=TRUE),
  apply(X[iid4,],2,sum,na.rm=TRUE)
  )
print(Y)

ymin <- min(Y,na.rm=TRUE)
ymax <- max(Y,na.rm=TRUE)
yaxis <- floor(seq(ymin,ymax,length=13))

x11(bg = "#EEEEE0")
plot(0,0,bty="n",xlim=c(1,J),ylim=c(ymin,ymax),xlab="",ylab="",axes=FALSE)
axis(side = 1,at = 1:J,labels = xlab)
axis(side = 2,at = yaxis,labels = yaxis)
abline(v = iid,lwd=4,col="white")
points(1:J,Y[1,],pch=20,col=cls[1],type="b",lwd=1.5)
points(1:J,Y[2,],pch=20,col=cls[2],type="b",lwd=1.5)
points(1:J,Y[3,],pch=20,col=cls[3],type="b",lwd=1.5)
points(1:J,Y[4,],pch=20,col=cls[4],type="b",lwd=1.5)
add_legend("topleft",fill = cls[1:4],legend = c("to LFPG","to EDDM","to EGLL","to EBBR"),border = FALSE,bty = "n",ncol = 4)


## (A.2) Yearly time series

## Plot the number of flights for a subset of routes for all the available years (only Q1 for the sake of simplicity). 
## Additionally, highlight the period around 2009 when the H1N1 epidemic spreads out.
jjd <- sort(grep(x = colnames(datax),pattern = "Q1"),decreasing=TRUE) #years are stored in decreasing order
X <- datax[,jjd]
lbls <- colnames(X)
lbls <- gsub(x = lbls,pattern = "X|Q1",replacement = "") #remove unused symbols in year labels
iid1 <- which(datay$TO=="DE")
iid2 <- which(datay$TO=="US")
iid3 <- which(datay$TO=="PL")
Y1 <- X[iid1,] #DE
Y2 <- X[iid2,] #US
Y3 <- X[iid3,] #PL
y1 <- apply(Y1,2,mean,na.rm=TRUE)
y2 <- apply(Y2,2,mean,na.rm=TRUE)
y3 <- apply(Y3,2,mean,na.rm=TRUE)
ymin <- min(c(y1,y2,y3),na.rm=TRUE)
ymax <- max(c(y1,y2,y3),na.rm=TRUE)
yaxis <- round(seq(ymin,ymax,length=13),2)
J <- length(lbls)
cls <- RColorBrewer::brewer.pal(n = 3,name = "Set1")

x11(bg = "#EED5B7")
plot(0,0,bty="n",xlim=c(1,J),ylim=c(ymin,ymax),xlab="",ylab="",axes=FALSE)
axis(side = 1,at = 1:J,labels = lbls)
axis(side = 2,at = yaxis,labels = yaxis)
rect(xleft = which(lbls=="2008"),xright = which(lbls=="2010"),ybottom = ymin,ytop = ymax,col = "#CDB79E",border = NA) 
text(x = which(lbls=="2008"),y = ymax-10,adj=0,labels = "H1N1 epidemic")
points(1:J,y1,pch=20,col=cls[1],type="b",lwd=1.5)
points(1:J,y2,pch=20,col=cls[2],type="b",lwd=1.5)
points(1:J,y3,pch=20,col=cls[3],type="b",lwd=1.5)
add_legend("topleft",fill = cls[1:3],legend = c("DE","US","PL"),border = FALSE,bty = "n",ncol = 4)


## Consider all the routes from IT to US for the years 2001-2019 (Q1-Q3) and plot the median time series inside its corridor/envelope.
iid <- which(datay$TO=="US")
jjd <- sort(grep(x = colnames(datay),pattern = "Q"),decreasing = TRUE)
Y <- datay[iid,jjd]
Y <- Y[,-c(1:12,89:92)] #remove unwanted years
y <- apply(Y,2,median,na.rm=TRUE)
yq1 <- apply(Y,2,quantile,0.25,na.rm=TRUE) #lower bound of the envelope
yq2 <- apply(Y,2,quantile,0.75,na.rm=TRUE) #upper bound of the envelope
ymin <- min(yq1)
ymax <- max(yq2)
yaxis <- round(seq(ymin,ymax,length=13),2)
lbls <- colnames(Y)
J <- length(lbls)

x11(bg = "#EED5B7")
plot(0,0,bty="n",xlim=c(1,J),ylim=c(ymin,ymax),xlab="",ylab="",axes=FALSE,col="#EED5B7")
axis(side = 1,at = 1:J,labels = lbls)
axis(side = 2,at = yaxis,labels = yaxis)
polygon(x = c(1:J,rev(1:J)),y = c(yq2,rev(yq1)),col="gray",border=NA) #envelope
points(1:J,y,pch=20,col=cls[1],type="b",lwd=1.5)

## ..as before but now with the smoothed version of the series
x <- 1:J
out <- loess(y ~ x, span=0.5) #'span' controls for the smoothness of the curve
ypred <- predict(out)

x11(bg = "#EED5B7")
plot(0,0,bty="n",xlim=c(1,J),ylim=c(ymin,ymax),xlab="",ylab="",axes=FALSE,col="#EED5B7")
axis(side = 1,at = 1:J,labels = lbls)
axis(side = 2,at = yaxis,labels = yaxis)
polygon(x = c(1:J,rev(1:J)),y = c(yq2,rev(yq1)),col="gray",border=NA) #envelope
points(1:J,y,pch=20,col=cls[2],type="b",lwd=1)
points(1:J,ypred,pch=20,col=cls[1],type="l",lwd=1.5)
abline(h = mean(y),col=cls[2],lty=1,lwd=1.25)










