###########################################################################
### PSQ4106864 DATAVIS
### A.Y. 2024/2025
### prof. Antonio Calcagni' (antonio.calcagni@unipd.it)
###########################################################################


### CONTENTS ###########################################
## (A) Visualizing categorical data (bivariate)
## (B) Visualizing continuous data (bivariate)
## (C) Visualizing multivariate data
########################################################



# Set environment ---------------------------------------------------------
# NOTE(review): wiping the workspace and hard-coding an absolute working
# directory make the script machine-specific; both are kept here because every
# relative path below ("labs/...", "datasets/...") is resolved against it.
rm(list = ls())
setwd("/home/antonio/MEGA/Lavoro_sync/Didattica/2024_2025/datavis/")
source("labs/add_legend.R")                 # provides add_legend()
source("labs/generate_color_gradient.R")    # provides generate_color_gradient()


## Data import
survey <- read.csv(file = "datasets/company_data.csv", header = TRUE)
str(survey)

bli <- xlsx::read.xlsx(file = "datasets/betterLifeIndex.xls", sheetIndex = 1)
str(bli)

grades <- xlsx::read.xlsx(file = "datasets/grades.xlsx", sheetIndex = 1)
str(grades)

matches <- read.delim(file = "datasets/matches.txt", header = FALSE, sep = "\t")
str(matches)
head(matches)
tail(matches)
# The data frame created above contains an empty trailing row that needs to be
# removed. Rows are dropped by content (every cell NA or blank) rather than by
# a hard-coded position, so the step stays correct if the file changes and is
# harmless when executed more than once.
is_blank_row <- Reduce(`&`, lapply(matches, function(column) {
  is.na(column) | !nzchar(trimws(as.character(column)))
}))
matches <- matches[!is_blank_row, , drop = FALSE]


# (A) Visualizing categorical data ----------------------------------------

## Consider the table formed by counting two variables jointly.
## table() has no 'formula' argument: a named argument only becomes the label
## of the corresponding dimension, so each dimension is named after its variable.
X <- table(Gender = survey$Gender, Occupation = survey$Occupation)
print(X)

## A fast and simple way to visualize the joint table is to use the so-called mosaic plot.
## Axis titles are suppressed explicitly; the main title already names the variables.
x11()
mosaicplot(X, main = "Gender and Occupation", col = "#EED5B7", border = "black",
           xlab = "", ylab = "")

## An example with three variables
X <- table(Gender = survey$Gender,
           Occupation = survey$Occupation,
           EatingOutside = survey$EatingOutside)
print(X)

x11()
mosaicplot(X, main = "Gender, Occupation, Eating outside", col = "#EED5B7", border = "black",
           xlab = "", ylab = "")


# (B) Visualizing continuous data -----------------------------------------

# Two-colour palette reused by the grouped scatter plots below
# (first entry: EU == 0, second entry: EU == 1).
cls <- c("#A52A2A", "#66CD00")

## A simple and fast way to plot two series of data:
x11()
with(bli, {
  # Scatter plot of life expectancy against self-reported health
  plot(x = Self.reported.health, y = Life.expectancy,
       bty = "n", xlab = "health", ylab = "life exp", pch = 20)
  # Dashed reference lines at the two sample means
  abline(h = mean(Life.expectancy), v = mean(Self.reported.health), lty = 2)
})


## ..adding a third variable (categorical)
x11(bg = "white")
local({
  health <- bli$Self.reported.health
  life_exp <- bli$Life.expectancy

  # Empty canvas spanning the observed ranges (the dummy point is drawn in white)
  plot(0, 0, bty = "n", xlab = "health", ylab = "life exp", col = "white",
       xlim = range(health), ylim = range(life_exp))
  # Dashed reference lines at the two sample means
  abline(h = mean(life_exp), v = mean(health), lty = 2)

  # One layer of points per group, each with its own colour from 'cls'
  non_eu <- bli$EU == 0
  eu <- bli$EU == 1
  points(x = health[non_eu], y = life_exp[non_eu], pch = 20, cex = 2, col = cls[1])
  points(x = health[eu], y = life_exp[eu], pch = 20, cex = 2, col = cls[2])
})

## ..adding additional details 1
x11(bg = "white")
local({
  health <- bli$Self.reported.health
  life_exp <- bli$Life.expectancy
  health_rng <- range(health)
  life_rng <- range(life_exp)
  health_avg <- mean(health)
  life_avg <- mean(life_exp)

  # Empty canvas spanning the observed ranges (the dummy point is drawn in white)
  plot(0, 0, bty = "n", xlab = "health", ylab = "life exp", col = "white",
       xlim = health_rng, ylim = life_rng)

  # Shade the quadrant above both means in green ...
  rect(xleft = health_avg, xright = health_rng[2],
       ybottom = life_avg, ytop = life_rng[2],
       col = "#00EE00", border = NA)
  # ... and the quadrant below both means in red
  rect(xleft = health_rng[1], xright = health_avg,
       ybottom = life_rng[1], ytop = life_avg,
       col = "#CD2626", border = NA)
  abline(h = life_avg, v = health_avg, lty = 2)

  # Groups are told apart by symbol: squares for EU == 0, triangles for EU == 1
  non_eu <- bli$EU == 0
  eu <- bli$EU == 1
  points(x = health[non_eu], y = life_exp[non_eu], pch = 0, cex = 2)
  points(x = health[eu], y = life_exp[eu], pch = 2, cex = 2)
})
add_legend("bottomright", pch = c(0, 2), legend = c("extra EU", "EU"),
           border = FALSE, bty = "n", ncol = 2)

## ..adding additional details 2
x11(bg = "white")
local({
  health <- bli$Self.reported.health
  life_exp <- bli$Life.expectancy

  # Empty canvas; both axes are padded by one unit on each side
  plot(0, 0, bty = "n", xlab = "health", ylab = "life exp", col = "white",
       xlim = range(health) + c(-1, 1), ylim = range(life_exp) + c(-1, 1))
  abline(h = mean(life_exp), v = mean(health), lty = 2)

  # Groups differ both in symbol and in colour
  non_eu <- bli$EU == 0
  eu <- bli$EU == 1
  points(x = health[non_eu], y = life_exp[non_eu], pch = 0, cex = 2, col = cls[1])
  points(x = health[eu], y = life_exp[eu], pch = 2, cex = 2, col = cls[2])

  # Country codes below each symbol; 'offset' keeps the text from overlapping the symbols
  text(health, life_exp, labels = bli$ISO3, cex = 0.9, pos = 1, offset = 0.5)
})
add_legend("bottomright", col = cls, pch = c(0, 2), legend = c("extra EU", "EU"),
           border = FALSE, bty = "n", ncol = 2)

## ..adding a third variable (numeric) 1
# Order the observations by employment rate before building the colour gradient.
bli_ordered <- bli[order(bli$Employment.rate), ]
# NOTE(review): generate_color_gradient() presumably returns one colour per value;
# sort() drops NAs, so this assumes Employment.rate has no missing values
# (otherwise 'cls' would be shorter than nrow(bli_ordered)) - confirm.
cls <- generate_color_gradient(values = sort(bli_ordered$Employment.rate),
                               colors = c("red", "yellow", "springgreen", "royalblue"))
print(cls)
x11(); plot(seq_along(cls), seq_along(cls), col = cls, pch = 20) #demonstrative example of extracted colors

x11(bg = "white")
plot(0, 0, bty = "n", xlab = "health", ylab = "life exp", col = "white",
     xlim = c(min(bli$Self.reported.health), max(bli$Self.reported.health)),
     ylim = c(min(bli$Life.expectancy), max(bli$Life.expectancy)))
abline(h = mean(bli$Life.expectancy), v = mean(bli$Self.reported.health), lty = 2)
# The group masks must come from 'bli_ordered' (not 'bli'): coordinates, colours and
# masks all have to refer to the same row order, otherwise points get the wrong
# symbol and the wrong colour.
points(x = bli_ordered$Self.reported.health[bli_ordered$EU == 0],
       y = bli_ordered$Life.expectancy[bli_ordered$EU == 0],
       pch = 15, cex = 2, col = cls[bli_ordered$EU == 0])
points(x = bli_ordered$Self.reported.health[bli_ordered$EU == 1],
       y = bli_ordered$Life.expectancy[bli_ordered$EU == 1],
       pch = 17, cex = 2, col = cls[bli_ordered$EU == 1])
text(bli_ordered$Self.reported.health, bli_ordered$Life.expectancy,
     labels = bli_ordered$ISO3, cex = 0.9, pos = 1, offset = 0.5) #'offset' is important to not overlap texts with symbols
# Legend symbols match the ones actually drawn (filled square / filled triangle)
add_legend("bottomright", pch = c(15, 17), legend = c("extra EU", "EU"),
           border = FALSE, bty = "n", ncol = 2)

## ..adding a third variable (numeric) 2
bli_ordered <- bli[order(bli$Employment.rate), ] #order the observations according to a given variable
# Radius of the circles (each circle is an observation): taking the square root
# makes the circle area, rather than its radius, grow with the employment rate.
rads <- sqrt(bli_ordered$Employment.rate)
# Circles share the same symbol, so the two groups are told apart by colour.
grp_cols <- c("#A52A2A", "#66CD00")

x11(bg = "white")
plot(0, 0, bty = "n", xlab = "health", ylab = "life exp", col = "white",
     xlim = c(min(bli$Self.reported.health) - 2, max(bli$Self.reported.health) + 2),
     ylim = c(min(bli$Life.expectancy) - 2, max(bli$Life.expectancy) + 2))
abline(h = mean(bli$Life.expectancy), v = mean(bli$Self.reported.health), lty = 2)
# Masks are taken from 'bli_ordered' so that coordinates and sizes refer to the
# same rows; sizes use 'rads' instead of the raw rate, which would draw circles
# far larger than the plotting region.
points(x = bli_ordered$Self.reported.health[bli_ordered$EU == 0],
       y = bli_ordered$Life.expectancy[bli_ordered$EU == 0],
       pch = 1, cex = rads[bli_ordered$EU == 0], col = grp_cols[1])
points(x = bli_ordered$Self.reported.health[bli_ordered$EU == 1],
       y = bli_ordered$Life.expectancy[bli_ordered$EU == 1],
       pch = 1, cex = rads[bli_ordered$EU == 1], col = grp_cols[2])
# Labels are centred on the circles here (offset = 0)
text(bli_ordered$Self.reported.health, bli_ordered$Life.expectancy,
     labels = bli_ordered$ISO3, cex = 0.9, pos = 1, offset = 0)
# Legend symbols and colours match what is drawn above
add_legend("bottomright", col = grp_cols, pch = c(1, 1), legend = c("extra EU", "EU"),
           border = FALSE, bty = "n", ncol = 2)



# (C) Visualizing multivariate data ---------------------------------------

## (C.1) Heatmaps
# A heatmap displays the individual values of a matrix as colors.
# No rule prescribes how rows or columns should be arranged. When their order carries
# no meaning, a clustering method can be used to place 'similar' rows and/or columns
# next to each other.
library(pheatmap)
library(RColorBrewer)

# Keep the grade columns only: drop the first column (used as row labels) and the last one
grade_cols <- 2:(ncol(grades) - 1)
X <- as.matrix(grades[, grade_cols])
rownames(X) <- grades[[1]]
head(X)

# Default visualization with no clustering information
x11()
pheatmap(mat = X, cluster_rows = FALSE, cluster_cols = FALSE)

# Change the color palette (example using brewer.pal)
spectral6 <- brewer.pal(6, "Spectral")
x11()
pheatmap(mat = X, cluster_rows = FALSE, cluster_cols = FALSE, color = spectral6)

# In this case, rows are re-ordered based on the clustering information
x11()
pheatmap(mat = X, cluster_rows = TRUE, cluster_cols = FALSE, color = spectral6)

## Heatmaps are quite useful in visualizing correlation matrices:
X <- bli[, 4:23] # NOTE(review): assumes columns 4 to 23 are the numeric indicators - confirm
Xcor <- cor(X, use = "complete.obs") #only rows without any NA are used
print(Xcor)

x11()
# No clustering here: rows and columns keep their original order.
# 'breaks' must be one element longer than 'color' (n breaks delimit n-1 bins),
# so the four correlation bins get exactly four colors.
pheatmap(mat = Xcor, cluster_rows = FALSE, cluster_cols = FALSE,
         breaks = c(-1, -0.5, 0, 0.5, 1), color = brewer.pal(4, "Spectral"))


## (C.2) Networks (undirected)
library(igraph)

## Consider the dataframe 'matches'. It contains all the matches played between the teams of the first column and those of the second column.
# As it contains all the matches that have been played from 2013 to 2016, repetitions are allowed.
# In network terminology, each team is a node of the network and each match (connection) is an edge.

gr <- igraph::graph_from_data_frame(d = matches, directed = FALSE) #create a graph from a dataframe containing links/edges
x11(); plot(gr) #first and simple plot

# However, very often raw data are represented as an adjacency matrix like this one.
# as_adjacency_matrix() replaces the deprecated get.adjacency(); sparse = FALSE
# returns an ordinary dense matrix directly.
A <- igraph::as_adjacency_matrix(graph = gr, type = "both", sparse = FALSE)
print(A)
# In this case the graph is rebuilt from the matrix; with weighted = TRUE the
# matrix entries are stored as edge weights:
gr <- igraph::graph_from_adjacency_matrix(adjmatrix = A, mode = "undirected", weighted = TRUE)
x11(); plot(gr) #first and simple plot

# By default a network graph is drawn with a layout algorithm that is chosen
# automatically from the size and connectedness of the graph (see the examples above).
# Several other algorithms are available, and their differences show up
# especially on very large networks.

# Here are some very simple ways to lay out a graph:
g0_l1 <- layout_as_star(graph = gr, center = 10) #star
g0_l2 <- layout_as_tree(graph = gr, root = 10)   #tree
g0_l3 <- layout_in_circle(graph = gr)            #circle
g0_l4 <- layout_on_grid(graph = gr)              #grid

# Draw the four layouts on a 2x2 grid (filled row by row), each titled with its name
x11()
par(mfrow = c(2, 2))
simple_layouts <- list(star = g0_l1, tree = g0_l2, circle = g0_l3, grid = g0_l4)
for (layout_name in names(simple_layouts)) {
  plot(gr, layout = simple_layouts[[layout_name]], main = layout_name)
}

# Other methods implement theoretically grounded procedures. For instance:

# Fruchterman-Reingold algorithm:
# A force-directed layout method that positions nodes according to simulated physical forces.
# Nodes repel each other, while edges act like springs pulling connected nodes together.
# Node positions are adjusted iteratively to minimize energy, which yields a
# well-distributed and readable layout.
g0_lfr <- layout_with_fr(gr)
x11()
plot(gr, layout = g0_lfr)









