###########################################################################
### PSQ4106864 DATAVIS
### A.Y. 2024/2025
### prof. Antonio Calcagni' (antonio.calcagni@unipd.it)
###########################################################################


### CONTENTS ###########################################
## (A) Visualizing categorical data (univariate I)
## (B) Color palettes
## (C) Visualizing continuous data (univariate I)
########################################################



# Set environment ---------------------------------------------------------
# NOTE(review): clearing the workspace and changing the working directory to
# an absolute, machine-specific path only works on the author's computer.
# Adapt the path (or open the course folder as a project) before running.
rm(list = ls())
setwd("/home/antonio/MEGA/Lavoro_sync/Didattica/2024_2025/datavis/")
# Helper script; presumably defines add_legend(), which is called later on.
source("labs/add_legend.R")

## Data import
# Each dataset is read from the first sheet of its workbook and its structure
# is printed with str() for a quick check of variable names and types.
datax <- xlsx::read.xlsx(file = "datasets/data_100.xls", sheetIndex = 1)
str(datax)
datax <- datax[, -1] # remove the first variable

ipsos <- xlsx::read.xlsx(file = "datasets/ipsos.xlsx", sheetIndex = 1)
str(ipsos)

health <- xlsx::read.xlsx(
  file = "datasets/healthcare_costs.xlsx",
  sheetIndex = 1
)
str(health)

survey <- read.csv(file = "datasets/company_data.csv", header = TRUE)
str(survey)

# The former self-assignments of survey$EatingOutside and
# survey$EatingInSameRestaurant were no-ops (each column was assigned to
# itself) and have been dropped; both columns are used exactly as imported.



# (A) Visualizing categorical data ----------------------------------------

# Absolute frequencies of 'Country' and the four colours reused in section (A)
x <- table(datax$Country)
print(x)
cls <- c("#8B4513", "#8B7355", "#53868B", "#B8860B")

## (A.1) A fast and simple h-plot
x11()
plot(
  x,
  type = "h",
  bty = "n",
  ylab = "Counts",
  lwd = 5,
  col = cls
)
# 'lwd' controls the thickness of the bars

## (A.2) Fast and simple barplots
# Two panels side by side: vertical bars on the left, horizontal on the right
x11()
par(mfrow = c(1, 2))

barplot(x, horiz = FALSE, col = cls, border = "white")
title(main = "Vertical barplot", adj = 0)

barplot(x, horiz = TRUE, col = cls, border = "white")
title(main = "Horizontal barplot", adj = 0)
# dotted reference line at the median of the counts (horizontal panel only)
abline(v = median(x), lty = 3, col = "gray", lwd = 3)

# Horizontal barplot of the ipsos percentages, labelled by country
x11()
barplot(
  ipsos$Percent,
  names.arg = ipsos$Country,
  horiz = TRUE,
  cex.names = 0.55
)


## (A.3) Stacked barplot
## This can be used when more than one measurement is available for each category of the variable
# Build a matrix of row-wise relative frequencies from the health data
X <- health[, 1:5]      # keep the first five columns of the dataset
rownames(X) <- X[, 1]   # store the first column as row names
X <- X[, -1]            # ...and drop it from the data
X <- as.matrix(X)       # from data frame to matrix
X <- X / rowSums(X)     # each row is divided by its own total

x11()
# Grouped bars: one group per column of X, one bar per row of X
barplot(X, beside = TRUE)
# Transposing the matrix swaps the roles: one group per row of X
barplot(t(X), beside = TRUE, col = cls)
#add_legend("bottom",fill = cls,legend = colnames(X),border = FALSE,bty = "n",ncol = 4)
# The legend can be placed at: bottomright, bottom, bottomleft, left,
# topleft, top, topright, right, center

# Stacked version of the transposed matrix (one stacked bar per row of X)
barplot(t(X), beside = FALSE, col = cls, border = "white")
add_legend(
  "top",
  fill = cls,
  legend = colnames(X),
  border = FALSE,
  bty = "n",
  ncol = 4
)


## (A.4) Another example for multiple response categorical variables
# Keep the 8th and 9th variables of the survey and tabulate each of them;
# the two frequency tables are stacked as the rows of Y.
X <- survey[, c(8, 9)]
Y <- rbind(
  table(X[, 1]),
  table(X[, 2])
)
rownames(Y) <- c("Eating-IN", "Eating-OUT")
print(Y)

# Turn counts into row-wise relative frequencies (each row sums to 1)
Y <- Y / rowSums(Y)

# One horizontal stacked bar per row of Y, segments coloured by category
x11()
barplot(
  t(Y),
  xlim = c(0, 1),
  horiz = TRUE,
  col = cls
)
add_legend(
  "top",
  fill = cls,
  legend = colnames(Y),
  border = FALSE,
  bty = "n",
  ncol = 4
)


## (A.5) Dotchart 
# Sort the rows of the dataset by 'Country'
ipsos <- ipsos[order(ipsos$Country), ]

x11()
dotchart(
  x = ipsos$Percent,
  labels = ipsos$Country,
  xlim = c(0, 100),
  frame.plot = FALSE,
  pch = 20
)
# dotted vertical guides at 25, 50 and 75
abline(v = c(25, 50, 75), lty = 3)
title(main = "Ipsos data")


## (A.6) Piechart
x11()
pie(
  x = ipsos$Percent,
  labels = ipsos$Country,
  radius = 1.0
)
# 'radius' sets the radius of the pie
# 'col' (not used here) sets the fill colours of the slices
title(main = "Ipsos data")



# (B) Color palettes ------------------------------------------------------

## Consider the ipsos dataset and, for instance, the piechart in section A.6.
## In this case, it could be difficult to set the slice colors manually (one needs to list 16 colors).
## However, lists of colors can be created using palettes, as those implemented in the 'RColorBrewer' library.
library(RColorBrewer)

# Show every palette available in the library
x11()
display.brewer.all()

# 'name' is the palette name taken from the list displayed above
cls <- RColorBrewer::brewer.pal(n = 16, name = "Set1")
# Note: the chosen palette does not contain 16 colors, so the full list has
# to be assembled manually from two different palettes.
cls1 <- RColorBrewer::brewer.pal(n = 9, name = "Set1")
cls2 <- RColorBrewer::brewer.pal(n = 16 - 9, name = "Set2")
cls <- c(cls1, cls2)
print(cls)

## The pie chart can now be colored with the vector 'cls' just created.
x11()
pie(
  x = ipsos$Percent,
  labels = ipsos$Country,
  radius = 0.9,
  col = cls
)



# (C) Visualizing continuous data -----------------------------------------

# Restore the four-colour vector (section B overwrote 'cls')
cls <- c("#8B4513", "#8B7355", "#53868B", "#B8860B")

## (C.1) Boxplot
# Layout matrix:  1 2
#                 1 3
# i.e. plot 1 spans the whole left column, plots 2 and 3 share the right one.
x11()
layout(
  matrix(c(1, 2, 1, 3), ncol = 2, byrow = TRUE),
  widths = c(1, 2),   # one entry per column of the layout matrix
  heights = c(1, 1)   # one entry per row of the layout matrix
)
# Note: 'widths' must have as many elements as the layout matrix has columns
#       and 'heights' as many as it has rows. (A third width was previously
#       supplied for this two-column matrix; it has been removed.)
boxplot(datax$Age, frame = FALSE)
boxplot(datax$Age[datax$Gender == "Male"], frame = FALSE, col = cls[1])
boxplot(datax$Age[datax$Gender == "Female"], frame = FALSE, col = cls[2])

# Both groups in the same panel, without and then with notches
x11()
boxplot(
  datax$Age[datax$Gender == "Male"],
  datax$Age[datax$Gender == "Female"],
  col = cls[1:2]
)
boxplot(
  datax$Age[datax$Gender == "Male"],
  datax$Age[datax$Gender == "Female"],
  col = cls[1:2],
  notch = TRUE
) # alternatively
# Notches are used in box plots to help visually assess whether the medians
# of distributions differ. If the notches do not overlap, this is evidence
# that the medians are different.

## Stacked or grouped boxplots can be obtained easily with the R formula
## syntax, which works best when the categorical variables used to group the
## continuous variable are coded as factors.
str(datax)
datax$Gender <- as.factor(datax$Gender)
datax$Country <- as.factor(datax$Country)

x11()
boxplot(Age ~ Gender, data = datax, frame = FALSE) # first subplot
x11()
boxplot(Age ~ Gender + Country, data = datax, frame = FALSE) # second subplot

## Boxplot with empirical data
# Group sizes and the Age values overall and by gender
n1 <- sum(datax$Gender == "Male")
n2 <- sum(datax$Gender == "Female")
y <- datax$Age
y_m <- datax$Age[datax$Gender == "Male"]
y_f <- datax$Age[datax$Gender == "Female"]

x11()
# Empty canvas with four x positions; the boxes are drawn at 2 and 3
plot(
  0, 0,
  xlim = c(1, 4), ylim = c(min(y), max(y)),
  bty = "n", axes = FALSE, xlab = "", ylab = ""
)
# lwd.ticks = 0 removes the tick marks on the x-axis
axis(side = 1, at = 1:4, labels = c("", "Male", "Female", ""), lwd.ticks = 0)
axis(
  side = 2,
  at = seq(min(y), max(y), length = 11),
  labels = round(seq(min(y), max(y), length = 11), 2)
)
# 'at' places the two boxes directly at x = 2 (Male) and x = 3 (Female); this
# replaces the former dummy first group and the stray 'data' argument, which
# boxplot() does not use when vectors are supplied. The male box is filled
# with cls[4] and the female box with cls[3].
# 'boxwex' reduces the size of the boxes.
boxplot(
  y_m, y_f,
  at = 2:3,
  frame = FALSE, border = "gray", col = cls[c(4, 3)],
  pars = list(boxwex = 0.5),
  add = TRUE, axes = FALSE
)
# Overlay the raw observations, the mean (red triangle) and a short red
# segment at the mean for each group
points(rep(2, n1), y_m, pch = 20)
points(2, mean(y_m), col = "red", pch = 17)
segments(
  x0 = 1.9, x1 = 2.1,
  y0 = mean(y_m), y1 = mean(y_m),
  col = "red", lwd = 2
)
points(rep(3, n2), y_f, pch = 20)
points(3, mean(y_f), col = "red", pch = 17)
segments(
  x0 = 2.9, x1 = 3.1,
  y0 = mean(y_f), y1 = mean(y_f),
  col = "red", lwd = 2
)


## (C.2) Histogram and Density plot
## The histogram is a way to get the distribution of the continuous variable by counting how many times
## the recorded values fall inside a set of (pre-defined or user-defined) bins.
## If pre-defined bins are used, then:
x11()
hist(datax$Age, xlab = "Age", ylab = "counts", main = "")
# Same call without drawing: this is what hist() computes internally
out <- hist(datax$Age, plot = FALSE)
print(out)
# 'out' is a list with the information needed to draw a histogram
# ('breaks' are the endpoints of the bins/classes on the x-axis).
# The number of bins can also be specified:
x11()
par(mfrow = c(1, 2))
hist(
  datax$Age,
  nclass = 4, # user-defined number of classes
  xlab = "Age", ylab = "counts", main = "fixed nclass"
)
hist(
  datax$Age,
  nclass = "sturges", # using Sturges' formula
  xlab = "Age", ylab = "counts", main = "sturges nclass"
)

## To compare the distribution over the levels of a categorical variable,
## the number of bins/classes should be fixed in advance (for the sake of
## comparison):
x11()
par(mfrow = c(1, 2))
hist(
  datax$Age[datax$Country == "France"],
  nclass = 8, xlab = "Age", ylab = "", main = "France"
)
hist(
  datax$Age[datax$Country == "United States"],
  nclass = 8, xlab = "Age", ylab = "", main = "US"
)

## Histograms can also be smoothed to get a density plot
# France and US side by side with a user-chosen bandwidth
x11()
par(mfrow = c(1, 2))
plot(
  density(datax$Age[datax$Country == "France"], bw = 1.25),
  xlab = "Age", ylab = "", main = "France", bty = "n"
)
plot(
  density(datax$Age[datax$Country == "United States"], bw = 1.25),
  xlab = "Age", ylab = "", main = "US", bty = "n"
)
# 'bw' is the smoothing bandwidth (if it is not specified by the user, it is
# computed internally by the function)

# density(x) implements an estimator of the density function of 'x'. The
# result varies with 'bw': as the value decreases, the smoothness decreases
# too. Below: one row per bandwidth (0.5, 1.25, 2.5), France on the left and
# US on the right; only the first row carries titles.
x11()
par(mfrow = c(3, 2))
local({
  bandwidths <- c(0.5, 1.25, 2.5)
  for (k in seq_along(bandwidths)) {
    first_row <- k == 1L
    plot(
      density(datax$Age[datax$Country == "France"], bw = bandwidths[k]),
      xlab = "Age", ylab = "",
      main = if (first_row) "France" else "",
      bty = "n"
    )
    plot(
      density(datax$Age[datax$Country == "United States"], bw = bandwidths[k]),
      xlab = "Age", ylab = "",
      main = if (first_row) "US" else "",
      bty = "n"
    )
  }
})

# ..overlapping densities
x11()
# density() evaluates each estimate on its own grid of x values, so every
# curve must be drawn against its own grid (x1 with y1, x2 with y2). Plotting
# both against one artificial common sequence would stretch and shift the
# curves along the Age axis.
out <- density(datax$Age[datax$Country == "France"])
x1 <- out$x
y1 <- out$y
out <- density(datax$Age[datax$Country == "United States"])
x2 <- out$x
y2 <- out$y

# Common axis limits so that both curves fit entirely in the same panel
xmin <- min(x1, x2)
xmax <- max(x1, x2)
ymin <- min(y1, y2)
ymax <- max(y1, y2)

plot(
  x1, y1,
  xlim = c(xmin, xmax), ylim = c(ymin, ymax),
  xlab = "Age", ylab = "", main = "France vs US",
  bty = "n", col = cls[1], lwd = 2, type = "l"
)
lines(x2, y2, col = cls[3], lwd = 2)
add_legend(
  "topright",
  fill = cls[c(1, 3)],
  legend = c("France", "US"),
  border = FALSE, bty = "n", ncol = 1
)

# ..overlapping densities with histograms
# Histogram on the density scale with the kernel density estimate on top,
# one panel per country. lines() only adds to an existing plot, so it gets
# the colour alone: axis labels, title and box type belong to hist() and
# would only raise "not a graphical parameter" warnings if passed to lines().
x11()
par(mfrow = c(1, 2))
hist(
  datax$Age[datax$Country == "France"],
  nclass = 8, xlab = "Age", ylab = "", main = "France",
  probability = TRUE, ylim = c(0, 0.1),
  border = "white", col = "#EEE8CD"
)
lines(
  density(datax$Age[datax$Country == "France"], bw = 1.25),
  col = "#D2691E"
)
hist(
  datax$Age[datax$Country == "United States"],
  nclass = 8, xlab = "Age", ylab = "", main = "US",
  probability = TRUE, ylim = c(0, 0.1),
  border = "white", col = "#EEE8CD"
)
lines(
  density(datax$Age[datax$Country == "United States"], bw = 1.25),
  col = "#D2691E"
)






