###########################################################################
### PSQ4106864 DATAVIS
### A.Y. 2024/2025
### prof. Antonio Calcagni' (antonio.calcagni@unipd.it)
###########################################################################


### CONTENTS ###########################################
## (A) Data types and structures
## (B) Vectors and matrices
## (C) Matrix operations
## (D) From matrices to dataframes
## (E) Basics of I/O operations
########################################################


# Set environment ---------------------------------------------------------
# Remove all (non-hidden) objects from the current workspace so the script starts from a clean state.
rm(list=ls())
# Set the working directory: the relative "datasets/..." paths read in Section (E) and the
# "dataout.*" files written there are resolved against this folder.
# NOTE(review): this is an absolute, machine-specific path -- adjust it before running the script elsewhere.
setwd("/home/antonio/MEGA/Lavoro_sync/Didattica/2024_2025/datavis/")



# (A) Data types and structures -------------------------------------------

#### (A.1) Data types ####
## Integers (the 'L' suffix marks an integer literal)
x <- 23L
typeof(x)
is.integer(x)

## Doubles/Reals
x <- 2.3
typeof(x)
is.double(x)

## Booleans (logical values)
x <- TRUE
typeof(x)
is.logical(x)

# Comparison operators return a logical value
x <- 23L
y <- 24L
z <- x == y  # is equal to
z <- x != y  # is not equal to
z <- x > y   # is greater than
z <- x >= y  # is greater than or equal to
z <- x < y   # is less than
z <- x <= y  # is less than or equal to

## Characters
x <- "hello"
typeof(x)
is.character(x)

## Complexes
x <- 2.3
z <- x + 1i  # adding an imaginary part turns the result into a complex number
typeof(z)
is.complex(z)


#### (A.2) Data structures ####
## Simple vectors
x <- c(1L, 2L, 8L)     # vector of integers
typeof(x)

x <- c(1.1, 0.2, 8.0)  # vector of doubles
typeof(x)

y <- c(2, 1)
z <- c(y, x)  # concatenate vectors
length(z)
y[1]          # access an element of a vector (positions start at 1)

z <- c(0.9, 0.2, 7.8)
x == z        # element-wise 'equal to'
# Note: all the comparison operators work element-wise on vectors in the same way

## Vectors as row/column matrices
x <- matrix(data = c(1, 2, 3), nrow = 3, ncol = 1)  # column vector (3 x 1)
x <- matrix(data = c(1, 2, 3), nrow = 1, ncol = 3)  # row vector (1 x 3)

## 2d Matrices
# Note: when creating a matrix, the number of elements in 'data' should be equal to 'nrow' * 'ncol'.
# If it is not, R silently recycles (or truncates) 'data' and recent versions of R issue a warning.
# A 3 x 3 matrix therefore needs nine values:
X <- matrix(data = c(1, 2, 3, 4, 5, 6, 7, 8, 9), nrow = 3, ncol = 3)                # matrix populated column-wise (default)
X <- matrix(data = c(1, 2, 3, 4, 5, 6, 7, 8, 9), nrow = 3, ncol = 3, byrow = TRUE)  # matrix populated row-wise
NROW(X)
NCOL(X)
dim(X)
X[1, 2]  # access an element of a matrix: row 1, column 2
X[3, 2]

z <- matrix(c(1, 2, 3), nrow = 1, ncol = 3)
X <- rbind(X, z)  # append a row vector to X (z must have as many columns as X)

z <- matrix(c(1, 2, 3, 4), nrow = 4, ncol = 1)
X <- cbind(X, z)  # append a column vector to X (z must have as many rows as X)

A <- matrix(data = 1:16, nrow = 4, ncol = 4)
rownames(A) <- paste0("unit", 1:4)           # specify row names (attributes)
colnames(A) <- c("V1", "V2", "V3", "V4")     # specify column names (attributes)
print(A)


## 3d Matrices (arrays)
X <- array(data = as.numeric(1:8), dim = c(2, 2, 2))
dim(X)
X[1, 2, 1]  # access a single element of a high-dim matrix: row 1, column 2, slice 1
X[1, 2, ]   # row 1, column 2 across all the slices
X[, , 1]    # the whole first slice (a 2 x 2 matrix)

## Lists
## A list can collect objects of different types and sizes (here: a vector, a matrix, a character vector).
x <- c(1, 2, 3)
X <- matrix(data = c(1, 2, 3, 4, 5, 6), nrow = 3, ncol = 2)  # 3 x 2: the six values fill the matrix exactly
y <- letters[1:10]
d <- list("first" = x, "second" = X, "third" = y)  # list with keys
str(d)
d$first        # access an element of a list (by key)
d[[1]]         # access an element of a list (by position)
d$first[2]     # second element of the first element of the list
d$second[, 1]  # first column of the second element of the list
# Note: 'second' is a 2d matrix, so it takes exactly two indices [row, column];
# a third index (e.g., [,,1]) is only valid for 3d arrays and would raise an error here.

d <- list(x, X, y)  # list with no keys
str(d)
# Note: in this case elements can be accessed by position only (no keys)

## Dataframes
## Essentially, as they are lists, their manipulation can be done as for the lists.
x <- rep(23L, length.out = 100)
y <- seq(from = 1, to = 10, length.out = 100)
z <- rep(c("a", "b"), each = 50)
q <- rep(c("M", "N", "O", "P"), each = 25)
datax <- data.frame(x, y, z, q)
str(datax)

# Variables containing characters (e.g., 'z') can be coded as 'factor'. This is a particular data type in R which allows
# for coding categorical variables. As soon as variables are coded as factors (instead of characters), the user inherits a set of
# specific functions which simplify the manipulation of this particular data type.
# For instance:
datax$z2 <- as.factor(datax$z)  # the new variable 'z2' is directly stored into 'datax'
str(datax)
# Alternatively (this also relabels the levels "a","b" as "A","B"):
datax$z2 <- factor(x = datax$z, labels = c("A", "B"))
str(datax)

# Categorical variables (factors) can be used to select subsets of a dataframe
datax[datax$z2 == "A", ]                     # select all the rows matching the criterion
datax[datax$z2 == "B", ]                     # select all the rows matching the criterion
datax[datax$z2 == "B" & datax$q == "O", ]    # AND: both criteria need to be matched!
datax[datax$z2 == "B" & datax$q != "O", ]    # AND: both criteria need to be matched!

# Removing rows or columns is easy in R
datax <- datax[-10, ]  # remove row number 10 (a negative position drops that row)
datax <- datax[, -3]   # remove column number 3 (variable 'z')

iid <- datax$q == "P"    # logical mask: TRUE for the rows matching the criterion
datax <- datax[!iid, ]   # keep the rows that do NOT match, i.e., remove those matching the criterion above
# Note: a logical mask has to be negated with '!'. Writing '-iid' would coerce TRUE/FALSE to -1/0
# and therefore drop the first row only, whatever the criterion.


## (A.3) Attributes
## R can attach metadata (qualitative information) to objects in the form of attributes.
## For instance:
x <- c(2, 3, 8)
attr(x, "metadata") <- "This is my first vector"  # create a metadata attribute
print(x)
attr(x, "metadata")  # retrieve the metadata (if any)

y <- c(2L, 1L)
attr(y, "metadata")  # NULL is returned: no metadata exists in this case

# Row names and column names of a matrix are stored as attributes too:
print(A)
str(A)  # the attribute is called 'dimnames' in this case
attr(A, "dimnames")



# (B) Vectors and Matrices ---------------------------------------------------
# The simplest way to define a vector in R is through the function c()
x <- c(2, 3, 1)
print(x)

y <- c(1, 1, 2)
z <- c(x, y)  # c() can also concatenate existing vectors into a longer one
print(z)

y <- c("a", "b", "pippo", "pluto")  # a vector of non-numeric (character) elements
print(y)

z <- c(x, y)  # concatenation of vectors holding elements of different types
print(z)      # in this case, the numeric elements of x are converted into characters
# Note: all the elements of a vector (or of a matrix) must be of the same type!

typeof(x)  # vector x is 'double', numeric
typeof(z)  # vector z is 'character'

x <- matrix(data = c(1, 1, 2, 3, 1, 5), ncol = 1)  # column vector (6 x 1)
print(x)

y <- matrix(data = c(1, 1, 2, 3, 1, 5), nrow = 1)  # row vector (1 x 6)
print(y)

# c() versus matrix(): c() is a general-purpose function that concatenates elements, even of different types
# (they are then converted to a common type, as shown above).
# Moreover, a vector built with c() has no dimensions: it is neither a row nor a column vector, and R treats it
# as one or the other depending on the context in which it is used (e.g., in a matrix product).

# Six values fill a 3 x 2 matrix exactly ('nrow' * 'ncol' equals the number of elements).
X <- matrix(data = c(1, 2, 3, 4, 5, 6), nrow = 3, ncol = 2, byrow = FALSE)
print(X)
# The parameter byrow=FALSE (default in R) populates matrix X column-wise: the first column is filled
# from top to bottom, then the second one, and so on.

X <- matrix(data = c(1, 2, 3, 4, 5, 6), nrow = 3, ncol = 2, byrow = TRUE)
print(X)
# The parameter byrow=TRUE populates matrix X row-wise: the first row is filled
# from left to right, then the second one, and so on.

# Counter-example: six values cannot fill a 4 x 2 matrix (eight cells). R does not stop:
# it recycles the values from the beginning of 'data' and issues a warning.
X <- matrix(data = c(1, 2, 3, 4, 5, 6), nrow = 4, ncol = 2)
# The number of rows (n) and columns (m) must match the total number of elements to be arranged!

x <- c(1, 4, 2, 1, 8, 7, 0, 1)
X <- matrix(data = x, nrow = 2)  # if not specified, the ncol parameter is automatically computed (given nrow)
print(X)

X <- matrix(data = x, ncol = 2)  # if not specified, the nrow parameter is automatically computed (given ncol)
print(X)

# Square 2 x 2 matrix built from the first four elements of x.
# Note: inside a function call, arguments are named with '='. Using '<-' instead would create variables
# called 'data', 'ncol' and 'nrow' in the workspace and pass the values by position (ignoring the names).
X <- matrix(data = x[1:4], nrow = 2, ncol = 2)
print(X)
diag(X)  # extracts the main diagonal if X is a matrix
diag(x)  # constructs a diagonal matrix with the elements of x if x is a vector!
# Note: be careful with diag(), its behavior depends on the input.

diag(diag(X))  # from inside out: the first diag() extracts the diagonal of X (output is a vector),
# the second diag() constructs a diagonal matrix with the previous output

set.seed(121)   # set the random seed (makes the random numbers below reproducible)
x <- runif(16)  # generate 16 uniformly distributed random numbers
X <- matrix(data = x, nrow = 4)
print(X)

t(X)  # transpose of X

X[lower.tri(X)]                # extracts the lower triangular elements of X (output is a vector)
matrixcalc::lower.triangle(X)  # extracts the lower triangular elements of X (output is a matrix)

X[upper.tri(X)]                # extracts the upper triangular elements of X (output is a vector)
matrixcalc::upper.triangle(X)  # extracts the upper triangular elements of X (output is a matrix)

sum(diag(X))                   # trace of X
matrixcalc::matrix.trace(X)    # trace of X

det(X)                         # determinant of X
matrixcalc::matrix.rank(X)     # rank of X
qr(X)$rank                     # rank of X

solve(X)                       # inverse of X
matrixcalc::matrix.inverse(X)  # inverse of X

x <- runif(9)
X <- matrix(data = x, nrow = 3)
# Mirror the upper triangle onto the lower one: t(X)[lower.tri(X)] picks, for every cell (i,j) below the
# diagonal, the element (j,i) above it. Copying X[upper.tri(X)] directly only pairs the cells correctly
# for matrices up to 3 x 3, because the two triangles are traversed in a different order.
X[lower.tri(X)] <- t(X)[lower.tri(X)]
matrixcalc::is.symmetric.matrix(X)  # the resulting matrix is symmetric



# (C) Matrix Operations ----------------------------------------------

# Random matrices used throughout this section
V <- matrix(data = runif(8), nrow = 2)   # 2 x 4
X <- matrix(data = runif(16), nrow = 4)  # 4 x 4
Y <- matrix(data = runif(16), nrow = 4)  # 4 x 4
# Note: arguments are named with '='; writing 'data <- runif(16)' inside the call would also
# create a variable called 'data' in the workspace.

Z <- X + Y         # sum (element-wise, same dimensions required)
Z <- X - Y         # difference (element-wise, same dimensions required)
print(Z)

Z <- X * Y         # element-wise product (Hadamard)
print(Z)

Z <- X %*% t(V)    # row-column product: (4 x 4) times (4 x 2) gives a 4 x 2 matrix
print(Z)

Z <- X %*% Y       # row-column product: (4 x 4) times (4 x 4)
print(Z)

Z <- X %*% t(Y)    # row-column product: (4 x 4) times (4 x 4)
print(Z)

Z <- V %*% Y       # row-column product: (2 x 4) times (4 x 4) gives a 2 x 4 matrix
print(Z)


v <- V[1, ]      # first row of V (a plain vector without dimensions)
Z <- X %*% v     # row-column product (v is treated as a column vector here)
print(Z)

a <- matrix(data = 1, nrow = NROW(X))  # column vector of ones
z <- X %*% a     # row sums of X (each row is summed across its columns)
# equivalent to: apply(X,1,sum) or rowSums(X)

z <- t(a) %*% X  # column sums of X (each column is summed across its rows)
# equivalent to: apply(X,2,sum) or colSums(X)

z <- t(a) %*% X %*% a  # sum of all the elements of X
# equivalent to: sum(X)

z <- t(v) %*% v  # sum of the squares of v
# equivalent to: sum(v^2)

Z <- v %*% t(v)  # outer product: symmetric matrix with diagonal equal to the square of v
# The function is called with its namespace prefix because the package matrixcalc
# is not attached via library() in this script.
matrixcalc::is.symmetric.matrix(Z)
diag(Z) == v^2

Z <- X %*% v %*% t(v) + solve(Y)  # multiple operations together
Z <- X %*% solve(Y)
Z <- t(v) %*% X - t(Y %*% v)


# (D) From Matrices to Dataframes ------------------------------------------

D <- data.frame(X)  # build a dataframe out of a numeric matrix
# Unlike a matrix, a dataframe can hold variables of different types side by side (e.g., strings, numbers).
# This data structure also offers more flexibility for several operations in R (e.g., constructing plots,
# using more statistical-specific functions).

x1 <- runif(n = 10, min = 0, max = 2)       # define a first variable
x2 <- x1 + runif(n = 10, min = 0, max = 2)  # define a second variable as a combination of the first and a uniform error
x3 <- x1 * 0.9 + x2 * 1.1                   # define a third variable as a linear combination of the first two

D <- data.frame(var1 = x1, var2 = x2, var3 = x3)
str(D)  # function used to display the 'structure' of the dataframe

D <- data.frame(pippo = x1, pluto = x2, paperino = x3)  # variable names are not a problem
str(D)

# Additionally, with a dataframe, it is possible to associate a string-type identifier with each row and each column.
# For example:
colnames(D) <- c("lupin", "zenigata", "margot")  # change the names of the columns/variables
rownames(D) <- paste0("sbj", 1:10)               # the function paste0() allows 'pasting' the string sbj to a sequence of ten integers
str(D)
head(D)

# Compared to matrices, an alternative way to access dataframe variables/columns is as follows:
D[, 1]   # extract the first column using an index
D$lupin  # extract the first column using an identifier (calling it by name)
# Note: the columns have just been renamed, so the previous names are gone: D$pippo would return NULL.

z <- 0.98 * D[, 1] - 2.1 * D[, 2]       # variable 'z' obtained as a linear combination of the first two variables of D
z <- 0.98 * D$lupin - 2.1 * D$zenigata  # equivalent syntax



# (E) Basics I/O operations -----------------------------------------------

## CSV (Comma-separated values files)
## Although standard .csv files use a comma "," to separate the values, other separators may be used instead,
## like a semicolon ";", a single space " ", or a tab "\t". Therefore, it is highly recommended to inspect the raw file
## outside the R environment by using a text editor (do not use specialized software, as it can modify the file implicitly).
datax <- read.csv(file = "datasets/data_anxiety.csv", header = TRUE, sep = ",")
# header=TRUE when the first row contains the variable names (otherwise, header=FALSE)
# Note: arguments are named with '='. Writing 'file <- ...', 'header <- ...' inside the call would create variables
# with those names in the workspace and pass the values by position.

str(datax)
# When importing an external data file, R returns a dataframe. In addition, categorical variables are encoded as chr by default.

## DAT (textual files)
## Files ending with .dat are usually plain-text files ('dat' stands for 'data'). As for the .csv files, it is highly
## recommended to inspect those files outside the R environment before importing them.
datax <- read.delim(
  file = "datasets/cps_85_wages.dat",
  header = FALSE,
  sep = "\t"
)
# sep="\t" declares the tab character as the separator between values

str(datax)

## XLS/XLSX (proprietary files)
## Proprietary files (Microsoft Excel, IBM SPSS) cannot be handled without external libraries. In the R environment,
## several libraries can be used for this purpose. For instance, see: https://www.datacamp.com/doc/r/importingdata
datax <- xlsx::read.xlsx(
  file = "datasets/data_100.xls",
  sheetIndex = 1
)
str(datax)


## JSON (JavaScript Object Notation)
## This is a lightweight, human-readable, text-based file format used for storing structured data, i.e., nested structures
## such as objects and arrays. Typically, those files have the .json extension and can be easily inspected with
## general-purpose text editors.

datax <- jsonlite::fromJSON("datasets/airports.json")
str(datax, max.level = 1)  # the output is a list
length(datax)              # in this case a very long list
datax[[1]]                 # inspect the first element of the list

## Converting JSON lists to dataframes can be a complicated task if the raw json file does not contain the same number
## of fields for each entry. For instance, consider the first three entries of the above dataset:
A <- data.frame(
  rbind(
    unlist(datax[[1]]),
    unlist(datax[[2]]),
    unlist(datax[[3]])
  )
)
head(A)
str(A)


## Exporting R objects outside the R environment

# Consider the matrix X (last assigned in Section (C))
print(X)

# Several output formats are available; the most basic ones are the native R formats (Rdata, Rda, rds) and csv.
# Native .Rdata/.Rda files are loaded back with the R function load(), whereas .rds files need to be loaded via readRDS().
save(X, file = "dataout.Rdata")            # .Rdata and .Rda are equivalent
write.csv(x = X, file = "dataout.csv")
saveRDS(object = X, file = "dataout.rds")  # stores a single R object (used when the input X is a large data structure)





















