## 2.1 more data structures

# Q.2.1.1
# Three vectors of length four, each built a different way: the colon
# operator, random sampling and c(). Other generators such as seq() or
# runif() would do the job just as well.
x <- 5:8
y <- sample(10:100, size = 4)
z <- c(1, 26, 3, 88)

# Q2.1.2
# cbind() glues the vectors together as the columns of a matrix; each column
# is named after the vector it came from.
mat <- cbind(x = x, y = y, z = z)

# Q2.1.3
# Two-dimensional structures are indexed [row, column].
# Because the columns are named, the printed value is labelled with the name
# of the vector it came from.
print(mat[2, 3])

# Q2.1.4
# One-liner: generate nine uniform random numbers and shape them as 3 x 3.
mat2 <- matrix(runif(9), 3, 3)

# Equally valid two-step version: generate the numbers first ...
randnumbers <- runif(9)
# ... then reshape the vector (three columns of nine values means three rows).
other_mat2 <- matrix(data = randnumbers, ncol = 3)

# Q2.1.5
# Index vectors inside the square brackets pull out a sub-matrix (or a
# vector) rather than a single number.
print(mat2[c(2, 3), c(2, 3)])

# Q2.1.6
thisisalist <- list(mat, mat2)

# Q2.1.7
# Lists need double brackets: [[1]] picks the first list element (the
# matrix), and [1, 1] then picks the top-left cell of that matrix.
thisisalist[[1]][1, 1]

## 2.2 Working with data

# Q2.2.2
# The file uses ';' as the field separator and ',' as the decimal mark, which
# are exactly the defaults of read.csv2().
data <- read.csv2('/Users/juusu53/Documents/projects/opetus/R_for_scientists/course_materials/data/exercise_data.csv')

# Q2.2.3
head(data)

# Q2.2.4
# the second argument (n) sets how many rows are printed
tail(data, 10)

# Q2.2.5
summary(data)

# Q2.2.6
# NB: in a real script, load every package at the very top so that a reader
# can see straight away what is needed to run it. The call sits here only to
# show the point in the exercises at which the package is first needed.
library(psych)
describe(data)

# Q2.2.7
data[7, ]

# Q2.2.8
data$co_Gender

# Q2.2.9
# row 7 has a lot of missing values (NA)
# is.na() marks each cell as missing (TRUE) or not (FALSE); since TRUE counts
# as 1 and FALSE as 0, sum() gives the number of missing values ...
row7_is_missing <- is.na(data[7, ])
sum(row7_is_missing)
# ... and negating the mask first counts the values that are present
sum(!row7_is_missing)

# gender is shown as numeric, although logically it is a categorical variable

# Q2.2.10
# Recode the numeric codes as labelled factors.
data$co_Gender <- factor(
  data$co_Gender,
  levels = 1:2,
  labels = c("female", "male")
)
data$co_EDUC_fam_clas <- factor(
  data$co_EDUC_fam_clas,
  levels = 1:3,
  labels = c("low", "middle", "high")
)

# Q2.2.11
summary(data)
# the columns that were turned into factors now show a count per level
# instead of the mean and other numeric summaries

# Q2.2.12
# Select several columns at once by passing a vector of column names.
data[c("Age_calculated", "co_Gender", "co_EDUC_fam_clas", "Healthy_pattern")]

# Q2.2.13
# The plots look different because one has a continuous variable on the x
# axis and the other a categorical one, so R picks a different default plot
# type for each.

# Q2.2.14-Q.2.2.18
# Everything drawn between pdf() and dev.off() is written to the PDF file.
pdf("/Users/juusu53/Documents/projects/opetus/R_for_scientists/course_materials/data/sample_plot_1.pdf")
hist(
  data$Freshveg,
  breaks = 0:35,
  main = "Fresh vegetable use",
  xlab = "Fresh vegetable consumption frequency (times per week)",
  ylab = "Number of participants",
  col = "grey"
)
dev.off()

# Q2.2.20
# the code in the exercise plots two data series together on the same axes
# add = TRUE tells R to draw on top of the existing plot instead of starting
# a new one
# freq toggles between frequency (number of observations) and probability
# density; this matters most when the samples differ a lot in size (here they
# are fairly equal, so it is not strictly necessary)

# Scatter plot of BMI against age. Each point is coloured by the level of
# co_EDUC_fam_clas: first level cyan, second blue, third darkblue.
plot(
  data$Age_calculated, data$an_BMI,
  pch = 19,
  col = c("cyan", "blue", "darkblue")[as.integer(data$co_EDUC_fam_clas)],
  xlab = "age",
  ylab = "BMI",
  main = "Age vs BMI"
)

## 2.3 Conditional statements

# Q2.3.2
# For (up to) the first 20 subjects, print how their chocolate consumption
# compares with the sample mean. Missing values are ignored when the mean is
# computed and are reported separately for each subject.
avg_choc <- mean(data$Chocolate, na.rm = TRUE)
# min() keeps the loop from running past the last row when the data set has
# fewer than 20 rows; to go over the whole data set, loop over
# seq_len(nrow(data)) instead.
for (row in seq_len(min(20, nrow(data)))) {
  if (is.na(data$Chocolate[row])) {
    # The NA check must come first: comparing NA with == or < yields NA,
    # which if() cannot evaluate.
    cat(paste0("We do not know if subject ", data$ID_number[row], " eats chocolate.\n"))
  } else if (data$Chocolate[row] == 0) {
    cat(paste0("Subject ", data$ID_number[row], " does not eat chocolate.\n"))
  } else if (data$Chocolate[row] < avg_choc) {
    cat(paste0("Subject ", data$ID_number[row], " eats less than average amount of chocolate per week.\n"))
  } else {
    # Values exactly equal to the mean also end up in this branch.
    cat(paste0("Subject ", data$ID_number[row], " eats more than average amount of chocolate per week.\n"))
  }
}

# Q2.3.3
# Print one line per subject. seq_len() yields an empty sequence for a data
# frame with zero rows, whereas 1:nrow(data) would count down 1, 0 and print
# two bogus lines.
for (row in seq_len(nrow(data))) {
  cat(paste0("This is subject number ",
             data$ID_number[row], ".\n"))
}


# Q2.3.4
# Print each subject's number and age, choosing the pronoun from the
# co_Gender factor; a missing gender falls back to the neutral 'They are'.
# seq_len() makes the loop a no-op when the data frame has zero rows.
for (row in seq_len(nrow(data))) {
  if (is.na(data$co_Gender[row])) {
    # Test for NA first: NA == 'female' is NA, which if() cannot evaluate.
    pronoun <- 'They are '
  } else if (data$co_Gender[row] == 'female') {
    pronoun <- 'She is '
  } else {
    pronoun <- 'He is '
  }
  cat(paste0("This is subject number ", data$ID_number[row], '. ', pronoun, data$Age_calculated[row], ' years old.\n'))
}
 

# Created by Juulia Suvilehto
# juulia.suvilehto@iki.fi
#
# Creative Commons License