library(tidyverse)
library(patchwork)

# NOTE(review): setwd() pins the session to a machine-specific directory; the
# file reads visible in this script already use absolute paths, so confirm it
# is still needed.
setwd(dir = '/Users/juusu53/Documents/projects/opetus/R_for_scientists/course_materials/data/')

## 3.1 Tidy data
# Q3.1.1
# Load the raw (not yet tidy) exercise data with readr's read_csv2.
data <- read_csv2(
  file = '/Users/juusu53/Documents/projects/opetus/R_for_scientists/course_materials/data/exercise_data_not_tidy.csv'
)
# Print the tibble to check the column names and types.
data

## wrangling the data
# Q3.1.2 
# I would argue that in this case, it makes most sense to have the food types each be in their own column like they were in the last set of exercises. 

# Q3.1.3
# pivot_wider (from tidyr) takes the data from long to wide: every distinct
# value of food_type becomes its own column, filled from consumption.
# It replaces the superseded spread(); the only visible difference is that the
# new columns appear in order of first appearance rather than sorted.
data <- data %>%
  pivot_wider(names_from = food_type, values_from = consumption)

## cleaning and manipulating the data
# Q3.1.4
# Per-column summary first, then index plots of age and BMI to spot
# implausible values by eye.
summary(data)
plot(data$Age_calculated)
plot(data$an_BMI)
# Looks like we have a few problems:
# - some subjects have age 99 (although we know these are supposed to be
#   children), so 99 is most likely a missing-value code
# - some subjects have BMI -1, which is not possible, so that is also likely
#   supposed to be NA
# - there are a number of 'NaN's (Not a Number); these should also be coded as NA

# Q3.1.5
# Reloading the data with na = 'NaN' makes readr parse the 'NaN' cells as NA.
# NOTE(review): passing `na` replaces readr's default missing-value strings;
# confirm the file has no empty or literal 'NA' cells that should also be NA.
data <- read_csv2(
  '/Users/juusu53/Documents/projects/opetus/R_for_scientists/course_materials/data/exercise_data_not_tidy.csv',
  na = 'NaN'
)
# Re-run the long-to-wide reshape on the freshly loaded data.
# pivot_wider() replaces the superseded spread().
data <- data %>%
  pivot_wider(names_from = food_type, values_from = consumption)

# mutate + na_if deal with the rest: the placeholder codes 99 (age) and
# -1 (BMI) are turned into NA.
data <- data %>%
  mutate(
    Age_calculated = na_if(Age_calculated, 99),
    an_BMI = na_if(an_BMI, -1)
  )

# Q3.1.6
# co_Gender and co_EDUC_fam_clas are numeric codes for categories, so turn
# both into labelled factors (1/2 -> female/male, 1/2/3 -> low/middle/high).
data <- mutate(
  data,
  co_Gender = factor(
    co_Gender,
    levels = c(1, 2),
    labels = c('female', 'male')
  ),
  co_EDUC_fam_clas = factor(
    co_EDUC_fam_clas,
    levels = c(1, 2, 3),
    labels = c('low', 'middle', 'high')
  )
)

# Q3.1.7
# Give the five descriptive columns shorter names; rename() takes
# new_name = old_name pairs and leaves every other column untouched.
data <- rename(
  data,
  subid = ID_number,
  sex = co_Gender,
  education = co_EDUC_fam_clas,
  age = Age_calculated,
  BMI = an_BMI
)

# Q3.1.8
# Add up the three bread types into a new column called 'bread'.
# The sum is NA for a subject if any of the three values is missing.
data <- mutate(data, bread = Whitebread + Wholemealbread + Ryebread)

# Q3.1.9
# Number of subjects in every sex x education combination (column `n`).
# count() gives the same counts as group_by() %>% tally() but returns an
# ungrouped tibble, so no leftover grouping by sex affects later operations.
counted_data <- data %>% count(sex, education)

## Note: I have split up the different steps of this process to make it easier to follow what happens where, but you could do the
## whole data cleaning bit in one go, like so:
## (The pipeline starts from a fresh read of the raw file: at this point `data` has already been reshaped and renamed by the
## step-by-step version, so columns such as food_type and ID_number no longer exist and re-running the steps on it would fail.)
data <- read_csv2(
  '/Users/juusu53/Documents/projects/opetus/R_for_scientists/course_materials/data/exercise_data_not_tidy.csv',
  na = 'NaN'
) %>%
  # long -> wide: one column per food type (pivot_wider replaces the superseded spread)
  pivot_wider(names_from = food_type, values_from = consumption) %>%
  # placeholder codes -> NA
  mutate(
    Age_calculated = na_if(Age_calculated, 99),
    an_BMI = na_if(an_BMI, -1)
  ) %>%
  # numeric category codes -> labelled factors
  mutate(
    co_Gender = factor(co_Gender, levels = c(1, 2), labels = c('female', 'male')),
    co_EDUC_fam_clas = factor(co_EDUC_fam_clas, levels = c(1, 2, 3), labels = c('low', 'middle', 'high'))
  ) %>%
  # shorter column names
  rename(
    subid = ID_number,
    sex = co_Gender,
    education = co_EDUC_fam_clas,
    age = Age_calculated,
    BMI = an_BMI
  ) %>%
  # total bread consumption
  mutate(bread = Whitebread + Wholemealbread + Ryebread)
## (you would still want to build your process step by step, making sure outcome after each step is what you want it to be)

## 3.2 ggplot2 + patchwork

# Q3.2.1
# Histogram of fresh-vegetable consumption with geom_histogram's default binning.
ggplot(data = data, mapping = aes(x = Freshveg)) +
  geom_histogram()

# Q3.2.2
# aes refers to aesthetics, ggplot's way of defining how to map data onto the plot
# if you skip the +, the latter lines won't be understood as part of the same plot call

# Q3.2.3
# Map sex to the fill colour so each sex gets its own set of bars.
ggplot(data = data, mapping = aes(x = Freshveg, fill = sex)) +
  geom_histogram()

# Q3.2.4
# position = 'identity' draws the groups over each other instead of using
# the default position.
ggplot(data = data, mapping = aes(x = Freshveg, fill = sex)) +
  geom_histogram(position = 'identity')

# Q3.2.5
# alpha sets the opacity: larger numbers make the colours more solid,
# smaller numbers make them more see-through.
ggplot(data = data, mapping = aes(x = Freshveg, fill = sex)) +
  geom_histogram(position = 'identity', alpha = 0.5)

# Q3.2.6
# Put density instead of raw counts on the y axis. after_stat(density) is the
# current spelling of the deprecated ..density.. notation.
ggplot(data, aes(x = Freshveg, fill = sex)) +
  geom_histogram(aes(y = after_stat(density)), position = 'identity', alpha = 0.5)

# Q3.2.7
# Same plot with a title and readable axis labels.
ggplot(data, aes(x = Freshveg, fill = sex)) +
  geom_histogram(aes(y = after_stat(density)), position = 'identity', alpha = 0.5) +
  labs(title = "Consumption of fresh vegetables", x = "Times per week", y = "Density")

# Q3.2.8
# Same plot again, now with the classic theme.
ggplot(data, aes(x = Freshveg, fill = sex)) +
  geom_histogram(aes(y = after_stat(density)), position = 'identity', alpha = 0.5) +
  labs(title = "Consumption of fresh vegetables", x = "Times per week", y = "Density") +
  theme_classic()

# Q3.2.9
# To remove the annoying NA bars, use na.omit(data) to only look at complete
# rows, i.e. omit any row with at least one missing value (in any column).
complete_rows <- na.omit(data)

# The red normal curve takes its mean and sd from the same complete rows that
# are plotted, so the reference curve matches the histogram. No na.rm is needed
# because na.omit() has already removed the missing values.
# sex is already a factor, so it is mapped to fill directly; this also gives
# the legend the title "sex".
ggplot(complete_rows, aes(x = Freshveg, fill = sex)) +
  geom_histogram(aes(y = after_stat(density)), position = "identity", alpha = 0.5) +
  labs(title = "Consumption of fresh vegetables", x = "Times per week", y = "Density") +
  geom_density(alpha = 0.2) +
  stat_function(
    fun = dnorm,
    n = 101,
    args = list(
      mean = mean(complete_rows$Freshveg),
      sd = sd(complete_rows$Freshveg)
    ),
    colour = 'red'
  ) +
  theme_classic(base_size = 20)

# Q3.2.10
# Build the vegetable-consumption figure from complete rows only and keep it
# in p1 so it can be saved and combined with other plots later.
complete_rows <- na.omit(data)

# Mean and sd of the red normal curve come from the same complete rows that
# are plotted; sex is already a factor, so it is mapped to fill directly.
p1 <- ggplot(complete_rows, aes(x = Freshveg, fill = sex)) +
  geom_histogram(aes(y = after_stat(density)), position = "identity", alpha = 0.5) +
  labs(title = "Consumption of fresh vegetables", x = "Times per week", y = "Density") +
  geom_density(alpha = 0.2) +
  stat_function(
    fun = dnorm,
    n = 101,
    args = list(
      mean = mean(complete_rows$Freshveg),
      sd = sd(complete_rows$Freshveg)
    ),
    colour = 'red'
  ) +
  theme_classic(base_size = 20) +
  theme(legend.position = 'bottom')

# Write the figure to a PDF. The explicit print() matters: a bare `p1` is only
# drawn by top-level auto-printing, which does not happen when the script is
# run through source(), and would leave the PDF without the plot.
pdf('/Users/juusu53/Documents/projects/opetus/R_for_scientists/course_materials/figures/veg_consumption.pdf')
print(p1)
dev.off()

# Q3.2.11
# Scatter plot of BMI against age, using complete rows only.
ggplot(data = na.omit(data), mapping = aes(x = age, y = BMI)) +
  geom_point()

# Q3.2.12
# Same scatter plot with a fitted linear-model line on top.
ggplot(data = na.omit(data), mapping = aes(x = age, y = BMI)) +
  geom_point() +
  geom_smooth(method = 'lm')

# Q3.2.13
# BMI against age, coloured by education level, with one linear fit per level
# and no confidence band. Stored in p2 for the combined layouts.
# se = FALSE is spelled out because the shorthand F is an ordinary variable
# that can be reassigned.
p2 <- ggplot(na.omit(data), aes(x = age, y = BMI, colour = education)) +
  geom_point(size = 2) +
  geom_smooth(method = 'lm', se = FALSE) +
  theme_classic() +
  theme(legend.position = 'bottom')

# Q3.2.16
# patchwork: `+` combines the two plots into one figure.
p1 + p2

# Q3.2.17
# `/` stacks them instead: two rows, one column.
p1 / p2

# Q 3.2.18
# For this purpose, maybe the two rows, one column layout worked better?
# The operators can also be nested to build more complex layouts; the
# parentheses only make the grouping explicit (`/` binds tighter than `+`).
p1 + (p2 / p2)
p1 / (p2 + p2)
 

# Created by Juulia Suvilehto
# juulia.suvilehto@iki.fi
#
# Creative Commons License