library(tidyverse)
library(patchwork)
# Use the course data folder as the working directory for this session.
setwd("/Users/juusu53/Documents/projects/opetus/R_for_scientists/course_materials/data/")
## 3.1 Tidy data
# Q3.1.1
# read_csv2() reads delimited files that use ';' between fields and ',' as the
# decimal mark, and returns a tibble.
data <- read_csv2(
  file = "/Users/juusu53/Documents/projects/opetus/R_for_scientists/course_materials/data/exercise_data_not_tidy.csv"
)
# Typing the name of the tibble prints it, so we can inspect what was read.
data
## wrangling the data
# Q3.1.2
# I would argue that in this case it makes most sense to give each food type
# its own column, like they had in the last set of exercises.
# Q3.1.3
# pivot_wider() (from tidyr) takes the data from long to wide: every distinct
# value of food_type becomes a column holding the matching consumption value.
# It is the current replacement for the superseded spread(); names_sort = TRUE
# keeps the new columns in alphabetical order, which is what spread() produced.
data <- data %>%
  pivot_wider(
    names_from = food_type,
    values_from = consumption,
    names_sort = TRUE
  )
## cleaning and manipulating the data
# Q3.1.4
# summary() prints a per-column overview; plotting a single numeric column
# draws its values against row index, which makes odd values easy to spot.
summary(data)
plot(data$Age_calculated)
plot(data$an_BMI)
# Looks like we have a few problems:
# - some subjects have age 99 (although we know these are supposed to be
#   children), so 99 is most likely a code for a missing value
# - some subjects have BMI -1, which is not possible, so that is also likely
#   supposed to be NA
# - there are a number of 'NaN's (for Not a Number); these should also be
#   coded as NA
# Q3.1.5
# Reloading the data with "NaN" declared as a missing-value marker fixes the
# NaN issue. The `na` argument replaces readr's default markers ("" and "NA")
# rather than adding to them, so the defaults are listed explicitly as well.
data <- read_csv2(
  "/Users/juusu53/Documents/projects/opetus/R_for_scientists/course_materials/data/exercise_data_not_tidy.csv",
  na = c("", "NA", "NaN")
)
# Re-run the long-to-wide reshaping on the freshly loaded data. pivot_wider()
# replaces the superseded spread(); names_sort = TRUE keeps the food columns
# in alphabetical order.
data <- data %>%
  pivot_wider(
    names_from = food_type,
    values_from = consumption,
    names_sort = TRUE
  )
# mutate() + na_if() deal with the rest: age 99 and BMI -1 are placeholder
# codes, so they are replaced by NA.
data <- data %>%
  mutate(
    Age_calculated = na_if(Age_calculated, 99),
    an_BMI = na_if(an_BMI, -1)
  )
# Q3.1.6 and Q3.1.7, done as one chain.
data <- data %>%
  # Q3.1.6
  # co_Gender and co_EDUC_fam_clas hold numeric codes for categories, so both
  # are converted to factors with readable labels.
  mutate(
    co_Gender = factor(
      co_Gender,
      levels = c(1, 2),
      labels = c("female", "male")
    ),
    co_EDUC_fam_clas = factor(
      co_EDUC_fam_clas,
      levels = c(1, 2, 3),
      labels = c("low", "middle", "high")
    )
  ) %>%
  # Q3.1.7
  # rename(new = old) gives ID_number, co_Gender, co_EDUC_fam_clas,
  # Age_calculated and an_BMI shorter names.
  rename(
    subid = ID_number,
    sex = co_Gender,
    education = co_EDUC_fam_clas,
    age = Age_calculated,
    BMI = an_BMI
  )
# Q3.1.8
# Sum the values for the different types of bread into a new column called
# 'bread'. Note that + propagates NA: bread is NA for a subject whenever any
# of the three bread columns is missing.
data <- data %>%
  mutate(bread = Whitebread + Wholemealbread + Ryebread)
# Q3.1.9
# count() returns the number of subjects in every sex x education combination
# (column n). Unlike group_by() %>% tally(), the result is not left grouped,
# so later verbs applied to counted_data behave as on any plain tibble.
counted_data <- data %>%
  count(sex, education)
## Note: I have split up the different steps of this process to make it easier to follow what happens where, but you could do the
## whole data cleaning bit in one go, like so.
## The chain has to start from the raw file: by this point in the script `data` has already been reshaped and renamed,
## so food_type, consumption and the original column names no longer exist in it and reshaping it again would fail.
data <- read_csv2(
  "/Users/juusu53/Documents/projects/opetus/R_for_scientists/course_materials/data/exercise_data_not_tidy.csv",
  # `na` replaces readr's default markers, so "" and "NA" are listed next to "NaN"
  na = c("", "NA", "NaN")
) %>%
  # long to wide: one column per food type (pivot_wider() replaces the superseded spread())
  pivot_wider(
    names_from = food_type,
    values_from = consumption,
    names_sort = TRUE
  ) %>%
  # placeholder codes (age 99, BMI -1) become NA
  mutate(
    Age_calculated = na_if(Age_calculated, 99),
    an_BMI = na_if(an_BMI, -1)
  ) %>%
  # numeric category codes become labelled factors
  mutate(
    co_Gender = factor(co_Gender, levels = c(1, 2), labels = c("female", "male")),
    co_EDUC_fam_clas = factor(co_EDUC_fam_clas, levels = c(1, 2, 3), labels = c("low", "middle", "high"))
  ) %>%
  # shorter column names
  rename(
    subid = ID_number,
    sex = co_Gender,
    education = co_EDUC_fam_clas,
    age = Age_calculated,
    BMI = an_BMI
  ) %>%
  # total bread consumption
  mutate(bread = Whitebread + Wholemealbread + Ryebread)
## (you would still want to build your process step by step, making sure the outcome after each step is what you want it to be)
## 3.2 ggplot2 + patchwork
# Q3.2.1
# Histogram of Freshveg with the default binning.
data %>%
  ggplot(mapping = aes(x = Freshveg)) +
  geom_histogram()
# Q3.2.2
# aes() is short for aesthetics: it is how ggplot is told which column of the
# data is mapped onto which property of the plot.
# The + at the end of a line is what ties the next line to the same plot call;
# without it, the following lines are not understood as part of the plot.
# Q3.2.3
# Same histogram, with the bars filled according to sex.
data %>%
  ggplot(mapping = aes(x = Freshveg, fill = sex)) +
  geom_histogram()
# Q3.2.4
# position = "identity" draws the bars of each sex from the baseline, so they
# overlap instead of being stacked on top of each other.
data %>%
  ggplot(mapping = aes(x = Freshveg, fill = sex)) +
  geom_histogram(position = "identity")
# Q3.2.5
# alpha sets the opacity of the bars: larger numbers make the colours more
# solid, smaller numbers make them more see-through.
data %>%
  ggplot(mapping = aes(x = Freshveg, fill = sex)) +
  geom_histogram(position = "identity", alpha = 0.5)
# Q3.2.6
# after_stat(density) maps the density computed by the histogram stat (instead
# of the raw count) onto the y axis. It replaces the ..density.. notation,
# which is deprecated since ggplot2 3.4.0.
ggplot(data, aes(x = Freshveg, fill = sex)) +
  geom_histogram(
    aes(y = after_stat(density)),
    position = "identity",
    alpha = 0.5
  )
# Q3.2.7
# labs() sets the plot title and the axis labels.
ggplot(data, aes(x = Freshveg, fill = sex)) +
  geom_histogram(
    aes(y = after_stat(density)),
    position = "identity",
    alpha = 0.5
  ) +
  labs(title = "Consumption of fresh vegetables", x = "Times per week", y = "Density")
# Q3.2.8
# theme_classic() swaps the default theme for one of ggplot2's built-in themes.
ggplot(data, aes(x = Freshveg, fill = sex)) +
  geom_histogram(
    aes(y = after_stat(density)),
    position = "identity",
    alpha = 0.5
  ) +
  labs(title = "Consumption of fresh vegetables", x = "Times per week", y = "Density") +
  theme_classic()
# Q3.2.9
# To remove the annoying NA bars, you can use na.omit(data) to only look at
# complete rows, i.e. omit any row with at least one missing value.
# Note: the mean and sd of the red normal curve are computed from all
# non-missing Freshveg values in `data`, not only from the complete rows.
ggplot(na.omit(data), aes(x = Freshveg, fill = factor(sex))) +
  # after_stat(density) replaces the deprecated ..density.. notation
  geom_histogram(
    aes(y = after_stat(density)),
    position = "identity",
    alpha = 0.5
  ) +
  labs(title = "Consumption of fresh vegetables", x = "Times per week", y = "Density") +
  # kernel density estimate per sex, drawn on top of the histograms
  geom_density(alpha = 0.2) +
  # normal density with the sample mean and sd, for comparison
  stat_function(
    fun = dnorm,
    n = 101,
    args = list(
      # TRUE rather than T: T is an ordinary variable that can be reassigned
      mean = mean(data$Freshveg, na.rm = TRUE),
      sd = sd(data$Freshveg, na.rm = TRUE)
    ),
    colour = "red"
  ) +
  theme_classic(base_size = 20)
# Q3.2.10
# Same plot as in Q3.2.9, stored in p1 and with the legend moved below the plot.
p1 <- ggplot(na.omit(data), aes(x = Freshveg, fill = factor(sex))) +
  # after_stat(density) replaces the deprecated ..density.. notation
  geom_histogram(
    aes(y = after_stat(density)),
    position = "identity",
    alpha = 0.5
  ) +
  labs(title = "Consumption of fresh vegetables", x = "Times per week", y = "Density") +
  geom_density(alpha = 0.2) +
  stat_function(
    fun = dnorm,
    n = 101,
    args = list(
      mean = mean(data$Freshveg, na.rm = TRUE),
      sd = sd(data$Freshveg, na.rm = TRUE)
    ),
    colour = "red"
  ) +
  theme_classic(base_size = 20) +
  theme(legend.position = "bottom")
# Write the plot to a PDF file: open the device, draw, close the device.
pdf("/Users/juusu53/Documents/projects/opetus/R_for_scientists/course_materials/figures/veg_consumption.pdf")
# print() is explicit on purpose: a bare `p1` is only drawn when R auto-prints
# it, which does not happen when the script is run with source().
print(p1)
dev.off()
# Q3.2.11
# Scatter plot of BMI against age, using complete rows only.
data %>%
  na.omit() %>%
  ggplot(mapping = aes(x = age, y = BMI)) +
  geom_point()
# Q3.2.12
# geom_smooth(method = "lm") adds a fitted linear regression line to the points.
data %>%
  na.omit() %>%
  ggplot(mapping = aes(x = age, y = BMI)) +
  geom_point() +
  geom_smooth(method = "lm")
# Q3.2.13
# BMI against age, coloured by education, with one linear fit per education
# level; stored in p2 so it can be combined with p1.
p2 <- ggplot(na.omit(data), aes(x = age, y = BMI, colour = education)) +
  geom_point(size = 2) +
  # se = FALSE leaves out the confidence band; FALSE is spelled out because F
  # is an ordinary variable that can be reassigned
  geom_smooth(method = "lm", se = FALSE) +
  theme_classic() +
  theme(legend.position = "bottom")
# Q3.2.16
# With patchwork loaded, + combines two ggplots into a single figure
# (for two plots they end up next to each other).
p1 + p2
# Q3.2.17
# / stacks the plots on top of each other.
p1 / p2
# Q3.2.18
# For this purpose, maybe the layout with two rows and one column worked better?
# You can also use these operators to build more complex layouts.
# / binds tighter than + in R, so the next line reads as p1 + (p2 / p2):
p1 + p2/p2
# parentheses group plots explicitly: p1 on top, two copies of p2 below it
p1 / (p2 + p2)
# Created by Juulia Suvilehto
# juulia.suvilehto@iki.fi
# This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License