Solutions
You can visualise the exercise solutions in HTML format here.
<!DOCTYPE html>
Day 1: Data Visualisation Fundamentals
Imports and starter code
library(dplyr)
library(gridExtra)
library(ggplot2)
library(readxl)
library(tidyr)
df <- read.csv("../../data/Malaria_Dataset_DataViz.csv")
Exercise 1: Data wrangling
Time to have a first glance at the dataset!
Print the number of variables and observations in the dataset
Print the first 10 rows of the dataset
Write a function called
describe
that:Selects the numeric columns that do not end with “id” in
df
Creates a table for the remaining columns with the following summary statistics:
- min, Q1, median, mean, Q3, max
- the number of NaNs
- the data type (e.g.,
numeric
)
Solution: There are probably dozens of possibilities for this task! We show here three solutions
describe1 <- function(data) {
# data: A data frame
# return: Data frame with summary statistics for 'data'
# Filter out non-numeric data and "id columns"
# can also use summarise_if
sub_data <- data %>%
select_if(is.numeric) %>%
select(-ends_with("id"))
# Brute-force solution
df_min <- sapply(sub_data, min)
df_mean <- sapply(sub_data, mean)
df_max <- sapply(sub_data, max)
df_quarts <- sapply(sub_data, quantile, probs = c(0.25, 0.5, 0.75)) %>% t()
types <- sapply(sub_data, class)
num.na <- sapply(sub_data, function(x) sum(is.na(x)))
nunique <- unlist(sub_data %>% summarise_all(n_distinct))
my_summary <- data.frame(df_min, df_mean, df_quarts, df_max, types, num.na, nunique)
my_summary
}
describe2 <- function(data) {
# data: A data frame
# return: Data frame with summary statistics for 'data'
# Filter out non-numeric data and "id columns"
# can also use summarise_if
sub_data <- data %>%
select_if(is.numeric) %>%
select(-ends_with("id"))
# Other solution using "reframe"
get_summary <- function(x) {
list(
min = min(x, na.rm = TRUE),
q1 = quantile(x, 0.25, na.rm = TRUE),
q2 = quantile(x, 0.5, na.rm = TRUE),
mean = mean(x, na.rm = TRUE),
q3 = quantile(x, 0.75, na.rm = TRUE),
max = max(x, na.rm = TRUE)
)
}
my_summary <- as.data.frame(
sub_data |>
reframe(across(where(is.numeric), get_summary)) |>
t()
)
rownames(my_summary) <- c("min", "Q1", "Q2", "Q3", "max")
my_summary
}
describe3 <- function(data) {
sub_data <- data %>%
select_if(is.numeric) %>%
select(-ends_with("id"))
# Other solution using "summary"
my_summary <- as.data.frame(
do.call(
rbind, lapply(
sub_data, summary
)
)
)
my_summary$nunique <- c(sub_data %>% summarise_all(n_distinct))
my_summary$type <- lapply(sub_data, class)
my_summary$num.na <- lapply(sub_data, function(x) sum(is.na(x)))
my_summary
}
describe3(df)
## Min. 1st Qu.
## latitude -2.910580e+01 -6.500000e+00
## longitude -1.749848e+01 2.844251e+00
## year_start 1.995000e+03 2.008000e+03
## year_end 1.995000e+03 2.008000e+03
## year 1.995000e+03 2.008000e+03
## lower_age 0.000000e+00 3.000000e-01
## upper_age 1.000000e+00 4.000000e+00
## Npositive 0.000000e+00 0.000000e+00
## Nexamined 1.000000e+00 1.200000e+01
## PfPr 6.600000e-05 6.600000e-05
## itnuse0 0.000000e+00 1.108407e-01
## itnuse1 0.000000e+00 7.532611e-02
## itnuse2 0.000000e+00 5.547262e-02
## itnuse3 0.000000e+00 4.057860e-02
## itnavg4 0.000000e+00 7.940135e-02
## act 1.064223e-02 3.144580e-01
## irs 0.000000e+00 0.000000e+00
## tasseled_cap_wetness -6.610296e-01 -2.874196e-01
## mosquito_temperature_suitability_index 0.000000e+00 4.366600e+04
## precipitation 0.000000e+00 5.896000e+01
## enhanced_vegetation_index 1.427318e-03 2.129368e-01
## elevation -9.492000e+01 1.196800e+02
## nighttime_lights 3.000000e+00 4.160000e+00
## accessibility_tocity 7.500000e-01 7.152000e+01
## Median Mean
## latitude 1.349250e+00 6.481154e-01
## longitude 2.990042e+01 2.133483e+01
## year_start 2.011000e+03 2.010924e+03
## year_end 2.011000e+03 2.010934e+03
## year 2.011000e+03 2.010927e+03
## lower_age 7.000000e-01 1.958753e+00
## upper_age 5.000000e+00 2.196693e+01
## Npositive 1.000000e+00 1.039274e+01
## Nexamined 2.400000e+01 5.652764e+01
## PfPr 5.536191e-02 1.938571e-01
## itnuse0 3.168846e-01 3.311271e-01
## itnuse1 2.505204e-01 3.022578e-01
## itnuse2 2.073924e-01 2.672684e-01
## itnuse3 1.491138e-01 2.314580e-01
## itnavg4 2.547464e-01 2.830278e-01
## act 4.557029e-01 4.160956e-01
## irs 0.000000e+00 5.037315e-02
## tasseled_cap_wetness -2.274185e-01 -2.476784e-01
## mosquito_temperature_suitability_index 5.950500e+04 5.859058e+04
## precipitation 8.313043e+01 8.625286e+01
## enhanced_vegetation_index 3.002333e-01 2.966891e-01
## elevation 4.069200e+02 6.523600e+02
## nighttime_lights 4.880000e+00 1.021080e+01
## accessibility_tocity 1.513600e+02 2.027965e+02
## 3rd Qu. Max. nunique
## latitude 1.003458e+01 2.542279e+01 43380
## longitude 3.501800e+01 5.119900e+01 43357
## year_start 2.015000e+03 2.020000e+03 26
## year_end 2.015000e+03 2.020000e+03 26
## year 2.015000e+03 2.020000e+03 26
## lower_age 1.700000e+00 8.300000e+01 152
## upper_age 1.900000e+01 8.500000e+01 340
## Npositive 8.000000e+00 4.122000e+03 291
## Nexamined 5.700000e+01 1.025400e+04 641
## PfPr 3.247297e-01 9.979295e-01 13865
## itnuse0 5.343093e-01 9.819850e-01 38796
## itnuse1 5.079055e-01 9.327665e-01 38324
## itnuse2 4.448487e-01 9.298448e-01 37470
## itnuse3 4.038511e-01 9.442481e-01 36898
## itnavg4 4.588770e-01 9.103441e-01 38825
## act 5.303484e-01 6.562011e-01 40203
## irs 1.511388e-02 1.000000e+00 14039
## tasseled_cap_wetness -1.908172e-01 2.314969e-02 24307
## mosquito_temperature_suitability_index 7.714300e+04 9.404000e+04 20372
## precipitation 1.070400e+02 5.340014e+02 5254
## enhanced_vegetation_index 3.830003e-01 6.065061e-01 24307
## elevation 1.167120e+03 3.534960e+03 17888
## nighttime_lights 6.880000e+00 6.300000e+01 1328
## accessibility_tocity 2.754800e+02 2.539760e+03 11751
## type num.na
## latitude numeric 0
## longitude numeric 0
## year_start integer 0
## year_end integer 0
## year integer 0
## lower_age numeric 0
## upper_age numeric 0
## Npositive integer 0
## Nexamined integer 0
## PfPr numeric 0
## itnuse0 numeric 0
## itnuse1 numeric 0
## itnuse2 numeric 0
## itnuse3 numeric 0
## itnavg4 numeric 0
## act numeric 0
## irs numeric 0
## tasseled_cap_wetness numeric 0
## mosquito_temperature_suitability_index integer 0
## precipitation numeric 0
## enhanced_vegetation_index numeric 0
## elevation numeric 0
## nighttime_lights numeric 0
## accessibility_tocity numeric 0
Exercise 2: univariate plots
Something should feel off regarding the column
mosquito_temperature_suitability_index
.
Let’s plot a histogram to check the distribution. Use both
hist
and geom_histogram
.
Fix the data from mosquito_temperature_suitability_index
in a new column called mtsi
such that the summary looks
like this:
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 43.67 59.51 58.59 77.14 94.04
Solution: the temperature was accidentally multiplied by 1000 (normal range should be 0-100 according to the metadata), so simply divide by 1000 to obtain the mtsi column.
# base R histogram
hist(df$mosquito_temperature_suitability_index)
# ggplot2 histogram
ggplot(df, aes(mosquito_temperature_suitability_index)) +
geom_histogram()
# Note: use "plot_grid" from the ```cowplot``` package
# to plot base R and ggplot2 plots on the same window
df$mtsi <- df$mosquito_temperature_suitability_index / 1000
Exercise 3: customising our univariate plot
Let’s customise our last plot (this time using the mtsi
column)! Incorporate the following elements: - Add a vertical, red,
dashed, line for the first three quartiles (Q1, median, Q3) - Better
data-to-ink-ratio: opt for a white background - Add dashed grid lines -
Increase the font size of the x and y ticks to 11
mtsi_quantiles <- quantile(df$mtsi, probs = c(0.25, 0.5, 0.75))
fig <- ggplot(df, aes(mtsi)) +
geom_histogram(fill = colourblind[1]) +
labs(x = "Mosquito temperature stability") +
# choose a minimal theme (no grey background)
theme_minimal() +
theme(
# set grid lines
panel.grid.major = element_line(color = "gray", linetype = "dashed"),
# set font size of x axis
axis.text.x = element_text(size = 11L),
# set font size of y axis
axis.text.y = element_text(size = 11L)
) +
# add a vertical line
geom_vline(
# set xintercept as mtsi_quantiles
xintercept = mtsi_quantiles,
# dashed line
linetype = "dashed",
color = "red",
linewidth = 1
)
Exercise 4: Bivariate plots
Plot a simple scatter plot showing the following:
- x-axis: precipitation
- y-axis: vegetation index
Assign the plot to the variable fig1
Augment fig1
with:
- A linear regression model fit, coloured in dark red
- A title: “R^2 = xxx” where xxx is the value of R^2 from the linear regression model
- Better labels for the x- and y-axes.
Plot the two figures (fig1 and fig2) side by side. Are the results coherent?
Solution: Yes, we fully expect precipitation with vegetation 😄
fig1 <- ggplot(df, aes(y = enhanced_vegetation_index, x = precipitation)) +
geom_point()
# Linear regression
mod <- lm(enhanced_vegetation_index ~ precipitation, df)
fig2 <- fig1 +
# classic theme
theme_classic() +
# change x-axis, y-axis and title format
theme(
axis.text.x = element_text(size = 11L),
axis.text.y = element_text(size = 11L),
plot.title = element_text(hjust = 0.5) # centre the title
) +
# Add the regression line
geom_smooth(method = lm, color = "darkred") +
# Change x-axis, y-axis and title labels
labs(
x = "Precipitation [mm]",
y = "Vegetation index [A.U.]",
title = paste("R^2:", format(summary(mod)$r.squared, digits = 2L))
)
# Subplots with 2 columns
grid.arrange(fig1, fig2, ncol = 2L)
Exercise 5: Focus on a single country
Find the country with the most observations in this dataset.
For this country, plot 4 subplots (in a 2 x 2 grid) showing the following:
- The number of surveys done over the years, grouped by surved method
- The average Plasmodium falciparum parasite rate (PfPr), plotted against the year (mean +/- standard deviation)
- The distribution of PfPr depending on the area type (rural vs. urban)
- Combine elevation, precipitation and city accessibility together
# Using dplyr in full force here, but there are tons of other ways to do it
most_observed_country <- df %>%
count(country, sort = TRUE) %>%
head(1) %>%
pull(country)
mocd <- df %>% filter(country == most_observed_country)
# Get mean and SD of PfPr grouped by year
mocd_pfpr_stats <- mocd %>%
group_by(year) %>%
summarise(PfPr_mean = mean(PfPr), PfPr_sd = sd(PfPr))
fig1 <- ggplot(mocd, aes(x = year, fill = method)) +
geom_bar() +
theme_minimal() +
scale_fill_manual(values = colourblind) +
labs(
x = "Year",
y = "# Surveys"
) +
theme(legend.position = "top")
fig2 <- ggplot(data = mocd_pfpr_stats) +
# Scatter plot
geom_point(aes(x = year, y = PfPr_mean)) +
# Line between each points
geom_line(aes(x = year, y = PfPr_mean)) +
# Error bar [y_mean - y_sd, y_mean + y_sd]
geom_errorbar(
aes(
x = year,
ymin = PfPr_mean - PfPr_sd,
ymax = PfPr_mean + PfPr_sd
),
width = 0.2
) +
labs(
x = "Year",
y = "PfPr (%)"
) +
theme_minimal()
fig3 <- ggplot(
# Filter out empty values
mocd %>% filter(rural_urban != ""),
aes(x = PfPr, color = rural_urban)
) +
# Use after_stat to scale the data
geom_density(aes(y = after_stat(scaled))) +
theme_minimal() +
labs(
y = "Density",
x = "PfPr (%)",
)
fig4 <- ggplot(mocd, aes(x = elevation, y = precipitation, colour = mtsi)) +
geom_point() +
theme_minimal() +
scale_colour_distiller(palette = "Purples", direction = 1) +
labs(
y = "Precipitation [mm]",
x = "Elevation [m]",
)
grid.arrange(fig1, fig2, fig3, fig4, ncol = 2L)
Exercise 6: Heatmaps
Compare the average usage of different types of insecticide-treated bed nets for Nigeria and its neighbours (Benin, Chad, Niger, Cameroon).
Plot the resulting table in a heatmap, with the value of each heatmap cell printed inside.
ex6 <- df |>
filter(country %in% c("Nigeria", "Benin", "Chad", "Niger", "Cameroon")) |>
group_by(country) |>
summarise(across(starts_with("itn"), mean)) |>
gather(itn_variable, avg, -country)
ggplot(ex6, aes(y = country, x = itn_variable)) +
# Make a heatmap in ggplot2
geom_tile(aes(fill = avg)) +
# Add text inside the heatmap
geom_text(aes(label = round(avg, 2))) +
# Make the heatmap squared
coord_fixed() +
# Change the gradient (white for low usage, red for high usage)
scale_fill_gradient(low = "white", high = "red") +
# Remove the x- and y-labels (for aesthetic purposes)
xlab("") +
ylab("") +
# Change the country order such that it's alphabetically ordered
ylim(rev(unique(ex6$country)))
<!DOCTYPE html>
Day 2: Maps
Imports and starter code
library(dplyr)
library(ggplot2)
# scale bars and north arrows
library(ggspatial)
# maps
library(rnaturalearth)
# map data
library(rnaturalearthdata)
# high-resolution maps
library(rnaturalearthhires)
#' simple features' package
library(sf)
# interactive thematic maps
library(tmap)
# Our malaria dataset
df <- read.csv("../../data/Malaria_Dataset_DataViz.csv")
# Where all the map data is located
map_folder <- "../../data/maps"
# Rivers
rivers <- ne_load(
scale = 10,
type = "rivers_lake_centerlines",
destdir = map_folder,
returnclass = "sf"
)
## Reading layer `ne_10m_rivers_lake_centerlines' from data source
## `C:\KU\Teaching\DataVizCourse\2024\data\maps\ne_10m_rivers_lake_centerlines.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 1473 features and 38 fields
## Geometry type: MULTILINESTRING
## Dimension: XY
## Bounding box: xmin: -164.9035 ymin: -52.15775 xmax: 177.5204 ymax: 75.79348
## Geodetic CRS: WGS 84
# Lakes
lakes <- ne_load(
scale = 10,
type = "lakes",
destdir = map_folder,
returnclass = "sf"
)
## Reading layer `ne_10m_lakes' from data source
## `C:\KU\Teaching\DataVizCourse\2024\data\maps\ne_10m_lakes.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 1355 features and 41 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -165.9656 ymin: -50.66967 xmax: 177.1544 ymax: 81.95521
## Geodetic CRS: WGS 84
# Oceans
oceans <- ne_load(
scale = 10,
type = "ocean",
destdir = map_folder,
returnclass = "sf"
)
## Reading layer `ne_10m_ocean' from data source
## `C:\KU\Teaching\DataVizCourse\2024\data\maps\ne_10m_ocean.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 1 feature and 3 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -180 ymin: -85.22194 xmax: 180 ymax: 90
## Geodetic CRS: WGS 84
Exercise 1: plot the world
Use the rnaturalearth
and ggplot2
libraries
to get the country outlines on a world map
# Use rnaturalearth to get all countries ("ne_countries")
world_ne <- ne_countries(
scale = "medium", # map scale
returnclass = "sf" # return a Simple Features dataframe
)
# We use geom_sf from ggplot to draw simple boundaries and outlines
ggplot(data = world_ne) +
geom_sf() # geom to visualise sf objects
Exercise 2: Kenya plot a Kenya map?
Plot a high-definition map of Kenya supplemented with information on:
- Scale
- Orientation (North/South)
- Coordinates (longitude/latitude)
Hint: use ggspatial
# Use rnaturalearth to get the country polygon for Kenya
kenya <- ne_countries(
country = "kenya", # country name
type = "countries", # country type
scale = "large", # requires rnaturalhires
returnclass = "sf"
)
# Basic Kenya map
kenya_map <- ggplot(data = kenya) +
geom_sf()
kenya_map_detailed <- kenya_map +
# Add labels
labs(
x = "Longitude",
y = "Latitude",
title = "Kenya"
) +
# Add the scale on the bottom left (ggspatial)
annotation_scale(location = "bl", width_hint = 0.4) +
# Add a north arrow on the bottom left (ggspatial)
annotation_north_arrow(
location = "bl",
which_north = "true",
pad_x = unit(0.0, "in"),
pad_y = unit(0.2, "in"),
style = north_arrow_fancy_orienteering
)
kenya_map_detailed
Exercise 3: Kenya add more details?
Augment your Kenya plot by adding:
- a point for 3 major cities: Nairobi, the capital, as well as Mombasa and Nakuru.
- lakes, rivers, and potential surrounding oceans
You might have to do a bit of googling to get the coordinates 😀
Hint: use sf
and the data in the
starter code!
# Kenya data for major cities
kenya_city_data <- data.frame(
city_name = c("Nairobi", "Mombasa", "Nakuru"),
lat = c(-1.2921, -4.0435, -0.3031),
lon = c(36.8219, 39.6682, 36.0800)
)
# Bounding box coordinates for Kenya
kenya_coords <- st_bbox(kenya)
xmin <- kenya_coords[1]
xmax <- kenya_coords[3]
ymin <- kenya_coords[2]
ymax <- kenya_coords[4]
kenya_map_very_detailed <- kenya_map_detailed +
# Add rivers
geom_sf(data = rivers, colour = "blue", linewidth = 0.2) +
# Add lakes
geom_sf(data = lakes, fill = "lightblue") +
# Add oceans
geom_sf(data = oceans, fill = "lightblue") +
# Restict coordinates for rivers/lakes/oceans
coord_sf(xlim = c(xmin, xmax), ylim = c(ymin, ymax)) +
# Add points for cities (scatter plot)
geom_point(
data = kenya_city_data,
mapping = aes(x = lon, y = lat),
colour = "black"
) +
# Add annotation
geom_text(
data = kenya_city_data,
mapping = aes(x = lon, y = lat, label = city_name),
nudge_y = 0.25,
color = "black"
)
kenya_map_very_detailed
Exercise 4: Kenya add survey data?
Last touch to our map! add the coordinates for surveys in our dataset
(df
) performed in Kenya
kenya_df <- df %>% filter(country == "Kenya")
kenya_map_very_detailed +
# Scatter plot with x = longitude, y = latitude
geom_point(
data = kenya_df,
mapping = aes(x = longitude, y = latitude),
colour = "red",
alpha = 0.2,
size = 1
)
Exercise 5: thematic maps with tmap
Warning! You might have to delete your current environment before doing this exercise and/or do it on a separate R script.
Let’s have a look at interactive thematic maps with
tmap
!
We provide you with some starter code below. World
is a
data.frame
with a special column that contains a geometry
for each row, in this case polygons.
# Interactive mode
tmap_mode("view")
# Load World Data
data("World")
# Show the first rows
head(World)
## Simple feature collection with 6 features and 15 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -73.41544 ymin: -55.25 xmax: 75.15803 ymax: 42.68825
## Geodetic CRS: WGS 84
## iso_a3 name sovereignt continent
## 1 AFG Afghanistan Afghanistan Asia
## 2 AGO Angola Angola Africa
## 3 ALB Albania Albania Europe
## 4 ARE United Arab Emirates United Arab Emirates Asia
## 5 ARG Argentina Argentina South America
## 6 ARM Armenia Armenia Asia
## area pop_est pop_est_dens economy
## 1 652860.00 [km^2] 28400000 43.50090 7. Least developed region
## 2 1246700.00 [km^2] 12799293 10.26654 7. Least developed region
## 3 27400.00 [km^2] 3639453 132.82675 6. Developing region
## 4 71252.17 [km^2] 4798491 67.34519 6. Developing region
## 5 2736690.00 [km^2] 40913584 14.95003 5. Emerging region: G20
## 6 28470.00 [km^2] 2967004 104.21510 6. Developing region
## income_grp gdp_cap_est life_exp well_being footprint inequality
## 1 5. Low income 784.1549 59.668 3.8 0.79 0.4265574
## 2 3. Upper middle income 8617.6635 NA NA NA NA
## 3 4. Lower middle income 5992.6588 77.347 5.5 2.21 0.1651337
## 4 2. High income: nonOECD 38407.9078 NA NA NA NA
## 5 3. Upper middle income 14027.1261 75.927 6.5 3.14 0.1642383
## 6 4. Lower middle income 6326.2469 74.446 4.3 2.23 0.2166481
## HPI geometry
## 1 20.22535 MULTIPOLYGON (((61.21082 35...
## 2 NA MULTIPOLYGON (((16.32653 -5...
## 3 36.76687 MULTIPOLYGON (((20.59025 41...
## 4 NA MULTIPOLYGON (((51.57952 24...
## 5 35.19024 MULTIPOLYGON (((-65.5 -55.2...
## 6 25.66642 MULTIPOLYGON (((43.58275 41...
Try to plot a few economic indicators for African countries. Can we talk about an “economic burden” for malaria in African countries? Is this enough evidence?
# Select Africa
africa <- World %>% filter(continent == "Africa")
# Rough estimation of average PfPr (parasite rate) in each country
df %>%
group_by(country) %>%
summarize(PfPr_mean = mean(PfPr)) %>%
arrange(desc(PfPr_mean))
## # A tibble: 44 × 2
## country PfPr_mean
## <chr> <dbl>
## 1 Central African Republic 0.998
## 2 Burkina Faso 0.600
## 3 Equatorial Guinea 0.598
## 4 Comoros 0.496
## 5 Guinea 0.478
## 6 Congo 0.429
## 7 Sierra Leone 0.424
## 8 Mali 0.410
## 9 Togo 0.410
## 10 Ghana 0.395
## # ℹ 34 more rows
# You can plot several columns at once with tm_facets
tm_shape(africa) +
# Plot life expectancy and GDP per capita
tm_polygons(
col = c("life_exp", "gdp_cap_est", "economy"),
palette = list("Blues", "Blues", "Reds"),
id = "name"
) +
tm_facets(sync = TRUE, ncol = 3)
<!DOCTYPE html>
Day 3: dimensionality reduction
library(corrplot)
library(ggbiplot)
library(gridExtra)
library(dplyr)
library(ggplot2)
library(uwot)
# Our malaria dataset
df <- read.csv("../../data/Malaria_Dataset_DataViz.csv")
df$mtsi <- df$mosquito_temperature_suitability_index / 1000
cols_of_interest <- c(
"itnuse0", "itnuse1", "itnuse2", "itnuse3",
"act", "irs", "tasseled_cap_wetness", "mtsi",
"precipitation", "enhanced_vegetation_index", "elevation", "nighttime_lights",
"accessibility_tocity"
)
countries <- c("Sudan", "Zambia", "Nigeria")
set.seed(42)
Exercise 1: correlation matrix
We have selected for you 13 columns of interest, and 3 different African countries from an economic, geographic, and demographic perspective: Sudan, Zambia, and Nigeria.
Make a correlation matrix using the features of interest then plot it.
Correlation matrix: cor
from the stats
package
Correlation matrix plot: corrplot
from the
corrplot
package
sub_df <- df %>% filter(df$country %in% countries)
sub_df_cont <- sub_df %>%
select(any_of(cols_of_interest))
# Correlation matrix
corr_matrix <- cor(sub_df_cont)
# Correlation matrix plot
corrplot(
corr_matrix,
# Only plot the lower triangular
type = "lower",
# Visualisation method: show ellipses
# eccentricity scaled to the correlation value
method = "ellipse",
# Sort by angular order of the eigenvectors
order = "AOE",
# Rotate the text labels
tl.srt = 45
)
Exercise 2: PCA
- Run a PCA on the subset of interest. Do not forget to scale and center the data!
- Run a scree plot. How many components are worth retaining?
PCA: prcomp
from the stats
package
Scree
plot: screeplot
from the stats
package
sub_df_cont_pca <- prcomp(sub_df_cont, scale = TRUE, center = TRUE)
# Screeplot
screeplot(sub_df_cont_pca, type = "lines")
Exercise 3: more PCA
Run a biplot to represent the information contained in the first two components.
- Observe the arrows: what do the principle components capture?
- Colour the biplot points by country: do the surveys cluster well by country? do the clusters make sense?
- Colour the biplot points by PfPr
Biplot: biplot
from the stats
package or
ggbiplot
from the ggbiplot
package
fig1 <- ggbiplot(
sub_df_cont_pca,
# arrows: change colour and increase size
varname.color = "red", varname.size = 4,
# points: reduce size and increase transparency
point.size = 1,
alpha = 0.5,
# colour by country
groups = sub_df$country,
# draw an ellipse (without fill colour)
ellipse = TRUE,
ellipse.fill = FALSE
) +
# change country colour
scale_color_manual(name = "country", values = colourblind[seq_along(countries)]) +
# minimal theme
theme_minimal()
fig2 <- ggbiplot(
sub_df_cont_pca,
# arrows: change colour and increase size
varname.color = "red", varname.size = 4,
# points: reduce size and increase transparency
point.size = 1,
alpha = 0.5,
# colour by PfPr
groups = sub_df$PfPr,
) +
# change PfPr colour
scale_color_distiller(name = "PfPr", palette = "Blues", direction = 1) +
# minimal theme
theme_minimal()
grid.arrange(fig1, fig2, ncol = 2)
Exercise 4: UMAP
Let’s now turn to UMAP.
Perform 4 UMAP plots with 4 different values for
n_neighbors
: 2, 20, 200, 2000 (it should take ~ 5 min).
What do you notice? How do these visualisations compare to your PCA plot?
UMAP: umap
(uwot) with fast_sgd = TRUE
neigh_plots <- list()
neigh_numbs <- c(2, 20, 200, 2000)
for (i in seq_along(neigh_numbs)) {
print(paste("Testing n_neighbors = ", neigh_numbs[[i]]), sep = "")
sub_df_cont_umap <- uwot::umap(
sub_df_cont,
n_neighbors = neigh_numbs[[i]],
fast_sgd = TRUE
)
neigh_plots[[i]] <- ggplot(
as.data.frame(sub_df_cont_umap),
aes(y = V2, x = V1, colour = sub_df$country)
) +
geom_point() +
scale_color_manual(name = "country", values = colourblind[seq_along(countries)])
}
## [1] "Testing n_neighbors = 2"
## [1] "Testing n_neighbors = 20"
## [1] "Testing n_neighbors = 200"
## [1] "Testing n_neighbors = 2000"
do.call(grid.arrange, neigh_plots)
<!DOCTYPE html>
Day 3: Networks
Introduction
In this set of exercises, we will go through creating a network in
igraph
visualising it using ggplot2
, setting
node attributes and plotting some features of the network.
First, we read in the network data - the edge list is in the form of
a .txt file, so we use read.table
instead of
read.csv
.
This network comes from a school that was used for modelling the spread of COVID-19. The original publication as well as the data can be found here: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3009790
After reading the data, we drop the repeated edges from the edge list
using distinct in dplyr
. Then make sure the ordering of the
nodes from the graph vertices matches the edge list using match.
Finally, assign the node attributes using the $
on
V(school_graph)
.
# Set working directory here if needed!
library(dagitty)
library(dplyr)
library(ggdag)
library(ggraph)
library(ggplot2)
library(igraph)
library(tidygraph)
# Read in network data
nodes <- read.csv("../../data/school_nodelist.csv")
edges <- read.table("../../data/school_edgelist.txt")
# Drop repeated edges
edges <- edges[, 1:2] %>% distinct(V1, V2)
# Fix node ordering
school_graph <- graph_from_edgelist(as.matrix(edges) + 1)
reorder <- match(nodes$ID, V(school_graph))
nodes <- arrange(nodes, reorder)
# Assign the node attributes using the $ on V(school_graph)
V(school_graph)$role <- nodes$Role
Exercise 1: summary statistics
Now look for some summary statistics about the data - how many individuals are there? How many roles, and which are the different roles? How many individuals of each role are there?
# How many different kinds of roles are there in the network
unique(V(school_graph)$role)
## [1] "teacher" "student" "staff" "other"
# How many people are there in each role?
nrow(nodes %>% filter(Role == "staff"))
## [1] 55
Exercise 2: edge case
Currently, it’s a bit hard to visualise the full graph, because there are two many nodes.
One thing we could try to do is sample the nodes using the sample
function applied to V(graph)
- try doing this with 50
nodes.
Then, use subgraph
in igraph
to generate a
subgraph of the full network with these nodes.
If we have set the node attributes previously for our graph, these attributes will carry over to the subgraph!
Plot the resulting subgraph. Colour the nodes according to the role that each of them plays.
use as.factor
to convert the roles to factors, and then
assign the colors as node attributes using V(graph)$color
# Randomly subsample the nodes from the full graph - set the seed to get the
# same nodes each time!
set.seed(123)
sample_size <- 40
subgraph_nodes <- sample(V(school_graph), sample_size)
# Create subgraph using these nodes
school_subgraph <- as.undirected(subgraph(school_graph, subgraph_nodes))
# Customise layout
layout <- layout_nicely(school_subgraph)
plot(school_subgraph,
vertex.label = NA, vertex.size = 5, edge.size = 2,
layout = layout
)
# Get the corresponding subgraph node attributes, i.e.
# subgraph_node_attrs <- nodes %>% filter(ID %in% subgraph_nodes) %>% select(ID, Role)
# Set the colours of the roles
colors <- c("dodgerblue", "firebrick2", "darkorchid2", "gold2")
V(school_subgraph)$color <- colors[as.factor(V(school_subgraph)$role)]
labels <- c("other", "staff", "student", "teacher")
# Get degrees for plotting
degrees <- degree(school_subgraph)
# Plot!
plot(school_subgraph, layout = layout, vertex.label = NA, vertex.size = sqrt(degrees))
# Set legend to show how colours correspond to roles.
legend("bottomright", legend = levels(as.factor(V(school_subgraph)$role)), fill = colors)
title(paste("School Network Subsample, n=", as.character(sample_size)))
</div>
Exercise 3: network statistics
When we sample the vertices, we are excluding full data about the network.
As we discussed in lectures, one way to visualise information about the whole graph is to show the degree distribution.
Plot the degree distribution for the whole graph. Do this once for every node, and then repeat with the distributions for each separate role. Do the degree distributions look significantly different for the different roles?
nodes$degree <- degree(school_graph)
degree_dist <- ggplot(nodes, aes(x = degree)) +
geom_histogram(fill = "dodgerblue", color = "black") +
labs(
x = "Node Degree",
y = "Number of people"
) +
ggtitle("Degree distribution for School Network")
degree_dist
degree_density <- ggplot(nodes, aes(x = degree)) +
geom_density(fill = "dodgerblue", color = "black", alpha = 0.5) +
labs(
x = "Node Degree",
y = "Density"
) +
ggtitle("Degree distribution for School Network")
degree_density
Now show the distributions separated by roles. Compare the histogram with the density plot - which do you prefer?
degree_dist_role <- ggplot(nodes, aes(x = degree, fill = Role)) +
geom_histogram(color = "black") +
labs(
x = "Node Degree",
y = "Number of people"
) +
ggtitle("Degree distribution for School Network") +
xlim(0, 500)
show(degree_dist_role)
degree_density_role <- ggplot(nodes, aes(x = degree, fill = Role)) +
geom_density(color = "black", alpha = 0.5) +
labs(
x = "Node Degree",
y = "Density"
) +
ggtitle("Degree distribution for School Network") +
xlim(0, 500)
show(degree_density_role)
Exercise 4: Remember the subgraph?
Let’s also check whether the subgraph is a good representation of the full graph in terms of degree distribution. Plot the degree distributions for the two graphs in a single plot.
Do they look similar?
subgraph_node_degrees <- nodes %>%
filter(ID %in% subgraph_nodes) %>%
select(ID, degree)
color_manual <- c("Full graph" = "dodgerblue", "Subgraph" = "firebrick2")
subgraph_degree_comparison <- ggplot(nodes, aes(x = degree, fill = "Full graph")) +
geom_density(alpha = 0.5, color = "black") +
geom_density(
data = subgraph_node_degrees, aes(x = degree, fill = "Subgraph"),
color = "black", alpha = 0.5
) +
labs(
x = "Node Degree",
y = "Density"
) +
ggtitle("Degree distribution for School Network") +
xlim(0, 500) +
scale_fill_manual(values = color_manual)
subgraph_degree_comparison
Exercise 5 (extension): node to self: there are more network libraries
Extension: Have a go at converting the school subgraph into a
tidygraph
object. Use the function as_tbl_graph to convert
an igraph object to a tidygraph
, and then use ggraph to
plot.
Experiment with different ways of displaying the nodes and edges, as
well as the layout and colors of the vertices. Do you prefer this way of
visualising networks compared with igraph
?
subgraph_nodes_df <- nodes %>% filter(ID %in% subgraph_nodes)
# Define tidygraph object
new_graph <- as_tbl_graph(school_subgraph,
node_labels = subgraph_nodes_df$ID,
role = subgraph_nodes_df$Role,
degrees = subgraph_nodes_df$degree
)
# Play around here with different geom_edge and geom_node styles
g_plot <- ggraph(new_graph) +
geom_edge_bend(show.legend = FALSE) +
geom_node_point(aes(color = role), show.legend = TRUE, size = 5) +
theme_graph(background = "white")
show(g_plot)
Exercise 6: DAG-nificent
Let’s look at creating a DAG in dagitty
: Create a simple
DAG with 5 variables, called A, B, C, D and E, and make up some
associations between the variables.
If you prefer, you could make a DAG from your real research!
If you want to, you can also try playing around with the features in
ggdag
- this requires you to make a dagify()
object instead. See this tutorial for some ideas of how to start https://cloud.r-project.org/web/packages/ggdag/vignettes/intro-to-ggdag.html
and the kinds of functions available.
# Create a DAG using dagitty
dag_example <- dagitty("dag{
A
B
C
D
E
A -> B
A -> C
B -> D
B -> E
C -> D
D -> E
}")
# Check that it is acyclic!! Then plot:
print(isAcyclic(dag_example))
## [1] TRUE
plot(dag_example)
# Create the same DAG using dagify
ggdag_example <- dagify(E ~ B + D,
D ~ B + C,
C ~ A,
B ~ A,
exposure = "A",
outcome = "E"
) %>% tidy_dagitty()
# Create some nice ggdag plots
ggdag(ggdag_example) +
theme_dag()
ggdag_parents(ggdag_example, "E") +
theme_dag()
ggdag_descendants(ggdag_example, "C") +
theme_dag()
Day 4: Interactive visualisation
Imports and starter code
library(ggplot2)
library(plotly)
library(shiny)
df <- read.csv("../../data/Malaria_Dataset_DataViz.csv")
df$mtsi <- df$mosquito_temperature_suitability_index / 1000
Exercise 1: ggplot to plotly
Take your plot from Day 1/Exercise 4 (fig2
) and make it
interactive with plotly
!
fig1 <- ggplot(df, aes(y = enhanced_vegetation_index, x = precipitation)) +
geom_point()
# Linear regression
mod <- lm(enhanced_vegetation_index ~ precipitation, df)
fig2 <- fig1 +
# classic theme
theme_classic() +
# change x-axis, y-axis and title format
theme(
axis.text.x = element_text(size = 11L),
axis.text.y = element_text(size = 11L),
plot.title = element_text(hjust = 0.5) # centre the title
) +
# Add the regression line
geom_smooth(method = lm, color = "darkred") +
# Change x-axis, y-axis and title labels
labs(
x = "Precipitation [mm]",
y = "Vegetation index [A.U.]",
title = paste("R^2:", format(summary(mod)$r.squared, digits = 2L))
)
plotly_fig <- ggplotly(fig2)
plotly_fig
Exercise 2: build a Shiny app
It is recommended to use RStudio and R (not RMarkdown) for this exercise.
Let’s start building an interactive app to compare countries using the 4-panel plot from Day 1/Exercise 5.
The app should incorporate the following elements: * Input: a country from the Malaria dataset * Output: an interactive 4-panel plot similar to Day 1/Exercise 5
This exercise will be divided into 3 parts: UI side (part 1), processing the input on the server side (part 2), and finally rendering an interactive a plt (part 3)
Rest assured, you’ll find some starter code in a separate script
called day4_app.R
!
Part 1: UI side
Our UI should consist of a fluidPage
with:
- A title
- A sidebar layout with two panels:
- A sidebar panel where a user can select a country from
unique_countries
(e.g., like a drop-down menu) - A main panel containing a
plotlyOutput
- A sidebar panel where a user can select a country from
Title: titlePanel
Side bar layout:
sidebarLayout
Side bar panel:
sidebarPanel
Drop-down menu: selectInput
main panel: mainPanel
unique_countries <- sort(unique(df$country))
ui <- fluidPage(
# Give the page a title
titlePanel("Exploring malaria survey data from African countries"),
# Generate a row with a sidebar
sidebarLayout(
# Define the sidebar with one input
sidebarPanel(
# Selection input
selectInput("country", "Country:",
choices = unique_countries
),
hr(),
helpText("Select a country using the drop-down menu.")
),
mainPanel(
plotlyOutput("plot")
)
)
)
Part 2: Server side - processing inputs
Our server should first incorporate the input country by filtering
df
such that only the rows from that country are
retained.
We are also interested in getting the mean PfPR and its SD (in an
object called sub_df_pfpr_stats
), grouped by year (like in
day1).
To check your code, replace plotly_empty
by:
first_year <- # CODE HERE
first_year_mean_pfpr <- # CODE HERE
validate(
need(
FALSE,
paste0(
country(),
". First sample obtained in ",
first_year,
". Mean PfPr: ",
first_year_mean_pfpr
)
)
)
When running the app, you should get the following for Angola:
Angola. First sample obtained in 2005. Mean PfPr: 0.669089607
Remember to make everything that is susceptible to change (such as the
country and country data) as reactive
!
server <- function(input, output) {
country <- reactive({
input$country
})
sub_df <- reactive({
df |> filter(country == country())
})
# Get mean and SD of PfPr grouped by year
sub_df_pfpr_stats <- reactive({
sub_df() %>%
group_by(year) %>%
summarise(PfPr_mean = mean(PfPr), PfPr_sd = sd(PfPr))
})
first_year <- sub_df_pfpr_stats()[1, 1]
first_year_mean_pfpr <- sub_df_pfpr_stats()[1, 2]
output$plot <- renderPlotly({
validate(
need(
FALSE,
paste0(
country(),
". First sample obtained in ",
first_year,
". Mean PfPr: ",
first_year_mean_pfpr
)
)
)
})
}
Part 3: Server side - visualization
Our UI is ready, our data processing is ready, now onto the plots!
Replace the validate snippet by a subplot
from the
plotly
library, incorporating the 4 panels (over 2 rows)
from Day 1/Exercise 5.
Remember that the data is reactive
Use ggplotly to
make the plots interactive
Use titleX = TRUE
and
titleY = TRUE
for
server <- function(input, output) {
country <- reactive({
input$country
})
sub_df <- reactive({
df |> filter(country == country())
})
# Get mean and SD of PfPr grouped by year
sub_df_pfpr_stats <- reactive({
sub_df() %>%
group_by(year) %>%
summarise(PfPr_mean = mean(PfPr), PfPr_sd = sd(PfPr))
})
output$plot <- renderPlotly({
fig1 <- ggplot(sub_df(), aes(x = year, fill = method)) +
geom_bar() +
theme_minimal() +
scale_fill_manual(values = colourblind) +
labs(
x = "Year",
y = "# Surveys"
)
fig2 <- ggplot(data = sub_df_pfpr_stats()) +
geom_point(aes(x = year, y = PfPr_mean)) +
geom_line(aes(x = year, y = PfPr_mean)) +
geom_errorbar(
aes(x = year, ymin = PfPr_mean - PfPr_sd, ymax = PfPr_mean + PfPr_sd),
width = 0.2
) +
labs(
x = "Year",
y = "PfPr (%)"
) +
theme_minimal()
fig3 <- ggplot(sub_df() %>% filter(rural_urban != ""), aes(x = PfPr, color = rural_urban)) +
geom_density(aes(y = after_stat(scaled))) +
theme_minimal() +
labs(
y = "Density",
x = "PfPr (%)",
)
fig4 <- ggplot(sub_df(), aes(x = elevation, y = precipitation, colour = mtsi)) +
geom_point() +
theme_minimal() +
scale_colour_distiller(palette = "Purples", direction = 1) +
labs(
y = "Precipitation [mm]",
x = "Elevation [m]",
)
m <- list(
l = 50L,
r = 50L,
b = 100L,
t = 100L,
pad = 4L
)
size <- 800L
plotly_fig <- subplot(
ggplotly(fig1, width = size, height = size),
ggplotly(fig2, width = size, height = size),
ggplotly(fig3, width = size, height = size),
ggplotly(fig4, width = size, height = size),
nrows = 2,
titleX = TRUE,
titleY = TRUE,
margin = 0.1
)
plotly_fig |> layout(margin = m)
})
}