Solutions

You can visualise the exercise solutions in HTML format here.

<!DOCTYPE html>

Day 1: Data Visualisation Fundamentals

Imports and starter code

library(dplyr)
library(gridExtra)
library(ggplot2)
library(readxl)
library(tidyr)

df <- read.csv("../../data/Malaria_Dataset_DataViz.csv")

Exercise 1: Data wrangling

Time to have a first glance at the dataset!

  1. Print the number of variables and observations in the dataset

  2. Print the first 10 rows of the dataset

  3. Write a function called describe that:

  4. Selects the numeric columns that do not end with “id” in df

  5. Creates a table for the remaining columns with the following summary statistics:

  • min, Q1, median, mean, Q3, max
  • the number of NaNs
  • the data type (e.g., numeric)

Solution: There are probably dozens of possibilities for this task! We show here three solutions

describe1 <- function(data) {
  # data: A data frame
  # return: Data frame with summary statistics for 'data'

  # Filter out non-numeric data and "id columns"
  # can also use summarise_if
  sub_data <- data %>%
    select_if(is.numeric) %>%
    select(-ends_with("id"))

  # Brute-force solution
  df_min <- sapply(sub_data, min)
  df_mean <- sapply(sub_data, mean)
  df_max <- sapply(sub_data, max)
  df_quarts <- sapply(sub_data, quantile, probs = c(0.25, 0.5, 0.75)) %>% t()
  types <- sapply(sub_data, class)
  num.na <- sapply(sub_data, function(x) sum(is.na(x)))
  nunique <- unlist(sub_data %>% summarise_all(n_distinct))

  my_summary <- data.frame(df_min, df_mean, df_quarts, df_max, types, num.na, nunique)

  my_summary
}

describe2 <- function(data) {
  # data: A data frame
  # return: Data frame with summary statistics for 'data'

  # Filter out non-numeric data and "id columns"
  # can also use summarise_if
  sub_data <- data %>%
    select_if(is.numeric) %>%
    select(-ends_with("id"))

  # Other solution using "reframe"
  get_summary <- function(x) {
    list(
      min = min(x, na.rm = TRUE),
      q1 = quantile(x, 0.25, na.rm = TRUE),
      q2 = quantile(x, 0.5, na.rm = TRUE),
      mean = mean(x, na.rm = TRUE),
      q3 = quantile(x, 0.75, na.rm = TRUE),
      max = max(x, na.rm = TRUE)
    )
  }

  my_summary <- as.data.frame(
    sub_data |>
      reframe(across(where(is.numeric), get_summary)) |>
      t()
  )

  rownames(my_summary) <- c("min", "Q1", "Q2", "Q3", "max")

  my_summary
}

describe3 <- function(data) {
  sub_data <- data %>%
    select_if(is.numeric) %>%
    select(-ends_with("id"))

  # Other solution using "summary"
  my_summary <- as.data.frame(
    do.call(
      rbind, lapply(
        sub_data, summary
      )
    )
  )
  my_summary$nunique <- c(sub_data %>% summarise_all(n_distinct))
  my_summary$type <- lapply(sub_data, class)
  my_summary$num.na <- lapply(sub_data, function(x) sum(is.na(x)))

  my_summary
}

describe3(df)
##                                                 Min.       1st Qu.
## latitude                               -2.910580e+01 -6.500000e+00
## longitude                              -1.749848e+01  2.844251e+00
## year_start                              1.995000e+03  2.008000e+03
## year_end                                1.995000e+03  2.008000e+03
## year                                    1.995000e+03  2.008000e+03
## lower_age                               0.000000e+00  3.000000e-01
## upper_age                               1.000000e+00  4.000000e+00
## Npositive                               0.000000e+00  0.000000e+00
## Nexamined                               1.000000e+00  1.200000e+01
## PfPr                                    6.600000e-05  6.600000e-05
## itnuse0                                 0.000000e+00  1.108407e-01
## itnuse1                                 0.000000e+00  7.532611e-02
## itnuse2                                 0.000000e+00  5.547262e-02
## itnuse3                                 0.000000e+00  4.057860e-02
## itnavg4                                 0.000000e+00  7.940135e-02
## act                                     1.064223e-02  3.144580e-01
## irs                                     0.000000e+00  0.000000e+00
## tasseled_cap_wetness                   -6.610296e-01 -2.874196e-01
## mosquito_temperature_suitability_index  0.000000e+00  4.366600e+04
## precipitation                           0.000000e+00  5.896000e+01
## enhanced_vegetation_index               1.427318e-03  2.129368e-01
## elevation                              -9.492000e+01  1.196800e+02
## nighttime_lights                        3.000000e+00  4.160000e+00
## accessibility_tocity                    7.500000e-01  7.152000e+01
##                                               Median          Mean
## latitude                                1.349250e+00  6.481154e-01
## longitude                               2.990042e+01  2.133483e+01
## year_start                              2.011000e+03  2.010924e+03
## year_end                                2.011000e+03  2.010934e+03
## year                                    2.011000e+03  2.010927e+03
## lower_age                               7.000000e-01  1.958753e+00
## upper_age                               5.000000e+00  2.196693e+01
## Npositive                               1.000000e+00  1.039274e+01
## Nexamined                               2.400000e+01  5.652764e+01
## PfPr                                    5.536191e-02  1.938571e-01
## itnuse0                                 3.168846e-01  3.311271e-01
## itnuse1                                 2.505204e-01  3.022578e-01
## itnuse2                                 2.073924e-01  2.672684e-01
## itnuse3                                 1.491138e-01  2.314580e-01
## itnavg4                                 2.547464e-01  2.830278e-01
## act                                     4.557029e-01  4.160956e-01
## irs                                     0.000000e+00  5.037315e-02
## tasseled_cap_wetness                   -2.274185e-01 -2.476784e-01
## mosquito_temperature_suitability_index  5.950500e+04  5.859058e+04
## precipitation                           8.313043e+01  8.625286e+01
## enhanced_vegetation_index               3.002333e-01  2.966891e-01
## elevation                               4.069200e+02  6.523600e+02
## nighttime_lights                        4.880000e+00  1.021080e+01
## accessibility_tocity                    1.513600e+02  2.027965e+02
##                                              3rd Qu.         Max. nunique
## latitude                                1.003458e+01 2.542279e+01   43380
## longitude                               3.501800e+01 5.119900e+01   43357
## year_start                              2.015000e+03 2.020000e+03      26
## year_end                                2.015000e+03 2.020000e+03      26
## year                                    2.015000e+03 2.020000e+03      26
## lower_age                               1.700000e+00 8.300000e+01     152
## upper_age                               1.900000e+01 8.500000e+01     340
## Npositive                               8.000000e+00 4.122000e+03     291
## Nexamined                               5.700000e+01 1.025400e+04     641
## PfPr                                    3.247297e-01 9.979295e-01   13865
## itnuse0                                 5.343093e-01 9.819850e-01   38796
## itnuse1                                 5.079055e-01 9.327665e-01   38324
## itnuse2                                 4.448487e-01 9.298448e-01   37470
## itnuse3                                 4.038511e-01 9.442481e-01   36898
## itnavg4                                 4.588770e-01 9.103441e-01   38825
## act                                     5.303484e-01 6.562011e-01   40203
## irs                                     1.511388e-02 1.000000e+00   14039
## tasseled_cap_wetness                   -1.908172e-01 2.314969e-02   24307
## mosquito_temperature_suitability_index  7.714300e+04 9.404000e+04   20372
## precipitation                           1.070400e+02 5.340014e+02    5254
## enhanced_vegetation_index               3.830003e-01 6.065061e-01   24307
## elevation                               1.167120e+03 3.534960e+03   17888
## nighttime_lights                        6.880000e+00 6.300000e+01    1328
## accessibility_tocity                    2.754800e+02 2.539760e+03   11751
##                                           type num.na
## latitude                               numeric      0
## longitude                              numeric      0
## year_start                             integer      0
## year_end                               integer      0
## year                                   integer      0
## lower_age                              numeric      0
## upper_age                              numeric      0
## Npositive                              integer      0
## Nexamined                              integer      0
## PfPr                                   numeric      0
## itnuse0                                numeric      0
## itnuse1                                numeric      0
## itnuse2                                numeric      0
## itnuse3                                numeric      0
## itnavg4                                numeric      0
## act                                    numeric      0
## irs                                    numeric      0
## tasseled_cap_wetness                   numeric      0
## mosquito_temperature_suitability_index integer      0
## precipitation                          numeric      0
## enhanced_vegetation_index              numeric      0
## elevation                              numeric      0
## nighttime_lights                       numeric      0
## accessibility_tocity                   numeric      0

Exercise 2: univariate plots

Something should feel off regarding the column mosquito_temperature_suitability_index.

Let’s plot a histogram to check the distribution. Use both hist and geom_histogram.

Fix the data from mosquito_temperature_suitability_index in a new column called mtsi such that the summary looks like this:

  Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  0.00   43.67   59.51   58.59   77.14   94.04

Solution: the temperature was accidentally multiplied by 1000 (normal range should be 0-100 according to the metadata), so simply divide by 1000 to obtain the mtsi column.

# base R histogram
hist(df$mosquito_temperature_suitability_index)

# ggplot2 histogram
ggplot(df, aes(mosquito_temperature_suitability_index)) +
  geom_histogram()

# Note: use "plot_grid" from the ```cowplot``` package
#       to plot base R and ggplot2 plots on the same window

df$mtsi <- df$mosquito_temperature_suitability_index / 1000

Exercise 3: customising our univariate plot

Let’s customise our last plot (this time using the mtsi column)! Incorporate the following elements: - Add a vertical, red, dashed, line for the first three quartiles (Q1, median, Q3) - Better data-to-ink-ratio: opt for a white background - Add dashed grid lines - Increase the font size of the x and y ticks to 11

mtsi_quantiles <- quantile(df$mtsi, probs = c(0.25, 0.5, 0.75))

fig <- ggplot(df, aes(mtsi)) +
  geom_histogram(fill = colourblind[1]) +
  labs(x = "Mosquito temperature stability") +
  # choose a minimal theme (no grey background)
  theme_minimal() +
  theme(
    # set grid lines
    panel.grid.major = element_line(color = "gray", linetype = "dashed"),
    # set font size of x axis
    axis.text.x = element_text(size = 11L),
    # set font size of y axis
    axis.text.y = element_text(size = 11L)
  ) +
  # add a vertical line
  geom_vline(
    # set xintercept as mtsi_quantiles
    xintercept = mtsi_quantiles,
    # dashed line
    linetype = "dashed",
    color = "red",
    linewidth = 1
  )

Exercise 4: Bivariate plots

Plot a simple scatter plot showing the following:

  • x-axis: precipitation
  • y-axis: vegetation index

Assign the plot to the variable fig1

Augment fig1 with:

  • A linear regression model fit, coloured in dark red
  • A title: “R^2 = xxx” where xxx is the value of R^2 from the linear regression model
  • Better labels for the x- and y-axes.

Plot the two figures (fig1 and fig2) side by side. Are the results coherent?

Solution: Yes, we fully expect precipitation with vegetation 😄

fig1 <- ggplot(df, aes(y = enhanced_vegetation_index, x = precipitation)) +
  geom_point()

# Linear regression
mod <- lm(enhanced_vegetation_index ~ precipitation, df)

fig2 <- fig1 +
  # classic theme
  theme_classic() +
  # change x-axis, y-axis and title format
  theme(
    axis.text.x = element_text(size = 11L),
    axis.text.y = element_text(size = 11L),
    plot.title = element_text(hjust = 0.5) # centre the title
  ) +
  # Add the regression line
  geom_smooth(method = lm, color = "darkred") +
  # Change x-axis, y-axis and title labels
  labs(
    x = "Precipitation [mm]",
    y = "Vegetation index [A.U.]",
    title = paste("R^2:", format(summary(mod)$r.squared, digits = 2L))
  )

# Subplots with 2 columns
grid.arrange(fig1, fig2, ncol = 2L)

Exercise 5: Focus on a single country

Find the country with the most observations in this dataset.

For this country, plot 4 subplots (in a 2 x 2 grid) showing the following:

  1. The number of surveys done over the years, grouped by surved method
  2. The average Plasmodium falciparum parasite rate (PfPr), plotted against the year (mean +/- standard deviation)
  3. The distribution of PfPr depending on the area type (rural vs. urban)
  4. Combine elevation, precipitation and city accessibility together
# Using dplyr in full force here, but there are tons of other ways to do it
most_observed_country <- df %>%
  count(country, sort = TRUE) %>%
  head(1) %>%
  pull(country)

mocd <- df %>% filter(country == most_observed_country)

# Get mean and SD of PfPr grouped by year
mocd_pfpr_stats <- mocd %>%
  group_by(year) %>%
  summarise(PfPr_mean = mean(PfPr), PfPr_sd = sd(PfPr))

fig1 <- ggplot(mocd, aes(x = year, fill = method)) +
  geom_bar() +
  theme_minimal() +
  scale_fill_manual(values = colourblind) +
  labs(
    x = "Year",
    y = "# Surveys"
  ) +
  theme(legend.position = "top")

fig2 <- ggplot(data = mocd_pfpr_stats) +
  # Scatter plot
  geom_point(aes(x = year, y = PfPr_mean)) +
  # Line between each points
  geom_line(aes(x = year, y = PfPr_mean)) +
  # Error bar [y_mean - y_sd, y_mean + y_sd]
  geom_errorbar(
    aes(
      x = year,
      ymin = PfPr_mean - PfPr_sd,
      ymax = PfPr_mean + PfPr_sd
    ),
    width = 0.2
  ) +
  labs(
    x = "Year",
    y = "PfPr (%)"
  ) +
  theme_minimal()

fig3 <- ggplot(
  # Filter out empty values
  mocd %>% filter(rural_urban != ""),
  aes(x = PfPr, color = rural_urban)
) +
  # Use after_stat to scale the data
  geom_density(aes(y = after_stat(scaled))) +
  theme_minimal() +
  labs(
    y = "Density",
    x = "PfPr (%)",
  )

fig4 <- ggplot(mocd, aes(x = elevation, y = precipitation, colour = mtsi)) +
  geom_point() +
  theme_minimal() +
  scale_colour_distiller(palette = "Purples", direction = 1) +
  labs(
    y = "Precipitation [mm]",
    x = "Elevation [m]",
  )

grid.arrange(fig1, fig2, fig3, fig4, ncol = 2L)

Exercise 6: Heatmaps

Compare the average usage of different types of insecticide-treated bed nets for Nigeria and its neighbours (Benin, Chad, Niger, Cameroon).

Plot the resulting table in a heatmap, with the value of each heatmap cell printed inside.

ex6 <- df |>
  filter(country %in% c("Nigeria", "Benin", "Chad", "Niger", "Cameroon")) |>
  group_by(country) |>
  summarise(across(starts_with("itn"), mean)) |>
  gather(itn_variable, avg, -country)

ggplot(ex6, aes(y = country, x = itn_variable)) +
  # Make a heatmap in ggplot2
  geom_tile(aes(fill = avg)) +
  # Add text inside the heatmap
  geom_text(aes(label = round(avg, 2))) +
  # Make the heatmap squared
  coord_fixed() +
  # Change the gradient (white for low usage, red for high usage)
  scale_fill_gradient(low = "white", high = "red") +
  # Remove the x- and y-labels (for aesthetic purposes)
  xlab("") +
  ylab("") +
  # Change the country order such that it's alphabetically ordered
  ylim(rev(unique(ex6$country)))

<!DOCTYPE html>

Day 2: Maps

Imports and starter code

library(dplyr)
library(ggplot2)
# scale bars and north arrows
library(ggspatial)
# maps
library(rnaturalearth)
# map data
library(rnaturalearthdata)
# high-resolution maps
library(rnaturalearthhires)
#' simple features' package
library(sf)
# interactive thematic maps
library(tmap)

# Our malaria dataset
df <- read.csv("../../data/Malaria_Dataset_DataViz.csv")

# Where all the map data is located
map_folder <- "../../data/maps"

# Rivers
rivers <- ne_load(
  scale = 10,
  type = "rivers_lake_centerlines",
  destdir = map_folder,
  returnclass = "sf"
)
## Reading layer `ne_10m_rivers_lake_centerlines' from data source 
##   `C:\KU\Teaching\DataVizCourse\2024\data\maps\ne_10m_rivers_lake_centerlines.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 1473 features and 38 fields
## Geometry type: MULTILINESTRING
## Dimension:     XY
## Bounding box:  xmin: -164.9035 ymin: -52.15775 xmax: 177.5204 ymax: 75.79348
## Geodetic CRS:  WGS 84
# Lakes
lakes <- ne_load(
  scale = 10,
  type = "lakes",
  destdir = map_folder,
  returnclass = "sf"
)
## Reading layer `ne_10m_lakes' from data source 
##   `C:\KU\Teaching\DataVizCourse\2024\data\maps\ne_10m_lakes.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 1355 features and 41 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -165.9656 ymin: -50.66967 xmax: 177.1544 ymax: 81.95521
## Geodetic CRS:  WGS 84
# Oceans
oceans <- ne_load(
  scale = 10,
  type = "ocean",
  destdir = map_folder,
  returnclass = "sf"
)
## Reading layer `ne_10m_ocean' from data source 
##   `C:\KU\Teaching\DataVizCourse\2024\data\maps\ne_10m_ocean.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 1 feature and 3 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -180 ymin: -85.22194 xmax: 180 ymax: 90
## Geodetic CRS:  WGS 84

Exercise 1: plot the world

Use the rnaturalearth and ggplot2 libraries to get the country outlines on a world map

# Use rnaturalearth to get all countries ("ne_countries")
world_ne <- ne_countries(
  scale = "medium", # map scale
  returnclass = "sf" # return a Simple Features dataframe
)

# We use geom_sf from ggplot to draw simple boundaries and outlines
ggplot(data = world_ne) +
  geom_sf() # geom to visualise sf objects

Exercise 2: Kenya plot a Kenya map?

Plot a high-definition map of Kenya supplemented with information on:

  • Scale
  • Orientation (North/South)
  • Coordinates (longitude/latitude)

Hint: use ggspatial

# Use rnaturalearth to get the country polygon for Kenya
kenya <- ne_countries(
  country = "kenya", # country name
  type = "countries", # country type
  scale = "large", # requires rnaturalhires
  returnclass = "sf"
)

# Basic Kenya map
kenya_map <- ggplot(data = kenya) +
  geom_sf()

kenya_map_detailed <- kenya_map +
  # Add labels
  labs(
    x = "Longitude",
    y = "Latitude",
    title = "Kenya"
  ) +
  # Add the scale on the bottom left (ggspatial)
  annotation_scale(location = "bl", width_hint = 0.4) +
  # Add a north arrow on the bottom left (ggspatial)
  annotation_north_arrow(
    location = "bl",
    which_north = "true",
    pad_x = unit(0.0, "in"),
    pad_y = unit(0.2, "in"),
    style = north_arrow_fancy_orienteering
  )

kenya_map_detailed

Exercise 3: Kenya add more details?

Augment your Kenya plot by adding:

  • a point for 3 major cities: Nairobi, the capital, as well as Mombasa and Nakuru.
  • lakes, rivers, and potential surrounding oceans

You might have to do a bit of googling to get the coordinates 😀

Hint: use sf and the data in the starter code!

# Kenya data for major cities
kenya_city_data <- data.frame(
  city_name = c("Nairobi", "Mombasa", "Nakuru"),
  lat = c(-1.2921, -4.0435, -0.3031),
  lon = c(36.8219, 39.6682, 36.0800)
)

# Bounding box coordinates for Kenya
kenya_coords <- st_bbox(kenya)
xmin <- kenya_coords[1]
xmax <- kenya_coords[3]
ymin <- kenya_coords[2]
ymax <- kenya_coords[4]

kenya_map_very_detailed <- kenya_map_detailed +
  # Add rivers
  geom_sf(data = rivers, colour = "blue", linewidth = 0.2) +
  # Add lakes
  geom_sf(data = lakes, fill = "lightblue") +
  # Add oceans
  geom_sf(data = oceans, fill = "lightblue") +
  # Restict coordinates for rivers/lakes/oceans
  coord_sf(xlim = c(xmin, xmax), ylim = c(ymin, ymax)) +
  # Add points for cities (scatter plot)
  geom_point(
    data = kenya_city_data,
    mapping = aes(x = lon, y = lat),
    colour = "black"
  ) +
  # Add annotation
  geom_text(
    data = kenya_city_data,
    mapping = aes(x = lon, y = lat, label = city_name),
    nudge_y = 0.25,
    color = "black"
  )

kenya_map_very_detailed

Exercise 4: Kenya add survey data?

Last touch to our map! add the coordinates for surveys in our dataset (df) performed in Kenya

kenya_df <- df %>% filter(country == "Kenya")

kenya_map_very_detailed +
  # Scatter plot with x = longitude, y = latitude
  geom_point(
    data = kenya_df,
    mapping = aes(x = longitude, y = latitude),
    colour = "red",
    alpha = 0.2,
    size = 1
  )

Exercise 5: thematic maps with tmap

Warning! You might have to delete your current environment before doing this exercise and/or do it on a separate R script.

Let’s have a look at interactive thematic maps with tmap!

We provide you with some starter code below. World is a data.frame with a special column that contains a geometry for each row, in this case polygons.

# Interactive mode
tmap_mode("view")

# Load World Data
data("World")

# Show the first rows
head(World)
## Simple feature collection with 6 features and 15 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -73.41544 ymin: -55.25 xmax: 75.15803 ymax: 42.68825
## Geodetic CRS:  WGS 84
##   iso_a3                 name           sovereignt     continent
## 1    AFG          Afghanistan          Afghanistan          Asia
## 2    AGO               Angola               Angola        Africa
## 3    ALB              Albania              Albania        Europe
## 4    ARE United Arab Emirates United Arab Emirates          Asia
## 5    ARG            Argentina            Argentina South America
## 6    ARM              Armenia              Armenia          Asia
##                area  pop_est pop_est_dens                   economy
## 1  652860.00 [km^2] 28400000     43.50090 7. Least developed region
## 2 1246700.00 [km^2] 12799293     10.26654 7. Least developed region
## 3   27400.00 [km^2]  3639453    132.82675      6. Developing region
## 4   71252.17 [km^2]  4798491     67.34519      6. Developing region
## 5 2736690.00 [km^2] 40913584     14.95003   5. Emerging region: G20
## 6   28470.00 [km^2]  2967004    104.21510      6. Developing region
##                income_grp gdp_cap_est life_exp well_being footprint inequality
## 1           5. Low income    784.1549   59.668        3.8      0.79  0.4265574
## 2  3. Upper middle income   8617.6635       NA         NA        NA         NA
## 3  4. Lower middle income   5992.6588   77.347        5.5      2.21  0.1651337
## 4 2. High income: nonOECD  38407.9078       NA         NA        NA         NA
## 5  3. Upper middle income  14027.1261   75.927        6.5      3.14  0.1642383
## 6  4. Lower middle income   6326.2469   74.446        4.3      2.23  0.2166481
##        HPI                       geometry
## 1 20.22535 MULTIPOLYGON (((61.21082 35...
## 2       NA MULTIPOLYGON (((16.32653 -5...
## 3 36.76687 MULTIPOLYGON (((20.59025 41...
## 4       NA MULTIPOLYGON (((51.57952 24...
## 5 35.19024 MULTIPOLYGON (((-65.5 -55.2...
## 6 25.66642 MULTIPOLYGON (((43.58275 41...

Try to plot a few economic indicators for African countries. Can we talk about an “economic burden” for malaria in African countries? Is this enough evidence?

# Select Africa
africa <- World %>% filter(continent == "Africa")

# Rough estimation of average PfPr (parasite rate) in each country
df %>%
  group_by(country) %>%
  summarize(PfPr_mean = mean(PfPr)) %>%
  arrange(desc(PfPr_mean))
## # A tibble: 44 × 2
##    country                  PfPr_mean
##    <chr>                        <dbl>
##  1 Central African Republic     0.998
##  2 Burkina Faso                 0.600
##  3 Equatorial Guinea            0.598
##  4 Comoros                      0.496
##  5 Guinea                       0.478
##  6 Congo                        0.429
##  7 Sierra Leone                 0.424
##  8 Mali                         0.410
##  9 Togo                         0.410
## 10 Ghana                        0.395
## # ℹ 34 more rows
# You can plot several columns at once with tm_facets
tm_shape(africa) +
  # Plot life expectancy and GDP per capita
  tm_polygons(
    col = c("life_exp", "gdp_cap_est", "economy"),
    palette = list("Blues", "Blues", "Reds"),
    id = "name"
  ) +
  tm_facets(sync = TRUE, ncol = 3)

<!DOCTYPE html>

Day 3: dimensionality reduction
library(corrplot)
library(ggbiplot)
library(gridExtra)
library(dplyr)
library(ggplot2)
library(uwot)

# Our malaria dataset
df <- read.csv("../../data/Malaria_Dataset_DataViz.csv")

df$mtsi <- df$mosquito_temperature_suitability_index / 1000

cols_of_interest <- c(
  "itnuse0", "itnuse1", "itnuse2", "itnuse3",
  "act", "irs", "tasseled_cap_wetness", "mtsi",
  "precipitation", "enhanced_vegetation_index", "elevation", "nighttime_lights",
  "accessibility_tocity"
)

countries <- c("Sudan", "Zambia", "Nigeria")

set.seed(42)

Exercise 1: correlation matrix

We have selected for you 13 columns of interest, and 3 different African countries from an economic, geographic, and demographic perspective: Sudan, Zambia, and Nigeria.

Make a correlation matrix using the features of interest then plot it.

Correlation matrix: cor from the stats package
Correlation matrix plot: corrplot from the corrplot package

sub_df <- df %>% filter(df$country %in% countries)

sub_df_cont <- sub_df %>%
  select(any_of(cols_of_interest))

# Correlation matrix
corr_matrix <- cor(sub_df_cont)

# Correlation matrix plot
corrplot(
  corr_matrix,
  # Only plot the lower triangular
  type = "lower",
  # Visualisation method: show ellipses
  # eccentricity scaled to the correlation value
  method = "ellipse",
  # Sort by angular order of the eigenvectors
  order = "AOE",
  # Rotate the text labels
  tl.srt = 45
)

Exercise 2: PCA

  1. Run a PCA on the subset of interest. Do not forget to scale and center the data!
  2. Run a scree plot. How many components are worth retaining?

PCA: prcomp from the stats package
Scree plot: screeplot from the stats package

sub_df_cont_pca <- prcomp(sub_df_cont, scale = TRUE, center = TRUE)

# Screeplot
screeplot(sub_df_cont_pca, type = "lines")

Exercise 3: more PCA

Run a biplot to represent the information contained in the first two components.

  1. Observe the arrows: what do the principle components capture?
  2. Colour the biplot points by country: do the surveys cluster well by country? do the clusters make sense?
  3. Colour the biplot points by PfPr

Biplot: biplot from the stats package or ggbiplot from the ggbiplot package

fig1 <- ggbiplot(
  sub_df_cont_pca,
  # arrows: change colour and increase size
  varname.color = "red", varname.size = 4,
  # points: reduce size and increase transparency
  point.size = 1,
  alpha = 0.5,
  # colour by country
  groups = sub_df$country,
  # draw an ellipse (without fill colour)
  ellipse = TRUE,
  ellipse.fill = FALSE
) +
  # change country colour
  scale_color_manual(name = "country", values = colourblind[seq_along(countries)]) +
  # minimal theme
  theme_minimal()

fig2 <- ggbiplot(
  sub_df_cont_pca,
  # arrows: change colour and increase size
  varname.color = "red", varname.size = 4,
  # points: reduce size and increase transparency
  point.size = 1,
  alpha = 0.5,
  # colour by PfPr
  groups = sub_df$PfPr,
) +
  # change PfPr colour
  scale_color_distiller(name = "PfPr", palette = "Blues", direction = 1) +
  # minimal theme
  theme_minimal()

grid.arrange(fig1, fig2, ncol = 2)

Exercise 4: UMAP

Let’s now turn to UMAP.

Perform 4 UMAP plots with 4 different values for n_neighbors: 2, 20, 200, 2000 (it should take ~ 5 min).

What do you notice? How do these visualisations compare to your PCA plot?

UMAP: umap (uwot) with fast_sgd = TRUE

neigh_plots <- list()

neigh_numbs <- c(2, 20, 200, 2000)

for (i in seq_along(neigh_numbs)) {
  print(paste("Testing n_neighbors = ", neigh_numbs[[i]]), sep = "")
  sub_df_cont_umap <- uwot::umap(
    sub_df_cont,
    n_neighbors = neigh_numbs[[i]],
    fast_sgd = TRUE
  )

  neigh_plots[[i]] <- ggplot(
    as.data.frame(sub_df_cont_umap),
    aes(y = V2, x = V1, colour = sub_df$country)
  ) +
    geom_point() +
    scale_color_manual(name = "country", values = colourblind[seq_along(countries)])
}
## [1] "Testing n_neighbors =  2"
## [1] "Testing n_neighbors =  20"
## [1] "Testing n_neighbors =  200"
## [1] "Testing n_neighbors =  2000"
do.call(grid.arrange, neigh_plots)

<!DOCTYPE html>

Day 3: Networks

Introduction

In this set of exercises, we will go through creating a network in igraph visualising it using ggplot2, setting node attributes and plotting some features of the network.

First, we read in the network data - the edge list is in the form of a .txt file, so we use read.table instead of read.csv.

This network comes from a school that was used for modelling the spread of COVID-19. The original publication as well as the data can be found here: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3009790

After reading the data, we drop the repeated edges from the edge list using distinct in dplyr. Then make sure the ordering of the nodes from the graph vertices matches the edge list using match.

Finally, assign the node attributes using the $ on V(school_graph).

# Set working directory here if needed!
library(dagitty)
library(dplyr)
library(ggdag)
library(ggraph)
library(ggplot2)
library(igraph)
library(tidygraph)

# Read in network data
nodes <- read.csv("../../data/school_nodelist.csv")
edges <- read.table("../../data/school_edgelist.txt")

# Drop repeated edges
edges <- edges[, 1:2] %>% distinct(V1, V2)

# Fix node ordering
school_graph <- graph_from_edgelist(as.matrix(edges) + 1)
reorder <- match(nodes$ID, V(school_graph))
nodes <- arrange(nodes, reorder)

# Assign the node attributes using the $ on V(school_graph)
V(school_graph)$role <- nodes$Role

Exercise 1: summary statistics

Now look for some summary statistics about the data - how many individuals are there? How many roles, and which are the different roles? How many individuals of each role are there?

# How many different kinds of roles are there in the network
unique(V(school_graph)$role)
## [1] "teacher" "student" "staff"   "other"
# How many people are there in each role?
nrow(nodes %>% filter(Role == "staff"))
## [1] 55

Exercise 2: edge case

Currently, it’s a bit hard to visualise the full graph, because there are two many nodes.

One thing we could try to do is sample the nodes using the sample function applied to V(graph) - try doing this with 50 nodes.

Then, use subgraph in igraph to generate a subgraph of the full network with these nodes.

If we have set the node attributes previously for our graph, these attributes will carry over to the subgraph!

Plot the resulting subgraph. Colour the nodes according to the role that each of them plays.

use as.factor to convert the roles to factors, and then assign the colors as node attributes using V(graph)$color

# Randomly subsample the nodes from the full graph - set the seed to get the
# same nodes each time!
set.seed(123)
sample_size <- 40
subgraph_nodes <- sample(V(school_graph), sample_size)

# Create subgraph using these nodes
school_subgraph <- as.undirected(subgraph(school_graph, subgraph_nodes))

# Customise layout
layout <- layout_nicely(school_subgraph)

plot(school_subgraph,
  vertex.label = NA, vertex.size = 5, edge.size = 2,
  layout = layout
)

# Get the corresponding subgraph node attributes, i.e.
# subgraph_node_attrs <- nodes %>% filter(ID %in% subgraph_nodes) %>% select(ID, Role)

# Set the colours of the roles
colors <- c("dodgerblue", "firebrick2", "darkorchid2", "gold2")
V(school_subgraph)$color <- colors[as.factor(V(school_subgraph)$role)]


labels <- c("other", "staff", "student", "teacher")

# Get degrees for plotting
degrees <- degree(school_subgraph)
# Plot!
plot(school_subgraph, layout = layout, vertex.label = NA, vertex.size = sqrt(degrees))
# Set legend to show how colours correspond to roles.
legend("bottomright", legend = levels(as.factor(V(school_subgraph)$role)), fill = colors)
title(paste("School Network Subsample, n=", as.character(sample_size)))

</div>

Exercise 3: network statistics

When we sample the vertices, we are excluding full data about the network.

As we discussed in lectures, one way to visualise information about the whole graph is to show the degree distribution.

Plot the degree distribution for the whole graph. Do this once for every node, and then repeat with the distributions for each separate role. Do the degree distributions look significantly different for the different roles?

nodes$degree <- degree(school_graph)

degree_dist <- ggplot(nodes, aes(x = degree)) +
  geom_histogram(fill = "dodgerblue", color = "black") +
  labs(
    x = "Node Degree",
    y = "Number of people"
  ) +
  ggtitle("Degree distribution for School Network")
degree_dist

degree_density <- ggplot(nodes, aes(x = degree)) +
  geom_density(fill = "dodgerblue", color = "black", alpha = 0.5) +
  labs(
    x = "Node Degree",
    y = "Density"
  ) +
  ggtitle("Degree distribution for School Network")
degree_density

Now show the distributions separated by roles. Compare the histogram with the density plot - which do you prefer?

degree_dist_role <- ggplot(nodes, aes(x = degree, fill = Role)) +
  geom_histogram(color = "black") +
  labs(
    x = "Node Degree",
    y = "Number of people"
  ) +
  ggtitle("Degree distribution for School Network") +
  xlim(0, 500)
show(degree_dist_role)

degree_density_role <- ggplot(nodes, aes(x = degree, fill = Role)) +
  geom_density(color = "black", alpha = 0.5) +
  labs(
    x = "Node Degree",
    y = "Density"
  ) +
  ggtitle("Degree distribution for School Network") +
  xlim(0, 500)
show(degree_density_role)

Exercise 4: Remember the subgraph?

Let’s also check whether the subgraph is a good representation of the full graph in terms of degree distribution. Plot the degree distributions for the two graphs in a single plot.

Do they look similar?

subgraph_node_degrees <- nodes %>%
  filter(ID %in% subgraph_nodes) %>%
  select(ID, degree)

color_manual <- c("Full graph" = "dodgerblue", "Subgraph" = "firebrick2")

subgraph_degree_comparison <- ggplot(nodes, aes(x = degree, fill = "Full graph")) +
  geom_density(alpha = 0.5, color = "black") +
  geom_density(
    data = subgraph_node_degrees, aes(x = degree, fill = "Subgraph"),
    color = "black", alpha = 0.5
  ) +
  labs(
    x = "Node Degree",
    y = "Density"
  ) +
  ggtitle("Degree distribution for School Network") +
  xlim(0, 500) +
  scale_fill_manual(values = color_manual)

subgraph_degree_comparison

Exercise 5 (extension): node to self: there are more network libraries

Extension: Have a go at converting the school subgraph into a tidygraph object. Use the function as_tbl_graph to convert an igraph object to a tidygraph, and then use ggraph to plot.

Experiment with different ways of displaying the nodes and edges, as well as the layout and colors of the vertices. Do you prefer this way of visualising networks compared with igraph?

subgraph_nodes_df <- nodes %>% filter(ID %in% subgraph_nodes)

# Define tidygraph object
new_graph <- as_tbl_graph(school_subgraph,
  node_labels = subgraph_nodes_df$ID,
  role = subgraph_nodes_df$Role,
  degrees = subgraph_nodes_df$degree
)

# Play around here with different geom_edge and geom_node styles
g_plot <- ggraph(new_graph) +
  geom_edge_bend(show.legend = FALSE) +
  geom_node_point(aes(color = role), show.legend = TRUE, size = 5) +
  theme_graph(background = "white")
show(g_plot)

Exercise 6: DAG-nificent

Let’s look at creating a DAG in dagitty: Create a simple DAG with 5 variables, called A, B, C, D and E, and make up some associations between the variables.

If you prefer, you could make a DAG from your real research!

If you want to, you can also try playing around with the features in ggdag - this requires you to make a dagify() object instead. See this tutorial for some ideas of how to start https://cloud.r-project.org/web/packages/ggdag/vignettes/intro-to-ggdag.html and the kinds of functions available.

# Create a DAG using dagitty
dag_example <- dagitty("dag{
                        A
                        B
                        C
                        D
                        E
                        A -> B
                        A -> C
                        B -> D
                        B -> E
                        C -> D
                        D -> E

                        }")

# Check that it is acyclic!! Then plot:
print(isAcyclic(dag_example))
## [1] TRUE
plot(dag_example)

# Create the same DAG using dagify

ggdag_example <- dagify(E ~ B + D,
  D ~ B + C,
  C ~ A,
  B ~ A,
  exposure = "A",
  outcome = "E"
) %>% tidy_dagitty()



# Create some nice ggdag plots
ggdag(ggdag_example) +
  theme_dag()

ggdag_parents(ggdag_example, "E") +
  theme_dag()

ggdag_descendants(ggdag_example, "C") +
  theme_dag()

</div> </body> </html> <!DOCTYPE html> Day 4: Interactive visualisation

Imports and starter code

library(ggplot2)
library(plotly)
library(shiny)

df <- read.csv("../../data/Malaria_Dataset_DataViz.csv")

df$mtsi <- df$mosquito_temperature_suitability_index / 1000

Exercise 1: ggplot to plotly

Take your plot from Day 1/Exercise 4 (fig2) and make it interactive with plotly!

fig1 <- ggplot(df, aes(y = enhanced_vegetation_index, x = precipitation)) +
  geom_point()

# Linear regression
mod <- lm(enhanced_vegetation_index ~ precipitation, df)

fig2 <- fig1 +
  # classic theme
  theme_classic() +
  # change x-axis, y-axis and title format
  theme(
    axis.text.x = element_text(size = 11L),
    axis.text.y = element_text(size = 11L),
    plot.title = element_text(hjust = 0.5) # centre the title
  ) +
  # Add the regression line
  geom_smooth(method = lm, color = "darkred") +
  # Change x-axis, y-axis and title labels
  labs(
    x = "Precipitation [mm]",
    y = "Vegetation index [A.U.]",
    title = paste("R^2:", format(summary(mod)$r.squared, digits = 2L))
  )

plotly_fig <- ggplotly(fig2)

plotly_fig

Exercise 2: build a Shiny app

It is recommended to use RStudio and R (not RMarkdown) for this exercise.

Let’s start building an interactive app to compare countries using the 4-panel plot from Day 1/Exercise 5.

The app should incorporate the following elements: * Input: a country from the Malaria dataset * Output: an interactive 4-panel plot similar to Day 1/Exercise 5

This exercise will be divided into 3 parts: UI side (part 1), processing the input on the server side (part 2), and finally rendering an interactive a plt (part 3)

Rest assured, you’ll find some starter code in a separate script called day4_app.R!

Part 1: UI side

Our UI should consist of a fluidPage with:

  • A title
  • A sidebar layout with two panels:
    • A sidebar panel where a user can select a country from unique_countries (e.g., like a drop-down menu)
    • A main panel containing a plotlyOutput

Title: titlePanel
Side bar layout: sidebarLayout
Side bar panel: sidebarPanel
Drop-down menu: selectInput
main panel: mainPanel

unique_countries <- sort(unique(df$country))

ui <- fluidPage(

  # Give the page a title
  titlePanel("Exploring malaria survey data from African countries"),

  # Generate a row with a sidebar
  sidebarLayout(
    # Define the sidebar with one input
    sidebarPanel(
      # Selection input
      selectInput("country", "Country:",
        choices = unique_countries
      ),
      hr(),
      helpText("Select a country using the drop-down menu.")
    ),
    mainPanel(
      plotlyOutput("plot")
    )
  )
)

Part 2: Server side - processing inputs

Our server should first incorporate the input country by filtering df such that only the rows from that country are retained.

We are also interested in getting the mean PfPR and its SD (in an object called sub_df_pfpr_stats), grouped by year (like in day1).

To check your code, replace plotly_empty by:

first_year <- # CODE HERE

  first_year_mean_pfpr <- # CODE HERE

  validate(
    need(
      FALSE,
      paste0(
        country(),
        ". First sample obtained in ",
        first_year,
        ". Mean PfPr: ",
        first_year_mean_pfpr
      )
    )
  )

When running the app, you should get the following for Angola:

Angola. First sample obtained in 2005. Mean PfPr: 0.669089607

Remember to make everything that is susceptible to change (such as the country and country data) as reactive!

server <- function(input, output) {
  country <- reactive({
    input$country
  })

  sub_df <- reactive({
    df |> filter(country == country())
  })

  # Get mean and SD of PfPr grouped by year
  sub_df_pfpr_stats <- reactive({
    sub_df() %>%
      group_by(year) %>%
      summarise(PfPr_mean = mean(PfPr), PfPr_sd = sd(PfPr))
  })

  first_year <- sub_df_pfpr_stats()[1, 1]
  first_year_mean_pfpr <- sub_df_pfpr_stats()[1, 2]

  output$plot <- renderPlotly({
    validate(
      need(
        FALSE,
        paste0(
          country(),
          ". First sample obtained in ",
          first_year,
          ". Mean PfPr: ",
          first_year_mean_pfpr
        )
      )
    )
  })
}

Part 3: Server side - visualization

Our UI is ready, our data processing is ready, now onto the plots!

Replace the validate snippet by a subplot from the plotly library, incorporating the 4 panels (over 2 rows) from Day 1/Exercise 5.

Remember that the data is reactive
Use ggplotly to make the plots interactive
Use titleX = TRUE and titleY = TRUE for

server <- function(input, output) {
  country <- reactive({
    input$country
  })

  sub_df <- reactive({
    df |> filter(country == country())
  })

  # Get mean and SD of PfPr grouped by year
  sub_df_pfpr_stats <- reactive({
    sub_df() %>%
      group_by(year) %>%
      summarise(PfPr_mean = mean(PfPr), PfPr_sd = sd(PfPr))
  })

  output$plot <- renderPlotly({
    fig1 <- ggplot(sub_df(), aes(x = year, fill = method)) +
      geom_bar() +
      theme_minimal() +
      scale_fill_manual(values = colourblind) +
      labs(
        x = "Year",
        y = "# Surveys"
      )

    fig2 <- ggplot(data = sub_df_pfpr_stats()) +
      geom_point(aes(x = year, y = PfPr_mean)) +
      geom_line(aes(x = year, y = PfPr_mean)) +
      geom_errorbar(
        aes(x = year, ymin = PfPr_mean - PfPr_sd, ymax = PfPr_mean + PfPr_sd),
        width = 0.2
      ) +
      labs(
        x = "Year",
        y = "PfPr (%)"
      ) +
      theme_minimal()

    fig3 <- ggplot(sub_df() %>% filter(rural_urban != ""), aes(x = PfPr, color = rural_urban)) +
      geom_density(aes(y = after_stat(scaled))) +
      theme_minimal() +
      labs(
        y = "Density",
        x = "PfPr (%)",
      )

    fig4 <- ggplot(sub_df(), aes(x = elevation, y = precipitation, colour = mtsi)) +
      geom_point() +
      theme_minimal() +
      scale_colour_distiller(palette = "Purples", direction = 1) +
      labs(
        y = "Precipitation [mm]",
        x = "Elevation [m]",
      )

    m <- list(
      l = 50L,
      r = 50L,
      b = 100L,
      t = 100L,
      pad = 4L
    )

    size <- 800L

    plotly_fig <- subplot(
      ggplotly(fig1, width = size, height = size),
      ggplotly(fig2, width = size, height = size),
      ggplotly(fig3, width = size, height = size),
      ggplotly(fig4, width = size, height = size),
      nrows = 2,
      titleX = TRUE,
      titleY = TRUE,
      margin = 0.1
    )

    plotly_fig |> layout(margin = m)
  })
}