Graphics using ggplot2

Bar plots

library(ggplot2); library(scales); library(tidyr); library(dplyr)

year <- seq(from = 1990, to = 2004, by = 1)
income.male <- c(18, 18.5, 21, 21, 22, 24, 25, 27.5, 31, 33, 34, 36, 42, 36, 36)
income.female <- c(18.2, 18.7, 23, 24, 21, 27, 29, 29.5, 34, 33, 37, 38, 42.5, 36.7, 40)
dat <- data.frame(year = rep(year, times = 2), sex = c(rep("Male", times = 15), rep("Female", times = 15)), income = c(income.male, income.female))

ggplot(data = dat, aes(x = year, y = income, fill = sex)) +
   geom_bar(stat = 'identity', position = position_dodge(), color = "grey") +
   scale_fill_manual(values = c("darkblue", "red")) +
   labs(x = "Year", y = "Income (x $1000)", fill = element_blank()) +
   ylim(0, 50) +
   guides(fill = guide_legend(title = "")) +
   theme(legend.position = c(0.1, 0.9))

\TeX \begin{figure}[h]
\centering
\includegraphics[width=8.01cm]{../images/vertical_barplot_ggplot2.png}
\caption{Vertical bar plot showing average income for males and females, 1990 to 2004.}
\label{fig:vertical_barplot_ggplot2}
\end{figure}

Figure 1: Vertical bar plot showing average income for males and females, 1990 to 2004.

Another example:

odate <- seq(from = as.Date("1/1/06", format = "%d/%m/%y"), to = as.Date("1/12/07", format = "%d/%m/%y"), by = "1 month")
small <- c(4,5,6,3,2,8,9,12,4,5,7,34,76,98,34,65,23,46,3,5,4,76,2,45)
large <- c(4,6,7,45,65,76,87,78,65,36,63,47,87,46,34,56,35,46,78,35,67,89,56,35)
dat <- data.frame(odate = odate, cat = c(rep("Small herds", times = 24), rep("Large herds", times = 24)), n = c(small, large))

ggplot(data = dat, aes(x = odate, y = n, fill = cat)) +
   geom_bar(stat = "identity", position = position_dodge()) +
   labs(x = "Date", y = "Number of samples", fill = "Herd size") +
   ylim(0, 140) +
   scale_x_date(breaks = date_breaks("1 month"), labels = date_format("%b %Y")) +
   theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = c(0.1, 0.9))

Frequency histograms

set.seed(123); dat <- data.frame(val = rnorm(n = 20000, mean = 1, sd = 5))

ggplot(data = dat, aes(x = val)) +
   geom_histogram(color = "black", fill = "white", binwidth = 0.5)

ggplot(data = dat, aes(x = val)) +
   geom_histogram(color = "grey", fill = "dark blue", binwidth = 2)

\TeX \begin{figure}[h]
\centering
\includegraphics[width=8.01cm]{../images/frequency_histogram_ggplot2.png}
\caption{Frequency histograms showing absolute frequencies.}
\label{fig:frequency_histogram_ggplot2}
\end{figure}

Figure 2: Frequency histogram showing absolute frequencies.

Epidemic curves

Outbreak data with one row for each case:

nmal <- 5; nfem <- 25
edate <- seq(from = as.Date("2004-07-26"), to = as.Date("2004-08-13"), by = 1)
prob <- c(1:10, 9:1); prob <- prob / sum(prob)
dmal <- sample(x = edate, size = nmal, replace = TRUE, p = prob)
dfem <- sample(x = edate, size = nfem, replace = TRUE, p = prob)
sex <- c(rep("Male", nmal), rep("Female", nfem))
dat <- data.frame(edate = c(dmal, dfem), sex = sex)

Epidemic curve, both genders:

ggplot(data = dat, aes(x = dat$edate)) +
   geom_histogram(binwidth = 1, colour = "gray", size = 1) +
   theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
   scale_x_date(breaks = date_breaks("1 week"), labels = date_format("%d %b"), name = "Date") +
   scale_y_continuous(breaks = seq(from = 0, to = 10, by = 2), limits = c(0,10), name = "Number of cases") +
   geom_vline(aes(xintercept = as.numeric(as.Date("31/07/2004", format = "%d/%m/%Y"))), linetype = "dashed")

Epidemic curve, faceted by gender:

ggplot(data = dat, aes(x = dat$edate)) +
   geom_histogram(binwidth = 1, colour = "gray", size = 1) +
   theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
   facet_grid( ~ sex) +
   scale_x_date(breaks = date_breaks("1 week"), labels = date_format("%d %b"), name = "Date") +
   scale_y_continuous(breaks = seq(from = 0, to = 10, by = 2), limits = c(0,10), name = "Number of cases") +
   geom_vline(aes(xintercept = as.numeric(as.Date("31/07/2004", format = "%d/%m/%Y"))), linetype = "dashed")

Epidemic curve, different colours for each gender:

ggplot(data = dat, aes(x = dat$edate, group = sex, fill = sex)) +
   geom_histogram(binwidth = 1, colour = "gray", size = 1) +
   scale_fill_manual(values = c("red", "dark blue")) +
   theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
   scale_x_date(breaks = date_breaks("1 week"), labels = date_format("%d %b"), name = "Date") +
   scale_y_continuous(breaks = seq(from = 0, to = 10, by = 2), limits = c(0,10), name = "Number of cases") +
   guides(fill = guide_legend(title = "Gender")) +
   geom_vline(aes(xintercept = as.numeric(as.Date("31/07/2004", format = "%d/%m/%Y"))), linetype = "dashed") +
   theme(legend.position = c(0.15, 0.85))

\TeX \begin{figure}[h]
\centering
\includegraphics[width=8.01cm]{../images/epidemic_curve-01_ggplot2.png}
\caption{Epidemic curve with different colours used to indicate counts of male and female cases.}
\label{fig:epidemic_curve-01_ggplot2}
\end{figure}

Figure 3: Epidemic curve with different colours used to indicate counts of male and female cases.

Outbreak data with one row for every case event date and an integer representing the number of cases identified on that date. This example shows how to colour different parts of the epidemic curve according to different criteria. Assume during the first 6 months of 2003 controls were put in place:

edate <- seq(from = as.Date("1/1/00", format = "%d/%m/%y"), to = as.Date("1/1/05", format = "%d/%m/%y"), by = "1 month")
nsick <- round(runif(n = length(edate), min = 0, max = 100), digits = 0)
dat <- data.frame(edate, nsick)
dat$ctl <- "neg"
dat$ctl[dat$edate >= as.Date("1/1/03", format = "%d/%m/%y") & dat$edate <= as.Date("1/6/03", format = "%d/%m/%y")] <- "pos"

ggplot(data = dat, aes(x = edate, weight = nsick, fill = factor(ctl))) +
   geom_histogram(binwidth = 60, colour = "gray") +
   labs(x = "Date of onset", y = "Number of confirmed cases") +
   scale_fill_manual(values = c("#2f4f4f", "red")) +
   scale_x_date(breaks = date_breaks("6 months"), labels = date_format("%b %Y")) +
   scale_y_continuous(breaks = seq(from = 0, to = 200, by = 25), limits = c(0,200), name = "Number of cases") +
   theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
   guides(fill = FALSE)

Box and whisker plots

score <- c(rnorm(n = 50, mean = 2, sd = 2), rnorm(n = 50, mean = 1.5, sd = 2.5), rnorm(n = 50, mean = 1, sd = 3.5))
grp <- c(rep("A", times = 50), rep("B", times = 50), rep("C", times = 50))
dat <- data.frame(grp, score)

ggplot(data = dat, aes(x = grp, y = score)) +
   geom_boxplot()

Horizontal box and whisker plot:

ggplot(data = dat, aes(x = grp, y = score)) +
   geom_boxplot() +
   xlab("Group") +
   ylab("Score") +
   coord_flip()

\TeX \begin{figure}[h]
\centering
\includegraphics[width=8.01cm]{../images/boxwhisker_ggplot2.png}
\caption{Box and whisker plot. In the above plot the lines within the boxes indicate the median depression score for each group. The lower and upper bound of the boxes represent the 25th and 75th quantiles of the distribution of depression scores, respectively. The horizontal lines extending from the boxes represent the lower and upper bounds of the 95% confidence interval around the distribution of depression scores. The dots beyond the lines attached to each box represent outliers (extreme values).}
\label{fig:boxwhisker_ggplot2}
\end{figure}

Figure 4: Box and whisker plot. In the above plot the lines within the boxes indicate the median depression score for each group. The lower and upper bound of the boxes represent the 25th and 75th quantiles of the distribution of depression scores, respectively. The horizontal lines extending from the boxes represent the lower and upper bounds of the 95% confidence interval around the distribution of depression scores. The dots beyond the lines attached to each box represent outliers (extreme values).

Line plots

Average income for males and females, 1990 to 2004 (Figure 5a):

year <- seq(from = 1990, to = 2004, by = 1)
income.male <- c(18, 18.5, 21, 21, 22, 24, 25, 27.5, 31, 33, 34, 36, 42, 36, 36)
income.female <- c(18.2, 18.7, 23, 24, 21, 27, 29, 29.5, 34, 33, 37, 38, 42.5, 36.7, 40)
dwide <- data.frame(year, Male = income.male, Female = income.female)

Convert data from wide to long format:

dlong <- dwide %>%
   gather(key = gender, value = income, -c(year))

Line plot:

ggplot(data = dlong, aes(x = year, y = income, group = gender, color = gender)) +
   geom_line() +
   scale_x_continuous(breaks = seq(from = 1990, to = 2004, by = 2), limits = c(1990,2004), name = "Year") +
   scale_y_continuous(breaks = seq(from = 0, to = 50, by = 10), limits = c(0,50), name = "Annual income ($)") +
   guides(color = guide_legend(title = "Gender")) +
   annotate("text", x = 1997, y = 50, label = "Start of intervention") +
   geom_vline(xintercept = 1994, linetype = "dashed") +
   theme(legend.position = c(0.85, 0.20))

Crude and adjusted incidence risks, 2010 to 2019 (Figure 5b):

year <- 2010:2019
crude <- c(1000,990,960,950,920,925,915,905,880,870)
adj <- c(1000,975,950,900,840,820,790,780,755,730)
dwide <- data.frame(year, crude = crude, adj = adj)

Convert data from wide to long format:

dlong <- dwide %>%
   gather(key = method, value = irisk, -c(year))

Line plot:

ggplot(data = dlong, aes(x = year, y = irisk, linetype = method, colour = method)) +
   theme_bw() +
   geom_line(size = 0.5) +
   scale_linetype_manual(values = c("dotdash", "solid"), name = "Method", breaks = c("adj", "crude"), labels = c("Adjusted", "Crude")) +
   scale_color_manual(values = c("red", "dark blue"), name = "Method", breaks = c("adj", "crude"), labels = c("Adjusted", "Crude")) +
   scale_x_continuous(breaks = seq(from = 2010, to = 2020, by = 2), limits = c(2010, 2020), name = "Year") +
   scale_y_continuous(breaks = seq(from = 0, to = 1000, by = 200), limits = c(0,1000), name = "Incidence risk (cases per 100,000 at risk)") +
   theme(legend.position = c(0.25, 0.25))

(a) Average income for males and females, 1990 to 2004. (b) Crude and adjusted incidence risks, 2010 to 2019.

Figure 5: Example line plots.

Line plot with shading to indicate confidence intervals:

library(epiR)
set.seed(123); dat <- rpois(n = 50, lambda = 2)
edr.04 <- epi.edr(dat, n = 4, conf.level = 0.95, nsim = 99, na.zero = TRUE)

ggplot(data = edr.04, aes(x = 1:50, y = est)) +
   geom_line(lwd = 0.5, col = "black") +
   # scale_y_log10() +
   geom_hline(yintercept = 1, lty = 2) +
   geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2) +
   labs(x = "Event day", y = "Estimated dissemination ratio")

Pie charts

dat <- data.frame(id = 1:95, class = c(rep("A", times = 25), rep("B", times = 30), rep("C", times = 40)))

gdat <- dat %>%
   group_by(class)
dat <- data.frame(dplyr::summarise(gdat, n = n()))

Stacked bar chart:

ggplot(data = dat, aes(x = "", y = n, fill = class)) +
   geom_bar(width = 1, stat = "identity")

Pie chart:

ggplot(data = dat, aes(x = "", y = n, fill = class)) +
   geom_bar(width = 1, stat = "identity") +
   coord_polar("y", start = 0) +
   guides(fill = guide_legend(title = "Class"))

\TeX \begin{figure}[h]
\centering
\includegraphics[width=8.01cm]{../images/piechart_ggplot2.png}
\caption{Piechart.}
\label{fig:piechart_ggplot2}
\end{figure}

Figure 6: Piechart.

Scatterplots

set.seed(123); score1 <- rnorm(n = 100, mean = 20, sd = 15)
set.seed(456); score2 <- rnorm(n = 100, mean = 10, sd = 5)
dat <- data.frame(score1, score2)

Scatterplot with the default loess smooth:

ggplot(data = dat, aes(x = score1, y = score2)) +
   geom_point() +
   stat_smooth() +
   scale_x_continuous(breaks = seq(from = 0, to = 100, by = 20), limits = c(-20, 60), name = "X axis label") +
   scale_y_continuous(limits = c(-5, 25), name = "Y axis label")

Scatterplot with superimposed regression line (estimated within stat_smooth):

ggplot(data = dat, aes(x = score1, y = score2)) +
   geom_point() +
   stat_smooth(method = "glm", formula = y ~ x) +
   scale_x_continuous(breaks = seq(from = 0, to = 100, by = 20), limits = c(-20, 60), name = "X axis label") +
   scale_y_continuous(limits = c(-5, 25), name = "Y axis label")

Scatterplot with superimposed regression line (estimated externally):

dat.lm <- lm(score2 ~ score1, data = dat)
newdat <- data.frame(x = seq(from = 1, to = 100, by = 1))

err <- predict(dat.lm, newdata = newdat, se = TRUE)
newdat$ucl <- err$fit + 1.96 * err$se.fit
newdat$lcl <- err$fit - 1.96 * err$se.fit

ggplot(data = dat, aes(x = score1, y = score2)) +
   geom_point() +
   stat_smooth(data = newdat, aes(ymin = lcl, ymax = ucl), stat = "identity") +
   scale_x_continuous(breaks = seq(from = 0, to = 100, by = 20), limits = c(-20, 60), name = "X axis label") +
   scale_y_continuous(limits = c(-5, 25), name = "Y axis label")

\TeX \begin{figure}[h]
\centering
\includegraphics[width=8.01cm]{../images/scatterplot_ggplot2.png}
\caption{Scatterplot showing Score 2 as a function of Score 1. Superimposed on this plot is a loess smoothed curve with shading to show its 95\% confidence interval.}
\label{fig:scatterplot_ggplot2}
\end{figure}

Figure 7: Scatterplot showing Score 2 as a function of Score 1. Superimposed on this plot is a loess smoothed curve with shading to show its 95% confidence interval.

Error bar plots

xpos <- c(c(1:4) - 0.10, c(1:4) + 0.10)
xlab <- rep(c("Jan-94", "Jan-95", "Jan-96", "Jan-97"), times = 2)
Region <- c(rep("North", times = 4), rep("South", times = 4))
est <- c(21,9,23,34,25,12,27,36)
lower <- c(2,1,1,3,3,4,4,5)
upper <- c(65,33,67,99,67,36,71,99)
dat <- data.frame(xpos, xlab, Region, est, lower, upper)

ggplot(data = dat, aes(x = xpos, y = est, colour = Region)) +
   geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.1) +
   geom_point() +
   scale_x_continuous(breaks = c(1,2,3,4), labels = dat$xlab[1:4], name = "Date") +
   scale_y_continuous(breaks = seq(from = 0, to = 100, by = 25), labels = seq(from = 0, to = 100, by = 25), name = "Prevalence") +
   theme(legend.justification = c(0,1), legend.position = c(0.10,0.95))

\TeX \begin{figure}[h]
\centering
\includegraphics[width=8.01cm]{../images/error_bar_ggplot2.png}
\caption{Error bar plot showing the prevalence of disease for northern and southern districts as a function of calendar year.}
\label{fig:error_bar_ggplot2}
\end{figure}

Figure 8: Error bar plot showing the prevalence of disease for northern and southern districts as a function of calendar year.

Time series plots

edate <- seq(from = as.Date("2004-01-01"), to = as.Date("2004-12-31"), by = 1)
obs <- rnorm(n = length(edate), mean = 0, sd = 1)
dat <- data.frame(edate, obs)

ggplot(data = dat, aes(x = edate, y = obs)) +
   geom_line() +
   scale_x_date(breaks = date_breaks("3 months"), labels = date_format("%b %Y"), name = "Date") +
   scale_y_continuous(breaks = seq(from = -6, to = 6, by = 2), labels = seq(from = -6, to = 6, by = 2), limits = c(-6,6), name = "Observed") +
   geom_vline(xintercept = as.Date("2004-06-30"), linetype = 2, color = "red") +
   annotate("text", x = as.Date("2004-08-31"), y = 5, label = "Off medication")

\TeX \begin{figure}[h]
\centering
\includegraphics[width=8.01cm]{../images/time_series_ggplot2.png}
\caption{Time series plot showing behaviour score as a function of observation date, 1 January 2004 to 31 December 2004.}
\label{fig:time_series_ggplot2}
\end{figure}

Figure 9: Time series plot showing behaviour score as a function of observation date, 1 January 2004 to 31 December 2004.