library(ggplot2); library(scales); library(tidyr); library(dplyr)
year <- seq(from = 1990, to = 2004, by = 1)
income.male <- c(18, 18.5, 21, 21, 22, 24, 25, 27.5, 31, 33, 34, 36, 42, 36, 36)
income.female <- c(18.2, 18.7, 23, 24, 21, 27, 29, 29.5, 34, 33, 37, 38, 42.5, 36.7, 40)
dat <- data.frame(year = rep(year, times = 2), sex = c(rep("Male", times = 15), rep("Female", times = 15)), income = c(income.male, income.female))
ggplot(data = dat, aes(x = year, y = income, fill = sex)) +
geom_bar(stat = 'identity', position = position_dodge(), color = "grey") +
scale_fill_manual(values = c("darkblue", "red")) +
labs(x = "Year", y = "Income (x $1000)", fill = element_blank()) +
ylim(0, 50) +
guides(fill = guide_legend(title = "")) +
theme(legend.position = c(0.1, 0.9))
![\TeX \begin{figure}[h]
\centering
\includegraphics[width=8.01cm]{../images/vertical_barplot_ggplot2.png}
\caption{Vertical bar plot showing average income for males and females, 1990 to 2004.}
\label{fig:vertical_barplot_ggplot2}
\end{figure}](../images/vertical_barplot_ggplot2.png)
Another example:
odate <- seq(from = as.Date("1/1/06", format = "%d/%m/%y"), to = as.Date("1/12/07", format = "%d/%m/%y"), by = "1 month")
small <- c(4,5,6,3,2,8,9,12,4,5,7,34,76,98,34,65,23,46,3,5,4,76,2,45)
large <- c(4,6,7,45,65,76,87,78,65,36,63,47,87,46,34,56,35,46,78,35,67,89,56,35)
dat <- data.frame(odate = odate, cat = c(rep("Small herds", times = 24), rep("Large herds", times = 24)), n = c(small, large))
ggplot(data = dat, aes(x = odate, y = n, fill = cat)) +
geom_bar(stat = "identity", position = position_dodge()) +
labs(x = "Date", y = "Number of samples", fill = "Herd size") +
ylim(0, 140) +
scale_x_date(breaks = date_breaks("1 month"), labels = date_format("%b %Y")) +
theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = c(0.1, 0.9))
set.seed(123); dat <- data.frame(val = rnorm(n = 20000, mean = 1, sd = 5))
ggplot(data = dat, aes(x = val)) +
geom_histogram(color = "black", fill = "white", binwidth = 0.5)
ggplot(data = dat, aes(x = val)) +
geom_histogram(color = "grey", fill = "dark blue", binwidth = 2)
![\TeX \begin{figure}[h]
\centering
\includegraphics[width=8.01cm]{../images/frequency_histogram_ggplot2.png}
\caption{Frequency histograms showing absolute frequencies.}
\label{fig:frequency_histogram_ggplot2}
\end{figure}](../images/frequency_histogram_ggplot2.png)
Outbreak data with one row for each case:
nmal <- 5; nfem <- 25
edate <- seq(from = as.Date("2004-07-26"), to = as.Date("2004-08-13"), by = 1)
prob <- c(1:10, 9:1); prob <- prob / sum(prob)
dmal <- sample(x = edate, size = nmal, replace = TRUE, p = prob)
dfem <- sample(x = edate, size = nfem, replace = TRUE, p = prob)
sex <- c(rep("Male", nmal), rep("Female", nfem))
dat <- data.frame(edate = c(dmal, dfem), sex = sex)
Epidemic curve, both genders:
ggplot(data = dat, aes(x = dat$edate)) +
geom_histogram(binwidth = 1, colour = "gray", size = 1) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_x_date(breaks = date_breaks("1 week"), labels = date_format("%d %b"), name = "Date") +
scale_y_continuous(breaks = seq(from = 0, to = 10, by = 2), limits = c(0,10), name = "Number of cases") +
geom_vline(aes(xintercept = as.numeric(as.Date("31/07/2004", format = "%d/%m/%Y"))), linetype = "dashed")
Epidemic curve, faceted by gender:
ggplot(data = dat, aes(x = dat$edate)) +
geom_histogram(binwidth = 1, colour = "gray", size = 1) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
facet_grid( ~ sex) +
scale_x_date(breaks = date_breaks("1 week"), labels = date_format("%d %b"), name = "Date") +
scale_y_continuous(breaks = seq(from = 0, to = 10, by = 2), limits = c(0,10), name = "Number of cases") +
geom_vline(aes(xintercept = as.numeric(as.Date("31/07/2004", format = "%d/%m/%Y"))), linetype = "dashed")
Epidemic curve, different colours for each gender:
ggplot(data = dat, aes(x = dat$edate, group = sex, fill = sex)) +
geom_histogram(binwidth = 1, colour = "gray", size = 1) +
scale_fill_manual(values = c("red", "dark blue")) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_x_date(breaks = date_breaks("1 week"), labels = date_format("%d %b"), name = "Date") +
scale_y_continuous(breaks = seq(from = 0, to = 10, by = 2), limits = c(0,10), name = "Number of cases") +
guides(fill = guide_legend(title = "Gender")) +
geom_vline(aes(xintercept = as.numeric(as.Date("31/07/2004", format = "%d/%m/%Y"))), linetype = "dashed") +
theme(legend.position = c(0.15, 0.85))
![\TeX \begin{figure}[h]
\centering
\includegraphics[width=8.01cm]{../images/epidemic_curve-01_ggplot2.png}
\caption{Epidemic curve with different colours used to indicate counts of male and female cases.}
\label{fig:epidemic_curve-01_ggplot2}
\end{figure}](../images/epidemic_curve-01_ggplot2.png)
Outbreak data with one row for every case event date and an integer representing the number of cases identified on that date. This example shows how to colour different parts of the epidemic curve according to different criteria. Assume during the first 6 months of 2003 controls were put in place:
edate <- seq(from = as.Date("1/1/00", format = "%d/%m/%y"), to = as.Date("1/1/05", format = "%d/%m/%y"), by = "1 month")
nsick <- round(runif(n = length(edate), min = 0, max = 100), digits = 0)
dat <- data.frame(edate, nsick)
dat$ctl <- "neg"
dat$ctl[dat$edate >= as.Date("1/1/03", format = "%d/%m/%y") & dat$edate <= as.Date("1/6/03", format = "%d/%m/%y")] <- "pos"
ggplot(data = dat, aes(x = edate, weight = nsick, fill = factor(ctl))) +
geom_histogram(binwidth = 60, colour = "gray") +
labs(x = "Date of onset", y = "Number of confirmed cases") +
scale_fill_manual(values = c("#2f4f4f", "red")) +
scale_x_date(breaks = date_breaks("6 months"), labels = date_format("%b %Y")) +
scale_y_continuous(breaks = seq(from = 0, to = 200, by = 25), limits = c(0,200), name = "Number of cases") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
guides(fill = FALSE)
score <- c(rnorm(n = 50, mean = 2, sd = 2), rnorm(n = 50, mean = 1.5, sd = 2.5), rnorm(n = 50, mean = 1, sd = 3.5))
grp <- c(rep("A", times = 50), rep("B", times = 50), rep("C", times = 50))
dat <- data.frame(grp, score)
ggplot(data = dat, aes(x = grp, y = score)) +
geom_boxplot()
Horizontal box and whisker plot:
ggplot(data = dat, aes(x = grp, y = score)) +
geom_boxplot() +
xlab("Group") +
ylab("Score") +
coord_flip()
![\TeX \begin{figure}[h]
\centering
\includegraphics[width=8.01cm]{../images/boxwhisker_ggplot2.png}
\caption{Box and whisker plot. In the above plot the lines within the boxes indicate the median depression score for each group. The lower and upper bound of the boxes represent the 25th and 75th quantiles of the distribution of depression scores, respectively. The horizontal lines extending from the boxes represent the lower and upper bounds of the 95% confidence interval around the distribution of depression scores. The dots beyond the lines attached to each box represent outliers (extreme values).}
\label{fig:boxwhisker_ggplot2}
\end{figure}](../images/boxwhisker_ggplot2.png)
Average income for males and females, 1990 to 2004 (Figure 5a):
year <- seq(from = 1990, to = 2004, by = 1)
income.male <- c(18, 18.5, 21, 21, 22, 24, 25, 27.5, 31, 33, 34, 36, 42, 36, 36)
income.female <- c(18.2, 18.7, 23, 24, 21, 27, 29, 29.5, 34, 33, 37, 38, 42.5, 36.7, 40)
dwide <- data.frame(year, Male = income.male, Female = income.female)
Convert data from wide to long format:
dlong <- dwide %>%
gather(key = gender, value = income, -c(year))
Line plot:
ggplot(data = dlong, aes(x = year, y = income, group = gender, color = gender)) +
geom_line() +
scale_x_continuous(breaks = seq(from = 1990, to = 2004, by = 2), limits = c(1990,2004), name = "Year") +
scale_y_continuous(breaks = seq(from = 0, to = 50, by = 10), limits = c(0,50), name = "Annual income ($)") +
guides(color = guide_legend(title = "Gender")) +
annotate("text", x = 1997, y = 50, label = "Start of intervention") +
geom_vline(xintercept = 1994, linetype = "dashed") +
theme(legend.position = c(0.85, 0.20))
Crude and adjusted incidence risks, 2010 to 2019 (Figure 5b):
year <- 2010:2019
crude <- c(1000,990,960,950,920,925,915,905,880,870)
adj <- c(1000,975,950,900,840,820,790,780,755,730)
dwide <- data.frame(year, crude = crude, adj = adj)
Convert data from wide to long format:
dlong <- dwide %>%
gather(key = method, value = irisk, -c(year))
Line plot:
ggplot(data = dlong, aes(x = year, y = irisk, linetype = method, colour = method)) +
theme_bw() +
geom_line(size = 0.5) +
scale_linetype_manual(values = c("dotdash", "solid"), name = "Method", breaks = c("adj", "crude"), labels = c("Adjusted", "Crude")) +
scale_color_manual(values = c("red", "dark blue"), name = "Method", breaks = c("adj", "crude"), labels = c("Adjusted", "Crude")) +
scale_x_continuous(breaks = seq(from = 2010, to = 2020, by = 2), limits = c(2010, 2020), name = "Year") +
scale_y_continuous(breaks = seq(from = 0, to = 1000, by = 200), limits = c(0,1000), name = "Incidence risk (cases per 100,000 at risk)") +
theme(legend.position = c(0.25, 0.25))
![]() |
![]() |
| (a) Average income for males and females, 1990 to 2004. | (b) Crude and adjusted incidence risks, 2010 to 2019. |
Line plot with shading to indicate confidence intervals:
library(epiR)
set.seed(123); dat <- rpois(n = 50, lambda = 2)
edr.04 <- epi.edr(dat, n = 4, conf.level = 0.95, nsim = 99, na.zero = TRUE)
ggplot(data = edr.04, aes(x = 1:50, y = est)) +
geom_line(lwd = 0.5, col = "black") +
# scale_y_log10() +
geom_hline(yintercept = 1, lty = 2) +
geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2) +
labs(x = "Event day", y = "Estimated dissemination ratio")
dat <- data.frame(id = 1:95, class = c(rep("A", times = 25), rep("B", times = 30), rep("C", times = 40)))
gdat <- dat %>%
group_by(class)
dat <- data.frame(dplyr::summarise(gdat, n = n()))
Stacked bar chart:
ggplot(data = dat, aes(x = "", y = n, fill = class)) +
geom_bar(width = 1, stat = "identity")
Pie chart:
ggplot(data = dat, aes(x = "", y = n, fill = class)) +
geom_bar(width = 1, stat = "identity") +
coord_polar("y", start = 0) +
guides(fill = guide_legend(title = "Class"))
![\TeX \begin{figure}[h]
\centering
\includegraphics[width=8.01cm]{../images/piechart_ggplot2.png}
\caption{Piechart.}
\label{fig:piechart_ggplot2}
\end{figure}](../images/piechart_ggplot2.png)
set.seed(123); score1 <- rnorm(n = 100, mean = 20, sd = 15)
set.seed(456); score2 <- rnorm(n = 100, mean = 10, sd = 5)
dat <- data.frame(score1, score2)
Scatterplot with the default loess smooth:
ggplot(data = dat, aes(x = score1, y = score2)) +
geom_point() +
stat_smooth() +
scale_x_continuous(breaks = seq(from = 0, to = 100, by = 20), limits = c(-20, 60), name = "X axis label") +
scale_y_continuous(limits = c(-5, 25), name = "Y axis label")
Scatterplot with superimposed regression line (estimated within stat_smooth):
ggplot(data = dat, aes(x = score1, y = score2)) +
geom_point() +
stat_smooth(method = "glm", formula = y ~ x) +
scale_x_continuous(breaks = seq(from = 0, to = 100, by = 20), limits = c(-20, 60), name = "X axis label") +
scale_y_continuous(limits = c(-5, 25), name = "Y axis label")
Scatterplot with superimposed regression line (estimated externally):
dat.lm <- lm(score2 ~ score1, data = dat)
newdat <- data.frame(x = seq(from = 1, to = 100, by = 1))
err <- predict(dat.lm, newdata = newdat, se = TRUE)
newdat$ucl <- err$fit + 1.96 * err$se.fit
newdat$lcl <- err$fit - 1.96 * err$se.fit
ggplot(data = dat, aes(x = score1, y = score2)) +
geom_point() +
stat_smooth(data = newdat, aes(ymin = lcl, ymax = ucl), stat = "identity") +
scale_x_continuous(breaks = seq(from = 0, to = 100, by = 20), limits = c(-20, 60), name = "X axis label") +
scale_y_continuous(limits = c(-5, 25), name = "Y axis label")
![\TeX \begin{figure}[h]
\centering
\includegraphics[width=8.01cm]{../images/scatterplot_ggplot2.png}
\caption{Scatterplot showing Score 2 as a function of Score 1. Superimposed on this plot is a loess smoothed curve with shading to show its 95\% confidence interval.}
\label{fig:scatterplot_ggplot2}
\end{figure}](../images/scatterplot_ggplot2.png)
xpos <- c(c(1:4) - 0.10, c(1:4) + 0.10)
xlab <- rep(c("Jan-94", "Jan-95", "Jan-96", "Jan-97"), times = 2)
Region <- c(rep("North", times = 4), rep("South", times = 4))
est <- c(21,9,23,34,25,12,27,36)
lower <- c(2,1,1,3,3,4,4,5)
upper <- c(65,33,67,99,67,36,71,99)
dat <- data.frame(xpos, xlab, Region, est, lower, upper)
ggplot(data = dat, aes(x = xpos, y = est, colour = Region)) +
geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.1) +
geom_point() +
scale_x_continuous(breaks = c(1,2,3,4), labels = dat$xlab[1:4], name = "Date") +
scale_y_continuous(breaks = seq(from = 0, to = 100, by = 25), labels = seq(from = 0, to = 100, by = 25), name = "Prevalence") +
theme(legend.justification = c(0,1), legend.position = c(0.10,0.95))
![\TeX \begin{figure}[h]
\centering
\includegraphics[width=8.01cm]{../images/error_bar_ggplot2.png}
\caption{Error bar plot showing the prevalence of disease for northern and southern districts as a function of calendar year.}
\label{fig:error_bar_ggplot2}
\end{figure}](../images/error_bar_ggplot2.png)
edate <- seq(from = as.Date("2004-01-01"), to = as.Date("2004-12-31"), by = 1)
obs <- rnorm(n = length(edate), mean = 0, sd = 1)
dat <- data.frame(edate, obs)
ggplot(data = dat, aes(x = edate, y = obs)) +
geom_line() +
scale_x_date(breaks = date_breaks("3 months"), labels = date_format("%b %Y"), name = "Date") +
scale_y_continuous(breaks = seq(from = -6, to = 6, by = 2), labels = seq(from = -6, to = 6, by = 2), limits = c(-6,6), name = "Observed") +
geom_vline(xintercept = as.Date("2004-06-30"), linetype = 2, color = "red") +
annotate("text", x = as.Date("2004-08-31"), y = 5, label = "Off medication")
![\TeX \begin{figure}[h]
\centering
\includegraphics[width=8.01cm]{../images/time_series_ggplot2.png}
\caption{Time series plot showing behaviour score as a function of observation date, 1 January 2004 to 31 December 2004.}
\label{fig:time_series_ggplot2}
\end{figure}](../images/time_series_ggplot2.png)