Graphics using base R

Bar plots

Bar plots should be used when you are showing segments of information - typically time series data. The double (or group) vertical bar plot is an effective way to compare groups. One disadvantage of vertical bar plots is that they lack space for text labelling at the foot of each bar. When category labels are too long, horizontal bar plots are a better way of displaying information.

# Average annual income (x $1000) for males and females, 1990 to 2004.
year <- seq(from = 1990, to = 2004, by = 1)
income.male <- c(18, 18.5, 21, 21, 22, 24, 25, 27.5, 31, 33, 34, 36, 42, 36, 36)
income.female <- c(18.2, 18.7, 23, 24, 21, 27, 29, 29.5, 34, 33, 37, 38, 42.5, 36.7, 40)
# rbind() already returns a matrix: one row per sex, one column per year.
income <- rbind(income.male, income.female)

# Arrange plots on a 2 x 2 grid, filled row by row.
par(mfrow = c(2, 2))

# Single series: one bar per year.
barplot(income.male, xlab = "Year", ylab = "Income (x $1000)",
        names.arg = as.character(year), col = "dark blue", border = "gray",
        main = "Annual income: male")

# Grouped bars: beside = TRUE draws the male and female bars side by side
# within each year instead of stacking them.
barplot(income, xlab = "Year", ylab = "Income (x $1000)",
        names.arg = as.character(year), beside = TRUE,
        col = c("dark blue", "red"), border = c("gray", "gray"),
        main = "Annual income: male and female")

# 'border' must be named: an unnamed vector after 'fill' is matched to the
# 'col' argument of legend(), leaving the legend boxes with the default black
# border instead of the gray border used for the bars.
legend(x = 0, y = 40, legend = c("Male", "Female"),
       fill = c("dark blue", "red"), border = c("gray", "gray"), bty = "n")

\TeX \begin{figure}[h]
\centering
\includegraphics[width=8.01cm]{../images/vertical_barplot.png}
\caption{Vertical bar plot showing average income for males and females, 1990 to 2004.}
\label{fig:vertical_barplot}
\end{figure}

Figure 1: Vertical bar plot showing average income for males and females, 1990 to 2004.

Another example:

# Monthly bins from January 2006 to December 2007 (24 months).
date.bins <- seq(from = as.Date("2006-01-01"), to = as.Date("2007-12-01"), by = "1 month")
# Use format() to build the "Mon-yy" labels: passing 'format' to
# as.character() on a Date is deprecated in recent versions of R.
date.lab <- format(date.bins, format = "%b-%y")

# Monthly counts for the two series, one value per month.
cases <- c(4,5,6,3,2,8,9,12,4,5,7,34,76,98,34,65,23,46,3,5,4,76,2,45)
other <- c(4,6,7,45,65,76,87,78,65,36,63,47,87,46,34,56,35,46,78,35,67,89,56,35)
# One row per series, one column per month.
vals <- rbind(cases, other)

# las = 2 draws axis labels perpendicular to the axis so that the date labels
# do not overlap.
par(las = 2)
barplot(vals, xlab = "Date", ylab = "Number of cases", names.arg = date.lab,
        beside = TRUE, ylim = c(0, 150), cex.axis = 0.75, cex.names = 0.75,
        col = c("dark blue", "red"), border = c("gray", "gray"), main = "")

# 'border' must be named: an unnamed vector after 'fill' is matched to the
# 'col' argument of legend() rather than to 'border'.
legend(x = "topright", legend = c("Small herds", "Large herds"),
       fill = c("dark blue", "red"), border = c("gray", "gray"), bty = "n")

Frequency histograms

Histograms are used to summarise discrete or continuous data that are measured on an interval scale. A histogram divides the range of possible values in a data set into classes or groups. For each group, a rectangle is constructed with a base length equal to the range of values in that group and an area proportional to the number of observations falling into that group. Because the number of observations differs from group to group, the rectangles will generally be of different heights. A histogram has an appearance similar to a vertical bar graph, but when the variables are continuous, there are no gaps between the bars. When the variables are discrete, however, gaps should be left between the bars.

# Simulate 20,000 normally distributed values (mean 1, SD 5). The seed is
# fixed so that the same values are drawn on every run.
set.seed(123)
dat <- data.frame(val = rnorm(n = 20000, mean = 1, sd = 5))

# Two square plotting regions, side by side.
par(mfrow = c(1, 2), pty = "s")

# Left panel: bar heights are counts (freq = TRUE).
hist(
  dat$val,
  freq = TRUE,
  main = "Absolute frequency",
  xlab = "Number", ylab = "Frequency",
  ylim = c(0, 5000),
  col = "dark blue", border = "gray"
)

# Right panel: bar heights are densities (freq = FALSE).
hist(
  dat$val,
  freq = FALSE,
  main = "Relative frequency",
  xlab = "Number", ylab = "Probability",
  ylim = c(0, 0.15),
  col = "dark blue", border = "gray"
)

\TeX \begin{figure}[h]
\centering
\includegraphics[width=8.01cm]{../images/frequency_histogram.png}
\caption{Frequency histograms showing absolute (left) and relative (right) frequencies.}
\label{fig:frequency_histogram}
\end{figure}

Figure 2: Frequency histograms showing absolute (left) and relative (right) frequencies.

Epidemic curves

Outbreak data with one row for each case:

# Simulated outbreak line list: one row per case, recording the onset date
# and the sex of the case.
nmal <- 5
nfem <- 25

# Candidate onset dates: every day from 26 July to 13 August 2004 (19 days).
edate <- seq(from = as.Date("2004-07-26"), to = as.Date("2004-08-13"), by = 1)

# Triangular sampling weights (rising to a peak on day 10, then falling),
# scaled to sum to one.
prob <- c(1:10, 9:1)
prob <- prob / sum(prob)

# Draw onset dates with replacement. The 'prob' argument is spelled out in
# full rather than relying on partial matching of 'p'.
dmal <- sample(x = edate, size = nmal, replace = TRUE, prob = prob)
dfem <- sample(x = edate, size = nfem, replace = TRUE, prob = prob)

sex <- c(rep("Male", nmal), rep("Female", nfem))
dat <- data.frame(edate = c(dmal, dfem), sex = sex)

Plot all cases and all cases by sex:

# Tabulate onset dates on a factor whose levels are every day of the outbreak
# period. Tabulating the raw dates would silently drop days on which no case
# occurred, so the bars would no longer correspond to consecutive calendar
# days (and would not line up with a fixed day-by-day axis).
onset.day <- factor(format(dat$edate), levels = format(edate))
tab1 <- table(onset.day)            # all cases, by day
tab2 <- table(dat$sex, onset.day)   # cases by sex (rows) and day (columns)

# Two square plotting regions side by side, with smaller axis text.
par(mfrow = c(1, 2), pty = "s", cex.axis = 0.75, cex.lab = 0.75)
barplot(tab1)
# A two-way table is drawn as stacked bars: one segment per row (sex).
barplot(tab2)

Tidier plot:

# Day-of-month labels for 26 July to 13 August, and the x position of the
# centre of each bar (bars are drawn with width 1 and no spacing, so bar i is
# centred at i - 0.5).
xlabs <- c(26:31, 1:13)
xvals <- seq(from = 0.5, by = 1, length.out = length(xlabs))

# Stacked bars without the default axes, which are added by hand below.
barplot(height = tab2, ylim = c(0, 10), width = 1, space = 0,
        axisnames = FALSE, axes = FALSE, col = c("red", "blue"),
        border = "grey", xlab = "", ylab = "Number of cases", main = "")
axis(side = 1, at = xvals, labels = xlabs, cex.axis = 0.75)
axis(side = 2, at = 0:10, labels = 0:10, cex.axis = 0.75)

# Month names beneath the day labels, positioned under day 28 (July) and
# day 6 (August).
mtext(text = "July", side = 1, line = 2, at = xvals[xlabs == 28])
mtext(text = "August", side = 1, line = 2, at = xvals[xlabs == 6])
legend(x = "topright", legend = c("Females", "Males"), col = c("red", "blue"),
       pch = 16, bty = "n")

\TeX \begin{figure}[h]
\centering
\includegraphics[width=8.01cm]{../images/epidemic_curve-01.png}
\caption{Epidemic curve with different colours used to indicate counts of male and female cases.}
\label{fig:epidemic_curve-01}
\end{figure}

Figure 3: Epidemic curve with different colours used to indicate counts of male and female cases.

Data set listing, for each case, the date of onset. Plot a histogram of date of onset for the period 1 January 2000 to 31 December 2000, grouped by 1-month periods:

# First day of the study period. An ISO 8601 date string is parsed the same
# way in every locale (month abbreviations such as "Jan" are locale-dependent).
day.1 <- as.Date("2000-01-01")

# 200 simulated onset dates spread uniformly over the year 2000. The year 2000
# is a leap year (366 days), so an upper limit of 366 is needed for
# 31 December to be a possible onset date.
donset <- day.1 + runif(n = 200, min = 0, max = 366)

# Monthly break points: 1 January 2000 to 1 January 2001.
dbins.01m <- seq(from = day.1, to = as.Date("2001-01-01"), by = "1 month")

# Two-week break points. A sequence that stops at or before 1 January 2001
# ends on 30 December 2000 and leaves the last days of the year outside the
# bins, which makes hist() fail whenever an onset date falls there. Using 28
# break points extends the final bin to 13 January 2001.
dbins.02w <- seq(from = day.1, by = "2 weeks", length.out = 28)

Plot by one month:

# Counts of onset dates per calendar month (freq = TRUE plots counts).
hist(
  x = donset,
  breaks = dbins.01m,
  freq = TRUE,
  main = "",
  xlab = "Date",
  ylab = "Number of outbreaks",
  col = "dark blue",
  border = "gray"
)

Plot by two weeks:

# Counts of onset dates per two-week period (freq = TRUE plots counts).
hist(
  x = donset,
  breaks = dbins.02w,
  freq = TRUE,
  main = "",
  xlab = "Date",
  ylab = "Number of outbreaks",
  col = "dark blue",
  border = "gray"
)

Box and whisker plots

A box and whisker plot (sometimes called a boxplot) is a graph that presents information from a five-number summary. It does not show a distribution in as much detail as a stem and leaf plot or histogram, but is especially useful for indicating whether a distribution is skewed and whether there are outliers in the data set. Box and whisker plots are also very useful when two or more data sets are being compared.

# Simulated scores for three groups of 50, drawn in the order A, B, C from
# normal distributions with a different mean and standard deviation per group.
score <- unlist(Map(rnorm, n = 50, mean = c(2, 1.5, 1), sd = c(2, 2.5, 3.5)))
grp <- rep(c("A", "B", "C"), each = 50)
dat <- data.frame(grp, score)

# Graphical parameters for the boxes: narrow grey boxes with a thick white
# median line, dotted whiskers, and outliers drawn as solid lines with no
# plotting symbol.
splus <- list(
  boxwex = 0.25, staplewex = 1, outwex = 1,
  boxfill = "grey40",
  medlwd = 3, medcol = "white",
  whisklty = 3,
  outlty = 1, outpch = " "
)

# Horizontal boxes, so the score is on the x-axis and the group on the y-axis.
boxplot(score ~ grp, data = dat, horizontal = TRUE, pars = splus,
        xlab = "Score", ylab = "Group")
# Dashed vertical reference line at a score of 1.
abline(v = 1, lty = 2)

\TeX \begin{figure}[h]
\centering
\includegraphics[width=8.01cm]{../images/boxwhisker.png}
\caption{Box and whisker plot. The lines within the boxes indicate the median depression score for each group. The lower and upper bounds of the boxes represent the 25th and 75th percentiles of the distribution of depression scores, respectively. The horizontal lines (whiskers) extending from the boxes reach the most extreme observations lying within 1.5 times the interquartile range of the box. Vertical lines beyond the whiskers represent outliers (extreme values).}
\label{fig:boxwhisker}
\end{figure}

Figure 4: Box and whisker plot. The lines within the boxes indicate the median depression score for each group. The lower and upper bounds of the boxes represent the 25th and 75th percentiles of the distribution of depression scores, respectively. The horizontal lines (whiskers) extending from the boxes reach the most extreme observations lying within 1.5 times the interquartile range of the box. Vertical lines beyond the whiskers represent outliers (extreme values).

Line plots

A line plot provides a visual comparison of how two variables are related or vary with each other. The y-axis in a line graph usually indicates quantity (e.g., dollars, litres) or percentage, while the horizontal x-axis often measures units of time.

Although they do not present specific data as well as tables, line graphs are able to show relationships more clearly. Line graphs can also depict multiple series, and are usually the best choice for time series data and frequency distributions. Bar and column graphs and line graphs share a similar purpose. The column graph, however, reveals a change in magnitude, whereas the line graph is used to show a change in direction.

# Average annual income (x $1000) by sex, 1990 to 2004, held in a data frame
# with one row per year.
year <- seq(from = 1990, to = 2004, by = 1)
income.male <- c(18, 18.5, 21, 21, 22, 24, 25, 27.5, 31, 33, 34, 36, 42, 36, 36)
income.female <- c(18.2, 18.7, 23, 24, 21, 27, 29, 29.5, 34, 33, 37, 38, 42.5, 36.7, 40)
dat <- data.frame(year, male = income.male, female = income.female)

# L-shaped box around the plot (bty = "l") and horizontal axis labels (las = 1).
par(bty = "l", las = 1)

# Solid line for males ...
plot(
  x = dat$year, y = dat$male,
  type = "l", lty = 1,
  ylim = c(0, 50),
  xlab = "Year", ylab = "Income (x $1000)",
  main = ""
)
# ... and a dashed line for females, added to the same axes.
lines(x = dat$year, y = dat$female, type = "l", lty = 2)

legend(x = 1990, y = 45, legend = c("Male", "Female"), lty = c(1, 2), bty = "n")

\TeX \begin{figure}[h]
\centering
\includegraphics[width=8.01cm]{../images/lineplot.png}
\caption{Line plot showing average annual income for males and females as a function of fiscal year, 1990 to 2004.}
\label{fig:lineplot}
\end{figure}

Figure 5: Line plot showing average annual income for males and females as a function of fiscal year, 1990 to 2004.

Pie charts

A pie chart is a way of summarising a set of categorical data or displaying the percentage distribution of a given variable. This type of chart is a circle divided into a series of segments. Each segment represents a particular category. The area of each segment is the same proportion of a circle as the category is of the total data set. The use of the pie chart is quite popular, as the circle provides a visual concept of the whole (100%). Pie charts should be used sparingly for two reasons. Firstly, they are best used for displaying statistical information when there are no more than six categories (otherwise they appear too complex). Secondly, pie charts are not useful when the values of each component are similar because it is difficult to see the differences between slice sizes.

# Group membership for 95 observations: 25 in A, 30 in B and 40 in C.
grp <- rep(c("A", "B", "C"), times = c(25, 30, 40))
# Number of observations in each group.
n <- table(grp)

# One slice per group. edges = 200 sets the number of polygon edges used to
# approximate the circle's outline.
pie(
  x = n,
  labels = c("Score 1", "Score 2", "Score 3"),
  col = c("blue", "red", "green"),
  edges = 200,
  radius = 0.8,
  main = ""
)

\TeX \begin{figure}[h]
\centering
\includegraphics[width=8.01cm]{../images/piechart.png}
\caption{Pie chart.}
\label{fig:piechart}
\end{figure}

Figure 6: Pie chart.

Scatterplots

Scatterplots are used to indicate the type and strength of relationship between two continuous variables.

# Two simulated scores of 100 values each. Each is drawn after setting its own
# seed, so the same values are produced on every run.
set.seed(123)
score1 <- rnorm(n = 100, mean = 20, sd = 15)
set.seed(456)
score2 <- rnorm(n = 100, mean = 10, sd = 5)
dat <- data.frame(score1, score2)

# Solid points (pch = 16), with Score 1 on the x-axis and Score 2 on the y-axis.
with(dat, plot(x = score1, y = score2, pch = 16,
               xlab = "Score 1", ylab = "Score 2"))
# Dashed horizontal reference line at zero.
abline(h = 0, lty = 2)

\TeX \begin{figure}[h]
\centering
\includegraphics[width=8.01cm]{../images/scatterplot.png}
\caption{Scatterplot showing Score 2 as a function of Score 1.}
\label{fig:scatterplot}
\end{figure}

Figure 7: Scatterplot showing Score 2 as a function of Score 1.

Stem and leaf plots

A stem and leaf plot looks something like a bar graph. Each number in the data is broken down into a stem and a leaf, thus the name. The stem of the number includes all but the last digit. The leaf of the number will always be a single digit. The main advantages of stem and leaf plots are: (1) the distribution of the data can be readily appreciated, and (2) all of the original data is shown, as part of the plot.

# Draw 100 values from a normal distribution (mean 20, SD 15) and print a
# stem and leaf display of them to the console.
score1 <- rnorm(100, mean = 20, sd = 15)
stem(x = score1)

\TeX \begin{figure}[h]
\centering
\includegraphics[width=8.01cm]{../images/stem_leaf.png}
\caption{Stem and leaf plot. The decimal point is 1 digit(s) to the right of the | line.}
\label{fig:stem_leaf}
\end{figure}

Figure 8: Stem and leaf plot. The decimal point is 1 digit(s) to the right of the | line.

Error bar plots

Plot a continuous variable (here, prevalence) on the y-axis versus a categorical time variable on the x-axis, stratified by group (here, northern and southern districts). The error bars for the two groups are plotted on the same graph, offset slightly to either side of each time point, to facilitate interpretation. Note the use of cex to adjust the size of the axis labels.

# Estimates with lower and upper error-bar limits for two regions at four
# time points. The x positions of the two regions are offset by 0.10 either
# side of each time point so that their error bars do not overlap.
xpos <- c(1:4 - 0.10, 1:4 + 0.10)
xlab <- rep(c("Jan-94", "Jan-95", "Jan-96", "Jan-97"), times = 2)
Region <- rep(c("North", "South"), each = 4)
est <- c(21,9,23,34,25,12,27,36)
lower <- c(2,1,1,3,3,4,4,5)
upper <- c(65,33,67,99,67,36,71,99)
dat <- data.frame(xpos, xlab, Region, est, lower, upper)

# One colour per row, looked up from the region. This keeps points and error
# bars consistent and avoids hard-coding which rows belong to which region.
reg.col <- c(North = "red", South = "blue")
pt.col <- unname(reg.col[as.character(dat$Region)])

# Empty plot (type = "n") without an x-axis; points and a labelled axis are
# added afterwards.
plot(x = dat$xpos, y = dat$est, xlab = "Month", ylab = "Prevalence",
     xlim = c(0.75, 4.25), ylim = c(0, 100), xaxt = "n", type = "n",
     cex.axis = 0.8, cex.lab = 0.8)
points(x = dat$xpos, y = dat$est, type = "p", col = pt.col, pch = 16)
axis(side = 1, at = 1:4, labels = dat$xlab[1:4], tick = TRUE, cex.axis = 0.8)

# segments() is vectorised, so all eight error bars are drawn by four calls.
gap <- 2.5    # space left between the point and the start of each bar
cap <- 0.025  # half-width of the horizontal cap at the end of each bar

# Vertical bars from just above/below the estimate to the upper/lower limit.
segments(x0 = dat$xpos, y0 = dat$est + gap, x1 = dat$xpos, y1 = dat$upper,
         col = pt.col, lwd = 1)
segments(x0 = dat$xpos, y0 = dat$est - gap, x1 = dat$xpos, y1 = dat$lower,
         col = pt.col, lwd = 1)
# Horizontal caps at the upper and lower limits.
segments(x0 = dat$xpos - cap, y0 = dat$upper, x1 = dat$xpos + cap, y1 = dat$upper,
         col = pt.col, lwd = 1)
segments(x0 = dat$xpos - cap, y0 = dat$lower, x1 = dat$xpos + cap, y1 = dat$lower,
         col = pt.col, lwd = 1)

legend(x = 1, y = 100, legend = c("Northern districts", "Southern districts"),
       col = c("red", "blue"), pch = 16, bty = "n", cex = 0.8)

\TeX \begin{figure}[h]
\centering
\includegraphics[width=8.01cm]{../images/error_bar.png}
\caption{Error bar plot showing the prevalence of disease for northern and southern districts as a function of calendar year.}
\label{fig:error_bar}
\end{figure}

Figure 9: Error bar plot showing the prevalence of disease for northern and southern districts as a function of calendar year.

Time series plots

Plot of a continuous variable on the y-axis versus a date variable on the x-axis:

# One simulated observation (standard normal) for each day of 2004.
edate <- seq(from = as.Date("2004-01-01"), to = as.Date("2004-12-31"), by = 1)
obs <- rnorm(n = length(edate), mean = 0, sd = 1)
dat <- data.frame(edate, obs)

plot(x = dat$edate, y = dat$obs, ylim = c(-5, 5), xlab = "Date",
     ylab = "Observed", type = "l", lty = 1, cex.axis = 0.75, cex.lab = 0.75)

# Date on which medication was stopped.
off.med <- as.Date("2004-06-30")

# Dashed vertical line at that date. Only the date is passed: combining it
# with a numeric 0 would drop the Date class and add a second line at day 0
# (1 January 1970), far outside the plotted range.
abline(v = off.med, lty = 2)

# Left-aligned label (adj = 0) placed 15 days to the right of the line.
text(x = off.med + 15, y = 4, labels = "Off medication", adj = 0, cex = 0.75)

\TeX \begin{figure}[h]
\centering
\includegraphics[width=8.01cm]{../images/time_series.png}
\caption{Time series plot showing behaviour score as a function of observation date, 1 January 2004 to 31 December 2004.}
\label{fig:time_series}
\end{figure}

Figure 10: Time series plot showing behaviour score as a function of observation date, 1 January 2004 to 31 December 2004.