library(tidyverse)
library(packageRank)
library(plotly)
library(lubridate)
5 Plots to Visualize Download Trends
The following code creates several plots that visualize trends over time in the CRAN download counts of ggplot2 packages.
First import the necessary packages.
Read in the packages stored in the ‘all_packages.csv’ file scraped from the tidyverse gallery and CRAN webpages in previous chapters.
# Load the package table written out in earlier chapters.
# NOTE(review): the positional indexing below presumes the rows are sorted by
# download count in descending order with ggplot2 in row 1 -- confirm against
# the code that writes all_packages.csv.
sorted_packages <- read_csv("generated_data/all_packages.csv")

# Top 30 packages by download count, excluding ggplot2 (rows 2 to 31).
top_packages <- sorted_packages$package[2:31]

# All packages that can be found on CRAN (CRAN download count is not NA).
cran_packages <- sorted_packages$package[!is.na(sorted_packages$downloads)]
Defines a ‘get_cd_data’ function that retrieves historical CRAN download count data using the cranDownloads function of packageRank.
# Retrieve the CRAN download-count history for `pkg`.
#
# pkg: package name(s), forwarded as cranDownloads(packages = pkg).
# Returns the `cranlogs.data` element of the cranDownloads() result.
# `to = 2025` is passed straight through to cranDownloads(); see the
# packageRank documentation for how a year-only bound is interpreted.
get_cd_data <- function(pkg) {
  cranDownloads(packages = pkg, to = 2025)$cranlogs.data
}
Retrieves CRAN download data for the top 30 packages and for all CRAN packages.
# Download histories of the top 30 packages, row-bound into one data frame.
dc_top_packages <- map_dfr(top_packages, get_cd_data)

# Download histories of every package that is available on CRAN.
dc_cran_packages <- map_dfr(cran_packages, get_cd_data)
Creates an interactive time series plot that shows daily download counts of the top 30 ggplot packages across time.
# Smoothed daily download counts over time, one line per package.
dc_history_plot <- ggplot(
  dc_top_packages,
  aes(x = date, y = count, color = package)
) +
  geom_smooth(se = FALSE, linewidth = .5) +
  labs(title = "Downloads Across Time", x = "Date", y = "Download Count")

# Use the plotly package to make the plot interactive, then display it.
dc_history_plotly <- ggplotly(dc_history_plot)
dc_history_plotly
Creates an interactive plot that shows most downloaded dates and the respective download counts of the top 30 ggplot packages.
# Find, for each package, the date with the highest download count.
# which.max() keeps only the first row when several dates tie for the maximum.
max_date_df <- dc_top_packages %>%
  group_by(package) %>%
  slice(which.max(count)) %>%
  ungroup() %>%  # drop the grouping so later steps see a plain data frame
  select(package, max_date = date, max_downloads = count)

# Scatterplot of the most-downloaded dates and their download counts.
max_dc_plot <- ggplot(
  max_date_df,
  aes(x = max_date, y = max_downloads, color = package)
) +
  geom_point() +
  labs(
    title = "Max Download Dates by Package",
    x = "Max Download Date",
    y = "Download Count"
  )

# Use plotly to make the plot interactive, then display it.
max_dc_plotly <- ggplotly(max_dc_plot)
max_dc_plotly
Creates an interactive plot that shows the average daily download count of all CRAN packages over time.
# Average download count across packages for every date (NA counts ignored).
# The summary column intentionally shares the data frame's name, `average_dc`.
average_dc <- dc_cran_packages %>%
  group_by(date) %>%
  summarize(average_dc = mean(count, na.rm = TRUE))

# One small point per date.
average_dc_plot <- ggplot(average_dc, aes(x = date, y = average_dc)) +
  geom_point(size = 0.05) +
  labs(title = "Average Downloads by Date", x = "Date", y = "Download Count")

# Use plotly to make the plot interactive, then display it.
average_dc_plotly <- ggplotly(average_dc_plot)
average_dc_plotly
Creates boxplots that show the daily download count of the top 30 ggplot packages for every month and every day of the week.
# Label each daily download count with its month and its day of the week.
seasonal_data <- dc_top_packages %>%
  mutate(
    month = month(date, label = TRUE),
    day_of_week = wday(date, label = TRUE)
  )

# Seasonal boxplots by month and by day of the week.
# coord_cartesian() zooms the y axis to 0-5000 without discarding data.
# ylim()/scale limits would drop counts above 5000 *before* the boxplot
# statistics are computed, which distorts the medians and quartiles.
dc_monthly <- ggplot(seasonal_data, aes(x = month, y = count)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 5000)) +
  labs(title = "Months in Downloads", x = "Month", y = "Download Count")
dc_monthly

dc_daily <- ggplot(seasonal_data, aes(x = day_of_week, y = count)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 5000)) +
  labs(
    title = "Days of the Week in Downloads",
    x = "Day of Week",
    y = "Download Count"
  )
dc_daily