library(tidyverse)
library(rvest) #html webscraping
library(packageRank) #to retrieve CRAN download counts
3 CRAN Packages Webpage
The following code scrapes the names of packages in the ggplot environment from the CRAN page (https://cran.r-project.org/web/packages/available_packages_by_name.html) and then retrieves the cumulative historical CRAN download count for each.
First import the necessary packages:
Read in the downloaded CRAN packages (alphabetical order) page html file.
# Parse the locally saved copy of the CRAN "Available Packages By Name" page
df <- read_html("raw_data/CRAN_ Available Packages By Name.html")
Scrape all packages names from the page.
# Text of every <span class="CRAN"> element, i.e. the package names in page
# order. `names` is a plain character vector here; calls to the base function
# names() still resolve to the function.
names <- df |>
  html_elements("span.CRAN") |>
  html_text()
Filter for the names that start with gg by finding the index boundaries and store in gg_package_names.
# Beginning index for 'gg' names: first name whose first two characters are "gg"
gg_start <- which(substr(names, 1, 2) == "gg")[1]
# End index for 'gg' names: the element just before the first "gh" name
gg_end <- which(substr(names, 1, 2) == "gh")[1] - 1
# Every name between the two boundaries, inclusive
gg_package_names <- names[gg_start:gg_end]
Remove packages that start with gg but are not relevant to ggplot2. These names were determined manually and saved in the ‘non_ggplot_packages.csv’ file.
# First column of the manually curated list of 'gg' packages to exclude
remove_packages <- read.csv("raw_data/non_ggplot_packages.csv")[[1]]
# Keep only the names that are not on the exclusion list
ggplot_package_names <- gg_package_names[!gg_package_names %in% remove_packages]
Scrape all package descriptions from the page. The resulting vector elements alternate between a package name and its description (e.g. ‘ggplot2’, ‘Create Elegant Data Visualisations Using the Grammar of Graphics’, ‘ggplot2.utils’, etc.), and is offset by a “” element between every letter change (e.g. between the end of all ‘a’ names and start of ‘b’ names).
# Text of every table cell on the page; the resulting vector alternates
# between package names and their descriptions
desc <- df |>
  html_elements("div.container") |>
  html_elements("table") |>
  html_elements("tbody") |>
  html_elements("tr") |>
  html_elements("td") |>
  html_text()
Filter for only the packages with names that start with ‘gg’ by identifying index boundaries, and remove package names, keeping only descriptions, by retaining every other element to store in new vector.
# Beginning index for 'gg' names in descriptions
first_gg <- which(substr(desc, 1, 2) == "gg")[1]
# End index for 'gg' names in descriptions: element just before the first "gh"
end_gg <- which(substr(desc, 1, 2) == "gh")[1] - 1
# Name/description cells that fall inside the 'gg' range
desc_gg <- desc[first_gg:end_gg]
# Select only descriptions (remove package names) by keeping every other
# element; the logical index c(FALSE, TRUE) is recycled along the vector
gg_descriptions <- desc_gg[c(FALSE, TRUE)]
Remove descriptions of packages starting with gg but not relevant to ggplot2. Found manually and saved in ‘non_ggplot_desc.csv’ file.
# Replace line breaks in descriptions with spaces to match the csv file
# formatting; fixed = TRUE matches the newline literally rather than as a regex
gg_descriptions <- gsub("\n", " ", gg_descriptions, fixed = TRUE)
# First column of the manually curated list of descriptions to exclude
remove_desc <- read.csv("raw_data/non_ggplot_desc.csv")[[1]]
# Keep only the descriptions that are not on the exclusion list
ggplot_descriptions <- gg_descriptions[!gg_descriptions %in% remove_desc]
The following block of code finds additional ggplot packages that do not start with gg. The names and descriptions are saved in csv files that can be read in for the future.
# All scraped cells outside the 'gg' range. Negative indexing drops exactly
# first_gg:end_gg and stays correct when the range touches either end of
# `desc`, where 1:(first_gg - 1) or (end_gg + 1):length(desc) would run backwards.
non_gg_desc <- desc[-(first_gg:end_gg)]
# Indices at which either the description or the name contains 'ggplot';
# the first index is removed since it is a duplicate
gg_indices <- grep("ggplot", non_gg_desc)[-1]
# Remove a known duplicate index where name and description both contain
# 'ggplot'. A logical filter leaves the vector unchanged if the index is
# absent, whereas -match() would collapse it to a single NA.
gg_indices <- gg_indices[gg_indices != 30865]
# Manually found indices at which the *name* contains 'ggplot'; shift each one
# to the next element, which holds that package's description
replace <- c(13588, 20251, 29832)
name_hits <- gg_indices %in% replace
gg_indices[name_hits] <- gg_indices[name_hits] + 1
# Names sit one element before each description
add_ggplot_names <- non_gg_desc[gg_indices - 1]
add_ggplot_desc <- non_gg_desc[gg_indices]
# Save the additional names and descriptions for future reference
write_csv(
  data.frame(names = add_ggplot_names, indices = gg_indices - 1),
  "raw_data/add_ggplot_names.csv"
)
write_csv(
  data.frame(desc = add_ggplot_desc, indices = gg_indices),
  "raw_data/add_ggplot_desc.csv"
)
Add the additional ggplot package names and descriptions found and saved in the code block above.
# First column of each saved csv: the additional package names / descriptions
add_gg_names <- read.csv("raw_data/add_ggplot_names.csv")[[1]]
add_gg_desc <- read.csv("raw_data/add_ggplot_desc.csv")[[1]]
# Append the additional entries after the 'gg'-prefixed ones, in the same
# order for names and descriptions
all_ggplot_packages <- c(ggplot_package_names, add_gg_names)
all_ggplot_desc <- c(ggplot_descriptions, add_gg_desc)
To find the most current total historical download count, set a target_date of two days before today. Depending on the time of day, cranDownloads is updated to either 1 or 2 days previous to the current day.
# Date for which the cumulative download count is read: two days before today
target_date <- Sys.Date() - 2
The function get_total_downloads below takes a package name and retrieves the cumulative count of that package’s CRAN downloads up to the set target_date, using the cranDownloads function of packageRank. It handles the error that arises when a package is not found on CRAN. The result is returned as a data frame.
# Retrieve the cumulative CRAN download count of one package as of a date.
#
# pkg:   package name, passed to cranDownloads() as `packages`.
# as_of: Date whose 'cumulative' value is reported; defaults to the global
#        `target_date`, so existing one-argument calls behave as before.
#
# Returns a one-row data.frame with columns `package` and `downloads`.
# `downloads` is NA when cranDownloads() raises an error, or when no row of
# the returned log has a date equal to `as_of`.
get_total_downloads <- function(pkg, as_of = target_date) {
  # Request history up to the year of `as_of` instead of a hard-coded year, so
  # the requested range keeps containing the date that is looked up below
  cd <- tryCatch(
    cranDownloads(packages = pkg, to = as.numeric(format(as_of, "%Y"))),
    # if the package is not found on CRAN, signal "no data" with NULL
    error = function(e) NULL
  )
  if (is.null(cd)) {
    count <- NA
  } else {
    logs <- cd$cranlogs.data
    # the 'cumulative' value on a given date is the total count up to that date
    hits <- logs$cumulative[logs$date == as_of]
    # a single value is reported; NA when the date is not present in the log
    count <- if (length(hits) > 0) hits[[1]] else NA
  }
  data.frame(package = pkg, downloads = count)
}
Retrieve the historic CRAN download count for each package by mapping get_total_downloads across the scraped package names (all_ggplot_packages) and combining the returned data frames into one. This will take a few minutes to complete.
# Call get_total_downloads() once per package and row-bind the one-row results
cran_packages <- map_dfr(all_ggplot_packages, get_total_downloads)
Store data in new dataframe with a column indicating CRAN page as the source and export as csv file.
# Attach the scraped descriptions (same order as the package names) and flag
# every row as originating from the CRAN page
cran_packages$description <- all_ggplot_desc
cran_packages$CRAN <- TRUE

head(cran_packages)
package downloads
1 gg.gap 42826
2 gg1d 1046
3 ggalign 5514
4 ggaligner 3649
5 ggalignment 10115
6 ggallin 35439
description CRAN
1 Define Segments in y-Axis for 'ggplot2' TRUE
2 Exploratory Data Analysis using Tiled One-Dimensional Graphics TRUE
3 A 'ggplot2' Extension for Consistent Axis Alignment TRUE
4 Visualizing Sequence Alignment by Generating Publication-Ready Plots TRUE
5 Plots 'D&D'-Style Alignment Charts TRUE
6 Grab Bag of 'ggplot2' Functions TRUE
# Export the assembled table as a csv file
cran_packages |>
  write_csv("generated_data/cran_packages.csv")