3 Appendix B: CRAN Packages Webpage Scrape

The following code scrapes the names of packages in the ggplot2 ecosystem from the CRAN page (https://cran.r-project.org/web/packages/available_packages_by_name.html) and then retrieves the cumulative historical CRAN download count for each.

First import the necessary packages:

library(tidyverse)
library(rvest) #html webscraping
library(packageRank) #to retrieve CRAN download counts

Read in the downloaded CRAN packages (alphabetical order) page html file.

# Parse the locally saved copy of the CRAN "Available Packages By Name" page
df <- read_html(x = "raw_data/CRAN_ Available Packages By Name.html")

Scrape all package names from the page.

# Text of every element matching "span.CRAN", i.e. the package names on the page
names <- html_text(html_elements(df, "span.CRAN"))

Filter for the names that start with gg by finding the index boundaries of that range, and store them in gg_package_names.

# First position whose two-letter prefix is 'gg'
gg_start <- match("gg", substr(names, 1, 2))
# Position just before the first 'gh' prefix, i.e. the end of the 'gg' run
gg_end <- match("gh", substr(names, 1, 2)) - 1

# Contiguous slice of names between the two boundaries
gg_package_names <- names[gg_start:gg_end]

Remove packages that start with gg but are not relevant to ggplot2. These names were determined manually and saved in the ‘non_ggplot_packages.csv’ file.

# First column of the manually curated exclusion list
remove_packages <- read.csv(file = "raw_data/non_ggplot_packages.csv")[, 1]

# Keep only the names that have no match in the exclusion list
ggplot_package_names <- gg_package_names[
  is.na(match(gg_package_names, remove_packages))
]

Scrape all package descriptions from the page. The elements of the resulting vector alternate between a package name and its description (e.g. ‘ggplot2’, ‘Create Elegant Data Visualisations Using the Grammar of Graphics’, ‘ggplot2.utils’, etc.). The alternation is offset by an empty “” element at every letter change (e.g. between the end of all ‘a’ names and the start of the ‘b’ names).

# Apply the nested selectors one level at a time (container -> table ->
# tbody -> tr -> td) and take the text of the resulting cells. The vector
# alternates between package names and their descriptions.
desc <- Reduce(
  html_elements,
  c("div.container", "table", "tbody", "tr", "td"),
  df
) |>
  html_text()

Filter for only the packages with names that start with ‘gg’ by identifying the index boundaries of that range. Then drop the package names and keep only the descriptions by retaining every other element, storing the result in a new vector.

# Position of the first element whose two-letter prefix is 'gg'
first_gg <- match("gg", substr(desc, 1, 2))

# Position just before the first element whose two-letter prefix is 'gh'
end_gg <- match("gh", substr(desc, 1, 2)) - 1

desc_gg <- desc[first_gg:end_gg]

# Even positions hold the descriptions; odd positions hold the package names
gg_descriptions <- desc_gg[seq_along(desc_gg) %% 2 == 0]

Remove descriptions of packages starting with gg but not relevant to ggplot2. Found manually and saved in ‘non_ggplot_desc.csv’ file.

# Turn embedded line breaks into spaces so the text matches the csv entries
gg_descriptions <- gsub(pattern = "\n", replacement = " ", x = gg_descriptions)

# First column of the manually curated list of descriptions to exclude
remove_desc <- read.csv(file = "raw_data/non_ggplot_desc.csv")[, 1]

# Keep only the descriptions that have no match in the exclusion list
ggplot_descriptions <- gg_descriptions[
  is.na(match(gg_descriptions, remove_desc))
]

The following block of code finds additional ggplot packages whose names do not start with gg. Their names and descriptions are saved in csv files that can be read back in on future runs.

# Everything in `desc` outside the 'gg' range located earlier. Removing the
# range with a negative index avoids `1:(first_gg - 1)` and
# `(end_gg + 1):length(desc)`, which count backwards (and so pick up wrong
# elements) when the range touches either end of `desc`.
non_gg_desc <- desc[-(first_gg:end_gg)]

# Positions at which either the name or the description contains 'ggplot';
# the first hit is dropped because it is a duplicate
gg_indices <- grep("ggplot", non_gg_desc)[-1]

# Manually determined positions in `non_gg_desc`. They are only valid for the
# saved snapshot of the page they were found in.
# Known duplicate: name and description both contain 'ggplot'
duplicate_index <- 30865
# Hits where the name, rather than the description, contains 'ggplot'
name_hit_indices <- c(13588, 20251, 29832)

# Make a changed snapshot visible instead of silently producing wrong output
stale <- setdiff(c(duplicate_index, name_hit_indices), gg_indices)
if (length(stale) > 0) {
  warning(
    "Manual indices not among the 'ggplot' matches ",
    "(has the page snapshot changed?): ",
    paste(stale, collapse = ", "),
    call. = FALSE
  )
}

# Drop the known duplicate. setdiff() leaves the vector unchanged when the
# index is absent, whereas `gg_indices[-match(...)]` would subscript with NA
# and collapse the whole vector to a single NA.
gg_indices <- setdiff(gg_indices, duplicate_index)

# Where the match was in the name, point at the description that follows it
name_pos <- match(name_hit_indices, gg_indices)
name_pos <- name_pos[!is.na(name_pos)]
gg_indices[name_pos] <- gg_indices[name_pos] + 1

# Each name is the element immediately before its description
add_ggplot_names <- non_gg_desc[gg_indices - 1]
add_ggplot_desc <- non_gg_desc[gg_indices]

# Save the additional names and descriptions for future reference
write_csv(
  data.frame(names = add_ggplot_names, indices = gg_indices - 1),
  "raw_data/add_ggplot_names.csv"
)
write_csv(
  data.frame(desc = add_ggplot_desc, indices = gg_indices),
  "raw_data/add_ggplot_desc.csv"
)

Add the additional ggplot package names and descriptions found and saved in the code block above.

# Reload the saved additions; only the first column of each file is used
add_gg_names <- read.csv(file = "raw_data/add_ggplot_names.csv")[, 1]
add_gg_desc <- read.csv(file = "raw_data/add_ggplot_desc.csv")[, 1]

# Place the additions after the 'gg'-prefixed entries
all_ggplot_packages <- append(ggplot_package_names, add_gg_names)
all_ggplot_desc <- append(ggplot_descriptions, add_gg_desc)

To find the most current total historical download count, set target_date to two days before today. Depending on the time of day, the data returned by cranDownloads runs up to either 1 or 2 days before the current day.

# Two days before today: the date whose cumulative count is looked up
target_date <- Sys.Date() - 2

The function get_total_downloads below takes a package name and retrieves the cumulative count of that package’s CRAN downloads up until the set target_date, using the cranDownloads function of packageRank. It handles the error that arises when a package is not found on CRAN by recording NA. The result is returned as a one-row dataframe.

# Look up the cumulative CRAN download count of one package on a given date.
#
# pkg:  package name, passed to cranDownloads().
# date: date whose cumulative count is wanted (a Date, or anything as.Date()
#       accepts); defaults to the global `target_date`.
#
# Returns a one-row data frame with columns `package` and `downloads`.
# `downloads` is NA when the lookup fails or `date` is not in the returned log.
get_total_downloads <- function(pkg, date = target_date) {
  date <- as.Date(date)

  # The original call passed `to = 2025` to pull the entire download history.
  # A fixed year stops covering `date` once the calendar moves past it, after
  # which every lookup comes back NA, so the year is taken from `date` instead.
  cd <- tryCatch(
    cranDownloads(packages = pkg, to = as.numeric(format(date, "%Y"))),

    # e.g. the package is not found on CRAN: report why and fall through to NA
    error = function(e) {
      warning(
        "No download data for '", pkg, "': ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )

  # The 'cumulative' value on a particular date is the total download count up
  # to that date. When `cd` is NULL, every extraction below is NULL as well.
  daily <- cd$cranlogs.data
  hits <- daily$cumulative[which(daily$date == date)]

  # No row for `date` (or a failed lookup) is recorded as NA
  count <- if (length(hits) == 0) NA_real_ else hits[1]

  data.frame(package = pkg, downloads = count)
}

Retrieve the historic CRAN download count for each package by mapping get_total_downloads across the collected package names (all_ggplot_packages) and combining the returned dataframes into one. This will take a few minutes to complete.

# One lookup per package; the one-row results are row-bound into one data frame
cran_packages <- all_ggplot_packages |>
  map_dfr(get_total_downloads)

Add the package descriptions and a column indicating the CRAN page as the source to the dataframe, then export it as a csv file with a timestamp indicating when the data was generated.

# The descriptions were filtered separately from the names, so confirm there
# is exactly one per row before attaching them. On a plain data frame a
# shorter vector whose length divides the row count would otherwise be
# recycled without any error.
stopifnot(length(all_ggplot_desc) == nrow(cran_packages))

cran_packages$description <- all_ggplot_desc
# Mark every row as coming from the CRAN page
cran_packages$CRAN <- TRUE

head(cran_packages)

      package downloads
1      gg.gap     44343
2        gg1d      1582
3     ggalign      8215
4   ggaligner      4053
5 ggalignment     10723
6     ggallin     36188
                                                           description CRAN
1                              Define Segments in y-Axis for 'ggplot2' TRUE
2       Exploratory Data Analysis using Tiled One-Dimensional Graphics TRUE
3                  A 'ggplot2' Extension for Consistent Axis Alignment TRUE
4 Visualizing Sequence Alignment by Generating Publication-Ready Plots TRUE
5                                   Plots 'D&D'-Style Alignment Charts TRUE
6                                      Grab Bag of 'ggplot2' Functions TRUE

# Header line recording when this data was produced
timestamp <- paste("# CRAN data generated on:", Sys.time())

# The table rendered as csv text
data_lines <- cran_packages |> format_csv()

# Header line first, then the csv text
c(timestamp, data_lines) |>
  write_lines("generated_data/cran_packages.csv")

Read in and merge the data on ggplot packages from the gallery and from the CRAN webpage, and store the result in a new dataframe, packages.

# The first line of each file is read separately as a timestamp elsewhere in
# this script, so it is skipped here
gallery_packages <- read_csv("generated_data/gallery_packages.csv", skip = 1)

cran_packages <- read_csv("generated_data/cran_packages.csv", skip = 1)

# Match rows on the package name only. Using `downloads` as a second key
# splits a package into two rows whenever the two files hold different counts
# for it, e.g. because they were generated on different days.
packages <- full_join(
  gallery_packages,
  cran_packages,
  by = "package",
  suffix = c(".gallery", ".cran")
) |>
  mutate(
    # The counts are cumulative, so the larger value is the more recent one;
    # na.rm keeps the count of a package that appears in only one source
    downloads = pmax(downloads.gallery, downloads.cran, na.rm = TRUE),
    .after = package
  ) |>
  select(-downloads.gallery, -downloads.cran)

Sort by download count in descending order and save the result as all_packages.csv, headed by the timestamps of both source files.

# Most-downloaded packages first
sorted_packages <- packages |>
  arrange(desc(downloads))

# An NA flag is recorded as FALSE for that source
sorted_packages$CRAN[is.na(sorted_packages$CRAN)] <- FALSE
sorted_packages$gallery[is.na(sorted_packages$gallery)] <- FALSE

head(sorted_packages)

# A tibble: 6 × 6
  package  downloads stars gallery description                             CRAN 
  <chr>        <dbl> <dbl> <lgl>   <chr>                                   <lgl>
1 ggplot2  165853926    NA FALSE   Create Elegant Data Visualisations Usi… TRUE 
2 ggrepel   24613605  1228 TRUE    Automatically Position Non-Overlapping… TRUE 
3 cowplot   19405325   712 TRUE    Streamlined Plot Theme and Plot Annota… TRUE 
4 ggpubr    17120944  1154 TRUE    'ggplot2' Based Publication Ready Plots TRUE 
5 ggsci     12697535   676 TRUE    Scientific Journal and Sci-Fi Themed C… TRUE 
6 ggsignif  12619171   597 TRUE    Significance Brackets for 'ggplot2'     TRUE

# First line of each source file, kept as that file's timestamp
gallery_ts <- readLines(con = "generated_data/gallery_packages.csv", n = 1)
cran_ts <- readLines(con = "generated_data/cran_packages.csv", n = 1)

# The merged, sorted table rendered as csv text
data_lines <- sorted_packages |> format_csv()

# Both timestamp lines go ahead of the csv text
c(gallery_ts, cran_ts, data_lines) |>
  write_lines("generated_data/all_packages.csv")