5 NYC Restaurant Inspection Analysis

Charlene Shen

5.1 Introduction

5.2 Data

5.2.1 Description

library(readr)
library(dplyr)
library(ggplot2)
library(patchwork)
library(tidyverse)

# Read the sampled NYC restaurant inspection records. Column-type messages
# are silenced so they do not clutter the rendered chapter.
df <- read_csv(
  file = "data/nyc_restaurant_sample.csv",
  show_col_types = FALSE
)

# Preview the first six rows to show which columns are available.
head(df, n = 6L)

# A tibble: 6 × 27
     CAMIS DBA         BORO  BUILDING STREET ZIPCODE PHONE `CUISINE DESCRIPTION`
     <dbl> <chr>       <chr> <chr>    <chr>    <dbl> <chr> <chr>                
1 50164459 The Row Ha… Manh… 2270     ADAM …      NA 6468… <NA>                 
2 50168743 QINGYUN INC Manh… 367      WEST …   10018 2014… <NA>                 
3 50180582 GRUHAM FOO… Broo… 100      WILLO…   11201 9087… <NA>                 
4 50178349 GOOD BAKLA… Manh… 1        HERAL…   10001 6199… <NA>                 
5 50169251 BARTOLO BA… Manh… 714      11 AV…   10019 9176… <NA>                 
6 50130631 Yong Sheng… Manh… 2825     FRDRK…      NA 2128… Chinese              
# ℹ 19 more variables: `INSPECTION DATE` <chr>, ACTION <chr>,
#   `VIOLATION CODE` <chr>, `VIOLATION DESCRIPTION` <chr>,
#   `CRITICAL FLAG` <chr>, SCORE <dbl>, GRADE <chr>, `GRADE DATE` <chr>,
#   `RECORD DATE` <chr>, `INSPECTION TYPE` <chr>, Latitude <dbl>,
#   Longitude <dbl>, `Community Board` <dbl>, `Council District` <chr>,
#   `Census Tract` <chr>, BIN <dbl>, BBL <dbl>, NTA <chr>, Location <chr>

5.2.2 Missing value analysis

5.2.2.1 Missing Values by Variable

Analyses involving grades or scores should be interpreted with caution, as missing data may introduce bias into the results.

# Count the NA entries in each column of `df` (one row per variable).
missing_count <- data.frame(
  variable = colnames(df),
  missing = colSums(is.na(df))
)

# Horizontal bar chart; variables are ordered by their missing count.
ggplot(
  data = missing_count,
  mapping = aes(x = reorder(variable, missing), y = missing)
) +
  geom_col() +
  ggtitle("Missing Values by Variable") +
  xlab("Variable") +
  ylab("Number of Missing Values") +
  coord_flip() +
  theme_minimal()

5.2.2.2 Missing Grade by Inspection Type

Missing values in the GRADE variable do not appear to be random; instead, they depend on the type of inspection. For example, administrative and compliance inspections almost always have a missing grade, which likely means that no grade is assigned in those cases. Initial and cycle inspections, on the other hand, are more likely to have a recorded grade. This suggests that whether a grade appears in the data depends more on the inspection process itself than on data-quality issues.

# Share of records with a missing GRADE within each inspection type.
# position = "fill" rescales every bar to a total of 1, so the value axis
# is a proportion rather than a raw count.
df |>
  mutate(grade_missing = is.na(GRADE)) |>
  count(`INSPECTION TYPE`, grade_missing) |>
  ggplot(aes(x = `INSPECTION TYPE`, y = n, fill = grade_missing)) +
  geom_col(position = "fill") +
  coord_flip() +
  labs(
    title = "Missing Grade by Inspection Type",
    x = "Inspection Type",
    y = "Proportion of Records",
    fill = "Grade Missing"
  ) +
  theme_minimal()

5.3 Results

5.3.1 Question 1: Are re-inspections associated with worse scores?

# Compare SCORE between re-inspections and every other inspection type.
# Records without a SCORE are dropped explicitly (as in the cuisine
# analysis) instead of being removed by ggplot with a warning.
df |>
  filter(!is.na(SCORE)) |>
  # `%in%` returns FALSE (never NA) for a missing inspection type, so such
  # records are counted in the FALSE group.
  mutate(reinspect = `INSPECTION TYPE` %in% c(
    "Cycle / Re-inspection",
    "Pre-permit (Operational) / Re-inspection",
    "Pre-permit (Non-operational) / Re-inspection"
  )) |>
  ggplot(aes(x = reinspect, y = SCORE)) +
  geom_boxplot() +
  labs(
    title = "Scores by Inspection Type",
    x = "Re-inspection",
    y = "Score"
  ) +
  theme_minimal()

5.3.2 Question 2: Which cuisine types have the worst inspection scores?

# Mean SCORE per cuisine, restricted to cuisines with at least 20 scored
# records; the (up to) 50 highest averages are kept, highest first.
cuisine_score <- df |>
  filter(!is.na(SCORE)) |>
  filter(!is.na(`CUISINE DESCRIPTION`)) |>
  group_by(`CUISINE DESCRIPTION`) |>
  summarise(avg_score = mean(SCORE), count = n()) |>
  filter(count >= 20) |>
  arrange(desc(avg_score)) |>
  slice_head(n = 50)

# Bar chart of the average score per cuisine, with bars ordered by that
# average.
cuisine_score |>
  mutate(cuisine = reorder(`CUISINE DESCRIPTION`, avg_score)) |>
  ggplot(aes(x = avg_score, y = cuisine)) +
  geom_col(fill = "pink") +
  ggtitle("Cuisine Types with Worst Inspection Scores") +
  xlab("Average Inspection Score") +
  ylab("Cuisine Type") +
  theme_minimal()

# Scored records with a known cuisine, limited to cuisines that have at
# least 20 such records.
cuisine_score_box <- df |>
  filter(!(is.na(SCORE) | is.na(`CUISINE DESCRIPTION`))) |>
  group_by(`CUISINE DESCRIPTION`) |>
  filter(n() >= 20) |>
  ungroup()

# The 12 cuisines with the most remaining records.
top_cuisine <- cuisine_score_box |>
  count(`CUISINE DESCRIPTION`, sort = TRUE) |>
  slice_head(n = 12)

# Keep only the records belonging to those cuisines.
cuisine_score_box <- cuisine_score_box |>
  semi_join(top_cuisine, by = "CUISINE DESCRIPTION")

# Score distribution per cuisine. Boxes are ordered by the median SCORE so
# the ordering matches the centre line each box displays; the default of
# reorder() is the mean, which is pulled by the outliers hidden below.
ggplot(
  cuisine_score_box,
  aes(x = SCORE, y = reorder(`CUISINE DESCRIPTION`, SCORE, FUN = median))
) +
  # Outlier points are not drawn.
  geom_boxplot(outlier.shape = NA) +
  # Zoom the score axis to 0-80 without discarding any observations.
  coord_cartesian(xlim = c(0, 80)) +
  labs(
    title = "Inspection Score Distribution by Cuisine Type",
    x = "Inspection Score",
    y = "Cuisine Type"
  ) +
  theme_minimal()

5.3.3 Question 3: Are restaurant inspection grades geographically clustered?

# Records graded A, B or C that have usable coordinates.
# NOTE(review): coordinates equal to 0 are presumably placeholders for
# records that were never geocoded - confirm against the data dictionary.
location_grade <- df |>
  filter(GRADE %in% c("A", "B", "C")) |>
  filter(!is.na(Longitude) & Longitude != 0) |>
  filter(!is.na(Latitude) & Latitude != 0)

# Scatter of restaurant coordinates coloured by grade.
ggplot(location_grade, aes(x = Longitude, y = Latitude, color = GRADE)) +
  # Small, semi-transparent points reduce overplotting.
  geom_point(alpha = 0.5, size = 1) +
  # Fix the aspect ratio for longitude/latitude so the map is not stretched
  # to the plotting window.
  coord_quickmap() +
  labs(
    title = "Restaurant Locations by Inspection Grade",
    x = "Longitude",
    y = "Latitude",
    color = "Grade"
  ) +
  theme_minimal()

5.1 Introduction

5.2 Data

5.2.1 Description

5.2.2 Missing value analysis

5.2.2.1 Missing Values by Variable

5.2.2.2 Missing Grade by Inspection Type

5.3 Results

5.3.1 Question 1: Are re-inspections associated with worse scores?

5.3.2 Question 2: Which cuisine types have the worst inspection scores?

5.3.3 Question 3: Are restaurant inspection grades geographically clustered?

5.4 Conclusion