url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv'
covid = read_csv(url) 

Question 1

set_up = covid %>%
  filter(state == "California") %>% 
  group_by(county) %>% 
  mutate(NewCases = cases - lag(cases)) %>% 
  mutate(Cumul_Cases = sum(cases)) %>% 
  ungroup() %>% 
  filter(date == max(date))

Table representing the five counties in CA with the most new cases at the most current day

(most_new_cases = set_up %>% 
  slice_max(NewCases, n = 5) %>% 
  select(county, NewCases))
## # A tibble: 5 x 2
##   county         NewCases
##   <chr>             <dbl>
## 1 Los Angeles         809
## 2 San Diego           265
## 3 Orange              185
## 4 Fresno              159
## 5 San Bernardino      156
knitr::kable(most_new_cases,
             caption = "Most New Cases: California Counties",
             col.names = c("County", "New Cases"),
             format.args = list(big.mark = ",")) %>% 
  kableExtra:: kable_styling("basic", full_width =  TRUE, font_size = NULL)
Most New Cases: California Counties
County New Cases
Los Angeles 809
San Diego 265
Orange 185
Fresno 159
San Bernardino 156

Table representing the five counties in CA with the most cumulative cases at the most current day

(most_cumul_cases = set_up %>% 
    slice_max(Cumul_Cases, n = 5) %>% 
    select(county, Cumul_Cases))
## # A tibble: 5 x 2
##   county         Cumul_Cases
##   <chr>                <dbl>
## 1 Los Angeles       18408002
## 2 Riverside          3611739
## 3 Orange             3293539
## 4 San Bernardino     2972488
## 5 San Diego          2832227
knitr::kable(most_cumul_cases,
             caption = "Most Cumulative Cases: California Counties",
             col.names = c("County", "Cumulative Cases"),
             format.args = list(big.mark = ",")) %>% 
  kableExtra:: kable_styling("basic", full_width =  TRUE, font_size = NULL)
Most Cumulative Cases: California Counties
County Cumulative Cases
Los Angeles 18,408,002
Riverside 3,611,739
Orange 3,293,539
San Bernardino 2,972,488
San Diego 2,832,227

Adding population information

PopulationEstimates <- read_excel("../data/PopulationEstimates.xls", skip = 2) %>% 
  dplyr::select(fips = FIPStxt, state = State, county= Area_Name, pop_2019 = POP_ESTIMATE_2019)
pop = PopulationEstimates %>% 
  dplyr::right_join(covid, by = "fips") %>% 
  dplyr::filter(state.y == "California") %>% 
  dplyr::group_by(county.y) %>% 
  mutate(cumul_cases_per_cap = sum(cases) / pop_2019) %>%
  mutate(new_cases_per_cap = ((cases - lag(cases)) /pop_2019)) %>% 
  filter(date == max(date)) %>% 
  ungroup()

Table representing the five counties in CA with the most new cases per capita at the most current day

(most_new_cases_per_cap = pop %>% 
    slice_max(new_cases_per_cap, n = 5) %>% 
    select(county.y, new_cases_per_cap))
## # A tibble: 5 x 2
##   county.y   new_cases_per_cap
##   <chr>                  <dbl>
## 1 Kings               0.000262
## 2 San Benito          0.000239
## 3 Monterey            0.000203
## 4 Lake                0.000171
## 5 Fresno              0.000159
knitr::kable(most_new_cases_per_cap,
             caption = "Most New Cases Per Capita: California Counties",
             col.names = c("County", "New Cases Per Capita"),
             format.args = list(big.mark = ",")) %>% 
  kableExtra:: kable_styling("basic", full_width =  TRUE, font_size = NULL)
Most New Cases Per Capita: California Counties
County New Cases Per Capita
Kings 0.0002615
San Benito 0.0002388
Monterey 0.0002027
Lake 0.0001708
Fresno 0.0001591

Table representing the five counties in CA with the most cumulative cases per capita at the most current day

(most_cumul_per_cap = pop %>% 
    slice_max(cumul_cases_per_cap, n = 5) %>% 
    select(county.y, cumul_cases_per_cap))
## # A tibble: 5 x 2
##   county.y    cumul_cases_per_cap
##   <chr>                     <dbl>
## 1 Imperial                   4.80
## 2 Kings                      2.77
## 3 Tulare                     1.95
## 4 Kern                       1.84
## 5 Los Angeles                1.83
knitr::kable(most_cumul_per_cap,
             caption = "Most Cumulative Cases Per Capita: California Counties",
             col.names = c("County", "Cumulative Cases Per Capita"),
             format.args = list(big.mark = ",")) %>% 
  kableExtra:: kable_styling("basic", full_width =  TRUE, font_size = NULL)
Most Cumulative Cases Per Capita: California Counties
County Cumulative Cases Per Capita
Imperial 4.795111
Kings 2.772264
Tulare 1.949337
Kern 1.844563
Los Angeles 1.833629

Total number of new cases in the last 14 days per 100,000 people

last_14_days = covid %>% 
  right_join(PopulationEstimates, by = "fips") %>% 
  filter(date >= max(date, na.rm = TRUE) - 13, state.x == "California") %>% 
  group_by(county.x) %>% 
  summarize(new_cases_per_cap = sum(cases - lag(cases), na.rm = TRUE) / (max(pop_2019) / 100000)) %>% 
  filter(new_cases_per_cap <= 100) %>% 
  ungroup()

As of 8/17/2020, there are a total of 14 safe counties in California. This means they comply with the California Department of Public Health’s criteria of having less than 100 new cases per 100,000 residents over the past 14 days.

Question 2

The number of daily new cases and the 7-day rolling mean of the California, New York, Florida, and Lousiana

four_states = covid %>% 
  filter (state %in% c("California", "Louisiana", "New York", "Florida")) %>% 
  group_by(state, date) %>%
  summarize(cases = sum(cases)) %>% 
  ungroup() %>% 
  group_by(state) %>% 
  mutate(daily_new_cases= cases-lag(cases), 
         seven_day_roll= zoo::rollmean(daily_new_cases, 7, fill= NA, align= 'right')) %>% 
  ungroup() %>% 
  filter(daily_new_cases > 0)

  ggplot(four_states, aes( x = date)) +
  geom_col(aes(y = daily_new_cases, col= state)) +
  geom_line(aes(y = seven_day_roll, col = "darkred", size = 1)) +
  facet_wrap(~state, scale = "free_y") +
  labs(title = "Daily New Covid Cases at the State Level", x= "", y= "", color = "", caption = "Lab 02", subtitle = "COVID-19 Data: NY Times" ) +
  theme_light() +
  theme(legend.position = 'NA')

To get the number of daily new cases per capita and the 7-day rolling mean of the California, New York, Florida, and Lousiana.

cap_state= PopulationEstimates %>% 
  select(state, pop_2019) %>% 
  right_join(four_states, by = "state") %>% 
  mutate(state_per_cap = ((cases - lag(cases))/ pop_2019), 
        seven_day_roll= zoo::rollmean(daily_new_cases, 7, fill= NA, align= 'right'))
  

ggplot(cap_state, aes(x=date)) +
  geom_col(aes(y= state_per_cap), col= NA, fill= "#F5B8B5") +
  geom_line(aes(y= seven_day_roll), col= "darkred", size= 1) +
  theme_light() +
  labs(title = "States' New Covid Cases Per Capita", x= "", y= "",  caption = "Lab 02", subtitle = "COVID-19 Data: NY Times" ) +
  theme(legend.position = 'NA')

The number of daily, new cases at the state level can be misleading since it doesn’t take into consideration the state’s population. Once population is integrated into the number of cases, it is expect for data to reflect a more accurate representation of which out of the four states (CA, LA, NY, FL) has been affected more by COVID.