library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.4     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(knitr)
library(readxl)
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(ggplot2)
library(ggthemes)
library(sf)
## Linking to GEOS 3.9.0, GDAL 3.2.1, PROJ 7.2.1

Data

url =  'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv'
covid = read_csv(url)
## 
## -- Column specification --------------------------------------------------------
## cols(
##   date = col_date(format = ""),
##   county = col_character(),
##   state = col_character(),
##   fips = col_character(),
##   cases = col_double(),
##   deaths = col_double()
## )
landDate = read.csv("../data/landdata-states.csv")
PopulationEstimates = read_excel("../data/PopulationEstimates.xls", skip=2)

Question 1

state.of.interest = "California"

# County Level
covid %>%
  filter(state == state.of.interest, date == max(date)) %>%
  slice_max(cases, n=5) %>% 
knitr::kable(caption = "Counties with the Most COVID Cases")
Counties with the Most COVID Cases
date county state fips cases deaths
2021-07-29 Los Angeles California 06037 1294135 24675
2021-07-29 Riverside California 06065 315080 4659
2021-07-29 San Diego California 06073 311011 3798
2021-07-29 San Bernardino California 06071 309779 5236
2021-07-29 Orange California 06059 281790 5143
covid %>% 
  filter(state == state.of.interest) %>% 
  group_by(county) %>% 
  mutate(newCases = cases - lag(cases)) %>% 
  ungroup() %>%
  filter(date == max(date)) %>% 
  slice_max(newCases, n=5) %>% 
knitr::kable(caption = "Top 5 Counties with the Most COVID Cases")
Top 5 Counties with the Most COVID Cases
date county state fips cases deaths newCases
2021-07-29 Los Angeles California 06037 1294135 24675 3224
2021-07-29 San Diego California 06073 311011 3798 768
2021-07-29 San Bernardino California 06071 309779 5236 616
2021-07-29 Riverside California 06065 315080 4659 594
2021-07-29 Orange California 06059 281790 5143 562
pop = PopulationEstimates %>% 
  select("Area_Name", population = "POP_ESTIMATE_2019", fips = "FIPStxt")

covidpop = inner_join(covid, pop, by = "fips")

covidpop %>% 
  filter(state == state.of.interest, date == max(date)) %>% 
  mutate(perCapita= cases/population) %>% 
  slice_max(perCapita, n = 5) %>% 
knitr::kable(caption = "Top 5 Cases Per Capita")
Top 5 Cases Per Capita
date county state fips cases deaths Area_Name population perCapita
2021-07-29 Lassen California 06035 5876 26 Lassen County 30573 0.1921957
2021-07-29 Imperial California 06025 32040 742 Imperial County 181215 0.1768066
2021-07-29 Kings California 06031 24773 250 Kings County 152940 0.1619786
2021-07-29 San Bernardino California 06071 309779 5236 San Bernardino County 2180085 0.1420949
2021-07-29 Tulare California 06107 60441 854 Tulare County 466195 0.1296475
covidpop %>% 
  filter(state == state.of.interest) %>% 
  group_by(county, state) %>% 
  filter(date > (max(date)-14)) %>% 
  summarise(min_cases = min(cases), max_cases = max(cases), population = mean(population)) %>% 
  mutate(diff = max_cases-min_cases) %>% 
  ungroup() %>% 
  mutate(NewperCap = diff/population) %>% 
  slice_max(NewperCap, n = 5) %>% 
knitr::kable(caption = "Top 5 New Cases Per Capita")
## `summarise()` has grouped output by 'county'. You can override using the `.groups` argument.
Top 5 New Cases Per Capita
county state min_cases max_cases population diff NewperCap
Lake California 3774 4204 64386 430 0.0066785
Del Norte California 1522 1643 27812 121 0.0043506
Yuba California 6691 6985 78668 294 0.0037372
Tuolumne California 4322 4507 54478 185 0.0033959
Sacramento California 112421 117054 1552058 4633 0.0029851

Question 2

states.of.interest <- c("New York", "California", "Louisiana", "Florida")

# State level
StateLevel <- covid %>% 
  filter(state %in% states.of.interest) %>% 
  group_by(state, date) %>% 
  summarise(cases = sum(cases)) %>% 
  mutate(newCases = cases - lag(cases),
         roll7 = rollmean(newCases, 7, fill = NA, align="right")) %>% 
  ungroup()
## `summarise()` has grouped output by 'state'. You can override using the `.groups` argument.
ggplot(data = StateLevel, aes(x = date, y = roll7, group = state)) +
  geom_line(aes(col=state)) +
  labs( x = "Date", 
        y = "Daily New Cases", 
        title = "COVID-19 Cases in Ca, Ny, La, Fl",
        subtitle = "7 Day Rolling Mean",
        caption = "Source: New York Times") +
  facet_grid(~state) +
  theme_light() +
  theme(plot.background = element_rect(fill = "white"),
          panel.background = element_rect(fill = "white"),
          plot.title = element_text(size = 14, face = 'bold')) +
  theme(aspect.ratio = 0.75) +
  theme(axis.text.x = element_text(angle = 90))
## Warning: Removed 28 row(s) containing missing values (geom_path).

stateCovid = inner_join(pop, covid, by = "fips")

stateCovid %>% 
  filter(state %in% c("California", "Florida", "Louisiana", "New York")) %>% 
  group_by(state, date) %>% 
  summarise(cases = sum(cases)) %>% 
  mutate(NewCases = cases - lag(cases),
         roll7 = rollmean(NewCases, 7, fill = NA, align="right")) %>% 
  ungroup()
## `summarise()` has grouped output by 'state'. You can override using the `.groups` argument.
## # A tibble: 2,089 x 5
##    state      date       cases NewCases  roll7
##    <chr>      <date>     <dbl>    <dbl>  <dbl>
##  1 California 2020-01-25     1       NA NA    
##  2 California 2020-01-26     2        1 NA    
##  3 California 2020-01-27     2        0 NA    
##  4 California 2020-01-28     2        0 NA    
##  5 California 2020-01-29     2        0 NA    
##  6 California 2020-01-30     2        0 NA    
##  7 California 2020-01-31     3        1 NA    
##  8 California 2020-02-01     3        0  0.286
##  9 California 2020-02-02     6        3  0.571
## 10 California 2020-02-03     6        0  0.571
## # ... with 2,079 more rows

Question 3 (extra credit)

#counties = USAboundaries::us_counties() %>% 
 # select("fips" = geoid, name, state_name) %>% 
  #st_centroid() %>% 
  #mutate(LON = st_coordinates(.)[,1], 
   #      LAT = st_coordinates(.)[,2]) %>% 
#  st_drop_geometry()