library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.4 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(knitr)
library(readxl)
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(ggplot2)
library(ggthemes)
library(sf)
## Linking to GEOS 3.9.0, GDAL 3.2.1, PROJ 7.2.1
Data
url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv'
covid = read_csv(url)
##
## -- Column specification --------------------------------------------------------
## cols(
## date = col_date(format = ""),
## county = col_character(),
## state = col_character(),
## fips = col_character(),
## cases = col_double(),
## deaths = col_double()
## )
landDate = read.csv("../data/landdata-states.csv")
PopulationEstimates = read_excel("../data/PopulationEstimates.xls", skip=2)
Question 1
state.of.interest = "California"
# County Level
covid %>%
filter(state == state.of.interest, date == max(date)) %>%
slice_max(cases, n=5) %>%
knitr::kable(caption = "Counties with the Most COVID Cases")
Counties with the Most COVID Cases
| 2021-07-29 |
Los Angeles |
California |
06037 |
1294135 |
24675 |
| 2021-07-29 |
Riverside |
California |
06065 |
315080 |
4659 |
| 2021-07-29 |
San Diego |
California |
06073 |
311011 |
3798 |
| 2021-07-29 |
San Bernardino |
California |
06071 |
309779 |
5236 |
| 2021-07-29 |
Orange |
California |
06059 |
281790 |
5143 |
covid %>%
filter(state == state.of.interest) %>%
group_by(county) %>%
mutate(newCases = cases - lag(cases)) %>%
ungroup() %>%
filter(date == max(date)) %>%
slice_max(newCases, n=5) %>%
knitr::kable(caption = "Top 5 Counties with the Most COVID Cases")
Top 5 Counties with the Most COVID Cases
| 2021-07-29 |
Los Angeles |
California |
06037 |
1294135 |
24675 |
3224 |
| 2021-07-29 |
San Diego |
California |
06073 |
311011 |
3798 |
768 |
| 2021-07-29 |
San Bernardino |
California |
06071 |
309779 |
5236 |
616 |
| 2021-07-29 |
Riverside |
California |
06065 |
315080 |
4659 |
594 |
| 2021-07-29 |
Orange |
California |
06059 |
281790 |
5143 |
562 |
pop = PopulationEstimates %>%
select("Area_Name", population = "POP_ESTIMATE_2019", fips = "FIPStxt")
covidpop = inner_join(covid, pop, by = "fips")
covidpop %>%
filter(state == state.of.interest, date == max(date)) %>%
mutate(perCapita= cases/population) %>%
slice_max(perCapita, n = 5) %>%
knitr::kable(caption = "Top 5 Cases Per Capita")
Top 5 Cases Per Capita
| 2021-07-29 |
Lassen |
California |
06035 |
5876 |
26 |
Lassen County |
30573 |
0.1921957 |
| 2021-07-29 |
Imperial |
California |
06025 |
32040 |
742 |
Imperial County |
181215 |
0.1768066 |
| 2021-07-29 |
Kings |
California |
06031 |
24773 |
250 |
Kings County |
152940 |
0.1619786 |
| 2021-07-29 |
San Bernardino |
California |
06071 |
309779 |
5236 |
San Bernardino County |
2180085 |
0.1420949 |
| 2021-07-29 |
Tulare |
California |
06107 |
60441 |
854 |
Tulare County |
466195 |
0.1296475 |
covidpop %>%
filter(state == state.of.interest) %>%
group_by(county, state) %>%
filter(date > (max(date)-14)) %>%
summarise(min_cases = min(cases), max_cases = max(cases), population = mean(population)) %>%
mutate(diff = max_cases-min_cases) %>%
ungroup() %>%
mutate(NewperCap = diff/population) %>%
slice_max(NewperCap, n = 5) %>%
knitr::kable(caption = "Top 5 New Cases Per Capita")
## `summarise()` has grouped output by 'county'. You can override using the `.groups` argument.
Top 5 New Cases Per Capita
| Lake |
California |
3774 |
4204 |
64386 |
430 |
0.0066785 |
| Del Norte |
California |
1522 |
1643 |
27812 |
121 |
0.0043506 |
| Yuba |
California |
6691 |
6985 |
78668 |
294 |
0.0037372 |
| Tuolumne |
California |
4322 |
4507 |
54478 |
185 |
0.0033959 |
| Sacramento |
California |
112421 |
117054 |
1552058 |
4633 |
0.0029851 |
Question 2
states.of.interest <- c("New York", "California", "Louisiana", "Florida")
# State level
StateLevel <- covid %>%
filter(state %in% states.of.interest) %>%
group_by(state, date) %>%
summarise(cases = sum(cases)) %>%
mutate(newCases = cases - lag(cases),
roll7 = rollmean(newCases, 7, fill = NA, align="right")) %>%
ungroup()
## `summarise()` has grouped output by 'state'. You can override using the `.groups` argument.
ggplot(data = StateLevel, aes(x = date, y = roll7, group = state)) +
geom_line(aes(col=state)) +
labs( x = "Date",
y = "Daily New Cases",
title = "COVID-19 Cases in Ca, Ny, La, Fl",
subtitle = "7 Day Rolling Mean",
caption = "Source: New York Times") +
facet_grid(~state) +
theme_light() +
theme(plot.background = element_rect(fill = "white"),
panel.background = element_rect(fill = "white"),
plot.title = element_text(size = 14, face = 'bold')) +
theme(aspect.ratio = 0.75) +
theme(axis.text.x = element_text(angle = 90))
## Warning: Removed 28 row(s) containing missing values (geom_path).

stateCovid = inner_join(pop, covid, by = "fips")
stateCovid %>%
filter(state %in% c("California", "Florida", "Louisiana", "New York")) %>%
group_by(state, date) %>%
summarise(cases = sum(cases)) %>%
mutate(NewCases = cases - lag(cases),
roll7 = rollmean(NewCases, 7, fill = NA, align="right")) %>%
ungroup()
## `summarise()` has grouped output by 'state'. You can override using the `.groups` argument.
## # A tibble: 2,089 x 5
## state date cases NewCases roll7
## <chr> <date> <dbl> <dbl> <dbl>
## 1 California 2020-01-25 1 NA NA
## 2 California 2020-01-26 2 1 NA
## 3 California 2020-01-27 2 0 NA
## 4 California 2020-01-28 2 0 NA
## 5 California 2020-01-29 2 0 NA
## 6 California 2020-01-30 2 0 NA
## 7 California 2020-01-31 3 1 NA
## 8 California 2020-02-01 3 0 0.286
## 9 California 2020-02-02 6 3 0.571
## 10 California 2020-02-03 6 0 0.571
## # ... with 2,079 more rows