From 8e87fc3a4ed1f3052ba83c16698971ce0b6d6a68 Mon Sep 17 00:00:00 2001 From: tyluRp Date: Tue, 3 Jul 2018 17:21:12 -0700 Subject: [PATCH 1/3] scrape.R modified --- .gitignore | 3 ++- fivethirtyeight.Rproj | 13 ++++++++++ police-deaths/scrape.R | 56 ++++++++++++++++-------------------------- 3 files changed, 36 insertions(+), 36 deletions(-) create mode 100644 fivethirtyeight.Rproj diff --git a/.gitignore b/.gitignore index acf5df0..4f6a9e5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .DS_Store aws.json -.Rhistory \ No newline at end of file +.Rhistory +.Rproj.user diff --git a/fivethirtyeight.Rproj b/fivethirtyeight.Rproj new file mode 100644 index 0000000..8e3c2eb --- /dev/null +++ b/fivethirtyeight.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX diff --git a/police-deaths/scrape.R b/police-deaths/scrape.R index 751361e..872b3c1 100644 --- a/police-deaths/scrape.R +++ b/police-deaths/scrape.R @@ -1,41 +1,27 @@ +#------------------------------------------------------------------------------- # Dallas shooting scraping +#------------------------------------------------------------------------------- -library(dplyr) +library(tidyverse) library(rvest) -library(readr) -library(tidyr) -library(lubridate) -library(stringr) -library(ggplot2) - -yearly_url <- 'https://www.odmp.org/search/year/' - -years <- seq(1791, 2016) - -all_data <- data.frame() # Scrape data +df <- paste0("https://www.odmp.org/search/year/", seq(1791, 2016)) %>% + as_tibble() %>% + set_names("url") %>% + mutate( + data = map(url, read_html), + nodes = map(data, html_nodes, '[class="officer-short-details"]'), + text = map(nodes, html_text), + clean_text = map(text, str_trim), + clean_text = map(clean_text, str_replace_all, "\n", " separator"), + clean_data = map(clean_text, as.data.frame), + clean_data = map(clean_data, set_names, "string"), + clean_data = map(clean_data, separate, string, c("person", "dept", "eow", "cause"), "separator") + ) %>% + select(clean_data) %>% + unnest() %>% + mutate_all(str_squish) -for (year in years){ - - new_url <- paste0(yearly_url, year) - - selector_yearly <- 'p , p a' - - raw_data <- read_html(new_url) %>% - html_nodes(selector_yearly) %>% - html_text() %>% - as.data.frame() - - names(raw_data) <- c("incident") - - clean_data <- raw_data %>% - separate(incident, sep = '\n\t\t\t\t\t\t\t\t\t', - into = c("person", "dept","eow", "cause")) %>% - filter(!is.na(dept)) - - all_data <- rbind(all_data, clean_data) - -} - -write_csv(all_data, "all_data_fallen_officers.csv") +# Write to CSV +write_csv(df, "all_data_fallen_officers.csv") From f0ec1707622eecdfc55b61f4df377b150d382229 Mon Sep 17 00:00:00 2001 From: Tyler Littlefield <35909636+tyluRp@users.noreply.github.com> Date: Tue, 3 Jul 2018 18:47:25 -0700 Subject: [PATCH 2/3] remove rproj --- fivethirtyeight.Rproj | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 fivethirtyeight.Rproj diff --git a/fivethirtyeight.Rproj b/fivethirtyeight.Rproj deleted file mode 100644 index 8e3c2eb..0000000 --- a/fivethirtyeight.Rproj +++ /dev/null @@ -1,13 +0,0 @@ -Version: 1.0 - -RestoreWorkspace: Default -SaveWorkspace: Default -AlwaysSaveHistory: Default - -EnableCodeIndexing: Yes -UseSpacesForTab: Yes -NumSpacesForTab: 2 -Encoding: UTF-8 - -RnwWeave: Sweave -LaTeX: pdfLaTeX From 3fd73701c7c8a29272626612feb78d3b9308c574 Mon Sep 17 00:00:00 2001 From: Tyler Littlefield <35909636+tyluRp@users.noreply.github.com> Date: Tue, 3 Jul 2018 18:48:41 -0700 Subject: [PATCH 3/3] revert .gitignore to original --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index 4f6a9e5..85783f3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ .DS_Store aws.json .Rhistory -.Rproj.user