add code and data for librarians post

2014-04-11 17:20:02 -04:00
parent 3e717082fd
commit 26d77edecb
3 changed files with 51 additions and 1 deletions
--- a/README.md
+++ b/README.md
@@ -3,4 +3,4 @@ Article Date | Headline | File or folder
 March 17, 2014 | [FiveThirtyEight’s NCAA Tournament Predictions](http://fivethirtyeight.com/interactives/march-madness-predictions) | `march-madness-predictions`
 March 27, 2014 | [The NCAA Bracket: Checking Our Work](http://fivethirtyeight.com/datalab/the-ncaa-bracket-checking-our-work) | `historical-538-ncaa-tournament-model-results.csv`
 April 1, 2014 | [The Dollar-And-Cents Case Against Hollywood’s Exclusion of Women](http://fivethirtyeight.com/features/the-dollar-and-cents-case-against-hollywoods-exclusion-of-women) | `bechdel`
-
+April 11, 2014 | [Where Are America’s Librarians?](http://fivethirtyeight.com/datalab/where-are-americas-librarians) | `librarians`
--- a/librarians/librarians-by-msa.csv
+++ b/librarians/librarians-by-msa.csv
--- a/librarians/librarians.R
+++ b/librarians/librarians.R
@@ -0,0 +1,49 @@
+# Re-estimates the percent standard error of specific occupational employment statistics in
+# metropolitan statistical areas (MSAs), using BLS data (http://www.bls.gov/oes/tables.htm),
+# and then calculates a margin of error (upper and lower bounds) for those MSAs
+
+# By Andrew Flowers <andrew.flowers@fivethirtyeight.com>
+# See also http://fivethirtyeight.com/datalab/where-are-americas-librarians/
+
+# install.packages("ggplot2") # "stats" ships with R itself and does not need to be installed
+library(ggplot2)
+library(stats)
+
+# Load and clean data: May 2013 data from BLS (http://www.bls.gov/oes/tables.htm)
+libMSA <- read.csv("librarians-by-msa.csv", header = TRUE) # TRUE, not T: T is an ordinary variable that can be reassigned
+names(libMSA) <- tolower(names(libMSA)) # lower-case the column names so they can be referenced as tot_emp / emp_prse
+libMSA$tot_emp <- as.numeric(gsub("[$]|,", "", libMSA$tot_emp)) # strip "$" and "," before converting to numeric
+libMSA$emp_prse <- as.numeric(gsub("[$]|,", "", libMSA$emp_prse)) # anything still non-numeric becomes NA (with a warning)
+
+# Exploratory plot: emp_prse against tot_emp, one point per row of libMSA
+plot(libMSA$tot_emp, libMSA$emp_prse, main = "Librarian Employment vs. Standard Error", xlab = "Librarian Employment", ylab = "Standard Error")
+
+# Linear model, specified with data= (as the nls() fit does) so coefficients are named by column and predict() works
+l.model <- lm(emp_prse ~ tot_emp, data = libMSA)
+abline(l.model, col = "red") # overlay the fitted straight line on the scatter plot above
+summary(l.model)   ### Linear model is bad
+
+# Non-linear model: power law emp_prse = a * tot_emp^b, fitted by nls() starting from a = 1, b = 1
+nl.model <- nls(formula = emp_prse ~ a * tot_emp^b, start = list(a = 1, b = 1), data = libMSA)
+summary(nl.model)
+a <- coef(nl.model)[["a"]]; b <- coef(nl.model)[["b"]] # extract by name rather than position; [[ ]] yields unnamed scalars
+
+plot(libMSA$tot_emp, libMSA$emp_prse, main = "Librarian Employment vs. Standard Error", xlab = "Librarian Employment", ylab = "Standard Error")
+curve(a * x^b, col = "red", add = TRUE) # draw the fitted power curve on the existing plot; TRUE rather than the reassignable T
+
+# ggplot2 non-linear model; ggplot2 >= 2.0.0 only forwards fitting arguments (here nls() start values) given via method.args
+g <- ggplot(libMSA, aes(x = tot_emp, y = emp_prse))
+g <- g + stat_smooth(method = "nls", formula = y ~ a * x^b, se = FALSE, method.args = list(start = list(a = 1, b = 1))) + geom_point()
+g + ggtitle("Librarian Employment vs Standard Error") + ylab("Standard Error") + xlab("Librarian Employment")
+
+# Derive upper and lower employment estimates from the margin of error implied by the fitted curve
+libMSA[["mor"]] <- a * libMSA[["tot_emp"]]^b * 1.96 # fitted a * tot_emp^b scaled by 1.96; divided by 100 below, i.e. treated as a percentage
+libMSA[["high_emp"]] <- with(libMSA, tot_emp * (1 + mor / 100))
+libMSA[["low_emp"]] <- with(libMSA, tot_emp * (1 - mor / 100))
+
+write.csv(x = libMSA, file = "new-librarians-by-msa.csv") # path is relative to the current working directory
+
+
+
+
+