Create college-majors-rscript.R

2025-12-19 17:37:43 -05:00 · 2014-09-11 17:24:13 -04:00
parent af6a5f1fa2
commit 613c858a34
1 changed files with 165 additions and 0 deletions
--- a/college-majors/college-majors-rscript.R
+++ b/college-majors/college-majors-rscript.R
@@ -0,0 +1,165 @@
+##############################################################
+#
+# COLLEGE MAJORS AND EARNINGS
+# This is the code used to generate data for FiveThirtyEight's
+# story on earnings by college major.
+# Analysis is based off the 2010-2012 American Community Survey
+# microdata. 
+# Download data here: http://www.census.gov/acs/www/data_documentation/pums_data/
+# Documentation here: http://www.census.gov/acs/www/data_documentation/pums_documentation/
+
+# First download data and select records for which college major 
+# (variable FOD1P) is present. Save into data frame as MAJORS1012.
+# Also download github_majorslist.csv
+
+require(dplyr)
+
+load("MAJORS1012")
+MajorsList <- read.csv("github_majorslist.csv",header=TRUE,stringsAsFactors=FALSE)
+
+working <- merge(MAJORS1012,MajorsList,by="FOD1P") # Check merge properly
+MAJORS1012  <- working
+rm(working)
+
+# Create dummy for full-time, year-round workers
+MAJORS1012$FTYR <- 0
+MAJORS1012$FTYR[MAJORS1012$WKW == 1 & MAJORS1012$WKHP >= 35] <- 1
+nrow(MAJORS1012[MAJORS1012$FTYR==1,])
+
+# If you want inflation-adjusted earnings, this is how to do it
+# MAJOR1012 <- mutate(MAJORS1012,RealEarn = PERNP * (ADJINC/1000000))
+
+# We want data on three groups
+# For ALL ages (no grad degree):
+#   - Number: Total, Employed, Unemployed, Employed FTYR
+#   - Median earnings (for FYFT)
+#   - 25th and 75th percentile earnings
+#   - Unemployment rate
+# For ages 25+
+#   - Median earnings for grad degree (FTYR)
+#   - Median earnings for non grad degree (FTYR)
+#   - Number with grad degree: Total, Employed, Unemployed, FTYR
+#   - Number without grad degree: Total, Employed, Unemployed, FTYR
+#   - Sample size
+# For YOUNG (up to 27):
+#   - Number: Total, Employed, Unemployed, Employed FTYR
+#   - Sample size
+#   - Median earnings (for FYFT)
+#   - 25% and 75% percentile earnings
+#   - Unemployment rate
+#   - Median hours worked
+#   - Share working full-time
+#   - Share employed in college/noncollege and low-end jobs
+
+# Several smaller data frames
+allage <- filter(MAJORS1012,SCHL == 21) # No grad degree earners
+allage.ftyr <- filter(MAJORS1012,SCHL == 21 & FTYR == 1 & PERNP >0)
+over25.nograd <- filter(MAJORS1012,AGEP > 24 & SCHL == 21)
+over25.grad <- filter(MAJORS1012,AGEP > 24 & SCHL > 21)
+young <- filter(MAJORS1012,AGEP < 28) # This DOES include students with grad degrees
+young.ftyr <- filter(young,FTYR == 1 & SCHL == 21 & SCH == 1) # But this doesn't. Also excludes those in school.
+
+# Create dummy for low-end job
+# Here's our list of low-end jobs
+# 4020 .EAT-COOKS
+# 4030 .EAT-FOOD PREPARATION WORKERS
+# 4040 .EAT-BARTENDERS
+# 4050 .EAT-COMBINED FOOD PREPARATION AND SERVING WORKERS,
+# .INCLUDING FAST FOOD
+# 4060 .EAT-COUNTER ATTENDANTS, CAFETERIA, FOOD CONCESSION, AND
+# .COFFEE SHOP
+# 4110 .EAT-WAITERS AND WAITRESSES
+# 4120 .EAT-FOOD SERVERS, NONRESTAURANT
+# 4130 .EAT-MISCELLANEOUS FOOD PREPARATION AND SERVING RELATED
+# .WORKERS, INCLUDING DINING ROOM AND CAFETERIA ATTENDANTS AND
+# .BARTENDER HELPERS
+# 4140 .EAT-DISHWASHERS
+# 4150 .EAT-HOSTS AND HOSTESSES, RESTAURANT, LOUNGE, AND COFFEE SHOP
+# 4220 .CLN-JANITORS AND BUILDING CLEANERS
+# 4230 .CLN-MAIDS AND HOUSEKEEPING CLEANERS
+# 4240 .CLN-PEST CONTROL WORKERS
+# 4250 .CLN-GROUNDS MAINTENANCE WORKERS
+# 4720 .SAL-CASHIERS
+# 4740 .SAL-COUNTER AND RENTAL CLERKS
+# 4750 .SAL-PARTS SALESPERSONS
+# 4760 .SAL-RETAIL SALESPERSONS
+# 4940 .SAL-TELEMARKETERS
+
+lowend <- c(4020,4030,4040,4050,4060,4110,4120,4130,4140,4150,4220,4230,4240,4250,4720,4740,4750,4760,4940)
+young$LowEnd  <-  0
+young$LowEnd[young$OCCP10 %in% as.list(lowend) | young$OCCP12 %in% as.list(lowend)] <- 1
+
+# This creates a dummy for college-level jobs. Based off non-public data from Jaison R. Abel and
+# Richard Deitz of the New York Fed. As a result, csv is not included here.
+# collegejobs <- read.csv("collegejobs.csv",header=TRUE,stringsAsFactors=FALSE)
+# young$collegejob[young$OCCP10 %in% as.list(collegejobs$OCC[collegejobs$College==1]) | 
+#                    young$OCCP12 %in% as.list(collegejobs$OCC[collegejobs$College==1])]  <- 1
+# young$noncollege[young$OCCP10 %in% as.list(collegejobs$OCC[collegejobs$College==0]) | 
+#                    young$OCCP12 %in% as.list(collegejobs$OCC[collegejobs$College==0])]  <- 1
+
+# Now we create three big dataframes for the analysis
+AllAges <- data.frame(FOD1P = as.numeric(MajorsList$FOD1P[2:nrow(MajorsList)]),Major = MajorsList$Major[2:nrow(MajorsList)],Major_Cat = MajorsList$Major_Category[2:nrow(MajorsList)],
+                      Total = NA, Employed = NA, FTYR = NA, Unemployed = NA, Urate = NA, Median = NA, P25th = NA, P75th = NA)
+Ages25up <- data.frame(FOD1P = as.numeric(MajorsList$FOD1P[2:nrow(MajorsList)]),Major = MajorsList$Major[2:nrow(MajorsList)],Major_Cat = MajorsList$Major_Category[2:nrow(MajorsList)],
+                       GradTotal = NA, GradEmployed = NA, GradFTYR = NA, GradUnemployed = NA, GradUrate = NA, GradMedian = NA, GradP25th = NA, GradP75th = NA,GradSample = NA,
+                       NoGradTotal = NA, NoGradEmployed = NA, NoGradFTYR = NA, NoGradUnemployed = NA, NoGradUrate = NA, NoGradMedian = NA, NoGradP25th = NA, NoGradP75th = NA)
+Young <- data.frame(FOD1P = as.numeric(MajorsList$FOD1P[2:nrow(MajorsList)]),Major = MajorsList$Major[2:nrow(MajorsList)],Major_Cat = MajorsList$Major_Category[2:nrow(MajorsList)],
+                    Total = NA, Employed = NA, FTYR = NA, Unemployed = NA, Urate = NA, Median = NA, P25th = NA, P75th = NA, Sample = NA,
+                    Median_hours = NA, FT = NA, PT = NA, LowEnd = NA, CollegeJob = NA, NonCollege = NA, Men = NA, Women = NA, ShareWomen = NA)
+
+# We'll do this in one big for loop
+loop <- seq(1:nrow(AllAges))
+for (i in loop) {
+  x <- AllAges$FOD1P[i]
+  AllAges[i,4] <- with(allage,sum(PWGTP[FOD1P==x]))
+  AllAges[i,5] <- with(allage,sum(PWGTP[FOD1P==x & (ESR == 1 | ESR == 2)]))
+  AllAges[i,6] <- with(allage,sum(PWGTP[FOD1P==x & FTYR == 1]))
+  AllAges[i,7] <- with(allage,sum(PWGTP[FOD1P==x & ESR == 3]))
+  AllAges[i,8] <- AllAges[i,7]/(AllAges[i,5] + AllAges[i,7])
+  AllAges[i,9] <- with(allage.ftyr[allage.ftyr$FOD1P == x,],median(rep(PERNP,times=PWGTP)))
+  AllAges[i,10] <- with(allage.ftyr[allage.ftyr$FOD1P == x,],quantile(rep(PERNP,times=PWGTP))[2])
+  AllAges[i,11] <- with(allage.ftyr[allage.ftyr$FOD1P == x,],quantile(rep(PERNP,times=PWGTP))[4])
+  
+  Ages25up[i,4] <- with(over25.grad,sum(PWGTP[FOD1P==x]))
+  Ages25up[i,5] <- with(over25.grad,sum(PWGTP[FOD1P==x & (ESR == 1 | ESR == 2)]))
+  Ages25up[i,6] <- with(over25.grad,sum(PWGTP[FOD1P==x & FTYR == 1]))
+  Ages25up[i,7] <- with(over25.grad,sum(PWGTP[FOD1P==x & ESR == 3]))
+  Ages25up[i,8] <- Ages25up[i,7]/(Ages25up[i,5] + Ages25up[i,7])
+  Ages25up[i,9] <- with(over25.grad[over25.grad$FOD1P == x & over25.grad$FTYR ==1,],median(rep(PERNP,times=PWGTP)))
+  Ages25up[i,10] <- with(over25.grad[over25.grad$FOD1P == x & over25.grad$FTYR ==1,],quantile(rep(PERNP,times=PWGTP))[2])
+  Ages25up[i,11] <- with(over25.grad[over25.grad$FOD1P == x & over25.grad$FTYR ==1,],quantile(rep(PERNP,times=PWGTP))[4])
+  Ages25up[i,12] <- nrow(over25.grad[over25.grad$FOD1P == x & over25.grad$FTYR ==1,])
+  Ages25up[i,13] <- with(over25.nograd,sum(PWGTP[FOD1P==x]))
+  Ages25up[i,14] <- with(over25.nograd,sum(PWGTP[FOD1P==x & (ESR == 1 | ESR == 2)]))
+  Ages25up[i,15] <- with(over25.nograd,sum(PWGTP[FOD1P==x & FTYR == 1]))
+  Ages25up[i,16] <- with(over25.nograd,sum(PWGTP[FOD1P==x & ESR == 3]))
+  Ages25up[i,17] <- Ages25up[i,16]/(Ages25up[i,14] + Ages25up[i,16])
+  Ages25up[i,18] <- with(over25.nograd[over25.nograd$FOD1P == x & over25.nograd$FTYR ==1,],median(rep(PERNP,times=PWGTP)))
+  Ages25up[i,19] <- with(over25.nograd[over25.nograd$FOD1P == x & over25.nograd$FTYR ==1,],quantile(rep(PERNP,times=PWGTP))[2])
+  Ages25up[i,20] <- with(over25.nograd[over25.nograd$FOD1P == x & over25.nograd$FTYR ==1,],quantile(rep(PERNP,times=PWGTP))[4])  
+  
+  Young[i,4] <- with(young,sum(PWGTP[FOD1P==x]))
+  Young[i,5] <- with(young,sum(PWGTP[FOD1P==x & (ESR == 1 | ESR == 2)]))
+  Young[i,6] <- with(young,sum(PWGTP[FOD1P==x & FTYR == 1]))
+  Young[i,7] <- with(young,sum(PWGTP[FOD1P==x & ESR == 3]))
+  Young[i,8] <- Young[i,7]/(Young[i,5] + Young[i,7])
+  Young[i,9] <- with(young.ftyr[young.ftyr$FOD1P == x & young.ftyr$PERNP>0,],median(rep(PERNP,times=PWGTP)))
+  Young[i,10] <- with(young.ftyr[young.ftyr$FOD1P == x & young.ftyr$PERNP>0,],quantile(rep(PERNP,times=PWGTP))[2])
+  Young[i,11] <- with(young.ftyr[young.ftyr$FOD1P == x & young.ftyr$PERNP>0,],quantile(rep(PERNP,times=PWGTP))[4])
+  Young[i,12] <- nrow(young.ftyr[young.ftyr$FOD1P == x & young.ftyr$PERNP>0,])
+  Young[i,13] <- with(young[young$FOD1P == x,],median(na.omit(rep(WKHP,times=PWGTP))))
+  Young[i,14] <- with(young,sum(na.omit(PWGTP[FOD1P==x & WKHP >= 35])))
+  Young[i,15] <- with(young,sum(na.omit(PWGTP[FOD1P==x & WKHP < 35])))
+  Young[i,16] <- with(young,sum(PWGTP[FOD1P==x & LowEnd == 1]))
+  Young[i,17] <- with(young,sum(na.omit(PWGTP[FOD1P==x & collegejob == 1 & (ESR == 1 | ESR == 2)]))) # only want employed
+  Young[i,18] <- with(young,sum(na.omit(PWGTP[FOD1P==x & noncollege == 1 & (ESR == 1 | ESR == 2)]))) # only want employed
+  Young[i,19] <- with(young,sum(PWGTP[FOD1P==x & SEX == 1]))
+  Young[i,20] <- with(young,sum(PWGTP[FOD1P==x & SEX == 2]))
+  Young[i,21] <- Young[i,20]/(Young[i,19] + Young[i,20])
+  
+  print(paste(i,"of",nrow(AllAges)))
+}
+
+write.csv(Young,file="Young.csv")
+write.csv(AllAges,file="AllAges.csv")
+write.csv(Ages25up,file="Ages25up.csv")