add nutrition studies repo

2016-01-05 16:59:10 -05:00
parent fb54c1e17d
commit 26bbe405e7
18 changed files with 27935 additions and 0 deletions
--- a/nutrition-studies/.RData
+++ b/nutrition-studies/.RData
--- a/nutrition-studies/.Rproj.user/1ED50CBF/pcs/files-pane.pper
+++ b/nutrition-studies/.Rproj.user/1ED50CBF/pcs/files-pane.pper
@@ -0,0 +1,9 @@
+{
+    "path" : "~/editing/dieting-project",
+    "sortOrder" : [
+        {
+            "ascending" : true,
+            "columnIndex" : 2
+        }
+    ]
+}
--- a/nutrition-studies/.Rproj.user/1ED50CBF/pcs/source-pane.pper
+++ b/nutrition-studies/.Rproj.user/1ED50CBF/pcs/source-pane.pper
@@ -0,0 +1,3 @@
+{
+    "activeTab" : 0
+}
--- a/nutrition-studies/.Rproj.user/1ED50CBF/pcs/windowlayoutstate.pper
+++ b/nutrition-studies/.Rproj.user/1ED50CBF/pcs/windowlayoutstate.pper
@@ -0,0 +1,14 @@
+{
+    "left" : {
+        "panelheight" : 674,
+        "splitterpos" : 283,
+        "topwindowstate" : "NORMAL",
+        "windowheight" : 713
+    },
+    "right" : {
+        "panelheight" : 674,
+        "splitterpos" : 421,
+        "topwindowstate" : "NORMAL",
+        "windowheight" : 713
+    }
+}
--- a/nutrition-studies/.Rproj.user/1ED50CBF/pcs/workbench-pane.pper
+++ b/nutrition-studies/.Rproj.user/1ED50CBF/pcs/workbench-pane.pper
@@ -0,0 +1,4 @@
+{
+    "TabSet1" : 0,
+    "TabSet2" : 3
+}
--- a/nutrition-studies/.Rproj.user/1ED50CBF/saved_source_markers
+++ b/nutrition-studies/.Rproj.user/1ED50CBF/saved_source_markers
@@ -0,0 +1 @@
+{"active_set":"","sets":[]}
--- a/nutrition-studies/.Rproj.user/1ED50CBF/sdb/prop/194B460A
+++ b/nutrition-studies/.Rproj.user/1ED50CBF/sdb/prop/194B460A
@@ -0,0 +1,3 @@
+{
+    "tempName" : "Untitled1"
+}
--- a/nutrition-studies/.Rproj.user/1ED50CBF/sdb/prop/27AAD271
+++ b/nutrition-studies/.Rproj.user/1ED50CBF/sdb/prop/27AAD271
@@ -0,0 +1,3 @@
+{
+    "tempName" : "Untitled1"
+}
--- a/nutrition-studies/.Rproj.user/1ED50CBF/sdb/prop/DDD9C5D8
+++ b/nutrition-studies/.Rproj.user/1ED50CBF/sdb/prop/DDD9C5D8
@@ -0,0 +1,3 @@
+{
+    "tempName" : "Untitled1"
+}
--- a/nutrition-studies/.Rproj.user/1ED50CBF/sdb/prop/E17005B0
+++ b/nutrition-studies/.Rproj.user/1ED50CBF/sdb/prop/E17005B0
@@ -0,0 +1,3 @@
+{
+    "tempName" : "Untitled1"
+}
--- a/nutrition-studies/.Rproj.user/1ED50CBF/sdb/prop/INDEX
+++ b/nutrition-studies/.Rproj.user/1ED50CBF/sdb/prop/INDEX
@@ -0,0 +1,4 @@
+~%2Fediting%2Fdieting-project%2Fdiet_p_hacking.R="194B460A"
+~%2Fediting%2Fdieting-project%2Fscratch_work.R="27AAD271"
+~%2Fprivate-data%2Fdieting-project%2Fdiet_p_hacking.R="DDD9C5D8"
+~%2Fprivate-data%2Fdieting-project%2Fdiet_p_hacking_final.R="E17005B0"
--- a/nutrition-studies/.Rproj.user/1ED50CBF/sdb/s-8D6D153C/3EF212EB
+++ b/nutrition-studies/.Rproj.user/1ED50CBF/sdb/s-8D6D153C/3EF212EB
@@ -0,0 +1,25 @@
+{
+    "contents" : "",
+    "created" : 1452009684761.000,
+    "dirty" : false,
+    "encoding" : "",
+    "folds" : "",
+    "hash" : "0",
+    "id" : "3EF212EB",
+    "lastKnownWriteTime" : 140735085965160,
+    "path" : null,
+    "project_path" : null,
+    "properties" : {
+        "cacheKey" : "l91i3fvvp3",
+        "caption" : "regAnalysis",
+        "contentUrl" : "grid_resource/gridviewer.html?env=&obj=regAnalysis&cache_key=l91i3fvvp3",
+        "displayedObservations" : "27716",
+        "environment" : "",
+        "object" : "regAnalysis",
+        "totalObservations" : "27716",
+        "variables" : "3"
+    },
+    "relative_order" : 2,
+    "source_on_save" : false,
+    "type" : "r_dataframe"
+}
--- a/nutrition-studies/.Rproj.user/1ED50CBF/sdb/s-8D6D153C/BC3CEA54
+++ b/nutrition-studies/.Rproj.user/1ED50CBF/sdb/s-8D6D153C/BC3CEA54
@@ -0,0 +1,18 @@
+{
+    "contents" : "# Diet P-hacking\n# Andrew Flowers <andrew.flowers@fivethirtyeight.com>\n\nsetwd(\"~/private-data//dieting-project/\")\n\nrequire(readr)\nrequire(plyr)\nrequire(dplyr)\nrequire(tidyr)\n\nrawData <- read.csv(\"raw_anonymized_data.csv\")\n\n# Fix innie/out characteristics\n\nrawData$belly <- revalue(rawData$belly, c(\"Innie\"=\"Yes\", \"Outie\"=\"No\"))\n\n# FFQ variable names (should total 1066)\n\nffq <- names(rawData)[28:1093]\n\n# Characteristic variable names (should total 26)\n\ncharacteristics <- names(rawData)[2:27]\n\n# Linear regressions with respondent characteristic predicting food frequency\n\nregValues <- data.frame(food=ffq)\n\nfor (c in characteristics) regValues[,c] <- NA # Add characteristics as blank columns to regValues data frame\n\nfor (f in ffq){\n  for (c in characteristics){\n    \n    frm <- formula(paste0(f, \"~\", c))\n    reg <-  summary(lm(data=rawData, formula=frm))\n    regValues[which(regValues$food==f), c]  <- reg$coefficients[8]\n  }\n}\n\n# Extract p-values\n\nregAnalysis <- regValues %>%\n  gather(\"characteristic\", \"p_values\", 2:27) %>% \n  arrange(p_values)\n\n# Write out p-values\n\nwrite_csv(regAnalysis, \"p_values_analysis.csv\")\n\n# Note: This is an intentionally shady regression analysis. Both because of the \"p-hacking\" or \n# \"data mining\" behind running over 27,000 regresison, but also in that only the statistics reported \n# were the p-values of the characteristics (the independent variables).\n\n# IN OTHER WORDS: DO NOT TRY THIS AT HOME (AKA, THIS IS NOT AN EXAMPLE OF SOUND DATA ANALYSIS)\n",
+    "created" : 1450735440535.000,
+    "dirty" : false,
+    "encoding" : "UTF-8",
+    "folds" : "",
+    "hash" : "2220922350",
+    "id" : "BC3CEA54",
+    "lastKnownWriteTime" : 1452010952,
+    "path" : "~/private-data/dieting-project/diet_p_hacking_final.R",
+    "project_path" : "diet_p_hacking_final.R",
+    "properties" : {
+        "tempName" : "Untitled1"
+    },
+    "relative_order" : 1,
+    "source_on_save" : false,
+    "type" : "r_source"
+}
--- a/nutrition-studies/.Rproj.user/1ED50CBF/sdb/s-8D6D153C/lock_file
+++ b/nutrition-studies/.Rproj.user/1ED50CBF/sdb/s-8D6D153C/lock_file
--- a/nutrition-studies/README.md
+++ b/nutrition-studies/README.md
@@ -0,0 +1,19 @@
+### Nutrition Studies
+
+This directory contains data and code behind the story [You Can’t Trust What You Read About Nutrition](http://fivethirtyeight.com/features/you-cant-trust-what-you-read-about-nutrition).
+
+Many studies of diet and nutrition include multiple variables with vast amounts of data, making it easy to p-hack your way to sexy (and false) results. We learned this firsthand when we invited readers to take a survey about their eating habits known as the food frequency questionnaire and answer a few other questions about themselves. We ended up with 54 complete responses and looked for associations much as researchers look for links between foods and dreaded diseases. It was easy to find them. 
+
+*Warning*: This is evil (statistical) work. Do not go to the dark side. Do not try this at home.
+
+This directory contains three files:
+
+File | Description
+--- | -----
+`raw_anonymized_data.csv` | The FFQ and survey data from 54 respondents
+`p_hacking.R` | An R script that performs 27,716 regressions
+`p_values_analysis.csv` | The output data file listing the p-values
+
+**Note:** This is an intentionally shady regression analysis, both because of the "p-hacking" or "data mining" behind running more than 27,000 regressions and because the only statistics reported were the p-values of the characteristics (the independent variables).
+
+**IN OTHER WORDS: THIS IS NOT AN EXAMPLE OF SOUND DATA ANALYSIS.**
--- a/nutrition-studies/p_hacking.R
+++ b/nutrition-studies/p_hacking.R
@@ -0,0 +1,54 @@
+# Diet P-hacking
+# Andrew Flowers <andrew.flowers@fivethirtyeight.com>
+#
+# Regresses every food-frequency (FFQ) answer on every respondent characteristic
+# and writes the resulting p-values, smallest first, to p_values_analysis.csv.
+# Run from the directory that holds raw_anonymized_data.csv (no setwd() needed).
+
+# library() stops at once if a package is missing; require() only warns.
+# Keep plyr before dplyr: both export arrange(), and the later one wins.
+library(readr)
+library(plyr)
+library(dplyr)
+library(tidyr)
+
+rawData <- read.csv("raw_anonymized_data.csv")
+stopifnot(ncol(rawData) >= 1093) # column positions used below run up to 1093
+
+# Fix innie/outie characteristics (Innie -> Yes, Outie -> No)
+rawData$belly <- revalue(rawData$belly, c("Innie"="Yes", "Outie"="No"))
+
+# FFQ variable names (should total 1066)
+ffq <- names(rawData)[28:1093]
+# Characteristic variable names (should total 26)
+characteristics <- names(rawData)[2:27]
+
+# Linear regressions with respondent characteristic predicting food frequency
+regValues <- data.frame(food=ffq)
+# Add characteristics as blank columns to regValues data frame
+for (ch in characteristics) regValues[,ch] <- NA_real_
+
+for (f in ffq){
+  for (ch in characteristics){
+    frm <- formula(paste0(f, "~", ch))
+    coefs <- coef(summary(lm(data=rawData, formula=frm)))
+    # Row 2 is the first predictor term and column 4 its p-value. Unlike a
+    # bare [8], this stays correct when the table has more than two rows.
+    pValue <- if (nrow(coefs) >= 2) coefs[2, 4] else NA_real_
+    regValues[which(regValues$food==f), ch] <- pValue
+  }
+}
+
+# Extract p-values: one row per food/characteristic pair, smallest first
+regAnalysis <- regValues %>%
+  gather("characteristic", "p_values", -food) %>%
+  arrange(p_values)
+
+# Write out p-values
+write_csv(regAnalysis, "p_values_analysis.csv")
+
+# Note: This is an intentionally shady regression analysis, both because of the "p-hacking" or
+# "data mining" behind running over 27,000 regressions and because the only statistics reported
+# were the p-values of the characteristics (the independent variables).
+
+# IN OTHER WORDS: DO NOT TRY THIS AT HOME (AKA, THIS IS NOT AN EXAMPLE OF SOUND DATA ANALYSIS)
--- a/nutrition-studies/p_values_analysis.csv
+++ b/nutrition-studies/p_values_analysis.csv
--- a/nutrition-studies/raw_anonymized_data.csv
+++ b/nutrition-studies/raw_anonymized_data.csv