syberia · peterhurford · May 18, 2017 · May 18, 2017 · May 18, 2017 · May 18, 2017
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,3 @@
-*.csv
-**/*.csv
 .Rdata
 .Rhistory
 .Rproj.user

diff --git a/lib/adapters/zipped_url.R b/lib/adapters/zipped_url.R
@@ -1,4 +1,5 @@
 read <- function(name) {
+  project <- syberia_project(root())
   if (project$cache_exists(name)) {
     message("Reading from cache...")
     project$cache_get(name)

diff --git a/lockfile.yml b/lockfile.yml
@@ -5,7 +5,7 @@ packages:
     repo: robertzk/objectdiff
   -
     name: stagerunner
-    version: 0.5.6
+    version: 0.5.7
     repo: syberia/stagerunner
   -
     name: Ramd

diff --git a/models/dev/survey/anes2008pre.csv b/models/dev/survey/anes2008pre.csv
diff --git a/models/dev/survey/survey.R b/models/dev/survey/survey.R
@@ -4,8 +4,8 @@
 # will just use Syberia to clean the data and then analyze it in some different ways.
 
 # Our goal here is to look at data from the 2008 ANES election survey, look at the
-# time-series data and see whether people became more favorable to Obama after he won
-# the election.
+# time-series data and see whether people who feel they understand the issues were
+# more favorable toward Obama at the time of the election.
 
 list(
   # Here we use the file adapter to simply load a CSV from the same directory as the model.
@@ -19,37 +19,43 @@ list(
   # The left-hand side names the data cleaning step (called a "mungebit") and the
   # right-hand side defines it.
   ,data = list(
-    # We have a lot of data that is 0 and 1 representing booleans, so we want to
-    # transform this into the native R logical.
-    "Convert 0 and 1 to boolean" = list(
-        column_transformation(as.logical),
-        function(x) { identical(sort(setdiff(unique(x), NA)), c(0L, 1L)) })
+    # ANES uses crazy names, so let's rename some variables.
+    "Rename" = list(renamer, list("V083004" = "voted2008",
+                                  "V083037a" = "obama_tmp",
+                                  "V083079b" = "understand_issues"))
     # We're only interested in looking at the people who actually voted, so we
     # can subset.
     ,"Subset to only those who voted" = list(
         list(select_rows, NULL),
-        function(df) { df$voted2008 == TRUE }, whole = TRUE)
-    # We then can engineer a new variable looking at favorability.
-    ,"Find the post-pre difference in Obama favorability" = list(
-        new_variable,
-        function(obama_tmp_pre, obama_tmp_post) { obama_tmp_post - obama_tmp_pre },
-        "obama_tmp_diff"
-    )
+        function(df) { df$voted2008 == "1. Yes" }, whole = TRUE)
+    # The understand_issues variable in ANES is a mess, so we will recode into numbers.
+    , "Clean issue understanding" = list(value_replacer, 'understand_issues',
+        list("-1. INAP, R selected for VERSION D" = NA,
+             "1. Agree strongly" = 5,
+             "2. Agree somewhat" = 4,
+             "3. Neither agree nor disagree" = 3,
+             "4. Disagree somewhat" = 2,
+             "5. Disagree strongly" = 1,
+             "-8. Don't know" = 1))
+    # The data has to be numeric, so we use a column transformation.
+    , "Turn to numeric" = list(column_transformation(as.numeric), 'understand_issues')
   ) 
 
   # While models have a model stage, survey analysis has an analyze stage.
   # The analyze stage prints the results of each computation for you to review.
   ,analyze = list(
-    "Mean difference in Obama favorability" =
-        function(df) mean(df$obama_tmp_diff, na.rm = TRUE),
-    "Pre-election post-election t-test" =
-        function(df) t.test(df$obama_tmp_pre, df$obama_tmp_post)
-  )
-
-  # After the analyze stage, we see that there is a mean difference of +7.984 in Obama
-  # favorability (on an 100-point scale).  A t-test of favorability before and after the
-  # election has p < 0.0001, which indicates statistical significance.
-  #
-  # Therefore we declare that there was an increase in average favorability toward Obama
-  # after he got elected.
-)
+    "Mean Obama favorability" =
+        function(df) mean(df$obama_tmp, na.rm = TRUE)
+    , "Mean self-reported issue understanding" =
+        function(df) mean(df$understand_issues, na.rm = TRUE)
+    , "Look at mean Obama favorability by issue understanding" =
+        function(df) tapply(df$obama_tmp, df$understand_issues, mean)
+    # Here we see:
+    #       1        2        3        4        5
+    #       61.94737 57.15517 59.00000 63.16010 67.01230
+    # ...which means that as issue understanding goes toward 5 (greater understanding)
+    # Obama favorability increases.
+    , "Feelings toward obama x understand issues chisq test" =
+        function(df) chisq.test(df$obama_tmp, df$understand_issues)
+    # We then look at a Chi Square test which shows the result is statistically significant.
+  ))
diff --git a/models/dev/uci/msd/msd.R b/models/dev/uci/msd/msd.R
@@ -6,7 +6,8 @@ list(
   ),
 
   data = list(
-     "Rename dep_var"                = list( renamer ~ NULL, c(X1  = 'dep_var'))
+    "Rename dep_var"                 = list( renamer ~ NULL, c(X1  = 'dep_var'))
+    ,"Create ID var"                 = list(multi_column_transformation(seq_along), "dep_var", "id")
     ,"Rename timbre average vars"    = list( renamer, setNames(paste0('timbre_average_', 1:12), paste0('X', 2:13)))
     ,"Rename timbre covariance vars" = list( renamer, setNames(paste0('timbre_cov_', 1:78), paste0('X', 14:91)))
     ,"Select training rows"          = list( select_rows ~ NULL, 1:TRAIN_CUTOFF)
@@ -15,6 +16,7 @@ list(
   ),
 
   model = list('gbm'
+    , .id_var             = 'id'
     , distribution        = 'multinomial'
     , number_of_trees     = 3000
     , shrinkage_factor    = 0.005