Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
*.csv
**/*.csv
.Rdata
.Rhistory
.Rproj.user
Expand Down
1 change: 1 addition & 0 deletions lib/adapters/zipped_url.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
read <- function(name) {
project <- syberia_project(root())
if (project$cache_exists(name)) {
message("Reading from cache...")
project$cache_get(name)
Expand Down
2 changes: 1 addition & 1 deletion lockfile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ packages:
repo: robertzk/objectdiff
-
name: stagerunner
version: 0.5.6
version: 0.5.7
repo: syberia/stagerunner
-
name: Ramd
Expand Down
2,323 changes: 2,323 additions & 0 deletions models/dev/survey/anes2008pre.csv

Large diffs are not rendered by default.

60 changes: 33 additions & 27 deletions models/dev/survey/survey.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
# will just use Syberia to clean the data and then analyze it in some different ways.

# Our goal here is to look at data from the 2008 ANES election survey, look at the
# time-series data and see whether people became more favorable to Obama after he won
# the election.
# time-series data and see whether people who feel they understand the issues were
# more favorable toward Obama at the time of the election.

list(
# Here we use the file adapter to simply load a CSV from the same directory as the model.
Expand All @@ -19,37 +19,43 @@ list(
# The left-hand side names the data cleaning step (called a "mungebit") and the
# right-hand side defines it.
,data = list(
# We have a lot of data that is 0 and 1 representing booleans, so we want to
# transform this into the native R logical.
"Convert 0 and 1 to boolean" = list(
column_transformation(as.logical),
function(x) { identical(sort(setdiff(unique(x), NA)), c(0L, 1L)) })
# ANES uses crazy names, so let's rename some variables.
"Rename" = list(renamer, list("V083004" = "voted2008",
"V083037a" = "obama_tmp",
"V083079b" = "understand_issues"))
# We're only interested in looking at the people who actually voted, so we
# can subset.
,"Subset to only those who voted" = list(
list(select_rows, NULL),
function(df) { df$voted2008 == TRUE }, whole = TRUE)
# We then can engineer a new variable looking at favorability.
,"Find the post-pre difference in Obama favorability" = list(
new_variable,
function(obama_tmp_pre, obama_tmp_post) { obama_tmp_post - obama_tmp_pre },
"obama_tmp_diff"
)
function(df) { df$voted2008 == "1. Yes" }, whole = TRUE)
# The understand_issues variable in ANES is a mess, so we will recode into numbers.
, "Clean issue understanding" = list(value_replacer, 'understand_issues',
list("-1. INAP, R selected for VERSION D" = NA,
"1. Agree strongly" = 5,
"2. Agree somewhat" = 4,
"3. Neither agree nor disagree" = 3,
"4. Disagree somewhat" = 2,
"5. Disagree strongly" = 1,
"-8. Don't know" = 1))
# The data has to be numeric, so we use a column transformation.
, "Turn to numeric" = list(column_transformation(as.numeric), 'understand_issues')
)

# While models have a model stage, survey analysis has an analyze stage.
# The analyze stage prints the results of each computation for you to review.
,analyze = list(
"Mean difference in Obama favorability" =
function(df) mean(df$obama_tmp_diff, na.rm = TRUE),
"Pre-election post-election t-test" =
function(df) t.test(df$obama_tmp_pre, df$obama_tmp_post)
)

# After the analyze stage, we see that there is a mean difference of +7.984 in Obama
# favorability (on a 100-point scale). A t-test of favorability before and after the
# election has p < 0.0001, which indicates statistical significance.
#
# Therefore we declare that there was an increase in average favorability toward Obama
# after he got elected.
)
"Mean Obama favorability" =
function(df) mean(df$obama_tmp, na.rm = TRUE)
, "Mean self-reported issue understanding" =
function(df) mean(df$understand_issues, na.rm = TRUE)
, "Look at mean Obama favorability by issue understanding" =
function(df) tapply(df$obama_tmp, df$understand_issues, mean)
# Here we see:
# 1 2 3 4 5
# 61.94737 57.15517 59.00000 63.16010 67.01230
# ...which means that as issue understanding goes toward 5 (greater understanding)
# Obama favorability increases.
, "Feelings toward obama x understand issues chisq test" =
function(df) chisq.test(df$obama_tmp, df$understand_issues)
# We then look at a Chi Square test which shows the result is statistically significant.
))
4 changes: 3 additions & 1 deletion models/dev/uci/msd/msd.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ list(
),

data = list(
"Rename dep_var" = list( renamer ~ NULL, c(X1 = 'dep_var'))
"Rename dep_var" = list( renamer ~ NULL, c(X1 = 'dep_var'))
,"Create ID var" = list(multi_column_transformation(seq_along), "dep_var", "id")
,"Rename timbre average vars" = list( renamer, setNames(paste0('timbre_average_', 1:12), paste0('X', 2:13)))
,"Rename timbre covariance vars" = list( renamer, setNames(paste0('timbre_cov_', 1:78), paste0('X', 14:91)))
,"Select training rows" = list( select_rows ~ NULL, 1:TRAIN_CUTOFF)
Expand All @@ -15,6 +16,7 @@ list(
),

model = list('gbm'
, .id_var = 'id'
, distribution = 'multinomial'
, number_of_trees = 3000
, shrinkage_factor = 0.005
Expand Down