This does the modeling

Load libraries

##regis\practicum2\test>pandoc -f docx -t markdown "Chris Busch - practicum2 proposal.docx" -o foo.md
##pandoc --extract-media ./myMediaFolder input.docx -o output.md

rm(list = ls(all = TRUE)) #clear memory
library(stringr)

setwd("~/../practicum2")
source("common.R")
require(ggplot2)

## Loading required package: ggplot2

###############load all the data

bigdata=NULL
for(year in 2010:2015){
  filenames=c(Sys.glob(paste0('data/county/',year,'*.csv')),
              Sys.glob(paste0('data/irsclean/',year,'*.csv')),
              Sys.glob(paste0('data/wonderclean/',year,'*.csv')))
  yeardata=NULL
  for(f in filenames){# f=filenames[2]
    ##gotta go by year
    message(year,' ',f)
    fn=str_match(f, '/(\\d+)')[,2]
    d=read.csv(f,stringsAsFactors = T)
    
    if(!is.null(yeardata)){
      message('merging')
      yeardata=(merge(yeardata,d,by=c('fips','Year')))
    }else{
      yeardata=d
    }
  }
  message('storing')
  if(is.null(bigdata)){
    bigdata=yeardata
  }else{
    bigdata=dplyr::bind_rows(bigdata,yeardata)
  }
}

## 2010 data/county/2010Ranked Measure Data.csv

## 2010 data/irsclean/2010-irs-soi.csv

## merging

## 2010 data/wonderclean/2010cdc.csv

## merging

## storing

## 2011 data/county/2011Additional Measure Data.csv

## 2011 data/county/2011Ranked Measure Data.csv

## merging

## 2011 data/irsclean/2011-irs-soi.csv

## merging

## 2011 data/wonderclean/2011cdc.csv

## merging

## storing

## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## 2012 data/county/2012Additional Measure Data.csv

## 2012 data/county/2012Ranked Measure Data.csv

## merging

## 2012 data/irsclean/2012-irs-soi.csv

## merging

## 2012 data/wonderclean/2012cdc.csv

## merging

## storing

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## 2013 data/county/2013Additional Measure Data.csv

## 2013 data/county/2013Ranked Measure Data.csv

## merging

## 2013 data/irsclean/2013-irs-soi.csv

## merging

## 2013 data/wonderclean/2013cdc.csv

## merging

## storing

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## 2014 data/county/2014Additional Measure Data.csv

## 2014 data/county/2014Ranked Measure Data.csv

## merging

## 2014 data/irsclean/2014-irs-soi.csv

## merging

## 2014 data/wonderclean/2014cdc.csv

## merging

## storing

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## 2015 data/county/2015Additional Measure Data.csv

## 2015 data/county/2015Ranked Measure Data.csv

## merging

## 2015 data/irsclean/2015-irs-soi.csv

## merging

## 2015 data/wonderclean/2015cdc.csv

## merging

## storing

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

rm(yeardata)
rm(d)

###done loading

checkVar=function(var1){
  message(var1,'not na=',length(which(!is.na(bigdata[[var1]]))))
  print(table(!is.na(bigdata[[ var1  ]]),bigdata$Year,dnn=c(var1,'Year')))
  summary(bigdata[[ var1  ]])
}


#this checks to see if a var was renamed
sapply(qw('diabetes.pct_diabetic pct_diabetic.diabetes'),checkVar)

## diabetes.pct_diabeticnot na=24566

##                      Year
## diabetes.pct_diabetic 2010 2011 2012 2013 2014 2015
##                 FALSE 6125 6120    0    0    0    0
##                 TRUE     0    0 6114 6120 6157 6175

## pct_diabetic.diabetesnot na=6120

##                      Year
## pct_diabetic.diabetes 2010 2011 2012 2013 2014 2015
##                 FALSE 6125    0 6114 6120 6157 6175
##                 TRUE     0 6120    0    0    0    0

##         diabetes.pct_diabetic pct_diabetic.diabetes
## Min.                  3.20000              3.000000
## 1st Qu.               9.00000              8.500000
## Median               10.50000              9.800000
## Mean                 10.58208              9.922827
## 3rd Qu.              12.00000             11.300000
## Max.                 21.60000             18.200000
## NA's              12245.00000          30691.000000

##              #thisVar becomes thatVar
varsToCombine=
  c(pct_diabetic.diabetes='diabetes.pct_diabetic',
    some_college_post_secondary_education.psed='some_college_post_secondary_education.pct',
    some_college_post_secondary_education.pct_psed='some_college_post_secondary_education.pct',
    access_to_healthy_foods.pct.x='access_to_healthy_foods.pct_food',
    access_to_recreational_facilities.rec_facility_rate='access_to_recreational_facilities.rec_fac_rate',
    air_pollution_particulate_matter.average_daily_pm2_5='daily_fine_particulate_matter.average_pm25',
    air_pollution_particulate_matter.average_daily_pm25='daily_fine_particulate_matter.average_pm25',
    diabetic_monitoring.pct_receiving_hba1c='diabetic_screening.pct_hba1c'
  )
for(n in names(varsToCombine)){
  print(table(sign(bigdata[[ varsToCombine[n]  ]]),bigdata$Year,dnn=c(varsToCombine[n],'Year (before)')))
  bigdata[[ varsToCombine[n]  ]]  = ifelse(is.na(bigdata[[ varsToCombine[n]  ]]),bigdata[[ n  ]],bigdata[[ varsToCombine[n]  ]])
  bigdata[[ n  ]]=NULL
  print(table(sign(bigdata[[ varsToCombine[n]  ]]),bigdata$Year,dnn=c(varsToCombine[n],'Year (after)')))
}

##                      Year (before)
## diabetes.pct_diabetic 2010 2011 2012 2013 2014 2015
##                     1    0    0 6114 6120 6157 6175
##                      Year (after)
## diabetes.pct_diabetic 2010 2011 2012 2013 2014 2015
##                     1    0 6120 6114 6120 6157 6175
##                                          Year (before)
## some_college_post_secondary_education.pct 2010 2011 2012 2013 2014 2015
##                                         1    0    0    0 6120 6157    0
##                                          Year (after)
## some_college_post_secondary_education.pct 2010 2011 2012 2013 2014 2015
##                                         1    0 6120    0 6120 6157    0
##                                          Year (before)
## some_college_post_secondary_education.pct 2010 2011 2012 2013 2014 2015
##                                         1    0 6120    0 6120 6157    0
##                                          Year (after)
## some_college_post_secondary_education.pct 2010 2011 2012 2013 2014 2015
##                                         1    0 6120 6114 6120 6157    0
##                                 Year (before)
## access_to_healthy_foods.pct_food 2010 2011 2012 2013 2014 2015
##                                0   68   33    0    0    0    0
##                                1 6057 6078    0    0    0    0
##                                 Year (after)
## access_to_healthy_foods.pct_food 2010 2011 2012 2013 2014 2015
##                                0   68   33   35    0    0    0
##                                1 6057 6078 6069    0    0    0
##                                               Year (before)
## access_to_recreational_facilities.rec_fac_rate 2010 2011 2012 2013 2014
##                                              0    0 1180 1269    0    0
##                                              1    0 4940 4845    0    0
##                                               Year (before)
## access_to_recreational_facilities.rec_fac_rate 2015
##                                              0    0
##                                              1    0
##                                               Year (after)
## access_to_recreational_facilities.rec_fac_rate 2010 2011 2012 2013 2014
##                                              0    0 1180 1269 1320    0
##                                              1    0 4940 4845 4800    0
##                                               Year (after)
## access_to_recreational_facilities.rec_fac_rate 2015
##                                              0    0
##                                              1    0
##                                           Year (before)
## daily_fine_particulate_matter.average_pm25 2010 2011 2012 2013 2014 2015
##                                          1    0    0    0 6079    0    0
##                                           Year (after)
## daily_fine_particulate_matter.average_pm25 2010 2011 2012 2013 2014 2015
##                                          1    0    0    0 6079    0 6134
##                                           Year (before)
## daily_fine_particulate_matter.average_pm25 2010 2011 2012 2013 2014 2015
##                                          1    0    0    0 6079    0 6134
##                                           Year (after)
## daily_fine_particulate_matter.average_pm25 2010 2011 2012 2013 2014 2015
##                                          1    0    0    0 6079 6119 6134
##                             Year (before)
## diabetic_screening.pct_hba1c 2010 2011 2012 2013 2014 2015
##                            1 6084 6045 6107 6110 6154    0
##                             Year (after)
## diabetic_screening.pct_hba1c 2010 2011 2012 2013 2014 2015
##                            1 6084 6045 6107 6110 6154 6163

#################  define the predictors under consideration
yvar='Death.per.100k'
predictors=read.csv('data/predictors.csv',stringsAsFactors = F,na.strings = "")
predictors$column=coalesce(predictors$shorter,predictors$origcolumn)
##rename those pesky long column names
changelist=list()
for(i in 1:nrow(predictors)){
  if(!is.na(predictors$shorter[i])){
    changelist[[ predictors$origcolumn[i] ]]= predictors$shorter[i]
  }
}
predictors[!is.na(predictors$shorter),c('shorter','origcolumn')]

##                       shorter
## 219   mentally_unhealthy_days
## 223 physically_unhealthy_days
##                                         origcolumn
## 219     poor_mental_health_days.mentally_unhealthy
## 223 poor_physical_health_days.physically_unhealthy

bigdata=rename.columns(bigdata,changelist)

## poor_physical_health_days.physically_unhealthy renamed columns to physically_unhealthy_days

## poor_mental_health_days.mentally_unhealthy renamed columns to mentally_unhealthy_days

setdiff(names(bigdata),predictors$column) ##what is missing in the first compared to second

## [1] "Death.per.100k"

setdiff(predictors$column,names(bigdata)) ##what is missing in the first compared to second

## [1] "access_to_healthy_foods.pct.x"                       
## [2] "access_to_recreational_facilities.rec_facility_rate" 
## [3] "air_pollution_particulate_matter.average_daily_pm2_5"
## [4] "air_pollution_particulate_matter.average_daily_pm25" 
## [5] "diabetic_monitoring.pct_receiving_hba1c"             
## [6] "pct_diabetic.diabetes"                               
## [7] "some_college_post_secondary_education.pct_psed"      
## [8] "some_college_post_secondary_education.psed"

ignore=unique(predictors$column[predictors$predictor==0])
discardVars=c(ignore)
predictorVarsRaw=unique(predictors$column[predictors$predictor==1 & predictors$column!=yvar])

average.out=function(bigdata){
  shush({
    bigdata=bigdata[!is.na(bigdata$Death.per.100k),] ##only data with Death.per.100k
    impute.df=as.data.frame(aggregate(bigdata,list(fips=bigdata$fips),FUN=function(x) mean(x,na.rm=T)))
  })
  impute.df
}

median.out=function(bigdata){
  shush({
    bigdata=bigdata[!is.na(bigdata$Death.per.100k),] ##only data with Death.per.100k
    impute.df=as.data.frame(aggregate(bigdata,list(fips=bigdata$fips),FUN=function(x) median(x,na.rm=T)))
  })
  impute.df
}

impute.df=average.out(bigdata);




cor(bigdata[,qw('other_primary_care_providers.pcp_rate
previous_other_primary_care_providers_data.pcp_rate
                previous_primary_care_physician_data_used_to_calculate_rankings.pcp_rate
                primary_care_physicians.pcp_rate
                ')],use = "pairwise.complete.obs")

##                                                                          other_primary_care_providers.pcp_rate
## other_primary_care_providers.pcp_rate                                                                1.0000000
## previous_other_primary_care_providers_data.pcp_rate                                                  0.9917949
## previous_primary_care_physician_data_used_to_calculate_rankings.pcp_rate                                    NA
## primary_care_physicians.pcp_rate                                                                     0.5299042
##                                                                          previous_other_primary_care_providers_data.pcp_rate
## other_primary_care_providers.pcp_rate                                                                              0.9917949
## previous_other_primary_care_providers_data.pcp_rate                                                                1.0000000
## previous_primary_care_physician_data_used_to_calculate_rankings.pcp_rate                                                  NA
## primary_care_physicians.pcp_rate                                                                                   0.5243828
##                                                                          previous_primary_care_physician_data_used_to_calculate_rankings.pcp_rate
## other_primary_care_providers.pcp_rate                                                                                                          NA
## previous_other_primary_care_providers_data.pcp_rate                                                                                            NA
## previous_primary_care_physician_data_used_to_calculate_rankings.pcp_rate                                                                1.0000000
## primary_care_physicians.pcp_rate                                                                                                        0.9614487
##                                                                          primary_care_physicians.pcp_rate
## other_primary_care_providers.pcp_rate                                                           0.5299042
## previous_other_primary_care_providers_data.pcp_rate                                             0.5243828
## previous_primary_care_physician_data_used_to_calculate_rankings.pcp_rate                        0.9614487
## primary_care_physicians.pcp_rate                                                                1.0000000

What years hold what data

fullness=function() {
  years=c()
  for(y in unique(bigdata$Year)){
    years[[as.character(y)]]=0
    for(n in names(bigdata)){
      #catln(y,n,mean(!is.na(bigdata[[n]][bigdata$Year==y])))
      years[[as.character(y)]]=years[[as.character(y)]]+mean(!is.na(bigdata[[n]][bigdata$Year==y]))
    }
  }
  years
}
fullness()

##      2010      2011      2012      2013      2014      2015 
##  68.32180  99.32582 115.05806 150.32745 126.24395 142.90623

bigdata hotdeck mean imputation to only impute those measures for the same fips county

for(n in names(bigdata)){
  if(is.numeric(bigdata[[n]]) && any(is.na(bigdata[[n]]) )){
    
    lookup=impute.df[[n]]
    names(lookup)=as.character(impute.df$fips)
    
    bigdata[[n]]=ifelse(is.na(bigdata[[n]]), lookup[as.character(bigdata$fips)] ,bigdata[[n]])
  }
}
fullness()

##     2010     2011     2012     2013     2014     2015 
## 294.1607 296.1165 296.0870 295.9634 295.9289 296.0363

require(usmap)

## Loading required package: usmap

## Warning: package 'usmap' was built under R version 3.4.2

for(age in unique(bigdata$Age.Grouping)){
  # plot_counties(bigdata[bigdata$Age.Grouping==age,c('fips','Deaths')],
  #               yvar='Deaths',low='green',high='red',main=paste(age,"Deaths"))
  plot_counties(winsor1Df(
    median.out(bigdata[bigdata$Age.Grouping==age,c('fips','Death.per.100k')]),
                          ignore='fips',fraction=.05),
                yvar='Death.per.100k',low='green',high='red',
                main=paste(age,"Death Rates (Winsored)"),ylab='Deaths/\nPopulation\n*100k')
}

## void winsor(){ //generated by winsor1Df 
## Death.per.100k =max( 3582.19444996898 ,min( 7819.82215733183 , Death.per.100k )); //limits are  518.116166354715 3582.19444996898 7819.82215733183 41463.4146341463 
## }

## void winsor(){ //generated by winsor1Df 
## Death.per.100k =max( 288.532980447823 ,min( 1355.51046017491 , Death.per.100k )); //limits are  137.962103908121 288.532980447823 1355.51046017491 3254.43786982249 
## }

## void winsor(){ //generated by winsor1Df 
## Death.per.100k =max( 67.1297848933185 ,min( 1515.55824885756 , Death.per.100k )); //limits are  31.5845054292382 67.1297848933185 1515.55824885756 3703.7037037037 
## }

a100k=100000
n='ADULT' #hand executing this line allows one to step into the loop to bypass the for loop
for(n in unique(bigdata$Age.Grouping)){
  
  d=bigdata[bigdata$Age.Grouping==n,]
  d=d[!is.na(d$Deaths) & !is.na(d$Population),]
  #plot(density(log(d$Population)),main=paste(n,'log(Age Group Populations) Density'))
  m=sum(d$Deaths)/sum(d$Population)
  priori=sum(d$Deaths,na.rm = T)/sum(d$Population,na.rm = T)
  prioriweight=1/priori * 10
  catln(n,priori,prioriweight)
  priori=sum(d$Deaths[d$Population>prioriweight],na.rm = T)/sum(d$Population[d$Population>prioriweight],na.rm = T)
  prioriweight=1/priori * 10
  catln(n,priori,prioriweight)
  
  ##good
  plot(d$Population, (d$Deaths)/(d$Population)*a100k,
       col=rgb(1,0,0,0.2),log='x',main=paste('Deaths/Population for',n,'Group'),
       xlab='Age Group Population in County',
       ylab='Deaths/Population*100k in Age Group')
  # points(d$Population,
  #        (d$Deaths+10/2)/(d$Population+prioriweight/2), #*a100k,
  #        col='purple',pch='.')
  abline(v=10*1/priori,col='gray');
  grid()
  abline(h=quantile((d$Deaths)/(d$Population)*a100k,0.95),col='blue') #winsor
  text(10*1/priori,max((d$Deaths)/(d$Population)*a100k)*0.05,round(prioriweight))
  ##
  # plot(d$Population, 
  #      d$Deaths,pch=20,
  #      col=rgb(1,0,0,0.1),main=n,log='xy',xlab='Age Group Population',ylab='Deaths in Age Group');grid()
  # points(sort(d$Population),sort(d$Population)*priori,type='l')
  # abline(v=prioriweight,col='gray');grid()
  # text(prioriweight,800,round(10*1/priori))
  ###
  print(summary(d$Deaths))
  catln(n,'priori',priori,
        'prioriweight',prioriweight,
        'need at least a pop of this size',min(d$Deaths)*1/priori,
        " small counties ",      sum(d$Population<round(10*1/priori)),
        " big counties ",sum(d$Population>=round(10*1/priori) ))
}

## SENIOR 0.04299791 232.5694 
## SENIOR 0.04298034 232.6645

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    10.0   101.0   210.0   624.5   502.0 45604.0 
## SENIOR priori 0.04298034 prioriweight 232.6645 need at least a pop of this size 232.6645  small counties  461  big counties  17611 
## ADULT 0.003633879 2751.88 
## ADULT 0.003605117 2773.835

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    10.0    35.0    77.0   236.4   197.0 15835.0 
## ADULT priori 0.003605117 prioriweight 2773.835 need at least a pop of this size 2773.835  small counties  2705  big counties  13128 
## YOUTH 0.0008552847 11692.01 
## YOUTH 0.000729517 13707.7

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.00   14.00   24.00   54.51   57.00 1182.00 
## YOUTH priori 0.000729517 prioriweight 13707.7 need at least a pop of this size 13707.7  small counties  1605  big counties  1301

Trees

age='SENIOR' #hand executing this line allows one to step into the loop to bypass the for loop
age='ADULT'
age='YOUTH'
year=0
importance=data.frame()
trees=list()
perf.table=NULL
for(age in unique(bigdata$Age.Grouping)){
  
  d=bigdata[bigdata$Age.Grouping==age & !is.na(bigdata$Death.per.100k),] 
  set.seed(7)
  trainset=runif(nrow(d))<0.8
  label=paste(age,ifelse(year==0,'',year))
  mean(is.na(d$Death.per.100k))
  
  d=winsor1Df(d,ignore = ignore,trace=F) 
  
  #trees handle missing data
  #d=impute(d,ignore = ignore,missing.threshold = 0.25)
  
  require(MASS)
  
  predictorVars=intersect(names(d),predictorVarsRaw)

  colinearvars=caret::findCorrelation(cor(d[,predictorVars],use="pairwise.complete.obs"),names = T)
  colinearpos=caret::findCorrelation(cor(d[,predictorVars],use="pairwise.complete.obs"),names = F)
  catln('co-linear variables to be ignored:',caret::findCorrelation(cor(d[,predictorVars],use="pairwise.complete.obs"),names = T))
  if(! setequal(predictorVars[colinearpos],colinearvars)) stop('vars mismatch')
  predictorVars=predictorVars[-colinearpos]
  
  require(rpart)
  require(rpart.plot)
  library(partykit)
  require(dplyr)
  summary(d$Population)
  
  mtree=rpart(ezformula(c(yvar,predictorVars)),d[trainset,],weights = d$Population[trainset],
              control = rpart.control(cp = 0.005))
  #printcp(mtree) # display the results 
  plotcp(mtree,main=label) # visualize cross-validation results 
  cp=mtree$cptable[which.min(mtree$cptable[,"xerror"]),"CP"] ##best CP
  message(label,'cp=',cp)
  mtree=prune(mtree,cp)

  catln(label,'tree depth is',max(rpart:::tree.depth(as.numeric(rownames(mtree$frame)))))
  print_rpart(mtree,digits=2,nlab = 'Counties:',ylab=paste0(yvar,':'))
  
  agedata=bigdata[bigdata$Age.Grouping==age & !is.na(bigdata$Death.per.100k),] 
  if(nrow(agedata)!=nrow(d))stop('the winsored and not-winsored should be the same length')

  catln(age,'all data')
  cbind(agedata,node=round((predict(mtree,agedata,type='vector'))),
        response=(predict(mtree,agedata,type='vector'))) %>%
    dplyr::group_by(node) %>% 
    dplyr::summarise(counties=length(fips),
                     deaths.pred=round(sum(response/100000*Population)),
                     deaths.act=sum(Deaths),
                     age.pop=sum(Population),
                     #dr100k.mean=mean(Death.per.100k),
                     dr100k.fit=mean(response)) %>% 
    dplyr::mutate(dr100k.group=deaths.act*100000/age.pop) %>% as.data.frame %>% print
  
  catln(age,'test data')
  cbind(agedata[!trainset,],node=round((predict(mtree,agedata[!trainset,],type='vector'))),
        response=(predict(mtree,agedata[!trainset,],type='vector'))) %>%
    dplyr::group_by(node) %>% 
    dplyr::summarise(counties=length(fips),
                     deaths.pred=round(sum(response/100000*Population)),
                     deaths.act=sum(Deaths),
                     age.pop=sum(Population),
                     #dr100k.mean=mean(Death.per.100k),
                     dr100k.fit=mean(response)) %>% 
    dplyr::mutate(dr100k.group=deaths.act*100000/age.pop) %>% as.data.frame %>% print
  
  importance=dplyr::bind_rows(importance,
                              cbind(data.frame(age=age,year=year),
                                    as.data.frame(t(as.data.frame(mtree$variable.importance)))))
  
  #savedPlots=list()
  for(n in (used.rpart.vars(mtree))){
    lm.m=lm(ezformula(c(yvar,n)),d[trainset,],weights = d$Population[trainset])
    s.lm.m=summary(lm.m)
    c.lm.m=coef(s.lm.m)
    if(c.lm.m[2,"Pr(>|t|)"]<0.05){
      catln(n,paste("slope=",signif(c.lm.m[2,"Estimate"],2),
                    "r^2=",signif(s.lm.m$adj.r.squared,2)))
      plot(d[trainset,c(n,yvar)],ylab=yvar,
           xlab=n,main=age,
           sub=paste0(round(mtree$variable.importance[[n]]/sum(mtree$variable.importance)*100),'% importance'),
           #col=rgb(0,0,0,0.1/2)
           col=rgb(0,0,0,(log(d$Population[trainset])/log(max(d$Population[trainset])))/5)
           );grid()
      text(mean(d[[n]],na.rm=T),mean(d[[yvar]]),
           paste("slope=",signif(c.lm.m[2,"Estimate"],2),
                 "\nr^2=",signif(s.lm.m$adj.r.squared,2)),
           col=ifelse(c.lm.m[2,"Estimate"]<0,'darkgreen','red'),font=2,cex=1.5)
      abline(lm.m,col='steelblue')  
    }
  }
  
  
  trees[[age]]=mtree;
  
  #plot(d$Deaths,(predict(mtree)/100000)*d$Population,col=rgb(0,0,0,0.2),main=label);grid()
  catln(label,'all',
        '\nrmse deaths=',rmse(d$Deaths,(predict(mtree,d)/100000)*d$Population),
        '\nrmse Deaths by priori=',rmse(d$Deaths,sum(d$Deaths)/sum(d$Population)*d$Population),
        '\ntree Deaths rsq=',rsq(d$Deaths,(predict(mtree,d)/100000)*d$Population),
        '\nprior Deaths rsq=',rsq(d$Deaths,sum(d$Deaths)/sum(d$Population)*d$Population),
        
        '\nfitted Death.per.100k rmse=',rmse(d$Death.per.100k,predict(mtree,d)),
        '\nweighted Death.per.100k rmse=',rmse(d$Death.per.100k,predict(mtree,d),weights = d$Population),
        '\nfitted Death.per.100k rsq=',rsq(d$Death.per.100k,(predict(mtree,d))))
  .=data.frame(#'Age Group'=label,
        'Deaths RMSE'=rmse(d$Deaths[!trainset],((predict(mtree,d[!trainset,]))/100000)*d$Population[!trainset]),
        'Deaths by Priori RMSE'=rmse(d$Deaths[!trainset],sum(d$Deaths[!trainset])/sum(d$Population[!trainset])*d$Population[!trainset]),
        'Tree Deaths RSq'=rsq(d$Deaths[!trainset],(predict(mtree,d[!trainset,])/100000)*d$Population[!trainset]),
        'Priori Deaths RSq'=rsq(d$Deaths[!trainset],sum(d$Deaths[!trainset])/sum(d$Population[!trainset])*d$Population[!trainset]),
        'Fitted Death.per.100k RMSE'=rmse(d$Death.per.100k[!trainset],(predict(mtree,d[!trainset,]))),
        'Weighted Death.per.100k RMSE'=rmse(d$Death.per.100k[!trainset],predict(mtree,d[!trainset,]),weights = d$Population[!trainset]),
        'Fitted Death.per.100k RSq'=rsq(d$Death.per.100k[!trainset],(predict(mtree,d[!trainset,]))))
  rownames(.)=age
  if(is.null(perf.table))perf.table=.
  else perf.table=rbind(perf.table,.)
  
  print(summary(mtree))
  
}  

## Loading required package: MASS

## co-linear variables to be ignored: pct_of_children_eligible_for_free_lunch teen_births.birth_rate homicides.homicide_rate sexually_transmitted_infections.rates_per_100000 excessive_drinking.pct high_housing_costs.pct violent_crime_rate hiv_prevalence_rate hiv_rate commuting_alone.pct_drive

## Loading required package: rpart

## Loading required package: rpart.plot

## Loading required package: grid

## Loading required package: dplyr

## Warning: package 'dplyr' was built under R version 3.4.2

## 
## Attaching package: 'dplyr'

## The following object is masked _by_ '.GlobalEnv':
## 
##     coalesce

## The following object is masked from 'package:MASS':
## 
##     select

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## SENIOR cp=0.005

## SENIOR  tree depth is 6 
## n= 14460 
## 
## node), split, n, yval
##       * denotes terminal node
## 
##  1) root Counties:14460 Death.per.100k:4300  
##    2) adult_obesity.pct_obese< 26 Counties:2191 Death.per.100k:3900  
##      4) preventable_hospital_stays.hosp__rate< 57 Counties:1581 Death.per.100k:3800  
##        8) other_primary_care_providers.pcp_rate< 68 Counties:1052 Death.per.100k:3700  
##         16) unemployed.ratio< 0.07 Counties:508 Death.per.100k:3600  
##           32) mammography_screening.pct>=74 Counties:54 Death.per.100k:3200 *
##           33) mammography_screening.pct< 74 Counties:454 Death.per.100k:3600  
##             66) median_household_income>=8.1e+04 Counties:70 Death.per.100k:3300 *
##             67) median_household_income< 8.1e+04 Counties:384 Death.per.100k:3700 *
##         17) unemployed.ratio>=0.07 Counties:544 Death.per.100k:3900  
##           34) social_associations.association_rate< 7.3 Counties:183 Death.per.100k:3800 *
##           35) social_associations.association_rate>=7.3 Counties:361 Death.per.100k:4100 *
##        9) other_primary_care_providers.pcp_rate>=68 Counties:529 Death.per.100k:4000  
##         18) adult_obesity.pct_obese< 16 Counties:22 Death.per.100k:3400 *
##         19) adult_obesity.pct_obese>=16 Counties:507 Death.per.100k:4100 *
##      5) preventable_hospital_stays.hosp__rate>=57 Counties:610 Death.per.100k:4200  
##       10) social_associations.association_rate< 6.1 Counties:77 Death.per.100k:3800 *
##       11) social_associations.association_rate>=6.1 Counties:533 Death.per.100k:4300  
##         22) unemployed.ratio< 0.099 Counties:369 Death.per.100k:4200 *
##         23) unemployed.ratio>=0.099 Counties:164 Death.per.100k:4600 *
##    3) adult_obesity.pct_obese>=26 Counties:12269 Death.per.100k:4600  
##      6) social_associations.association_rate< 8.8 Counties:2124 Death.per.100k:4200  
##       12) adult_smoking.pct_smokers< 20 Counties:818 Death.per.100k:4000  
##         24) primary_care_provider_rate.pcp< 42 Counties:166 Death.per.100k:3200  
##           48) physical_inactivity.pct_physically_inactive< 24 Counties:34 Death.per.100k:2700 *
##           49) physical_inactivity.pct_physically_inactive>=24 Counties:132 Death.per.100k:4300 *
##         25) primary_care_provider_rate.pcp>=42 Counties:652 Death.per.100k:4100  
##           50) injury_deaths.death_rate< 39 Counties:78 Death.per.100k:3700 *
##           51) injury_deaths.death_rate>=39 Counties:574 Death.per.100k:4100 *
##       13) adult_smoking.pct_smokers>=20 Counties:1306 Death.per.100k:4500  
##         26) income_inequality.ratio< 4.4 Counties:476 Death.per.100k:4200 *
##         27) income_inequality.ratio>=4.4 Counties:830 Death.per.100k:4800 *
##      7) social_associations.association_rate>=8.8 Counties:10145 Death.per.100k:4700  
##       14) physical_inactivity.pct_physically_inactive< 25 Counties:1586 Death.per.100k:4400 *
##       15) physical_inactivity.pct_physically_inactive>=25 Counties:8559 Death.per.100k:4800  
##         30) access_to_parks.pct_park< 14 Counties:4422 Death.per.100k:4700  
##           60) preventable_hospital_stays.hosp__rate< 64 Counties:1244 Death.per.100k:4400 *
##           61) preventable_hospital_stays.hosp__rate>=64 Counties:3178 Death.per.100k:4800 *
##         31) access_to_parks.pct_park>=14 Counties:4137 Death.per.100k:4900  
##           62) median_household_income>=5.2e+04 Counties:480 Death.per.100k:4600 *
##           63) median_household_income< 5.2e+04 Counties:3657 Death.per.100k:5000 *
## SENIOR all data

## Warning: package 'bindrcpp' was built under R version 3.4.2

##    node counties deaths.pred deaths.act  age.pop dr100k.fit dr100k.group
## 1  2674       45       21588      21659   807286   2674.206     2682.940
## 2  3196       68      115344     115441  3609119   3195.912     3198.592
## 3  3263       91      161255     163354  4941412   3263.334     3305.816
## 4  3404       29       78373      78255  2302246   3404.182     3399.072
## 5  3660       95      107655     106612  2941527   3659.823     3624.376
## 6  3689      488      804212     806502 21800523   3688.956     3699.462
## 7  3757      223      672525     674100 17901841   3756.736     3765.535
## 8  3796       93      293166     291992  7722843   3796.091     3780.887
## 9  4083      437      341294     339971  8359615   4082.653     4066.826
## 10 4121      636      776557     778067 18843798   4121.023     4129.035
## 11 4128      713     1093669    1095347 26495287   4127.786     4134.120
## 12 4195      466      666555     669173 15889744   4194.878     4211.352
## 13 4233      589      332330     331236  7850776   4233.081     4219.150
## 14 4268      165       18587      18448   435522   4267.786     4235.837
## 15 4424     1566      456308     456279 10313613   4424.326     4424.046
## 16 4449     2038     1157043    1157498 26009674   4448.509     4450.260
## 17 4568      598      258440     258423  5657841   4567.829     4567.520
## 18 4586      197      325972     326373  7107596   4586.246     4591.890
## 19 4767     1030      449956     450335  9439133   4766.921     4770.936
## 20 4818     3984      959949     959437 19922227   4818.483     4815.912
## 21 4960     4521     2188147    2186906 44112484   4960.380     4957.567
## SENIOR test data 
##    node counties deaths.pred deaths.act age.pop dr100k.fit dr100k.group
## 1  2674       11        3378       3833  126323   2674.206     3034.285
## 2  3196       14       26165      26361  818703   3195.912     3219.849
## 3  3263       21       54365      56478 1665945   3263.334     3390.148
## 4  3404        7       11472      11354  336987   3404.182     3369.269
## 5  3660       17       20872      19829  570292   3659.823     3476.991
## 6  3689      104      165680     168229 4491256   3688.956     3745.701
## 7  3757       40      121589     123175 3236555   3756.736     3805.744
## 8  3796       16       36541      35372  962590   3796.091     3674.669
## 9  4083       76       57433      56305 1406756   4082.653     4002.471
## 10 4121      129      178252     179772 4325441   4121.023     4156.154
## 11 4128      139      221235     222921 5359658   4127.786     4159.239
## 12 4195       97      198648     201252 4735485   4194.878     4249.871
## 13 4233      113       65760      64666 1553472   4233.081     4162.676
## 14 4268       33        4561       4457  106880   4267.786     4170.097
## 15 4424      321       90473      90426 2044898   4424.326     4422.030
## 16 4449      452      248280     248736 5581190   4448.509     4456.684
## 17 4568      119       47256      47230 1034531   4567.829     4565.354
## 18 4586       33       49314      49715 1075255   4586.246     4623.554
## 19 4767      200       90096      90479 1890033   4766.921     4787.165
## 20 4818      804      191822     191275 3980966   4818.483     4804.738
## 21 4960      866      430585     429279 8680491   4960.380     4945.331
## adult_obesity.pct_obese slope= 79 r^2= 0.3

## physical_inactivity.pct_physically_inactive slope= 73 r^2= 0.32

## social_associations.association_rate slope= 79 r^2= 0.25

## preventable_hospital_stays.hosp__rate slope= 16 r^2= 0.2

## injury_deaths.death_rate slope= 13 r^2= 0.14

## adult_smoking.pct_smokers slope= 68 r^2= 0.29

## unemployed.ratio slope= 3200 r^2= 0.032

## median_household_income slope= -0.02 r^2= 0.17

## other_primary_care_providers.pcp_rate slope= 3.4 r^2= 0.03

## mammography_screening.pct slope= -9.7 r^2= 0.0091

## primary_care_provider_rate.pcp slope= -0.71 r^2= 0.003

## income_inequality.ratio slope= 39 r^2= 0.0016

## access_to_parks.pct_park slope= -6 r^2= 0.047

## SENIOR  all 
## rmse deaths= 100.7995 
## rmse Deaths by priori= 251.1529 
## tree Deaths rsq= 0.996113 
## prior Deaths rsq= 0.984624 
## fitted Death.per.100k rmse= 1923.5 
## weighted Death.per.100k rmse= 435.1074 
## fitted Death.per.100k rsq= 0.04584812 
## Call:
## rpart(formula = ezformula(c(yvar, predictorVars)), data = d[trainset, 
##     ], weights = d$Population[trainset], control = rpart.control(cp = 0.005))
##   n= 14460 
## 
##             CP nsplit rel error    xerror         xstd
## 1  0.258824362      0 1.0000000 1.0008793 0.0003132421
## 2  0.081461956      1 0.7411756 0.7599032 0.0002994681
## 3  0.028995912      2 0.6597137 0.6665662 0.0002921715
## 4  0.025553120      3 0.6307178 0.6401958 0.0002915996
## 5  0.023350731      4 0.6051646 0.6186134 0.0002892209
## 6  0.015967146      5 0.5818139 0.6046426 0.0002889076
## 7  0.014609014      6 0.5658468 0.5952839 0.0002888330
## 8  0.011201562      7 0.5512378 0.5791833 0.0002882086
## 9  0.010895643      8 0.5400362 0.5725608 0.0002882920
## 10 0.010182935      9 0.5291406 0.5705973 0.0002888298
## 11 0.009458984     10 0.5189576 0.5611237 0.0002886842
## 12 0.008731830     12 0.5000397 0.5529538 0.0002892816
## 13 0.007215268     13 0.4913078 0.5470928 0.0002886462
## 14 0.006865151     14 0.4840926 0.5381785 0.0002880015
## 15 0.006444455     15 0.4772274 0.5321912 0.0002878544
## 16 0.005735774     16 0.4707829 0.5207696 0.0002879072
## 17 0.005400564     17 0.4650472 0.5189931 0.0002872032
## 18 0.005345174     19 0.4542460 0.5163929 0.0002868944
## 19 0.005000000     20 0.4489009 0.5092958 0.0002868466
## 
## Variable importance
##                                                   adult_obesity.pct_obese 
##                                                                        17 
##                               physical_inactivity.pct_physically_inactive 
##                                                                        11 
##                                                       college_degrees.pct 
##                                                                         9 
##                                    households_with_high_housing_costs.pct 
##                                                                         9 
##                                                     diabetes.pct_diabetic 
##                                                                         8 
##                                               severe_housing_problems.pct 
##                                                                         7 
##                                      social_associations.association_rate 
##                                                                         7 
##                                     preventable_hospital_stays.hosp__rate 
##                                                                         3 
## preventable_hospital_stays_ambulatory_care_sensitive_conditions.acsc_rate 
##                                                                         2 
##                                     long_commute_driving_alone.pct_drives 
##                                                                         2 
##                                demographics.pct_not_proficient_in_english 
##                                                                         2 
##                                                  injury_deaths.death_rate 
##                                                                         2 
##                                                 adult_smoking.pct_smokers 
##                                                                         1 
##                                                          unemployed.ratio 
##                                                                         1 
##                                                            pct_illiterate 
##                                                                         1 
##                                                   median_household_income 
##                                                                         1 
##                                                  air_pollution_ozone_days 
##                                                                         1 
##                                                mammography_screening.rate 
##                                                                         1 
##                                     other_primary_care_providers.pcp_rate 
##                                                                         1 
##                                                 mammography_screening.pct 
##                                                                         1 
##                              motor_vehicle_crash_deaths.mv_mortality_rate 
##                                                                         1 
##                                       alcohol_impaired_driving_deaths.pct 
##                                                                         1 
##                                            primary_care_provider_rate.pcp 
##                                                                         1 
##                                                         health_care_costs 
##                                                                         1 
##                                                       hiv_prevalence.rate 
##                                                                         1 
##                                                   income_inequality.ratio 
##                                                                         1 
##                                                          some_college.pct 
##                                                                         1 
##                                                       sahie.pct.uninsured 
##                                                                         1 
##                                                  access_to_parks.pct_park 
##                                                                         1 
##                                      drug_poisoning_deaths.mortality_rate 
##                                                                         1 
## 
## Node number 1: 14460 observations,    complexity param=0.2588244
##   mean=4300.236, MSE=418978.2 
##   left son=2 (2191 obs) right son=3 (12269 obs)
##   Primary splits:
##       adult_obesity.pct_obese                     < 26.35      to the left,  improve=0.2588244, (0 missing)
##       physical_inactivity.pct_physically_inactive < 24.48      to the left,  improve=0.2575101, (2 missing)
##       adult_smoking.pct_smokers                   < 18.33167   to the left,  improve=0.2256138, (992 missing)
##       social_associations.association_rate        < 9.821322   to the left,  improve=0.2122582, (97 missing)
##       demographics.pct_not_proficient_in_english  < 4.22358    to the right, improve=0.1695233, (2 missing)
##   Surrogate splits:
##       college_degrees.pct                         < 27.15661   to the right, agree=0.784, adj=0.473, (0 split)
##       physical_inactivity.pct_physically_inactive < 22.85      to the left,  agree=0.784, adj=0.473, (0 split)
##       diabetes.pct_diabetic                       < 8.865152   to the left,  agree=0.775, adj=0.451, (0 split)
##       households_with_high_housing_costs.pct      < 36.65      to the right, agree=0.767, adj=0.431, (0 split)
##       severe_housing_problems.pct                 < 19.10782   to the right, agree=0.741, adj=0.369, (0 split)
## 
## Node number 2: 2191 observations,    complexity param=0.02899591
##   mean=3904.995, MSE=222407.2 
##   left son=4 (1581 obs) right son=5 (610 obs)
##   Primary splits:
##       preventable_hospital_stays.hosp__rate                                     < 57.235     to the left,  improve=0.1327627, (49 missing)
##       preventable_hospital_stays_ambulatory_care_sensitive_conditions.acsc_rate < 65.25673   to the left,  improve=0.1263855, (8 missing)
##       social_associations.association_rate                                      < 6.930659   to the left,  improve=0.1197737, (15 missing)
##       adult_smoking.pct_smokers                                                 < 16.865     to the left,  improve=0.1150870, (65 missing)
##       unemployed.ratio                                                          < 0.0837066  to the left,  improve=0.1117071, (0 missing)
##   Surrogate splits:
##       preventable_hospital_stays_ambulatory_care_sensitive_conditions.acsc_rate < 65.89869   to the left,  agree=0.912, adj=0.686, (41 split)
##       physical_inactivity.pct_physically_inactive                               < 21.70357   to the left,  agree=0.792, adj=0.259, (8 split)
##       married.pct                                                               < 0.2916978  to the right, agree=0.780, adj=0.215, (0 split)
##       health_care_costs                                                         < 11703.18   to the left,  agree=0.765, adj=0.163, (0 split)
##       children_in_single_parent_households.pct                                  < 37.56781   to the left,  agree=0.763, adj=0.155, (0 split)
## 
## Node number 3: 12269 observations,    complexity param=0.08146196
##   mean=4574.605, MSE=371714.2 
##   left son=6 (2124 obs) right son=7 (10145 obs)
##   Primary splits:
##       social_associations.association_rate                                      < 8.840807   to the left,  improve=0.15529100, (82 missing)
##       physical_inactivity.pct_physically_inactive                               < 25.31      to the left,  improve=0.14841010, (2 missing)
##       adult_smoking.pct_smokers                                                 < 18.53      to the left,  improve=0.12065990, (927 missing)
##       median_household_income                                                   < 49635      to the right, improve=0.09786335, (2 missing)
##       preventable_hospital_stays_ambulatory_care_sensitive_conditions.acsc_rate < 71.33925   to the left,  improve=0.08241451, (23 missing)
##   Surrogate splits:
##       demographics.pct_not_proficient_in_english < 5.883435   to the right, agree=0.781, adj=0.296, (80 split)
##       long_commute_driving_alone.pct_drives      < 38.65      to the right, agree=0.775, adj=0.279, (0 split)
##       severe_housing_problems.pct                < 20.28085   to the right, agree=0.769, adj=0.260, (0 split)
##       households_with_high_housing_costs.pct     < 39.55      to the right, agree=0.768, adj=0.255, (0 split)
##       air_pollution_ozone_days                   < 23.5       to the right, agree=0.749, adj=0.194, (2 split)
## 
## Node number 4: 1581 observations,    complexity param=0.01460901
##   mean=3797.528, MSE=193246.5 
##   left son=8 (1052 obs) right son=9 (529 obs)
##   Primary splits:
##       other_primary_care_providers.pcp_rate < 68.03784   to the left,  improve=0.10737600, (9 missing)
##       social_associations.association_rate  < 10.46158   to the left,  improve=0.09835927, (7 missing)
##       long_commute_driving_alone.pct_drives < 33.15      to the right, improve=0.09229659, (7 missing)
##       college_degrees.pct                   < 51.61495   to the right, improve=0.09088646, (21 missing)
##       pct_illiterate                        < 10.75      to the right, improve=0.09075853, (17 missing)
##   Surrogate splits:
##       social_associations.association_rate     < 10.96351   to the left,  agree=0.795, adj=0.235, (2 split)
##       primary_care_provider_rate.pcp           < 204.0669   to the left,  agree=0.791, adj=0.221, (3 split)
##       high_school_graduation.pct_afgr          < 69.2       to the right, agree=0.787, adj=0.206, (0 split)
##       children_in_single_parent_households.pct < 37.81873   to the left,  agree=0.787, adj=0.205, (4 split)
##       chlamydia_rate.rates_per_100000          < 527.2      to the left,  agree=0.786, adj=0.202, (0 split)
## 
## Node number 5: 610 observations,    complexity param=0.01596715
##   mean=4180.887, MSE=191503.4 
##   left son=10 (77 obs) right son=11 (533 obs)
##   Primary splits:
##       social_associations.association_rate < 6.120965   to the left,  improve=0.3043597, (8 missing)
##       pct_illiterate                       < 19.85      to the right, improve=0.2753760, (12 missing)
##       severe_housing_problems.pct          < 24.45407   to the right, improve=0.2477433, (1 missing)
##       married.pct                          < 0.2845878  to the left,  improve=0.2471087, (0 missing)
##       mammography_screening.rate           < 52.862     to the left,  improve=0.2443286, (55 missing)
##   Surrogate splits:
##       alcohol_impaired_driving_deaths.pct    < 21.16731   to the left,  agree=0.936, adj=0.774, (7 split)
##       pct_illiterate                         < 20.55      to the right, agree=0.936, adj=0.773, (0 split)
##       mammography_screening.rate             < 57.47445   to the left,  agree=0.928, adj=0.744, (0 split)
##       hiv_prevalence.rate                    < 924.75     to the right, agree=0.925, adj=0.733, (0 split)
##       households_with_high_housing_costs.pct < 49         to the right, agree=0.924, adj=0.731, (0 split)
## 
## Node number 6: 2124 observations,    complexity param=0.02335073
##   mean=4217.313, MSE=285054.4 
##   left son=12 (818 obs) right son=13 (1306 obs)
##   Primary splits:
##       adult_smoking.pct_smokers                                                 < 20.255     to the left,  improve=0.1855976, (148 missing)
##       drug_poisoning_deaths.mortality_rate                                      < 13.80764   to the left,  improve=0.1685312, (428 missing)
##       preventable_hospital_stays_ambulatory_care_sensitive_conditions.acsc_rate < 73.69278   to the left,  improve=0.1561667, (7 missing)
##       physical_inactivity.pct_physically_inactive                               < 26.05      to the left,  improve=0.1514961, (0 missing)
##       uninsured_children.pct                                                    < 9.676387   to the right, improve=0.1483545, (2 missing)
##   Surrogate splits:
##       injury_deaths.death_rate                    < 62.25408   to the left,  agree=0.811, adj=0.473, (134 split)
##       diabetes.pct_diabetic                       < 10.22      to the left,  agree=0.787, adj=0.407, (14 split)
##       physical_inactivity.pct_physically_inactive < 25.72      to the left,  agree=0.787, adj=0.407, (0 split)
##       preventable_hospital_stays.hosp__rate       < 66.635     to the left,  agree=0.782, adj=0.394, (0 split)
##       drug_poisoning_deaths.mortality_rate        < 17.07774   to the left,  agree=0.763, adj=0.340, (0 split)
## 
## Node number 7: 10145 observations,    complexity param=0.02555312
##   mean=4736.444, MSE=326952 
##   left son=14 (1586 obs) right son=15 (8559 obs)
##   Primary splits:
##       physical_inactivity.pct_physically_inactive                               < 24.57333   to the left,  improve=0.08058856, (2 missing)
##       preventable_hospital_stays.hosp__rate                                     < 59.15      to the left,  improve=0.07578943, (197 missing)
##       preventable_hospital_stays_ambulatory_care_sensitive_conditions.acsc_rate < 65.798     to the left,  improve=0.06706655, (16 missing)
##       median_household_income                                                   < 51800      to the right, improve=0.06420144, (2 missing)
##       income_inequality.ratio                                                   < 4.298682   to the left,  improve=0.05959308, (63 missing)
##   Surrogate splits:
##       diabetes.pct_diabetic   < 8.73       to the left,  agree=0.814, adj=0.227, (0 split)
##       college_degrees.pct     < 28.24019   to the right, agree=0.795, adj=0.149, (2 split)
##       some_college.pct        < 71.51324   to the right, agree=0.793, adj=0.142, (0 split)
##       health_care_costs       < 7991.533   to the left,  agree=0.788, adj=0.123, (0 split)
##       median_household_income < 56084      to the right, agree=0.788, adj=0.123, (0 split)
## 
## Node number 8: 1052 observations,    complexity param=0.01089564
##   mean=3710.324, MSE=164243.6 
##   left son=16 (508 obs) right son=17 (544 obs)
##   Primary splits:
##       unemployed.ratio                          < 0.06980548 to the left,  improve=0.12878740, (0 missing)
##       social_associations.association_rate      < 10.54073   to the left,  improve=0.10612200, (7 missing)
##       college_degrees.pct                       < 50.56903   to the right, improve=0.08444942, (17 missing)
##       drinking_water_violations.pct_pop_in_viol < 17.1353    to the left,  improve=0.08253329, (32 missing)
##       sahie.pct.uninsured                       < 20.43      to the right, improve=0.08057780, (44 missing)
##   Surrogate splits:
##       demographics.pct_not_proficient_in_english < 15.46075   to the left,  agree=0.688, adj=0.350, (0 split)
##       diabetic_screening.pct_hba1c               < 79.23486   to the right, agree=0.686, adj=0.346, (0 split)
##       single_parent_households.pct               < 11.37577   to the right, agree=0.644, adj=0.258, (0 split)
##       liquor_store_density.rate                  < 1.547533   to the right, agree=0.643, adj=0.256, (0 split)
##       low_birthweight.pct_lbw                    < 6.502618   to the right, agree=0.639, adj=0.249, (0 split)
## 
## Node number 9: 529 observations,    complexity param=0.01018293
##   mean=4035.557, MSE=194997.1 
##   left son=18 (22 obs) right son=19 (507 obs)
##   Primary splits:
##       adult_obesity.pct_obese     < 16.41775   to the left,  improve=0.2767254, (0 missing)
##       college_degrees.pct         < 47.85632   to the right, improve=0.2677454, (4 missing)
##       pct_illiterate              < 16.75      to the right, improve=0.2669924, (7 missing)
##       severe_housing_problems.pct < 24.94777   to the right, improve=0.2583796, (0 missing)
##       some_college.pct            < 80.3951    to the right, improve=0.2563665, (0 missing)
##   Surrogate splits:
##       college_degrees.pct                          < 51.87942   to the right, agree=0.966, adj=0.718, (0 split)
##       long_commute_driving_alone.pct_drives        < 44.7       to the right, agree=0.964, adj=0.697, (0 split)
##       motor_vehicle_crash_deaths.mv_mortality_rate < 4.888512   to the left,  agree=0.964, adj=0.695, (0 split)
##       injury_deaths.death_rate                     < 33.46756   to the left,  agree=0.964, adj=0.694, (0 split)
##       limited_access_to_healthy_foods.pct          < 0.005      to the left,  agree=0.963, adj=0.686, (0 split)
## 
## Node number 10: 77 observations
##   mean=3796.091, MSE=65375.53 
## 
## Node number 11: 533 observations,    complexity param=0.006865151
##   mean=4332.245, MSE=159964.1 
##   left son=22 (369 obs) right son=23 (164 obs)
##   Primary splits:
##       unemployed.ratio                                                          < 0.09917085 to the left,  improve=0.2181193, (0 missing)
##       preventable_hospital_stays_ambulatory_care_sensitive_conditions.acsc_rate < 78.05615   to the left,  improve=0.1457259, (2 missing)
##       preventable_hospital_stays.hosp__rate                                     < 70.95      to the left,  improve=0.1359340, (16 missing)
##       long_commute_driving_alone.pct_drives                                     < 36.65      to the right, improve=0.1353127, (2 missing)
##       adult_smoking.pct_smokers                                                 < 18.32      to the left,  improve=0.1302174, (36 missing)
##   Surrogate splits:
##       single_parent_households.pct                                              < 10.69067   to the right, agree=0.770, adj=0.344, (0 split)
##       liquor_store_density.rate                                                 < 2.676875   to the right, agree=0.766, adj=0.334, (0 split)
##       diabetic_screening.pct_hba1c                                              < 80.48071   to the right, agree=0.743, adj=0.267, (0 split)
##       taxcredits.ratio                                                          < 0.3321441  to the left,  agree=0.736, adj=0.249, (0 split)
##       preventable_hospital_stays_ambulatory_care_sensitive_conditions.acsc_rate < 80.5505    to the left,  agree=0.720, adj=0.201, (0 split)
## 
## Node number 12: 818 observations,    complexity param=0.00873183
##   mean=4044.027, MSE=205480.2 
##   left son=24 (166 obs) right son=25 (652 obs)
##   Primary splits:
##       primary_care_provider_rate.pcp < 42.13645   to the left,  improve=0.1551356, (19 missing)
##       wages.avg                      < 22.34552   to the left,  improve=0.1514776, (0 missing)
##       income_inequality.ratio        < 3.763576   to the left,  improve=0.1505764, (12 missing)
##       mammography_screening.pct      < 77.99824   to the right, improve=0.1496536, (4 missing)
##       dentists.dentist_rate          < 27.35067   to the left,  improve=0.1463942, (0 missing)
##   Surrogate splits:
##       primary_care_physicians.pcp_rate < 23.01542   to the left,  agree=0.990, adj=0.747, (17 split)
##       wages.avg                        < 22.63266   to the left,  agree=0.971, adj=0.286, (2 split)
##       taxcredits.ratio                 < 0.2407121  to the left,  agree=0.970, adj=0.278, (0 split)
##       mammography_screening.pct        < 79.65746   to the right, agree=0.970, adj=0.274, (0 split)
##       some_college.pct                 < 39.54731   to the left,  agree=0.970, adj=0.264, (0 split)
## 
## Node number 13: 1306 observations,    complexity param=0.01120156
##   mean=4524.133, MSE=278642.7 
##   left son=26 (476 obs) right son=27 (830 obs)
##   Primary splits:
##       income_inequality.ratio             < 4.406645   to the left,  improve=0.2540602, (7 missing)
##       uninsured_children.pct              < 7.900187   to the right, improve=0.2341967, (1 missing)
##       limited_access_to_healthy_foods.pct < 4.329132   to the right, improve=0.2113412, (1 missing)
##       low_birthweight.pct_lbw             < 9.35       to the left,  improve=0.2035723, (0 missing)
##       income_inequality.gini              < 44.55      to the left,  improve=0.1957590, (2 missing)
##   Surrogate splits:
##       income_inequality.gini               < 43.95      to the left,  agree=0.820, adj=0.604, (7 split)
##       homicide_rate                        < 5.6933     to the left,  agree=0.742, adj=0.432, (0 split)
##       low_birthweight.pct_lbw              < 8.515      to the left,  agree=0.739, adj=0.426, (0 split)
##       children_eligible_for_free_lunch.pct < 50.02899   to the left,  agree=0.729, adj=0.405, (0 split)
##       uninsured_children.pct               < 7.769359   to the right, agree=0.715, adj=0.373, (0 split)
## 
## Node number 14: 1586 observations
##   mean=4448.509, MSE=213831.7 
## 
## Node number 15: 8559 observations,    complexity param=0.009458984
##   mean=4827.972, MSE=328178.9 
##   left son=30 (4422 obs) right son=31 (4137 obs)
##   Primary splits:
##       access_to_parks.pct_park              < 13.5       to the left,  improve=0.03883445, (1011 missing)
##       income_inequality.ratio               < 4.298921   to the left,  improve=0.03862518, (58 missing)
##       preventable_hospital_stays.hosp__rate < 61.535     to the left,  improve=0.03624648, (168 missing)
##       diabetic_screening.pct_hba1c          < 82.83689   to the right, improve=0.03508955, (3 missing)
##       median_household_income               < 49684      to the right, improve=0.03481088, (2 missing)
##   Surrogate splits:
##       motor_vehicle_crash_deaths.mv_mortality_rate < 17.56553   to the right, agree=0.753, adj=0.299, (831 split)
##       sahie.pct.uninsured                          < 15.18583   to the right, agree=0.750, adj=0.291, (60 split)
##       access_to_exercise_opportunities.pct_with    < 58.20196   to the left,  agree=0.747, adj=0.282, (109 split)
##       dentists.dentist_rate                        < 30.88099   to the left,  agree=0.742, adj=0.268, (8 split)
##       some_college.pct                             < 51.85744   to the left,  agree=0.735, adj=0.247, (0 split)
## 
## Node number 16: 508 observations,    complexity param=0.005400564
##   mean=3570.458, MSE=159965.1 
##   left son=32 (54 obs) right son=33 (454 obs)
##   Primary splits:
##       mammography_screening.pct        < 73.84467   to the right, improve=0.1188342, (2 missing)
##       sahie.pct.uninsured              < 20.1875    to the right, improve=0.1161690, (29 missing)
##       some_college.pct                 < 52.90441   to the left,  improve=0.1079978, (4 missing)
##       access_to_healthy_foods.pct_food < 61.5828    to the right, improve=0.1025952, (4 missing)
##       median_household_income          < 85933      to the right, improve=0.1001463, (0 missing)
##   Surrogate splits:
##       mammography_screening.rate      < 72.4327    to the right, agree=0.947, adj=0.558, (0 split)
##       diabetes.pct_diabetic           < 9.85       to the right, agree=0.932, adj=0.431, (2 split)
##       fast_food_restaurants.pct_foods < 37.77769   to the left,  agree=0.920, adj=0.329, (0 split)
##       some_college.pct                < 52.90441   to the left,  agree=0.916, adj=0.296, (0 split)
##       sahie.pct.uninsured             < 20.1875    to the right, agree=0.913, adj=0.270, (0 split)
## 
## Node number 17: 544 observations,    complexity param=0.005735774
##   mean=3861.558, MSE=124845.6 
##   left son=34 (183 obs) right son=35 (361 obs)
##   Primary splits:
##       social_associations.association_rate                < 7.337901   to the left,  improve=0.1858073, (3 missing)
##       severe_housing_problems.pct                         < 18.44201   to the right, improve=0.1576965, (0 missing)
##       could_not_see_doctor_due_to_cost.pct_couldnt_access < 9.15       to the right, improve=0.1288276, (16 missing)
##       violent_crime.rate                                  < 204.1354   to the right, improve=0.1274380, (8 missing)
##       uninsured_children.pct                              < 4.356934   to the right, improve=0.1272138, (0 missing)
##   Surrogate splits:
##       access_to_recreational_facilities.rec_fac_rate < 12.51896   to the left,  agree=0.866, adj=0.582, (3 split)
##       teen_birth_rate                                < 24.45      to the right, agree=0.864, adj=0.579, (0 split)
##       children_eligible_for_free_lunch.pct           < 26.18888   to the right, agree=0.863, adj=0.574, (0 split)
##       pct_illiterate                                 < 10.55      to the right, agree=0.862, adj=0.572, (0 split)
##       sexually_transmitted_infections.chlamydia_rate < 259.9      to the right, agree=0.855, adj=0.548, (0 split)
## 
## Node number 18: 22 observations
##   mean=3404.182, MSE=92072.08 
## 
## Node number 19: 507 observations
##   mean=4121.023, MSE=147664.4 
## 
## Node number 22: 369 observations
##   mean=4194.878, MSE=134245 
## 
## Node number 23: 164 observations
##   mean=4586.246, MSE=108113 
## 
## Node number 24: 166 observations,    complexity param=0.006444455
##   mean=3192.941, MSE=1044677 
##   left son=48 (34 obs) right son=49 (132 obs)
##   Primary splits:
##       physical_inactivity.pct_physically_inactive                               < 23.95      to the left,  improve=0.5337147, (0 missing)
##       mammography_screening.rate                                                < 66.92715   to the right, improve=0.5124461, (27 missing)
##       could_not_see_doctor_due_to_cost.pct_couldnt_access                       < 12.95      to the left,  improve=0.4393661, (45 missing)
##       income_inequality.ratio                                                   < 3.949739   to the left,  improve=0.4078268, (8 missing)
##       preventable_hospital_stays_ambulatory_care_sensitive_conditions.acsc_rate < 68.46535   to the left,  improve=0.4046166, (1 missing)
##   Surrogate splits:
##       preventable_hospital_stays_ambulatory_care_sensitive_conditions.acsc_rate < 68.46535   to the left,  agree=0.951, adj=0.849, (0 split)
##       sahie.pct.uninsured                                                       < 20.575     to the left,  agree=0.925, adj=0.770, (0 split)
##       mammography_screening.pct                                                 < 61.58763   to the right, agree=0.906, adj=0.710, (0 split)
##       social_associations.association_rate                                      < 5.848229   to the left,  agree=0.899, adj=0.690, (0 split)
##       college_degrees.pct                                                       < 12.44625   to the right, agree=0.891, adj=0.666, (0 split)
## 
## Node number 25: 652 observations,    complexity param=0.005345174
##   mean=4080.581, MSE=136990.8 
##   left son=50 (78 obs) right son=51 (574 obs)
##   Primary splits:
##       injury_deaths.death_rate              < 38.98509   to the left,  improve=0.1449049, (8 missing)
##       adult_smoking.pct_smokers             < 14.795     to the left,  improve=0.1222545, (29 missing)
##       mentally_unhealthy_days               < 3.315      to the left,  improve=0.1105845, (24 missing)
##       long_commute_driving_alone.pct_drives < 44.3       to the right, improve=0.1104462, (4 missing)
##       dentists.dentist_rate                 < 62.13478   to the left,  improve=0.1034940, (0 missing)
##   Surrogate splits:
##       health_care_costs                    < 12092.46   to the right, agree=0.923, adj=0.238, (8 split)
##       children_in_poverty.pct              < 44.55      to the right, agree=0.916, adj=0.171, (0 split)
##       sahie.pct.uninsured                  < 30.11667   to the right, agree=0.915, adj=0.160, (0 split)
##       drug_poisoning_deaths.mortality_rate < 4.981702   to the left,  agree=0.914, adj=0.144, (0 split)
##       income_inequality.gini               < 48.75      to the right, agree=0.912, adj=0.127, (0 split)
## 
## Node number 26: 476 observations
##   mean=4233.081, MSE=186597.7 
## 
## Node number 27: 830 observations
##   mean=4766.921, MSE=225814.2 
## 
## Node number 30: 4422 observations,    complexity param=0.009458984
##   mean=4683.863, MSE=409354.8 
##   left son=60 (1244 obs) right son=61 (3178 obs)
##   Primary splits:
##       preventable_hospital_stays.hosp__rate                                     < 63.9       to the left,  improve=0.08601551, (82 missing)
##       income_inequality.ratio                                                   < 4.500345   to the left,  improve=0.08190500, (32 missing)
##       preventable_hospital_stays_ambulatory_care_sensitive_conditions.acsc_rate < 91.45407   to the left,  improve=0.08153864, (10 missing)
##       physical_inactivity.pct_physically_inactive                               < 29.23      to the left,  improve=0.07034809, (1 missing)
##       median_household_income                                                   < 45907.5    to the right, improve=0.06694624, (1 missing)
##   Surrogate splits:
##       preventable_hospital_stays_ambulatory_care_sensitive_conditions.acsc_rate < 70.2685    to the left,  agree=0.859, adj=0.587, (73 split)
##       health_care_costs                                                         < 9180.875   to the left,  agree=0.752, adj=0.275, (9 split)
##       college_degrees.pct                                                       < 19.7649    to the right, agree=0.733, adj=0.217, (0 split)
##       physical_inactivity.pct_physically_inactive                               < 27.2619    to the left,  agree=0.721, adj=0.183, (0 split)
##       mammography_screening.pct                                                 < 69.19766   to the right, agree=0.712, adj=0.156, (0 split)
## 
## Node number 31: 4137 observations,    complexity param=0.007215268
##   mean=4915.069, MSE=258980.4 
##   left son=62 (480 obs) right son=63 (3657 obs)
##   Primary splits:
##       median_household_income                     < 51848      to the right, improve=0.06074987, (1 missing)
##       physical_inactivity.pct_physically_inactive < 28.9       to the left,  improve=0.05055453, (1 missing)
##       wages.avg                                   < 34.18568   to the right, improve=0.05041817, (0 missing)
##       social_associations.association_rate        < 21.35227   to the left,  improve=0.04270936, (26 missing)
##       injury_deaths.death_rate                    < 65.45651   to the left,  improve=0.04001469, (85 missing)
##   Surrogate splits:
##       children_in_poverty.pct   < 14.05      to the left,  agree=0.914, adj=0.253, (1 split)
##       wages.avg                 < 47.68349   to the right, agree=0.906, adj=0.182, (0 split)
##       pct_illiterate            < 6.55       to the left,  agree=0.903, adj=0.164, (0 split)
##       adjusted.gross.income.avg < 68.57197   to the right, agree=0.898, adj=0.114, (0 split)
##       some_college.pct          < 76.01044   to the right, agree=0.895, adj=0.088, (0 split)
## 
## Node number 32: 54 observations
##   mean=3195.912, MSE=191489.3 
## 
## Node number 33: 454 observations,    complexity param=0.005400564
##   mean=3621.231, MSE=134097.2 
##   left son=66 (70 obs) right son=67 (384 obs)
##   Primary splits:
##       median_household_income               < 80853.5    to the right, improve=0.1807546, (0 missing)
##       college_degrees.pct                   < 50.56903   to the right, improve=0.1759987, (13 missing)
##       long_commute_driving_alone.pct_drives < 49.2       to the right, improve=0.1625147, (4 missing)
##       limited_access_to_healthy_foods.pct   < 2.176528   to the left,  improve=0.1452561, (0 missing)
##       injury_deaths.death_rate              < 42.57526   to the left,  improve=0.1432071, (10 missing)
##   Surrogate splits:
##       college_degrees.pct                          < 46.55999   to the right, agree=0.951, adj=0.691, (0 split)
##       food_environment_index                       < 9.085718   to the right, agree=0.924, adj=0.519, (0 split)
##       motor_vehicle_crash_deaths.mv_mortality_rate < 5.846693   to the left,  agree=0.923, adj=0.514, (0 split)
##       injury_deaths.death_rate                     < 34.98867   to the left,  agree=0.921, adj=0.504, (0 split)
##       wages.avg                                    < 69.66999   to the right, agree=0.920, adj=0.498, (0 split)
## 
## Node number 34: 183 observations
##   mean=3756.736, MSE=72419.48 
## 
## Node number 35: 361 observations
##   mean=4082.653, MSE=163366.6 
## 
## Node number 48: 34 observations
##   mean=2674.206, MSE=191757.7 
## 
## Node number 49: 132 observations
##   mean=4267.786, MSE=1099119 
## 
## Node number 50: 78 observations
##   mean=3659.823, MSE=112477.7 
## 
## Node number 51: 574 observations
##   mean=4127.786, MSE=117650.5 
## 
## Node number 60: 1244 observations
##   mean=4424.326, MSE=391618.7 
## 
## Node number 61: 3178 observations
##   mean=4818.483, MSE=365492.9 
## 
## Node number 62: 480 observations
##   mean=4567.829, MSE=221241.2 
## 
## Node number 63: 3657 observations
##   mean=4960.38, MSE=246117.8 
## 
## Node number 66: 70 observations
##   mean=3263.334, MSE=113612.6 
## 
## Node number 67: 384 observations
##   mean=3688.956, MSE=109148.2 
## 
## n= 14460 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 14460 87348740000000 4300.236  
##    2) adult_obesity.pct_obese< 26.35 2191 18998860000000 3904.995  
##      4) preventable_hospital_stays.hosp__rate< 57.235 1581 11880200000000 3797.528  
##        8) other_primary_care_providers.pcp_rate< 68.03784 1052  7389859000000 3710.324  
##         16) unemployed.ratio< 0.06980548 508  3739209000000 3570.458  
##           32) mammography_screening.pct>=73.84467 54   534334800000 3195.912 *
##           33) mammography_screening.pct< 73.84467 454  2760356000000 3621.231  
##             66) median_household_income>=80853.5 70   372134300000 3263.334 *
##             67) median_household_income< 80853.5 384  1889275000000 3688.956 *
##         17) unemployed.ratio>=0.06980548 544  2698930000000 3861.558  
##           34) social_associations.association_rate< 7.337901 183  1062052000000 3756.736 *
##           35) social_associations.association_rate>=7.337901 361  1135865000000 4082.653 *
##        9) other_primary_care_providers.pcp_rate>=68.03784 529  3214257000000 4035.557  
##         18) adult_obesity.pct_obese< 16.41775 22   180945500000 3404.182 *
##         19) adult_obesity.pct_obese>=16.41775 507  2143845000000 4121.023 *
##      5) preventable_hospital_stays.hosp__rate>=57.235 610  4585905000000 4180.887  
##       10) social_associations.association_rate< 6.120965 77   441955100000 3796.091 *
##       11) social_associations.association_rate>=6.120965 533  2749240000000 4332.245  
##         22) unemployed.ratio< 0.09917085 369  1497403000000 4194.878 *
##         23) unemployed.ratio>=0.09917085 164   652174200000 4586.246 *
##    3) adult_obesity.pct_obese>=26.35 12269 45741900000000 4574.605  
##      6) social_associations.association_rate< 8.840807 2124 10935510000000 4217.313  
##       12) adult_smoking.pct_smokers< 20.255 818  5037650000000 4044.027  
##         24) primary_care_provider_rate.pcp< 42.13645 166  1054712000000 3192.941  
##           48) physical_inactivity.pct_physically_inactive< 23.95 34   130579900000 2674.206 *
##           49) physical_inactivity.pct_physically_inactive>=23.95 132   361216700000 4267.786 *
##         25) primary_care_provider_rate.pcp>=42.13645 652  3220224000000 4080.581  
##           50) injury_deaths.death_rate< 38.98509 78   266711000000 3659.823 *
##           51) injury_deaths.death_rate>=38.98509 574  2486618000000 4127.786 *
##       13) adult_smoking.pct_smokers>=20.255 1306  3858199000000 4524.133  
##         26) income_inequality.ratio< 4.406645 476  1175063000000 4233.081 *
##         27) income_inequality.ratio>=4.406645 830  1704694000000 4766.921 *
##      7) social_associations.association_rate>=8.840807 10145 27690800000000 4736.444  
##       14) physical_inactivity.pct_physically_inactive< 24.57333 1586  4368257000000 4448.509 *
##       15) physical_inactivity.pct_physically_inactive>=24.57333 8559 21090510000000 4827.972  
##         30) access_to_parks.pct_park< 13.5 4422  9910143000000 4683.863  
##           60) preventable_hospital_stays.hosp__rate< 63.9 1244  3238057000000 4424.326 *
##           61) preventable_hospital_stays.hosp__rate>=63.9 3178  5826245000000 4818.483 *
##         31) access_to_parks.pct_park>=13.5 4137 10373750000000 4915.069  
##           62) median_household_income>=51848 480  1022939000000 4567.829 *
##           63) median_household_income< 51848 3657  8720562000000 4960.380 *
## co-linear variables to be ignored: pct_of_children_eligible_for_free_lunch teen_births.birth_rate homicides.homicide_rate adjusted.gross.income.avg sexually_transmitted_infections.rates_per_100000 excessive_drinking.pct high_housing_costs.pct violent_crime_rate hiv_prevalence_rate hiv_rate commuting_alone.pct_drive

## ADULT cp=0.005

## ADULT  tree depth is 5 
## n= 12675 
## 
## node), split, n, yval
##       * denotes terminal node
## 
##  1) root Counties:12675 Death.per.100k:360  
##    2) motor_vehicle_crash_deaths.mv_mortality_rate< 17 Counties:5416 Death.per.100k:330  
##      4) injury_deaths.death_rate< 54 Counties:1843 Death.per.100k:270  
##        8) adult_obesity.pct_obese< 25 Counties:512 Death.per.100k:240 *
##        9) adult_obesity.pct_obese>=25 Counties:1331 Death.per.100k:310  
##         18) injury_deaths.death_rate< 39 Counties:155 Death.per.100k:240 *
##         19) injury_deaths.death_rate>=39 Counties:1176 Death.per.100k:320  
##           38) physical_inactivity.pct_physically_inactive< 25 Counties:670 Death.per.100k:310 *
##           39) physical_inactivity.pct_physically_inactive>=25 Counties:506 Death.per.100k:370 *
##      5) injury_deaths.death_rate>=54 Counties:3573 Death.per.100k:390  
##       10) diabetes.pct_diabetic< 10 Counties:1907 Death.per.100k:350  
##         20) wages.avg>=36 Counties:896 Death.per.100k:330  
##           40) physical_inactivity.pct_physically_inactive< 20 Counties:248 Death.per.100k:300 *
##           41) physical_inactivity.pct_physically_inactive>=20 Counties:648 Death.per.100k:350 *
##         21) wages.avg< 36 Counties:1011 Death.per.100k:410  
##           42) chlamydia_rate.rates_per_100000>=2.5e+02 Counties:355 Death.per.100k:380 *
##           43) chlamydia_rate.rates_per_100000< 2.5e+02 Counties:656 Death.per.100k:490 *
##       11) diabetes.pct_diabetic>=10 Counties:1666 Death.per.100k:460  
##         22) median_household_income>=4.1e+04 Counties:999 Death.per.100k:440 *
##         23) median_household_income< 4.1e+04 Counties:667 Death.per.100k:540 *
##    3) motor_vehicle_crash_deaths.mv_mortality_rate>=17 Counties:7259 Death.per.100k:610  
##      6) median_household_income>=3.9e+04 Counties:3495 Death.per.100k:530  
##       12) mental_health_providers.mph_rate>=0.6 Counties:2320 Death.per.100k:510  
##         24) access_to_recreational_facilities.rec_fac_rate>=4.5 Counties:1804 Death.per.100k:500 *
##         25) access_to_recreational_facilities.rec_fac_rate< 4.5 Counties:516 Death.per.100k:650 *
##       13) mental_health_providers.mph_rate< 0.6 Counties:1175 Death.per.100k:710 *
##      7) median_household_income< 3.9e+04 Counties:3764 Death.per.100k:740  
##       14) access_to_recreational_facilities.rec_fac_rate>=0.33 Counties:2407 Death.per.100k:700  
##         28) motor_vehicle_crash_deaths.mv_mortality_rate< 21 Counties:677 Death.per.100k:620 *
##         29) motor_vehicle_crash_deaths.mv_mortality_rate>=21 Counties:1730 Death.per.100k:770 *
##       15) access_to_recreational_facilities.rec_fac_rate< 0.33 Counties:1357 Death.per.100k:970 *
## ADULT all data 
##    node counties deaths.pred deaths.act   age.pop dr100k.fit dr100k.group
## 1   236      191       71400      71231  30253811   236.0030     235.4447
## 2   241      628      628286     622234 260634916   241.0600     238.7378
## 3   300      319      222774     223156  74311496   299.7840     300.2981
## 4   309      870      466598     464074 151008722   308.9876     307.3160
## 5   352      800      463522     464005 131596720   352.2293     352.5962
## 6   374      653      162150     162096  43327334   374.2434     374.1195
## 7   383      438      145181     145163  37915070   382.9116     382.8636
## 8   436     1258      492113     491294 112942122   435.7212     434.9963
## 9   491      799       69216      69439  14109818   490.5498     492.1325
## 10  499     2277      355343     356459  71172707   499.2683     500.8366
## 11  539      840      203304     202061  37697675   539.3013     536.0039
## 12  621      848      114734     114954  18471374   621.1452     622.3359
## 13  650      632       42934      42128   6605965   649.9298     637.7267
## 14  714     1460       59280      59331   8303426   713.9179     714.5364
## 15  767     2141      192141     190801  25052181   766.9651     761.6143
## 16  975     1679       64271      64459   6593752   974.7333     977.5770
## ADULT test data 
##    node counties deaths.pred deaths.act  age.pop dr100k.fit dr100k.group
## 1   236       36        9671       9547  4097974   236.0030     232.9688
## 2   241      116      126929     124353 52654624   241.0600     236.1673
## 3   300       71       58548      58943 19530162   299.7840     301.8050
## 4   309      201      100354      97849 32478198   308.9876     301.2760
## 5   352      152       85822      86305 24365446   352.2293     354.2106
## 6   374      147       35972      35918  9611845   374.2434     373.6848
## 7   383       83       24101      24083  6294194   382.9116     382.6225
## 8   436      259       96444      95625 22134275   435.7212     432.0223
## 9   491      142       10734      10949  2188245   490.5498     500.3553
## 10  499      473       70386      71502 14097781   499.2683     507.1862
## 11  539      173       54478      53215 10101556   539.3013     526.8000
## 12  621      171       22225      22443  3578062   621.1452     627.2390
## 13  650      116        8625       7815  1327054   649.9298     588.8984
## 14  714      285       11498      11541  1610491   713.9179     716.6138
## 15  767      411       40411      39057  5268944   766.9651     741.2681
## 16  975      322       12365      12455  1268526   974.7333     981.8482

## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## motor_vehicle_crash_deaths.mv_mortality_rate slope= 20 r^2= 0.56

## diabetes.pct_diabetic slope= 58 r^2= 0.47

## physical_inactivity.pct_physically_inactive slope= 21 r^2= 0.44

## injury_deaths.death_rate slope= 6.4 r^2= 0.54

## median_household_income slope= -0.0068 r^2= 0.35

## mental_health_providers.mph_rate slope= -1.6 r^2= 0.15

## adult_obesity.pct_obese slope= 21 r^2= 0.37

## wages.avg slope= -7.7 r^2= 0.34

## access_to_recreational_facilities.rec_fac_rate slope= -11 r^2= 0.097

## chlamydia_rate.rates_per_100000 slope= 0.049 r^2= 0.0049

## ADULT  all 
## rmse deaths= 70.36103 
## rmse Deaths by priori= 233.1597 
## tree Deaths rsq= 0.9872463 
## prior Deaths rsq= 0.9451815 
## fitted Death.per.100k rmse= 261.9283 
## weighted Death.per.100k rmse= 79.12766 
## fitted Death.per.100k rsq= 0.5002931 
## Call:
## rpart(formula = ezformula(c(yvar, predictorVars)), data = d[trainset, 
##     ], weights = d$Population[trainset], control = rpart.control(cp = 0.005))
##   n= 12675 
## 
##             CP nsplit rel error    xerror          xstd
## 1  0.387573355      0 1.0000000 1.0008092 0.00011610187
## 2  0.126231042      1 0.6124266 0.6132801 0.00007317471
## 3  0.057157247      2 0.4861956 0.4890044 0.00006973413
## 4  0.046117609      3 0.4290384 0.4341584 0.00006219423
## 5  0.024064699      4 0.3829207 0.3883235 0.00006026314
## 6  0.017480628      5 0.3588560 0.3665534 0.00005974281
## 7  0.012759840      6 0.3413754 0.3493192 0.00005554479
## 8  0.011760296      7 0.3286156 0.3337649 0.00005244683
## 9  0.010781808      8 0.3168553 0.3279402 0.00005172415
## 10 0.009357202      9 0.3060735 0.3185148 0.00005053259
## 11 0.008837985     10 0.2967163 0.3148207 0.00005004606
## 12 0.005789113     11 0.2878783 0.3025517 0.00004914691
## 13 0.005680555     12 0.2820892 0.2914575 0.00004784434
## 14 0.005194775     13 0.2764086 0.2847439 0.00004720771
## 15 0.005165097     14 0.2712138 0.2801932 0.00004677474
## 16 0.005000000     15 0.2660487 0.2783297 0.00004671009
## 
## Variable importance
##   motor_vehicle_crash_deaths.mv_mortality_rate 
##                                             25 
##                          diabetes.pct_diabetic 
##                                              8 
##    physical_inactivity.pct_physically_inactive 
##                                              8 
##                       injury_deaths.death_rate 
##                                              8 
##                        median_household_income 
##                                              7 
##      access_to_exercise_opportunities.pct_with 
##                                              6 
##                            college_degrees.pct 
##                                              6 
##               mental_health_providers.mph_rate 
##                                              6 
##           drug_poisoning_deaths.mortality_rate 
##                                              4 
##                      adult_smoking.pct_smokers 
##                                              4 
##          long_commute_driving_alone.pct_drives 
##                                              3 
##                        adult_obesity.pct_obese 
##                                              2 
##                                      wages.avg 
##                                              2 
##                        children_in_poverty.pct 
##                                              2 
## access_to_recreational_facilities.rec_fac_rate 
##                                              1 
##          preventable_hospital_stays.hosp__rate 
##                                              1 
##       children_in_single_parent_households.pct 
##                                              1 
##                driving_alone_to_work.pct_drive 
##                                              1 
##            limited_access_to_healthy_foods.pct 
##                                              1 
## 
## Node number 1: 12675 observations,    complexity param=0.3875734
##   mean=364.2569, MSE=23564.04 
##   left son=2 (5416 obs) right son=3 (7259 obs)
##   Primary splits:
##       motor_vehicle_crash_deaths.mv_mortality_rate < 16.9804   to the left,  improve=0.3839362, (335 missing)
##       injury_deaths.death_rate                     < 64.29935  to the left,  improve=0.3678195, (12 missing)
##       diabetes.pct_diabetic                        < 10.32364  to the left,  improve=0.3598620, (1 missing)
##       college_degrees.pct                          < 24.84014  to the right, improve=0.3378618, (11 missing)
##       physical_inactivity.pct_physically_inactive  < 28.27444  to the left,  improve=0.3260522, (1 missing)
##   Surrogate splits:
##       access_to_exercise_opportunities.pct_with   < 58.61636  to the right, agree=0.902, adj=0.262, (328 split)
##       physical_inactivity.pct_physically_inactive < 30.30556  to the left,  agree=0.900, adj=0.249, (6 split)
##       college_degrees.pct                         < 14.84097  to the right, agree=0.898, adj=0.235, (1 split)
##       mental_health_providers.mph_rate            < 5.45      to the right, agree=0.898, adj=0.230, (0 split)
##       diabetes.pct_diabetic                       < 12.30385  to the left,  agree=0.892, adj=0.190, (0 split)
## 
## Node number 2: 5416 observations,    complexity param=0.126231
##   mean=326.8133, MSE=10282.81 
##   left son=4 (1843 obs) right son=5 (3573 obs)
##   Primary splits:
##       injury_deaths.death_rate  < 54.15686  to the left,  improve=0.3335287, (6 missing)
##       adult_smoking.pct_smokers < 18.39     to the left,  improve=0.3231496, (104 missing)
##       diabetes.pct_diabetic     < 9.930714  to the left,  improve=0.3215485, (1 missing)
##       median_household_income   < 50667     to the right, improve=0.3023767, (1 missing)
##       college_degrees.pct       < 26.92086  to the right, improve=0.2949028, (5 missing)
##   Surrogate splits:
##       drug_poisoning_deaths.mortality_rate         < 11.73105  to the left,  agree=0.824, adj=0.614, (0 split)
##       long_commute_driving_alone.pct_drives        < 37.25     to the right, agree=0.730, adj=0.407, (0 split)
##       adult_smoking.pct_smokers                    < 18.4      to the left,  agree=0.727, adj=0.401, (3 split)
##       motor_vehicle_crash_deaths.mv_mortality_rate < 10.50784  to the left,  agree=0.722, adj=0.390, (0 split)
##       median_household_income                      < 51456.5   to the right, agree=0.714, adj=0.374, (2 split)
## 
## Node number 3: 7259 observations,    complexity param=0.05715725
##   mean=608.1646, MSE=41454.1 
##   left son=6 (3495 obs) right son=7 (3764 obs)
##   Primary splits:
##       median_household_income                        < 39392     to the right, improve=0.2441317, (0 missing)
##       motor_vehicle_crash_deaths.mv_mortality_rate   < 22.7746   to the left,  improve=0.2214078, (198 missing)
##       mental_health_providers.mph_rate               < 0.6       to the right, improve=0.2134086, (8 missing)
##       college_degrees.pct                            < 13.16847  to the right, improve=0.2039336, (6 missing)
##       access_to_recreational_facilities.rec_fac_rate < 0.6687665 to the right, improve=0.1869118, (1 missing)
##   Surrogate splits:
##       children_in_poverty.pct                     < 28.75     to the left,  agree=0.812, adj=0.489, (0 split)
##       wages.avg                                   < 31.3082   to the right, agree=0.779, adj=0.399, (0 split)
##       diabetes.pct_diabetic                       < 12.60417  to the left,  agree=0.749, adj=0.316, (0 split)
##       physical_inactivity.pct_physically_inactive < 31.15778  to the left,  agree=0.748, adj=0.313, (0 split)
##       college_degrees.pct                         < 12.29639  to the right, agree=0.736, adj=0.280, (0 split)
## 
## Node number 4: 1843 observations,    complexity param=0.0240647
##   mean=273.1774, MSE=4046.38 
##   left son=8 (512 obs) right son=9 (1331 obs)
##   Primary splits:
##       adult_obesity.pct_obese   < 24.95     to the left,  improve=0.2971914, (0 missing)
##       injury_deaths.death_rate  < 38.7985   to the left,  improve=0.2667106, (2 missing)
##       adult_smoking.pct_smokers < 15.59     to the left,  improve=0.2504855, (22 missing)
##       diabetes.pct_diabetic     < 8.304167  to the left,  improve=0.2469641, (1 missing)
##       college_degrees.pct       < 27.24505  to the right, improve=0.2379269, (5 missing)
##   Surrogate splits:
##       limited_access_to_healthy_foods.pct       < 4.517023  to the left,  agree=0.763, adj=0.487, (0 split)
##       adult_smoking.pct_smokers                 < 15.59     to the left,  agree=0.763, adj=0.487, (0 split)
##       diabetes.pct_diabetic                     < 8.12      to the left,  agree=0.747, adj=0.452, (0 split)
##       driving_alone_to_work.pct_drive           < 78.35703  to the left,  agree=0.747, adj=0.452, (0 split)
##       access_to_exercise_opportunities.pct_with < 93.52691  to the right, agree=0.746, adj=0.450, (0 split)
## 
## Node number 5: 3573 observations,    complexity param=0.04611761
##   mean=390.7844, MSE=10197.51 
##   left son=10 (1907 obs) right son=11 (1666 obs)
##   Primary splits:
##       diabetes.pct_diabetic                       < 10.07     to the left,  improve=0.2695403, (0 missing)
##       median_household_income                     < 44720.6   to the right, improve=0.2522379, (0 missing)
##       physical_inactivity.pct_physically_inactive < 26.69286  to the left,  improve=0.2509207, (0 missing)
##       adult_smoking.pct_smokers                   < 19.885    to the left,  improve=0.2129811, (82 missing)
##       wages.avg                                   < 36.07804  to the right, improve=0.2122985, (0 missing)
##   Surrogate splits:
##       physical_inactivity.pct_physically_inactive < 25.72     to the left,  agree=0.794, adj=0.435, (0 split)
##       adult_obesity.pct_obese                     < 29.65     to the left,  agree=0.749, adj=0.314, (0 split)
##       preventable_hospital_stays.hosp__rate       < 65.91     to the left,  agree=0.745, adj=0.302, (0 split)
##       children_in_single_parent_households.pct    < 38.78801  to the left,  agree=0.742, adj=0.295, (0 split)
##       median_household_income                     < 44223.8   to the right, agree=0.729, adj=0.258, (0 split)
## 
## Node number 6: 3495 observations,    complexity param=0.01275984
##   mean=531.5937, MSE=22682.4 
##   left son=12 (2320 obs) right son=13 (1175 obs)
##   Primary splits:
##       mental_health_providers.mph_rate               < 0.6       to the right, improve=0.1573519, (3 missing)
##       access_to_recreational_facilities.rec_fac_rate < 4.476977  to the right, improve=0.1297916, (0 missing)
##       motor_vehicle_crash_deaths.mv_mortality_rate   < 22.77923  to the left,  improve=0.1287799, (88 missing)
##       college_degrees.pct                            < 15.97005  to the right, improve=0.1105543, (0 missing)
##       preventable_hospital_stays.hosp__rate          < 83.89     to the left,  improve=0.1057088, (15 missing)
##   Surrogate splits:
##       primary_care_provider_rate.pcp              < 18.14577  to the right, agree=0.909, adj=0.064, (3 split)
##       physical_inactivity.pct_physically_inactive < 35.35     to the left,  agree=0.906, adj=0.029, (0 split)
##       fast_food_restaurants.pct_foods             < 71.11143  to the left,  agree=0.906, adj=0.027, (0 split)
##       some_college.pct                            < 36.31884  to the right, agree=0.905, adj=0.022, (0 split)
##       married.pct                                 < 0.5272176 to the left,  agree=0.904, adj=0.015, (0 split)
## 
## Node number 7: 3764 observations,    complexity param=0.01748063
##   mean=740.3331, MSE=46267.02 
##   left son=14 (2407 obs) right son=15 (1357 obs)
##   Primary splits:
##       access_to_recreational_facilities.rec_fac_rate < 0.3333333 to the right, improve=0.1823829, (1 missing)
##       mental_health_providers.mph_rate               < 0.6       to the right, improve=0.1766176, (5 missing)
##       motor_vehicle_crash_deaths.mv_mortality_rate   < 22.06412  to the left,  improve=0.1510220, (110 missing)
##       median_household_income                        < 34046.2   to the right, improve=0.1444368, (0 missing)
##       mammography_screening.pct                      < 57.55155  to the right, improve=0.1230853, (1 missing)
##   Surrogate splits:
##       college_degrees.pct                       < 6.874589  to the right, agree=0.875, adj=0.064, (1 split)
##       primary_care_provider_rate.pcp            < 11.9189   to the right, agree=0.875, adj=0.058, (0 split)
##       primary_care_physicians.pcp_rate          < 14.55286  to the right, agree=0.874, adj=0.052, (0 split)
##       fast_food_restaurants.pct_foods           < 72.65396  to the left,  agree=0.873, adj=0.044, (0 split)
##       access_to_exercise_opportunities.pct_with < 14.72906  to the right, agree=0.873, adj=0.044, (0 split)
## 
## Node number 8: 512 observations
##   mean=241.06, MSE=1199.397 
## 
## Node number 9: 1331 observations,    complexity param=0.008837985
##   mean=310.6197, MSE=4760.897 
##   left son=18 (155 obs) right son=19 (1176 obs)
##   Primary splits:
##       injury_deaths.death_rate                    < 38.7985   to the left,  improve=0.2008820, (1 missing)
##       physical_inactivity.pct_physically_inactive < 24.91     to the left,  improve=0.1987558, (1 missing)
##       college_degrees.pct                         < 25.0477   to the right, improve=0.1502515, (0 missing)
##       diabetes.pct_diabetic                       < 8.67      to the left,  improve=0.1355016, (1 missing)
##       demographics.pct_not_proficient_in_english  < 1.621169  to the right, improve=0.1332136, (1 missing)
##   Surrogate splits:
##       health_care_costs                    < 12072.11  to the right, agree=0.885, adj=0.215, (0 split)
##       adult_smoking.pct_smokers            < 13.355    to the left,  agree=0.885, adj=0.215, (0 split)
##       wages.avg                            < 59.61102  to the right, agree=0.872, adj=0.130, (1 split)
##       drug_poisoning_deaths.mortality_rate < 4.566742  to the left,  agree=0.872, adj=0.125, (0 split)
##       children_in_poverty.pct              < 44.05     to the right, agree=0.871, adj=0.122, (0 split)
## 
## Node number 10: 1907 observations,    complexity param=0.01078181
##   mean=350.994, MSE=5827.4 
##   left son=20 (896 obs) right son=21 (1011 obs)
##   Primary splits:
##       wages.avg                            < 36.07804  to the right, improve=0.1737923, (0 missing)
##       social_associations.association_rate < 11.32763  to the left,  improve=0.1654727, (2 missing)
##       median_household_income              < 51408     to the right, improve=0.1521359, (0 missing)
##       college_degrees.pct                  < 26.99055  to the right, improve=0.1425542, (0 missing)
##       adult_smoking.pct_smokers            < 19.985    to the left,  improve=0.1383303, (19 missing)
##   Surrogate splits:
##       median_household_income                   < 44759.7   to the right, agree=0.878, adj=0.424, (0 split)
##       college_degrees.pct                       < 19.6033   to the right, agree=0.830, adj=0.198, (0 split)
##       contributions.ratio                       < 0.7906229 to the right, agree=0.829, adj=0.191, (0 split)
##       physically_unhealthy_days                 < 3.97      to the left,  agree=0.819, adj=0.147, (0 split)
##       access_to_exercise_opportunities.pct_with < 71.1032   to the right, agree=0.818, adj=0.142, (0 split)
## 
## Node number 11: 1666 observations,    complexity param=0.0117603
##   mean=459.8623, MSE=10263.82 
##   left son=22 (999 obs) right son=23 (667 obs)
##   Primary splits:
##       median_household_income < 41145.5   to the right, improve=0.1868458, (0 missing)
##       college_degrees.pct     < 16.23905  to the right, improve=0.1828298, (0 missing)
##       wages.avg               < 33.80492  to the right, improve=0.1601795, (0 missing)
##       some_college.pct        < 59.81483  to the right, improve=0.1462554, (6 missing)
##       taxcredits.ratio        < 0.2816365 to the right, improve=0.1383583, (0 missing)
##   Surrogate splits:
##       income_inequality.ratio                    < 5.705983  to the left,  agree=0.869, adj=0.438, (0 split)
##       inadequate_social_support.pct_no_emotional < 24.038    to the left,  agree=0.868, adj=0.436, (0 split)
##       food_insecurity.pct_insecure               < 21.625    to the left,  agree=0.862, adj=0.407, (0 split)
##       children_in_poverty.pct                    < 33.25     to the left,  agree=0.859, adj=0.395, (0 split)
##       severe_housing_problems.pct                < 22.69124  to the left,  agree=0.854, adj=0.374, (0 split)
## 
## Node number 12: 2320 observations,    complexity param=0.005680555
##   mean=512.0234, MSE=16291.14 
##   left son=24 (1804 obs) right son=25 (516 obs)
##   Primary splits:
##       access_to_recreational_facilities.rec_fac_rate < 4.476977  to the right, improve=0.10797330, (0 missing)
##       motor_vehicle_crash_deaths.mv_mortality_rate   < 20.8635   to the left,  improve=0.09896063, (39 missing)
##       college_degrees.pct                            < 11.69338  to the right, improve=0.08851621, (0 missing)
##       access_to_exercise_opportunities.pct_with      < 54.38523  to the right, improve=0.08368683, (0 missing)
##       preventable_hospital_stays.hosp__rate          < 83.92     to the left,  improve=0.08260993, (4 missing)
##   Surrogate splits:
##       primary_care_physicians.pcp_rate      < 11.62487  to the right, agree=0.920, adj=0.052, (0 split)
##       college_degrees.pct                   < 9.237706  to the right, agree=0.920, adj=0.051, (0 split)
##       primary_care_provider_rate.pcp        < 26.03554  to the right, agree=0.919, adj=0.043, (0 split)
##       dentists.dentist_rate                 < 11.39345  to the right, agree=0.919, adj=0.038, (0 split)
##       preventable_hospital_stays.hosp__rate < 106.38    to the left,  agree=0.918, adj=0.037, (0 split)
## 
## Node number 13: 1175 observations
##   mean=713.9179, MSE=45415.44 
## 
## Node number 14: 2407 observations,    complexity param=0.009357202
##   mean=704.3366, MSE=33528.82 
##   left son=28 (677 obs) right son=29 (1730 obs)
##   Primary splits:
##       motor_vehicle_crash_deaths.mv_mortality_rate < 21.37521  to the left,  improve=0.1562181, (25 missing)
##       mental_health_providers.mph_rate             < 0.6       to the right, improve=0.1365845, (1 missing)
##       median_household_income                      < 34660     to the right, improve=0.1175709, (0 missing)
##       mammography_screening.pct                    < 57.55155  to the right, improve=0.1071864, (1 missing)
##       college_degrees.pct                          < 10.16521  to the right, improve=0.1054515, (1 missing)
##   Surrogate splits:
##       injury_deaths.death_rate                  < 84.91093  to the left,  agree=0.741, adj=0.397, (25 split)
##       college_degrees.pct                       < 14.45925  to the right, agree=0.682, adj=0.258, (0 split)
##       some_college.pct                          < 47.63203  to the right, agree=0.677, adj=0.246, (0 split)
##       access_to_exercise_opportunities.pct_with < 66.25781  to the right, agree=0.669, adj=0.229, (0 split)
##       preventable_hospital_stays.hosp__rate     < 71.07     to the left,  agree=0.669, adj=0.229, (0 split)
## 
## Node number 15: 1357 observations
##   mean=974.7333, MSE=65834.01 
## 
## Node number 18: 155 observations
##   mean=236.003, MSE=2116.901 
## 
## Node number 19: 1176 observations,    complexity param=0.005789113
##   mean=323.4387, MSE=4094.288 
##   left son=38 (670 obs) right son=39 (506 obs)
##   Primary splits:
##       physical_inactivity.pct_physically_inactive < 24.91     to the left,  improve=0.1793633, (1 missing)
##       demographics.pct_not_proficient_in_english  < 1.621402  to the right, improve=0.1496125, (1 missing)
##       access_to_exercise_opportunities.pct_with   < 66.72005  to the right, improve=0.1463830, (1 missing)
##       wages.avg                                   < 35.24007  to the right, improve=0.1456332, (0 missing)
##       college_degrees.pct                         < 25.0477   to the right, improve=0.1428851, (0 missing)
##   Surrogate splits:
##       preventable_hospital_stays_ambulatory_care_sensitive_conditions.acsc_rate < 83.64189  to the left,  agree=0.841, adj=0.280, (1 split)
##       preventable_hospital_stays.hosp__rate                                     < 65.4      to the left,  agree=0.836, adj=0.258, (0 split)
##       wages.avg                                                                 < 35.24007  to the right, agree=0.833, adj=0.246, (0 split)
##       contributions.ratio                                                       < 0.7447357 to the right, agree=0.829, adj=0.230, (0 split)
##       binge_drinking.pct                                                        < 13.495    to the right, agree=0.827, adj=0.218, (0 split)
## 
## Node number 20: 896 observations,    complexity param=0.005165097
##   mean=334.496, MSE=3039.772 
##   left son=40 (248 obs) right son=41 (648 obs)
##   Primary splits:
##       physical_inactivity.pct_physically_inactive < 19.65     to the left,  improve=0.2025019, (0 missing)
##       some_college.pct                            < 78.52289  to the right, improve=0.1899559, (0 missing)
##       adult_smoking.pct_smokers                   < 18.55     to the left,  improve=0.1774663, (2 missing)
##       diabetes.pct_diabetic                       < 7.49      to the left,  improve=0.1744028, (0 missing)
##       preventable_hospital_stays.hosp__rate       < 47.15     to the left,  improve=0.1627104, (0 missing)
##   Surrogate splits:
##       preventable_hospital_stays.hosp__rate                                     < 45.23     to the left,  agree=0.913, adj=0.743, (0 split)
##       preventable_hospital_stays_ambulatory_care_sensitive_conditions.acsc_rate < 51.31477  to the left,  agree=0.888, adj=0.669, (0 split)
##       diabetes.pct_diabetic                                                     < 8.22      to the left,  agree=0.846, adj=0.543, (0 split)
##       adult_obesity.pct_obese                                                   < 23.95     to the left,  agree=0.823, adj=0.476, (0 split)
##       driving_alone_to_work.pct_drive                                           < 78.40481  to the left,  agree=0.811, adj=0.441, (0 split)
## 
## Node number 21: 1011 observations,    complexity param=0.005194775
##   mean=412.3807, MSE=11418.69 
##   left son=42 (355 obs) right son=43 (656 obs)
##   Primary splits:
##       chlamydia_rate.rates_per_100000                < 248.95    to the right, improve=0.2017369, (0 missing)
##       households_with_high_housing_costs.pct         < 30.45     to the right, improve=0.1838919, (0 missing)
##       social_associations.association_rate           < 11.163    to the left,  improve=0.1781584, (2 missing)
##       sexually_transmitted_infections.chlamydia_rate < 362.4812  to the right, improve=0.1581570, (0 missing)
##       fast_food_restaurants.pct_foods                < 48.75019  to the right, improve=0.1478975, (0 missing)
##   Surrogate splits:
##       sexually_transmitted_infections.chlamydia_rate < 285.8643  to the right, agree=0.923, adj=0.721, (0 split)
##       food_environment_index                         < 7.597611  to the left,  agree=0.849, adj=0.449, (0 split)
##       violent_crime.rate                             < 219.6522  to the right, agree=0.837, adj=0.403, (0 split)
##       food_insecurity.pct_insecure                   < 14.15     to the right, agree=0.814, adj=0.322, (0 split)
##       fast_food_restaurants.pct_foods                < 41.82962  to the right, agree=0.800, adj=0.269, (0 split)
## 
## Node number 22: 999 observations
##   mean=435.7212, MSE=5989.755 
## 
## Node number 23: 667 observations
##   mean=539.3013, MSE=16099.74 
## 
## Node number 24: 1804 observations
##   mean=499.2683, MSE=12620.57 
## 
## Node number 25: 516 observations
##   mean=649.9298, MSE=35199.76 
## 
## Node number 28: 677 observations
##   mean=621.1452, MSE=19919.39 
## 
## Node number 29: 1730 observations
##   mean=766.9651, MSE=34641.85 
## 
## Node number 38: 670 observations
##   mean=308.9876, MSE=2130.497 
## 
## Node number 39: 506 observations
##   mean=374.2434, MSE=7682.959 
## 
## Node number 40: 248 observations
##   mean=299.784, MSE=2332.652 
## 
## Node number 41: 648 observations
##   mean=352.2293, MSE=2470.988 
## 
## Node number 42: 355 observations
##   mean=382.9116, MSE=6529.838 
## 
## Node number 43: 656 observations
##   mean=490.5498, MSE=15972.78 
## 
## n= 12675 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 12675 19308130000000 364.2569  
##    2) motor_vehicle_crash_deaths.mv_mortality_rate< 16.9804 5416  7304307000000 326.8133  
##      4) injury_deaths.death_rate< 54.15686 1843  1563452000000 273.1774  
##        8) adult_obesity.pct_obese< 24.95 512   249451000000 241.0600 *
##        9) adult_obesity.pct_obese>=24.95 1331   849356500000 310.6197  
##         18) injury_deaths.death_rate< 38.7985 155    55369320000 236.0030 *
##         19) injury_deaths.death_rate>=38.7985 1176   623342200000 323.4387  
##           38) physical_inactivity.pct_physically_inactive< 24.91 670   252530500000 308.9876 *
##           39) physical_inactivity.pct_physically_inactive>=24.91 506   259034700000 374.2434 *
##      5) injury_deaths.death_rate>=54.15686 3573  3303569000000 390.7844  
##       10) diabetes.pct_diabetic< 10.07 1907  1197847000000 350.9940  
##         20) wages.avg>=36.07804 896   492481300000 334.4960  
##           40) physical_inactivity.pct_physically_inactive< 19.65 248   127785800000 299.7840 *
##           41) physical_inactivity.pct_physically_inactive>=19.65 648   264967200000 352.2293 *
##         21) wages.avg< 36.07804 1011   497189200000 412.3807  
##           42) chlamydia_rate.rates_per_100000>=248.95 355   206479200000 382.9116 *
##           43) chlamydia_rate.rates_per_100000< 248.95 656   190408600000 490.5498 *
##       11) diabetes.pct_diabetic>=10.07 1666  1215277000000 459.8623  
##         22) median_household_income>=41145.5 999   543916800000 435.7212 *
##         23) median_household_income< 41145.5 667   444290500000 539.3013 *
##    3) motor_vehicle_crash_deaths.mv_mortality_rate>=16.9804 7259  4520509000000 608.1646  
##      6) median_household_income>=39392 3495  1566147000000 531.5937  
##       12) mental_health_providers.mph_rate>=0.6 2320  1015815000000 512.0234  
##         24) access_to_recreational_facilities.rec_fac_rate>=4.476977 1804   720317900000 499.2683 *
##         25) access_to_recreational_facilities.rec_fac_rate< 4.476977 516   185816400000 649.9298 *
##       13) mental_health_providers.mph_rate< 0.6 1175   303962600000 713.9179 *
##      7) median_household_income< 39392 3764  1850763000000 740.3331  
##       14) access_to_recreational_facilities.rec_fac_rate>=0.3333333 2407  1162664000000 704.3366  
##         28) motor_vehicle_crash_deaths.mv_mortality_rate< 21.37521 677   296665600000 621.1452 *
##         29) motor_vehicle_crash_deaths.mv_mortality_rate>=21.37521 1730   685327900000 766.9651 *
##       15) access_to_recreational_facilities.rec_fac_rate< 0.3333333 1357   350581000000 974.7333 *
## co-linear variables to be ignored: teen_births.birth_rate children_eligible_for_free_lunch.pct food_insecurity.pct_insecure wages.avg homicides.homicide_rate sexually_transmitted_infections.chlamydia_rate sexually_transmitted_infections.rates_per_100000 violent_crime.rate hiv_prevalence.rate hiv_prevalence_rate driving_alone_to_work.pct_drive households_with_high_housing_costs.pct sahie.pct.uninsured binge_drinking.pct

## YOUTH cp=0.0173159759897632

## YOUTH  tree depth is 3 
## n= 2313 
## 
## node), split, n, yval
##       * denotes terminal node
## 
##  1) root Counties:2313 Death.per.100k: 86  
##    2) demographics.pct_not_proficient_in_english>=3.9 Counties:908 Death.per.100k: 69  
##      4) social_associations.association_rate< 7.3 Counties:370 Death.per.100k: 57 *
##      5) social_associations.association_rate>=7.3 Counties:538 Death.per.100k:100  
##       10) access_to_exercise_opportunities.pct_with>=83 Counties:360 Death.per.100k: 93 *
##       11) access_to_exercise_opportunities.pct_with< 83 Counties:178 Death.per.100k:240 *
##    3) demographics.pct_not_proficient_in_english< 3.9 Counties:1405 Death.per.100k:170  
##      6) access_to_exercise_opportunities.pct_with>=71 Counties:1002 Death.per.100k:150  
##       12) access_to_exercise_opportunities.pct_with>=86 Counties:408 Death.per.100k:120 *
##       13) access_to_exercise_opportunities.pct_with< 86 Counties:594 Death.per.100k:220 *
##      7) access_to_exercise_opportunities.pct_with< 71 Counties:403 Death.per.100k:460 *
## YOUTH all data 
##   node counties deaths.pred deaths.act   age.pop dr100k.fit dr100k.group
## 1   57      467       64214      63811 112330034   57.16540     56.80671
## 2   93      434       33492      32970  36033472   92.94713     91.49826
## 3  122      512       29466      28751  24187179  121.82282    118.86876
## 4  224      747       18960      18743   8446910  224.46242    221.89179
## 5  237      228        5606       5519   2365179  237.00361    233.34386
## 6  459      518        8473       8612   1845729  459.06937    466.59071
## YOUTH test data 
##   node counties deaths.pred deaths.act  age.pop dr100k.fit dr100k.group
## 1   57       97       12046      12001 21072618   57.16540     56.95068
## 2   93       74        5928       5417  6377396   92.94713     84.94062
## 3  122      104        7079       6358  5810586  121.82282    109.42098
## 4  224      153        4440       4223  1978192  224.46242    213.47776
## 5  237       50        1240       1153   523013  237.00361    220.45341
## 6  459      115        1790       1910   389905  459.06937    489.86292

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## demographics.pct_not_proficient_in_english slope= -4.4 r^2= 0.095

## access_to_exercise_opportunities.pct_with slope= -3.3 r^2= 0.096

## social_associations.association_rate slope= 15 r^2= 0.15

## YOUTH  all 
## rmse deaths= 39.99027 
## rmse Deaths by priori= 78.00859 
## tree Deaths rsq= 0.9072014 
## prior Deaths rsq= 0.9117817 
## fitted Death.per.100k rmse= 446.975 
## weighted Death.per.100k rmse= 85.34004 
## fitted Death.per.100k rsq= 0.3502687 
## Call:
## rpart(formula = ezformula(c(yvar, predictorVars)), data = d[trainset, 
##     ], weights = d$Population[trainset], control = rpart.control(cp = 0.005))
##   n= 2313 
## 
##           CP nsplit rel error    xerror         xstd
## 1 0.13456289      0 1.0000000 1.0016483 0.0006912473
## 2 0.08714994      1 0.8654371 0.8712310 0.0006112062
## 3 0.03312596      2 0.7782872 0.8153507 0.0005250473
## 4 0.03007403      3 0.7451612 0.7765289 0.0004993224
## 5 0.02365492      4 0.7150872 0.7456848 0.0004905201
## 6 0.01731598      5 0.6914323 0.7356099 0.0004763674
## 
## Variable importance
##     demographics.pct_not_proficient_in_english 
##                                             24 
##      access_to_exercise_opportunities.pct_with 
##                                             23 
##           social_associations.association_rate 
##                                             10 
##                    severe_housing_problems.pct 
##                                              5 
##                      commuting_alone.pct_drive 
##                                              5 
##   motor_vehicle_crash_deaths.mv_mortality_rate 
##                                              4 
##                         high_housing_costs.pct 
##                                              4 
##                          diabetes.pct_diabetic 
##                                              3 
##                       access_to_parks.pct_park 
##                                              3 
##                            college_degrees.pct 
##                                              3 
##                          dentists.dentist_rate 
##                                              2 
## access_to_recreational_facilities.rec_fac_rate 
##                                              2 
##                                 pct_illiterate 
##                                              2 
##          other_primary_care_providers.pcp_rate 
##                                              2 
##                 primary_care_provider_rate.pcp 
##                                              2 
##            limited_access_to_healthy_foods.pct 
##                                              2 
##                               some_college.pct 
##                                              1 
##                         excessive_drinking.pct 
##                                              1 
##                      physically_unhealthy_days 
##                                              1 
##                                teen_birth_rate 
##                                              1 
##                      adult_smoking.pct_smokers 
##                                              1 
## 
## Node number 1: 2313 observations,    complexity param=0.1345629
##   mean=85.66406, MSE=10208.18 
##   left son=2 (908 obs) right son=3 (1405 obs)
##   Primary splits:
##       demographics.pct_not_proficient_in_english   < 3.873321 to the right, improve=0.13456290, (0 missing)
##       social_associations.association_rate         < 7.547315 to the left,  improve=0.11589750, (1 missing)
##       high_housing_costs.pct                       < 30.29605 to the right, improve=0.10937370, (0 missing)
##       motor_vehicle_crash_deaths.mv_mortality_rate < 14.68285 to the left,  improve=0.09595310, (1 missing)
##       long_commute_driving_alone.pct_drives        < 30.65    to the right, improve=0.09399445, (1 missing)
##   Surrogate splits:
##       social_associations.association_rate < 9.795606 to the left,  agree=0.867, adj=0.247, (0 split)
##       severe_housing_problems.pct          < 16.35279 to the right, agree=0.865, adj=0.234, (0 split)
##       commuting_alone.pct_drive            < 82.35599 to the left,  agree=0.864, adj=0.232, (0 split)
##       high_housing_costs.pct               < 31.38978 to the right, agree=0.854, adj=0.172, (0 split)
##       diabetes.pct_diabetic                < 10.50417 to the left,  agree=0.849, adj=0.147, (0 split)
## 
## Node number 2: 908 observations,    complexity param=0.03007403
##   mean=68.50856, MSE=3199.906 
##   left son=4 (370 obs) right son=5 (538 obs)
##   Primary splits:
##       social_associations.association_rate         < 7.301414 to the left,  improve=0.11649660, (0 missing)
##       long_commute_driving_alone.pct_drives        < 30.7     to the right, improve=0.10707980, (0 missing)
##       high_housing_costs.pct                       < 46.73384 to the right, improve=0.07899071, (0 missing)
##       motor_vehicle_crash_deaths.mv_mortality_rate < 15.63478 to the left,  improve=0.07721895, (0 missing)
##       pct_illiterate                               < 17.15    to the right, improve=0.07669416, (1 missing)
##   Surrogate splits:
##       access_to_recreational_facilities.rec_fac_rate < 11.7317  to the left,  agree=0.855, adj=0.437, (0 split)
##       pct_illiterate                                 < 12.45    to the right, agree=0.851, adj=0.420, (0 split)
##       other_primary_care_providers.pcp_rate          < 67.70461 to the left,  agree=0.849, adj=0.410, (0 split)
##       demographics.pct_not_proficient_in_english     < 5.968491 to the right, agree=0.840, adj=0.377, (0 split)
##       primary_care_provider_rate.pcp                 < 163.076  to the left,  agree=0.835, adj=0.357, (0 split)
## 
## Node number 3: 1405 observations,    complexity param=0.08714994
##   mean=165.7342, MSE=35133.15 
##   left son=6 (1002 obs) right son=7 (403 obs)
##   Primary splits:
##       access_to_exercise_opportunities.pct_with < 70.575   to the right, improve=0.14317820, (1 missing)
##       access_to_parks.pct_park                  < 22.5     to the right, improve=0.11819240, (8 missing)
##       college_degrees.pct                       < 19.36358 to the right, improve=0.10176070, (0 missing)
##       high_housing_costs.pct                    < 30.49355 to the right, improve=0.08775901, (0 missing)
##       dentists.dentist_rate                     < 52.69253 to the right, improve=0.08135788, (0 missing)
##   Surrogate splits:
##       college_degrees.pct                          < 18.17608 to the right, agree=0.949, adj=0.083, (1 split)
##       physically_unhealthy_days                    < 4.71     to the left,  agree=0.948, adj=0.058, (0 split)
##       teen_birth_rate                              < 71.82796 to the left,  agree=0.948, adj=0.058, (0 split)
##       motor_vehicle_crash_deaths.mv_mortality_rate < 21.7016  to the left,  agree=0.947, adj=0.049, (0 split)
##       adult_smoking.pct_smokers                    < 28.275   to the left,  agree=0.947, adj=0.046, (0 split)
## 
## Node number 4: 370 observations
##   mean=57.1654, MSE=1230.264 
## 
## Node number 5: 538 observations,    complexity param=0.02365492
##   mean=101.3722, MSE=7453.597 
##   left son=10 (360 obs) right son=11 (178 obs)
##   Primary splits:
##       access_to_exercise_opportunities.pct_with    < 82.59758 to the right, improve=0.15330970, (0 missing)
##       college_degrees.pct                          < 17.23121 to the right, improve=0.12165630, (0 missing)
##       motor_vehicle_crash_deaths.mv_mortality_rate < 14.37506 to the left,  improve=0.10994210, (0 missing)
##       high_housing_costs.pct                       < 30.90253 to the right, improve=0.08266816, (0 missing)
##       dentists.dentist_rate                        < 45.15872 to the right, improve=0.08082371, (0 missing)
##   Surrogate splits:
##       motor_vehicle_crash_deaths.mv_mortality_rate < 14.97141 to the left,  agree=0.970, adj=0.484, (0 split)
##       some_college.pct                             < 53.57157 to the right, agree=0.962, adj=0.358, (0 split)
##       college_degrees.pct                          < 17.83316 to the right, agree=0.962, adj=0.357, (0 split)
##       dentists.dentist_rate                        < 44.50392 to the right, agree=0.961, adj=0.339, (0 split)
##       access_to_parks.pct_park                     < 13.5     to the right, agree=0.960, adj=0.315, (0 split)
## 
## Node number 6: 1002 observations,    complexity param=0.03312596
##   mean=148.546, MSE=22421.61 
##   left son=12 (408 obs) right son=13 (594 obs)
##   Primary splits:
##       access_to_exercise_opportunities.pct_with < 85.6941  to the right, improve=0.09048102, (0 missing)
##       access_to_parks.pct_park                  < 22.5     to the right, improve=0.06748198, (1 missing)
##       high_housing_costs.pct                    < 30.4761  to the right, improve=0.05777937, (0 missing)
##       dentists.dentist_rate                     < 53.61436 to the right, improve=0.05159972, (0 missing)
##       long_commute_driving_alone.pct_drives     < 16.75    to the right, improve=0.04868044, (0 missing)
##   Surrogate splits:
##       access_to_parks.pct_park                     < 34.5     to the right, agree=0.834, adj=0.363, (0 split)
##       limited_access_to_healthy_foods.pct          < 9.110028 to the left,  agree=0.818, adj=0.300, (0 split)
##       motor_vehicle_crash_deaths.mv_mortality_rate < 14.93367 to the left,  agree=0.805, adj=0.252, (0 split)
##       excessive_drinking.pct                       < 12.95    to the right, agree=0.791, adj=0.198, (0 split)
##       dentists.dentist_rate                        < 50.45416 to the right, agree=0.788, adj=0.188, (0 split)
## 
## Node number 7: 403 observations
##   mean=459.0694, MSE=160982.7 
## 
## Node number 10: 360 observations
##   mean=92.94713, MSE=3269.302 
## 
## Node number 11: 178 observations
##   mean=237.0036, MSE=55275.81 
## 
## Node number 12: 408 observations
##   mean=121.8228, MSE=9427.879 
## 
## Node number 13: 594 observations
##   mean=224.4624, MSE=51542.7 
## 
## n= 2313 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 2313 1521599000000  85.66406  
##    2) demographics.pct_not_proficient_in_english>=3.873321 908  392806500000  68.50856  
##      4) social_associations.association_rate< 7.301414 370  112270700000  57.16540 *
##      5) social_associations.association_rate>=7.301414 538  234775200000 101.37220  
##       10) access_to_exercise_opportunities.pct_with>=82.59758 360   96954670000  92.94713 *
##       11) access_to_exercise_opportunities.pct_with< 82.59758 178  101827200000 237.00360 *
##    3) demographics.pct_not_proficient_in_english< 3.873321 1405  924041700000 165.73420  
##      6) access_to_exercise_opportunities.pct_with>=70.575 1002  557071900000 148.54600  
##       12) access_to_exercise_opportunities.pct_with>=85.6941 408  173252300000 121.82280 *
##       13) access_to_exercise_opportunities.pct_with< 85.6941 594  333415200000 224.46240 *
##      7) access_to_exercise_opportunities.pct_with< 70.575 403  234362500000 459.06940 *

Performance Table

require(gridExtra)

## Loading required package: gridExtra

## Warning: package 'gridExtra' was built under R version 3.4.1

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

g <- tableGrob(signif(t(perf.table),2))
grid.newpage()
grid.draw(g)

Variable.Importance Barchart

for(age in unique(bigdata$Age.Grouping)){
  mtree=trees[[age]]
  data=mtree$variable.importance
  names(data)=abbreviate(names.arg = names(data),
                         minlength = floor(mean(nchar(names(data)))+sd(nchar(names(data)))))
  print(lattice::barchart( rev(data/sum(data)*100),main=age,
                           xlab='Variable Importance'))
  catln(age,'very important vars:',names(mtree$variable.importance)[(mtree$variable.importance/max(mtree$variable.importance)*100)>50])
}

## SENIOR very important vars: adult_obesity.pct_obese physical_inactivity.pct_physically_inactive college_degrees.pct households_with_high_housing_costs.pct

## ADULT very important vars: motor_vehicle_crash_deaths.mv_mortality_rate

## YOUTH very important vars: demographics.pct_not_proficient_in_english access_to_exercise_opportunities.pct_with

prp chart

for(age in unique(bigdata$Age.Grouping)){
  mtree=trees[[age]]
  prp(mtree,varlen=ceiling(max(nchar(names(mtree$variable.importance)))),cex=0.8,nn=F,main=age,box.palette="GnRd",fallen.leaves = F)
}

for(age in unique(bigdata$Age.Grouping)){
  mtree=trees[[age]]
  print(
    plot_counties(data.frame(Death.per.100k=(round(predict(mtree,impute.df,type='vector'))),
                             fips=impute.df$fips),'Death.per.100k',low='green',high='red',
                  main=paste(age,'Fitted'),print = F)
  )
}

#importance
#View(importance)
###################END OF TREE

Clustering

#https://stackoverflow.com/questions/23714052/ggplot-mapping-us-counties-problems-with-visualization-shapes-in-r


#####################

Lantz (2015) suggests the elbow method for cluster size determination.

Compute and plot wss for k = 1 to k = 17:

require(factoextra)

## Loading required package: factoextra

## Warning: package 'factoextra' was built under R version 3.4.2

## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ

require(cluster)

## Loading required package: cluster

## Warning: package 'cluster' was built under R version 3.4.2

age='SENIOR'
age='YOUTH'
allclusters=list()
allcentermeans=list()
for(age in unique(bigdata$Age.Grouping)){
  shush({
    mtree=trees[[age]]
    vip=names(mtree$variable.importance)[(mtree$variable.importance/max(mtree$variable.importance))>0.3]
    data=as.data.frame(aggregate(bigdata[,vip],
                                 list(fips=bigdata$fips),FUN=function(x) mean(x,na.rm=T)))
    fips=data$fips
    data$fips=NULL
    data=winsor1Df(data,trace=F)
    odata=data
    data=impute(data,missing.threshold = 0.1,trace=F)
    data=as.data.frame(scale(keepNumeric( data)))
  })
  #'VIF Double Check
  #'
  #' The HH library allows for the calculation Variance Inflation Factor for checking for collinearity
  #' without requiring a response variable.
  catln('removing multi-collinear vars via vif:',names(data)[HH::vif(data)>=10])
  data=data[,HH::vif(data)<10]
  
  # myvif=car::vif(lm(ezformula(c(yvar,predictorVars)), impute(d[,c(yvar,predictorVars)],trace=F)))
  # names(myvif)[myvif > 10] # problem?
  # 
  set.seed(7)
  k.max <- 8 # Maximal number of clusters
  wss <- sapply(1:k.max, function(k){set.seed(17); kmeans(data, k, nstart=5 )$tot.withinss})
  plot(1:k.max, wss, type="b", pch = 19, frame = FALSE,    xlab="Number of clusters K",
       ylab="Total within-clusters sum of squares",main=paste(age,'elbow method'));grid()

  bestK=4
 
  set.seed(7)
  cl=kmeans(data, bestK, nstart=5,iter.max = 30 )
  print(fviz_cluster(cl, data=data,stand = FALSE, geom = "point",
                     pointsize = 1,main=paste(age,'Cluster Plot')))
  cl$clustering=cl$cluster
  allclusters[[age]]=cl
 
  data$fips=fips
  
  plot_counties(df = data.frame(fips=data$fips,cluster=as.factor(cl$clustering)),
                main=age,yvar = 'cluster')
  center.means=NULL
  for(i in sort(unique(cl$clustering))){
    w=impute.df$fips %in% data$fips[cl$clustering==i]
    catln(age,'cluster=',i,'deathRate per 100k:',sum(1.0*impute.df$Deaths[w])/
            sum(1.0*impute.df$Population[w])*100000,
          'counties:',length(unique(impute.df$fips[w])))
    .=as.data.frame(colMeans(odata[cl$clustering==i,],na.rm = T))
    names(.)=paste0('Cluster',i)
    if(is.null(center.means)) center.means=.
    else center.means=cbind(center.means,.)
  }
  allcentermeans[[age]]=center.means
  ######################
}

## removing multi-collinear vars via vif:

## SENIOR cluster= 1 deathRate per 100k: 2393.272 counties: 716 
## SENIOR cluster= 2 deathRate per 100k: 1673.197 counties: 991 
## SENIOR cluster= 3 deathRate per 100k: 1274.992 counties: 942 
## SENIOR cluster= 4 deathRate per 100k: 809.0095 counties: 419 
## removing multi-collinear vars via vif:

## ADULT cluster= 1 deathRate per 100k: 1294.359 counties: 1304 
## ADULT cluster= 2 deathRate per 100k: 847.3716 counties: 701 
## ADULT cluster= 3 deathRate per 100k: 1875.764 counties: 750 
## ADULT cluster= 4 deathRate per 100k: 1857.314 counties: 313 
## removing multi-collinear vars via vif:

## YOUTH cluster= 1 deathRate per 100k: 1884.152 counties: 1022 
## YOUTH cluster= 2 deathRate per 100k: 2344.622 counties: 472 
## YOUTH cluster= 3 deathRate per 100k: 1162.552 counties: 1368 
## YOUTH cluster= 4 deathRate per 100k: 717.2535 counties: 206

draw.table=function(data,main){
  library(grid)
  library(gridExtra)
  library(gtable)
  
  t1 <- tableGrob(data)
  title <- textGrob(main) #,gp=gpar(fontsize=50)
  padding <- unit(5,"mm")
  
  table <- gtable_add_rows(
    t1, 
    heights = grobHeight(title) + padding,
    pos = 0)
  table <- gtable_add_grob(
    table, 
    title, 
    1, 1, 1, ncol(table))
  
  grid.newpage()
  grid.draw(table)
}

for(age in unique(bigdata$Age.Grouping)){
  data=allcentermeans[[age]]
  
  names(data)=abbreviate(names.arg = names(data),
                         minlength = floor(mean(nchar(names(data)))))
  
  draw.table(signif(data,2),main=age)
  gplots::textplot(signif(data,2),valign='top');title(age)
}

for(age in unique(bigdata$Age.Grouping)){
  cl=allclusters[[age]]
  data=as.data.frame(cl$centers)
  names(data)=abbreviate(names.arg = names(data),
                         minlength = floor(mean(nchar(names(data)))+sd(nchar(names(data)))))
  ezplot2(data,xlab='cluster',col=darken(rainbow(1+ncol(data))),
          title = paste(age,'Cluster Centers'),type='bar')
  
}

## Loading required package: reshape2

## Warning: package 'reshape2' was built under R version 3.4.1

## Loading required package: ggthemes

# 
# #+ fig.width=7, fig.height=5
# for(age in unique(bigdata$Age.Grouping)){
#   cl=allclusters[[age]]
#   mtree=trees[[age]]
#   vip=names(mtree$variable.importance)[(
#     mtree$variable.importance/max(mtree$variable.importance))>0.3]
#   vip=base::intersect(colnames(cl$centers),vip)
#   catln(age,vip)
#   data=as.data.frame(cl$centers[,vip,drop=F])
#   
#   names(data)=abbreviate(names.arg = names(data),
#                          minlength = floor(mean(nchar(names(data)))+sd(nchar(names(data)))))
#   
#   ezplot2(data,xlab='cluster',col=darken(rainbow(1+ncol(data))),
#           title = paste(age,'Very Important Cluster Centers'),type='bar')
# }
#### end 

#### end

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Load libraries

What years hold what data

Trees

Performance Table

Variable.Importance Barchart

prp chart

Clustering

FilesExpand file tree

modeling2.md

Latest commit

History

modeling2.md

File metadata and controls

Load libraries

What years hold what data

Trees

Performance Table

Variable.Importance Barchart

prp chart

Clustering