From ffcaf8ab8b5fed874ff971a1443a348409dcfb50 Mon Sep 17 00:00:00 2001
From: Adam Reevesman <areevesman@gmail.com>
Date: Mon, 19 Mar 2018 11:42:16 -0700
Subject: [PATCH 1/4] added script to generate data tables for the ameriflux
 data

---
 R/Ameriflux_data_tables.R | 562 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 562 insertions(+)
 create mode 100644 R/Ameriflux_data_tables.R

diff --git a/R/Ameriflux_data_tables.R b/R/Ameriflux_data_tables.R
new file mode 100644
index 0000000..bfa5a02
--- /dev/null
+++ b/R/Ameriflux_data_tables.R
@@ -0,0 +1,562 @@
+##############################################################################
+# Adam Reevesman
+# March 19, 2018
+
+# This script defines the function "attribute_table" which
+# generates an attribute table for any one of the csvs in the Ameriflux data at
+# https://drive.google.com/drive/folders/1g0TuYt1cJEuRxe5tXoUjJ1pSneNFbPXN
+
+# The first step is to read in each of the csvs and my modified version of
+# the attribute metadata from
+# https://docs.google.com/spreadsheets/d/1K-C7DahQ80KifW1NAbFaU-VusLZe4eAPHqhBl4t_y9s/edit#gid=1340082892
+
+# the attribute metadata is read into a dataframe as definitions
+# the csvs are read in as dataframes called data1, data2, ... , data12
+
+# The function "attribute_table" takes in 
+# the data dataframe and definitions dataframe to produce an attribute table for the data
+# It calls indivual functions that define each column of the attribute table
+
+# The resulting attribute table will not be perfect...
+# The measurement scales, number types, and missing value information will need
+# to be adjusted manually
+# Any custom units that exist in the "unit" column will need to be defined
+
+# An example of the implimentation exists at the very end of this script
+
+##############################################################################
+
+library(stringr)
+
+
+definitions <- read.csv('/home/reevesman/Ameriflux/attribute_function/definitions.csv',
+                        stringsAsFactors = F)
+
+data1 <- read.csv('/home/reevesman/Ameriflux/AMF_US-Ivo/AMF_US-Ivo_BASE_HH_2-1.csv',
+                  skip = 2,
+                  stringsAsFactors = F)
+
+data2 <- read.csv('/home/reevesman/Ameriflux/AMF_US-ICt/AMF_US-ICt_BASE_HH_2-1.csv',
+                  skip = 2,
+                  stringsAsFactors = F)
+
+data3 <- read.csv('/home/reevesman/Ameriflux/AMF_US-ICs/AMF_US-ICs_BASE_HH_2-1.csv',
+                  skip = 2,
+                  stringsAsFactors = F)
+
+data4 <- read.csv('/home/reevesman/Ameriflux/AMF_US-ICh/AMF_US-ICh_BASE_HH_2-1.csv',
+                  skip = 2,
+                  stringsAsFactors = F)
+
+data5 <- read.csv('/home/reevesman/Ameriflux/AMF_US-EML/AMF_US-EML_BASE_HH_1-4.csv',
+                  skip = 2,
+                  stringsAsFactors = F)
+
+data6 <- read.csv('/home/reevesman/Ameriflux/AMF_US-Brw/AMF_US-Brw_BASE_HH_2-1.csv',
+                  skip = 2,
+                  stringsAsFactors = F)
+
+data7 <- read.csv('/home/reevesman/Ameriflux/AMF_US-Atq/AMF_US-Atq_BASE_HH_1-1.csv',
+                  skip = 2,
+                  stringsAsFactors = F)
+
+data8 <- read.csv('/home/reevesman/Ameriflux/AMF_CA-Qfo/AMF_CA-Qfo_BASE_HH_1-1.csv',
+                  skip = 2,
+                  stringsAsFactors = F)
+
+data9 <- read.csv('/home/reevesman/Ameriflux/AMF_CA-Ojp/AMF_CA-Ojp_BASE_HH_1-1.csv',
+                  skip = 2,
+                  stringsAsFactors = F)
+
+data10 <- read.csv('/home/reevesman/Ameriflux/AMF_CA-Oas/AMF_CA-Oas_BASE_HH_1-1.csv',
+                   skip = 2,
+                   stringsAsFactors = F)
+
+data11 <- read.csv('/home/reevesman/Ameriflux/AMF_CA-Man/AMF_CA-Man_BASE_HH_2-1.csv',
+                   skip = 2,
+                   stringsAsFactors = F)
+
+data12 <- read.csv('/home/reevesman/Ameriflux/AMF_CA-Gro/AMF_CA-Gro_BASE_HH_1-1.csv',
+                   skip = 2,
+                   stringsAsFactors = F)
+
+
+##############################################################################
+
+#function definition for attribute_definitions
+  #inputs:
+    #data: a dataframe with attributes to define
+    #definitions: dataframe in the style of definitions.csv
+  #output: a vector of attribute definitions to be used in attribute table
+
+##############################################################################
+
+attribute_definitions <- function(data, definitions){
+  
+  attributes <- colnames(data)
+  
+  #will be vector of attribute definitions
+  defs <- vector(mode = 'character', length = length(attributes))
+  
+  #i will be index at which to populate defs
+  i <- 1
+  #for each attribute
+  for (att in attributes){
+    
+    #if the attribute has one of the attached qualifiers
+    #get definition of qualifier to paste onto regular definition later
+    #set att to the regular attribute name (w/out qualifier)
+    #set a flag that this attribute includes a qualifier
+    
+    #did not do all of the possible combinations of qualifiers
+    #because not all possible combinations appeared in data
+    
+    if (str_sub(att,-5,-1) == '_PI_F'){
+      x <- str_sub(att,-5,-1)
+      extra <- paste(definitions[which(definitions$uniqueAttributeLabel == '_PI'), 'uniqueAttributeDefinition'],
+                     definitions[which(definitions$uniqueAttributeLabel == '_F'), 'uniqueAttributeDefinition'],
+                     sep = '; ')
+      str_sub(att,-5,-1) <- ""
+      QUALIFIERS_EXIST <- TRUE
+    }
+    
+    else if (str_sub(att,-3,-1) %in% c('_PI','_QC','_IU','_SD')){
+      x <- str_sub(att,-3,-1)
+      extra <- definitions[which(definitions$uniqueAttributeLabel == x), 'uniqueAttributeDefinition']
+      str_sub(att,-3,-1) <- ""
+      QUALIFIERS_EXIST <- TRUE
+    }
+    
+    else if (str_sub(att,-2,-1) == '_F'){
+      x <- '_F'
+      extra <- definitions[which(definitions$uniqueAttributeLabel == '_F'), 'uniqueAttributeDefinition']
+      str_sub(att,-2,-1) <- ""
+      QUALIFIERS_EXIST <- TRUE
+    }
+    
+    else if (str_sub(att,-2,-1) %in% c('_1','_2','_3','_4','_5','_6','_7','_8','_9')){
+      x <- '_#'
+      extra <- definitions[which(definitions$uniqueAttributeLabel == '_#'), 'uniqueAttributeDefinition']
+      str_sub(att,-2,-1) <- ""
+      QUALIFIERS_EXIST <- TRUE
+    }
+    
+    else if (str_sub(att,-2,-1) %in% c('_R','_A')){
+      x <- paste('_H_V', str_sub(att,-2,-1), sep = '')
+      extra <- definitions[which(definitions$uniqueAttributeLabel == x), 'uniqueAttributeDefinition']
+      str_sub(att,-6,-1) <- ""
+      QUALIFIERS_EXIST <- TRUE
+    }
+    
+    else {
+      QUALIFIERS_EXIST <- FALSE
+    }
+    
+    #if the attribute (with potential qualifiers removed) 
+    #is not defined in definitions, there is a problem
+    
+    if (!(att %in% definitions$uniqueAttributeLabel)){
+      print(paste('Error: The attribute: ', 
+                  att, 
+                  ' is not defined in the definitions dataframe.',
+                  sep = ''))
+    }
+    
+    #if there were qualifiers, include the extra part of the definition
+    #when populating defs
+    else if (QUALIFIERS_EXIST){
+      #def is the corresponding definition for i-th attribute
+      def <- paste(definitions[which(definitions$uniqueAttributeLabel == att),'uniqueAttributeDefinition'],
+                   '; With qualifier \'',
+                   x,
+                   '\' ',
+                   extra,
+                   sep = '')
+      
+      #assign def to the corresponding position in defs
+      defs[i] <- def
+      i <- i+1
+    }
+    
+    #if there were no qualifiers, use regular definition
+    else {
+      def <- definitions[which(definitions$uniqueAttributeLabel == att),'uniqueAttributeDefinition']
+      #assign def to the corresponding position in defs
+      defs[i] <- def
+      i <- i+1
+    }
+    
+  }
+  return(defs)
+}
+
+##############################################################################
+
+
+
+
+
+
+
+
+
+
+##############################################################################
+
+#function definition for attribute_units
+#inputs:
+  #data: a dataframe with attributes to define
+  #definitions: dataframe in the style of definitions.csv
+#output: a vector of attribute definitions to be used in attribute table
+
+##############################################################################
+
+attribute_units <- function(data, definitions){
+  
+  attributes <- colnames(data)
+  
+  #will be vector of attribute definitions
+  units <- vector(mode = 'character', length = length(attributes))
+  
+  #i will be index at which to populate defs
+  i <- 1
+  #for each attribute
+  for (att in attributes){
+    
+    #set att to the regular attribute name (w/out qualifier)
+    
+    #did not do all of the possible combinations of qualifiers
+    #because not all possible combinations appeared in data
+    
+    if (str_sub(att,-5,-1) == '_PI_F'){
+      str_sub(att,-5,-1) <- ""
+    }
+    
+    else if (str_sub(att,-3,-1) %in% c('_PI','_QC','_IU','_SD')){
+      str_sub(att,-3,-1) <- ""
+    }
+    
+    else if (str_sub(att,-2,-1) == '_F'){
+      str_sub(att,-2,-1) <- ""
+    }
+    
+    else if (str_sub(att,-2,-1) %in% c('_1','_2','_3','_4','_5','_6','_7','_8','_9')){
+      str_sub(att,-2,-1) <- ""
+    }
+    
+    else if (str_sub(att,-2,-1) %in% c('_R','_A')){
+      str_sub(att,-6,-1) <- ""
+    }
+    
+    #if the attribute (with potential qualifiers removed) 
+    #is not defined in definitions, there is a problem
+    
+    if (!(att %in% definitions$uniqueAttributeLabel)){
+      print(paste('Error: The attribute: ', 
+                  att, 
+                  ' is not defined in the definitions dataframe.',
+                  sep = ''))
+    }
+    
+    else if (att %in% c('TIMESTAMP_START','TIMESTAMP_END')){
+      unit <- NA
+      units[i] <- unit
+      i <- i+1
+    }
+    
+    else {
+      unit <- definitions[which(definitions$uniqueAttributeLabel == att),'unitSpelledOut']
+      #assign unit to the corresponding position in units
+      units[i] <- unit
+      i <- i+1
+    }
+    
+  }
+  return(units)
+}
+
+##############################################################################
+
+
+
+
+
+##############################################################################
+
+#function definition for attribute_names
+#inputs:
+  #data: a dataframe with attribute names
+#output: a vector of attribute names to be used in attribute table
+
+##############################################################################
+
+attribute_names <- function(data){
+  
+  return(colnames(data))
+}
+
+##############################################################################
+
+
+
+
+##############################################################################
+
+#function definition for attribute_domains
+#inputs:
+#data: a dataframe with attributes to define
+#output: a vector of attribute domains to be used in attribute table
+
+##############################################################################
+
+attribute_domains <- function(data){
+  
+  attributes <- colnames(data)
+  domains <- vector(mode = 'character', length = length(attributes))
+  
+  for (i in 1:length(attributes)){
+    if (attributes[i] %in% c('TIMESTAMP_START','TIMESTAMP_END')){
+      domains[i] <- 'dateTimeDomain'
+    }    
+    else {
+      domains[i] <- 'numericDomain'
+    }
+  }
+  
+  return(domains)
+}
+
+##############################################################################
+
+
+
+
+
+##############################################################################
+
+#function definition for textDomain_definitions
+#inputs:
+#data: a dataframe with attributes to define
+#output: a vector of definitions for textDomain attributes to be used in attribute table
+
+##############################################################################
+
+textDomain_definitions <- function(data){
+  
+  attributes <- colnames(data)
+  #no textDomains in any of the data
+  return(rep(NA, times = length(attributes)))
+}
+
+##############################################################################
+
+
+
+
+
+
+##############################################################################
+
+#function definition for attribute_measurement_scales
+#inputs:
+#data: a dataframe with attributes to define
+#output: a vector of attribute measurement scales to be used in attribute table
+
+##############################################################################
+
+attribute_measurement_scales <- function(data){
+  
+  attributes <- colnames(data)
+  scales <- vector(mode = 'character', length = length(attributes))
+  
+  for (i in 1:length(attributes)){
+    if (attributes[i] %in% c('TIMESTAMP_START','TIMESTAMP_END')){
+      scales[i] <- 'dateTime'
+    }
+    #will have to manually check measurement scales
+    else {
+      scales[i] <- 'ratio'
+    }
+  }
+  
+  return(scales)
+}
+
+##############################################################################
+
+
+
+
+
+
+##############################################################################
+
+#function definition for attribute_format_strings
+#inputs:
+#data: a dataframe with attributes to define
+#output: a vector of format strings for dateTime attributes to be used in attribute table
+
+##############################################################################
+
+attribute_format_strings <- function(data){
+  
+  attributes <- colnames(data)
+  formats <- vector(mode = 'character', length = length(attributes))
+  
+  for (i in 1:length(attributes)){
+    if (attributes[i] %in% c('TIMESTAMP_START','TIMESTAMP_END')){
+      formats[i] <- 'YYYYMMDDHHMM'
+    }    
+    else {
+      formats[i] <- NA
+    }
+  }
+  
+  return(formats)
+}
+
+##############################################################################
+
+
+
+
+##############################################################################
+
+#function definition for attribute_number_type
+#inputs:
+#data: a dataframe with attributes to define
+#output: a vector of attribute numberTypes to be used in attribute table
+
+##############################################################################
+
+attribute_number_type <- function(data){
+  
+  attributes <- colnames(data)
+  types <- vector(mode = 'character', length = length(attributes))
+  
+  for (i in 1:length(attributes)){
+    if (attributes[i] %in% c('TIMESTAMP_START','TIMESTAMP_END')){
+      types[i] <- NA
+    }
+    #will have to manually check number type
+    else {
+      types[i] <- 'real'
+    }
+  }
+  
+  return(types)
+}
+
+##############################################################################
+
+
+
+
+
+
+
+##############################################################################
+
+#function definition for attribute_missing_value_codes
+#inputs:
+#data: a dataframe with attributes to define
+#output: a vector of definitions for textDomain attributes to be used in attribute table
+
+##############################################################################
+
+attribute_missing_value_codes <- function(data){
+  
+  attributes <- colnames(data)
+  codes <- rep(NA, times = length(attributes))
+  
+  for (i in 1:length(attributes)){
+    x <- attributes[i]
+    if (sum(data[, x] == '-9999') > 0){
+      codes[i] <- '-9999'
+    }
+  }
+  
+  return(codes)
+}
+
+##############################################################################
+
+
+
+
+
+
+
+##############################################################################
+
+#function definition for attribute_missing_value_explanations
+#inputs:
+#data: a dataframe with attributes to define
+#output: a vector of definitions for textDomain attributes to be used in attribute table
+
+##############################################################################
+
+attribute_missing_value_explanations <- function(data){
+  
+  attributes <- colnames(data)
+  explanations <- rep(NA, times = length(attributes))
+  
+  for (i in 1:length(attributes)){
+    x <- attributes[i]
+    if (sum(data[, x] == '-9999') > 0){
+      explanations[i] <- 'Missing data records are indicated by the -9999 value.'
+    }
+  }
+  
+  return(explanations)
+}
+
+##############################################################################
+
+
+
+
+
+
+
+
+
+##############################################################################
+
+#function definition for attribute_table
+#inputs:
+  #data: a dataframe with attributes to define
+  #definitions: dataframe in the style of definitions.csv
+#output: a dataTable (dataframe) for the data, to be passed into set_attributes
+
+##############################################################################
+
+attribute_table <- function(data, definitions){
+  
+  print('Manually examine measurement scales')
+  print('Manually examine number types')
+  print('Manually check for custom units')
+  
+  attributeTable <- data.frame(
+    attributeName = attribute_names(data),
+    domain = attribute_domains(data),
+    attributeDefinition = attribute_definitions(data, definitions),
+    definition = textDomain_definitions(data),
+    measurementScale = attribute_measurement_scales(data),
+    formatString = attribute_format_strings(data),
+    numberType = attribute_number_type(data),
+    unit = attribute_units(data, definitions),
+    missingValueCode = attribute_missing_value_codes(data),
+    missingValueCodeExplanation = attribute_missing_value_explanations(data),
+    stringsAsFactors = FALSE)
+  
+  return(attributeTable)
+}
+
+##############################################################################
+
+
+# Example:
+
+# attribute_table(data2,definitions)
+# set_attributes(attribute_table(data1,definitions))

From 64cd9326ee72c706c83798693c64708c50b33fbe Mon Sep 17 00:00:00 2001
From: Adam Reevesman <areevesman@gmail.com>
Date: Tue, 3 Apr 2018 12:35:11 -0700
Subject: [PATCH 2/4] spelled out T/F and changed args of functions to paths
 instead of dataframes

---
 R/Ameriflux_data_tables.R | 129 +++++++++++++++++---------------------
 1 file changed, 58 insertions(+), 71 deletions(-)

diff --git a/R/Ameriflux_data_tables.R b/R/Ameriflux_data_tables.R
index bfa5a02..df29dce 100644
--- a/R/Ameriflux_data_tables.R
+++ b/R/Ameriflux_data_tables.R
@@ -30,55 +30,20 @@ library(stringr)
 
 
 definitions <- read.csv('/home/reevesman/Ameriflux/attribute_function/definitions.csv',
-                        stringsAsFactors = F)
+                        stringsAsFactors = FALSE)
 
-data1 <- read.csv('/home/reevesman/Ameriflux/AMF_US-Ivo/AMF_US-Ivo_BASE_HH_2-1.csv',
-                  skip = 2,
-                  stringsAsFactors = F)
-
-data2 <- read.csv('/home/reevesman/Ameriflux/AMF_US-ICt/AMF_US-ICt_BASE_HH_2-1.csv',
-                  skip = 2,
-                  stringsAsFactors = F)
-
-data3 <- read.csv('/home/reevesman/Ameriflux/AMF_US-ICs/AMF_US-ICs_BASE_HH_2-1.csv',
-                  skip = 2,
-                  stringsAsFactors = F)
-
-data4 <- read.csv('/home/reevesman/Ameriflux/AMF_US-ICh/AMF_US-ICh_BASE_HH_2-1.csv',
-                  skip = 2,
-                  stringsAsFactors = F)
-
-data5 <- read.csv('/home/reevesman/Ameriflux/AMF_US-EML/AMF_US-EML_BASE_HH_1-4.csv',
-                  skip = 2,
-                  stringsAsFactors = F)
-
-data6 <- read.csv('/home/reevesman/Ameriflux/AMF_US-Brw/AMF_US-Brw_BASE_HH_2-1.csv',
-                  skip = 2,
-                  stringsAsFactors = F)
-
-data7 <- read.csv('/home/reevesman/Ameriflux/AMF_US-Atq/AMF_US-Atq_BASE_HH_1-1.csv',
-                  skip = 2,
-                  stringsAsFactors = F)
-
-data8 <- read.csv('/home/reevesman/Ameriflux/AMF_CA-Qfo/AMF_CA-Qfo_BASE_HH_1-1.csv',
-                  skip = 2,
-                  stringsAsFactors = F)
-
-data9 <- read.csv('/home/reevesman/Ameriflux/AMF_CA-Ojp/AMF_CA-Ojp_BASE_HH_1-1.csv',
-                  skip = 2,
-                  stringsAsFactors = F)
-
-data10 <- read.csv('/home/reevesman/Ameriflux/AMF_CA-Oas/AMF_CA-Oas_BASE_HH_1-1.csv',
-                   skip = 2,
-                   stringsAsFactors = F)
-
-data11 <- read.csv('/home/reevesman/Ameriflux/AMF_CA-Man/AMF_CA-Man_BASE_HH_2-1.csv',
-                   skip = 2,
-                   stringsAsFactors = F)
-
-data12 <- read.csv('/home/reevesman/Ameriflux/AMF_CA-Gro/AMF_CA-Gro_BASE_HH_1-1.csv',
-                   skip = 2,
-                   stringsAsFactors = F)
+data_path_1 <- '/home/reevesman/Ameriflux/AMF_US-Ivo/AMF_US-Ivo_BASE_HH_2-1.csv'
+data_path_2 <- '/home/reevesman/Ameriflux/AMF_US-ICt/AMF_US-ICt_BASE_HH_2-1.csv'
+data_path_3 <- '/home/reevesman/Ameriflux/AMF_US-ICs/AMF_US-ICs_BASE_HH_2-1.csv'
+data_path_4 <- '/home/reevesman/Ameriflux/AMF_US-ICh/AMF_US-ICh_BASE_HH_2-1.csv'
+data_path_5 <- '/home/reevesman/Ameriflux/AMF_US-EML/AMF_US-EML_BASE_HH_1-4.csv'
+data_path_6 <- '/home/reevesman/Ameriflux/AMF_US-Brw/AMF_US-Brw_BASE_HH_2-1.csv'
+data_path_7 <- '/home/reevesman/Ameriflux/AMF_US-Atq/AMF_US-Atq_BASE_HH_1-1.csv'
+data_path_8 <- '/home/reevesman/Ameriflux/AMF_CA-Qfo/AMF_CA-Qfo_BASE_HH_1-1.csv'
+data_path_9 <- '/home/reevesman/Ameriflux/AMF_CA-Ojp/AMF_CA-Ojp_BASE_HH_1-1.csv'
+data_path_10 <- '/home/reevesman/Ameriflux/AMF_CA-Oas/AMF_CA-Oas_BASE_HH_1-1.csv'
+data_path_11 <- '/home/reevesman/Ameriflux/AMF_CA-Man/AMF_CA-Man_BASE_HH_2-1.csv'
+data_path_12 <- '/home/reevesman/Ameriflux/AMF_CA-Gro/AMF_CA-Gro_BASE_HH_1-1.csv'
 
 
 ##############################################################################
@@ -91,7 +56,9 @@ data12 <- read.csv('/home/reevesman/Ameriflux/AMF_CA-Gro/AMF_CA-Gro_BASE_HH_1-1.
 
 ##############################################################################
 
-attribute_definitions <- function(data, definitions){
+attribute_definitions <- function(data_path, definitions){
+  
+  data <- read.csv(data_path, skip = 2, stringsAsFactors = FALSE)
   
   attributes <- colnames(data)
   
@@ -211,7 +178,9 @@ attribute_definitions <- function(data, definitions){
 
 ##############################################################################
 
-attribute_units <- function(data, definitions){
+attribute_units <- function(data_path, definitions){
+  
+  data <- read.csv(data_path, skip = 2, stringsAsFactors = FALSE)
   
   attributes <- colnames(data)
   
@@ -290,7 +259,9 @@ attribute_units <- function(data, definitions){
 
 ##############################################################################
 
-attribute_names <- function(data){
+attribute_names <- function(data_path){
+  
+  data <- read.csv(data_path, skip = 2, stringsAsFactors = FALSE)
   
   return(colnames(data))
 }
@@ -309,7 +280,9 @@ attribute_names <- function(data){
 
 ##############################################################################
 
-attribute_domains <- function(data){
+attribute_domains <- function(data_path){
+  
+  data <- read.csv(data_path, skip = 2, stringsAsFactors = FALSE)
   
   attributes <- colnames(data)
   domains <- vector(mode = 'character', length = length(attributes))
@@ -341,7 +314,9 @@ attribute_domains <- function(data){
 
 ##############################################################################
 
-textDomain_definitions <- function(data){
+textDomain_definitions <- function(data_path){
+  
+  data <- read.csv(data_path, skip = 2, stringsAsFactors = FALSE)
   
   attributes <- colnames(data)
   #no textDomains in any of the data
@@ -364,7 +339,9 @@ textDomain_definitions <- function(data){
 
 ##############################################################################
 
-attribute_measurement_scales <- function(data){
+attribute_measurement_scales <- function(data_path){
+  
+  data <- read.csv(data_path, skip = 2, stringsAsFactors = FALSE)
   
   attributes <- colnames(data)
   scales <- vector(mode = 'character', length = length(attributes))
@@ -398,7 +375,9 @@ attribute_measurement_scales <- function(data){
 
 ##############################################################################
 
-attribute_format_strings <- function(data){
+attribute_format_strings <- function(data_path){
+  
+  data <- read.csv(data_path, skip = 2, stringsAsFactors = FALSE)
   
   attributes <- colnames(data)
   formats <- vector(mode = 'character', length = length(attributes))
@@ -429,7 +408,9 @@ attribute_format_strings <- function(data){
 
 ##############################################################################
 
-attribute_number_type <- function(data){
+attribute_number_type <- function(data_path){
+  
+  data <- read.csv(data_path, skip = 2, stringsAsFactors = FALSE)
   
   attributes <- colnames(data)
   types <- vector(mode = 'character', length = length(attributes))
@@ -464,7 +445,9 @@ attribute_number_type <- function(data){
 
 ##############################################################################
 
-attribute_missing_value_codes <- function(data){
+attribute_missing_value_codes <- function(data_path){
+  
+  data <- read.csv(data_path, skip = 2, stringsAsFactors = FALSE)
   
   attributes <- colnames(data)
   codes <- rep(NA, times = length(attributes))
@@ -496,7 +479,9 @@ attribute_missing_value_codes <- function(data){
 
 ##############################################################################
 
-attribute_missing_value_explanations <- function(data){
+attribute_missing_value_explanations <- function(data_path){
+  
+  data <- read.csv(data_path, skip = 2, stringsAsFactors = FALSE)
   
   attributes <- colnames(data)
   explanations <- rep(NA, times = length(attributes))
@@ -531,23 +516,25 @@ attribute_missing_value_explanations <- function(data){
 
 ##############################################################################
 
-attribute_table <- function(data, definitions){
+attribute_table <- function(data_path, definitions){
+  
+  data <- read.csv(data_path, skip = 2, stringsAsFactors = FALSE)
   
   print('Manually examine measurement scales')
   print('Manually examine number types')
   print('Manually check for custom units')
   
   attributeTable <- data.frame(
-    attributeName = attribute_names(data),
-    domain = attribute_domains(data),
-    attributeDefinition = attribute_definitions(data, definitions),
-    definition = textDomain_definitions(data),
-    measurementScale = attribute_measurement_scales(data),
-    formatString = attribute_format_strings(data),
-    numberType = attribute_number_type(data),
-    unit = attribute_units(data, definitions),
-    missingValueCode = attribute_missing_value_codes(data),
-    missingValueCodeExplanation = attribute_missing_value_explanations(data),
+    attributeName = attribute_names(data_path),
+    domain = attribute_domains(data_path),
+    attributeDefinition = attribute_definitions(data_path, definitions),
+    definition = textDomain_definitions(data_path),
+    measurementScale = attribute_measurement_scales(data_path),
+    formatString = attribute_format_strings(data_path),
+    numberType = attribute_number_type(data_path),
+    unit = attribute_units(data_path, definitions),
+    missingValueCode = attribute_missing_value_codes(data_path),
+    missingValueCodeExplanation = attribute_missing_value_explanations(data_path),
     stringsAsFactors = FALSE)
   
   return(attributeTable)
@@ -558,5 +545,5 @@ attribute_table <- function(data, definitions){
 
 # Example:
 
-# attribute_table(data2,definitions)
-# set_attributes(attribute_table(data1,definitions))
+attribute_table(data_path_2,definitions)
+EML::set_attributes(attribute_table(data_path_2,definitions))

From 63d05cd709619af6c38e47941aafdb183e7850ba Mon Sep 17 00:00:00 2001
From: Adam Reevesman <areevesman@gmail.com>
Date: Tue, 3 Apr 2018 16:17:15 -0700
Subject: [PATCH 3/4] simplified attribute_definition function, made it more
 scale-able

---
 R/Ameriflux_data_tables.R | 72 +++++++++++++++++----------------------
 1 file changed, 31 insertions(+), 41 deletions(-)

diff --git a/R/Ameriflux_data_tables.R b/R/Ameriflux_data_tables.R
index df29dce..230828b 100644
--- a/R/Ameriflux_data_tables.R
+++ b/R/Ameriflux_data_tables.R
@@ -60,6 +60,11 @@ attribute_definitions <- function(data_path, definitions){
   
   data <- read.csv(data_path, skip = 2, stringsAsFactors = FALSE)
   
+  #potential qualifiers
+  int_qualifiers <- c('_1','_2','_3','_4','_5','_6','_7','_8','_9')
+  char_qualifiers <- c('_PI','_QC','_IU','_SD','_F','_R','_A')
+  qualifiers <- c(int_qualifiers, char_qualifiers)
+  
   attributes <- colnames(data)
   
   #will be vector of attribute definitions
@@ -71,47 +76,32 @@ attribute_definitions <- function(data_path, definitions){
   for (att in attributes){
     
     #if the attribute has one of the attached qualifiers
-    #get definition of qualifier to paste onto regular definition later
+    #figure out which qualifier it is
     #set att to the regular attribute name (w/out qualifier)
+    #if it is an int_qualifier, set it to '_#' so its definition can be found in "definitions"
+    #get extra definition of qualifier to paste onto regular definition later
     #set a flag that this attribute includes a qualifier
     
-    #did not do all of the possible combinations of qualifiers
-    #because not all possible combinations appeared in data
-    
-    if (str_sub(att,-5,-1) == '_PI_F'){
-      x <- str_sub(att,-5,-1)
-      extra <- paste(definitions[which(definitions$uniqueAttributeLabel == '_PI'), 'uniqueAttributeDefinition'],
-                     definitions[which(definitions$uniqueAttributeLabel == '_F'), 'uniqueAttributeDefinition'],
-                     sep = '; ')
-      str_sub(att,-5,-1) <- ""
-      QUALIFIERS_EXIST <- TRUE
-    }
-    
-    else if (str_sub(att,-3,-1) %in% c('_PI','_QC','_IU','_SD')){
-      x <- str_sub(att,-3,-1)
-      extra <- definitions[which(definitions$uniqueAttributeLabel == x), 'uniqueAttributeDefinition']
-      str_sub(att,-3,-1) <- ""
-      QUALIFIERS_EXIST <- TRUE
-    }
-    
-    else if (str_sub(att,-2,-1) == '_F'){
-      x <- '_F'
-      extra <- definitions[which(definitions$uniqueAttributeLabel == '_F'), 'uniqueAttributeDefinition']
-      str_sub(att,-2,-1) <- ""
-      QUALIFIERS_EXIST <- TRUE
-    }
-    
-    else if (str_sub(att,-2,-1) %in% c('_1','_2','_3','_4','_5','_6','_7','_8','_9')){
-      x <- '_#'
-      extra <- definitions[which(definitions$uniqueAttributeLabel == '_#'), 'uniqueAttributeDefinition']
-      str_sub(att,-2,-1) <- ""
-      QUALIFIERS_EXIST <- TRUE
-    }
-    
-    else if (str_sub(att,-2,-1) %in% c('_R','_A')){
-      x <- paste('_H_V', str_sub(att,-2,-1), sep = '')
-      extra <- definitions[which(definitions$uniqueAttributeLabel == x), 'uniqueAttributeDefinition']
-      str_sub(att,-6,-1) <- ""
+    if (sum(sapply(qualifiers, grepl, x = att)) > 0){
+      
+      qualifier_indeces <- which(sapply(qualifiers, grepl, x = att) == TRUE)
+      att_qualifiers <- qualifiers[qualifier_indeces]
+      
+      for (j in 1:length(att_qualifiers)){
+        att <- gsub(att_qualifiers[j], replacement = '', x = att)
+      }
+      
+      if (att_qualifiers[1] %in% int_qualifiers){
+        att_qualifiers <- '_#'
+      }
+      
+      #using paste in case there are multiple qualifiers
+      #in reality, only needed for _PI_F
+      #same reason for using for loop below (length(att_qualifiers) should be 1 or 2)
+      extra <- paste(definitions[which(definitions$uniqueAttributeLabel %in% att_qualifiers), 
+                                 'uniqueAttributeDefinition'],
+                     collapse = '; ')
+      
       QUALIFIERS_EXIST <- TRUE
     }
     
@@ -135,7 +125,7 @@ attribute_definitions <- function(data_path, definitions){
       #def is the corresponding definition for i-th attribute
       def <- paste(definitions[which(definitions$uniqueAttributeLabel == att),'uniqueAttributeDefinition'],
                    '; With qualifier \'',
-                   x,
+                   paste(att_qualifiers, collapse = ''),
                    '\' ',
                    extra,
                    sep = '')
@@ -545,5 +535,5 @@ attribute_table <- function(data_path, definitions){
 
 # Example:
 
-attribute_table(data_path_2,definitions)
-EML::set_attributes(attribute_table(data_path_2,definitions))
+# attribute_table(data_path_2,definitions)
+# EML::set_attributes(attribute_table(data_path_2,definitions))

From 88ebd8ded016b138748af09a50274b6b1d65194f Mon Sep 17 00:00:00 2001
From: Adam Reevesman <areevesman@gmail.com>
Date: Wed, 4 Apr 2018 13:29:33 -0700
Subject: [PATCH 4/4] simplified attribute_units function, got rid of need for
 stringr library in script

---
 R/Ameriflux_data_tables.R | 34 ++++++++++++++--------------------
 1 file changed, 14 insertions(+), 20 deletions(-)

diff --git a/R/Ameriflux_data_tables.R b/R/Ameriflux_data_tables.R
index 230828b..7b30a1c 100644
--- a/R/Ameriflux_data_tables.R
+++ b/R/Ameriflux_data_tables.R
@@ -26,8 +26,6 @@
 
 ##############################################################################
 
-library(stringr)
-
 
 definitions <- read.csv('/home/reevesman/Ameriflux/attribute_function/definitions.csv',
                         stringsAsFactors = FALSE)
@@ -172,6 +170,11 @@ attribute_units <- function(data_path, definitions){
   
   data <- read.csv(data_path, skip = 2, stringsAsFactors = FALSE)
   
+  #potential qualifiers
+  int_qualifiers <- c('_1','_2','_3','_4','_5','_6','_7','_8','_9')
+  char_qualifiers <- c('_PI','_QC','_IU','_SD','_F','_R','_A')
+  qualifiers <- c(int_qualifiers, char_qualifiers)
+  
   attributes <- colnames(data)
   
   #will be vector of attribute definitions
@@ -187,24 +190,14 @@ attribute_units <- function(data_path, definitions){
     #did not do all of the possible combinations of qualifiers
     #because not all possible combinations appeared in data
     
-    if (str_sub(att,-5,-1) == '_PI_F'){
-      str_sub(att,-5,-1) <- ""
-    }
-    
-    else if (str_sub(att,-3,-1) %in% c('_PI','_QC','_IU','_SD')){
-      str_sub(att,-3,-1) <- ""
-    }
-    
-    else if (str_sub(att,-2,-1) == '_F'){
-      str_sub(att,-2,-1) <- ""
-    }
-    
-    else if (str_sub(att,-2,-1) %in% c('_1','_2','_3','_4','_5','_6','_7','_8','_9')){
-      str_sub(att,-2,-1) <- ""
-    }
-    
-    else if (str_sub(att,-2,-1) %in% c('_R','_A')){
-      str_sub(att,-6,-1) <- ""
+    if (sum(sapply(qualifiers, grepl, x = att)) > 0){
+      
+      qualifier_indeces <- which(sapply(qualifiers, grepl, x = att) == TRUE)
+      att_qualifiers <- qualifiers[qualifier_indeces]
+      
+      for (j in 1:length(att_qualifiers)){
+        att <- gsub(att_qualifiers[j], replacement = '', x = att)
+      }
     }
     
     #if the attribute (with potential qualifiers removed) 
@@ -234,6 +227,7 @@ attribute_units <- function(data_path, definitions){
   return(units)
 }
 
+
 ##############################################################################