Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 105 additions & 0 deletions R/Sharis/Ameriflux_functions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#' Generate Attributes table
#'
#' @description This function creates a complete attributes table by combining information gathered from
#' the csv data file itself and a csv of preliminary attribute information. This function was written
#' specifically to work with Ameriflux data and the information that was provided along with it.
#'
#'
#' @param csv_file_path (character) Path to the CSV file of data
#' @param attributes_file_path (character) Path to CSV file of preliminary attribute information


generate_attributes_table <- function(csv_file_path,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you'd like you can document what this function does. Check out the multiline comments (#' is the multiline comment symbol in R) at the top of this function: https://github.com/NCEAS/datamgmt/blob/master/R/guess_member_node.R

attributes_file_path) {
# Check that files exist
stopifnot(file.exists(csv_file_path))
stopifnot(file.exists(attributes_file_path))

# Read in files
data <- read.csv(csv_file_path, stringsAsFactors = FALSE, skip = 2)
n <- dim(data)[2]
attributes <- try(read.csv(attributes_file_path, stringsAsFactors = FALSE))
colnames(attributes) <- ("category", "label", "definition", "unit", "SI_unit")

# Initialize data frame
att_table <- data.frame(attributeName = rep("NA", n),
attributeDefinition = rep("NA", n),
measurementScale = rep("NA", n),
domain = rep("NA", n),
formatString = rep("NA", n),
definition = rep("NA", n),
unit = rep("NA", n),
numberType = rep("NA", n),
missingValueCode = rep("NA", n),
missingValueCodeExplanation = rep("NA", n),
stringsAsFactors = F)

qualifiers<- c("_PI", "_QC", "_F", "_IU", "_H_V_R", "_H_V_A", "_1", "_2", "_3", "_4", "_5", "_6", "_7", "_8", "_9", "_SD", "_N")
num_qualifiers<- c("_1", "_2", "_3", "_4", "_5", "_6", "_7", "_8", "_9")

# add attribute name
att_table$attributeName <- colnames(data)
col_names <- colnames(data)

for (i in seq_len(n)) {

## check if the name has a qualifier at the end
if (any(endsWith(col_names[i], suffix = qualifiers))) {
# identify the qualifier
current_qual <- which(endsWith(col_names[i], suffix = qualifiers))
qualifier<- qualifiers[current_qual]
len<- nchar(qualifier)
main_label<- substr(col_names[i], 1, nchar(col_names[i])-len)

# get definition for main label
main_def <- attributes$definition[attributes$label == main_label]

# get definition for qualifier label, special case if it is a number
if (qualifier %in% num_qualifiers){
qual_def <- attributes$definition[attributes$label == "_#"]
} else{
qual_def <- attributes$definition[attributes$label == qualifier]
}

# concatenate the definitions
att_table$attributeDefinition[i] <- paste(main_def, ". ", qual_def)

# check if it is a time variable
if (grepl("TIME", main_label)){
att_table$measurementScale[i] <- "dateTime"
att_table$domain[i] <- "dateTimeDomain"
att_table$formatString[i] <- "YYYYMMDDHHMM"
att_table$unit[i] <- "NA"
} else {
att_table$measurementScale[i] <- "ratio"
att_table$domain[i] <- "numericDomain"
att_table$numberType[i] <- "real"
att_table$unit[i] <- attributes$SI_unit[attributes$label == main_label]
att_table$missingValueCode[i] <- "-9999"
att_table$missingValueCodeExplanation[i] <- "Missing values are represented as -9999"
}

# case if there is no qualifier
} else {
att_table$attributeDefinition[i] <- attributes$definition[attributes$label == col_names[i]]
# check if it is a time variable
if (grepl("TIME", col_names[i])){
att_table$measurementScale[i] <- "dateTime"
att_table$domain[i] <- "dateTimeDomain"
att_table$formatString[i] <- "YYYYMMDDHHMM"
att_table$unit[i] <- "NA"
} else {
att_table$measurementScale[i] <- "ratio"
att_table$domain[i] <- "numericDomain"
att_table$numberType[i] <- "real"
att_table$unit[i] <- attributes$SI_unit[attributes$label == col_names[i]]
att_table$missingValueCode[i] <- "-9999"
att_table$missingValueCodeExplanation[i] <- "Missing values are represented as -9999"
}
}

}

return(att_table)

}
170 changes: 170 additions & 0 deletions R/Sharis/code_chunks.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
---
title: "Code Chunks"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

## Updating Coverage

If you want to add to the current coverage without having to restate all the other coverage elements using `set_coverage`, these chunks allow you to just add taxonomic coverage and single date temporal coverage.

### Adding taxonomic coverage

```{r, eval = FALSE}
# add each new element as a tax object
tax1 <- new("taxonomicClassification",
taxonRankName = new("taxonRankName", "Species"),
taxonRankValue = new("taxonRankValue", "Calamagrostis deschampsioides"))

tax2 <- new("taxonomicClassification",
taxonRankName = new("taxonRankName", "Species"),
taxonRankValue = new("taxonRankValue", "Carex aquatilis"))

# combine all tax elements into taxonomic coverage object
taxcov <- new("taxonomicCoverage",
taxonomicClassification = c(tax1, tax2))

eml@dataset@coverage@taxonomicCoverage <- c(taxcov)

```

### Adding single date temporal coverage

```{r, eval = FALSE}
date <- new("singleDateTime",
calendarDate = "2011-09-15")

tempcov1 <- new("temporalCoverage",
singleDateTime = date)

eml@dataset@coverage@temporalCoverage <- c(tempcov1)
```

## Data Objects
### Adding data tables for a whole folder of files with the same attributes
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This section is so awesome! I just had to do this but forgot you wrote it up until I mostly finished. In any case, interesting to see slightly different ways of doing the same thing!


```{r, eval = FALSE}
# set path to data folder
data_path <- '/home/visitor/…”'

# list the files. recursive = TRUE will get all files even if they are in folders in folders
paths <- list.files(data_path, full.names=TRUE, recursive = TRUE)

# generate pids. replace format_id with the one that applies
new_pids1 <- sapply(paths, function(path) {publish_object(mn, path, format_id = "text/plain")})

attributes1 <- read.csv("/home/me/file_attributes.csv", stringsAsFactors = FALSE, na.strings = c(''))
attributeList1 <- set_attributes(attributes1)

# assign attributes to data tables
eml <- eml_add_entities(eml, data.frame(type = "dataTable",
path = paths,
pid = new_pids1,
format_id = "text/plain"))
eml@dataset@dataTable[[1]]@attributeList <- attributeList1
eml@dataset@dataTable[[1]]@attributeList@id <- new("xml_attribute", "shared_attributes1")

#run a loop over all the data pids to add data tables with descriptions
for (i in 2:length(eml@dataset@dataTable)) {
eml@dataset@dataTable[[i]]@attributeList@references <- new("references", "shared_attributes1")
eml@dataset@dataTable[[i]]@entityDescription <- new("entityDescription", .Data = "entity description")
}

```

## System Metadata
### Obsolescence chain

This chunk is to obsolete one dataset. If there are more to add to the chain, more steps can be added, carefully making sure to fill in `obsoletes` and `obsoletedBy` slots for each one.

```{r, eval = FALSE}
# get oldest version of the file you want to be visible. Use get_all_versions and look at the first.
# urn:uuid:...

# PID for data set to be hidden: doi:10…

# adding data set to hide in the slot before the first version of the visible data set
gsmOG <- getSystemMetadata(mn, "urn:uuid:...")
gsmOG@obsoletes <- "doi:10…"
updateSystemMetadata(mn, "urn:uuid:...", gsmOG)

# adding first version to obsolescence chain after hidden version
gsmObs1 <- getSystemMetadata(mn, "doi:10…")
gsmObs1@obsoletedBy <- "urn:uuid:..."
updateSystemMetadata(mn, "doi:10…", gsmObs1)

```

### Set rights and access

This chunk sets rights and access for metadata, resource map, and all data objects in a package

```{r, eval = FALSE}
## Fix rights holder and access
PI_name <- "http://orcid.org/…."
project_pid <- "resource_map_doi:10…."
project_package <- get_package(mn, project_pid)
set_rights_and_access(mn, c(project_package$metadata, project_package$resource_map, project_package$data), PI_name, c("read", "write", "changePermission"))

```

## Miscellaneous

### Adding sampling info to methods section

```{r, eval = FALSE}
step1 <- new('methodStep',
description = "methods text")

stEx <- new("studyExtent",
description = "study extent description")

samp <- new("sampling",
studyExtent = stEx,
samplingDescription = "sampling description text")

methods1 <- new("methods",
methodStep = c(step1),
sampling = samp)
eml@dataset@methods <- methods1
```

### Adding a pre generated identifier to the eml

When you pre generate a UUID or DOI, the change is not automatically reflected in the "Identifier" section of the eml so this makes sure that the eml lines up with the identifier being used.

```{r, eval = FALSE}
## Generate DOI and add to EML
doiPid <- generateIdentifier(mn, "DOI")
eml@packageId <- new("xml_attribute",
.Data = doiPid)

```

### Dealing with netCDFs

This section is for dealing with netCDF (.nc) files. These files require data tables but since they can not be simply opened on the computer using a text editor or Excel, you can use Panoply to explore them or these R commands:


```{r}
library(arcticdatautils)
library(ncdf4)
filepath <- '/home/sharisnochs/Petterson/ICECAPS_precip_product_2010_2015.nc'

# gets attribute info
atts <- get_ncdf4_attributes(filepath)
# preview of View(atts)
atts[1:5,]


# returns the actual values for a specified attribute
t <- nc_open(filepath)
test <- ncvar_get(t, 't2m')
# preview of View(test)
test[1:10]
```

The `formatId` in the sysmeta will most likely be `netCDF-4`.