From 20e6b873d3532eae58160db4fde55f235db620f7 Mon Sep 17 00:00:00 2001 From: Sharis Ochs Date: Fri, 16 Mar 2018 14:12:55 -0700 Subject: [PATCH 1/4] Scripts for review --- R/Sharis/Ameriflux_functions.R | 92 ++++++++++++++++++ R/Sharis/code_chunks.Rmd | 170 +++++++++++++++++++++++++++++++++ 2 files changed, 262 insertions(+) create mode 100644 R/Sharis/Ameriflux_functions.R create mode 100644 R/Sharis/code_chunks.Rmd diff --git a/R/Sharis/Ameriflux_functions.R b/R/Sharis/Ameriflux_functions.R new file mode 100644 index 0000000..05076ca --- /dev/null +++ b/R/Sharis/Ameriflux_functions.R @@ -0,0 +1,92 @@ +generate_attributes_table <- function(csv_file_path, + attributes_file_path) { + # Check that files exist + stopifnot(file.exists(csv_file_path)) + stopifnot(file.exists(attributes_file_path)) + + # Read in files + data <- read.csv(csv_file_path, stringsAsFactors = FALSE, skip = 2) + n <- dim(data)[2] + attributes <- try(read.csv(attributes_file_path, stringsAsFactors = FALSE)) + + + # Initialize data frame + table <- data.frame(attributeName = rep("NA", n), + attributeDefinition = rep("NA", n), + measurementScale = rep("NA", n), + domain = rep("NA", n), + formatString = rep("NA", n), + definition = rep("NA", n), + unit = rep("NA", n), + numberType = rep("NA", n), + missingValueCode = rep("NA", n), + missingValueCodeExplanation = rep("NA", n), + stringsAsFactors = F) + + qualifiers<- c("_PI", "_QC", "_F", "_IU", "_H_V_R", "_H_V_A", "_1", "_2", "_3", "_4", "_5", "_6", "_7", "_8", "_9", "_SD", "_N") + num_qualifiers<- c("_1", "_2", "_3", "_4", "_5", "_6", "_7", "_8", "_9") + + for (i in seq_len(n)) { + # add attribute name + table$attributeName[i] = colnames(data)[i] + + ## check if the name has a qualifier at the end + if (any(endsWith(colnames(data)[i], suffix = qualifiers))) { + # identify the qualifier + current_qual <- which(endsWith(colnames(data)[i], suffix = qualifiers)) + qualifier<- qualifiers[current_qual] + len<- nchar(qualifier) + main_label<- substr(colnames(data)[i], 1, nchar(colnames(data)[i])-len) + + # get definition for main label + main_def <- attributes$uniqueAttributeDefinition[attributes$uniqueAttributeLabel == main_label] + + # get definition for qualifier label, special case if it is a number + if (qualifier %in% num_qualifiers){ + qual_def <- attributes$uniqueAttributeDefinition[attributes$uniqueAttributeLabel == "_#"] + } else{ + qual_def <- attributes$uniqueAttributeDefinition[attributes$uniqueAttributeLabel == qualifier] + } + + # concatenate the definitions + table$attributeDefinition[i] = paste(main_def, ". ", qual_def) + + # check if it is a time variable + if (grepl("TIME", main_label)){ + table$measurementScale[i] = "dateTime" + table$domain[i] = "dateTimeDomain" + table$formatString[i] = "YYYYMMDDHHMM" + table$unit[i] = "NA" + } else { + table$measurementScale[i] = "ratio" + table$domain[i] = "numericDomain" + table$numberType[i] <- "real" + table$unit[i] = attributes$SI_unit[attributes$uniqueAttributeLabel == main_label] + table$missingValueCode[i] = "-9999" + table$missingValueCodeExplanation[i] = "Missing values are represented as -9999" + } + + # case if there is no qualifier + } else { + table$attributeDefinition[i] = attributes$uniqueAttributeDefinition[attributes$uniqueAttributeLabel == colnames(data)[i]] + # check if it is a time variable + if (grepl("TIME", colnames(data)[i])){ + table$measurementScale[i] = "dateTime" + table$domain[i] = "dateTimeDomain" + table$formatString[i] = "YYYYMMDDHHMM" + table$unit[i] = "NA" + } else { + table$measurementScale[i] = "ratio" + table$domain[i] = "numericDomain" + table$numberType[i] <- "real" + table$unit[i] = attributes$SI_unit[attributes$uniqueAttributeLabel == colnames(data)[i]] + table$missingValueCode[i] = "-9999" + table$missingValueCodeExplanation[i] = "Missing values are represented as -9999" + } + } + + } + + return(table) + +} diff --git a/R/Sharis/code_chunks.Rmd b/R/Sharis/code_chunks.Rmd new file mode 100644 index 0000000..6599988 --- /dev/null +++ b/R/Sharis/code_chunks.Rmd @@ -0,0 +1,170 @@ +--- +title: "Code Chunks" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +## Updating Coverage + +If you want to add to the current coverage without having to restate all the other coverage elements using `set_coverage`, these chunks allow you to just add taxonomic coverage and single date temporal coverage. + +### Adding taxonomic coverage + +```{r, eval = FALSE} +# add each new element as a tax object +tax1 <- new("taxonomicClassification", + taxonRankName = new("taxonRankName", "Species"), + taxonRankValue = new("taxonRankValue", "Calamagrostis deschampsioides")) + +tax2 <- new("taxonomicClassification", + taxonRankName = new("taxonRankName", "Species"), + taxonRankValue = new("taxonRankValue", "Carex aquatilis")) + +# combine all tax elements into taxonomic coverage object +taxcov <- new("taxonomicCoverage", + taxonomicClassification = c(tax1, tax2)) + +eml@dataset@coverage@taxonomicCoverage <- c(taxcov) + +``` + +### Adding single date temporal coverage + +```{r, eval = FALSE} +date <- new("singleDateTime", + calendarDate = "2011-09-15") + +tempcov1 <- new("temporalCoverage", + singleDateTime = date) + +eml@dataset@coverage@temporalCoverage <- c(tempcov1) +``` + +## Data Objects +### Adding data tables for a whole folder of files with the same attributes + +```{r, eval = FALSE} +# set path to data folder +data_path <- '/home/visitor/…”' + +# list the files. recursive = TRUE will get all files even if they are in folders in folders +paths <- list.files(data_path, full.names=TRUE, recursive = TRUE) + +# generate pids. replace format_id with the one that applies +new_pids1 <- sapply(paths, function(path) {publish_object(mn, path, format_id = "text/plain")}) + +attributes1 <- read.csv("/home/me/file_attributes.csv", stringsAsFactors = FALSE, na.strings = c('')) +attributeList1 <- set_attributes(attributes1) + +# assign attributes to data tables +eml <- eml_add_entities(eml, data.frame(type = "dataTable", + path = paths, + pid = new_pids1, + format_id = "text/plain")) +eml@dataset@dataTable[[1]]@attributeList <- attributeList1 +eml@dataset@dataTable[[1]]@attributeList@id <- new("xml_attribute", "shared_attributes1") + +#run a loop over all the data pids to add data tables with descriptions +for (i in 2:length(eml@dataset@dataTable)) { + eml@dataset@dataTable[[i]]@attributeList@references <- new("references", "shared_attributes1") + eml@dataset@dataTable[[i]]@entityDescription <- new("entityDescription", .Data = "entity description") +} + +``` + +## System Metadata +### Obsolescence chain + +This chunk is to obsolete one dataset. If there are more to add to the chain, more steps can be added, carefully making sure to fill in `obsoletes` and `obsoletedBy` slots for each one. + +```{r, eval = FALSE} +# get oldest version of the file you want to be visible. Use get_all_versions and look at the first. +# urn:uuid:... + +# PID for data set to be hidden: doi:10… + +# adding data set to hide in the slot before the first version of the visible data set +gsmOG <- getSystemMetadata(mn, "urn:uuid:...") +gsmOG@obsoletes <- "doi:10…" +updateSystemMetadata(mn, "urn:uuid:...", gsmOG) + +# adding first version to obsolescence chain after hidden version +gsmObs1 <- getSystemMetadata(mn, "doi:10…") +gsmObs1@obsoletedBy <- "urn:uuid:..." +updateSystemMetadata(mn, "doi:10…", gsmObs1) + +``` + +### Set rights and access + +This chunk sets rights and access for metadata, resource map, and all data objects in a package + +```{r, eval = FALSE} +## Fix rights holder and access +PI_name <- "http://orcid.org/…." +project_pid <- "resource_map_doi:10…." +project_package <- get_package(mn, project_pid) +set_rights_and_access(mn, c(project_package$metadata, project_package$resource_map, project_package$data), PI_name, c("read", "write", "changePermission")) + +``` + +## Miscellaneous + +### Adding sampling info to methods section + +```{r, eval = FALSE} +step1 <- new('methodStep', + description = "methods text") + +stEx <- new("studyExtent", + description = "study extent description") + +samp <- new("sampling", + studyExtent = stEx, + samplingDescription = "sampling description text") + +methods1 <- new("methods", + methodStep = c(step1), + sampling = samp) +eml@dataset@methods <- methods1 +``` + +### Adding a pre generated identifier to the eml + +When you pre generate a UUID or DOI, the change is not automatically reflected in the "Identifier" section of the eml so this makes sure that the eml lines up with the identifier being used. + +```{r, eval = FALSE} +## Generate DOI and add to EML +doiPid <- generateIdentifier(mn, "DOI") +eml@packageId <- new("xml_attribute", + .Data = doiPid) + +``` + +### Dealing with netCDFs + +This section is for dealing with netCDF (.nc) files. These files require data tables but since they can not be simply opened on the computer using a text editor or Excel, you can use Panoply to explore them or these R commands: + + +```{r} +library(arcticdatautils) +library(ncdf4) +filepath <- '/home/sharisnochs/Petterson/ICECAPS_precip_product_2010_2015.nc' + +# gets attribute info +atts <- get_ncdf4_attributes(filepath) +# preview of View(atts) +atts[1:5,] + + +# returns the actual values for a specified attribute +t <- nc_open(filepath) +test <- ncvar_get(t, 't2m') +# preview of View(test) +test[1:10] +``` + +The `formatId` in the sysmeta will most likely be `netCDF-4`. \ No newline at end of file From 413a36f3430108c093f806cb614f06c94881e189 Mon Sep 17 00:00:00 2001 From: sharisochs <36999120+sharisochs@users.noreply.github.com> Date: Wed, 11 Apr 2018 11:12:16 -0700 Subject: [PATCH 2/4] Update Ameriflux_functions.R --- R/Sharis/Ameriflux_functions.R | 81 ++++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 34 deletions(-) diff --git a/R/Sharis/Ameriflux_functions.R b/R/Sharis/Ameriflux_functions.R index 05076ca..c62d6f9 100644 --- a/R/Sharis/Ameriflux_functions.R +++ b/R/Sharis/Ameriflux_functions.R @@ -1,3 +1,14 @@ +#' Generate Attributes table +#' +#' @description This function creates a complete attributes table by combining information gathered from +#' the csv data file itself and a csv of preliminary attribute information. This function was written +#' specifically to work with Ameriflux data and the information that was provided along with it. +#' +#' +#' @param csv_file_path (character) Path to the CSV file of data +#' @param attributes_file_path (character) Path to CSV file of preliminary attribute information + + generate_attributes_table <- function(csv_file_path, attributes_file_path) { # Check that files exist @@ -8,10 +19,10 @@ generate_attributes_table <- function(csv_file_path, data <- read.csv(csv_file_path, stringsAsFactors = FALSE, skip = 2) n <- dim(data)[2] attributes <- try(read.csv(attributes_file_path, stringsAsFactors = FALSE)) - + colnames(attributes) <- ("category", "label", "definition", "unit", "SI_unit") # Initialize data frame - table <- data.frame(attributeName = rep("NA", n), + att_table <- data.frame(attributeName = rep("NA", n), attributeDefinition = rep("NA", n), measurementScale = rep("NA", n), domain = rep("NA", n), @@ -26,67 +37,69 @@ generate_attributes_table <- function(csv_file_path, qualifiers<- c("_PI", "_QC", "_F", "_IU", "_H_V_R", "_H_V_A", "_1", "_2", "_3", "_4", "_5", "_6", "_7", "_8", "_9", "_SD", "_N") num_qualifiers<- c("_1", "_2", "_3", "_4", "_5", "_6", "_7", "_8", "_9") + # add attribute name + att_table$attributeName <- colnames(data) + col_names <- colnames(data) + for (i in seq_len(n)) { - # add attribute name - table$attributeName[i] = colnames(data)[i] ## check if the name has a qualifier at the end - if (any(endsWith(colnames(data)[i], suffix = qualifiers))) { + if (any(endsWith(col_names[i], suffix = qualifiers))) { # identify the qualifier - current_qual <- which(endsWith(colnames(data)[i], suffix = qualifiers)) + current_qual <- which(endsWith(col_names[i], suffix = qualifiers)) qualifier<- qualifiers[current_qual] len<- nchar(qualifier) - main_label<- substr(colnames(data)[i], 1, nchar(colnames(data)[i])-len) + main_label<- substr(col_names[i], 1, nchar(col_names[i])-len) # get definition for main label - main_def <- attributes$uniqueAttributeDefinition[attributes$uniqueAttributeLabel == main_label] + main_def <- attributes$definition[attributes$label == main_label] # get definition for qualifier label, special case if it is a number if (qualifier %in% num_qualifiers){ - qual_def <- attributes$uniqueAttributeDefinition[attributes$uniqueAttributeLabel == "_#"] + qual_def <- attributes$definition[attributes$label == "_#"] } else{ - qual_def <- attributes$uniqueAttributeDefinition[attributes$uniqueAttributeLabel == qualifier] + qual_def <- attributes$definition[attributes$label == qualifier] } # concatenate the definitions - table$attributeDefinition[i] = paste(main_def, ". ", qual_def) + att_table$attributeDefinition[i] <- paste(main_def, ". ", qual_def) # check if it is a time variable if (grepl("TIME", main_label)){ - table$measurementScale[i] = "dateTime" - table$domain[i] = "dateTimeDomain" - table$formatString[i] = "YYYYMMDDHHMM" - table$unit[i] = "NA" + att_table$measurementScale[i] <- "dateTime" + att_table$domain[i] <- "dateTimeDomain" + att_table$formatString[i] <- "YYYYMMDDHHMM" + att_table$unit[i] <- "NA" } else { - table$measurementScale[i] = "ratio" - table$domain[i] = "numericDomain" - table$numberType[i] <- "real" - table$unit[i] = attributes$SI_unit[attributes$uniqueAttributeLabel == main_label] - table$missingValueCode[i] = "-9999" - table$missingValueCodeExplanation[i] = "Missing values are represented as -9999" + att_table$measurementScale[i] <- "ratio" + att_table$domain[i] <- "numericDomain" + att_table$numberType[i] <- "real" + att_table$unit[i] <- attributes$SI_unit[attributes$label == main_label] + att_table$missingValueCode[i] <- "-9999" + att_table$missingValueCodeExplanation[i] <- "Missing values are represented as -9999" } # case if there is no qualifier } else { - table$attributeDefinition[i] = attributes$uniqueAttributeDefinition[attributes$uniqueAttributeLabel == colnames(data)[i]] + att_table$attributeDefinition[i] <- attributes$definition[attributes$label == col_names[i]] # check if it is a time variable - if (grepl("TIME", colnames(data)[i])){ - table$measurementScale[i] = "dateTime" - table$domain[i] = "dateTimeDomain" - table$formatString[i] = "YYYYMMDDHHMM" - table$unit[i] = "NA" + if (grepl("TIME", col_names[i])){ + att_table$measurementScale[i] <- "dateTime" + att_table$domain[i] <- "dateTimeDomain" + att_table$formatString[i] <- "YYYYMMDDHHMM" + att_table$unit[i] <- "NA" } else { - table$measurementScale[i] = "ratio" - table$domain[i] = "numericDomain" - table$numberType[i] <- "real" - table$unit[i] = attributes$SI_unit[attributes$uniqueAttributeLabel == colnames(data)[i]] - table$missingValueCode[i] = "-9999" - table$missingValueCodeExplanation[i] = "Missing values are represented as -9999" + att_table$measurementScale[i] <- "ratio" + att_table$domain[i] <- "numericDomain" + att_table$numberType[i] <- "real" + att_table$unit[i] <- attributes$SI_unit[attributes$label == col_names[i]] + att_table$missingValueCode[i] <- "-9999" + att_table$missingValueCodeExplanation[i] <- "Missing values are represented as -9999" } } } - return(table) + return(att_table) } From 84687f0976ad51cc7b3f27e1517131ee612ecc55 Mon Sep 17 00:00:00 2001 From: Sharis Ochs Date: Mon, 4 Jun 2018 10:42:18 -0700 Subject: [PATCH 3/4] Function to combine get_all_versions, getSystemMetadata, and solr query --- R/Sharis/getSysmetaAll.R | 157 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 R/Sharis/getSysmetaAll.R diff --git a/R/Sharis/getSysmetaAll.R b/R/Sharis/getSysmetaAll.R new file mode 100644 index 0000000..b81a00f --- /dev/null +++ b/R/Sharis/getSysmetaAll.R @@ -0,0 +1,157 @@ +#' Get System Metadata for all versions +#' +#' @description This function combines getSystemMetadata and solr query to produce +#' a data frame with any combination of getSystemMetadata and query fields for all +#' versions of the given PID. If no fields are specified, the resulting data frame +#' will contain all fields possible from getSystemMetadata and solr query. +#' +#' @param node (character) Specify the node where the object should be searched for. +#' @param object_pid (character) PID for the object that you want to return information about. +#' @param list_of_fields (list) List of fields that you want returned in the data frame. +#' "Id" is added automatically. +#' +#' +#' @author Sharis Ochs + +getSystemMetadataAll <- function(node, object_pid, list_of_fields = "all") { + + ## Checks ========================= + # Check that object exist + if (!(arcticdatautils::object_exists(node, object_pid))) { + stop('Object does not exist on specified node ') + } + + # get all solr fields + adc_solr <- httr::GET("https://arcticdata.io/metacat/d1/mn/v2/query/solr") + suppressMessages(suppressWarnings(adc_solr<- adc_solr %>% + stringr::str_extract_all("name>.*<") %>% + unlist() %>% + stringr::str_replace_all("name>|<", ""))) + + # fields in getSystemMetadata but not solr query + gsm_fields <- c("serialVersion", 'accessPolicy', 'preferredNodes', 'blockedNodes', 'archived', 'dateSysMetadataModified', 'originMemberNode', 'authoritativeMemberNode') + + + suppressWarnings(if (list_of_fields != "all"){ + ## Check that all specified fields are valid + for (j in 1:length(list_of_fields)){ + if (!(list_of_fields[j] %in% gsm_fields | list_of_fields[j] %in% adc_solr)){ + stop('Please enter a valid field ') + } + } + }) + + ## Initialize =========================== + # Check how many versions there are + versions<- arcticdatautils::get_all_versions(node, object_pid) + n<- length(versions) + + # Initialize data frame of fields only in getSystemMetadata + sysmeta_table <- data.frame(id = rep("NA", n), + serialVersion = rep("NA", n), + accessPolicy = rep("NA", n), + preferredNodes = rep("NA", n), + blockedNodes = rep("NA", n), + archived = rep("NA", n), + dateSysMetadataModified = rep("NA", n), + originMemberNode = rep("NA", n), + authoritativeMemberNode = rep("NA", n), + stringsAsFactors = F) + + # Case 1 ======================== + # Case: if no fields are specified + suppressWarnings(if (list_of_fields == "all"){ + + # Query all fields + inside_q <- paste(versions, collapse="\"+OR+\"") + q <- paste0("documents:\"", inside_q, "\"") + df_query <- dataone::query(node, list(q=q, + fl= "*", + rows="100"), + as = "data.frame") + + # Then getSystemMetadata + for (i in 1:n){ + sysmeta<- dataone::getSystemMetadata(node, versions[i]) + sysmeta_table$id[i] <- sysmeta@identifier + sysmeta_table$serialVersion[i] <- sysmeta@serialVersion + sysmeta_table$accessPolicy[i] <- I(list((sysmeta@accessPolicy))) + if (length(sysmeta@preferredNodes)>0){ + sysmeta_table$preferredNodes[i] <- list(sysmeta@preferredNodes) + } + if (length(sysmeta@blockedNodes)>0){ + sysmeta_table$blockedNodes[i] <- list(sysmeta@blockedNodes) + } + sysmeta_table$archived[i] <- sysmeta@archived + sysmeta_table$dateSysMetadataModified[i] <- sysmeta@dateSysMetadataModified + sysmeta_table$originMemberNode[i] <- sysmeta@originMemberNode + sysmeta_table$authoritativeMemberNode[i] <- sysmeta@authoritativeMemberNode + } + + # Merge data frames + final_df <- merge(df_query, sysmeta_table, by = "id") + }) + + + # Case 2 ======================== + # Else: Fields are specified + suppressWarnings(if (list_of_fields != "all") { + # split fields into those in solr query and those in getSystemMetadata + query_fields <- list_of_fields[which(list_of_fields %in% adc_solr)] + gsm_fields <- list_of_fields[which(list_of_fields %in% gsm_fields)] + + # If query fields are specified, run query + if (length(query_fields)>0){ + inside_q <- paste(versions, collapse="\"+OR+\"") + q <- paste0("documents:\"", inside_q, "\"") + fl <- paste(list_of_fields, collapse=", ") + fl_add_id <- paste(c("id", list_of_fields), collapse=", ") + df_query <- dataone::query(node, list(q=q, + fl= fl_add_id, + rows="100"), + as = "data.frame") + } + + # If getSystemMetadata fields are specified, getSystemMetadata + if (length(gsm_fields)>0){ + for (i in 1:n){ + sysmeta<- dataone::getSystemMetadata(node, versions[i]) + sysmeta_table$id[i] <- sysmeta@identifier + sysmeta_table$serialVersion[i] <- sysmeta@serialVersion + sysmeta_table$accessPolicy[i] <- I(list((sysmeta@accessPolicy))) + if (length(sysmeta@preferredNodes)>0){ + sysmeta_table$preferredNodes[i] <- list(sysmeta@preferredNodes) + } + if (length(sysmeta@blockedNodes)>0){ + sysmeta_table$blockedNodes[i] <- list(sysmeta@blockedNodes) + } + sysmeta_table$archived[i] <- sysmeta@archived + sysmeta_table$dateSysMetadataModified[i] <- sysmeta@dateSysMetadataModified + sysmeta_table$originMemberNode[i] <- sysmeta@originMemberNode + sysmeta_table$authoritativeMemberNode[i] <- sysmeta@authoritativeMemberNode + } + } + + + # Make new data frame with only the desired fields + sysmeta_table_2 <- sysmeta_table[c("id", gsm_fields)] + + if (length(query_fields)>0 & length(gsm_fields)>0){ + # Merge data frames + final_df <- merge(df_query, sysmeta_table_2, by = "id") + } + + if (length(query_fields)>0 & length(gsm_fields)==0){ + final_df <- df_query + } + + if (length(query_fields)==0 & length(gsm_fields)>0){ + final_df <- sysmeta_table_2 + } + }) + + return(final_df) +} + +#example +df<- getSystemMetadataAll(mn, "doi:10.18739/A27D2Q670", c("origin", "submitter", "title", 'accessPolicy')) From d8f4e5e880d85c06bd3d88b3e1f64df80af7317f Mon Sep 17 00:00:00 2001 From: Sharis Ochs Date: Mon, 4 Jun 2018 10:46:58 -0700 Subject: [PATCH 4/4] clean up branches --- R/Sharis/getSysmetaAll.R | 157 --------------------------------------- 1 file changed, 157 deletions(-) delete mode 100644 R/Sharis/getSysmetaAll.R diff --git a/R/Sharis/getSysmetaAll.R b/R/Sharis/getSysmetaAll.R deleted file mode 100644 index b81a00f..0000000 --- a/R/Sharis/getSysmetaAll.R +++ /dev/null @@ -1,157 +0,0 @@ -#' Get System Metadata for all versions -#' -#' @description This function combines getSystemMetadata and solr query to produce -#' a data frame with any combination of getSystemMetadata and query fields for all -#' versions of the given PID. If no fields are specified, the resulting data frame -#' will contain all fields possible from getSystemMetadata and solr query. -#' -#' @param node (character) Specify the node where the object should be searched for. -#' @param object_pid (character) PID for the object that you want to return information about. -#' @param list_of_fields (list) List of fields that you want returned in the data frame. -#' "Id" is added automatically. -#' -#' -#' @author Sharis Ochs - -getSystemMetadataAll <- function(node, object_pid, list_of_fields = "all") { - - ## Checks ========================= - # Check that object exist - if (!(arcticdatautils::object_exists(node, object_pid))) { - stop('Object does not exist on specified node ') - } - - # get all solr fields - adc_solr <- httr::GET("https://arcticdata.io/metacat/d1/mn/v2/query/solr") - suppressMessages(suppressWarnings(adc_solr<- adc_solr %>% - stringr::str_extract_all("name>.*<") %>% - unlist() %>% - stringr::str_replace_all("name>|<", ""))) - - # fields in getSystemMetadata but not solr query - gsm_fields <- c("serialVersion", 'accessPolicy', 'preferredNodes', 'blockedNodes', 'archived', 'dateSysMetadataModified', 'originMemberNode', 'authoritativeMemberNode') - - - suppressWarnings(if (list_of_fields != "all"){ - ## Check that all specified fields are valid - for (j in 1:length(list_of_fields)){ - if (!(list_of_fields[j] %in% gsm_fields | list_of_fields[j] %in% adc_solr)){ - stop('Please enter a valid field ') - } - } - }) - - ## Initialize =========================== - # Check how many versions there are - versions<- arcticdatautils::get_all_versions(node, object_pid) - n<- length(versions) - - # Initialize data frame of fields only in getSystemMetadata - sysmeta_table <- data.frame(id = rep("NA", n), - serialVersion = rep("NA", n), - accessPolicy = rep("NA", n), - preferredNodes = rep("NA", n), - blockedNodes = rep("NA", n), - archived = rep("NA", n), - dateSysMetadataModified = rep("NA", n), - originMemberNode = rep("NA", n), - authoritativeMemberNode = rep("NA", n), - stringsAsFactors = F) - - # Case 1 ======================== - # Case: if no fields are specified - suppressWarnings(if (list_of_fields == "all"){ - - # Query all fields - inside_q <- paste(versions, collapse="\"+OR+\"") - q <- paste0("documents:\"", inside_q, "\"") - df_query <- dataone::query(node, list(q=q, - fl= "*", - rows="100"), - as = "data.frame") - - # Then getSystemMetadata - for (i in 1:n){ - sysmeta<- dataone::getSystemMetadata(node, versions[i]) - sysmeta_table$id[i] <- sysmeta@identifier - sysmeta_table$serialVersion[i] <- sysmeta@serialVersion - sysmeta_table$accessPolicy[i] <- I(list((sysmeta@accessPolicy))) - if (length(sysmeta@preferredNodes)>0){ - sysmeta_table$preferredNodes[i] <- list(sysmeta@preferredNodes) - } - if (length(sysmeta@blockedNodes)>0){ - sysmeta_table$blockedNodes[i] <- list(sysmeta@blockedNodes) - } - sysmeta_table$archived[i] <- sysmeta@archived - sysmeta_table$dateSysMetadataModified[i] <- sysmeta@dateSysMetadataModified - sysmeta_table$originMemberNode[i] <- sysmeta@originMemberNode - sysmeta_table$authoritativeMemberNode[i] <- sysmeta@authoritativeMemberNode - } - - # Merge data frames - final_df <- merge(df_query, sysmeta_table, by = "id") - }) - - - # Case 2 ======================== - # Else: Fields are specified - suppressWarnings(if (list_of_fields != "all") { - # split fields into those in solr query and those in getSystemMetadata - query_fields <- list_of_fields[which(list_of_fields %in% adc_solr)] - gsm_fields <- list_of_fields[which(list_of_fields %in% gsm_fields)] - - # If query fields are specified, run query - if (length(query_fields)>0){ - inside_q <- paste(versions, collapse="\"+OR+\"") - q <- paste0("documents:\"", inside_q, "\"") - fl <- paste(list_of_fields, collapse=", ") - fl_add_id <- paste(c("id", list_of_fields), collapse=", ") - df_query <- dataone::query(node, list(q=q, - fl= fl_add_id, - rows="100"), - as = "data.frame") - } - - # If getSystemMetadata fields are specified, getSystemMetadata - if (length(gsm_fields)>0){ - for (i in 1:n){ - sysmeta<- dataone::getSystemMetadata(node, versions[i]) - sysmeta_table$id[i] <- sysmeta@identifier - sysmeta_table$serialVersion[i] <- sysmeta@serialVersion - sysmeta_table$accessPolicy[i] <- I(list((sysmeta@accessPolicy))) - if (length(sysmeta@preferredNodes)>0){ - sysmeta_table$preferredNodes[i] <- list(sysmeta@preferredNodes) - } - if (length(sysmeta@blockedNodes)>0){ - sysmeta_table$blockedNodes[i] <- list(sysmeta@blockedNodes) - } - sysmeta_table$archived[i] <- sysmeta@archived - sysmeta_table$dateSysMetadataModified[i] <- sysmeta@dateSysMetadataModified - sysmeta_table$originMemberNode[i] <- sysmeta@originMemberNode - sysmeta_table$authoritativeMemberNode[i] <- sysmeta@authoritativeMemberNode - } - } - - - # Make new data frame with only the desired fields - sysmeta_table_2 <- sysmeta_table[c("id", gsm_fields)] - - if (length(query_fields)>0 & length(gsm_fields)>0){ - # Merge data frames - final_df <- merge(df_query, sysmeta_table_2, by = "id") - } - - if (length(query_fields)>0 & length(gsm_fields)==0){ - final_df <- df_query - } - - if (length(query_fields)==0 & length(gsm_fields)>0){ - final_df <- sysmeta_table_2 - } - }) - - return(final_df) -} - -#example -df<- getSystemMetadataAll(mn, "doi:10.18739/A27D2Q670", c("origin", "submitter", "title", 'accessPolicy'))