diff --git a/noteCovariateExtraction/R/KOR_ENG_Extraction.R b/noteCovariateExtraction/R/KOR_ENG_Extraction.R deleted file mode 100644 index 4b3c181..0000000 --- a/noteCovariateExtraction/R/KOR_ENG_Extraction.R +++ /dev/null @@ -1,27 +0,0 @@ -#' Custom createCoveriate Settings -#' -#' This function is Custom createCoveriate Settings. -#' @param rawcovariate_id -#' @keywordsa createCovariateSetting -#' @export -#' @examples -#' KOR_ENG_Extraction() -KOR_ENG_Extraction <- function(rawcovariate_id){ - - #Divide English and Hangul - eng_word <- gsub('[^a-zA-Z]','',rawcovariate_id) - eng_word[length(eng_word)+1] <- c("") - only_eng <- eng_word[-which(eng_word == "")] - only_eng <- unique(only_eng) - - han_eng <- setdiff(rawcovariate_id,only_eng) - - kor_word <- gsub('[^가-힣]','',han_eng) - kor_word[length(kor_word)+1] <- c("") - only_kor <- kor_word[-which(kor_word == "")] - only_kor <- unique(only_kor) - - word_list <- list('ENG' = only_eng, 'KOR' = only_kor) - - return(word_list) -} diff --git a/noteCovariateExtraction/R/createTopicFromNoteSettings.R b/noteCovariateExtraction/R/createTopicFromNoteSettings.R index ab2b2ec..a6fda19 100644 --- a/noteCovariateExtraction/R/createTopicFromNoteSettings.R +++ b/noteCovariateExtraction/R/createTopicFromNoteSettings.R @@ -10,6 +10,7 @@ createTopicFromNoteSettings <- function(useTopicFromNote = TRUE, noteConceptId = noteConceptId, useDictionary=TRUE, selectDictionary = c('KOR'), + implementLanguage = c('KOR','ENG'), useTextToVec = FALSE, useTopicModeling=FALSE, numberOfTopics=10L, @@ -18,10 +19,29 @@ createTopicFromNoteSettings <- function(useTopicFromNote = TRUE, useAutoencoder=FALSE, LatentDimensionForAutoEncoder = 100L, sampleSize=-1){ + if(sum(useDictionary) == 0){ stop('Not implemented.') } else{ + + if(sum(length(selectDictionary)) <= 2){ + + if(length(unique(selectDictionary %in% implementLanguage)) == 1){ + if(unique(selectDictionary %in% implementLanguage)){ + + } + else{ + stop(paste('Only choose implement Language :', paste(implementLanguage,collapse=" "))) + } + } + else{ + stop(paste('Only choose implement Language :', paste(implementLanguage,collapse=" "))) + } + } + else{ + stop('Please choose up to two.') + } if (sum (useTextToVec,useTopicModeling,useGloVe,useAutoencoder) != 1 ) { stop("Choose only one among useTextToVec,useTopicModeling,useGloVe,useAutoencoder") } @@ -30,6 +50,7 @@ createTopicFromNoteSettings <- function(useTopicFromNote = TRUE, noteConceptId = noteConceptId, useDictionary=useDictionary, selectDictionary=selectDictionary, + implementLanguage = implementLanguage, useTextToVec=useTextToVec, useTopicModeling=useTopicModeling, numberOfTopics = numberOfTopics, diff --git a/noteCovariateExtraction/R/getTopicFromNoteSettings.R b/noteCovariateExtraction/R/getTopicFromNoteSettings.R index cdafb98..781667f 100644 --- a/noteCovariateExtraction/R/getTopicFromNoteSettings.R +++ b/noteCovariateExtraction/R/getTopicFromNoteSettings.R @@ -58,22 +58,57 @@ getTopicFromNoteSettings <- function(connection, colnames(rawCovariates)<-tolower(colnames(rawCovariates)) ######################## - if(covariateSettings$selectDictionary == c('KOR')){ - #ff in list #because Characters can not be inserted into the ff package. - rawcovariate_id <- ff::ffapply(x[i1:i2],X= rawCovariates$covariate_id, RETURN=TRUE, CFUN="list", AFUN = Preprocessing_KOR) - - #Create a new dictionary by finding the intersection of all words and dictionaries, Limit: Not recognized if only compound words are found - # The kor_dictionary_db is built-in. - dictionary <- intersect(as.vector(kor_dictionary_db[,1]),unique(unlist(rawcovariate_id))) - - names(rawcovariate_id) <- 'word' - #In the case of Hangul - rawcovariate_id <- lapply(rawcovariate_id$'word', KOR_ENG_Extraction) - - covariate_id <- list() - #Compare dictionary with only Hangul - for(i in 1:length(rawcovariate_id)){ - covariate_id[[i]] <- c(intersect(rawcovariate_id[[i]]$KOR,dictionary),rawcovariate_id[[i]]$ENG) + + #ff in list #because Characters can not be inserted into the ff package. + rawcovariate_id <- ff::ffapply(x[i1:i2],X= rawCovariates$covariate_id, RETURN=TRUE, CFUN="list", AFUN = notePreprocessing) + + #Create a new dictionary by finding the intersection of all words and dictionaries, Limit: Not recognized if only compound words are found + # The kor_dictionary_db is built-in. + if('KOR' %in% covariateSettings$selectDictionary){ + dictionary_kor <- intersect(as.vector(kor_dictionary_db[,1]),unique(unlist(rawcovariate_id))) + } + if('ENG' %in% covariateSettings$selectDictionary){ + stop('ENG is not implement') + #dictionary_eng <- intersect(as.vector(eng_dictionary_db[,1]),unique(unlist(rawcovariate_id))) + } + # if('Other' %in% covariateSettings$selectDictionary){ + # dictionary_Other <- intersect(as.vector(Other_dictionary_db[,1]),unique(unlist(rawcovariate_id))) + # } + + names(rawcovariate_id) <- 'word' + #In the case of Hangul + rawcovariate_id <- lapply(rawcovariate_id$'word', medicalTermExtraction) + + covariate_id <- list() + #Compare dictionary with two language + if(sum(covariateSettings$selectDictionary %in% covariateSettings$implementLanguage) == 2){ + + ##Compared with Other and English dictionary + # for(i in 1:length(rawcovariate_id)){ + # covariate_id[[i]] <- c(intersect(rawcovariate_id[[i]]$Other,dictionary_Other),intersect(rawcovariate_id[[i]]$ENG,dictionary_eng)) + # } + } + + #Compare dictionary with one language + else{ + ##Compared with Other language EX)KOR + #EX) KOR + if(covariateSettings$selectDictionary == 'KOR'){ + for(i in 1:length(rawcovariate_id)){ + covariate_id[[i]] <- c(intersect(rawcovariate_id[[i]]$KOR,dictionary_kor),rawcovariate_id[[i]]$ENG) + } + } + else if (covariateSettings$selectDictionary == 'Other'){ + # for(i in 1:length(rawcovariate_id)){ + # covariate_id[[i]] <- c(intersect(rawcovariate_id[[i]]$Other,dictionary_kor),rawcovariate_id[[i]]$ENG) + # } + } + + ##Compared with Only English + else if(covariateSettings$selectDictionary == 'ENG'){ + # for(i in 1:length(rawcovariate_id)){ + # covariate_id[[i]] <- c(intersect(rawcovariate_id[[i]]$ENG,dictionary_eng)) + # } } } diff --git a/noteCovariateExtraction/R/medicalTermExtraction.R b/noteCovariateExtraction/R/medicalTermExtraction.R new file mode 100644 index 0000000..184be5a --- /dev/null +++ b/noteCovariateExtraction/R/medicalTermExtraction.R @@ -0,0 +1,42 @@ +#' Custom createCoveriate Settings +#' +#' This function is Custom createCoveriate Settings. +#' @param rawcovariate_id +#' @keywordsa createCovariateSetting +#' @export +#' @examples +#' medicalTermExtraction() +medicalTermExtraction <- function(rawcovariate_id){ + + #Divide English and Other languages + eng_word <- gsub('[^a-zA-Z]','',rawcovariate_id) + eng_word[length(eng_word)+1] <- c("") + only_eng <- eng_word[-which(eng_word == "")] + only_eng <- unique(only_eng) + + #Only one Languages + if(length(covariateSettings$selectDictionary) == 1){ + #EX) ENG + if('ENG' %in% covariateSettings$selectDictionary){ + word_list <- list('ENG' = only_eng) + } + } + + #Other, ENG devise => EX)KOR + if('KOR' %in% covariateSettings$selectDictionary){ + + han_eng <- setdiff(rawcovariate_id,only_eng) + + kor_word <- gsub('[^가-힣]','',han_eng) + kor_word[length(kor_word)+1] <- c("") + only_kor <- kor_word[-which(kor_word == "")] + only_kor <- unique(only_kor) + + word_list <- list('ENG' = only_eng, 'KOR' = only_kor) + } + #Other Language, ENG devise + #if(){} + + + return(word_list) +} diff --git a/noteCovariateExtraction/R/Preprocessing_KOR.R b/noteCovariateExtraction/R/notePreprocessing.R similarity index 54% rename from noteCovariateExtraction/R/Preprocessing_KOR.R rename to noteCovariateExtraction/R/notePreprocessing.R index 370d2b0..7cae30f 100644 --- a/noteCovariateExtraction/R/Preprocessing_KOR.R +++ b/noteCovariateExtraction/R/notePreprocessing.R @@ -5,8 +5,8 @@ #' @keywordsa createCovariateSetting #' @export #' @examples -#' Preprocessing_KOR() -Preprocessing_KOR <- function(covariate_id){ +#' notePreprocessing() +notePreprocessing <- function(covariate_id){ covariate_id <- gsub('<[^<>]*>',' ',covariate_id) #Remove Tag #Remove html special characters @@ -16,12 +16,25 @@ Preprocessing_KOR <- function(covariate_id){ covariate_id <- gsub('&', " ", covariate_id) covariate_id <- gsub('"', " ", covariate_id) - #remove hangle typo - covariate_id <- gsub('[ㅏ-ㅣ]*','',covariate_id) - covariate_id <- gsub('[ㄱ-ㅎ]*','',covariate_id) + #####At least one should be included. + #KOR PreProcessing + if('KOR' %in% covariateSettings$selectDictionary){ + #remove hangle typo + covariate_id <- gsub('[ㅏ-ㅣ]*','',covariate_id) + covariate_id <- gsub('[ㄱ-ㅎ]*','',covariate_id) + + #Only Korean and English are left. (remove special characters) + covariate_id <- gsub('[^가-힣a-zA-Z]',' ',covariate_id) + } + #Other Language PreProcessing + #if('Other' %in% covariateSettings$selectDictionary){} + ################################### + + #Only ENG PreProcessing + else{ + covariate_id <- gsub('[^a-zA-Z]',' ',covariate_id) + } - #Only Korean and English are left. (remove special characters) - covariate_id <- gsub('[^가-힣a-zA-Z]',' ',covariate_id) #The spacing is only once covariate_id <- stringr::str_replace_all(covariate_id,"[[:space:]]{1,}"," ") @@ -34,3 +47,6 @@ Preprocessing_KOR <- function(covariate_id){ return(covariate_id) } + + +