OHDSI · parkdongsu · Oct 19, 2018
diff --git a/noteCovariateExtraction/R/KOR_ENG_Extraction.R b/noteCovariateExtraction/R/KOR_ENG_Extraction.R
diff --git a/noteCovariateExtraction/R/createTopicFromNoteSettings.R b/noteCovariateExtraction/R/createTopicFromNoteSettings.R
@@ -10,6 +10,7 @@ createTopicFromNoteSettings <- function(useTopicFromNote = TRUE,
                                         noteConceptId = noteConceptId,
                                         useDictionary=TRUE,
                                         selectDictionary = c('KOR'),
+                                        implementLanguage = c('KOR','ENG'),
                                         useTextToVec = FALSE,
                                         useTopicModeling=FALSE,
                                         numberOfTopics=10L,
@@ -18,10 +19,29 @@ createTopicFromNoteSettings <- function(useTopicFromNote = TRUE,
                                         useAutoencoder=FALSE,
                                         LatentDimensionForAutoEncoder = 100L,
                                         sampleSize=-1){
+
     if(sum(useDictionary) == 0){
         stop('Not implemented.')
     }
     else{
+
+        if(sum(length(selectDictionary)) <= 2){
+
+            if(length(unique(selectDictionary %in% implementLanguage)) == 1){
+                if(unique(selectDictionary %in% implementLanguage)){
+
+                }
+                else{
+                    stop(paste('Only choose implement Language :', paste(implementLanguage,collapse=" ")))
+                }
+            }
+            else{
+                stop(paste('Only choose implement Language :', paste(implementLanguage,collapse=" ")))
+            }
+        }
+        else{
+            stop('Please choose up to two.')
+        }
         if (sum (useTextToVec,useTopicModeling,useGloVe,useAutoencoder) != 1 ) {
             stop("Choose only one among useTextToVec,useTopicModeling,useGloVe,useAutoencoder")
         }
@@ -30,6 +50,7 @@ createTopicFromNoteSettings <- function(useTopicFromNote = TRUE,
                                       noteConceptId = noteConceptId,
                                       useDictionary=useDictionary,
                                       selectDictionary=selectDictionary,
+                                      implementLanguage = implementLanguage,
                                       useTextToVec=useTextToVec,
                                       useTopicModeling=useTopicModeling,
                                       numberOfTopics = numberOfTopics,

diff --git a/noteCovariateExtraction/R/getTopicFromNoteSettings.R b/noteCovariateExtraction/R/getTopicFromNoteSettings.R
@@ -58,22 +58,57 @@ getTopicFromNoteSettings <- function(connection,
         colnames(rawCovariates)<-tolower(colnames(rawCovariates))
 
         ########################
-        if(covariateSettings$selectDictionary == c('KOR')){
-            #ff in list #because Characters can not be inserted into the ff package.
-            rawcovariate_id <- ff::ffapply(x[i1:i2],X= rawCovariates$covariate_id, RETURN=TRUE, CFUN="list", AFUN = Preprocessing_KOR)
-
-            #Create a new dictionary by finding the intersection of all words and dictionaries, Limit: Not recognized if only compound words are found
-            # The kor_dictionary_db is built-in.
-            dictionary <- intersect(as.vector(kor_dictionary_db[,1]),unique(unlist(rawcovariate_id)))
-
-            names(rawcovariate_id) <- 'word'
-            #In the case of Hangul
-            rawcovariate_id <- lapply(rawcovariate_id$'word', KOR_ENG_Extraction)
-
-            covariate_id <- list()
-            #Compare dictionary with only Hangul
-            for(i in 1:length(rawcovariate_id)){
-                covariate_id[[i]] <- c(intersect(rawcovariate_id[[i]]$KOR,dictionary),rawcovariate_id[[i]]$ENG)
+
+        #ff in list #because Characters can not be inserted into the ff package.
+        rawcovariate_id <- ff::ffapply(x[i1:i2],X= rawCovariates$covariate_id, RETURN=TRUE, CFUN="list", AFUN = notePreprocessing)
+
+        #Create a new dictionary by finding the intersection of all words and dictionaries, Limit: Not recognized if only compound words are found
+        # The kor_dictionary_db is built-in.
+        if('KOR' %in% covariateSettings$selectDictionary){
+            dictionary_kor <- intersect(as.vector(kor_dictionary_db[,1]),unique(unlist(rawcovariate_id)))
+        }
+        if('ENG' %in% covariateSettings$selectDictionary){
+            stop('ENG is not implement')
+            #dictionary_eng <- intersect(as.vector(eng_dictionary_db[,1]),unique(unlist(rawcovariate_id)))
+        }
+        # if('Other' %in% covariateSettings$selectDictionary){
+        #     dictionary_Other <- intersect(as.vector(Other_dictionary_db[,1]),unique(unlist(rawcovariate_id)))
+        # }
+
+        names(rawcovariate_id) <- 'word'
+        #In the case of Hangul
+        rawcovariate_id <- lapply(rawcovariate_id$'word', medicalTermExtraction)
+
+        covariate_id <- list()
+        #Compare dictionary with two languages
+        if(sum(covariateSettings$selectDictionary %in% covariateSettings$implementLanguage) == 2){
+
+            ##Compared with Other and English dictionary
+            # for(i in 1:length(rawcovariate_id)){
+            #     covariate_id[[i]] <- c(intersect(rawcovariate_id[[i]]$Other,dictionary_Other),intersect(rawcovariate_id[[i]]$ENG,dictionary_eng))
+            # }
+        }
+
+        #Compare dictionary with one language
+        else{
+            ##Compared with Other language EX)KOR
+            #EX) KOR
+            if(covariateSettings$selectDictionary == 'KOR'){
+                for(i in 1:length(rawcovariate_id)){
+                    covariate_id[[i]] <- c(intersect(rawcovariate_id[[i]]$KOR,dictionary_kor),rawcovariate_id[[i]]$ENG)
+                }
+            }
+            else if (covariateSettings$selectDictionary == 'Other'){
+                # for(i in 1:length(rawcovariate_id)){
+                #     covariate_id[[i]] <- c(intersect(rawcovariate_id[[i]]$Other,dictionary_kor),rawcovariate_id[[i]]$ENG)
+                # }
+            }
+
+            ##Compared with Only English
+            else if(covariateSettings$selectDictionary == 'ENG'){
+                # for(i in 1:length(rawcovariate_id)){
+                #     covariate_id[[i]] <- c(intersect(rawcovariate_id[[i]]$ENG,dictionary_eng))
+                # }
             }
         }
 

diff --git a/noteCovariateExtraction/R/medicalTermExtraction.R b/noteCovariateExtraction/R/medicalTermExtraction.R
@@ -0,0 +1,42 @@
+#' Split note tokens into English and Korean word lists
+#'
+#' Strips non-letters from each token to get the English words and, when the
+#' Korean dictionary is selected, strips non-Hangul from the remaining tokens.
+#' @param rawcovariate_id Character vector of tokens (presumably one
+#'   preprocessed note; TODO confirm against the caller).
+#' @param selectDictionary Selected dictionary codes ('KOR', 'ENG'). Defaults
+#'   to \code{covariateSettings$selectDictionary}, which the original body
+#'   read as a free variable, so one-argument calls behave as before.
+#' @return A list with element \code{ENG} and, when 'KOR' is selected,
+#'   \code{KOR}; each is a vector of unique, non-empty words.
+#' @keywords createCovariateSetting
+#' @export
+#' @examples
+#' medicalTermExtraction(c('fever', '발열'), selectDictionary = 'KOR')
+medicalTermExtraction <- function(rawcovariate_id,
+                                  selectDictionary = covariateSettings$selectDictionary){
+
+    # Remove every character matched by `pattern`, then discard the tokens
+    # that became empty and de-duplicate what is left.
+    keepWords <- function(words, pattern){
+        words <- gsub(pattern, '', words)
+        unique(words[nzchar(words)])
+    }
+
+    # English words are extracted regardless of the selected dictionary
+    only_eng <- keepWords(rawcovariate_id, '[^a-zA-Z]')
+
+    # KOR selected: tokens that are not purely English are reduced to Hangul
+    if('KOR' %in% selectDictionary){
+        han_eng <- setdiff(rawcovariate_id, only_eng)
+        only_kor <- keepWords(han_eng, '[^가-힣]')
+        return(list('ENG' = only_eng, 'KOR' = only_kor))
+    }
+
+    # Only ENG selected
+    if('ENG' %in% selectDictionary){
+        return(list('ENG' = only_eng))
+    }
+    # Previously this case failed with "object 'word_list' not found"
+    stop(paste('Unsupported selectDictionary :', paste(selectDictionary, collapse = " ")))
+}
diff --git a/...CovariateExtraction/R/Preprocessing_KOR.R → ...CovariateExtraction/R/notePreprocessing.R b/...CovariateExtraction/R/Preprocessing_KOR.R → ...CovariateExtraction/R/notePreprocessing.R
@@ -5,8 +5,8 @@
 #' @keywordsa createCovariateSetting
 #' @export
 #' @examples
-#' Preprocessing_KOR()
-Preprocessing_KOR <- function(covariate_id){
+#' notePreprocessing()
+notePreprocessing <- function(covariate_id){
 
     covariate_id <- gsub('<[^<>]*>',' ',covariate_id) #Remove Tag
     #Remove html special characters
@@ -16,12 +16,25 @@ Preprocessing_KOR <- function(covariate_id){
     covariate_id <- gsub('&amp;', " ", covariate_id)
     covariate_id <- gsub('&quot;', " ", covariate_id)
 
-    #remove hangle typo
-    covariate_id <- gsub('[ㅏ-ㅣ]*','',covariate_id)
-    covariate_id <- gsub('[ㄱ-ㅎ]*','',covariate_id)
+    #####At least one should be included.
+    #KOR PreProcessing
+    if('KOR' %in% covariateSettings$selectDictionary){
+        #Remove stray Hangul jamo (isolated vowels and consonants, usually typos)
+        covariate_id <- gsub('[ㅏ-ㅣ]*','',covariate_id)
+        covariate_id <- gsub('[ㄱ-ㅎ]*','',covariate_id)
+
+        #Only Korean and English are left. (remove special characters)
+        covariate_id <- gsub('[^가-힣a-zA-Z]',' ',covariate_id)
+    }
+    #Other Language PreProcessing
+    #if('Other' %in% covariateSettings$selectDictionary){}
+    ###################################
+
+    #Only ENG PreProcessing
+    else{
+        covariate_id <- gsub('[^a-zA-Z]',' ',covariate_id)
+    }
 
-    #Only Korean and English are left. (remove special characters)
-    covariate_id <- gsub('[^가-힣a-zA-Z]',' ',covariate_id)
 
     #The spacing is only once
     covariate_id <- stringr::str_replace_all(covariate_id,"[[:space:]]{1,}"," ")
@@ -34,3 +47,6 @@ Preprocessing_KOR <- function(covariate_id){
 
     return(covariate_id)
 }
+
+
+