Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 0 additions & 27 deletions noteCovariateExtraction/R/KOR_ENG_Extraction.R

This file was deleted.

21 changes: 21 additions & 0 deletions noteCovariateExtraction/R/createTopicFromNoteSettings.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ createTopicFromNoteSettings <- function(useTopicFromNote = TRUE,
noteConceptId = noteConceptId,
useDictionary=TRUE,
selectDictionary = c('KOR'),
implementLanguage = c('KOR','ENG'),
useTextToVec = FALSE,
useTopicModeling=FALSE,
numberOfTopics=10L,
Expand All @@ -18,10 +19,29 @@ createTopicFromNoteSettings <- function(useTopicFromNote = TRUE,
useAutoencoder=FALSE,
LatentDimensionForAutoEncoder = 100L,
sampleSize=-1){

if(sum(useDictionary) == 0){
stop('Not implemented.')
}
else{

if(sum(length(selectDictionary)) <= 2){

if(length(unique(selectDictionary %in% implementLanguage)) == 1){
if(unique(selectDictionary %in% implementLanguage)){

}
else{
stop(paste('Only choose implement Language :', paste(implementLanguage,collapse=" ")))
}
}
else{
stop(paste('Only choose implement Language :', paste(implementLanguage,collapse=" ")))
}
}
else{
stop('Please choose up to two.')
}
if (sum (useTextToVec,useTopicModeling,useGloVe,useAutoencoder) != 1 ) {
stop("Choose only one among useTextToVec,useTopicModeling,useGloVe,useAutoencoder")
}
Expand All @@ -30,6 +50,7 @@ createTopicFromNoteSettings <- function(useTopicFromNote = TRUE,
noteConceptId = noteConceptId,
useDictionary=useDictionary,
selectDictionary=selectDictionary,
implementLanguage = implementLanguage,
useTextToVec=useTextToVec,
useTopicModeling=useTopicModeling,
numberOfTopics = numberOfTopics,
Expand Down
67 changes: 51 additions & 16 deletions noteCovariateExtraction/R/getTopicFromNoteSettings.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,22 +58,57 @@ getTopicFromNoteSettings <- function(connection,
colnames(rawCovariates)<-tolower(colnames(rawCovariates))

########################
if(covariateSettings$selectDictionary == c('KOR')){
#ff in list #because Characters can not be inserted into the ff package.
rawcovariate_id <- ff::ffapply(x[i1:i2],X= rawCovariates$covariate_id, RETURN=TRUE, CFUN="list", AFUN = Preprocessing_KOR)

#Create a new dictionary by finding the intersection of all words and dictionaries, Limit: Not recognized if only compound words are found
# The kor_dictionary_db is built-in.
dictionary <- intersect(as.vector(kor_dictionary_db[,1]),unique(unlist(rawcovariate_id)))

names(rawcovariate_id) <- 'word'
#In the case of Hangul
rawcovariate_id <- lapply(rawcovariate_id$'word', KOR_ENG_Extraction)

covariate_id <- list()
#Compare dictionary with only Hangul
for(i in 1:length(rawcovariate_id)){
covariate_id[[i]] <- c(intersect(rawcovariate_id[[i]]$KOR,dictionary),rawcovariate_id[[i]]$ENG)

#ff in list #because Characters can not be inserted into the ff package.
rawcovariate_id <- ff::ffapply(x[i1:i2],X= rawCovariates$covariate_id, RETURN=TRUE, CFUN="list", AFUN = notePreprocessing)

#Create a new dictionary by finding the intersection of all words and dictionaries, Limit: Not recognized if only compound words are found
# The kor_dictionary_db is built-in.
if('KOR' %in% covariateSettings$selectDictionary){
dictionary_kor <- intersect(as.vector(kor_dictionary_db[,1]),unique(unlist(rawcovariate_id)))
}
if('ENG' %in% covariateSettings$selectDictionary){
stop('ENG is not implement')
#dictionary_eng <- intersect(as.vector(eng_dictionary_db[,1]),unique(unlist(rawcovariate_id)))
}
# if('Other' %in% covariateSettings$selectDictionary){
# dictionary_Other <- intersect(as.vector(Other_dictionary_db[,1]),unique(unlist(rawcovariate_id)))
# }

names(rawcovariate_id) <- 'word'
#In the case of Hangul
rawcovariate_id <- lapply(rawcovariate_id$'word', medicalTermExtraction)

covariate_id <- list()
#Compare dictionary with two language
if(sum(covariateSettings$selectDictionary %in% covariateSettings$implementLanguage) == 2){

##Compared with Other and English dictionary
# for(i in 1:length(rawcovariate_id)){
# covariate_id[[i]] <- c(intersect(rawcovariate_id[[i]]$Other,dictionary_Other),intersect(rawcovariate_id[[i]]$ENG,dictionary_eng))
# }
}

#Compare dictionary with one language
else{
##Compared with Other language EX)KOR
#EX) KOR
if(covariateSettings$selectDictionary == 'KOR'){
for(i in 1:length(rawcovariate_id)){
covariate_id[[i]] <- c(intersect(rawcovariate_id[[i]]$KOR,dictionary_kor),rawcovariate_id[[i]]$ENG)
}
}
else if (covariateSettings$selectDictionary == 'Other'){
# for(i in 1:length(rawcovariate_id)){
# covariate_id[[i]] <- c(intersect(rawcovariate_id[[i]]$Other,dictionary_kor),rawcovariate_id[[i]]$ENG)
# }
}

##Compared with Only English
else if(covariateSettings$selectDictionary == 'ENG'){
# for(i in 1:length(rawcovariate_id)){
# covariate_id[[i]] <- c(intersect(rawcovariate_id[[i]]$ENG,dictionary_eng))
# }
}
}

Expand Down
42 changes: 42 additions & 0 deletions noteCovariateExtraction/R/medicalTermExtraction.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#' Split note tokens into English and Korean term lists
#'
#' Takes the tokens of one note and separates them into the unique English
#' terms (letters a-z / A-Z only) and, when the Korean dictionary is selected,
#' the unique Hangul-syllable terms.
#'
#' @param rawcovariate_id Character vector of tokens from a single note
#'   (presumably already cleaned by the note preprocessing step; this function
#'   does not verify that).
#' @param selectDictionary Character vector naming the selected dictionary
#'   languages. Supported selections are any selection containing `"KOR"`, or
#'   exactly `"ENG"`. When omitted, it falls back to
#'   `covariateSettings$selectDictionary`, resolved lazily from this function's
#'   enclosing environment exactly as earlier versions did; passing it
#'   explicitly is preferred because it removes the dependency on global state.
#' @return A named list. It always contains `ENG` (unique non-empty English
#'   terms); when `"KOR"` is selected it also contains `KOR` (unique non-empty
#'   Hangul terms taken from the tokens that are not purely English).
#' @keywords createCovariateSetting
#' @export
#' @examples
#' medicalTermExtraction(c("fever", "mg2", "fever"), selectDictionary = "ENG")
medicalTermExtraction <- function(rawcovariate_id,
                                  selectDictionary = covariateSettings$selectDictionary) {

  # Strip everything except ASCII letters, then keep the distinct non-empty
  # results. nzchar() replaces the former "append '' then drop via -which()"
  # workaround and is safe when nothing is empty.
  eng_word <- gsub("[^a-zA-Z]", "", rawcovariate_id)
  only_eng <- unique(eng_word[nzchar(eng_word)])

  if ("KOR" %in% selectDictionary) {
    # Tokens that are not purely English may still carry Hangul syllables.
    han_eng <- setdiff(rawcovariate_id, only_eng)

    # \uac00-\ud7a3 is the Hangul syllable range (the same range the literal
    # characters expressed before); escapes keep the source file ASCII-only so
    # the pattern does not depend on the file's encoding.
    kor_word <- gsub("[^\uac00-\ud7a3]", "", han_eng)
    only_kor <- unique(kor_word[nzchar(kor_word)])

    return(list("ENG" = only_eng, "KOR" = only_kor))
  }

  if (length(selectDictionary) == 1 && "ENG" %in% selectDictionary) {
    return(list("ENG" = only_eng))
  }

  # Previously this path failed with "object 'word_list' not found"; fail with
  # an explicit message instead.
  stop(
    "Unsupported selectDictionary: ",
    paste(selectDictionary, collapse = " "),
    ". Supported selections: 'KOR' (optionally with 'ENG') or 'ENG' alone.",
    call. = FALSE
  )
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
#' @keywordsa createCovariateSetting
#' @export
#' @examples
#' Preprocessing_KOR()
Preprocessing_KOR <- function(covariate_id){
#' notePreprocessing()
notePreprocessing <- function(covariate_id){

covariate_id <- gsub('<[^<>]*>',' ',covariate_id) #Remove Tag
#Remove html special characters
Expand All @@ -16,12 +16,25 @@ Preprocessing_KOR <- function(covariate_id){
covariate_id <- gsub('&amp;', " ", covariate_id)
covariate_id <- gsub('&quot;', " ", covariate_id)

#remove hangle typo
covariate_id <- gsub('[ㅏ-ㅣ]*','',covariate_id)
covariate_id <- gsub('[ㄱ-ㅎ]*','',covariate_id)
#####At least one should be included.
#KOR PreProcessing
if('KOR' %in% covariateSettings$selectDictionary){
#remove hangle typo
covariate_id <- gsub('[ㅏ-ㅣ]*','',covariate_id)
covariate_id <- gsub('[ㄱ-ㅎ]*','',covariate_id)

#Only Korean and English are left. (remove special characters)
covariate_id <- gsub('[^가-힣a-zA-Z]',' ',covariate_id)
}
#Other Language PreProcessing
#if('Other' %in% covariateSettings$selectDictionary){}
###################################

#Only ENG PreProcessing
else{
covariate_id <- gsub('[^a-zA-Z]',' ',covariate_id)
}

#Only Korean and English are left. (remove special characters)
covariate_id <- gsub('[^가-힣a-zA-Z]',' ',covariate_id)

#The spacing is only once
covariate_id <- stringr::str_replace_all(covariate_id,"[[:space:]]{1,}"," ")
Expand All @@ -34,3 +47,6 @@ Preprocessing_KOR <- function(covariate_id){

return(covariate_id)
}