diff --git a/NEWS.md b/NEWS.md index 2de5f29..2cb5980 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# ROMOPMappingTools 2.0.5 + +- Fixed a bug in Usagi to STCM table conversion with respect to source parents + - STCM to CDM table conversion through SQL has flipped `Subsumes` and `Is a` relationships + - Added the self-reference to the concepts in SQL that converts `concept_relationship` to `concept_ancestor` +- Updated test-databasesFromAndToCSV.R to ignore warnings in the DQD validation + # ROMOPMappingTools 2.0.4 - Added missing domainId combinations to the usagi file validation diff --git a/R/appendUsagiFileToSTCMtable.R b/R/appendUsagiFileToSTCMtable.R index 63b4805..2318b1b 100644 --- a/R/appendUsagiFileToSTCMtable.R +++ b/R/appendUsagiFileToSTCMtable.R @@ -155,7 +155,7 @@ appendUsagiFileToSTCMtable <- function( if (all(c("ADD_INFO:sourceParents", "ADD_INFO:sourceParentVocabulary") %in% usagiTibbleColumns) && any(!is.na(dplyr::pull(usagiTibble, `ADD_INFO:sourceParents`)))) { validVocabularyConceptCodes <- usagiTibble |> - dplyr::transmute(vocabulary_id = "", concept_code = sourceCode, concept_id = `ADD_INFO:sourceConceptId`) |> + dplyr::transmute(vocabulary_id = NA_character_, concept_code = sourceCode, concept_id = `ADD_INFO:sourceConceptId`) |> dplyr::distinct() usedParentVocabularies <- usagiTibble |> @@ -165,7 +165,7 @@ appendUsagiFileToSTCMtable <- function( dplyr::pull(`ADD_INFO:sourceParentVocabulary`) |> stringr::str_split("\\|") |> purrr::flatten_chr() |> - unique() + unique() if (length(usedParentVocabularies) > 0) { parentVocabularyConceptCodes <- dplyr::tbl(connection, "CONCEPT") |> dplyr::filter(vocabulary_id %in% usedParentVocabularies) |> diff --git a/inst/sql/sql_server/CONCEPT_RELATIONSHIPToANCESTOR.sql b/inst/sql/sql_server/CONCEPT_RELATIONSHIPToANCESTOR.sql index 739e6af..2813717 100644 --- a/inst/sql/sql_server/CONCEPT_RELATIONSHIPToANCESTOR.sql +++ b/inst/sql/sql_server/CONCEPT_RELATIONSHIPToANCESTOR.sql @@ -16,7 +16,7 @@ SELECT 
cr.concept_id_1, cr.concept_id_2 INTO #relationships FROM @vocabularyDatabaseSchema.concept c INNER JOIN @vocabularyDatabaseSchema.concept_relationship cr - ON cr.concept_id_1 = c.concept_id + ON cr.concept_id_1 = c.concept_id WHERE c.vocabulary_id IN (@vocabularyList) AND cr.relationship_id = 'Subsumes' ORDER BY cr.concept_id_1, cr.concept_id_2; @@ -38,9 +38,9 @@ WITH RECURSIVE ancestor_cte AS ( 1 AS min_levels_of_separation, 1 AS max_levels_of_separation FROM #relationships - + UNION ALL - + -- Recursive case: find descendant relationships SELECT r.concept_id_1 AS ancestor_concept_id, c.descendant_concept_id AS descendant_concept_id, @@ -49,9 +49,36 @@ WITH RECURSIVE ancestor_cte AS ( FROM #relationships r JOIN ancestor_cte c ON r.concept_id_2 = c.ancestor_concept_id +), ancestor_cte_self_reference AS ( + SELECT ancestor_concept_id, + descendant_concept_id, + min_levels_of_separation, + max_levels_of_separation + FROM ancestor_cte + UNION ALL +-- Add self reference for each ancestor_concept_id + SELECT r.concept_id_1 AS ancestor_concept_id, + r.concept_id_1 AS descendant_concept_id, + 0 AS min_levels_of_separation, + 0 AS max_levels_of_separation + FROM ( + SELECT DISTINCT concept_id_1 + FROM #relationships + ) AS r + UNION ALL +-- Add self reference for each descendant_concept_id + SELECT r.concept_id_2 AS ancestor_concept_id, + r.concept_id_2 AS descendant_concept_id, + 0 AS min_levels_of_separation, + 0 AS max_levels_of_separation + FROM ( + SELECT DISTINCT concept_id_2 + FROM #relationships + ) AS r ) -SELECT * -FROM ancestor_cte; +SELECT DISTINCT * +FROM ancestor_cte_self_reference; + -- 4- Remove the temporary table DROP TABLE #relationships; diff --git a/inst/sql/sql_server/STCMExtendedToCDM.sql b/inst/sql/sql_server/STCMExtendedToCDM.sql index 83e4cef..c2328bf 100644 --- a/inst/sql/sql_server/STCMExtendedToCDM.sql +++ b/inst/sql/sql_server/STCMExtendedToCDM.sql @@ -116,7 +116,7 @@ ORDER BY -- 4. 
CONCEPT_RELATIONSHIP table --- Delete previous rows with the same concept_id_1 or concept_id_2 as in the source_to_concept_map.source_concept_id +-- Delete previous rows with the same concept_id_1 or concept_id_2 as in the source_to_concept_map.source_concept_id -- Insert one row for each relationship of type 'Maps to', 'Maps from', 'Subsumes', 'Is a' -- 'Maps to' as follows: -- - concept_id_1 = source_to_concept_map.source_concept_id @@ -124,28 +124,28 @@ ORDER BY -- - relationship_id = 'Maps to' -- - valid_start_date = if source_to_concept_map.valid_start_date is not NULL, use it, otherwise use '1970-01-01' -- - valid_end_date = if source_to_concept_map.valid_end_date is not NULL, use it, otherwise use '2099-12-31' --- - invalid_reason = NULL +-- - invalid_reason = NULL -- 'Maps from' as follows: -- - concept_id_1 = source_to_concept_map.target_concept_id -- - concept_id_2 = source_to_concept_map.source_concept_id -- - relationship_id = 'Maps from' -- - valid_start_date = if source_to_concept_map.valid_start_date is not NULL, use it, otherwise use '1970-01-01' -- - valid_end_date = if source_to_concept_map.valid_end_date is not NULL, use it, otherwise use '2099-12-31' --- - invalid_reason = NULL +-- - invalid_reason = NULL -- 'Subsumes' as follows: -- - concept_id_1 = source_to_concept_map.source_concept_id -- - concept_id_2 = any concept_id in source_to_concept_map.source_parents_concept_ids -- - relationship_id = 'Subsumes' -- - valid_start_date = if source_to_concept_map.valid_start_date is not NULL, use it, otherwise use '1970-01-01' -- - valid_end_date = if source_to_concept_map.valid_end_date is not NULL, use it, otherwise use '2099-12-31' --- - invalid_reason = NULL +-- - invalid_reason = NULL -- 'Is a' as follows: -- - concept_id_1 = any concept_id in source_to_concept_map.source_parents_concept_ids -- - concept_id_2 = source_to_concept_map.source_concept_id -- - relationship_id = 'Is a' -- - valid_start_date = if source_to_concept_map.valid_start_date is 
not NULL, use it, otherwise use '1970-01-01' -- - valid_end_date = if source_to_concept_map.valid_end_date is not NULL, use it, otherwise use '2099-12-31' --- - invalid_reason = NULL +-- - invalid_reason = NULL DELETE FROM @vocabularyDatabaseSchema.CONCEPT_RELATIONSHIP WHERE @@ -218,7 +218,7 @@ WHERE AND stcm.source_concept_id != 0; -- subsumes -INSERT INTO +INSERT INTO @vocabularyDatabaseSchema.CONCEPT_RELATIONSHIP ( concept_id_1, concept_id_2, @@ -229,43 +229,43 @@ INSERT INTO ) WITH RECURSIVE split_parents AS ( - SELECT + SELECT source_concept_id, valid_start_date, valid_end_date, -- Get the first value before the delimiter - SUBSTRING(source_parents_concept_ids, 1, - CASE - WHEN POSITION('|' IN source_parents_concept_ids) = 0 + SUBSTRING(source_parents_concept_ids, 1, + CASE + WHEN POSITION('|' IN source_parents_concept_ids) = 0 THEN LENGTH(source_parents_concept_ids) ELSE POSITION('|' IN source_parents_concept_ids) - 1 END) AS source_parents_concept_ids, -- Get the remaining string after the delimiter - CASE - WHEN POSITION('|' IN source_parents_concept_ids) = 0 + CASE + WHEN POSITION('|' IN source_parents_concept_ids) = 0 THEN NULL - ELSE SUBSTRING(source_parents_concept_ids, + ELSE SUBSTRING(source_parents_concept_ids, POSITION('|' IN source_parents_concept_ids) + 1) END AS remaining_string FROM @vocabularyDatabaseSchema.@sourceToConceptMapTable AS stcm WHERE stcm.source_parents_concept_ids IS NOT NULL - + UNION ALL - - SELECT + + SELECT source_concept_id, valid_start_date, valid_end_date, - SUBSTRING(remaining_string, 1, - CASE - WHEN POSITION('|' IN remaining_string) = 0 + SUBSTRING(remaining_string, 1, + CASE + WHEN POSITION('|' IN remaining_string) = 0 THEN LENGTH(remaining_string) ELSE POSITION('|' IN remaining_string) - 1 END), - CASE - WHEN POSITION('|' IN remaining_string) = 0 + CASE + WHEN POSITION('|' IN remaining_string) = 0 THEN NULL - ELSE SUBSTRING(remaining_string, + ELSE SUBSTRING(remaining_string, POSITION('|' IN remaining_string) + 1) END 
FROM split_parents @@ -275,7 +275,7 @@ WITH RECURSIVE split_parents AS ( SELECT DISTINCT CAST(sp.source_concept_id AS INTEGER) AS concept_id_1, CAST(sp.source_parents_concept_ids AS INTEGER) AS concept_id_2, - 'Subsumes' AS relationship_id, + 'Is a' AS relationship_id, CAST( COALESCE(sp.valid_start_date, '1970-01-01') AS DATE ) AS valid_start_date, @@ -287,11 +287,11 @@ FROM split_parents AS sp WHERE sp.source_parents_concept_ids IS NOT NULL UNION ALL --- is a +-- Subsumes SELECT DISTINCT CAST(sp.source_parents_concept_ids AS INTEGER) AS concept_id_1, CAST(sp.source_concept_id AS INTEGER) AS concept_id_2, - 'Is a' AS relationship_id, + 'Subsumes' AS relationship_id, CAST( COALESCE(sp.valid_start_date, '1970-01-01') AS DATE ) AS valid_start_date, @@ -300,4 +300,4 @@ SELECT DISTINCT ) AS valid_end_date, NULL AS invalid_reason FROM split_parents AS sp -WHERE sp.source_parents_concept_ids IS NOT NULL \ No newline at end of file +WHERE sp.source_parents_concept_ids IS NOT NULL diff --git a/tests/testthat/test-STCMToCDMTable.R b/tests/testthat/test-STCMToCDMTable.R index 70bb7b8..6584227 100644 --- a/tests/testthat/test-STCMToCDMTable.R +++ b/tests/testthat/test-STCMToCDMTable.R @@ -19,7 +19,7 @@ test_that("STCMToCDMTables creates CONCEPT entries from STCM Extended with corre "code5", 2000000005, "TestVocab", "Test Code 5 unmapped", 0, "SNOMED", as.Date("2023-01-01"), as.Date("2099-12-31"), NA_character_, "Test concept class", "Condition", NA_character_, # no dates "code6", 2000000006, "TestVocab", "Test Code 6 no start date", 0, "SNOMED", as.Date(NA), as.Date("2099-12-31"), NA_character_, "Test concept class 2", "Condition", NA_character_, - "code7", 2000000007, "TestVocab", "Test Code 7 no end date", 0, "SNOMED", as.Date("2023-01-01"), as.Date(NA), NA_character_, "Test concept class 2", "Condition", NA_character_, + "code7", 2000000007, "TestVocab", "Test Code 7 no end date", 0, "SNOMED", as.Date("2023-01-01"), as.Date(NA), NA_character_, "Test concept class 2", 
"Condition", NA_character_, # parent concept ids "code8", 2000000008, "TestVocab", "Test Code 8 parent concept ids", 0, "SNOMED", as.Date("2023-01-01"), as.Date("2099-12-31"), NA_character_, "Test concept class 3", "Condition", "2000000001", "code9", 2000000009, "TestVocab", "Test Code 9 parent concept ids", 0, "SNOMED", as.Date("2023-01-01"), as.Date("2099-12-31"), NA_character_, "Test concept class 3", "Condition", "2000000001|2000000002", @@ -56,9 +56,9 @@ test_that("STCMToCDMTables creates CONCEPT entries from STCM Extended with corre ) # CONCEPT - res <- dplyr::tbl(connection, "CONCEPT") |> - dplyr::filter(vocabulary_id == "TestVocab") |> - dplyr::arrange(concept_id) |> + res <- dplyr::tbl(connection, "CONCEPT") |> + dplyr::filter(vocabulary_id == "TestVocab") |> + dplyr::arrange(concept_id) |> dplyr::collect() # general @@ -72,43 +72,44 @@ test_that("STCMToCDMTables creates CONCEPT entries from STCM Extended with corre # CONCEPT_RELATIONSHIP # maps to - res <- dplyr::tbl(connection, "CONCEPT_RELATIONSHIP") |> - dplyr::filter(relationship_id == "Maps to") |> - dplyr::filter(concept_id_1 > 2000000000) |> - dplyr::arrange(concept_id_1, concept_id_2) |> + res <- dplyr::tbl(connection, "CONCEPT_RELATIONSHIP") |> + dplyr::filter(relationship_id == "Maps to") |> + dplyr::filter(concept_id_1 > 2000000000) |> + dplyr::arrange(concept_id_1, concept_id_2) |> dplyr::collect() res |> nrow() |> expect_equal(6) res |> dplyr::pull(relationship_id) |> expect_equal(rep("Maps to", 6)) res |> dplyr::pull(concept_id_2) |> expect_equal(c( 141797, 141797, 36713461, 141797, 36713461, 36713461)) # maps from - res <- dplyr::tbl(connection, "CONCEPT_RELATIONSHIP") |> - dplyr::filter(relationship_id == "Mapped from") |> - dplyr::filter(concept_id_2 > 2000000000) |> - dplyr::arrange(concept_id_2, concept_id_1) |> + res <- dplyr::tbl(connection, "CONCEPT_RELATIONSHIP") |> + dplyr::filter(relationship_id == "Mapped from") |> + dplyr::filter(concept_id_2 > 2000000000) |> + 
dplyr::arrange(concept_id_2, concept_id_1) |> dplyr::collect() res |> nrow() |> expect_equal(6) res |> dplyr::pull(relationship_id) |> expect_equal(rep("Mapped from", 6)) res |> dplyr::pull(concept_id_1) |> expect_equal(c( 141797, 141797, 36713461, 141797, 36713461, 36713461)) # subsumes - res <- dplyr::tbl(connection, "CONCEPT_RELATIONSHIP") |> - dplyr::filter(relationship_id == "Subsumes") |> - dplyr::filter(concept_id_1 > 2000000000) |> - dplyr::arrange(concept_id_1, concept_id_2) |> + res <- dplyr::tbl(connection, "CONCEPT_RELATIONSHIP") |> + dplyr::filter(relationship_id == "Subsumes") |> + dplyr::filter(concept_id_1 > 2000000000) |> + dplyr::arrange(concept_id_1, concept_id_2) |> dplyr::collect() res |> nrow() |> expect_equal(6) - res |> dplyr::pull(concept_id_1) |> expect_equal(c(2000000008, 2000000009, 2000000009, 2000000010, 2000000010, 2000000010)) - res |> dplyr::pull(concept_id_2) |> expect_equal(c(2000000001, 2000000001, 2000000002, 2000000001, 2000000002, 2000000003)) + res |> dplyr::pull(concept_id_1) |> expect_equal(c(2000000001, 2000000001, 2000000002, 2000000001, 2000000002, 2000000003)) + res |> dplyr::pull(concept_id_2) |> expect_equal(c(2000000008, 2000000009, 2000000009, 2000000010, 2000000010, 2000000010)) + # is a - res <- dplyr::tbl(connection, "CONCEPT_RELATIONSHIP") |> - dplyr::filter(relationship_id == "Is a") |> - dplyr::filter(concept_id_1 > 2000000000) |> - dplyr::arrange(concept_id_1, concept_id_2) |> + res <- dplyr::tbl(connection, "CONCEPT_RELATIONSHIP") |> + dplyr::filter(relationship_id == "Is a") |> + dplyr::filter(concept_id_1 > 2000000000) |> + dplyr::arrange(concept_id_1, concept_id_2) |> dplyr::collect() res |> nrow() |> expect_equal(6) - res |> dplyr::pull(concept_id_1) |> expect_equal(c(2000000001, 2000000001, 2000000002, 2000000001, 2000000002, 2000000003)) - res |> dplyr::pull(concept_id_2) |> expect_equal(c(2000000008, 2000000009, 2000000009, 2000000010, 2000000010, 2000000010)) + res |> dplyr::pull(concept_id_1) |> 
expect_equal(c(2000000008, 2000000009, 2000000009, 2000000010, 2000000010, 2000000010)) + res |> dplyr::pull(concept_id_2) |> expect_equal(c(2000000001, 2000000001, 2000000002, 2000000001, 2000000002, 2000000003)) }) diff --git a/tests/testthat/test-appendUsagiFileToSTCMTable.R b/tests/testthat/test-appendUsagiFileToSTCMTable.R index 0a73736..e1894aa 100644 --- a/tests/testthat/test-appendUsagiFileToSTCMTable.R +++ b/tests/testthat/test-appendUsagiFileToSTCMTable.R @@ -63,7 +63,7 @@ test_that("test appendUsagiFileToSTCMTable appends the usagi file to the sourceT connection = connection, vocabularyDatabaseSchema = vocabularyDatabaseSchema, sourceToConceptMapTable = sourceToConceptMapTable - ) + ) stcmTable <- DBI::dbReadTable(connection, sourceToConceptMapTable) |> tibble::as_tibble() stcmTable |> @@ -74,18 +74,59 @@ test_that("test appendUsagiFileToSTCMTable appends the usagi file to the sourceT dplyr::filter(TARGET_CONCEPT_ID != 0L) |> dplyr::count() |> dplyr::pull(n) |> expect_equal(nrowUsagiFileMapped) - + stcmTable |> - names() |> + names() |> stringr::str_to_lower() |> expect_equal(c( "source_code", "source_concept_id", "source_vocabulary_id", "source_code_description", "target_concept_id", - "target_vocabulary_id", "valid_start_date", "valid_end_date", "invalid_reason", "source_concept_class", + "target_vocabulary_id", "valid_start_date", "valid_end_date", "invalid_reason", "source_concept_class", "source_domain", "source_parents_concept_ids")) stcmTable |> dplyr::filter(is.na(SOURCE_PARENTS_CONCEPT_IDS)) |> nrow() |> - expect_equal(21) + expect_equal(0) + +}) + +test_that("test appendUsagiFileToSTCMTable appends the ICD10fi usagi file to the sourceToConceptMapTable with ICD10 parent information", { + pathToUsagiFile <- system.file("testdata/VOCABULARIES/ICD10fi/ICD10fi.usagi.csv", package = "ROMOPMappingTools") + nrowUsagiFile <- readUsagiFile(pathToUsagiFile) |> nrow() + nrowUsagiFileMapped <- readUsagiFile(pathToUsagiFile) |> dplyr::filter(mappingStatus == 
"APPROVED") |> nrow() + pathToOMOPVocabularyDuckDBfile <- helper_createATemporaryCopyOfTheOMOPVocabularyDuckDB() + vocabularyDatabaseSchema <- "main" + + connection <- DatabaseConnector::connect( + dbms = "duckdb", + server = pathToOMOPVocabularyDuckDBfile + ) + on.exit(DatabaseConnector::disconnect(connection)) + + # create an extended sourceToConceptMapTable + sourceToConceptMapTable <- "source_to_concept_map_extended" + createSourceToConceptMapExtended(connection, vocabularyDatabaseSchema, sourceToConceptMapTable) + + appendUsagiFileToSTCMtable( + vocabularyId = "ICD10fi", + pathToUsagiFile = pathToUsagiFile, + connection = connection, + vocabularyDatabaseSchema = vocabularyDatabaseSchema, + sourceToConceptMapTable = sourceToConceptMapTable + ) + + stcmTable <- DBI::dbReadTable(connection, sourceToConceptMapTable) |> tibble::as_tibble() + + # For source code C18.62 the parent concept IDs should be ICD10 code C18.6 with concept_id 45552246 + stcmTable |> + dplyr::filter(SOURCE_CODE == "C18.62") |> + dplyr::pull(SOURCE_PARENTS_CONCEPT_IDS) |> + expect_equal("45552246") + + # For source code Y94.1 the parent concept IDs should be ICD10fi code Y94 with concept_id 2000503727 + stcmTable |> + dplyr::filter(SOURCE_CODE == "Y94.1") |> + dplyr::pull(SOURCE_PARENTS_CONCEPT_IDS) |> + expect_equal("2000503725") }) diff --git a/tests/testthat/test-conceptRelationshipToAncestorTables.R b/tests/testthat/test-conceptRelationshipToAncestorTables.R index fb1f047..d692664 100644 --- a/tests/testthat/test-conceptRelationshipToAncestorTables.R +++ b/tests/testthat/test-conceptRelationshipToAncestorTables.R @@ -25,21 +25,21 @@ test_that("conceptRelationshipToAncestorTables creates CONCEPT_ANCESTOR table fr ancestor |> nrow() |> expect_gt(0) # check icd10 for asthma 45596282 - asthmaChildren <- ancestor |> - dplyr::filter(ancestor_concept_id == 45596282) |> + asthmaChildren <- ancestor |> + dplyr::filter(ancestor_concept_id == 45596282) |> dplyr::collect() - asthmaChildren |> nrow() 
|> expect_equal(4) - asthmaChildren |> dplyr::pull(descendant_concept_id) |> expect_setequal(c(45548118, 45557624, 45557625, 45562456)) + asthmaChildren |> nrow() |> expect_equal(5) + asthmaChildren |> dplyr::pull(descendant_concept_id) |> expect_setequal(c(45548118, 45557624, 45557625, 45562456, 45596282)) # check descendant Chronic lower respiratory diseases 40475107 - crdDescendant <- ancestor |> - dplyr::filter(ancestor_concept_id == 40475107) |> + crdDescendant <- ancestor |> + dplyr::filter(ancestor_concept_id == 40475107) |> dplyr::collect() - crdDescendant |> nrow() |> expect_equal(24) - crdDescendant |> dplyr::count(min_levels_of_separation, max_levels_of_separation) |> - dplyr::pull(n) |> - expect_equal(c(8,16)) - + crdDescendant |> nrow() |> expect_equal(25) + crdDescendant |> dplyr::count(min_levels_of_separation, max_levels_of_separation) |> + dplyr::pull(n) |> + expect_equal(c(1,8,16)) + }) diff --git a/tests/testthat/test-databasesFromAndToCSV.R b/tests/testthat/test-databasesFromAndToCSV.R index 282214a..2cabeac 100644 --- a/tests/testthat/test-databasesFromAndToCSV.R +++ b/tests/testthat/test-databasesFromAndToCSV.R @@ -52,5 +52,5 @@ testthat::test_that("duckdbToOMOPVocabularyCSVs", { ) ) - all(validation$type == "SUCCESS") |> expect_true() + all(validation$type %in% c("SUCCESS", "WARNING")) |> expect_true() })