From a1a8a5aba07e31726ccc0887b501e994dcd4793a Mon Sep 17 00:00:00 2001 From: bbimber Date: Thu, 31 Oct 2024 22:08:12 -0700 Subject: [PATCH 01/20] Prepare to support species in mGAP (#175) * Prepare to support species in mGAP --- mGAP/resources/data/species.tsv | 10 ++ mGAP/resources/etls/prime-seq.xml | 2 + .../queries/mGAP/combinedPedigree.sql | 12 +- .../queries/mGAP/releaseTracks/.qview.xml | 2 +- .../mGAP/variantCatalogReleases/.qview.xml | 1 + mGAP/resources/r/UpdateTracks.r | 115 +++++++++++++++ .../dbscripts/postgresql/mgap-16.73-16.74.sql | 3 + .../dbscripts/sqlserver/mgap-16.73-16.74.sql | 3 + mGAP/resources/schemas/mgap.xml | 26 +++- mGAP/src/org/labkey/mgap/mGAPController.java | 88 ++++++++++-- mGAP/src/org/labkey/mgap/mGAPModule.java | 2 +- mcc/package-lock.json | 132 +++++++----------- 12 files changed, 290 insertions(+), 106 deletions(-) create mode 100644 mGAP/resources/data/species.tsv create mode 100644 mGAP/resources/r/UpdateTracks.r create mode 100644 mGAP/resources/schemas/dbscripts/postgresql/mgap-16.73-16.74.sql create mode 100644 mGAP/resources/schemas/dbscripts/sqlserver/mgap-16.73-16.74.sql diff --git a/mGAP/resources/data/species.tsv b/mGAP/resources/data/species.tsv new file mode 100644 index 000000000..2472af328 --- /dev/null +++ b/mGAP/resources/data/species.tsv @@ -0,0 +1,10 @@ +common_name scientific_name mhc_prefix +Cotton-top Tamarin Saguinus oedipus Saoe +Cynomolgus macaque Macaca fascicularis Mafa +Marmoset Callithrix jacchus Caja +Pigtail macaque Macaca nemestrina Mane +Rhesus macaque Macaca mulatta Mamu +Sooty Mangabey Cercocebus atys Ceat +Stump Tailed Macaca Arctoides Maar +Vervet Chlorocebus sabaeus Chsa +Japanese macaque Macaca fuscata Mafu diff --git a/mGAP/resources/etls/prime-seq.xml b/mGAP/resources/etls/prime-seq.xml index 8b2bcecc9..458803fca 100644 --- a/mGAP/resources/etls/prime-seq.xml +++ b/mGAP/resources/etls/prime-seq.xml @@ -53,6 +53,7 @@ releaseId trackName label + species totalSamples category url @@ -101,6 
+102,7 @@ releaseDate vcfId/dataid/DataFileUrl genomeId/name + species totalSubjects totalVariants dbSnpId diff --git a/mGAP/resources/queries/mGAP/combinedPedigree.sql b/mGAP/resources/queries/mGAP/combinedPedigree.sql index 55512965a..4725d8998 100644 --- a/mGAP/resources/queries/mGAP/combinedPedigree.sql +++ b/mGAP/resources/queries/mGAP/combinedPedigree.sql @@ -1,12 +1,12 @@ SELECT - s.subjectname, + s.Id as subjectname, s.gender, - s.mother as dam, - s.father as sire, + s.Id.parents.dam as dam, + s.Id.parents.sire as sire, s.species, - s.geographic_origin, + s.geographic_origin -FROM laboratory.subjects s +FROM "/Internal/PMR/".study.demographics s UNION ALL @@ -19,4 +19,4 @@ SELECT null as geographic_origin FROM mgap.demographics d -WHERE d.subjectname NOT IN (SELECT DISTINCT s.subjectname FROM laboratory.subjects s) \ No newline at end of file +WHERE d.subjectname NOT IN (SELECT DISTINCT s.Id FROM "/Internal/PMR/".study.demographics s) \ No newline at end of file diff --git a/mGAP/resources/queries/mGAP/releaseTracks/.qview.xml b/mGAP/resources/queries/mGAP/releaseTracks/.qview.xml index 08488c2ec..7de86c8ae 100644 --- a/mGAP/resources/queries/mGAP/releaseTracks/.qview.xml +++ b/mGAP/resources/queries/mGAP/releaseTracks/.qview.xml @@ -3,6 +3,7 @@ + @@ -10,7 +11,6 @@ - diff --git a/mGAP/resources/queries/mGAP/variantCatalogReleases/.qview.xml b/mGAP/resources/queries/mGAP/variantCatalogReleases/.qview.xml index f8b614ee2..ea7526061 100644 --- a/mGAP/resources/queries/mGAP/variantCatalogReleases/.qview.xml +++ b/mGAP/resources/queries/mGAP/variantCatalogReleases/.qview.xml @@ -1,6 +1,7 @@ + diff --git a/mGAP/resources/r/UpdateTracks.r b/mGAP/resources/r/UpdateTracks.r new file mode 100644 index 000000000..8646b5cb0 --- /dev/null +++ b/mGAP/resources/r/UpdateTracks.r @@ -0,0 +1,115 @@ +library(Rlabkey) +library(dplyr) + +# This script is designed to be run externally per release, to identify subject that need to be added to the releaseTrackSubsets table: + 
+testByCenter <- function(centerName, trackName) { + dat <- suppressWarnings(labkey.selectRows( + baseUrl="https://prime-seq.ohsu.edu", + folderPath="/Internal/ColonyData", + schemaName="mgap", + queryName="sampleSummary", + viewName="", + colSelect="subjectId,externalAlias", + colFilter=makeFilter( + c("tracks", "DOES_NOT_CONTAIN", trackName), + c("center", "EQUAL", centerName)), + containerFilter=NULL, + colNameOpt="rname" + )) + + print(paste0(trackName, ': ', nrow(dat))) + + if (nrow(dat) == 0) { + return(NULL) + } + + return(data.frame(trackName = trackName, subjectId = dat$subjectid)) +} + +testBySpecies <- function(speciesList, trackName) { + dat <- suppressWarnings(labkey.selectRows( + baseUrl="https://prime-seq.ohsu.edu", + folderPath="/Internal/ColonyData", + schemaName="mgap", + queryName="sampleSummary", + viewName="", + colSelect="subjectId,externalAlias", + colFilter=makeFilter( + c("tracks", "DOES_NOT_CONTAIN", trackName), + c("species", "IN", paste0(speciesList, collapse = ';'))), + containerFilter=NULL, + colNameOpt="rname" + )) + + print(paste0(trackName, ': ', nrow(dat))) + + if (nrow(dat) == 0) { + return(NULL) + } + + return(data.frame(trackName = trackName, subjectId = dat$subjectid)) +} + +toInsert <- rbind( + testByCenter('CNPRC', 'CNPRC Animals'), + testByCenter('TNPRC', 'TNPRC Animals'), + testByCenter('ENPRC', 'ENPRC Animals'), + testByCenter('NEPRC', 'NEPRC Animals'), + testByCenter('SNPRC', 'SNPRC Animals'), + testByCenter('ONPRC', 'ONPRC Animals'), + testByCenter('MDA', 'MDA Animals'), + testByCenter('WFU', 'WFU Animals'), + testByCenter('CPRC', 'CPRC Animals'), + testBySpecies(c('RHESUS MACAQUE', 'Rhesus', 'Macaca mulatta'), 'Rhesus Macaques'), + testBySpecies(c('JAPANESE MACAQUE', 'Macaca fuscata'), 'Japanese Macaques') +) + + +if (FALSE) { + added <- labkey.insertRows( + baseUrl="https://prime-seq.ohsu.edu", + folderPath="/Internal/ColonyData", + schemaName="mgap", + queryName="releaseTrackSubsets", + toInsert = toInsert + ) +} + + 
+# Now ensure all tracks exist: +existingTracks <- labkey.selectRows( + baseUrl="https://prime-seq.ohsu.edu", + folderPath="/Internal/ColonyData", + schemaName="mgap", + queryName="releaseTracks", + colNameOpt="rname" +) + +missingTrackNames <- labkey.selectRows( + baseUrl="https://prime-seq.ohsu.edu", + folderPath="/Internal/ColonyData", + schemaName="mgap", + queryName="releaseTrackSubsets", + colSelect="trackName", + colNameOpt="rname" +) %>% + filter(!trackname %in% existingTracks$trackname) %>% + select(trackname) %>% unique() + +if (nrow(missingTrackNames) > 0) { + toAdd <- data.frame(trackName = missingTrackNames$trackname, label = missingTrackNames$trackname, isprimarytrack = FALSE) + toAdd$Category <- 'Species Dataset' + # Add anything else desired, like species, source, url, description, category + + if (FALSE) { + added <- labkey.insertRows( + baseUrl="https://prime-seq.ohsu.edu", + folderPath="/Internal/ColonyData", + schemaName="mgap", + queryName="releaseTracks", + toInsert = toAdd + ) + } +} + diff --git a/mGAP/resources/schemas/dbscripts/postgresql/mgap-16.73-16.74.sql b/mGAP/resources/schemas/dbscripts/postgresql/mgap-16.73-16.74.sql new file mode 100644 index 000000000..e4f4faba8 --- /dev/null +++ b/mGAP/resources/schemas/dbscripts/postgresql/mgap-16.73-16.74.sql @@ -0,0 +1,3 @@ +ALTER TABLE mGAP.variantCatalogReleases ADD species varchar(1000); +ALTER TABLE mGAP.releaseTracks ADD species varchar(1000); +ALTER TABLE mGAP.releaseTracks DROP COLUMN mergepriority; \ No newline at end of file diff --git a/mGAP/resources/schemas/dbscripts/sqlserver/mgap-16.73-16.74.sql b/mGAP/resources/schemas/dbscripts/sqlserver/mgap-16.73-16.74.sql new file mode 100644 index 000000000..e4f4faba8 --- /dev/null +++ b/mGAP/resources/schemas/dbscripts/sqlserver/mgap-16.73-16.74.sql @@ -0,0 +1,3 @@ +ALTER TABLE mGAP.variantCatalogReleases ADD species varchar(1000); +ALTER TABLE mGAP.releaseTracks ADD species varchar(1000); +ALTER TABLE mGAP.releaseTracks DROP COLUMN 
mergepriority; \ No newline at end of file diff --git a/mGAP/resources/schemas/mgap.xml b/mGAP/resources/schemas/mgap.xml index 24bfd6332..fa92efe8c 100644 --- a/mGAP/resources/schemas/mgap.xml +++ b/mGAP/resources/schemas/mgap.xml @@ -95,6 +95,15 @@ false Row Id + + Species + + laboratory + species + common_name + + + Version false @@ -644,6 +653,15 @@ Label false + + Species + + laboratory + species + common_name + + + Source ${url} @@ -669,11 +687,7 @@ textarea - Is Primary Track? - - - Merge Priority Order - This order will be used for genotype priority order when merging to create the primary release VCF. Lower numbers have higher priority. Set to -1 to exclude this track when merging. + Is Primary Track For Species? Skip Validation? @@ -831,7 +845,7 @@ textarea - Is Primary Track? + Is Primary Track For Species? false diff --git a/mGAP/src/org/labkey/mgap/mGAPController.java b/mGAP/src/org/labkey/mgap/mGAPController.java index e8f0ebbc4..52991080b 100644 --- a/mGAP/src/org/labkey/mgap/mGAPController.java +++ b/mGAP/src/org/labkey/mgap/mGAPController.java @@ -58,6 +58,7 @@ import org.labkey.api.query.QueryUpdateService; import org.labkey.api.query.UserSchema; import org.labkey.api.reader.Readers; +import org.labkey.api.resource.Resource; import org.labkey.api.security.AuthenticationManager; import org.labkey.api.security.Group; import org.labkey.api.security.GroupManager; @@ -224,7 +225,7 @@ public Object execute(RequestUserForm form, BindException errors) throws Excepti } DetailsURL url = DetailsURL.fromString("/query/executeQuery.view?schemaName=mgap&query.queryName=userRequests&query.viewName=Pending Requests", c); - mail.setEncodedHtmlContent("A user requested an account on mGap. Click here to view/approve this request"); + mail.setEncodedHtmlContent("A user requested an account on mGap. 
Click here to view/approve this request"); mail.setFrom(getReplyEmail(getContainer())); mail.setSubject("mGap Account Request"); mail.addRecipients(Message.RecipientType.TO, emails.toArray(new Address[emails.size()])); @@ -412,13 +413,13 @@ public Object execute(ApproveUserRequestsForm form, BindException errors) throws User u; if (map.get("userId") != null) { - Integer userId = (Integer)map.get("userId"); + Integer userId = (Integer) map.get("userId"); u = UserManager.getUser(userId); existingUsersGivenAccess.add(u); } else { - ValidEmail ve = new ValidEmail((String)map.get("email")); + ValidEmail ve = new ValidEmail((String) map.get("email")); u = UserManager.getUser(ve); if (u != null) { @@ -428,8 +429,8 @@ public Object execute(ApproveUserRequestsForm form, BindException errors) throws { SecurityManager.NewUserStatus st = SecurityManager.addUser(ve, getUser()); u = st.getUser(); - u.setFirstName((String)map.get("firstName")); - u.setLastName((String)map.get("lastName")); + u.setFirstName((String) map.get("firstName")); + u.setLastName((String) map.get("lastName")); UserManager.updateUser(getUser(), u); if (st.isLdapOrSsoEmail()) @@ -539,7 +540,7 @@ private static Map getReleaseRow(User u, ReleaseForm form, Error return null; } - Container rowContainer = ContainerManager.getForId((String)row.get("container")); + Container rowContainer = ContainerManager.getForId((String) row.get("container")); if (rowContainer == null) { errors.reject(ERROR_MSG, "Unknown row container: " + form.getReleaseId()); @@ -555,7 +556,7 @@ else if (!rowContainer.hasPermission(u, ReadPermission.class)) private static SequenceOutputFile getOutputFile(Map row, ReleaseForm form, Errors errors) { - SequenceOutputFile so = SequenceOutputFile.getForId((Integer)row.get("vcfId")); + SequenceOutputFile so = SequenceOutputFile.getForId((Integer) row.get("vcfId")); if (so == null) { errors.reject(ERROR_MSG, "Unknown VCF file ID: " + form.getReleaseId()); @@ -590,7 +591,7 @@ public void 
export(DownloadBundleForm form, HttpServletResponse response, BindEx } Set toZip = new HashSet<>(); - String zipName = "mGap_VariantCatalog_v" + FileUtil.makeLegalName((String)row.get("version")); + String zipName = "mGap_VariantCatalog_v" + FileUtil.makeLegalName((String) row.get("version")); zipName = zipName.replaceAll(" ", "_"); toZip.add(so.getFile()); @@ -598,7 +599,7 @@ public void export(DownloadBundleForm form, HttpServletResponse response, BindEx if (form.getIncludeGenome()) { - ReferenceGenome genome = SequenceAnalysisService.get().getReferenceGenome((Integer)row.get("genomeId"), getUser()); + ReferenceGenome genome = SequenceAnalysisService.get().getReferenceGenome((Integer) row.get("genomeId"), getUser()); if (genome == null) { errors.reject(ERROR_MSG, "Unknown genome: " + row.get("genomeId")); @@ -969,7 +970,7 @@ public URLHelper getRedirectURL(GenomeBrowserForm form) String species = StringUtils.trimToNull(form.getSpecies()); if (jbrowseDatabaseId == null) { - jbrowseDatabaseId = ctx.getString("human".equals(species) ? "mgapJBrowseHuman": "mgapJBrowse"); + jbrowseDatabaseId = ctx.getString("human".equals(species) ? "mgapJBrowseHuman" : "mgapJBrowse"); } if (jbrowseDatabaseId == null) @@ -1283,4 +1284,71 @@ public URLHelper getSuccessURL(Object o) return PageFlowUtil.urlProvider(PipelineUrls.class).urlBegin(getContainer()); } } + + @RequiresPermission(AdminPermission.class) + public static class ImportDataAction extends ConfirmAction + { + @Override + public ModelAndView getConfirmView(Object o, BindException errors) throws Exception + { + setTitle("Import mGAP Reference Data"); + + return HtmlView.of("This will import default values for reference tables. 
Do you want to continue?"); + } + + @Override + public void validateCommand(Object o, Errors errors) + { + + } + + @Override + public @NotNull URLHelper getSuccessURL(Object o) + { + return getContainer().getStartURL(getUser()); + } + + @Override + public boolean handlePost(Object o, BindException errors) throws Exception + { + Resource r = ModuleLoader.getInstance().getModule(mGAPModule.class).getModuleResource(Path.parse("data/species.tsv")); + if (!r.exists()) + { + throw new IllegalStateException("Unable to find species.tsv"); + } + + List> toAdd = new ArrayList<>(); + try (CSVReader reader = new CSVReader(Readers.getReader(r.getInputStream()), '\t')) + { + String[] line; + while ((line = reader.readNext()) != null) + { + if (line[0].equals("common_name")) + { + continue; + } + + Map row = new CaseInsensitiveHashMap<>(); + row.put("common_name", line[0]); + row.put("scientific_name", line[1]); + row.put("mhc_prefix", line[2]); + + toAdd.add(row); + } + } + + UserSchema us = QueryService.get().getUserSchema(getUser(), getContainer(), "laboratory"); + TableInfo ti = us.getTable("species"); + ti.getUpdateService().truncateRows(getUser(), getContainer(), null, null); + + BatchValidationException bve = new BatchValidationException(); + ti.getUpdateService().insertRows(getUser(), getContainer(), toAdd, bve, null, null); + if (bve.hasErrors()) + { + throw bve; + } + + return true; + } + } } \ No newline at end of file diff --git a/mGAP/src/org/labkey/mgap/mGAPModule.java b/mGAP/src/org/labkey/mgap/mGAPModule.java index 7bcc1c7c4..28affbe03 100644 --- a/mGAP/src/org/labkey/mgap/mGAPModule.java +++ b/mGAP/src/org/labkey/mgap/mGAPModule.java @@ -77,7 +77,7 @@ public String getName() @Override public Double getSchemaVersion() { - return 16.73; + return 16.74; } @Override diff --git a/mcc/package-lock.json b/mcc/package-lock.json index 3655ca754..9091b05dc 100644 --- a/mcc/package-lock.json +++ b/mcc/package-lock.json @@ -4938,21 +4938,6 @@ "integrity": 
"sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", "dev": true }, - "node_modules/body-parser/node_modules/qs": { - "version": "6.13.0", - "resolved": "https://registry.npmjs.org/qs/-/qs-6.13.0.tgz", - "integrity": "sha512-+38qI9SOr8tfZ4QmJNplMUxqjbe7LKvvZgWdExBOmd+egZTtjLB67Gu0HRX3u/XOq7UU2Nx6nsjvS16Z9uwfpg==", - "dev": true, - "dependencies": { - "side-channel": "^1.0.6" - }, - "engines": { - "node": ">=0.6" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, "node_modules/bonjour-service": { "version": "1.2.1", "resolved": "https://registry.npmjs.org/bonjour-service/-/bonjour-service-1.2.1.tgz", @@ -5554,10 +5539,11 @@ "integrity": "sha512-ASFBup0Mz1uyiIjANan1jzLQami9z1PoYSZCiiYW2FczPbenXc45FZdBZLzOT+r6+iciuEModtmCti+hjaAk0A==" }, "node_modules/cookie": { - "version": "0.6.0", - "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.6.0.tgz", - "integrity": "sha512-U71cyTamuh1CRNCfpGY6to28lxvNwPG4Guz/EVjgf3Jmzv0vlDp1atT9eS5dDjMYHucpHbWns6Lwf3BKz6svdw==", + "version": "0.7.1", + "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.7.1.tgz", + "integrity": "sha512-6DnInpx7SJ2AK3+CTUE/ZM0vWTUboZCegxhC2xiIydHR9jNuTAASBrfEpHhiGOZw/nX51bHt6YQl8jsGo4y/0w==", "dev": true, + "license": "MIT", "engines": { "node": ">= 0.6" } @@ -6587,24 +6573,25 @@ } }, "node_modules/express": { - "version": "4.20.0", - "resolved": "https://registry.npmjs.org/express/-/express-4.20.0.tgz", - "integrity": "sha512-pLdae7I6QqShF5PnNTCVn4hI91Dx0Grkn2+IAsMTgMIKuQVte2dN9PeGSSAME2FR8anOhVA62QDIUaWVfEXVLw==", + "version": "4.21.1", + "resolved": "https://registry.npmjs.org/express/-/express-4.21.1.tgz", + "integrity": "sha512-YSFlK1Ee0/GC8QaO91tHcDxJiE/X4FbpAyQWkxAvG6AXCuR65YzK8ua6D9hvi/TzUfZMpc+BwuM1IPw8fmQBiQ==", "dev": true, + "license": "MIT", "dependencies": { "accepts": "~1.3.8", "array-flatten": "1.1.1", "body-parser": "1.20.3", "content-disposition": "0.5.4", "content-type": "~1.0.4", - "cookie": "0.6.0", + 
"cookie": "0.7.1", "cookie-signature": "1.0.6", "debug": "2.6.9", "depd": "2.0.0", "encodeurl": "~2.0.0", "escape-html": "~1.0.3", "etag": "~1.8.1", - "finalhandler": "1.2.0", + "finalhandler": "1.3.1", "fresh": "0.5.2", "http-errors": "2.0.0", "merge-descriptors": "1.0.3", @@ -6613,11 +6600,11 @@ "parseurl": "~1.3.3", "path-to-regexp": "0.1.10", "proxy-addr": "~2.0.7", - "qs": "6.11.0", + "qs": "6.13.0", "range-parser": "~1.2.1", "safe-buffer": "5.2.1", "send": "0.19.0", - "serve-static": "1.16.0", + "serve-static": "1.16.2", "setprototypeof": "1.2.0", "statuses": "2.0.1", "type-is": "~1.6.18", @@ -6750,13 +6737,14 @@ } }, "node_modules/finalhandler": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.2.0.tgz", - "integrity": "sha512-5uXcUVftlQMFnWC9qu/svkWv3GTd2PfUhK/3PLkYNAe7FbqJMt3515HaxE6eRL74GdsriiwujiawdaB1BpEISg==", + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.3.1.tgz", + "integrity": "sha512-6BN9trH7bp3qvnrRyzsBz+g3lZxTNZTbVO2EV1CS0WIcDbawYVdYvGflME/9QP0h0pYlCDBCTjYa9nZzMDpyxQ==", "dev": true, + "license": "MIT", "dependencies": { "debug": "2.6.9", - "encodeurl": "~1.0.2", + "encodeurl": "~2.0.0", "escape-html": "~1.0.3", "on-finished": "2.4.1", "parseurl": "~1.3.3", @@ -6772,15 +6760,27 @@ "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", "dev": true, + "license": "MIT", "dependencies": { "ms": "2.0.0" } }, + "node_modules/finalhandler/node_modules/encodeurl": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-2.0.0.tgz", + "integrity": "sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.8" + } + }, "node_modules/finalhandler/node_modules/ms": { "version": "2.0.0", "resolved": 
"https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", - "dev": true + "dev": true, + "license": "MIT" }, "node_modules/find-cache-dir": { "version": "4.0.0", @@ -7857,10 +7857,11 @@ } }, "node_modules/http-proxy-middleware": { - "version": "2.0.6", - "resolved": "https://registry.npmjs.org/http-proxy-middleware/-/http-proxy-middleware-2.0.6.tgz", - "integrity": "sha512-ya/UeJ6HVBYxrgYotAZo1KvPWlgB48kUJLDePFeneHsVujFaW5WNj2NgWCAE//B1Dl02BIfYlpNgBy8Kf8Rjmw==", + "version": "2.0.7", + "resolved": "https://registry.npmjs.org/http-proxy-middleware/-/http-proxy-middleware-2.0.7.tgz", + "integrity": "sha512-fgVY8AV7qU7z/MmXJ/rxwbrtQH4jBQ9m7kp3llF0liB7glmFeVZFBepQb32T3y8n8k2+AEYuMPCpinYW+/CuRA==", "dev": true, + "license": "MIT", "dependencies": { "@types/http-proxy": "^1.17.8", "http-proxy": "^1.18.1", @@ -11846,12 +11847,13 @@ ] }, "node_modules/qs": { - "version": "6.11.0", - "resolved": "https://registry.npmjs.org/qs/-/qs-6.11.0.tgz", - "integrity": "sha512-MvjoMCJwEarSbUYk5O+nmoSzSutSsTwF85zcHPQ9OrlFoZOYIjaqBAJIqIXjptyD5vThxGq52Xu/MaJzRkIk4Q==", + "version": "6.13.0", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.13.0.tgz", + "integrity": "sha512-+38qI9SOr8tfZ4QmJNplMUxqjbe7LKvvZgWdExBOmd+egZTtjLB67Gu0HRX3u/XOq7UU2Nx6nsjvS16Z9uwfpg==", "dev": true, + "license": "BSD-3-Clause", "dependencies": { - "side-channel": "^1.0.4" + "side-channel": "^1.0.6" }, "engines": { "node": ">=0.6" @@ -12946,63 +12948,29 @@ } }, "node_modules/serve-static": { - "version": "1.16.0", - "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.16.0.tgz", - "integrity": "sha512-pDLK8zwl2eKaYrs8mrPZBJua4hMplRWJ1tIFksVC3FtBEBnl8dxgeHtsaMS8DhS9i4fLObaon6ABoc4/hQGdPA==", + "version": "1.16.2", + "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.16.2.tgz", + "integrity": 
"sha512-VqpjJZKadQB/PEbEwvFdO43Ax5dFBZ2UECszz8bQ7pi7wt//PWe1P6MN7eCnjsatYtBT6EuiClbjSWP2WrIoTw==", "dev": true, + "license": "MIT", "dependencies": { - "encodeurl": "~1.0.2", + "encodeurl": "~2.0.0", "escape-html": "~1.0.3", "parseurl": "~1.3.3", - "send": "0.18.0" + "send": "0.19.0" }, "engines": { "node": ">= 0.8.0" } }, - "node_modules/serve-static/node_modules/debug": { - "version": "2.6.9", - "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", - "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", - "dev": true, - "dependencies": { - "ms": "2.0.0" - } - }, - "node_modules/serve-static/node_modules/debug/node_modules/ms": { + "node_modules/serve-static/node_modules/encodeurl": { "version": "2.0.0", - "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", - "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", - "dev": true - }, - "node_modules/serve-static/node_modules/ms": { - "version": "2.1.3", - "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", - "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", - "dev": true - }, - "node_modules/serve-static/node_modules/send": { - "version": "0.18.0", - "resolved": "https://registry.npmjs.org/send/-/send-0.18.0.tgz", - "integrity": "sha512-qqWzuOjSFOuqPjFe4NOsMLafToQQwBSOEpS+FwEt3A2V3vKubTquT3vmLTQpFgMXp8AlFWFuP1qKaJZOtPpVXg==", + "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-2.0.0.tgz", + "integrity": "sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==", "dev": true, - "dependencies": { - "debug": "2.6.9", - "depd": "2.0.0", - "destroy": "1.2.0", - "encodeurl": "~1.0.2", - "escape-html": "~1.0.3", - "etag": "~1.8.1", - "fresh": "0.5.2", - "http-errors": "2.0.0", - "mime": "1.6.0", - "ms": "2.1.3", - "on-finished": "2.4.1", - "range-parser": "~1.2.1", - 
"statuses": "2.0.1" - }, + "license": "MIT", "engines": { - "node": ">= 0.8.0" + "node": ">= 0.8" } }, "node_modules/set-blocking": { From 5650d21477329c7e589f8fc0abc91b51da7b6fa2 Mon Sep 17 00:00:00 2001 From: bbimber Date: Thu, 31 Oct 2024 22:29:00 -0700 Subject: [PATCH 02/20] Drop support for Cassandra --- .../labkey/mgap/pipeline/AnnotationStep.java | 171 ++---------------- .../pipeline/MultiSourceAnnotatorRunner.java | 8 +- 2 files changed, 14 insertions(+), 165 deletions(-) diff --git a/mGAP/src/org/labkey/mgap/pipeline/AnnotationStep.java b/mGAP/src/org/labkey/mgap/pipeline/AnnotationStep.java index 96dd56b56..2854ce3c5 100644 --- a/mGAP/src/org/labkey/mgap/pipeline/AnnotationStep.java +++ b/mGAP/src/org/labkey/mgap/pipeline/AnnotationStep.java @@ -2,7 +2,6 @@ import htsjdk.samtools.util.Interval; import htsjdk.variant.vcf.VCFFileReader; -import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.StringUtils; import org.jetbrains.annotations.Nullable; import org.json.JSONObject; @@ -25,18 +24,14 @@ import org.labkey.api.sequenceanalysis.SequenceOutputFile; import org.labkey.api.sequenceanalysis.pipeline.AbstractVariantProcessingStepProvider; import org.labkey.api.sequenceanalysis.pipeline.PipelineContext; -import org.labkey.api.sequenceanalysis.pipeline.PipelineStep; import org.labkey.api.sequenceanalysis.pipeline.PipelineStepProvider; import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome; import org.labkey.api.sequenceanalysis.pipeline.SequenceAnalysisJobSupport; -import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService; import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor; import org.labkey.api.sequenceanalysis.pipeline.VariantProcessingStep; import org.labkey.api.sequenceanalysis.pipeline.VariantProcessingStepOutputImpl; import org.labkey.api.sequenceanalysis.run.AbstractCommandPipelineStep; import org.labkey.api.sequenceanalysis.run.SelectVariantsWrapper; -import 
org.labkey.api.sequenceanalysis.run.SimpleScriptWrapper; -import org.labkey.api.util.FileUtil; import org.labkey.api.util.PageFlowUtil; import org.labkey.api.writer.PrintWriters; import org.labkey.mgap.mGAPSchema; @@ -57,7 +52,7 @@ /** * Created by bimber on 5/2/2017. */ -public class AnnotationStep extends AbstractCommandPipelineStep implements VariantProcessingStep +public class AnnotationStep extends AbstractCommandPipelineStep implements VariantProcessingStep { public static final String GRCH37 = "genome37"; private static final String CLINVAR_VCF = "clinvar37"; @@ -65,7 +60,7 @@ public class AnnotationStep extends AbstractCommandPipelineStep public AnnotationStep(PipelineStepProvider provider, PipelineContext ctx) { - super(provider, ctx, new CassandraRunner(ctx.getLogger())); + super(provider, ctx, new MultiSourceAnnotatorRunner(ctx.getLogger())); } public static class Provider extends AbstractVariantProcessingStepProvider implements VariantProcessingStep.SupportsScatterGather @@ -88,10 +83,6 @@ public Provider() put("valueField", "rowid"); put("allowBlank", false); }}, null), - ToolParameterDescriptor.create("useCassandra", "Use Cassandra", "If checked, Cassandra will be run.", "checkbox", new JSONObject() - {{ - put("checked", true); - }}, true), ToolParameterDescriptor.create("useFuncotator", "Use Funcotator", "If checked, Extended Funcotator will be run.", "checkbox", new JSONObject() {{ put("checked", true); @@ -242,44 +233,20 @@ public Output processVariants(File inputVCF, File outputDirectory, ReferenceGeno boolean dropGenotypes = totalSubjects > 10; boolean dropFiltered = getProvider().getParameterByName("dropFiltered").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Boolean.class); - //This flag exists to allow in-flight jobs to be reworked to include a sample. it should eventually be removed. 
- boolean forceRecreate = false; - File currentVcf = inputVCF; if (dropGenotypes || dropFiltered) { if (dropGenotypes) - getPipelineCtx().getLogger().info("dropping most genotypes prior to liftover for performance reasons. a single is retained since cassandra requires one."); + getPipelineCtx().getLogger().info("dropping genotypes prior to liftover for performance reasons."); if (dropFiltered) getPipelineCtx().getLogger().info("dropping filtered sites"); File subset = new File(outputDirectory, SequenceAnalysisService.get().getUnzippedBaseName(inputVCF.getName()) + ".subset.vcf.gz"); - //NOTE: this check exists to correct in-flight jobs created using --sites-only-vcf-output. It should eventually be removed. - if (subset.exists()) - { - try (VCFFileReader reader = new VCFFileReader(subset)) - { - if (reader.getFileHeader().getGenotypeSamples().isEmpty()) - { - getPipelineCtx().getLogger().info("A VCF appears to have been created with --sites-only. Will overwrite these using an output with a single sample for Cassandra"); - forceRecreate = true; - } - } - } - List selectArgs = new ArrayList<>(); if (dropGenotypes) { - //NOTE: Cassandra requires at least one genotype, so instead of --sites-only-vcf-output, subset to first sample only - String firstSample; - try (VCFFileReader reader = new VCFFileReader(inputVCF)) - { - firstSample = reader.getFileHeader().getGenotypeSamples().get(0); - } - - selectArgs.add("-sn"); - selectArgs.add(firstSample); + selectArgs.add("--sites-only-vcf-output"); } if (dropFiltered) @@ -297,7 +264,7 @@ public Output processVariants(File inputVCF, File outputDirectory, ReferenceGeno needToSubsetToInterval = false; } - if (forceRecreate || !indexExists(subset)) + if (!indexExists(subset)) { SelectVariantsWrapper wrapper = new SelectVariantsWrapper(getPipelineCtx().getLogger()); wrapper.execute(originalGenome.getWorkingFastaFile(), inputVCF, subset, selectArgs); @@ -332,7 +299,7 @@ public Output processVariants(File inputVCF, File 
outputDirectory, ReferenceGeno needToSubsetToInterval = false; File intervalSubset = new File(outputDirectory, SequenceAnalysisService.get().getUnzippedBaseName(inputVCF.getName()) + ".intervalSubset.vcf.gz"); - if (forceRecreate || !indexExists(intervalSubset)) + if (!indexExists(intervalSubset)) { SelectVariantsWrapper wrapper = new SelectVariantsWrapper(getPipelineCtx().getLogger()); wrapper.execute(originalGenome.getWorkingFastaFile(), inputVCF, intervalSubset, selectArgs); @@ -358,7 +325,7 @@ public Output processVariants(File inputVCF, File outputDirectory, ReferenceGeno File liftedToGRCh37 = new File(outputDirectory, SequenceAnalysisService.get().getUnzippedBaseName(currentVcf.getName()) + ".liftTo" + grch37Genome.getGenomeId() + ".vcf.gz"); File liftoverRejects = new File(outputDirectory, SequenceAnalysisService.get().getUnzippedBaseName(currentVcf.getName()) + ".liftoverReject" + grch37Genome.getGenomeId() + ".vcf.gz"); - if (forceRecreate || !indexExists(liftoverRejects) || !indexExists(liftedToGRCh37)) + if (!indexExists(liftoverRejects) || !indexExists(liftedToGRCh37)) { LiftoverVcfRunner liftoverVcfRunner = new LiftoverVcfRunner(getPipelineCtx().getLogger()); liftoverVcfRunner.doLiftover(currentVcf, chainFile, grch37Genome.getWorkingFastaFile(), liftoverRejects, liftedToGRCh37, 0.95); @@ -374,7 +341,7 @@ public Output processVariants(File inputVCF, File outputDirectory, ReferenceGeno //annotate with clinvar getPipelineCtx().getLogger().info("annotating with ClinVar 2.0"); File clinvarAnnotated = new File(outputDirectory, SequenceAnalysisService.get().getUnzippedBaseName(liftedToGRCh37.getName()) + ".cv.vcf.gz"); - if (forceRecreate || !indexExists(clinvarAnnotated)) + if (!indexExists(clinvarAnnotated)) { ClinvarAnnotatorRunner cvRunner = new ClinvarAnnotatorRunner(getPipelineCtx().getLogger()); cvRunner.execute(liftedToGRCh37, clinvarVCF, clinvarAnnotated); @@ -390,7 +357,7 @@ public Output processVariants(File inputVCF, File outputDirectory, 
ReferenceGeno //backport ClinVar getPipelineCtx().getLogger().info("backport ClinVar 2.0 to source genome"); File clinvarAnnotatedBackport = new File(outputDirectory, SequenceAnalysisService.get().getUnzippedBaseName(clinvarAnnotated.getName()) + ".bp.vcf.gz"); - if (forceRecreate || !indexExists(clinvarAnnotatedBackport )) + if (!indexExists(clinvarAnnotatedBackport )) { BackportLiftedVcfRunner bpRunner = new BackportLiftedVcfRunner(getPipelineCtx().getLogger()); bpRunner.execute(clinvarAnnotated, originalGenome.getWorkingFastaFile(), grch37Genome.getWorkingFastaFile(), clinvarAnnotatedBackport); @@ -403,49 +370,6 @@ public Output processVariants(File inputVCF, File outputDirectory, ReferenceGeno output.addIntermediateFile(clinvarAnnotatedBackport); output.addIntermediateFile(new File(clinvarAnnotatedBackport.getPath() + ".tbi")); - //annotate with cassandra - File cassandraAnnotatedBackport = null; - boolean useCassandra = getProvider().getParameterByName("useCassandra").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Boolean.class, false); - if (useCassandra) - { - getPipelineCtx().getLogger().info("annotating with Cassandra"); - String basename = SequenceAnalysisService.get().getUnzippedBaseName(liftedToGRCh37.getName()) + ".cassandra"; - File cassandraAnnotated = new File(outputDirectory, basename + ".vcf.gz"); - if (forceRecreate || !indexExists(cassandraAnnotated)) - { - //we can assume splitting happened upstream, so run over the full VCF - runCassandra(liftedToGRCh37, cassandraAnnotated, output, forceRecreate); - } - else - { - getPipelineCtx().getLogger().info("resuming with existing file: " + cassandraAnnotated.getPath()); - } - - output.addOutput(cassandraAnnotated, "VCF Annotated With Cassandra"); - output.addIntermediateFile(cassandraAnnotated); - output.addIntermediateFile(new File(cassandraAnnotated.getPath() + ".tbi")); - - //backport Cassandra - getPipelineCtx().getLogger().info("backport Cassandra to source genome"); - 
cassandraAnnotatedBackport = new File(outputDirectory, SequenceAnalysisService.get().getUnzippedBaseName(cassandraAnnotated.getName()) + ".bp.vcf.gz"); - if (forceRecreate || !indexExists(cassandraAnnotatedBackport)) - { - BackportLiftedVcfRunner bpRunner = new BackportLiftedVcfRunner(getPipelineCtx().getLogger()); - bpRunner.execute(cassandraAnnotated, originalGenome.getWorkingFastaFile(), grch37Genome.getWorkingFastaFile(), cassandraAnnotatedBackport); - } - else - { - getPipelineCtx().getLogger().info("resuming with existing file: " + cassandraAnnotatedBackport.getPath()); - } - output.addOutput(cassandraAnnotatedBackport, "VCF Annotated With Cassandra, Backported"); - output.addIntermediateFile(cassandraAnnotatedBackport); - output.addIntermediateFile(new File(cassandraAnnotatedBackport.getPath() + ".tbi")); - } - else - { - getPipelineCtx().getLogger().debug("Cassandra will be skipped"); - } - //annotate with funcotator File funcotatorAnnotatedBackport = null; if (useFuncotator) @@ -453,7 +377,7 @@ public Output processVariants(File inputVCF, File outputDirectory, ReferenceGeno getPipelineCtx().getLogger().info("annotating with Funcotator"); String basename = SequenceAnalysisService.get().getUnzippedBaseName(liftedToGRCh37.getName()) + ".funcotator"; File funcotatorAnnotated = new File(outputDirectory, basename + ".vcf.gz"); - if (forceRecreate || !indexExists(funcotatorAnnotated)) + if (!indexExists(funcotatorAnnotated)) { //we can assume splitting happened upstream, so run over the full VCF FuncotatorWrapper fr = new FuncotatorWrapper(getPipelineCtx().getLogger()); @@ -490,7 +414,7 @@ public Output processVariants(File inputVCF, File outputDirectory, ReferenceGeno //backport Funcotator getPipelineCtx().getLogger().info("backport Funcotator to source genome"); funcotatorAnnotatedBackport = new File(outputDirectory, SequenceAnalysisService.get().getUnzippedBaseName(funcotatorAnnotated.getName()) + ".bp.vcf.gz"); - if (forceRecreate || 
!indexExists(funcotatorAnnotatedBackport)) + if (!indexExists(funcotatorAnnotatedBackport)) { BackportLiftedVcfRunner bpRunner = new BackportLiftedVcfRunner(getPipelineCtx().getLogger()); bpRunner.execute(funcotatorAnnotated, originalGenome.getWorkingFastaFile(), grch37Genome.getWorkingFastaFile(), funcotatorAnnotatedBackport); @@ -511,7 +435,7 @@ public Output processVariants(File inputVCF, File outputDirectory, ReferenceGeno //multiannotator getPipelineCtx().getLogger().info("Running MultiSourceAnnotator"); File multiAnnotated = new File(getPipelineCtx().getWorkingDirectory(), SequenceAnalysisService.get().getUnzippedBaseName(inputVCF.getName()) + ".ma.vcf.gz"); - if (forceRecreate || !indexExists(multiAnnotated)) + if (!indexExists(multiAnnotated)) { MultiSourceAnnotatorRunner maRunner = new MultiSourceAnnotatorRunner(getPipelineCtx().getLogger()); @@ -535,7 +459,7 @@ public Output processVariants(File inputVCF, File outputDirectory, ReferenceGeno addToolFieldNames("Funcotator", "-ff", options, multiAnnotated.getParentFile(), output, liftFields); } - maRunner.execute(inputVCF, cassandraAnnotatedBackport, clinvarAnnotatedBackport, liftoverRejects, funcotatorAnnotatedBackport, multiAnnotated, options); + maRunner.execute(inputVCF, clinvarAnnotatedBackport, liftoverRejects, funcotatorAnnotatedBackport, multiAnnotated, options); } else { @@ -588,75 +512,6 @@ private void addToolFieldNames(String toolName, String argName, List opt options.add(fieldFile.getPath()); } - private void runCassandra(File liftedToGRCh37, File finalOutput, VariantProcessingStepOutputImpl output, boolean forceRecreate) throws PipelineJobException - { - List extraArgs = new ArrayList<>(); - - //NOTE: Cassandra will not sort the output when multithreaded, so the extra sorting we would need to do negates any benefit here - String tmpDir = SequencePipelineService.get().getJavaTempDir(); - if (!StringUtils.isEmpty(tmpDir)) - { - File tmpDirFile = new File(tmpDir, "cassandra"); - if 
(!tmpDirFile.exists()) - { - tmpDirFile.mkdirs(); - } - - extraArgs.add("--tempDir"); - extraArgs.add(tmpDirFile.getPath()); - } - - CassandraRunner cassRunner = new CassandraRunner(getPipelineCtx().getLogger()); - - Integer maxRam = SequencePipelineService.get().getMaxRam(); - cassRunner.setMaxRamOverride(maxRam); - - //Cassandra requires unzipped files - File liftedToGRCh37Unzipped = new File(liftedToGRCh37.getParentFile(), FileUtil.getBaseName(liftedToGRCh37.getName())); - File liftedToGRCh37UnzippedDone = new File(liftedToGRCh37Unzipped.getPath() + ".done"); - if (forceRecreate || !liftedToGRCh37UnzippedDone.exists()) - { - SimpleScriptWrapper wrapper = new SimpleScriptWrapper(getPipelineCtx().getLogger()); - wrapper.execute(Arrays.asList("gunzip", liftedToGRCh37.getPath())); - try - { - FileUtils.touch(liftedToGRCh37UnzippedDone); - if (!liftedToGRCh37.exists() && indexExists(liftedToGRCh37)) - { - File idx = new File(liftedToGRCh37.getPath() + ".tbi"); - idx.delete(); - } - } - catch (IOException e) - { - throw new PipelineJobException(e); - } - } - else - { - getPipelineCtx().getLogger().info("Resuming from file: " + liftedToGRCh37Unzipped.getPath()); - } - - output.addIntermediateFile(liftedToGRCh37Unzipped); - output.addIntermediateFile(new File(liftedToGRCh37Unzipped.getPath() + ".idx")); - output.addIntermediateFile(liftedToGRCh37UnzippedDone); - - cassRunner.execute(liftedToGRCh37Unzipped, finalOutput, extraArgs); - if (!finalOutput.exists()) - { - throw new PipelineJobException("Unable to find output"); - } - - try - { - SequenceAnalysisService.get().ensureVcfIndex(finalOutput, getPipelineCtx().getLogger()); - } - catch (IOException e) - { - throw new PipelineJobException(e); - } - } - protected static boolean indexExists(File vcf) { File idx = new File(vcf.getPath() + ".tbi"); diff --git a/mGAP/src/org/labkey/mgap/pipeline/MultiSourceAnnotatorRunner.java b/mGAP/src/org/labkey/mgap/pipeline/MultiSourceAnnotatorRunner.java index 23d0b388a..6a1c50939 
100644 --- a/mGAP/src/org/labkey/mgap/pipeline/MultiSourceAnnotatorRunner.java +++ b/mGAP/src/org/labkey/mgap/pipeline/MultiSourceAnnotatorRunner.java @@ -15,19 +15,13 @@ public MultiSourceAnnotatorRunner(Logger log) super(log); } - public File execute(File inputVcf, @Nullable File cassandraVcf, File clinvarAnnotatedBackport, File liftoverRejects, @Nullable File funcotator, File outputVcf, @Nullable List options) throws PipelineJobException + public File execute(File inputVcf, File clinvarAnnotatedBackport, File liftoverRejects, @Nullable File funcotator, File outputVcf, @Nullable List options) throws PipelineJobException { List args = getBaseArgs("MultiSourceAnnotator"); args.add("-V"); args.add(inputVcf.getPath()); - if (cassandraVcf != null) - { - args.add("-c"); - args.add(cassandraVcf.getPath()); - } - args.add("-lr"); args.add(liftoverRejects.getPath()); From d31a908f2950e754e65dc01fdf1cb16e219a719e Mon Sep 17 00:00:00 2001 From: bbimber Date: Fri, 1 Nov 2024 15:42:37 -0700 Subject: [PATCH 03/20] Add mGAP calculated columns --- .../mGAP/variantCatalogReleases/.qview.xml | 2 +- mGAP/resources/schemas/mgap.xml | 2 +- mGAP/src/org/labkey/mgap/mGAPModule.java | 2 - .../pipeline/AnnotateNovelSitesWrapper.java | 49 +++++ .../mGapReleaseAnnotateNovelSitesStep.java | 206 ------------------ .../org/labkey/mgap/query/mGAPUserSchema.java | 15 +- 6 files changed, 65 insertions(+), 211 deletions(-) create mode 100644 mGAP/src/org/labkey/mgap/pipeline/AnnotateNovelSitesWrapper.java delete mode 100644 mGAP/src/org/labkey/mgap/pipeline/mGapReleaseAnnotateNovelSitesStep.java diff --git a/mGAP/resources/queries/mGAP/variantCatalogReleases/.qview.xml b/mGAP/resources/queries/mGAP/variantCatalogReleases/.qview.xml index ea7526061..425728879 100644 --- a/mGAP/resources/queries/mGAP/variantCatalogReleases/.qview.xml +++ b/mGAP/resources/queries/mGAP/variantCatalogReleases/.qview.xml @@ -1,7 +1,7 @@ - + diff --git a/mGAP/resources/schemas/mgap.xml b/mGAP/resources/schemas/mgap.xml 
index fa92efe8c..090becab2 100644 --- a/mGAP/resources/schemas/mgap.xml +++ b/mGAP/resources/schemas/mgap.xml @@ -690,7 +690,7 @@ Is Primary Track For Species? - Skip Validation? + Skip Annotation Checks? true diff --git a/mGAP/src/org/labkey/mgap/mGAPModule.java b/mGAP/src/org/labkey/mgap/mGAPModule.java index 28affbe03..1b1666ceb 100644 --- a/mGAP/src/org/labkey/mgap/mGAPModule.java +++ b/mGAP/src/org/labkey/mgap/mGAPModule.java @@ -57,7 +57,6 @@ import org.labkey.mgap.pipeline.SampleSpecificGenotypeFiltrationStep; import org.labkey.mgap.pipeline.VcfComparisonStep; import org.labkey.mgap.pipeline.mGapReleaseAlleleFreqStep; -import org.labkey.mgap.pipeline.mGapReleaseAnnotateNovelSitesStep; import org.labkey.mgap.pipeline.mGapReleaseComparisonStep; import org.labkey.mgap.pipeline.mGapReleaseGenerator; import org.labkey.mgap.query.mGAPUserSchema; @@ -141,7 +140,6 @@ public PipelineStartup() SequencePipelineService.get().registerPipelineStep(new VcfComparisonStep.Provider()); SequencePipelineService.get().registerPipelineStep(new mGapReleaseComparisonStep.Provider()); SequencePipelineService.get().registerPipelineStep(new SampleSpecificGenotypeFiltrationStep.Provider()); - SequencePipelineService.get().registerPipelineStep(new mGapReleaseAnnotateNovelSitesStep.Provider()); SequencePipelineService.get().registerPipelineStep(new GenerateMgapTracksStep.Provider()); SequencePipelineService.get().registerPipelineStep(new IndexVariantsForMgapStep.Provider()); SequencePipelineService.get().registerPipelineStep(new mGapReleaseAlleleFreqStep.Provider()); diff --git a/mGAP/src/org/labkey/mgap/pipeline/AnnotateNovelSitesWrapper.java b/mGAP/src/org/labkey/mgap/pipeline/AnnotateNovelSitesWrapper.java new file mode 100644 index 000000000..3352ce2fd --- /dev/null +++ b/mGAP/src/org/labkey/mgap/pipeline/AnnotateNovelSitesWrapper.java @@ -0,0 +1,49 @@ +package org.labkey.mgap.pipeline; + +import org.apache.logging.log4j.Logger; +import org.labkey.api.pipeline.PipelineJobException; 
+import org.labkey.api.sequenceanalysis.run.AbstractDiscvrSeqWrapper; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; + +public class AnnotateNovelSitesWrapper extends AbstractDiscvrSeqWrapper +{ + public AnnotateNovelSitesWrapper(Logger log) + { + super(log); + } + + public File execute(File vcf, File referenceVcf, File fasta, String versionString, File vcfOutput, List extraArgs) throws PipelineJobException + { + List args = new ArrayList<>(getBaseArgs()); + args.add("AnnotateNovelSites"); + args.add("-R"); + args.add(fasta.getPath()); + + args.add("-V"); + args.add(vcf.getPath()); + args.add("-rv"); + args.add(referenceVcf.getPath()); + + args.add("-an"); + args.add("mGAPV"); + args.add("-ad"); + args.add("The first mGAP version where variants at this site appeared"); + args.add("-av"); + args.add(versionString); + + args.add("-O"); + args.add(vcfOutput.getPath()); + + if (extraArgs != null) + { + args.addAll(extraArgs); + } + + execute(args); + + return vcfOutput; + } +} diff --git a/mGAP/src/org/labkey/mgap/pipeline/mGapReleaseAnnotateNovelSitesStep.java b/mGAP/src/org/labkey/mgap/pipeline/mGapReleaseAnnotateNovelSitesStep.java deleted file mode 100644 index 70e30aaf1..000000000 --- a/mGAP/src/org/labkey/mgap/pipeline/mGapReleaseAnnotateNovelSitesStep.java +++ /dev/null @@ -1,206 +0,0 @@ -package org.labkey.mgap.pipeline; - -import htsjdk.samtools.util.Interval; -import org.apache.commons.lang3.math.NumberUtils; -import org.apache.logging.log4j.Logger; -import org.jetbrains.annotations.Nullable; -import org.json.JSONObject; -import org.labkey.api.data.SimpleFilter; -import org.labkey.api.data.TableSelector; -import org.labkey.api.pipeline.PipelineJob; -import org.labkey.api.pipeline.PipelineJobException; -import org.labkey.api.query.FieldKey; -import org.labkey.api.sequenceanalysis.SequenceAnalysisService; -import org.labkey.api.sequenceanalysis.SequenceOutputFile; -import 
org.labkey.api.sequenceanalysis.pipeline.AbstractVariantProcessingStepProvider; -import org.labkey.api.sequenceanalysis.pipeline.PipelineContext; -import org.labkey.api.sequenceanalysis.pipeline.PipelineStepProvider; -import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome; -import org.labkey.api.sequenceanalysis.pipeline.SequenceAnalysisJobSupport; -import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor; -import org.labkey.api.sequenceanalysis.pipeline.VariantProcessingStep; -import org.labkey.api.sequenceanalysis.pipeline.VariantProcessingStepOutputImpl; -import org.labkey.api.sequenceanalysis.run.AbstractCommandPipelineStep; -import org.labkey.api.sequenceanalysis.run.AbstractDiscvrSeqWrapper; -import org.labkey.api.util.PageFlowUtil; -import org.labkey.mgap.mGAPSchema; - -import java.io.File; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -/** - * User: bimber - * Date: 6/15/2014 - * Time: 12:39 PM - */ -public class mGapReleaseAnnotateNovelSitesStep extends AbstractCommandPipelineStep implements VariantProcessingStep -{ - public static final String VERSION_ROWID = "versionRowId"; - public static final String PRIOR_RELEASE_LABEL = "priorReleaseLabel"; - public static final String SITES_ONLY_DATA = "sitesOnlyVcfData"; - - public mGapReleaseAnnotateNovelSitesStep(PipelineStepProvider provider, PipelineContext ctx) - { - super(provider, ctx, new AnnotateNovelSitesWrapper(ctx.getLogger())); - } - - public static class Provider extends AbstractVariantProcessingStepProvider implements SupportsScatterGather - { - public Provider() - { - super("mGapAnnotateNovelSites", "Annotate Novel Sites Against mGAP Release", "AnnotateNovelSites", "Compare the VCF to the specified mGAP release VCF, producing TSV/VCF reports with site- and genotype-level concordance.", Arrays.asList( - ToolParameterDescriptor.create(VERSION_ROWID, "mGAP Release", "The mGAP release VCF to use for comparison", "ldk-simplelabkeycombo", new 
JSONObject(){{ - put("allowBlank", false); - put("width", 400); - put("schemaName", "mgap"); - put("queryName", "variantCatalogReleases"); - put("containerPath", "js:Laboratory.Utils.getQueryContainerPath()"); - put("displayField", "version"); - put("valueField", "rowid"); - put("doNotIncludeInTemplates", true); - }}, null), - ToolParameterDescriptor.create("releaseVersion", "mGAP Version", "This string will be used to tag novel variants.", "textfield", new JSONObject(){{ - put("allowBlank", false); - put("doNotIncludeInTemplates", true); - }}, null) - ), PageFlowUtil.set("sequenceanalysis/field/SequenceOutputFileSelectorField.js"), null); - } - - @Override - public mGapReleaseAnnotateNovelSitesStep create(PipelineContext ctx) - { - return new mGapReleaseAnnotateNovelSitesStep(this, ctx); - } - } - - @Override - public Output processVariants(File inputVCF, File outputDirectory, ReferenceGenome genome, @Nullable List intervals) throws PipelineJobException - { - VariantProcessingStepOutputImpl output = new VariantProcessingStepOutputImpl(); - getPipelineCtx().getLogger().info("Annotating VCF by mGAP Release"); - - String releaseVersion = getProvider().getParameterByName("releaseVersion").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), String.class, "0.0"); - if (releaseVersion.toLowerCase().startsWith("v")) - { - releaseVersion = releaseVersion.substring(1); - } - - if (!NumberUtils.isCreatable(releaseVersion)) - { - throw new IllegalArgumentException("Expected the release version to be numeric: " + releaseVersion); - } - - String priorReleaseLabel = getPipelineCtx().getSequenceSupport().getCachedObject(PRIOR_RELEASE_LABEL, String.class); - int sitesOnlyExpDataId = getPipelineCtx().getSequenceSupport().getCachedObject(SITES_ONLY_DATA, Integer.class); - File sitesOnlyVcf = getPipelineCtx().getSequenceSupport().getCachedData(sitesOnlyExpDataId); - if (!sitesOnlyVcf.exists()) - { - throw new PipelineJobException("Unable to find file: " + 
sitesOnlyVcf); - } - - List extraArgs = new ArrayList<>(); - if (intervals != null) - { - intervals.forEach(interval -> { - extraArgs.add("-L"); - extraArgs.add(interval.getContig() + ":" + interval.getStart() + "-" + interval.getEnd()); - }); - - extraArgs.add("--ignore-variants-starting-outside-interval"); - } - - extraArgs.add("-dv"); - extraArgs.add(priorReleaseLabel); - - File annotatedVCF = new File(outputDirectory, SequenceAnalysisService.get().getUnzippedBaseName(inputVCF.getName()) + ".comparison.vcf.gz"); - getWrapper().execute(inputVCF, sitesOnlyVcf, genome.getWorkingFastaFile(), releaseVersion, annotatedVCF, extraArgs); - if (!annotatedVCF.exists()) - { - throw new PipelineJobException("Unable to find output: " + annotatedVCF.getPath()); - } - - output.addInput(inputVCF, "Input VCF"); - output.addInput(sitesOnlyVcf, "Reference VCF"); - - output.addOutput(annotatedVCF, "VCF Annotated by mGAP Version"); - output.setVcf(annotatedVCF); - - return output; - } - - @Override - public void init(PipelineJob job, SequenceAnalysisJobSupport support, List inputFiles) throws PipelineJobException - { - Integer versionRowId = getProvider().getParameterByName(VERSION_ROWID).extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Integer.class); - String version = new TableSelector(mGAPSchema.getInstance().getSchema().getTable(mGAPSchema.TABLE_VARIANT_CATALOG_RELEASES), PageFlowUtil.set("version"), new SimpleFilter(FieldKey.fromString("rowId"), versionRowId), null).getObject(String.class); - if (version == null) - { - throw new PipelineJobException("Unable to find release for release: " + versionRowId); - } - - Integer referenceVcfOutputId = new TableSelector(mGAPSchema.getInstance().getSchema().getTable(mGAPSchema.TABLE_VARIANT_CATALOG_RELEASES), PageFlowUtil.set("sitesOnlyVcfId"), new SimpleFilter(FieldKey.fromString("rowId"), versionRowId), null).getObject(Integer.class); - if (referenceVcfOutputId == null) - { - 
getPipelineCtx().getLogger().debug("Sites-only VCF not found, using primary VCF"); - referenceVcfOutputId = new TableSelector(mGAPSchema.getInstance().getSchema().getTable(mGAPSchema.TABLE_VARIANT_CATALOG_RELEASES), PageFlowUtil.set("vcfId"), new SimpleFilter(FieldKey.fromString("rowId"), versionRowId), null).getObject(Integer.class); - } - - if (referenceVcfOutputId == null) - { - throw new PipelineJobException("Unable to find sites-only VCF for release: " + versionRowId); - } - - SequenceOutputFile sitesOnly = SequenceOutputFile.getForId(referenceVcfOutputId); - if (sitesOnly == null) - { - throw new PipelineJobException("Unable to find sites-only VCF output file for fileId: " + referenceVcfOutputId); - } - - support.cacheExpData(sitesOnly.getExpData()); - - support.cacheObject(SITES_ONLY_DATA, sitesOnly.getDataId()); - support.cacheObject(PRIOR_RELEASE_LABEL, version); - } - - public static class AnnotateNovelSitesWrapper extends AbstractDiscvrSeqWrapper - { - public AnnotateNovelSitesWrapper(Logger log) - { - super(log); - } - - public File execute(File vcf, File referenceVcf, File fasta, String versionString, File vcfOutput, List extraArgs) throws PipelineJobException - { - List args = new ArrayList<>(getBaseArgs()); - args.add("AnnotateNovelSites"); - args.add("-R"); - args.add(fasta.getPath()); - - args.add("-V"); - args.add(vcf.getPath()); - args.add("-rv"); - args.add(referenceVcf.getPath()); - - args.add("-an"); - args.add("mGAPV"); - args.add("-ad"); - args.add("The first mGAP version where variants at this site appeared"); - args.add("-av"); - args.add(versionString); - - args.add("-O"); - args.add(vcfOutput.getPath()); - - if (extraArgs != null) - { - args.addAll(extraArgs); - } - - execute(args); - - return vcfOutput; - } - } -} diff --git a/mGAP/src/org/labkey/mgap/query/mGAPUserSchema.java b/mGAP/src/org/labkey/mgap/query/mGAPUserSchema.java index f960fa01a..cc947f79f 100644 --- a/mGAP/src/org/labkey/mgap/query/mGAPUserSchema.java +++ 
b/mGAP/src/org/labkey/mgap/query/mGAPUserSchema.java @@ -76,7 +76,20 @@ else if (mGAPSchema.TABLE_RELEASE_TRACKS.equalsIgnoreCase(name)) private TableInfo createWrappedVariantTable(String name, TableInfo sourceTable, ContainerFilter cf) { - return super.createWrappedTable(name, sourceTable, cf); + AbstractTableInfo ati = (AbstractTableInfo)super.createWrappedTable(name, sourceTable, cf); + + String fieldName = "versionAndSpecies"; + if (ati.getColumn(fieldName) == null) + { + SQLFragment sql = new SQLFragment("(" + ati.getSqlDialect().concatenate(ExprColumn.STR_TABLE_ALIAS + ".species", "': '", ExprColumn.STR_TABLE_ALIAS + ".version") + ")"); + ExprColumn col = new ExprColumn(ati, fieldName, sql, JdbcType.VARCHAR, ati.getColumn("version"), ati.getColumn("species")); + col.setLabel("Version and Species"); + col.setFacetingBehaviorType(FacetingBehaviorType.ALWAYS_OFF); + col.setDescription("This column shows the version and species"); + ati.addColumn(col); + } + + return ati; } private TableInfo customizeReleaseTracks(String name, TableInfo sourceTable, ContainerFilter cf) From 93406cdd9e2ca7cf0491ee6bd224b31d7f379dfa Mon Sep 17 00:00:00 2001 From: bbimber Date: Sat, 2 Nov 2024 09:52:04 -0700 Subject: [PATCH 04/20] Test fix --- .../mgap/pipeline/AnnotateNovelSitesWrapper.java | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/mGAP/src/org/labkey/mgap/pipeline/AnnotateNovelSitesWrapper.java b/mGAP/src/org/labkey/mgap/pipeline/AnnotateNovelSitesWrapper.java index 3352ce2fd..97bcc2495 100644 --- a/mGAP/src/org/labkey/mgap/pipeline/AnnotateNovelSitesWrapper.java +++ b/mGAP/src/org/labkey/mgap/pipeline/AnnotateNovelSitesWrapper.java @@ -1,6 +1,7 @@ package org.labkey.mgap.pipeline; import org.apache.logging.log4j.Logger; +import org.jetbrains.annotations.Nullable; import org.labkey.api.pipeline.PipelineJobException; import org.labkey.api.sequenceanalysis.run.AbstractDiscvrSeqWrapper; @@ -15,7 +16,7 @@ public AnnotateNovelSitesWrapper(Logger 
log) super(log); } - public File execute(File vcf, File referenceVcf, File fasta, String versionString, File vcfOutput, List extraArgs) throws PipelineJobException + public File execute(File vcf, @Nullable File referenceVcf, File fasta, String versionString, File vcfOutput, List extraArgs) throws PipelineJobException { List args = new ArrayList<>(getBaseArgs()); args.add("AnnotateNovelSites"); @@ -24,8 +25,16 @@ public File execute(File vcf, File referenceVcf, File fasta, String versionStrin args.add("-V"); args.add(vcf.getPath()); - args.add("-rv"); - args.add(referenceVcf.getPath()); + + if (referenceVcf != null) + { + args.add("-rv"); + args.add(referenceVcf.getPath()); + } + else + { + args.add("--allow-missing-ref"); + } args.add("-an"); args.add("mGAPV"); From a63899e0cc5d7c214c5ae48c658821825534696c Mon Sep 17 00:00:00 2001 From: bbimber Date: Sun, 3 Nov 2024 06:58:27 -0800 Subject: [PATCH 05/20] Refactor GenerateMgapTracksStep to allow multiple species --- .../mgap/pipeline/GenerateMgapTracksStep.java | 315 +++++++++++++----- 1 file changed, 225 insertions(+), 90 deletions(-) diff --git a/mGAP/src/org/labkey/mgap/pipeline/GenerateMgapTracksStep.java b/mGAP/src/org/labkey/mgap/pipeline/GenerateMgapTracksStep.java index df573c6ec..b90606960 100644 --- a/mGAP/src/org/labkey/mgap/pipeline/GenerateMgapTracksStep.java +++ b/mGAP/src/org/labkey/mgap/pipeline/GenerateMgapTracksStep.java @@ -6,6 +6,7 @@ import htsjdk.variant.vcf.VCFFileReader; import htsjdk.variant.vcf.VCFHeader; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.math.NumberUtils; import org.apache.logging.log4j.Logger; import org.jetbrains.annotations.Nullable; import org.json.JSONObject; @@ -29,7 +30,6 @@ import org.labkey.api.sequenceanalysis.pipeline.AbstractPipelineStep; import org.labkey.api.sequenceanalysis.pipeline.AbstractVariantProcessingStepProvider; import org.labkey.api.sequenceanalysis.pipeline.PipelineContext; -import
org.labkey.api.sequenceanalysis.pipeline.PipelineStep; import org.labkey.api.sequenceanalysis.pipeline.PipelineStepProvider; import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome; import org.labkey.api.sequenceanalysis.pipeline.SequenceAnalysisJobSupport; @@ -62,6 +62,9 @@ public class GenerateMgapTracksStep extends AbstractPipelineStep implements VariantProcessingStep, VariantProcessingStep.SupportsScatterGather { public static final String TRACK_CATEGORY = "mGAP Release Track"; + public static final String VERSION_ROWID = "versionRowId"; + public static final String PRIOR_RELEASE_LABEL = "priorReleaseLabel"; + public static final String SITES_ONLY_DATA = "sitesOnlyVcfData"; // 1) makes the subset VCF per track with those IDs, // 2) dies if it cannot find any of the IDs being requested, @@ -78,11 +81,31 @@ public static class Provider extends AbstractVariantProcessingStepProvider primaryTrackNames = new HashSet<>(); + Map> trackToSubject = new HashMap<>(); + new TableSelector(existingTracks, PageFlowUtil.set("trackName", "isprimarytrack"), new SimpleFilter(FieldKey.fromString("species"), species), null).forEachResults(rs -> { + if (trackToSubject.containsKey(rs.getString(FieldKey.fromString("trackName")))) + { + throw new IllegalStateException("Duplicate track names present: " + rs.getString(FieldKey.fromString("trackName"))); + } + + trackToSubject.put(rs.getString(FieldKey.fromString("trackName")), new HashSet<>()); + + if (rs.getObject(FieldKey.fromString("isprimarytrack")) != null & rs.getBoolean(FieldKey.fromString("isprimarytrack"))) + { + primaryTrackNames.add(rs.getString(FieldKey.fromString("trackName"))); + } + }); + + if (primaryTrackNames.size() != 1) + { + throw new IllegalStateException("Expected single primary track, found: " + primaryTrackNames.size()); + } + // Verify all IDs in header are mGAP aliases. 
This map is the true ID to mGAP alias Map sampleIdToMgapAlias = getSampleToAlias(so.getFile()); // Now read track list, validate IDs present, and write to file: TableInfo ti = QueryService.get().getUserSchema(getPipelineCtx().getJob().getUser(), (getPipelineCtx().getJob().getContainer().isWorkbook() ? getPipelineCtx().getJob().getContainer().getParent() : getPipelineCtx().getJob().getContainer()), mGAPSchema.NAME).getTable(mGAPSchema.TABLE_RELEASE_TRACK_SUBSETS); - TableSelector ts = new TableSelector(ti, PageFlowUtil.set("trackName", "subjectId")); + TableSelector ts = new TableSelector(ti, PageFlowUtil.set("trackName", "subjectId"), new SimpleFilter(FieldKey.fromString("trackName"), trackToSubject.keySet(), CompareType.IN), null); Set requestedNotInVcf = new HashSet<>(); - Map> trackToSubject = new HashMap<>(); + ts.forEachResults(rs -> { if (!trackToSubject.containsKey(rs.getString(FieldKey.fromString("trackName")))) { @@ -138,6 +186,11 @@ public void init(PipelineJob job, SequenceAnalysisJobSupport support, List { writer.writeNext(new String[]{trackName, x}); }); @@ -147,67 +200,153 @@ public void init(PipelineJob job, SequenceAnalysisJobSupport support, List intervals) throws PipelineJobException + private File annotateNovelSites(File inputVCF, File outputDirectory, ReferenceGenome genome, @Nullable List intervals) throws PipelineJobException { - VariantProcessingStepOutputImpl output = new VariantProcessingStepOutputImpl(); - Map> trackToSamples = parseSampleMap(getSampleNameFile(getPipelineCtx().getSourceDirectory(true))); + String releaseVersion = getProvider().getParameterByName("releaseVersion").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), String.class, "0.0"); + if (releaseVersion.toLowerCase().startsWith("v")) + { + releaseVersion = releaseVersion.substring(1); + } - VCFHeader header; - try (VCFFileReader reader = new VCFFileReader(inputVCF)) + if (!NumberUtils.isCreatable(releaseVersion)) { - header = reader.getFileHeader(); + 
throw new IllegalArgumentException("Expected the release version to be numeric: " + releaseVersion); } - if (!header.hasInfoLine("mGAPV")) + String priorReleaseLabel = getPipelineCtx().getSequenceSupport().getCachedObject(PRIOR_RELEASE_LABEL, String.class); + File sitesOnlyVcf = getAnnotationReferenceVcf(); + + List extraArgs = new ArrayList<>(); + if (intervals != null) { - throw new IllegalStateException("VCF is missing the annotation: mGAPV"); + intervals.forEach(interval -> { + extraArgs.add("-L"); + extraArgs.add(interval.getContig() + ":" + interval.getStart() + "-" + interval.getEnd()); + }); + + extraArgs.add("--ignore-variants-starting-outside-interval"); } - processTracks(output, inputVCF, trackToSamples, outputDirectory, genome, intervals); + extraArgs.add("-dv"); + extraArgs.add(priorReleaseLabel); - // Also create the Novel Sites track: - String releaseVersion = getProvider().getParameterByName("releaseVersion").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), String.class); - File novelSitesOutput = getNovelSitesOutput(outputDirectory); - if (new File(novelSitesOutput.getPath() + ".tbi").exists()) + if (sitesOnlyVcf != null) { - getPipelineCtx().getLogger().debug("Index exists, will not remake novel sites VCF"); + extraArgs.add("-ns"); + extraArgs.add(getNovelSitesOutput(outputDirectory).getPath()); + } + + File annotatedVCF = new File(outputDirectory, SequenceAnalysisService.get().getUnzippedBaseName(inputVCF.getName()) + ".comparison.vcf.gz"); + if (new File(annotatedVCF.getPath() + ".tbi").exists()) + { + getPipelineCtx().getLogger().debug("Index exists, will not remake annotated sites VCF"); } else { - getPipelineCtx().getJob().setStatus(PipelineJob.TaskStatus.running, "Processing novel sites track"); - - SelectVariantsWrapper sv = new SelectVariantsWrapper(getPipelineCtx().getLogger()); - List svArgs = new ArrayList<>(); - svArgs.add("-select"); - svArgs.add("mGAPV == '" + releaseVersion + "'"); - if (intervals != null) + new 
AnnotateNovelSitesWrapper(getPipelineCtx().getLogger()).execute(inputVCF, sitesOnlyVcf, genome.getWorkingFastaFile(), releaseVersion, annotatedVCF, extraArgs); + if (!annotatedVCF.exists()) { - intervals.forEach(interval -> { - svArgs.add("-L"); - svArgs.add(interval.getContig() + ":" + interval.getStart() + "-" + interval.getEnd()); - }); + throw new PipelineJobException("Unable to find output: " + annotatedVCF.getPath()); } + } + + return annotatedVCF; + } + + private File getNovelSitesOutput(File outputDirectory) + { + String releaseVersion = getProvider().getParameterByName("releaseVersion").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), String.class); + String species = getProvider().getParameterByName("species").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), String.class); - sv.execute(genome.getWorkingFastaFile(), inputVCF, novelSitesOutput, svArgs); + return new File(outputDirectory, "mGAP_v" + releaseVersion + "_" + species.replaceAll(" ", "_") + "_NovelSites.vcf.gz"); + } + + @Override + public Output processVariants(File inputVCF, File outputDirectory, ReferenceGenome genome, @Nullable List intervals) throws PipelineJobException + { + VariantProcessingStepOutputImpl output = new VariantProcessingStepOutputImpl(); + Map> trackToSamples = parseSampleMap(getSampleNameFile(getPipelineCtx().getSourceDirectory(true))); + + String primaryTrackName = getPipelineCtx().getSequenceSupport().getCachedObject("primaryTrackName", String.class); + Map tracks = processTracks(output, inputVCF, trackToSamples, outputDirectory, genome, intervals); + + File primaryTrackFile = tracks.get(primaryTrackName); + if (primaryTrackFile == null) + { + throw new PipelineJobException("Missing primary track"); } - getPipelineCtx().getJob().getLogger().info("total variants: " + SequenceAnalysisService.get().getVCFLineCount(novelSitesOutput, getPipelineCtx().getJob().getLogger(), false)); + File primaryTrackAnnotated = 
annotateNovelSites(primaryTrackFile, outputDirectory, genome, intervals); + output.addIntermediateFile(primaryTrackAnnotated); + + if (getAnnotationReferenceVcf() != null) + { + File novelSitesOutput = getNovelSitesOutput(outputDirectory); + if (!novelSitesOutput.exists()) + { + throw new PipelineJobException("Missing file: " + novelSitesOutput.getPath()); + } + + getPipelineCtx().getJob().getLogger().info("total novel variants in release: " + SequenceAnalysisService.get().getVCFLineCount(novelSitesOutput, getPipelineCtx().getJob().getLogger(), false)); + } return output; } private File getOutputVcf(String trackName, File outputDirectory) { - return new File(outputDirectory, FileUtil.makeLegalName(trackName) + ".vcf.gz"); + return new File(outputDirectory, FileUtil.makeLegalName(trackName).replaceAll(" ", "_") + ".vcf.gz"); } @Override @@ -221,37 +360,33 @@ public void complete(PipelineJob job, List inputs, List newRow = new CaseInsensitiveHashMap<>(); newRow.put("trackName", trackName); newRow.put("label", trackName); + newRow.put("species", species); newRow.put("vcfId", so.getRowid()); - newRow.put("isprimarytrack", isPrimaryTrack); + newRow.put("isprimarytrack", primaryTrackName.equals(trackName)); BatchValidationException bve = new BatchValidationException(); releaseTracks.getUpdateService().insertRows(job.getUser(), targetContainer, Arrays.asList(newRow), bve, null, null); @@ -281,11 +416,6 @@ private void createOrUpdateTrack(SequenceOutputFile so, PipelineJob job, String } } - private boolean indexExists(File vcf) - { - return new File(vcf.getPath() + ".tbi").exists(); - } - private File getSampleNameFile(File outputDir) { return new File(outputDir, "sampleMapping.txt"); @@ -443,6 +573,7 @@ public void performAdditionalMergeTasks(SequenceOutputHandler.JobContext ctx, Pi return f; }).toList(); + job.getLogger().debug("Total VCFs to merge: " + toConcat.size()); if (toConcat.isEmpty()) { @@ -470,45 +601,49 @@ public void 
performAdditionalMergeTasks(SequenceOutputHandler.JobContext ctx, Pi manager.addSequenceOutput(so); } - job.getLogger().info("Merging novel sites VCF"); - List toConcat = orderedJobDirs.stream().map(dirName -> { - File f = getNovelSitesOutput(new File(ctx.getSourceDirectory(), dirName)); - if (!f.exists()) - { - throw new IllegalStateException("Missing file: " + f.getPath()); - } + if (getAnnotationReferenceVcf() != null) + { + job.getLogger().info("Merging novel sites VCF"); + List toConcat = orderedJobDirs.stream().map(dirName -> { + File f = getNovelSitesOutput(new File(ctx.getSourceDirectory(), dirName)); + if (!f.exists()) + { + throw new IllegalStateException("Missing file: " + f.getPath()); + } - ctx.getFileManager().addIntermediateFile(f); - ctx.getFileManager().addIntermediateFile(new File(f.getPath() + ".tbi")); + ctx.getFileManager().addIntermediateFile(f); + ctx.getFileManager().addIntermediateFile(new File(f.getPath() + ".tbi")); - return f; - }).toList(); + return f; + }).toList(); - if (toConcat.isEmpty()) - { - throw new PipelineJobException("No novel sites VCFs found"); - } + if (toConcat.isEmpty()) + { + throw new PipelineJobException("No novel sites VCFs found"); + } - String basename = SequenceAnalysisService.get().getUnzippedBaseName(toConcat.get(0).getName()); - File combined = new File(ctx.getSourceDirectory(), basename + ".vcf.gz"); - File combinedIdx = new File(combined.getPath() + ".tbi"); - if (combinedIdx.exists()) - { - job.getLogger().info("VCF exists, will not recreate: " + combined.getPath()); - } - else - { - combined = SequenceAnalysisService.get().combineVcfs(toConcat, combined, genome, job.getLogger(), true, null); - } + String basename = SequenceAnalysisService.get().getUnzippedBaseName(toConcat.get(0).getName()); + File combined = new File(ctx.getSourceDirectory(), basename + ".vcf.gz"); + File combinedIdx = new File(combined.getPath() + ".tbi"); + if (combinedIdx.exists()) + { + job.getLogger().info("VCF exists, will not 
recreate: " + combined.getPath()); + } + else + { + combined = SequenceAnalysisService.get().combineVcfs(toConcat, combined, genome, job.getLogger(), true, null); + } - SequenceOutputFile so = new SequenceOutputFile(); - so.setName("Novel Sites in This Release"); - so.setFile(combined); - so.setCategory(TRACK_CATEGORY); - so.setLibrary_id(genome.getGenomeId()); - String releaseVersion = getProvider().getParameterByName("releaseVersion").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), String.class); - so.setDescription("These are novel sites in mGAP v" + releaseVersion); - manager.addSequenceOutput(so); + String releaseVersion = getProvider().getParameterByName("releaseVersion").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), String.class); + String species = getProvider().getParameterByName("species").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), String.class); + SequenceOutputFile so = new SequenceOutputFile(); + so.setName(species + ": Novel Sites in Release " + releaseVersion); + so.setFile(combined); + so.setCategory(TRACK_CATEGORY); + so.setLibrary_id(genome.getGenomeId()); + so.setDescription("These are novel sites in mGAP v" + releaseVersion + " for " + species); + manager.addSequenceOutput(so); + } } public static class SplitVcfBySamplesWrapper extends AbstractDiscvrSeqWrapper From 3d119555ab690db4f08b8f36c2941947a84200a4 Mon Sep 17 00:00:00 2001 From: bbimber Date: Sun, 3 Nov 2024 11:44:36 -0800 Subject: [PATCH 06/20] Allow GenerateMgapTracksStep to have missing prior release --- .../mgap/pipeline/GenerateMgapTracksStep.java | 57 +++++++++++-------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/mGAP/src/org/labkey/mgap/pipeline/GenerateMgapTracksStep.java b/mGAP/src/org/labkey/mgap/pipeline/GenerateMgapTracksStep.java index b90606960..abaa16c43 100644 --- a/mGAP/src/org/labkey/mgap/pipeline/GenerateMgapTracksStep.java +++ 
b/mGAP/src/org/labkey/mgap/pipeline/GenerateMgapTracksStep.java @@ -81,7 +81,7 @@ public static class Provider extends AbstractVariantProcessingStepProvider Date: Sun, 3 Nov 2024 14:26:17 -0800 Subject: [PATCH 07/20] Correct typo --- mGAP/src/org/labkey/mgap/pipeline/GenerateMgapTracksStep.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mGAP/src/org/labkey/mgap/pipeline/GenerateMgapTracksStep.java b/mGAP/src/org/labkey/mgap/pipeline/GenerateMgapTracksStep.java index abaa16c43..484b9c00e 100644 --- a/mGAP/src/org/labkey/mgap/pipeline/GenerateMgapTracksStep.java +++ b/mGAP/src/org/labkey/mgap/pipeline/GenerateMgapTracksStep.java @@ -247,7 +247,7 @@ public void init(PipelineJob job, SequenceAnalysisJobSupport support, List Date: Sun, 3 Nov 2024 16:08:04 -0800 Subject: [PATCH 08/20] Correct typo --- .../src/org/labkey/mgap/pipeline/AnnotateNovelSitesWrapper.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mGAP/src/org/labkey/mgap/pipeline/AnnotateNovelSitesWrapper.java b/mGAP/src/org/labkey/mgap/pipeline/AnnotateNovelSitesWrapper.java index 97bcc2495..3fef55a99 100644 --- a/mGAP/src/org/labkey/mgap/pipeline/AnnotateNovelSitesWrapper.java +++ b/mGAP/src/org/labkey/mgap/pipeline/AnnotateNovelSitesWrapper.java @@ -26,7 +26,7 @@ public File execute(File vcf, @Nullable File referenceVcf, File fasta, String ve args.add("-V"); args.add(vcf.getPath()); - if (referenceVcf == null) + if (referenceVcf != null) { args.add("-rv"); args.add(referenceVcf.getPath()); From 778645146ecd2c8d6198a96635fe3717073fcdfb Mon Sep 17 00:00:00 2001 From: bbimber Date: Mon, 4 Nov 2024 10:16:08 -0800 Subject: [PATCH 09/20] Add case-sensitive ID check --- .../queries/mGAP/sampleSummary.query.xml | 7 ++++ mGAP/resources/queries/mGAP/sampleSummary.sql | 3 +- .../queries/mGAP/sampleSummary/.qview.xml | 5 +++ mGAP/resources/views/mgapDataDashboard.html | 7 ++++ .../mgap/query/SampleSummaryCustomizer.java | 42 +++++++++++++++++++ 5 files changed, 63 
insertions(+), 1 deletion(-) create mode 100644 mGAP/resources/queries/mGAP/sampleSummary/.qview.xml create mode 100644 mGAP/src/org/labkey/mgap/query/SampleSummaryCustomizer.java diff --git a/mGAP/resources/queries/mGAP/sampleSummary.query.xml b/mGAP/resources/queries/mGAP/sampleSummary.query.xml index 4d9e68f7c..86a7a599e 100644 --- a/mGAP/resources/queries/mGAP/sampleSummary.query.xml +++ b/mGAP/resources/queries/mGAP/sampleSummary.query.xml @@ -2,8 +2,15 @@ + subjectName mGAP Subject/gVCF Summary + + + SubjectId Listed In Alias Table + true + +
diff --git a/mGAP/resources/queries/mGAP/sampleSummary.sql b/mGAP/resources/queries/mGAP/sampleSummary.sql index 385533ddf..d45c1c4bb 100644 --- a/mGAP/resources/queries/mGAP/sampleSummary.sql +++ b/mGAP/resources/queries/mGAP/sampleSummary.sql @@ -8,7 +8,8 @@ SELECT ss.center, t.tracks, t.total, - CASE WHEN ss.originalId IS NULL OR ss.gender IS NULL or ss.species IS NULL or ss.center IS NULL THEN true ELSE false END as missingDemographics + CASE WHEN ss.originalId IS NULL OR ss.gender IS NULL or ss.species IS NULL or ss.center IS NULL THEN true ELSE false END as missingDemographics, + am.subjectname as aliasSubjectName FROM (SELECT COALESCE(o.readset.subjectId, rt.subjectId) as subjectId, diff --git a/mGAP/resources/queries/mGAP/sampleSummary/.qview.xml b/mGAP/resources/queries/mGAP/sampleSummary/.qview.xml new file mode 100644 index 000000000..48de68347 --- /dev/null +++ b/mGAP/resources/queries/mGAP/sampleSummary/.qview.xml @@ -0,0 +1,5 @@ + + + + + \ No newline at end of file diff --git a/mGAP/resources/views/mgapDataDashboard.html b/mGAP/resources/views/mgapDataDashboard.html index fc6ce214d..1b0fd9138 100644 --- a/mGAP/resources/views/mgapDataDashboard.html +++ b/mGAP/resources/views/mgapDataDashboard.html @@ -76,6 +76,13 @@ queryName: 'sampleSummary', 'query.externalAlias~isblank': '' }) + },{ + name: 'gVCFs With SubjectId / Case-sensitive Difference', + url: LABKEY.ActionURL.buildURL('query', 'executeQuery.view', null, { + schemaName: 'mgap', + queryName: 'subjectCaseMismatch', + 'query.externalAlias~isnonblank': '' + }) }] },{ header: 'Prior Releases', diff --git a/mGAP/src/org/labkey/mgap/query/SampleSummaryCustomizer.java b/mGAP/src/org/labkey/mgap/query/SampleSummaryCustomizer.java new file mode 100644 index 000000000..317773403 --- /dev/null +++ b/mGAP/src/org/labkey/mgap/query/SampleSummaryCustomizer.java @@ -0,0 +1,42 @@ +package org.labkey.mgap.query; + +import org.labkey.api.data.AbstractTableInfo; +import org.labkey.api.data.JdbcType; +import 
org.labkey.api.data.SQLFragment; +import org.labkey.api.data.TableInfo; +import org.labkey.api.gwt.client.FacetingBehaviorType; +import org.labkey.api.ldk.table.AbstractTableCustomizer; +import org.labkey.api.query.ExprColumn; + +public class SampleSummaryCustomizer extends AbstractTableCustomizer +{ + @Override + public void customize(TableInfo ti) + { + if (ti instanceof AbstractTableInfo ati) + { + customizeTable(ati); + } + } + + private void customizeTable(AbstractTableInfo ti) + { + String fieldName = "subjectCaseMismatch"; + if (ti.getColumn(fieldName) != null) + { + return; + } + + if (!ti.getSqlDialect().isSqlServer()) + { + return; + } + + SQLFragment sql = new SQLFragment("CASE WHEN HASHBYTES('sha1', " + ExprColumn.STR_TABLE_ALIAS + ".subjectId) = HASHBYTES('sha1', " + ExprColumn.STR_TABLE_ALIAS + ".aliasSubjectName) THEN NULL ELSE " + ExprColumn.STR_TABLE_ALIAS + ".aliasSubjectName END"); + ExprColumn col = new ExprColumn(ti, fieldName, sql, JdbcType.VARCHAR, ti.getColumn("subjectId"), ti.getColumn("aliasSubjectName")); + col.setLabel("Id Case Mismatch?"); + col.setFacetingBehaviorType(FacetingBehaviorType.ALWAYS_OFF); + col.setDescription("If the case of the subjectId differs from the alias table, the updated case is shown"); + ti.addColumn(col); + } +} From 67cfed2c7d3a7963a47461b72dd2a3e8666feee2 Mon Sep 17 00:00:00 2001 From: bbimber Date: Wed, 6 Nov 2024 21:13:28 -0800 Subject: [PATCH 10/20] Update MCC dashboard to account for permissions --- mcc/src/client/U24Dashboard/Dashboard.tsx | 41 +++++++++++++---------- mcc/src/org/labkey/mcc/MccModule.java | 2 ++ 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/mcc/src/client/U24Dashboard/Dashboard.tsx b/mcc/src/client/U24Dashboard/Dashboard.tsx index 7acbdd8e2..8819a0789 100644 --- a/mcc/src/client/U24Dashboard/Dashboard.tsx +++ b/mcc/src/client/U24Dashboard/Dashboard.tsx @@ -63,24 +63,29 @@ export function Dashboard() { scope: this }); - Query.selectRows({ - containerPath: 
requestContainerPath, - schemaName: 'mcc', - queryName: 'requestScores', - columns: 'requestId/status', - success: function(results) { - if (isApiSubscribed) { - setRequestRows(results.rows); - } - }, - failure: function(response) { - if (isApiSubscribed) { - alert('There was an error loading data'); - console.error(response); - } - }, - scope: this - }); + if (ctx.hasRequestReadPermission) { + Query.selectRows({ + containerPath: requestContainerPath, + schemaName: 'mcc', + queryName: 'requestScores', + columns: 'requestId/status', + success: function (results) { + if (isApiSubscribed) { + setRequestRows(results.rows); + } + }, + failure: function (response) { + if (isApiSubscribed) { + alert('There was an error loading data'); + console.error(response); + } + }, + scope: this + }); + } + else { + setRequestRows([]) + } Query.selectRows({ containerPath: containerPath, diff --git a/mcc/src/org/labkey/mcc/MccModule.java b/mcc/src/org/labkey/mcc/MccModule.java index 673b3d950..3a0c891cd 100644 --- a/mcc/src/org/labkey/mcc/MccModule.java +++ b/mcc/src/org/labkey/mcc/MccModule.java @@ -56,6 +56,7 @@ import org.labkey.mcc.security.MccRabReviewerRole; import org.labkey.mcc.security.MccRequestAdminPermission; import org.labkey.mcc.security.MccRequesterRole; +import org.labkey.mcc.security.MccViewRequestsPermission; import java.util.Collection; import java.util.Collections; @@ -109,6 +110,7 @@ public JSONObject getPageContextJson(ContainerUser context) Container requestContainer = MccManager.get().getMCCRequestContainer(context.getContainer()); ret.put("hasRequestAdminPermission", requestContainer != null && requestContainer.hasPermission(context.getUser(), MccRequestAdminPermission.class)); + ret.put("hasRequestReadPermission", requestContainer != null && requestContainer.hasPermission(context.getUser(), MccViewRequestsPermission.class)); ret.put("hasRabPermission", requestContainer != null && requestContainer.hasPermission(context.getUser(), 
MccRabReviewPermission.class)); ret.put("hasFinalDecisionPermission", requestContainer != null && requestContainer.hasPermission(context.getUser(), MccFinalReviewPermission.class)); From 27183799b1a624e4378f4ef49b9eab797973ac98 Mon Sep 17 00:00:00 2001 From: bbimber Date: Tue, 12 Nov 2024 05:28:46 -0800 Subject: [PATCH 11/20] Category field does not need to be required --- mGAP/resources/schemas/mgap.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mGAP/resources/schemas/mgap.xml b/mGAP/resources/schemas/mgap.xml index 090becab2..7eeaf7123 100644 --- a/mGAP/resources/schemas/mgap.xml +++ b/mGAP/resources/schemas/mgap.xml @@ -826,7 +826,7 @@
Category - false + true URL From cdf397fbc272c1d86123583dd5a87ad4af0009f1 Mon Sep 17 00:00:00 2001 From: bbimber Date: Tue, 12 Nov 2024 05:33:23 -0800 Subject: [PATCH 12/20] Fix URL in dashboard --- .../web/mGAP/window/ReleaseWindow.js | 35 ++++++++++++------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/mGAP/resources/web/mGAP/window/ReleaseWindow.js b/mGAP/resources/web/mGAP/window/ReleaseWindow.js index 9921c9746..51fe467b0 100644 --- a/mGAP/resources/web/mGAP/window/ReleaseWindow.js +++ b/mGAP/resources/web/mGAP/window/ReleaseWindow.js @@ -12,22 +12,28 @@ Ext4.define('mGAP.window.ReleaseWindow', { schemaName: 'mgap', queryName: 'releaseTracks', scope: this, - columns: 'vcfId,trackName,vcfId/library_id,isprimarytrack', + columns: 'vcfId,species,trackName,vcfId/library_id,isprimarytrack', failure: LDK.Utils.getErrorCallback(), success: function (results) { Ext4.Msg.hide(); var outputFiles = []; - var distinctGenomes = []; + var distinctGenomesBySpecies = {}; Ext4.Array.forEach(results.rows, function(r){ - if (r.vcfId) { - outputFiles.push(r.vcfId); + if (!r.vcfId) { + Ext4.Msg.alert('Error', 'Track lacks VCF ID: ' + r.trackName); + return false; + } - if (r['vcfId/library_id']) { - distinctGenomes.push(r['vcfId/library_id']); - } + if (!r.species) { + Ext4.Msg.alert('Error', 'Track lacks species: ' + r.trackName); + return false; } - else if (!r['isprimarytrack']) { - console.error('Track lacks VCF ID: ' + r.trackName); + + outputFiles.push(r.vcfId); + + distinctGenomesBySpecies[r.species] = distinctGenomesBySpecies[r.species] || []; + if (r['vcfId/library_id']) { + distinctGenomesBySpecies[r.species].push(r['vcfId/library_id']); } }, this); @@ -36,9 +42,12 @@ Ext4.define('mGAP.window.ReleaseWindow', { return; } - distinctGenomes = Ext4.Array.unique(distinctGenomes); - if (distinctGenomes.length !== 1){ - Ext4.Msg.alert('Error', 'All files must use the same genome. 
Genomes found: ' + distinctGenomes.length); + for (sn in Ext4.Object.getKeys(distinctGenomesBySpecies)) { + var genomes = Ext4.Array.unique(distinctGenomesBySpecies[sn]); + if (genomes.length !== 1){ + Ext4.Msg.alert('Error', 'All files must use the same genome. Genomes found for species ' + sn + ': ' + genomes.length); + return; + } } LABKEY.Ajax.request({ @@ -68,7 +77,7 @@ Ext4.define('mGAP.window.ReleaseWindow', { title: results.name, handlerConfig: results, toolParameters: results.toolParameters, - libraryId: distinctGenomes.length == 1 ? distinctGenomes[0] : null + libraryId: distinctGenomes.length === 1 ? distinctGenomes[0] : null }).show(); } } From 7b71b2f51e2894e5d9c8e29a148d88d3c6c3532f Mon Sep 17 00:00:00 2001 From: bbimber Date: Tue, 12 Nov 2024 08:24:05 -0800 Subject: [PATCH 13/20] Allow another column to be null --- mGAP/resources/schemas/mgap.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mGAP/resources/schemas/mgap.xml b/mGAP/resources/schemas/mgap.xml index 7eeaf7123..5679fefa4 100644 --- a/mGAP/resources/schemas/mgap.xml +++ b/mGAP/resources/schemas/mgap.xml @@ -668,7 +668,7 @@ Category - false + true URL From 6ea56e7125c033cd447a3cbe761a6df9ad690894 Mon Sep 17 00:00:00 2001 From: bbimber Date: Tue, 12 Nov 2024 10:24:31 -0800 Subject: [PATCH 14/20] Add additional mGAP track columns --- .../dbscripts/postgresql/mgap-16.74-16.75.sql | 5 +++++ .../dbscripts/sqlserver/mgap-16.74-16.75.sql | 5 +++++ mGAP/resources/schemas/mgap.xml | 22 +++++++++++++++++++ mGAP/src/org/labkey/mgap/mGAPModule.java | 2 +- 4 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 mGAP/resources/schemas/dbscripts/postgresql/mgap-16.74-16.75.sql create mode 100644 mGAP/resources/schemas/dbscripts/sqlserver/mgap-16.74-16.75.sql diff --git a/mGAP/resources/schemas/dbscripts/postgresql/mgap-16.74-16.75.sql b/mGAP/resources/schemas/dbscripts/postgresql/mgap-16.74-16.75.sql new file mode 100644 index 000000000..7080ea1bc --- /dev/null +++ 
b/mGAP/resources/schemas/dbscripts/postgresql/mgap-16.74-16.75.sql @@ -0,0 +1,5 @@ +ALTER TABLE mGAP.releaseTracks ADD shouldindex boolean default false; +ALTER TABLE mGAP.releaseTracks ADD vcfIndexId int; + +ALTER TABLE mGAP.tracksPerRelease ADD shouldindex boolean default false; +ALTER TABLE mGAP.tracksPerRelease ADD vcfIndexId int; \ No newline at end of file diff --git a/mGAP/resources/schemas/dbscripts/sqlserver/mgap-16.74-16.75.sql b/mGAP/resources/schemas/dbscripts/sqlserver/mgap-16.74-16.75.sql new file mode 100644 index 000000000..39631f183 --- /dev/null +++ b/mGAP/resources/schemas/dbscripts/sqlserver/mgap-16.74-16.75.sql @@ -0,0 +1,5 @@ +ALTER TABLE mGAP.releaseTracks ADD shouldindex bit default 0; +ALTER TABLE mGAP.releaseTracks ADD vcfIndexId int; + +ALTER TABLE mGAP.tracksPerRelease ADD shouldindex bit default 0; +ALTER TABLE mGAP.tracksPerRelease ADD vcfIndexId int; \ No newline at end of file diff --git a/mGAP/resources/schemas/mgap.xml b/mGAP/resources/schemas/mgap.xml index 5679fefa4..caed3fa92 100644 --- a/mGAP/resources/schemas/mgap.xml +++ b/mGAP/resources/schemas/mgap.xml @@ -692,6 +692,17 @@ Skip Annotation Checks? + + Should Include Lucene Index? + + + Lucene Index Id + + sequenceanalysis + outputfiles + rowid + + true @@ -848,6 +859,17 @@ Is Primary Track For Species? false + + Should Include Lucene Index? 
+ + + Lucene Index Id + + sequenceanalysis + outputfiles + rowid + + true diff --git a/mGAP/src/org/labkey/mgap/mGAPModule.java b/mGAP/src/org/labkey/mgap/mGAPModule.java index 1b1666ceb..36fccd989 100644 --- a/mGAP/src/org/labkey/mgap/mGAPModule.java +++ b/mGAP/src/org/labkey/mgap/mGAPModule.java @@ -76,7 +76,7 @@ public String getName() @Override public Double getSchemaVersion() { - return 16.74; + return 16.75; } @Override From 64853cf863f9806738a73dc8f627cbf54b200465 Mon Sep 17 00:00:00 2001 From: bbimber Date: Tue, 12 Nov 2024 19:48:25 -0800 Subject: [PATCH 15/20] Add action to fix existing SBT ExpDatas --- .../labkey/primeseq/PrimeseqController.java | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/primeseq/src/org/labkey/primeseq/PrimeseqController.java b/primeseq/src/org/labkey/primeseq/PrimeseqController.java index d93d71258..e5523d2b1 100644 --- a/primeseq/src/org/labkey/primeseq/PrimeseqController.java +++ b/primeseq/src/org/labkey/primeseq/PrimeseqController.java @@ -33,7 +33,10 @@ import org.labkey.api.data.ContainerType; import org.labkey.api.data.DbScope; import org.labkey.api.data.SQLFragment; +import org.labkey.api.data.SimpleFilter; import org.labkey.api.data.SqlExecutor; +import org.labkey.api.data.TableSelector; +import org.labkey.api.exp.api.ExpData; import org.labkey.api.module.Module; import org.labkey.api.module.ModuleLoader; import org.labkey.api.pipeline.PipeRoot; @@ -42,10 +45,13 @@ import org.labkey.api.pipeline.PipelineService; import org.labkey.api.pipeline.PipelineStatusFile; import org.labkey.api.pipeline.PipelineUrls; +import org.labkey.api.query.FieldKey; +import org.labkey.api.query.QueryService; import org.labkey.api.security.RequiresPermission; import org.labkey.api.security.RequiresSiteAdmin; import org.labkey.api.security.permissions.ReadPermission; import org.labkey.api.security.permissions.UpdatePermission; +import org.labkey.api.sequenceanalysis.SequenceOutputFile; import 
org.labkey.api.sequenceanalysis.pipeline.HasJobParams; import org.labkey.api.sequenceanalysis.pipeline.JobResourceSettings; import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService; @@ -795,4 +801,74 @@ public void setRestartJobs(boolean restartJobs) _restartJobs = restartJobs; } } + + @RequiresSiteAdmin + public static class FixSbtAction extends ConfirmAction + { + @Override + public ModelAndView getConfirmView(Object o, BindException errors) throws Exception + { + setTitle("Fix SBT Errors"); + + return new HtmlView(HtmlString.of("This will update filepaths on SBT outputs. Do you want to continue?")); + } + + @Override + public boolean handlePost(Object o, BindException errors) throws Exception + { + new TableSelector(QueryService.get().getUserSchema(getUser(), getContainer(), "sequenceanalysis").getTable("outputfiles"), PageFlowUtil.set("rowid"), new SimpleFilter(FieldKey.fromString("category"), "SBT Results"), null).forEachResults(rs -> { + SequenceOutputFile so = SequenceOutputFile.getForId(rs.getInt(FieldKey.fromString("rowid"))); + + File f = so.getFile(); + if (f.exists()) + { + return; + } + + File root = f.getParentFile().getParentFile(); + File [] dirs = root.listFiles(fn -> { + return fn.isDirectory() & !fn.getName().equalsIgnoreCase("Shared"); + }); + + if (dirs == null || dirs.length == 0) + { + _log.error("Unable to file directory for: " + f.getPath()); + return; + } + + File parent = new File(dirs[0], "Alignment"); + File [] children = parent.listFiles(fn -> { + return fn.getName().endsWith(".sbt_hits.txt.gz"); + }); + + if (children == null || children.length != 1) + { + _log.error("Unable to file child under: " + parent.getPath()); + return; + } + + _log.info("Found: " + children[0].getPath()); + + ExpData d = so.getExpData(); + d.setDataFileURI(children[0].toURI()); + + //d.save(getUser()); + }); + + return true; + } + + @Override + public void validateCommand(Object o, Errors errors) + { + + } + + @NotNull + @Override + public 
URLHelper getSuccessURL(Object o) + { + return PageFlowUtil.urlProvider(PipelineUrls.class).urlBegin(getContainer()); + } + } } \ No newline at end of file From 70acbc47a02dcbd1249d9e7842889feab86fe5de Mon Sep 17 00:00:00 2001 From: bbimber Date: Wed, 13 Nov 2024 10:59:20 -0800 Subject: [PATCH 16/20] Action to retroactively fix MHC filepaths --- primeseq/src/org/labkey/primeseq/PrimeseqController.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/primeseq/src/org/labkey/primeseq/PrimeseqController.java b/primeseq/src/org/labkey/primeseq/PrimeseqController.java index e5523d2b1..5bcfd42fb 100644 --- a/primeseq/src/org/labkey/primeseq/PrimeseqController.java +++ b/primeseq/src/org/labkey/primeseq/PrimeseqController.java @@ -852,7 +852,7 @@ public boolean handlePost(Object o, BindException errors) throws Exception ExpData d = so.getExpData(); d.setDataFileURI(children[0].toURI()); - //d.save(getUser()); + d.save(getUser()); }); return true; From c58ba3dd89a719f817dc3820483873042135a40c Mon Sep 17 00:00:00 2001 From: bbimber Date: Wed, 13 Nov 2024 11:10:15 -0800 Subject: [PATCH 17/20] Update mGAP release code to handle multi-species --- .../mgap/pipeline/mGapReleaseGenerator.java | 73 ++++++++++--------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/mGAP/src/org/labkey/mgap/pipeline/mGapReleaseGenerator.java b/mGAP/src/org/labkey/mgap/pipeline/mGapReleaseGenerator.java index 278960bc1..6a2ac50bd 100644 --- a/mGAP/src/org/labkey/mgap/pipeline/mGapReleaseGenerator.java +++ b/mGAP/src/org/labkey/mgap/pipeline/mGapReleaseGenerator.java @@ -96,11 +96,21 @@ public class mGapReleaseGenerator extends AbstractParameterizedOutputHandler { private final FileType _vcfType = new FileType(List.of(".vcf"), ".vcf", false, FileType.gzSupportLevel.SUPPORT_GZ); - public static final String MMUL_GENOME = "mmulGenome"; + public static final String BASE_GENOME = "baseGenome"; public mGapReleaseGenerator() { 
super(ModuleLoader.getInstance().getModule(mGAPModule.class), "Create mGAP Release", "This will prepare an input VCF for use as an mGAP public release. This will optionally include: removing excess annotations and program records, limiting to SNVs (optional) and removing genotype data (optional). If genotypes are retained, the subject names will be checked for mGAP aliases and replaced as needed.", new LinkedHashSet<>(PageFlowUtil.set("sequenceanalysis/field/GenomeFileSelectorField.js")), Arrays.asList( + ToolParameterDescriptor.create("species", "Version", "The species, which is used to filter tracks", "ldk-simplelabkeycombo", new JSONObject(){{ + put("allowBlank", false); + put("doNotIncludeInTemplates", true); + put("width", 400); + put("schemaName", "laboratory"); + put("queryName", "species"); + put("containerPath", "js:Laboratory.Utils.getQueryContainerPath()"); + put("displayField", "common_name"); + put("valueField", "common_name"); + }}, null), ToolParameterDescriptor.create("releaseVersion", "Version", "This value will be used as the version when published.", "textfield", new JSONObject(){{ put("allowBlank", false); put("doNotIncludeInTemplates", true); @@ -182,10 +192,16 @@ public void init(JobContext ctx, List inputFiles, List toSelect = new HashSet<>(); toSelect.add(FieldKey.fromString("trackName")); - toSelect.add(FieldKey.fromString("mergepriority")); + toSelect.add(FieldKey.fromString("species")); toSelect.add(FieldKey.fromString("skipvalidation")); toSelect.add(FieldKey.fromString("isprimarytrack")); toSelect.add(FieldKey.fromString("vcfId")); @@ -197,7 +213,7 @@ public void init(JobContext ctx, List inputFiles, List { + new TableSelector(releaseTracks, colMap.values(), new SimpleFilter(FieldKey.fromString("species"), species), null).forEachResults(rs -> { if (rs.getObject(FieldKey.fromString("vcfId")) == null) { throw new SQLException("No VCF found for track: " + rs.getObject(FieldKey.fromString("trackName"))); @@ -217,7 +233,7 @@ public void 
init(JobContext ctx, List inputFiles, List inputFiles, List inputFiles, List row = new CaseInsensitiveHashMap<>(); row.put("version", job.getParameters().get("releaseVersion")); row.put("releaseDate", new Date()); + row.put("species", species); row.put("vcfId", so.getRowid()); row.put("liftedVcfId", liftedVcf.getRowid()); row.put("sitesOnlyVcfId", sitesOnlyVcf.getRowid()); @@ -583,7 +602,7 @@ else if (so.getCategory().endsWith("Release Track")) //also tracks: UserSchema us = QueryService.get().getUserSchema(job.getUser(), job.getContainer().isWorkbook() ? job.getContainer().getParent() : job.getContainer(), mGAPSchema.NAME); - new TableSelector(us.getTable(mGAPSchema.TABLE_RELEASE_TRACKS), null, null).forEachResults(rs -> { + new TableSelector(us.getTable(mGAPSchema.TABLE_RELEASE_TRACKS), new SimpleFilter(FieldKey.fromString("species"), species), null).forEachResults(rs -> { SequenceOutputFile so3 = trackVCFMap.get(rs.getString(FieldKey.fromString("trackName"))); if (so3 == null && rs.getBoolean(FieldKey.fromString("isprimarytrack"))) { @@ -836,7 +855,7 @@ public static class TrackDescriptor { String _trackName; Integer _dataId; - Integer _mergePriority; + String _species; boolean _skipValidation; boolean _isPrimary; @@ -844,7 +863,7 @@ public TrackDescriptor(String[] vals) { _trackName = vals[0]; _dataId = Integer.parseInt(vals[1]); - _mergePriority = Integer.parseInt(vals[2]); + _species = vals[2]; _skipValidation = Boolean.parseBoolean(vals[3]); _isPrimary = Boolean.parseBoolean(vals[4]); } @@ -859,9 +878,9 @@ public Integer getDataId() return _dataId; } - public Integer getMergePriority() + public String getSpecies() { - return _mergePriority; + return _species; } public boolean isSkipValidation() @@ -886,15 +905,6 @@ private List getTracks(File webserverDir) throws PipelineJobExc ret.add(new TrackDescriptor(line)); } - ret.sort(new Comparator() - { - @Override - public int compare(TrackDescriptor o1, TrackDescriptor o2) - { - return 
o1.getMergePriority().compareTo(o2.getMergePriority()); - } - }); - return ret; } catch (IOException e) @@ -917,12 +927,13 @@ public void processFilesRemote(List inputFiles, JobContext c GeneToNameTranslator translator = new GeneToNameTranslator(gtf, ctx.getLogger()); ReferenceGenome grch37Genome = ctx.getSequenceSupport().getCachedGenome(ctx.getParams().getInt(AnnotationStep.GRCH37)); - int genomeId = ctx.getSequenceSupport().getCachedObject(MMUL_GENOME, Integer.class); + int genomeId = ctx.getSequenceSupport().getCachedObject(BASE_GENOME, Integer.class); ReferenceGenome genome = ctx.getSequenceSupport().getCachedGenome(genomeId); boolean testOnly = ctx.getParams().optBoolean("testOnly", false); + String species = ctx.getParams().getString("species"); String releaseVersion = ctx.getParams().optString("releaseVersion", "0.0"); - File primaryTrackVcf = new File(ctx.getOutputDir(), "mGap.v" + FileUtil.makeLegalName(releaseVersion).replaceAll(" ", "_") + ".vcf.gz"); + File primaryTrackVcf = new File(ctx.getOutputDir(), "mGap." + species + ".v" + FileUtil.makeLegalName(releaseVersion).replaceAll(" ", "_") + ".vcf.gz"); try { @@ -994,7 +1005,7 @@ public void processFilesRemote(List inputFiles, JobContext c SequenceOutputFile output = new SequenceOutputFile(); output.setFile(primaryTrackVcf); - output.setName("mGAP Release: " + releaseVersion); + output.setName("mGAP Release: " + species + " " + releaseVersion); output.setCategory((testOnly ? 
"Test " : "") + "mGAP Release"); output.setLibrary_id(genome.getGenomeId()); ctx.getFileManager().addSequenceOutput(output); @@ -1002,7 +1013,7 @@ public void processFilesRemote(List inputFiles, JobContext c File interestingVariantTable = getVariantTableName(ctx, primaryTrackVcf); SequenceOutputFile output2 = new SequenceOutputFile(); output2.setFile(interestingVariantTable); - output2.setName("mGAP Release: " + releaseVersion + " Variant Table"); + output2.setName("mGAP Release: " + species + " " + releaseVersion + " Variant Table"); output2.setCategory((testOnly ? "Test " : "") + "mGAP Release Variant Table"); output2.setLibrary_id(genome.getGenomeId()); ctx.getFileManager().addSequenceOutput(output2); @@ -1012,7 +1023,7 @@ public void processFilesRemote(List inputFiles, JobContext c File lifted = liftToHuman(ctx, primaryTrackVcf, sitesOnlyVcf, grch37Genome); SequenceOutputFile output3 = new SequenceOutputFile(); output3.setFile(lifted); - output3.setName("mGAP Release: " + releaseVersion + " Lifted to Human"); + output3.setName("mGAP Release: " + species + " " + releaseVersion + " Lifted to Human"); output3.setCategory((testOnly ? 
"Test " : "") + "mGAP Release Lifted to Human"); output3.setLibrary_id(grch37Genome.getGenomeId()); ctx.getFileManager().addSequenceOutput(output3); @@ -1111,16 +1122,6 @@ private File getSitesOnlyVcfName(File outDir, File primaryTrackVcf) return new File(outDir, SequenceAnalysisService.get().getUnzippedBaseName(primaryTrackVcf.getName()) + ".sitesOnly.vcf.gz"); } - private File getDroppedSitesVcfName(File outDir, File primaryTrackVcf) - { - return new File(outDir, SequenceAnalysisService.get().getUnzippedBaseName(primaryTrackVcf.getName()) + ".droppedFromPriorRelease.vcf.gz"); - } - - private File getNovelSitesVcfName(File outDir, File primaryTrackVcf) - { - return new File(outDir, SequenceAnalysisService.get().getUnzippedBaseName(primaryTrackVcf.getName()) + ".newToRelease.vcf.gz"); - } - private File getLiftedVcfName(File outDir, File primaryTrackVcf) { return new File(outDir, SequenceAnalysisService.get().getUnzippedBaseName(primaryTrackVcf.getName()) + ".liftToGRCh37.vcf.gz"); From c540e99d7f70e3755d92eb9037056f481f3bc4f9 Mon Sep 17 00:00:00 2001 From: bbimber Date: Fri, 15 Nov 2024 12:19:35 -0800 Subject: [PATCH 18/20] Rework action to repair improper SBT outputs --- primeseq/src/org/labkey/primeseq/PrimeseqController.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/primeseq/src/org/labkey/primeseq/PrimeseqController.java b/primeseq/src/org/labkey/primeseq/PrimeseqController.java index 5bcfd42fb..3606da7ef 100644 --- a/primeseq/src/org/labkey/primeseq/PrimeseqController.java +++ b/primeseq/src/org/labkey/primeseq/PrimeseqController.java @@ -37,6 +37,8 @@ import org.labkey.api.data.SqlExecutor; import org.labkey.api.data.TableSelector; import org.labkey.api.exp.api.ExpData; +import org.labkey.api.exp.api.ExpRun; +import org.labkey.api.exp.api.ExperimentService; import org.labkey.api.module.Module; import org.labkey.api.module.ModuleLoader; import org.labkey.api.pipeline.PipeRoot; @@ -825,7 +827,10 @@ public boolean handlePost(Object 
o, BindException errors) throws Exception return; } - File root = f.getParentFile().getParentFile(); + ExpRun run = ExperimentService.get().getExpRun(so.getRunId()); + PipelineStatusFile sf = PipelineService.get().getStatusFile(run.getJobId()); + File logFile = new File(sf.getFilePath()); + File root = logFile.getParentFile(); File [] dirs = root.listFiles(fn -> { return fn.isDirectory() & !fn.getName().equalsIgnoreCase("Shared"); }); From c2586eea0d8a47ecdfed2852e155946bca017e78 Mon Sep 17 00:00:00 2001 From: bbimber Date: Fri, 15 Nov 2024 15:46:55 -0800 Subject: [PATCH 19/20] Switch docker to conditionally mount volumes based on provider --- .../pipeline/ExacloudResourceSettings.java | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/primeseq/src/org/labkey/primeseq/pipeline/ExacloudResourceSettings.java b/primeseq/src/org/labkey/primeseq/pipeline/ExacloudResourceSettings.java index 5158b5783..e7eaf6c75 100644 --- a/primeseq/src/org/labkey/primeseq/pipeline/ExacloudResourceSettings.java +++ b/primeseq/src/org/labkey/primeseq/pipeline/ExacloudResourceSettings.java @@ -3,12 +3,17 @@ import org.json.JSONObject; import org.labkey.api.data.Container; import org.labkey.api.module.ModuleLoader; +import org.labkey.api.pipeline.PipeRoot; +import org.labkey.api.pipeline.PipelineService; import org.labkey.api.sequenceanalysis.pipeline.JobResourceSettings; import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor; import org.labkey.primeseq.PrimeseqModule; import java.util.Arrays; +import java.util.Collection; +import java.util.HashSet; import java.util.List; +import java.util.Set; /** * Created by bimber on 9/30/2016. 
@@ -43,4 +48,37 @@ public boolean isAvailable(Container c) { return c.getActiveModules().contains(ModuleLoader.getInstance().getModule(PrimeseqModule.class)); } + + @Override + public Collection getDockerVolumes(Container c) + { + Set volumes = new HashSet<>(); + volumes.add("/home/groups/prime-seq"); + volumes.add("/home/exacloud/gscratch"); + + PipeRoot pr = PipelineService.get().findPipelineRoot(c); + if (pr != null && pr.getRootPath().exists()) + { + if (pr.getRootPath().getPath().startsWith("/home/groups/")) + { + String folderName = pr.getRootPath().getPath().replaceAll("^/home/groups/", "").split("/")[0]; + volumes.add("/home/groups/" + folderName); + } + } + + if (c.isWorkbook()) + { + PipeRoot pr2 = PipelineService.get().findPipelineRoot(c.getParent()); + if (pr2 != null && pr2.getRootPath().exists()) + { + if (pr2.getRootPath().getPath().startsWith("/home/groups/")) + { + String folderName = pr2.getRootPath().getPath().replaceAll("^/home/groups/", "").split("/")[0]; + volumes.add("/home/groups/" + folderName); + } + } + } + + return volumes; + } } From 7bd9494b6d40b1f4ab73a4cd66a077f9ba81956d Mon Sep 17 00:00:00 2001 From: bbimber Date: Sat, 16 Nov 2024 09:30:36 -0800 Subject: [PATCH 20/20] Update several ETLs --- PMR/resources/etls/pmr-datasets.xml | 4 ++++ PMR/resources/etls/pmr-demographics.xml | 2 ++ mGAP/resources/etls/prime-seq.xml | 4 ++++ 3 files changed, 10 insertions(+) diff --git a/PMR/resources/etls/pmr-datasets.xml b/PMR/resources/etls/pmr-datasets.xml index 19e981726..1186b27a4 100644 --- a/PMR/resources/etls/pmr-datasets.xml +++ b/PMR/resources/etls/pmr-datasets.xml @@ -141,6 +141,8 @@ relationship method objectid + created + modified @@ -171,6 +173,8 @@ conception conceptualDay objectid + created + modified diff --git a/PMR/resources/etls/pmr-demographics.xml b/PMR/resources/etls/pmr-demographics.xml index 9c06b7b3c..03b2bf94e 100644 --- a/PMR/resources/etls/pmr-demographics.xml +++ b/PMR/resources/etls/pmr-demographics.xml @@ -17,6 +17,8 
@@ calculated_status QCState/Label objectid + created + modified diff --git a/mGAP/resources/etls/prime-seq.xml b/mGAP/resources/etls/prime-seq.xml index 458803fca..312f0d033 100644 --- a/mGAP/resources/etls/prime-seq.xml +++ b/mGAP/resources/etls/prime-seq.xml @@ -60,6 +60,9 @@ source description isprimarytrack + shouldindex + vcfIndexId/dataid/DataFileUrl + vcfIndexId/library_id/name vcfId/dataid/DataFileUrl vcfId/library_id/name @@ -67,6 +70,7 @@ +