Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
0d752f6
models: add PGS data models, #TASK-5407, #TASK-5387
jtarraga Dec 19, 2023
9b2b410
models: add protein substitution prediction (e.g., for AlphaMissense)…
jtarraga Dec 21, 2023
ea90325
models: simplified PGS data models, #TASK-5407, #TASK-5387
jtarraga Dec 22, 2023
aaa5f54
models: add PGS to variant annotation, #TASK-5411, #TASK-5387
jtarraga Dec 22, 2023
0f44e8f
models: improve PGS models for variant annotation, #TASK-5411, #TASK-…
jtarraga Jan 3, 2024
f7007bf
models: update data models for AlphaMissense and Revel, #TASK-5419, #…
jtarraga Jan 5, 2024
8babad9
formats: implement parser for file miRNA.dat, #TASK-5954, #TASK-5564
jtarraga Mar 28, 2024
4ea3ffe
Merge branch 'develop' into TASK-5387
jtarraga Apr 19, 2024
babb593
models: update PharmGKB classes according to the data model changes, …
jtarraga Apr 25, 2024
7972070
Merge branch 'TASK-5564' into TASK-5387
jtarraga May 7, 2024
68fa011
Merge branch 'TASK-5564' into TASK-5388
jtarraga May 7, 2024
4586542
Merge branch 'develop' into TASK-5564
jtarraga May 27, 2024
562e78d
Merge branch 'TASK-5564' into TASK-5387
jtarraga May 27, 2024
e082f09
Merge branch 'develop' into TASK-5564
imedina Jun 23, 2024
5103771
Merge branch 'develop' into TASK-5564
jtarraga Jul 22, 2024
858a437
Merge branch 'TASK-5564' of https://github.com/opencb/biodata into TA…
jtarraga Jul 23, 2024
938c5c4
Merge branch 'develop' into TASK-5564
jtarraga Aug 9, 2024
2161a9e
Merge branch 'TASK-5564' into TASK-5387
jtarraga Aug 13, 2024
b5be1a7
Merge branch 'release-3.x.x' into TASK-5564
jtarraga May 29, 2025
382831a
formats: add parser for UniProt 2025-02, #TASK-5576, #TASK-5564
jtarraga Jun 2, 2025
a6692c7
models: add imprinted data in gene annotation, #TASK-7745, #TASK-5564
jtarraga Jun 10, 2025
6fc7cf4
formats: add COSMIC parser for v101, #TASK-7430, #TASK-7367
jtarraga Feb 21, 2025
d4c6e7a
formats: remove additional property GENOMIC_MUTATION_ID since it is t…
jtarraga Feb 27, 2025
cada2e6
formats: fix NumberFormatException, #TASK-7430, #TASK-7367
jtarraga Mar 4, 2025
c088ae8
Merge branch 'release-3.x.x' into TASK-5564
jtarraga Jul 7, 2025
eedb2e7
Merge branch 'TASK-5564' into TASK-5387
jtarraga Jul 7, 2025
7fb8047
Merge branch 'TASK-5387' into TASK-5388
jtarraga Jul 7, 2025
cbd2a9f
Merge branch 'develop' into TASK-5564
jtarraga Jul 8, 2025
064f56a
formats: add ChimerDB parser, #TASK-7830, #TASK-5564
jtarraga Jul 24, 2025
20b2ef5
models: add ChimerDB data (gene fusion), #TASK-7830, #TASK-5564
jtarraga Jul 27, 2025
c3caccd
models: move GeneFusion to core, #TASK-7830, #TASK-5564
jtarraga Jul 28, 2025
22a8d35
models: add imprinted genes and gene fusions in variant annotation, #…
jtarraga Jul 28, 2025
c4e4e58
formats: initialize the attributes map before parsing ChimerDB, #TASK…
jtarraga Jul 29, 2025
d2ee79a
formats: add parser for files ChimerSeq.xlsx and ChimerPub.xlsx, #TAS…
jtarraga Aug 5, 2025
cc4a722
models: rename gene imprinting and gene fusions, #TASK-7745, #TASK-5564
jtarraga Aug 11, 2025
4e821ca
models: minor changes in ChimerDB data models, #TASK-7830, #TASK-5564
jtarraga Aug 11, 2025
00ea1d8
models: rename member, #TASK-7830, #TASK-5564
jtarraga Aug 11, 2025
712e737
formats: implement the CIViC parser, #TASK-7903, #TASK-5564
jtarraga Sep 5, 2025
6efe1d0
formats: improve CIViC parser, #TASK-7903, #TASK-5564
jtarraga Sep 25, 2025
46b27d5
Merge branch 'develop' into TASK-5564
jtarraga Nov 12, 2025
7ebb279
Merge branch 'develop' into TASK-5564
jtarraga Dec 19, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions biodata-formats/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@
<groupId>org.opencb.commons</groupId>
<artifactId>commons-lib</artifactId>
</dependency>
<dependency>
<groupId>org.opencb.commons</groupId>
<artifactId>commons-datastore-core</artifactId>
</dependency>

<dependency>
<groupId>jakarta.xml.bind</groupId>
Expand Down Expand Up @@ -97,6 +101,16 @@
<artifactId>jackson-mapper-asl</artifactId>
<scope>test</scope>
</dependency>-->
<!-- Apache POI for Excel files -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
</dependency>

<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
* <!--
* ~ Copyright 2015-2017 OpenCB
* ~
* ~ Licensed under the Apache License, Version 2.0 (the "License");
* ~ you may not use this file except in compliance with the License.
* ~ You may obtain a copy of the License at
* ~
* ~ http://www.apache.org/licenses/LICENSE-2.0
* ~
* ~ Unless required by applicable law or agreed to in writing, software
* ~ distributed under the License is distributed on an "AS IS" BASIS,
* ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* ~ See the License for the specific language governing permissions and
* ~ limitations under the License.
* -->
*
*/

package org.opencb.biodata.formats.feature.chimerdb;


/**
 * Callback invoked by the ChimerDB parsers once for every object they build from an input row.
 *
 * <p>The interface has a single abstract method, so it can be supplied as a lambda or a method
 * reference.
 *
 * @param <T> type of the parsed ChimerDB object handed to the callback
 */
@FunctionalInterface
public interface ChimerDbParserCallback<T> {

    /**
     * Processes one parsed ChimerDB object.
     *
     * @param object the object built by the parser for the current row
     * @return a boolean status chosen by the implementation
     *         (NOTE(review): presumably "processed successfully" - the meaning is not defined here,
     *         confirm against the callers before relying on it)
     */
    boolean processChimerDbObject(T object);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,279 @@
/*
* <!--
* ~ Copyright 2015-2017 OpenCB
* ~
* ~ Licensed under the Apache License, Version 2.0 (the "License");
* ~ you may not use this file except in compliance with the License.
* ~ You may obtain a copy of the License at
* ~
* ~ http://www.apache.org/licenses/LICENSE-2.0
* ~
* ~ Unless required by applicable law or agreed to in writing, software
* ~ distributed under the License is distributed on an "AS IS" BASIS,
* ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* ~ See the License for the specific language governing permissions and
* ~ limitations under the License.
* -->
*
*/

package org.opencb.biodata.formats.feature.chimerdb;

import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.opencb.biodata.models.core.chimerdb.ChimerKb;
import org.opencb.biodata.models.core.chimerdb.ChimerKbGeneBreakpoint;
import org.opencb.commons.utils.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Collectors;

import static org.opencb.biodata.formats.feature.chimerdb.ChimerPubParser.getIntCellValue;
import static org.opencb.biodata.formats.feature.chimerdb.ChimerPubParser.getStringCellValue;

/**
 * Parser for the ChimerKB Excel (.xlsx) file.
 *
 * <p>Reads the first sheet of the workbook, skips the header row, converts every remaining row
 * into a {@link ChimerKb} object and hands it to a {@link ChimerDbParserCallback}.
 *
 * <p>Cell values are read through the static helpers {@code getStringCellValue} and
 * {@code getIntCellValue} of {@code ChimerPubParser}. The {@code null} checks below assume those
 * helpers return {@code null} when a value is absent (NOTE(review): their implementation is not
 * part of this file - confirm).
 */
public class ChimerKbParser {

    private static Logger logger = LoggerFactory.getLogger(ChimerKbParser.class);

    /** Prefix removed (ignoring case) from chromosome names, e.g. "chr7" becomes "7". */
    private static final String CHROMOSOME_PREFIX = "chr";

    /**
     * Parses a ChimerKB .xlsx file and invokes the callback once per data row.
     *
     * <p>Row 0 is treated as the header and skipped. Rows whose ID cell (column 0) does not exist
     * are skipped with a warning instead of aborting the parse with a
     * {@link NullPointerException}. The boolean returned by the callback is ignored.
     *
     * @param xlsxPath path to the ChimerKB Excel file
     * @param callback receiver of every {@link ChimerKb} object built from a row
     * @throws IOException if the file check fails or the workbook cannot be read
     */
    public static void parse(Path xlsxPath, ChimerDbParserCallback<ChimerKb> callback) throws IOException {
        logger.info("Parsing ChimerKB file: {}", xlsxPath);
        FileUtils.checkFile(xlsxPath);

        try (FileInputStream excelFile = new FileInputStream(xlsxPath.toFile());
             Workbook workbook = new XSSFWorkbook(excelFile)) {
            // Get the first sheet from the workbook
            Sheet sheet = workbook.getSheetAt(0);

            // Iterate over rows
            Iterator<Row> rowIterator = sheet.iterator();
            while (rowIterator.hasNext()) {
                Row row = rowIterator.next();

                // Skip header row
                if (row.getRowNum() == 0) {
                    continue;
                }

                // Row.getCell returns null for an undefined cell; a row without an ID cell
                // (e.g. a formatted but empty row) cannot be turned into a ChimerKb entry
                if (row.getCell(0) == null) {
                    logger.warn("Skipping row {} of ChimerKB file {}: missing ID cell", row.getRowNum(), xlsxPath);
                    continue;
                }

                // Callback to process the gene fusion
                callback.processChimerDbObject(parseRow(row));
            }
        } catch (IOException e) {
            throw new IOException("Error reading the ChimerKB file: " + e.getMessage(), e);
        }
        logger.info("ChimerKB file parsed successfully: {}", xlsxPath);
    }

    /**
     * Builds a {@link ChimerKb} object from one data row. The caller guarantees that the ID cell
     * (column 0) exists.
     *
     * <p>Column layout:
     * <pre>
     *  0 id             1 ChimerDB_Type       2 Source             3 webSource
     *  4 Fusion_pair    5 5Gene_Junction      6 3Gene_Junction
     *  7 H_gene         8 H_chr               9 H_position        10 H_strand
     * 11 T_gene        12 T_chr              13 T_position        14 T_strand
     * 15 Genomic_breakpoint                  16 Exonic_breakpoint
     * 17 Breakpoint_Type                     18 Genome_Build_Version
     * 19 PMID          20 Disease            21 Validation        22 Frame
     * 23 Chr_info      24 Kinase             25 Oncogene          26 Tumor_suppressor
     * 27 Receptor      28 Transcription_Factor
     * 29 ChimerPub     30 ChimerSeq          31 ChimerSeq+
     * </pre>
     */
    private static ChimerKb parseRow(Row row) {
        String strValue;
        Integer intValue;

        ChimerKb chimerKb = new ChimerKb();

        // ID: stored as a number in the sheet, kept as the string form of its integer part
        chimerKb.setId(String.valueOf((int) row.getCell(0).getNumericCellValue()));

        // ChimerDB Type
        strValue = getStringCellValue(row, 1);
        if (strValue != null) {
            chimerKb.setChimerDbType(strValue);
        }

        // Source (different from source provided by user)
        strValue = getStringCellValue(row, 2);
        if (strValue != null) {
            chimerKb.setChimerSource(strValue);
        }

        // Web source
        strValue = getStringCellValue(row, 3);
        if (strValue != null) {
            chimerKb.setWebSource(strValue);
        }

        // Fusion pair
        strValue = getStringCellValue(row, 4);
        if (strValue != null) {
            chimerKb.setFusionPair(strValue);
        }

        // Gene junctions (5 and 3)
        strValue = getStringCellValue(row, 5);
        if (strValue != null) {
            chimerKb.setFiveGeneJunction(strValue);
        }
        strValue = getStringCellValue(row, 6);
        if (strValue != null) {
            chimerKb.setThreeGeneJunction(strValue);
        }

        // Head (columns 7-10) and tail (columns 11-14) gene breakpoints share the same layout
        chimerKb.setHeadGene(parseGeneBreakpoint(row, 7));
        chimerKb.setTailGene(parseGeneBreakpoint(row, 11));

        // Genomic breakpoint (1 means true)
        intValue = getIntCellValue(row, 15);
        if (intValue != null) {
            chimerKb.setGenomicBreakpoint(intValue == 1);
        }

        // Exonic breakpoint (1 means true)
        intValue = getIntCellValue(row, 16);
        if (intValue != null) {
            chimerKb.setExonicBreakpoint(intValue == 1);
        }

        // Breakpoint type
        strValue = getStringCellValue(row, 17);
        if (strValue != null) {
            chimerKb.setBreakpointType(strValue);
        }

        // Genome build version
        strValue = getStringCellValue(row, 18);
        if (strValue != null) {
            chimerKb.setGenomeBuildVersion(strValue);
        }

        // Publications: a comma-separated string, or a single positive numeric PMID
        strValue = getStringCellValue(row, 19);
        if (strValue != null) {
            chimerKb.setPmid(splitCommaSeparated(strValue));
        } else {
            intValue = getIntCellValue(row, 19);
            if (intValue != null && intValue > 0) {
                chimerKb.setPmid(Arrays.asList(String.valueOf(intValue)));
            }
        }

        // Diseases
        strValue = getStringCellValue(row, 20);
        if (strValue != null) {
            chimerKb.setDiseases(splitCommaSeparated(strValue));
        }

        // Validations
        strValue = getStringCellValue(row, 21);
        if (strValue != null) {
            chimerKb.setValidations(splitCommaSeparated(strValue));
        }

        // Frame
        strValue = getStringCellValue(row, 22);
        if (strValue != null) {
            chimerKb.setFrame(strValue);
        }

        // Chromosome info
        strValue = getStringCellValue(row, 23);
        if (strValue != null) {
            chimerKb.setChrInfo(strValue);
        }

        // The flag columns below hold a label when the flag applies; any other text means false

        // Kinase
        strValue = getStringCellValue(row, 24);
        if (strValue != null) {
            chimerKb.setKinase(strValue.equalsIgnoreCase("kinase"));
        }

        // Oncogene
        strValue = getStringCellValue(row, 25);
        if (strValue != null) {
            chimerKb.setOncogene(strValue.equalsIgnoreCase("oncogene"));
        }

        // Tumor suppressor
        strValue = getStringCellValue(row, 26);
        if (strValue != null) {
            chimerKb.setTumorSuppressor(strValue.equalsIgnoreCase("Tumor suppressor gene"));
        }

        // Receptor
        strValue = getStringCellValue(row, 27);
        if (strValue != null) {
            chimerKb.setReceptor(strValue.equalsIgnoreCase("Receptor"));
        }

        // Transcription factor
        strValue = getStringCellValue(row, 28);
        if (strValue != null) {
            chimerKb.setTranscriptionFactor(strValue.equalsIgnoreCase("Transcription factor"));
        }

        // ChimerPub
        strValue = getStringCellValue(row, 29);
        if (strValue != null) {
            chimerKb.setChimerPub(strValue.equalsIgnoreCase("Pub"));
        }

        // ChimerSeq
        strValue = getStringCellValue(row, 30);
        if (strValue != null) {
            chimerKb.setChimerSeq(strValue.equalsIgnoreCase("Seq"));
        }

        // ChimerSeq+
        strValue = getStringCellValue(row, 31);
        if (strValue != null) {
            chimerKb.setChimerSeqPlus(strValue.equalsIgnoreCase("Seq+"));
        }

        return chimerKb;
    }

    /**
     * Reads a gene breakpoint from four consecutive columns: gene name, chromosome, position and
     * strand, starting at {@code firstColumn}. Absent values leave the corresponding field unset.
     */
    private static ChimerKbGeneBreakpoint parseGeneBreakpoint(Row row, int firstColumn) {
        ChimerKbGeneBreakpoint breakpoint = new ChimerKbGeneBreakpoint();

        String geneName = getStringCellValue(row, firstColumn);
        if (geneName != null) {
            breakpoint.setGeneName(geneName);
        }

        String chromosome = getStringCellValue(row, firstColumn + 1);
        if (chromosome != null) {
            breakpoint.setChromosome(stripChromosomePrefix(chromosome));
        }

        Integer position = getIntCellValue(row, firstColumn + 2);
        if (position != null) {
            breakpoint.setPosition(position);
        }

        String strand = getStringCellValue(row, firstColumn + 3);
        if (strand != null) {
            breakpoint.setStrand(strand);
        }

        return breakpoint;
    }

    /** Removes a leading "chr" (in any letter case) from a chromosome name, if present. */
    private static String stripChromosomePrefix(String chromosome) {
        if (chromosome.regionMatches(true, 0, CHROMOSOME_PREFIX, 0, CHROMOSOME_PREFIX.length())) {
            return chromosome.substring(CHROMOSOME_PREFIX.length());
        }
        return chromosome;
    }

    /** Splits a comma-separated cell value into its trimmed items. */
    private static List<String> splitCommaSeparated(String value) {
        return Arrays.stream(value.split(",")).map(String::trim).collect(Collectors.toList());
    }
}
Loading
Loading