RitchieLab
diff --git a/‎biofilter.db-shm‎
32 KB b/‎biofilter.db-shm‎
32 KB
diff --git a/‎biofilter.db-wal‎ b/‎biofilter.db-wal‎
diff --git a/‎biofilter/db/models/model_entities.py‎
Lines changed: 24 additions & 0 deletions b/‎biofilter/db/models/model_entities.py‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎biofilter/db/models/model_proteins.py‎
Lines changed: 1 addition & 1 deletion b/‎biofilter/db/models/model_proteins.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎biofilter/db/models/model_variants.py‎
Lines changed: 3 additions & 0 deletions b/‎biofilter/db/models/model_variants.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎biofilter/etl/dtps/dtp_reactome.py‎
Lines changed: 97 additions & 112 deletions b/‎biofilter/etl/dtps/dtp_reactome.py‎
Lines changed: 97 additions & 112 deletions
@@ -268,6 +268,18 @@ class EntityRelationship(Base):
         back_populates="relationships_as_1",  # noqa E501
     )  # noqa E501
 
+    entity_1_group_id = Column(
+        Integer,
+        ForeignKey("entity_groups.id", ondelete="CASCADE"),
+        nullable=True,
+        index=True,
+    )
+    entity_1_group = relationship(
+        "EntityGroup",
+        foreign_keys=[entity_1_group_id],
+        passive_deletes=True
+    )
+
     entity_2_id = Column(
         Integer,
         ForeignKey("entities.id", ondelete="CASCADE"),
@@ -279,6 +291,18 @@ class EntityRelationship(Base):
         back_populates="relationships_as_2",  # noqa E501
     )  # noqa E501
 
+    entity_2_group_id = Column(
+        Integer,
+        ForeignKey("entity_groups.id", ondelete="CASCADE"),
+        nullable=True,
+        index=True,
+    )
+    entity_2_group = relationship(
+        "EntityGroup",
+        foreign_keys=[entity_2_group_id],
+        passive_deletes=True
+    )
+
     relationship_type_id = Column(
         Integer,
         ForeignKey("entity_relationship_types.id", ondelete="CASCADE"),
 
@@ -105,7 +105,7 @@ class ProteinEntity(Base):
     protein_id = Column(
         Integer, ForeignKey("protein_masters.id"), nullable=False
     )  # noqa E501
-    protein_master = relationship("ProteinMaster", back_populates="protein_entity")
+    protein_master = relationship("ProteinMaster", back_populates="protein_entity")   # noqa E501
 
     is_isoform = Column(Boolean, default=False, nullable=False)
     isoform_accession = Column(String(20), nullable=True)
 
@@ -121,6 +121,9 @@ class VariantLocus(Base):
     start_pos = Column(Integer, nullable=False)
     end_pos = Column(Integer, nullable=False)  # SNP: end_pos == start_pos
 
+    reference_allele = Column(String(100), nullable=True)
+    alternate_allele = Column(String(100), nullable=True)
+
     data_source_id = Column(
         Integer,
         ForeignKey("etl_data_sources.id", ondelete="CASCADE"),
 
@@ -5,7 +5,6 @@
 from biofilter.utils.file_hash import compute_file_hash
 from biofilter.etl.mixins.entity_query_mixin import EntityQueryMixin
 from biofilter.db.models import (
-    EntityGroup,
     PathwayMaster,
 )  # noqa E501
 from biofilter.etl.mixins.base_dtp import DTPBase
@@ -15,27 +14,29 @@ class DTP(DTPBase, EntityQueryMixin):
     def __init__(
         self,
         logger=None,
+        debug_mode=False,
         datasource=None,
-        etl_process=None,
+        package=None,
         session=None,
         use_conflict_csv=False,
     ):  # noqa: E501
         self.logger = logger
+        self.debug_mode = debug_mode
         self.data_source = datasource
-        self.etl_process = etl_process
+        self.package = package
         self.session = session
         self.use_conflict_csv = use_conflict_csv
 
         # DTP versioning
         self.dtp_name = "dtp_reactome"
-        self.dtp_version = "1.0.0"
-        self.compatible_schema_min = "3.0.0"
+        self.dtp_version = "1.1.0"
+        self.compatible_schema_min = "3.1.0"
         self.compatible_schema_max = "4.0.0"
 
     # ⬇️  --------------------------  ⬇️
     # ⬇️  ------ EXTRACT FASE ------  ⬇️
     # ⬇️  --------------------------  ⬇️
-    def extract(self, raw_dir: str, force_steps: bool):
+    def extract(self, raw_dir: str):
         """
         Downloads Reactome data. Uses the hash of 'ReactomePathways.txt' as
         reference. Only proceeds with full extraction if the hash has changed.
@@ -59,12 +60,6 @@ def extract(self, raw_dir: str, force_steps: bool):
             "Ensembl2Reactome.txt",
             "UniProt2Reactome.txt",
         ]
-        if force_steps:
-            last_hash = ""
-            msg = "Ignoring hash check."
-            self.logger.log(msg, "WARNING")
-        else:
-            last_hash = self.etl_process.raw_data_hash
 
         try:
             # Landing directory
@@ -86,10 +81,6 @@ def extract(self, raw_dir: str, force_steps: bool):
 
             # Step 2: Compute hash and compare
             current_hash = compute_file_hash(file_path)
-            if current_hash == last_hash:
-                msg = f"No change detected in {main_file}"  # noqa: E501
-                self.logger.log(msg, "INFO")
-                return False, msg, current_hash  # Skip further downloads
 
             # Step 3: Download the remaining files
             for file_name in files_to_download:
@@ -163,9 +154,11 @@ def transform(self, raw_dir: str, processed_dir: str):
             output_file_master = output_path / "master_data"
 
             # Save filtered pathways
-            df_pathways.to_csv(
-                output_file_master.with_suffix(".csv"), index=False
-            )  # noqa: E501
+            if self.debug_mode:
+                df_pathways.to_csv(
+                    output_file_master.with_suffix(".csv"), index=False
+                )  # noqa: E501
+
             df_pathways.to_parquet(
                 output_file_master.with_suffix(".parquet"), index=False
             )
@@ -311,9 +304,11 @@ def transform(self, raw_dir: str, processed_dir: str):
             output_file_relationship = output_path / "relationship_data"
 
             # Save relationship pathways
-            df_relations.to_csv(
-                output_file_relationship.with_suffix(".csv"), index=False
-            )  # noqa: E501
+            if self.debug_mode:
+                df_relations.to_csv(
+                    output_file_relationship.with_suffix(".csv"), index=False
+                )  # noqa: E501
+
             df_relations.to_parquet(
                 output_file_relationship.with_suffix(".parquet"), index=False
             )
@@ -324,17 +319,17 @@ def transform(self, raw_dir: str, processed_dir: str):
             )  # noqa: E501
 
             msg = f"✅ Finished transforming {self.data_source.name} data."
-            return None, True, msg
+            return True, f"{len(df_pathways)} pathways processed"
 
         except Exception as e:
-            msg = f"❌ ETL transform failed: {str(e)}"
+            msg = f"❌ Transform failed: {str(e)}"
             self.logger.log(msg, "ERROR")
-            return None, False, msg
+            return False, msg
 
     # 📥  ------------------------ 📥
     # 📥  ------ LOAD FASE ------  📥
     # 📥  ------------------------ 📥
-    def load(self, df=None, processed_dir=None, chunk_size=100_000):
+    def load(self, processed_dir=None):
 
         msg = f"📥 Loading {self.data_source.name} data into the database..."
         self.logger.log(
@@ -347,91 +342,66 @@ def load(self, df=None, processed_dir=None, chunk_size=100_000):
 
         total_pathways = 0
         total_warnings = 0
-        load_status = False
 
-        data_source_id = self.data_source.id
+        # ALIASES MAP FROM PROCESS DATA FIELDS
+        self.alias_schema = {
+            "reactome_id": ("code", "Reactome", True),
+            "pathway_name": ("name", "Reactome", None),
+        }
 
-        # Set DB and drop indexes
+        # READ PROCESSED DATA TO LOAD
         try:
-            index_specs = [
-                ("pathway_masters", ["entity_id"]),
-                # Já possui index=True + unique=True, mas bom explicitar
-                ("pathway_masters", ["pathway_id"]),
-                ("pathway_masters", ["data_source_id"]),
-            ]
-
-            index_specs_entity = [
-                # Entity
-                ("entities", ["group_id"]),
-                ("entities", ["has_conflict"]),
-                ("entities", ["is_deactive"]),
-                # EntityName
-                ("entity_names", ["entity_id"]),
-                ("entity_names", ["name"]),
-                ("entity_names", ["data_source_id"]),
-                ("entity_names", ["data_source_id", "name"]),
-                ("entity_names", ["data_source_id", "entity_id"]),
-                ("entity_names", ["entity_id", "is_primary"]),
-                # EntityRelationship
-                ("entity_relationships", ["entity_1_id"]),
-                ("entity_relationships", ["entity_2_id"]),
-                ("entity_relationships", ["relationship_type_id"]),
-                ("entity_relationships", ["data_source_id"]),
-                (
-                    "entity_relationships",
-                    ["entity_1_id", "relationship_type_id"],
-                ),  # noqa E501
-                (
-                    "entity_relationships",
-                    ["entity_1_id", "entity_2_id", "relationship_type_id"],
-                ),  # noqa E501
-                # EntityRelationshipType
-                ("entity_relationship_types", ["code"]),
-            ]
+            # Check if processed dir was set
+            if not processed_dir:
+                msg = "⚠️  processed_dir MUST be provided."
+                self.logger.log(msg, "ERROR")
+                return False, msg  # ⧮ Leaving with ERROR
 
-            self.db_write_mode()
-            # self.drop_indexes(index_specs) # Keep indices to improve checks
-        except Exception as e:
-            total_warnings += 1
-            msg = f"⚠️ Failed to switch DB to write mode or drop indexes: {e}"
-            self.logger.log(msg, "WARNING")
+            processed_path = os.path.join(
+                processed_dir,
+                self.data_source.source_system.name,
+                self.data_source.name,
+            )
+            processed_file_name = processed_path + "/master_data.parquet"
 
-        if df is None:
-            if not processed_dir:
-                msg = "Either 'df' or 'processed_path' must be provided."
+            if not os.path.exists(processed_file_name):
+                msg = f"⚠️  File not found: {processed_file_name}"
                 self.logger.log(msg, "ERROR")
-                return total_pathways, load_status, msg
+                return False, msg  # ⧮ Leaving with ERROR
 
-            processed_path = self.get_path(processed_dir)
-            processed_data = str(processed_path / "master_data.parquet")
+            df = pd.read_parquet(processed_file_name, engine="pyarrow")
 
-            if not os.path.exists(processed_data):
-                msg = f"File not found: {processed_data}"
+            if df.empty:
+                msg = "DataFrame is empty."
                 self.logger.log(msg, "ERROR")
-                return total_pathways, load_status, msg
+                return False, msg
 
-            self.logger.log(
-                f"📥 Reading data in chunks from {processed_data}", "INFO"
-            )  # noqa E501
+            df.fillna("", inplace=True)
 
-            df = pd.read_parquet(processed_data, engine="pyarrow")
+        except Exception as e:
+            msg = f"⚠️  Failed to try read data: {e}"
+            self.logger.log(msg, "ERROR")
+            return False, msg  # ⧮ Leaving with ERROR
 
-        # Get Entity Group ID
-        if not hasattr(self, "entity_group") or self.entity_group is None:
-            group = (
-                self.session.query(EntityGroup)
-                .filter_by(name="Pathways")
-                .first()  # noqa: E501
-            )  # noqa: E501
-            if not group:
-                msg = "EntityGroup 'Pathways' not found in the database."
-                self.logger.log(msg, "ERROR")
-                return total_pathways, load_status
-                # raise ValueError(msg)
-            self.entity_group = group.id
-            msg = f"EntityGroup ID for 'Pathways' is {self.entity_group}"
-            self.logger.log(msg, "DEBUG")
+        # Set DB and drop indexes
+        try:
+            self.db_write_mode()
+            self.drop_indexes(self.get_pathway_index_specs)
+            self.drop_indexes(self.get_entity_index_specs)
+        except Exception as e:
+            total_warnings += 1
+            msg = f"⚠️  Failed to switch DB to write mode or drop indexes: {e}"
+            self.logger.log(msg, "WARNING")
+            return False, msg  # ⧮ Leaving with ERROR
+
+        # GET ENTITY GROUP ID AND OMICS STATUS
+        try:
+            self.get_entity_group("Pathways")
+        except Exception as e:
+            msg = f"Error on DTP to get Entity Group: {e}"
+            return False, msg  # ⧮ Leaving with ERROR
 
+        # RUN LOAD BY ROW
         try:
             # Interaction to each Reactome Pathway
             for _, row in df.iterrows():
@@ -444,16 +414,37 @@ def load(self, df=None, processed_dir=None, chunk_size=100_000):
                     self.logger.log(msg, "WARNING")
                     continue
 
+                # --- ALIASES STRUCTURE ---
+                # Create a dict of Aliases
+                alias_dict = self.build_alias(row)
+                # Only primary Name
+                is_primary_alias = next(
+                    (a for a in alias_dict if a.get("is_primary")), None
+                )
+                # Only Aliases Names
+                not_primary_alias = [
+                    a for a in alias_dict if a != is_primary_alias
+                ]  # noqa E501
+
                 # Add or Get Entity
                 entity_id, _ = self.get_or_create_entity(
-                    name=pathway_master,
+                    name=is_primary_alias["alias_value"],
                     group_id=self.entity_group,
                     data_source_id=self.data_source.id,
+                    package_id=self.package.id,
+                    alias_type=is_primary_alias["alias_type"],
+                    xref_source=is_primary_alias["xref_source"],
+                    alias_norm=is_primary_alias["alias_norm"],
+                    is_active=True,
                 )
 
-                # Add or Get Entity Name
                 self.get_or_create_entity_name(
-                    entity_id, pathway_name, data_source_id=self.data_source.id
+                    group_id=self.entity_group,
+                    entity_id=entity_id,
+                    aliases=not_primary_alias,
+                    is_active=True,
+                    data_source_id=self.data_source.id,  # noqa E501
+                    package_id=self.package.id,
                 )
 
                 # Check if the pathway already exists
@@ -471,7 +462,7 @@ def load(self, df=None, processed_dir=None, chunk_size=100_000):
                         entity_id=entity_id,
                         pathway_id=pathway_master,
                         description=pathway_name,
-                        data_source_id=data_source_id,
+                        data_source_id=self.data_source.id,
                     )
 
                     self.session.add(pathway)
@@ -482,29 +473,23 @@ def load(self, df=None, processed_dir=None, chunk_size=100_000):
         except Exception as e:
             msg = f"❌ ETL load_relations failed: {str(e)}"
             self.logger.log(msg, "ERROR")
-            return 0, False, msg
+            return False, msg
 
         # Set DB to Read Mode and Create Index
         try:
-            # Drop Indexs
-            self.drop_indexes(index_specs)
-            self.drop_indexes(index_specs_entity)
-            # Stating Indexs
-            self.create_indexes(index_specs)
-            self.create_indexes(index_specs_entity)
+            self.create_indexes(self.get_pathway_index_specs)
+            self.create_indexes(self.get_entity_index_specs)
             self.db_read_mode()
         except Exception as e:
             total_warnings += 1
             msg = f"Failed to switch DB to write mode or drop indexes: {e}"
             self.logger.log(msg, "WARNING")
 
-        load_status = True
-
         if total_warnings != 0:
             msg = f"{total_warnings} warning to analysis in log file"
             self.logger.log(msg, "WARNING")
 
         msg = f"📥 Total Pathways: {total_pathways}"  # noqa E501  # noqa E501
         self.logger.log(msg, "INFO")
 
-        return total_pathways, True, msg
+        return True, msg