Skip to content

Commit 2efb09f

Browse files
committed
Loads logs
1 parent 0dc539e commit 2efb09f

12 files changed

Lines changed: 989 additions & 14 deletions

biofilter/etl/dtps/dtp_variant_ncbi.py

Lines changed: 84 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -446,7 +446,7 @@ def load(self, processed_dir=None):
446446
target_chrom = self._extract_chrom_from_ds(self.data_source.name) # noqa E501
447447
result = self.session.execute(
448448
text("""
449-
SELECT variant_id, entity_id
449+
SELECT rs_id, entity_id
450450
FROM variant_masters
451451
WHERE chromosome = :chrom
452452
"""),
@@ -599,7 +599,7 @@ def map_entrez_to_entity(entrez_list, gene_dict):
599599
# ---= Inserir Variants as Master in bulk =----
600600
variant_objects = [
601601
VariantMaster(
602-
variant_id=row.rs_id,
602+
rs_id=row.rs_id,
603603
variant_type=row.variant_type,
604604
omic_status_id=1,
605605
chromosome=row.chromosome,
@@ -617,7 +617,7 @@ def map_entrez_to_entity(entrez_list, gene_dict):
617617

618618
# --- Create Variant Master ID Column ---
619619
rsid_to_variant_master_id = {
620-
variant.variant_id: variant.id for variant in variant_objects
620+
variant.rs_id: variant.id for variant in variant_objects
621621
}
622622
df["variant_master_id"] = df["rs_id"].map(rsid_to_variant_master_id)
623623

@@ -673,9 +673,15 @@ def map_entrez_to_entity(entrez_list, gene_dict):
673673
)
674674

675675
# --Inserir VariantLocus
676+
build = row.assembly
677+
build = build.replace("GRCh", "").split(".")[0] # '38'
678+
676679
locus_records.append(
677680
VariantLocus(
678681
variant_id=row.variant_master_id,
682+
rs_id=row.rs_id,
683+
entity_id=row.entity_id,
684+
build=build,
679685
assembly_id=row.assembly_id,
680686
chromosome=row.chromosome,
681687
start_pos=row.start_pos,
@@ -688,33 +694,100 @@ def map_entrez_to_entity(entrez_list, gene_dict):
688694
)
689695

690696
# - Processar placements (se existirem)
697+
# placements = getattr(row, "placements", []) or []
698+
# for p in placements:
699+
# p_acc = p.get("seq_id")
700+
# asm_id = assemblies_map.get(p_acc)
701+
# if not asm_id:
702+
# continue
703+
# p_start = p.get("start_pos")
704+
# p_end = p.get("end_pos")
705+
# if not p_start or not p_end:
706+
# continue
707+
708+
# ref = p.get("ref")
709+
# alt = p.get("alt")
710+
# if not alt or alt == ref:
711+
# continue
712+
713+
# chrom = acc2chrom.get(p_acc)
714+
# ref_json = json.dumps([str(ref)]) if ref else json.dumps([])
715+
# alt_json = json.dumps([str(alt)]) if alt else json.dumps([])
716+
717+
# build = p.get("assembly")
718+
# build = build.replace("GRCh", "").split(".")[0]
719+
720+
# placement_locus.append(
721+
# VariantLocus(
722+
# variant_id=row.variant_master_id,
723+
# rs_id=row.rs_id,
724+
# entity_id=row.entity_id,
725+
# build=build,
726+
# assembly_id=asm_id,
727+
# chromosome=chrom,
728+
# start_pos=int(p_start),
729+
# end_pos=int(p_end),
730+
# reference_allele=ref_json,
731+
# alternate_allele=alt_json,
732+
# data_source_id=self.data_source.id,
733+
# etl_package_id=self.package.id,
734+
# )
735+
# )
691736
placements = getattr(row, "placements", []) or []
737+
738+
# Acumulador por locus: (assembly_id, chrom, start, end, ref) -> {build, alts:set}
739+
agg = {}
740+
692741
for p in placements:
693742
p_acc = p.get("seq_id")
694743
asm_id = assemblies_map.get(p_acc)
695744
if not asm_id:
696745
continue
746+
697747
p_start = p.get("start_pos")
698748
p_end = p.get("end_pos")
699749
if not p_start or not p_end:
700750
continue
701751

702752
ref = p.get("ref")
703753
alt = p.get("alt")
754+
# ignorar alt vazio ou igual ao ref (sem variação)
704755
if not alt or alt == ref:
705756
continue
706757

707758
chrom = acc2chrom.get(p_acc)
759+
if not chrom:
760+
continue
761+
762+
# build: "GRCh38.p14" -> "38"
763+
build = p.get("assembly") or ""
764+
build = build.replace("GRCh", "").split(".")[0]
765+
766+
key = (asm_id, chrom, int(p_start), int(p_end), str(ref or ""))
767+
768+
bucket = agg.get(key)
769+
if bucket is None:
770+
bucket = {"build": build, "alts": set()}
771+
agg[key] = bucket
772+
773+
bucket["alts"].add(str(alt))
774+
775+
# Agora, gerar UMA linha por locus com alts agregados
776+
for (asm_id, chrom, start, end, ref), bucket in agg.items():
708777
ref_json = json.dumps([str(ref)]) if ref else json.dumps([])
709-
alt_json = json.dumps([str(alt)]) if alt else json.dumps([])
778+
# ordena para estabilidade determinística
779+
alt_json = json.dumps(sorted(bucket["alts"]))
710780

711781
placement_locus.append(
712782
VariantLocus(
713783
variant_id=row.variant_master_id,
784+
rs_id=row.rs_id,
785+
entity_id=row.entity_id,
786+
build=bucket["build"],
714787
assembly_id=asm_id,
715788
chromosome=chrom,
716-
start_pos=int(p_start),
717-
end_pos=int(p_end),
789+
start_pos=start,
790+
end_pos=end,
718791
reference_allele=ref_json,
719792
alternate_allele=alt_json,
720793
data_source_id=self.data_source.id,
@@ -759,6 +832,9 @@ def map_entrez_to_entity(entrez_list, gene_dict):
759832
for loc in all_loci:
760833
key = (
761834
loc.variant_id,
835+
loc.rs_id,
836+
loc.entity_id,
837+
loc.build,
762838
loc.assembly_id,
763839
loc.chromosome,
764840
loc.start_pos,
@@ -773,6 +849,8 @@ def map_entrez_to_entity(entrez_list, gene_dict):
773849
unique_loci.append(loc)
774850
self.session.bulk_save_objects(unique_loci)
775851

852+
# TODO: no chromossomo Y temos dados do X
853+
776854
# manda para a DB
777855
self.session.commit()
778856

roteiro

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
full process
2+
y - DONE
3+
x - DONE
4+
22 - DONE
5+
21 - DONE
6+
20 - DONE
7+
19 - DONE
8+
18 - DONE
9+
17 - DONE
10+
16 - DONE
11+
15 - DONE
12+
14 - DONE
13+
13 - DONE
14+
12 - DONE
15+
11 - DONE
16+
10 - DONE
17+
09 - Loading
18+
08 - Loading
19+
07 - extracting
20+
06 - wait to transform
21+
05 - wait to load
22+
04 - Loading
23+
03 - wait to transform
24+
02 - Loading
25+
01 - DONE

scripts/Run_ETLManager.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
# Variants
2929
# --------
3030
# "dbsnp_sample",
31-
"dbsnp_chr1",
31+
# "dbsnp_chr1",
3232
# "dbsnp_chr2",
3333
# "dbsnp_chr3",
3434
# "dbsnp_chr4",
@@ -51,7 +51,7 @@
5151
# "dbsnp_chr21",
5252
# "dbsnp_chr22",
5353
# "dbsnp_chrx",
54-
# "dbsnp_chry",
54+
"dbsnp_chry",
5555
# "dbsnp_chrmt",
5656
# "gwas",
5757
#

scripts/carga_02_load.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
from biofilter import Biofilter
2+
3+
# db_uri = "sqlite:///biofilter.db"
4+
db_uri = "postgresql+psycopg2://bioadmin:bioadmin@localhost/biofilter"
5+
6+
# Configure below
7+
data_sources_to_process = [
8+
# Genes
9+
# -----
10+
# "hgnc",
11+
# "gene_ncbi",
12+
# "ensembl",
13+
# "gene_ncbi",
14+
#
15+
# Proteins
16+
# --------
17+
# "pfam",
18+
# "uniprot",
19+
#
20+
# Pathways
21+
# --------
22+
# "reactome",
23+
# "kegg_pathways",
24+
#
25+
# Gene Ontology
26+
# -------------
27+
# "gene_ontology",
28+
#
29+
# Variants
30+
# --------
31+
# "dbsnp_sample",
32+
# "dbsnp_chr1",
33+
"dbsnp_chr2",
34+
# "dbsnp_chr3",
35+
# "dbsnp_chr4",
36+
# "dbsnp_chr5",
37+
# "dbsnp_chr6",
38+
# "dbsnp_chr7",
39+
# "dbsnp_chr8",
40+
# "dbsnp_chr9",
41+
# "dbsnp_chr10",
42+
# "dbsnp_chr11",
43+
# "dbsnp_chr12",
44+
# "dbsnp_chr13",
45+
# "dbsnp_chr14",
46+
# "dbsnp_chr15",
47+
# "dbsnp_chr16",
48+
# "dbsnp_chr17",
49+
# "dbsnp_chr18",
50+
# "dbsnp_chr19",
51+
# "dbsnp_chr21",
52+
# "dbsnp_chr21",
53+
# "dbsnp_chr22",
54+
# "dbsnp_chrx",
55+
# "dbsnp_chry",
56+
# "dbsnp_chrmt",
57+
# "gwas",
58+
#
59+
# RelationShips
60+
# -------------
61+
# "reactome_relationships",
62+
# "uniprot_relationships",
63+
#
64+
# DISEASE
65+
# -------
66+
# "mondo",
67+
# "mondo_relationships",
68+
#
69+
# CHEMICAL
70+
# --------
71+
# "chebi",
72+
]
73+
74+
run_steps = [
75+
# "extract",
76+
# "transform",
77+
"load",
78+
# "all"
79+
] # noqa E501
80+
81+
if __name__ == "__main__":
82+
bf = Biofilter(db_uri, debug_mode=True)
83+
# bf = Biofilter(db_uri)
84+
85+
for source in data_sources_to_process:
86+
for step in run_steps:
87+
if step != "all":
88+
try:
89+
print(f"▶ Running ETL - Source: {source} | Step: {step}")
90+
bf.update(
91+
data_sources=[source],
92+
run_steps=[step],
93+
force_steps=[step],
94+
)
95+
except Exception as e:
96+
print(f"❌ Error processing {source} [{step}]: {e}")
97+
elif step == "all":
98+
try:
99+
print(f"▶ Running ETL - Source: {source} | Step: {step}")
100+
bf.update(
101+
data_sources=[source],
102+
# run_steps=[step],
103+
# force_steps=[step],
104+
)
105+
except Exception as e:
106+
print(f"❌ Error processing {source} [{step}]: {e}")
107+
108+
print("✅ All ETL tasks finished.")
109+
print("------------------------------")

0 commit comments

Comments
 (0)