@@ -446,7 +446,7 @@ def load(self, processed_dir=None):
446446 target_chrom = self ._extract_chrom_from_ds (self .data_source .name ) # noqa E501
447447 result = self .session .execute (
448448 text ("""
449- SELECT variant_id , entity_id
449+ SELECT rs_id , entity_id
450450 FROM variant_masters
451451 WHERE chromosome = :chrom
452452 """ ),
@@ -599,7 +599,7 @@ def map_entrez_to_entity(entrez_list, gene_dict):
599599 # ---= Inserir Variants as Master in bulk =----
600600 variant_objects = [
601601 VariantMaster (
602- variant_id = row .rs_id ,
602+ rs_id = row .rs_id ,
603603 variant_type = row .variant_type ,
604604 omic_status_id = 1 ,
605605 chromosome = row .chromosome ,
@@ -617,7 +617,7 @@ def map_entrez_to_entity(entrez_list, gene_dict):
617617
618618 # --- Create Variant Master ID Column ---
619619 rsid_to_variant_master_id = {
620- variant .variant_id : variant .id for variant in variant_objects
620+ variant .rs_id : variant .id for variant in variant_objects
621621 }
622622 df ["variant_master_id" ] = df ["rs_id" ].map (rsid_to_variant_master_id )
623623
@@ -673,9 +673,15 @@ def map_entrez_to_entity(entrez_list, gene_dict):
673673 )
674674
675675 # --Inserir VariantLocus
676+ build = row .assembly
677+ build = build .replace ("GRCh" , "" ).split ("." )[0 ] # '38'
678+
676679 locus_records .append (
677680 VariantLocus (
678681 variant_id = row .variant_master_id ,
682+ rs_id = row .rs_id ,
683+ entity_id = row .entity_id ,
684+ build = build ,
679685 assembly_id = row .assembly_id ,
680686 chromosome = row .chromosome ,
681687 start_pos = row .start_pos ,
@@ -688,33 +694,100 @@ def map_entrez_to_entity(entrez_list, gene_dict):
688694 )
689695
690696 # - Processar placements (se existirem)
697+ # placements = getattr(row, "placements", []) or []
698+ # for p in placements:
699+ # p_acc = p.get("seq_id")
700+ # asm_id = assemblies_map.get(p_acc)
701+ # if not asm_id:
702+ # continue
703+ # p_start = p.get("start_pos")
704+ # p_end = p.get("end_pos")
705+ # if not p_start or not p_end:
706+ # continue
707+
708+ # ref = p.get("ref")
709+ # alt = p.get("alt")
710+ # if not alt or alt == ref:
711+ # continue
712+
713+ # chrom = acc2chrom.get(p_acc)
714+ # ref_json = json.dumps([str(ref)]) if ref else json.dumps([])
715+ # alt_json = json.dumps([str(alt)]) if alt else json.dumps([])
716+
717+ # build = p.get("assembly")
718+ # build = build.replace("GRCh", "").split(".")[0]
719+
720+ # placement_locus.append(
721+ # VariantLocus(
722+ # variant_id=row.variant_master_id,
723+ # rs_id=row.rs_id,
724+ # entity_id=row.entity_id,
725+ # build=build,
726+ # assembly_id=asm_id,
727+ # chromosome=chrom,
728+ # start_pos=int(p_start),
729+ # end_pos=int(p_end),
730+ # reference_allele=ref_json,
731+ # alternate_allele=alt_json,
732+ # data_source_id=self.data_source.id,
733+ # etl_package_id=self.package.id,
734+ # )
735+ # )
691736 placements = getattr (row , "placements" , []) or []
737+
738+ # Per-locus accumulator: (assembly_id, chrom, start, end, ref) -> {build, alts:set}
739+ agg = {}
740+
692741 for p in placements :
693742 p_acc = p .get ("seq_id" )
694743 asm_id = assemblies_map .get (p_acc )
695744 if not asm_id :
696745 continue
746+
697747 p_start = p .get ("start_pos" )
698748 p_end = p .get ("end_pos" )
699749 if not p_start or not p_end :
700750 continue
701751
702752 ref = p .get ("ref" )
703753 alt = p .get ("alt" )
754+ # skip an alt that is empty or equal to ref (no variation)
704755 if not alt or alt == ref :
705756 continue
706757
707758 chrom = acc2chrom .get (p_acc )
759+ if not chrom :
760+ continue
761+
762+ # build: "GRCh38.p14" -> "38"
763+ build = p .get ("assembly" ) or ""
764+ build = build .replace ("GRCh" , "" ).split ("." )[0 ]
765+
766+ key = (asm_id , chrom , int (p_start ), int (p_end ), str (ref or "" ))
767+
768+ bucket = agg .get (key )
769+ if bucket is None :
770+ bucket = {"build" : build , "alts" : set ()}
771+ agg [key ] = bucket
772+
773+ bucket ["alts" ].add (str (alt ))
774+
775+ # Now emit ONE row per locus with its alts aggregated
776+ for (asm_id , chrom , start , end , ref ), bucket in agg .items ():
708777 ref_json = json .dumps ([str (ref )]) if ref else json .dumps ([])
709- alt_json = json .dumps ([str (alt )]) if alt else json .dumps ([])
778+ # sorted so the serialized alt list is deterministic
779+ alt_json = json .dumps (sorted (bucket ["alts" ]))
710780
711781 placement_locus .append (
712782 VariantLocus (
713783 variant_id = row .variant_master_id ,
784+ rs_id = row .rs_id ,
785+ entity_id = row .entity_id ,
786+ build = bucket ["build" ],
714787 assembly_id = asm_id ,
715788 chromosome = chrom ,
716- start_pos = int ( p_start ) ,
717- end_pos = int ( p_end ) ,
789+ start_pos = start ,
790+ end_pos = end ,
718791 reference_allele = ref_json ,
719792 alternate_allele = alt_json ,
720793 data_source_id = self .data_source .id ,
@@ -759,6 +832,9 @@ def map_entrez_to_entity(entrez_list, gene_dict):
759832 for loc in all_loci :
760833 key = (
761834 loc .variant_id ,
835+ loc .rs_id ,
836+ loc .entity_id ,
837+ loc .build ,
762838 loc .assembly_id ,
763839 loc .chromosome ,
764840 loc .start_pos ,
@@ -773,6 +849,8 @@ def map_entrez_to_entity(entrez_list, gene_dict):
773849 unique_loci .append (loc )
774850 self .session .bulk_save_objects (unique_loci )
775851
852+ # TODO: chromosome Y contains data from chromosome X
853+
776854 # manda para a DB
777855 self .session .commit ()
778856
0 commit comments