Skip to content

Commit efdfe2e

Browse files
committed
News DTP and Schema
1 parent e2f2f8c commit efdfe2e

19 files changed

Lines changed: 2344 additions & 1815 deletions

biofilter.db-shm

-32 KB
Binary file not shown.

biofilter.db-wal

Whitespace-only changes.

biofilter/db/models/__init__.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from .model_variants import (
2929
VariantMaster,
3030
VariantLocus,
31+
VariantGWAS,
3132
# VariantLiftedPosition,
3233
# VariantMergeLog,
3334
)
@@ -41,6 +42,10 @@
4142
)
4243
from .model_go import GOMaster, GORelation
4344

45+
from .model_diseases import DiseaseGroup, DiseaseGroupMembership, DiseaseMaster
46+
47+
from .model_chemicals import ChemicalMaster, ChemicalData
48+
4449
__all__ = [
4550
# # CONFIGURATION MODELS
4651
"SystemConfig",
@@ -73,6 +78,7 @@
7378
# VARIANTS MODELS
7479
"VariantMaster",
7580
"VariantLocus",
81+
"VariantGWAS",
7682
# "VariantLiftedPosition",
7783
# "VariantMergeLog",
7884
# PATHWAY MODELS
@@ -85,4 +91,11 @@
8591
# GENE ONTOLOGY MODELS
8692
"GOMaster",
8793
"GORelation",
94+
# DISEASE MODELS
95+
"DiseaseGroup",
96+
"DiseaseGroupMembership",
97+
"DiseaseMaster",
98+
# CHEMICAL MODELS
99+
"ChemicalMaster",
100+
"ChemicalData",
88101
]
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
from biofilter.db.base import Base
2+
from sqlalchemy.orm import relationship
3+
from sqlalchemy import Column, Integer, String, ForeignKey, Text, Float, Boolean
4+
5+
6+
class ChemicalMaster(Base):
    """
    ORM model for the ``chemical_masters`` table: one row per chemical.

    A chemical is identified by its ChEBI accession (``chemical_id``,
    e.g. ``CHEBI:1234``) and must reference a row in ``entities`` through
    ``entity_id``. ``name``, ``ascii_name`` and ``definition`` carry the
    human-readable text. Provenance is recorded through the optional
    ``data_source_id`` / ``etl_package_id`` foreign keys.

    Relationships declared on this class:
        - omic_status: the referenced ``OmicStatus`` row
        - entity: the referenced ``Entity`` row
        - data_source / etl_package: provenance rows
        - chemical_data: the single ``ChemicalData`` row for this chemical

    NOTE(review): no alias relationship is declared here; synonyms and
    cross-references are presumably reached through the Entity - confirm.
    """

    __tablename__ = "chemical_masters"

    # Surrogate primary key.
    id = Column(Integer, autoincrement=True, primary_key=True)

    # ChEBI accession string (e.g. "CHEBI:1234"); required and unique.
    # Not to be confused with ChemicalData.chemical_id, which is the integer
    # foreign key pointing back at ``id`` above.
    chemical_id = Column(String(50), unique=True, index=True, nullable=False)

    # Canonical label.
    name = Column(String(255), nullable=True)

    # Optional free-text definition (from ChEBI).
    definition = Column(Text, nullable=True)

    # ASCII-only variant of ``name``.
    ascii_name = Column(String(255), nullable=True)

    # Status (active/obsolete/merged); the FK is nulled when the status row
    # is deleted.
    omic_status_id = Column(
        Integer, ForeignKey("omic_status.id", ondelete="SET NULL"), nullable=True
    )
    omic_status = relationship("OmicStatus")

    # Mandatory link to the central Entity table.
    entity_id = Column(
        Integer,
        ForeignKey("entities.id", ondelete="CASCADE"),
        nullable=False,
    )
    entity = relationship("Entity", passive_deletes=True)

    # Provenance: originating data source.
    data_source_id = Column(
        Integer, ForeignKey("etl_data_sources.id", ondelete="CASCADE"), nullable=True
    )
    data_source = relationship("ETLDataSource", passive_deletes=True)

    # Provenance: ETL package that loaded the row.
    etl_package_id = Column(
        Integer, ForeignKey("etl_packages.id", ondelete="CASCADE"), nullable=True
    )
    etl_package = relationship("ETLPackage", passive_deletes=True)

    # One-to-one with ChemicalData (scalar attribute because uselist=False).
    chemical_data = relationship(
        "ChemicalData",
        uselist=False,
        back_populates="chemical",
        cascade="all, delete-orphan",
    )
70+
71+
72+
class ChemicalData(Base):
    """
    ORM model for the ``chemical_data`` table: physical and chemical
    properties of a compound.

    Each row belongs to exactly one ``ChemicalMaster`` (``chemical_id`` is a
    required, unique foreign key). According to the original note, the values
    are derived from ``chemical_data.tsv``.
    """

    __tablename__ = "chemical_data"

    # Surrogate primary key.
    id = Column(Integer, autoincrement=True, primary_key=True)

    # Integer FK to chemical_masters.id (not the ChEBI accession string);
    # unique=True is what makes the relation one-to-one.
    chemical_id = Column(
        Integer,
        ForeignKey("chemical_masters.id", ondelete="CASCADE"),
        unique=True,
        nullable=False,
    )
    chemical = relationship("ChemicalMaster", back_populates="chemical_data")

    # Formula string (e.g. C6H12O6).
    formula = Column(String(100), nullable=True)

    # Net charge.
    charge = Column(Integer, nullable=True)

    # Average molecular mass.
    mass = Column(Float, nullable=True)

    # Monoisotopic mass.
    monoisotopic_mass = Column(Float, nullable=True)

    # Optional structure reference. Stored as a plain integer: no foreign-key
    # constraint is declared yet (intended to point at structures.tsv later).
    structure_id = Column(Integer, nullable=True)

    # Flag for autogenerated records.
    is_autogenerated = Column(Boolean, nullable=True)

    # Provenance: originating data source.
    data_source_id = Column(
        Integer, ForeignKey("etl_data_sources.id", ondelete="CASCADE"), nullable=True
    )
    data_source = relationship("ETLDataSource", passive_deletes=True)

    # Provenance: ETL package that loaded the row.
    etl_package_id = Column(
        Integer, ForeignKey("etl_packages.id", ondelete="CASCADE"), nullable=True
    )
    etl_package = relationship("ETLPackage", passive_deletes=True)
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
from biofilter.db.base import Base
2+
from sqlalchemy.orm import relationship
3+
from sqlalchemy import Column, Integer, String, ForeignKey, Text
4+
5+
6+
class DiseaseGroup(Base):
    """
    ORM model for the ``disease_groups`` table: named disease subsets (tags).

    Example group names: rare, gard_rare, nord_rare, otar.
    Diseases are attached to a group through ``DiseaseGroupMembership`` rows.
    """

    __tablename__ = "disease_groups"

    # Surrogate primary key.
    id = Column(Integer, autoincrement=True, primary_key=True)

    # Group name; required and unique.
    name = Column(String(100), nullable=False, unique=True)

    # Optional short description of the group.
    description = Column(String(255), nullable=True)

    # Provenance: originating data source.
    data_source_id = Column(
        Integer, ForeignKey("etl_data_sources.id", ondelete="CASCADE"), nullable=True
    )
    data_source = relationship("ETLDataSource", passive_deletes=True)

    # Provenance: ETL package that loaded the row.
    etl_package_id = Column(
        Integer, ForeignKey("etl_packages.id", ondelete="CASCADE"), nullable=True
    )
    etl_package = relationship("ETLPackage", passive_deletes=True)

    # Membership rows for this group; the counterpart attribute on
    # DiseaseGroupMembership is ``group``.
    memberships = relationship(
        "DiseaseGroupMembership",
        back_populates="group",
        cascade="all, delete-orphan",
    )
35+
36+
37+
class DiseaseGroupMembership(Base):
    """
    ORM model for the ``disease_group_memberships`` table: the link between
    ``DiseaseMaster`` and ``DiseaseGroup``.

    A disease may belong to several groups and a group may contain many
    diseases; each row records one (disease, group) pairing together with
    its provenance.

    NOTE(review): ``disease_id`` and ``group_id`` are nullable and no
    uniqueness is declared on the pair, so incomplete or duplicate links are
    not rejected by the schema - confirm whether that is intended.
    """

    __tablename__ = "disease_group_memberships"

    # Surrogate primary key.
    id = Column(Integer, autoincrement=True, primary_key=True)

    # Disease side of the link (integer FK to disease_masters.id).
    disease_id = Column(
        Integer,
        ForeignKey("disease_masters.id", ondelete="CASCADE"),
    )
    disease = relationship("DiseaseMaster", back_populates="group_memberships")

    # Group side of the link (integer FK to disease_groups.id).
    group_id = Column(
        Integer,
        ForeignKey("disease_groups.id", ondelete="CASCADE"),
    )
    group = relationship("DiseaseGroup", back_populates="memberships")

    # Provenance: originating data source.
    data_source_id = Column(
        Integer, ForeignKey("etl_data_sources.id", ondelete="CASCADE"), nullable=True
    )
    data_source = relationship("ETLDataSource", passive_deletes=True)

    # Provenance: ETL package that loaded the row.
    etl_package_id = Column(
        Integer, ForeignKey("etl_packages.id", ondelete="CASCADE"), nullable=True
    )
    etl_package = relationship("ETLPackage", passive_deletes=True)
65+
66+
67+
class DiseaseMaster(Base):
    """
    ORM model for the ``disease_masters`` table: one row per disease.

    A disease is identified by ``disease_id`` (a MONDO ID is the preferred
    primary identifier) and must reference a row in ``entities`` through
    ``entity_id``. ``label`` and ``description`` hold the human-readable
    text. Provenance is recorded through the optional ``data_source_id`` /
    ``etl_package_id`` foreign keys.

    Relationships declared on this class:
        - omic_status: the referenced ``OmicStatus`` row
        - entity: the referenced ``Entity`` row
        - data_source / etl_package: provenance rows
        - group_memberships: ``DiseaseGroupMembership`` rows linking this
          disease to its ``DiseaseGroup`` entries
    """

    __tablename__ = "disease_masters"

    # Surrogate primary key.
    id = Column(Integer, autoincrement=True, primary_key=True)

    # External disease identifier string; required and unique.
    disease_id = Column(String(50), unique=True, index=True, nullable=False)

    # Short human-readable label.
    label = Column(String(255), nullable=True)

    # Longer free-text description or definition.
    description = Column(Text, nullable=True)

    # Omic status (like in GeneMaster); the FK is nulled when the status row
    # is deleted.
    omic_status_id = Column(
        Integer, ForeignKey("omic_status.id", ondelete="SET NULL"), nullable=True
    )
    omic_status = relationship("OmicStatus")

    # Mandatory link to the central Entity table.
    entity_id = Column(
        Integer,
        ForeignKey("entities.id", ondelete="CASCADE"),
        nullable=False,
    )
    entity = relationship("Entity", passive_deletes=True)

    # Provenance: originating data source.
    data_source_id = Column(
        Integer, ForeignKey("etl_data_sources.id", ondelete="CASCADE"), nullable=True
    )
    data_source = relationship("ETLDataSource", passive_deletes=True)

    # Provenance: ETL package that loaded the row.
    etl_package_id = Column(
        Integer, ForeignKey("etl_packages.id", ondelete="CASCADE"), nullable=True
    )
    etl_package = relationship("ETLPackage", passive_deletes=True)

    # Membership rows for this disease; the counterpart attribute on
    # DiseaseGroupMembership is ``disease``.
    group_memberships = relationship(
        "DiseaseGroupMembership",
        back_populates="disease",
        cascade="all, delete-orphan",
    )

biofilter/db/models/model_variants.py

Lines changed: 77 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,11 @@
22
Column,
33
Integer,
44
Numeric,
5-
String,
5+
# String,
66
ForeignKey,
7+
String,
8+
Float,
9+
Text,
710
# UniqueConstraint,
811
# Index,
912
# CheckConstraint, # noqa E501
@@ -156,6 +159,79 @@ class VariantLocus(Base):
156159
# )
157160

158161

162+
class VariantGWAS(Base):
    """
    ORM model for the ``variant_gwas`` table: a flat, denormalised copy of
    GWAS Catalog associations.

    Each row holds the raw and mapped data of one association from the GWAS
    Catalog, joined with the EFO trait mapping file. This allows queries on
    variants, studies and traits before full Entity integration exists.

    Future work: link the variant, trait and study identifiers
    (``snp_id``, ``mapped_trait_id``, ``pubmed_id``) to Entities; at the
    moment they are plain strings without foreign keys.
    """

    __tablename__ = "variant_gwas"

    # Surrogate primary key.
    id = Column(Integer, autoincrement=True, primary_key=True)

    # --- Publication / study info -----------------------------------------
    pubmed_id = Column(String(50), nullable=True, index=True)
    # Further publication metadata is currently disabled:
    # first_author = Column(String(255), nullable=True)
    # publication_date = Column(String(50), nullable=True)  # raw string for now
    # journal = Column(String(255), nullable=True)
    # study_title = Column(Text, nullable=True)
    # link = Column(String(500), nullable=True)

    # --- Trait / phenotype mapping ----------------------------------------
    # Source "DISEASE/TRAIT" field, as reported.
    raw_trait = Column(String(255), nullable=True)
    # EFO term the trait was mapped to.
    mapped_trait = Column(String(255), nullable=True)
    # EFO/MONDO ID of the mapped trait.
    mapped_trait_id = Column(String(100), nullable=True)
    # Parent term of the mapped trait.
    parent_trait = Column(String(255), nullable=True)
    # Parent URI ID.
    parent_trait_id = Column(String(100), nullable=True)

    # --- Variant info -----------------------------------------------------
    chr_id = Column(String(10), nullable=True)
    chr_pos = Column(Integer, nullable=True)
    reported_gene = Column(String(255), nullable=True)
    mapped_gene = Column(String(255), nullable=True)
    # dbSNP ID (rsID).
    snp_id = Column(String(50), nullable=True, index=True)
    # TODO(review): open question from the author - what is the origin of
    # this value?
    snp_risk_allele = Column(String(50), nullable=True)
    risk_allele_frequency = Column(Float, nullable=True)
    context = Column(String(100), nullable=True)
    intergenic = Column(String(10), nullable=True)

    # --- Statistics -------------------------------------------------------
    p_value = Column(Float, nullable=True)
    pvalue_mlog = Column(Float, nullable=True)
    # Stored as text, not as a number.
    odds_ratio_beta = Column(String(50), nullable=True)
    # Confidence interval, raw text.
    ci_text = Column(String(100), nullable=True)

    # --- Sample sizes (free text) -----------------------------------------
    initial_sample_size = Column(Text, nullable=True)
    replication_sample_size = Column(Text, nullable=True)

    # --- Platform info ----------------------------------------------------
    platform = Column(String(255), nullable=True)
    cnv = Column(String(10), nullable=True)

    # --- Notes ------------------------------------------------------------
    notes = Column(Text, nullable=True)

    # --- Provenance -------------------------------------------------------
    data_source_id = Column(
        Integer, ForeignKey("etl_data_sources.id", ondelete="CASCADE"), nullable=True
    )
    data_source = relationship("ETLDataSource", passive_deletes=True)

    etl_package_id = Column(
        Integer, ForeignKey("etl_packages.id", ondelete="CASCADE"), nullable=True
    )
    etl_package = relationship("ETLPackage", passive_deletes=True)
233+
234+
159235
# --- Liftover cache/audit (for derived mappings or missing placements) ------
160236
# class VariantLiftedPosition(Base):
161237
# """

0 commit comments

Comments
 (0)