diff --git a/api/models/gaia.py b/api/models/gaia.py
index ec27635..2a2346c 100644
--- a/api/models/gaia.py
+++ b/api/models/gaia.py
@@ -60,3 +60,26 @@ class Figures(db.Model):
     img_name: db.Mapped[str] = db.mapped_column(db.String(64), nullable=False)
     caption: db.Mapped[str] = db.mapped_column(db.Text, nullable=True)
     img_url: db.Mapped[str] = db.mapped_column(db.String(256), nullable=True)
+
+
+class AuthorList(db.Model):
+    """One author name of a publication (a row of the gaia ``author_list`` table)."""
+
+    __bind_key__ = "gaia"
+    __tablename__ = "author_list"
+
+    id: db.Mapped[int] = db.mapped_column(db.Integer, nullable=False, primary_key=True)
+    publication_figures_id: db.Mapped[int] = db.mapped_column(
+        ForeignKey("publication_figures.id", ondelete="CASCADE"), nullable=False
+    )
+    author: db.Mapped[str] = db.mapped_column(db.String(128), nullable=False)
+
+
+class FigureModels(db.Model):
+    """One JSON document per OCR word: keys ``word``, ``gene`` and ``image`` (imageName/bbox entries)."""
+
+    __bind_key__ = "gaia"
+    __tablename__ = "figure_models"
+
+    id: db.Mapped[int] = db.mapped_column(db.Integer, nullable=False, primary_key=True)
+    data: db.Mapped[dict] = db.mapped_column(db.JSON, nullable=True)
diff --git a/api/resources/gaia.py b/api/resources/gaia.py
index 8c7818e..7fcaa5d 100644
--- a/api/resources/gaia.py
+++ b/api/resources/gaia.py
@@ -3,10 +3,12 @@
 from markupsafe import escape
 from api import db
 from api.utils.bar_utils import BARUtils
-from api.models.gaia import Genes, Aliases, PubIds, Figures
-from sqlalchemy import func, or_
+from api.models.gaia import Genes, Aliases, PublicationFigures, PubIds, Figures, AuthorList, FigureModels
+from sqlalchemy import func, or_, cast, literal
+from sqlalchemy.dialects import mysql
 from marshmallow import Schema, ValidationError, fields as marshmallow_fields
 import json
+import re
 
 gaia = Namespace("Gaia", description="Gaia", path="/gaia")
 
@@ -176,3 +176,180 @@ def post(self):
 
         # Return final data
         return BARUtils.success_exit(data)
+
+
+@gaia.route("/publication_figures_by_gene/<string:identifier>")
+class GaiaPublicationFiguresByGene(Resource):
+    @gaia.param("identifier", _in="path", default="ABI3")
+    def get(self, identifier=""):
+        """Return publication figures whose OCR words match a gene, grouped by PMC id.
+
+        The identifier is resolved to gene ids (alias first, then locus / NCBI gene id).
+        Every alias of those genes is matched against the OCR words in ``figure_models``:
+        aliases of 4+ characters by a word-boundary regex, shorter ones by exact match.
+
+        :param identifier: gene alias, locus or NCBI gene id taken from the URL path
+        :return: 200 with ``{"figures": {...}, "allImageWords": {...}}`` (both empty when
+            nothing matches), 400 for an invalid identifier, 404 for an unknown gene
+        """
+        # Escape input
+        identifier = escape(identifier)
+
+        # Is it valid
+        if not BARUtils.is_gaia_alias(identifier):
+            return BARUtils.error_exit("Invalid identifier"), 400
+
+        # Resolve to gene ids: try alias first, then locus / ncbi id
+        rows = db.session.execute(db.select(Aliases.genes_id).filter(Aliases.alias == identifier)).fetchall()
+        gene_ids = [r.genes_id for r in rows]
+
+        if not gene_ids:
+            rows = db.session.execute(
+                db.select(Genes.id).filter(or_(Genes.locus == identifier, Genes.geneid == identifier))
+            ).fetchall()
+            gene_ids = [r.id for r in rows]
+
+        if not gene_ids:
+            return BARUtils.error_exit("Nothing found"), 404
+
+        # Get the gene's full alias set
+        aliases = [
+            r.alias.lower()
+            for r in db.session.execute(db.select(Aliases.alias).filter(Aliases.genes_id.in_(gene_ids))).fetchall()
+        ]
+
+        # Match OCR words: word-boundary regex for long aliases, exact match for short ones
+        long_aliases = sorted({re.escape(a) for a in aliases if len(a) >= 4})
+        short_aliases = sorted({a for a in aliases if len(a) < 4})
+
+        # No usable aliases, nothing to match on
+        if not long_aliases and not short_aliases:
+            return BARUtils.success_exit({"figures": {}, "allImageWords": {}})
+
+        word_expr = func.lower(func.json_unquote(func.json_extract(FigureModels.data, "$.word")))
+        match_conds = []
+        if long_aliases:
+            alias_re = "(^|[^a-z0-9])(" + "|".join(long_aliases) + ")([^a-z0-9]|$)"
+            match_conds.append(word_expr.regexp_match(alias_re))
+        if short_aliases:
+            match_conds.append(word_expr.in_(short_aliases))
+
+        matched_rows = db.session.execute(db.select(FigureModels.data).where(or_(*match_conds))).fetchall()
+        if not matched_rows:
+            return BARUtils.success_exit({"figures": {}, "allImageWords": {}})
+
+        # Collect each matched image and its boxes (keep the image even if a box is missing)
+        bbox_by_name = {}
+        for row in matched_rows:
+            d = row.data if isinstance(row.data, dict) else json.loads(row.data)
+            # "image" may be absent or JSON null; treat both as no images
+            for img in d.get("image") or []:
+                name = (img.get("imageName") or "").lstrip("/")
+                if not name:
+                    continue
+                bbox_list = bbox_by_name.setdefault(name, [])
+                bbox = img.get("bbox")
+                if bbox is not None:
+                    bbox_list.append(bbox)
+
+        stripped_names = list(bbox_by_name.keys())
+        if not stripped_names:
+            return BARUtils.success_exit({"figures": {}, "allImageWords": {}})
+
+        # Drop image names used by more than one publication, we can't attribute those.
+        # Only the candidate names are grouped, so the subquery does not scan every figure.
+        collision = (
+            db.select(Figures.img_name)
+            .where(Figures.img_name.in_(stripped_names))
+            .group_by(Figures.img_name)
+            .having(func.count(func.distinct(Figures.publication_figures_id)) > 1)
+        )
+
+        # Pull the figures and their publication info, skip null urls, newest pubmed first
+        core_stmt = (
+            db.select(
+                PubIds.pmc,
+                PubIds.pubmed,
+                PublicationFigures.id.label("pf_id"),
+                PublicationFigures.title,
+                PublicationFigures.abstract,
+                Figures.img_name,
+                Figures.img_url,
+                Figures.caption,
+            )
+            .select_from(Figures)
+            .join(PublicationFigures, PublicationFigures.id == Figures.publication_figures_id)
+            .join(PubIds, PubIds.publication_figures_id == PublicationFigures.id)
+            .where(Figures.img_name.in_(stripped_names))
+            .where(Figures.img_url.isnot(None))
+            .where(Figures.img_name.not_in(collision))
+            .order_by(cast(PubIds.pubmed, mysql.INTEGER(unsigned=True)).desc())
+        )
+        fig_rows = db.session.execute(core_stmt).fetchall()
+
+        if not fig_rows:
+            return BARUtils.success_exit({"figures": {}, "allImageWords": {}})
+
+        # Group figures by PMC, one entry per image name
+        figures_by_pmc, pmc_to_pf, pf_ids, seen_names = {}, {}, set(), set()
+        for r in fig_rows:
+            pf_ids.add(r.pf_id)
+            pmc_to_pf[r.pmc] = r.pf_id
+            if r.pmc not in figures_by_pmc:
+                figures_by_pmc[r.pmc] = {
+                    "title": r.title,
+                    "abstract": r.abstract,
+                    "authors": [],
+                    "pubmed": r.pubmed,
+                    "figures": [],
+                }
+            if r.img_name in seen_names:
+                continue
+            seen_names.add(r.img_name)
+            figures_by_pmc[r.pmc]["figures"].append(
+                {
+                    "img_name": r.img_name,
+                    "img_url": r.img_url,
+                    "caption": r.caption,
+                    "bbox": bbox_by_name.get(r.img_name, []),
+                }
+            )
+
+        # Attach authors to each publication
+        authors_by_pf = {}
+        for r in db.session.execute(
+            db.select(AuthorList.publication_figures_id, AuthorList.author).filter(
+                AuthorList.publication_figures_id.in_(pf_ids)
+            )
+        ).fetchall():
+            authors_by_pf.setdefault(r.publication_figures_id, []).append(r.author)
+        for pmc, pf_id in pmc_to_pf.items():
+            figures_by_pmc[pmc]["authors"] = authors_by_pf.get(pf_id, [])
+
+        # allImageWords: gene words detected on the shown figures, for the gene-name filter
+        displayed_names = list({r.img_name for r in fig_rows})
+        all_image_words = {}
+        if displayed_names:
+            displayed_slashed = json.dumps(["/" + n for n in displayed_names])  # stored names keep a leading /
+            words_rows = db.session.execute(
+                db.select(FigureModels.data)
+                .where(func.json_unquote(func.json_extract(FigureModels.data, "$.gene")) == "true")
+                .where(
+                    func.json_overlaps(
+                        func.json_extract(FigureModels.data, "$.image[*].imageName"),
+                        cast(literal(displayed_slashed), mysql.JSON),
+                    )
+                )
+            ).fetchall()
+            displayed_set = set(displayed_names)
+            for row in words_rows:
+                d = row.data if isinstance(row.data, dict) else json.loads(row.data)
+                word = (d.get("word") or "").lower()
+                for img in d.get("image") or []:
+                    name = (img.get("imageName") or "").lstrip("/")
+                    if name in displayed_set:
+                        bbox = img.get("bbox")
+                        all_image_words.setdefault(word, {})[name] = bbox if bbox is not None else []
+
+        # Return final data
+        return BARUtils.success_exit({"figures": figures_by_pmc, "allImageWords": all_image_words})
diff --git a/config/BAR_API.cfg b/config/BAR_API.cfg
index a7f2cbf..a241974 100755
--- a/config/BAR_API.cfg
+++ b/config/BAR_API.cfg
@@ -42,5 +42,6 @@ SQLALCHEMY_BINDS = {
     'striga' : 'mysql://root:root@localhost/striga',
     'tomato_nssnp' : 'mysql://root:root@localhost/tomato_nssnp',
     'tomato_sequence' : 'mysql://root:root@localhost/tomato_sequence',
-    'triphysaria' : 'mysql://root:root@localhost/triphysaria'
+    'triphysaria' :
'mysql://root:root@localhost/triphysaria', + 'gaia' : 'mysql://root:root@localhost/gaia' } diff --git a/config/databases/gaia.sql b/config/databases/gaia.sql new file mode 100644 index 0000000..b3f9a85 --- /dev/null +++ b/config/databases/gaia.sql @@ -0,0 +1,235 @@ +-- MySQL dump 10.13 Distrib 9.4.0, for Linux (x86_64) +-- +-- Host: localhost Database: gaia +-- ------------------------------------------------------ +-- Server version 9.4.0 +-- +-- Curated TEST FIXTURE for the `gaia` bind (BAR_API local/CI) — NOT a prod dump +-- (read-only access blocks mysqldump). Small by design; each row group trips one +-- rule of GET /gaia/publication_figures_by_gene/{identifier}: +-- * PMC151246 (real PMID 12045268, figs 01-0441f1..f5): abi3 OCR'd on f2+f4 only. +-- * abi3/vp1 (gene:false) on fpls-10-00228-g003.jpg : word-boundary match must CATCH. +-- * gabi390_r (gene:false) on gabitest.jpg : word-boundary match must REJECT. +-- * gr1.jpg in two publications : bare-name guard must DROP. +-- * nullfig.jpg (NULL img_url) : null-url skip must DROP. +-- * gene 2 (alias NOMATCH4) : empty-payload (200, not 404) path. +-- figure_models.imageName keeps a leading "/" to exercise the endpoint's leading-slash stripping. +-- DDL matched to prod SHOW CREATE TABLE (2026-06-20). FK constraints intentionally +-- omitted (they do not affect read queries; keeps the fixture load-order-independent). 
+ +/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; +/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; +/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; +/*!50503 SET NAMES utf8mb4 */; +/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; +/*!40103 SET TIME_ZONE='+00:00' */; +/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */; +/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; +/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; +/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; + +-- +-- Current Database: `gaia` +-- + +CREATE DATABASE /*!32312 IF NOT EXISTS*/ `gaia` /*!40100 DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci */ /*!80016 DEFAULT ENCRYPTION='N' */; + +USE `gaia`; + +-- +-- Table structure for table `genes` +-- + +DROP TABLE IF EXISTS `genes`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `genes` ( + `id` int NOT NULL AUTO_INCREMENT, + `species` varchar(64) NOT NULL, + `locus` varchar(64) DEFAULT NULL, + `geneid` varchar(32) DEFAULT NULL, + PRIMARY KEY (`id`), + KEY `idx_locus` (`locus`), + KEY `idx_geneid` (`geneid`) +) ENGINE=InnoDB AUTO_INCREMENT=4 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `genes` +-- + +LOCK TABLES `genes` WRITE; +/*!40000 ALTER TABLE `genes` DISABLE KEYS */; +INSERT INTO `genes` VALUES (1,'Arabidopsis_thaliana','At3g24650','822061'),(2,'Arabidopsis_thaliana','At1g01010','000001'),(3,'Arabidopsis_thaliana','At5g12340','999340'); +/*!40000 ALTER TABLE `genes` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `aliases` +-- + +DROP TABLE IF EXISTS `aliases`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `aliases` ( + `id` int NOT NULL 
AUTO_INCREMENT, + `genes_id` int NOT NULL, + `alias` varchar(256) NOT NULL, + PRIMARY KEY (`id`), + KEY `FK_genes` (`genes_id`), + KEY `idx_aliases` (`alias`,`genes_id`) +) ENGINE=InnoDB AUTO_INCREMENT=9 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `aliases` +-- + +LOCK TABLES `aliases` WRITE; +/*!40000 ALTER TABLE `aliases` DISABLE KEYS */; +INSERT INTO `aliases` VALUES (1,1,'ABI3'),(2,1,'SIS10'),(3,1,'AtABI3'),(4,1,'ABA INSENSITIVE 3'),(5,1,'ABSCISIC ACID INSENSITIVE 3'),(6,1,'SUGAR INSENSITIVE 10'),(7,2,'NOMATCH4'),(8,3,'34G'); +/*!40000 ALTER TABLE `aliases` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `publication_figures` +-- + +DROP TABLE IF EXISTS `publication_figures`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `publication_figures` ( + `id` int NOT NULL AUTO_INCREMENT, + `title` varchar(512) DEFAULT NULL, + `abstract` text, + PRIMARY KEY (`id`) +) ENGINE=InnoDB AUTO_INCREMENT=71 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `publication_figures` +-- +-- NOTE: pub 60 (PMC6403161) title/abstract are stored TRUNCATED in the prod corpus +-- (title ends "... Responses in ", abstract ends "... selected family members in ") — kept VERBATIM. 
+ +LOCK TABLES `publication_figures` WRITE; +/*!40000 ALTER TABLE `publication_figures` DISABLE KEYS */; +INSERT INTO `publication_figures` VALUES (10,'Abscisic acid signaling in seeds and seedlings.','Review of ABA signaling pathways in seeds and seedlings.'),(20,'Unrelated paper A (collision partner 1).','Abstract A.'),(30,'Unrelated paper B (collision partner 2 + null url).','Abstract B.'),(40,'Unrelated paper C (false-positive container).','Abstract C.'),(60,'AP2/ERF Transcription Factor Regulatory Networks in Hormone and Abiotic Stress Responses in ','Dynamic environmental changes such as extreme temperature, water scarcity and high salinity affect plant growth, survival, and reproduction. Plants have evolved sophisticated regulatory mechanisms to adapt to these unfavorable conditions, many of which interface with plant hormone signaling pathways. Abiotic stresses alter the production and distribution of phytohormones that in turn mediate stress responses at least in part through hormone- and stress-responsive transcription factors. Among these, the APETALA2/ETHYLENE RESPONSIVE FACTOR (AP2/ERF) family transcription factors (AP2/ERFs) have emerged as key regulators of various stress responses, in which they also respond to hormones with improved plant survival during stress conditions. Apart from participation in specific stresses, AP2/ERFs are involved in a wide range of stress tolerance, enabling them to form an interconnected stress regulatory network. Additionally, many AP2/ERFs respond to the plant hormones abscisic acid (ABA) and ethylene (ET) to help activate ABA and ET dependent and independent stress-responsive genes. While some AP2/ERFs are implicated in growth and developmental processes mediated by gibberellins (GAs), cytokinins (CTK), and brassinosteroids (BRs). The involvement of AP2/ERFs in hormone signaling adds the complexity of stress regulatory network. 
In this review, we summarize recent studies on AP2/ERF transcription factors in hormonal and abiotic stress responses with an emphasis on selected family members in '),(70,'Short-code gene 34G test figure.','Abstract for the 34G short-alias exact-match test.'); +/*!40000 ALTER TABLE `publication_figures` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `figures` +-- + +DROP TABLE IF EXISTS `figures`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `figures` ( + `id` int NOT NULL AUTO_INCREMENT, + `publication_figures_id` int NOT NULL, + `img_name` varchar(64) NOT NULL, + `caption` text, + `img_url` varchar(265) DEFAULT NULL, + PRIMARY KEY (`id`), + KEY `FK_publication_figures_figures` (`publication_figures_id`) +) ENGINE=InnoDB AUTO_INCREMENT=704 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `figures` +-- + +LOCK TABLES `figures` WRITE; +/*!40000 ALTER TABLE `figures` DISABLE KEYS */; +INSERT INTO `figures` VALUES (100,10,'01-0441f1.jpg','Domain Structure of B3 and bZIP Domain Transcription Factors','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/82d7/151246/8e4f6ebe4139/01-0441f1.jpg'),(101,10,'01-0441f2.jpg','Scheme of Signaling Pathways in Seed Development.','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/82d7/151246/a56d4d6d5560/01-0441f2.jpg'),(102,10,'01-0441f3.jpg','Regulation of ABA-Responsive Promoter Activity in a Rice Embryo Protoplast','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/82d7/151246/67ad8027f184/01-0441f3.jpg'),(103,10,'01-0441f4.jpg','Scheme of Signaling Pathways That Interact with the ABA Regulation of Germination.','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/82d7/151246/7be9f20adc7b/01-0441f4.jpg'),(104,10,'01-0441f5.jpg','Sensitivity of Seedlings of Wild-Type, abi, and ABI Overexpressing 
Lines','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/82d7/151246/3c7ab6e933bb/01-0441f5.jpg'),(200,20,'gr1.jpg','Collision figure (pub A).','https://example.org/PMC900001/gr1.jpg'),(300,30,'gr1.jpg','Collision figure (pub B).','https://example.org/PMC900002/gr1.jpg'),(301,30,'nullfig.jpg','Figure with no URL.',NULL),(400,40,'gabitest.jpg','GABI line figure (false-positive bait).','https://example.org/PMC900003/gabitest.jpg'),(600,60,'fpls-10-00228-g003.jpg','AP2/ERFs roles in hormone pathways. Abiotic stresses alter the production and distribution of phytohormones that in turn mediate stresses responses through hormone signaling components and AP2/ERFs. Arrows and bar ends indicate activation and repression effect, respectively. Figure is created with BioRender.','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/9ceb/6403161/d18acaa7332e/fpls-10-00228-g003.jpg'),(700,70,'fpls-11-01234-g002.jpg','Real 34G figure (valid bbox).','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/aaaa/7000001/aaaa1111aaaa/fpls-11-01234-g002.jpg'),(701,70,'fpls-11-01234-g005.jpg','Boundary decoy figure (word 34g/x).','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/aaaa/7000001/bbbb2222bbbb/fpls-11-01234-g005.jpg'),(702,70,'fpls-11-01234-g009.jpg','Substring decoy figure (word x34gy).','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/aaaa/7000001/cccc3333cccc/fpls-11-01234-g009.jpg'),(703,70,'fpls-11-01234-g003.jpg','Malformed-bbox figure (OCR entry has no bbox).','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/aaaa/7000001/dddd4444dddd/fpls-11-01234-g003.jpg'); +/*!40000 ALTER TABLE `figures` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `pub_ids` +-- + +DROP TABLE IF EXISTS `pub_ids`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `pub_ids` ( + `id` int NOT NULL AUTO_INCREMENT, + `publication_figures_id` int NOT NULL, + `pubmed` varchar(16) DEFAULT NULL, + `pmc` varchar(16) DEFAULT NULL, + PRIMARY KEY (`id`), + KEY 
`FK_publication_figures_pub_ids` (`publication_figures_id`) +) ENGINE=InnoDB AUTO_INCREMENT=71 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `pub_ids` +-- + +LOCK TABLES `pub_ids` WRITE; +/*!40000 ALTER TABLE `pub_ids` DISABLE KEYS */; +INSERT INTO `pub_ids` VALUES (10,10,'12045268','PMC151246'),(20,20,'30000001','PMC900001'),(30,30,'30000002','PMC900002'),(40,40,'30000003','PMC900003'),(60,60,'30873200','PMC6403161'),(70,70,'31000000','PMC7000001'); +/*!40000 ALTER TABLE `pub_ids` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `author_list` +-- + +DROP TABLE IF EXISTS `author_list`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `author_list` ( + `id` int NOT NULL AUTO_INCREMENT, + `publication_figures_id` int NOT NULL, + `author` varchar(128) NOT NULL, + PRIMARY KEY (`id`), + KEY `FK_publication_figures_author_list` (`publication_figures_id`) +) ENGINE=InnoDB AUTO_INCREMENT=10 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `author_list` +-- + +LOCK TABLES `author_list` WRITE; +/*!40000 ALTER TABLE `author_list` DISABLE KEYS */; +INSERT INTO `author_list` VALUES (1,10,'Finkelstein RR'),(2,10,'Gampala SSL'),(3,10,'Rock CD'),(5,60,'Zhouli Xie'),(6,60,'Trevor M Nolan'),(7,60,'Hao Jiang'),(8,60,'Yanhai Yin'),(9,70,'Test Author A'); +/*!40000 ALTER TABLE `author_list` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `figure_models` +-- + +DROP TABLE IF EXISTS `figure_models`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `figure_models` ( + `id` int NOT NULL AUTO_INCREMENT, + `data` json DEFAULT NULL, + PRIMARY KEY (`id`) +) ENGINE=InnoDB AUTO_INCREMENT=8 DEFAULT CHARSET=utf8mb4 
COLLATE=utf8mb4_0900_ai_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `figure_models` +-- + +LOCK TABLES `figure_models` WRITE; +/*!40000 ALTER TABLE `figure_models` DISABLE KEYS */; +INSERT INTO `figure_models` VALUES (1,'{"gene": true, "word": "abi3", "image": [{"bbox": [[[268,108],[317,108],[317,118],[268,118]],[[45,110],[72,111],[72,120],[45,119]]], "imageName": "/01-0441f2.jpg"},{"bbox": [[[274,96],[298,96],[298,105],[274,105]]], "imageName": "/01-0441f4.jpg"},{"bbox": [[[10,10],[40,10],[40,20],[10,20]]], "imageName": "/gr1.jpg"},{"bbox": [[[5,5],[25,5],[25,15],[5,15]]], "imageName": "/nullfig.jpg"}]}'),(3,'{"gene": false, "word": "gabi390_r", "image": [{"bbox": [[[200,200],[260,200],[260,212],[200,212]]], "imageName": "/gabitest.jpg"}]}'),(4,'{"gene": false, "word": "abi3/vp1", "image": [{"bbox": [[[34, 357], [96, 357], [96, 369], [34, 369]]], "imageName": "/fpls-10-00228-g003.jpg"}]}'),(5,'{"gene": true, "word": "34g", "image": [{"bbox": [[[10,10],[40,10],[40,20],[10,20]]], "imageName": "/fpls-11-01234-g002.jpg"},{"imageName": "/fpls-11-01234-g003.jpg"}]}'),(6,'{"gene": false, "word": "34g/x", "image": [{"bbox": [[[50,50],[80,50],[80,60],[50,60]]], "imageName": "/fpls-11-01234-g005.jpg"}]}'),(7,'{"gene": false, "word": "x34gy", "image": [{"bbox": [[[70,70],[90,70],[90,80],[70,80]]], "imageName": "/fpls-11-01234-g009.jpg"}]}'); +/*!40000 ALTER TABLE `figure_models` ENABLE KEYS */; +UNLOCK TABLES; +/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; + +/*!40101 SET SQL_MODE=@OLD_SQL_MODE */; +/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; +/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */; +/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; +/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; +/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; +/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; + +-- Dump completed \ No newline at end of file diff --git a/config/init.sh 
b/config/init.sh index c3e4a06..2002159 100755 --- a/config/init.sh +++ b/config/init.sh @@ -48,6 +48,7 @@ mysql -u $DB_USER -p$DB_PASS < ./config/databases/striga.sql mysql -u $DB_USER -p$DB_PASS < ./config/databases/tomato_nssnp.sql mysql -u $DB_USER -p$DB_PASS < ./config/databases/tomato_sequence.sql mysql -u $DB_USER -p$DB_PASS < ./config/databases/triphysaria.sql +mysql -u $DB_USER -p$DB_PASS < ./config/databases/gaia.sql echo "Data are now loaded. Preparing API config" echo "Please manually edit config file!" diff --git a/tests/resources/test_gaia.py b/tests/resources/test_gaia.py new file mode 100644 index 0000000..dd5e7e6 --- /dev/null +++ b/tests/resources/test_gaia.py @@ -0,0 +1,120 @@ +from api import app, db +from unittest import TestCase +from sqlalchemy.exc import UnboundExecutionError + + +class TestGaiaPublicationFiguresByGene(TestCase): + def setUp(self): + self.app_client = app.test_client() + with app.app_context(): + try: + self.gaia_is_mysql = db.engines["gaia"].dialect.name == "mysql" + except (KeyError, UnboundExecutionError): + self.gaia_is_mysql = False + + def _require_mysql(self): + if not self.gaia_is_mysql: + self.skipTest("requires MySQL gaia bind; skipped under SQLite harness") + + def test_publication_figures_by_gene_abi3(self): + """ABI3 should surface OCR-detected figures grouped by PMC, exercising every + fixture rule (match, word-boundary catch/reject, bare-name guard, null-url skip). + :return: + """ + self._require_mysql() + response = self.app_client.get("/gaia/publication_figures_by_gene/ABI3") + self.assertEqual(response.status_code, 200) + + body = response.json + self.assertTrue(body["wasSuccessful"]) + data = body["data"] + figures = data["figures"] + + # ABI3 returns two REAL pubs -> assert by set membership, not position. + self.assertLessEqual({"PMC6403161", "PMC151246"}, set(figures)) + + # PMC151246: abi3 OCR'd on f2 + f4 only (NOT f1/f3/f5). 
+ self.assertIn("PMC151246", figures) + pmc151246_names = {f["img_name"] for f in figures["PMC151246"]["figures"]} + self.assertIn("01-0441f2.jpg", pmc151246_names) + self.assertIn("01-0441f4.jpg", pmc151246_names) + self.assertNotIn("01-0441f1.jpg", pmc151246_names) + self.assertNotIn("01-0441f3.jpg", pmc151246_names) + self.assertNotIn("01-0441f5.jpg", pmc151246_names) + + # PMC6403161: word-boundary match must CATCH the gene:false word abi3/vp1 -> + # fpls-10-00228-g003.jpg. Exact IN(...) would miss it -> this pins boundary-vs-exact. + self.assertIn("PMC6403161", figures) + pmc6403161_names = {f["img_name"] for f in figures["PMC6403161"]["figures"]} + self.assertIn("fpls-10-00228-g003.jpg", pmc6403161_names) + + # Across all PMCs: gr1.jpg dropped (bare-name guard), nullfig.jpg dropped + # (null img_url), gabitest.jpg dropped (gabi390_r boundary-rejected). + all_names = {f["img_name"] for pmc in figures.values() for f in pmc["figures"]} + self.assertNotIn("gr1.jpg", all_names) + self.assertNotIn("nullfig.jpg", all_names) + self.assertNotIn("gabitest.jpg", all_names) + + # allImageWords powers the gene-name filter. + self.assertIn("abi3", data["allImageWords"]) + + # Authors are attached per publication. + self.assertIn("Finkelstein RR", figures["PMC151246"]["authors"]) + + def test_publication_figures_by_gene_empty_payload(self): + """A valid gene with no OCR-matched figures returns 200 with an empty payload, + not a 404. + :return: + """ + self._require_mysql() + response = self.app_client.get("/gaia/publication_figures_by_gene/NOMATCH4") + self.assertEqual(response.status_code, 200) + expected = {"wasSuccessful": True, "data": {"figures": {}, "allImageWords": {}}} + self.assertEqual(response.json, expected) + + def test_publication_figures_by_gene_short_alias_34g(self): + """A short (<=3 char) alias uses EXACT-IN matching, not the word-boundary regex. 
+ 34G must match the OCR word '34g' exactly and reject the boundary-delimited decoy + '34g/x' and the substring decoy 'x34gy'. Also covers the malformed (no-bbox) OCR + entry -> the figure still returns with a clean bbox list (no None). + :return: + """ + self._require_mysql() + response = self.app_client.get("/gaia/publication_figures_by_gene/34G") + self.assertEqual(response.status_code, 200) + + body = response.json + self.assertTrue(body["wasSuccessful"]) + figures = body["data"]["figures"] + + self.assertIn("PMC7000001", figures) + names = {f["img_name"] for f in figures["PMC7000001"]["figures"]} + + # Exact match on '34g' returns its real figure (g002) and the malformed-bbox figure (g003). + self.assertIn("fpls-11-01234-g002.jpg", names) + self.assertIn("fpls-11-01234-g003.jpg", names) + + # Exact-IN must NOT match the boundary decoy '34g/x' (the regex WOULD have) nor the + # substring decoy 'x34gy' (a LIKE would have) -> both figures absent. + self.assertNotIn("fpls-11-01234-g005.jpg", names) + self.assertNotIn("fpls-11-01234-g009.jpg", names) + + # Malformed OCR entry (imageName but no bbox) -> figure returns with a clean, None-free + # bbox list (here empty, since its only OCR entry had no box). + by_name = {f["img_name"]: f for f in figures["PMC7000001"]["figures"]} + self.assertEqual(by_name["fpls-11-01234-g003.jpg"]["bbox"], []) + for fig in figures["PMC7000001"]["figures"]: + self.assertNotIn(None, fig["bbox"]) + + # allImageWords keeps the malformed image's key but maps it to [] (not null, not absent), + # so the gene-name filter still lists it without the frontend choking on a null box. + words_34g = body["data"]["allImageWords"]["34g"] + self.assertEqual(words_34g["fpls-11-01234-g003.jpg"], []) + + def test_publication_figures_by_gene_invalid_identifier(self): + """An identifier failing the gaia alias check returns a 400 error. 
+        :return:
+        """
+        response = self.app_client.get("/gaia/publication_figures_by_gene/abc!def")
+        self.assertEqual(response.status_code, 400)
+        self.assertEqual(response.json, {"wasSuccessful": False, "error": "Invalid identifier"})