From 70a57d0f3fcb39c6d04bcfe537efe607b7284dc0 Mon Sep 17 00:00:00 2001 From: Vin Date: Mon, 22 Jun 2026 03:42:15 -0400 Subject: [PATCH 1/2] Add gaia publication_figures_by_gene endpoint + wire gaia for tests GET /gaia/publication_figures_by_gene/ resolves a gene to its alias set, matches OCR-detected figure words (word-boundary regex for aliases >=4 chars, exact match for <=3), and returns figures grouped by PMC with img_url/caption/bbox plus allImageWords. Includes the bare-name collision guard, null-url skip, and numeric-pubmed ordering. Implemented in pure SQLAlchemy Core (gaia bind inferred from models; no raw text()), in a similar fashion to our typical endpoints. - models: add AuthorList + FigureModels (reuse existing gaia models) - wire gaia for local/CI: SQLALCHEMY_BINDS entry, init.sh load line, and a curated config/databases/gaia.sql test fixture - tests: test_gaia.py with a skip-unless-MySQL guard so the MySQL-only endpoint skips (not errors) under the SQLite test harness --- api/models/gaia.py | 19 +++ api/resources/gaia.py | 170 ++++++++++++++++++++++++- config/BAR_API.cfg | 3 +- config/databases/gaia.sql | 235 +++++++++++++++++++++++++++++++++++ config/init.sh | 1 + tests/resources/test_gaia.py | 81 ++++++++++++ 6 files changed, 506 insertions(+), 3 deletions(-) create mode 100644 config/databases/gaia.sql create mode 100644 tests/resources/test_gaia.py diff --git a/api/models/gaia.py b/api/models/gaia.py index ec276356..2a2346ce 100644 --- a/api/models/gaia.py +++ b/api/models/gaia.py @@ -60,3 +60,22 @@ class Figures(db.Model): img_name: db.Mapped[str] = db.mapped_column(db.String(64), nullable=False) caption: db.Mapped[str] = db.mapped_column(db.Text, nullable=True) img_url: db.Mapped[str] = db.mapped_column(db.String(256), nullable=True) + + +class AuthorList(db.Model): + __bind_key__ = "gaia" + __tablename__ = "author_list" + + id: db.Mapped[int] = db.mapped_column(db.Integer, nullable=False, primary_key=True) + publication_figures_id: db.Mapped[int] =
db.mapped_column( + ForeignKey("publication_figures.id", ondelete="CASCADE"), nullable=False + ) + author: db.Mapped[str] = db.mapped_column(db.String(128), nullable=False) + + +class FigureModels(db.Model): + __bind_key__ = "gaia" + __tablename__ = "figure_models" + + id: db.Mapped[int] = db.mapped_column(db.Integer, nullable=False, primary_key=True) + data: db.Mapped[dict] = db.mapped_column(db.JSON, nullable=True) diff --git a/api/resources/gaia.py b/api/resources/gaia.py index 8c7818e8..aae863e8 100644 --- a/api/resources/gaia.py +++ b/api/resources/gaia.py @@ -3,10 +3,12 @@ from markupsafe import escape from api import db from api.utils.bar_utils import BARUtils -from api.models.gaia import Genes, Aliases, PubIds, Figures -from sqlalchemy import func, or_ +from api.models.gaia import Genes, Aliases, PublicationFigures, PubIds, Figures, AuthorList, FigureModels +from sqlalchemy import func, or_, cast, literal +from sqlalchemy.dialects import mysql from marshmallow import Schema, ValidationError, fields as marshmallow_fields import json +import re gaia = Namespace("Gaia", description="Gaia", path="/gaia") @@ -174,3 +176,167 @@ def post(self): # Return final data return BARUtils.success_exit(data) + + +@gaia.route("/publication_figures_by_gene/") +class GaiaPublicationFiguresByGene(Resource): + @gaia.param("identifier", _in="path", default="ABI3") + def get(self, identifier=""): + # Escape input and validate + identifier = escape(identifier) + if not BARUtils.is_gaia_alias(identifier): + return BARUtils.error_exit("Invalid identifier"), 400 + + # --- Resolve identifier -> gene id(s) -> the gene's full alias set --- + rows = db.session.execute(db.select(Aliases.genes_id).filter(Aliases.alias == identifier)).fetchall() + gene_ids = [r.genes_id for r in rows] + + if not gene_ids: # not an alias - try locus / ncbi geneid + rows = db.session.execute( + db.select(Genes.id).filter(or_(Genes.locus == identifier, Genes.geneid == identifier)) + ).fetchall() + gene_ids = 
[r.id for r in rows] + + if not gene_ids: + return BARUtils.error_exit("Nothing found"), 404 + + aliases = [ + r.alias.lower() + for r in db.session.execute(db.select(Aliases.alias).filter(Aliases.genes_id.in_(gene_ids))).fetchall() + ] + + # --- Build the OCR word-boundary match (V1-resolved policy, no gene:true filter) --- + # Aliases >= 4 chars: whole-token regex match (catches "abi3/vp1", rejects "gabi390_r"). + # Aliases <= 3 chars: exact IN(...) - a boundary match on a short token over-matches. + long_aliases = sorted({re.escape(a) for a in aliases if len(a) >= 4}) + short_aliases = sorted({a for a in aliases if len(a) < 4}) + + # Gene resolved but no usable aliases -> empty payload (frontend falls back to PubMed feed). + if not long_aliases and not short_aliases: + return BARUtils.success_exit({"figures": {}, "allImageWords": {}}) + + # --- Step 1: match on the scalar OCR word (pure Core; no array explosion) --- + word_expr = func.lower(func.json_unquote(func.json_extract(FigureModels.data, "$.word"))) + match_conds = [] + if long_aliases: + alias_re = "(^|[^a-z0-9])(" + "|".join(long_aliases) + ")([^a-z0-9]|$)" + match_conds.append(word_expr.regexp_match(alias_re)) + if short_aliases: + match_conds.append(word_expr.in_(short_aliases)) + + matched_rows = db.session.execute(db.select(FigureModels.data).where(or_(*match_conds))).fetchall() + if not matched_rows: + return BARUtils.success_exit({"figures": {}, "allImageWords": {}}) + + # --- Step 2: unpack matched rows' image[] in Python -> name -> [bbox] (data: dict or str) --- + bbox_by_name = {} + for row in matched_rows: + d = row.data if isinstance(row.data, dict) else json.loads(row.data) + for img in d.get("image", []): + name = (img.get("imageName") or "").lstrip("/") + if name: + bbox_by_name.setdefault(name, []).append(img.get("bbox")) + + stripped_names = list(bbox_by_name.keys()) + if not stripped_names: + return BARUtils.success_exit({"figures": {}, "allImageWords": {}}) + + # --- Step 3: figures + 
publication metadata (pure Core; bbox now comes from Python) --- + # Bare-name guard: exclude any img_name mapping to >1 publication (un-attributable detections). + collision = ( + db.select(Figures.img_name) + .group_by(Figures.img_name) + .having(func.count(func.distinct(Figures.publication_figures_id)) > 1) + ) + core_stmt = ( + db.select( + PubIds.pmc, + PubIds.pubmed, + PublicationFigures.id.label("pf_id"), + PublicationFigures.title, + PublicationFigures.abstract, + Figures.img_name, + Figures.img_url, + Figures.caption, + ) + .select_from(Figures) + .join(PublicationFigures, PublicationFigures.id == Figures.publication_figures_id) + .join(PubIds, PubIds.publication_figures_id == PublicationFigures.id) + .where(Figures.img_name.in_(stripped_names)) + .where(Figures.img_url.isnot(None)) + .where(Figures.img_name.not_in(collision)) + .order_by(cast(PubIds.pubmed, mysql.INTEGER(unsigned=True)).desc()) + ) + fig_rows = db.session.execute(core_stmt).fetchall() + + # Valid gene, but no OCR-matched figures -> empty payload (NOT 404). + if not fig_rows: + return BARUtils.success_exit({"figures": {}, "allImageWords": {}}) + + # --- Step 4: group by PMC; dedupe figures by img_name; attach the full accumulated bbox list --- + # Dedup guard: the collision guard only excludes names spanning >1 publication, so two figures + # rows with the same name under one pf_id could still duplicate. Surviving img_name -> pmc is + # 1:1, so a seen-set on img_name is sufficient; bbox_by_name[name] already holds the full list. 
+ figures_by_pmc, pmc_to_pf, pf_ids, seen_names = {}, {}, set(), set() + for r in fig_rows: + pmc = r.pmc + pf_ids.add(r.pf_id) + pmc_to_pf[pmc] = r.pf_id + if pmc not in figures_by_pmc: + figures_by_pmc[pmc] = { + "title": r.title, + "abstract": r.abstract, + "authors": [], + "pubmed": r.pubmed, + "figures": [], + } + if r.img_name in seen_names: + continue + seen_names.add(r.img_name) + figures_by_pmc[pmc]["figures"].append( + { + "img_name": r.img_name, + "img_url": r.img_url, + "caption": r.caption, + "bbox": bbox_by_name.get(r.img_name, []), + } + ) + + # --- Step 5: authors per publication (one bulk query; db.select infers the gaia bind) --- + authors_by_pf = {} + for r in db.session.execute( + db.select(AuthorList.publication_figures_id, AuthorList.author).filter( + AuthorList.publication_figures_id.in_(pf_ids) + ) + ).fetchall(): + authors_by_pf.setdefault(r.publication_figures_id, []).append(r.author) + for pmc, pf_id in pmc_to_pf.items(): + figures_by_pmc[pmc]["authors"] = authors_by_pf.get(pf_id, []) + + # --- Step 6: allImageWords (pure Core JSON_OVERLAPS pre-filter + Python unpack) --- + # gene:true words detected on the displayed figures. JSON_OVERLAPS pre-filters rows that share + # any displayed image; the Python pass keeps only the displayed names. All gaia-bound -> bind inferred. 
+ displayed_names = list({r.img_name for r in fig_rows}) + all_image_words = {} + if displayed_names: + displayed_slashed = json.dumps(["/" + n for n in displayed_names]) # re-add slash to match stored + words_rows = db.session.execute( + db.select(FigureModels.data) + .where(func.json_unquote(func.json_extract(FigureModels.data, "$.gene")) == "true") + .where( + func.json_overlaps( + func.json_extract(FigureModels.data, "$.image[*].imageName"), + cast(literal(displayed_slashed), mysql.JSON), + ) + ) + ).fetchall() + displayed_set = set(displayed_names) + for row in words_rows: + d = row.data if isinstance(row.data, dict) else json.loads(row.data) + word = (d.get("word") or "").lower() + for img in d.get("image", []): + name = (img.get("imageName") or "").lstrip("/") + if name in displayed_set: + all_image_words.setdefault(word, {})[name] = img.get("bbox") + + return BARUtils.success_exit({"figures": figures_by_pmc, "allImageWords": all_image_words}) diff --git a/config/BAR_API.cfg b/config/BAR_API.cfg index a7f2cbff..a2419747 100755 --- a/config/BAR_API.cfg +++ b/config/BAR_API.cfg @@ -42,5 +42,6 @@ SQLALCHEMY_BINDS = { 'striga' : 'mysql://root:root@localhost/striga', 'tomato_nssnp' : 'mysql://root:root@localhost/tomato_nssnp', 'tomato_sequence' : 'mysql://root:root@localhost/tomato_sequence', - 'triphysaria' : 'mysql://root:root@localhost/triphysaria' + 'triphysaria' : 'mysql://root:root@localhost/triphysaria', + 'gaia' : 'mysql://root:root@localhost/gaia' } diff --git a/config/databases/gaia.sql b/config/databases/gaia.sql new file mode 100644 index 00000000..c15a7b44 --- /dev/null +++ b/config/databases/gaia.sql @@ -0,0 +1,235 @@ +-- MySQL dump 10.13 Distrib 9.4.0, for Linux (x86_64) +-- +-- Host: localhost Database: gaia +-- ------------------------------------------------------ +-- Server version 9.4.0 +-- +-- Curated TEST FIXTURE for the `gaia` bind (BAR_API local/CI) — NOT a prod dump +-- (read-only access blocks mysqldump). 
Small by design; each row group trips one +-- rule of GET /gaia/publication_figures_by_gene/: +-- * PMC151246 (real PMID 12045268, figs 01-0441f1..f5): abi3 OCR'd on f2+f4 only. +-- * abi3/vp1 (gene:false) on fpls-10-00228-g003.jpg : word-boundary match must CATCH. +-- * gabi390_r (gene:false) on gabitest.jpg : word-boundary match must REJECT. +-- * gr1.jpg in two publications : bare-name guard must DROP. +-- * nullfig.jpg (NULL img_url) : null-url skip must DROP. +-- * gene 2 (alias NOMATCH4) : empty-payload (200, not 404) path. +-- figure_models.imageName keeps a leading "/" to exercise the endpoint's leading-slash handling (stripped on unpack, re-added for the JSON_OVERLAPS filter). +-- DDL matched to prod SHOW CREATE TABLE (2026-06-20). FK constraints intentionally +-- omitted (they do not affect read queries; keeps the fixture load-order-independent). + +/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; +/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; +/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; +/*!50503 SET NAMES utf8mb4 */; +/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; +/*!40103 SET TIME_ZONE='+00:00' */; +/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */; +/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; +/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; +/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; + +-- +-- Current Database: `gaia` +-- + +CREATE DATABASE /*!32312 IF NOT EXISTS*/ `gaia` /*!40100 DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci */ /*!80016 DEFAULT ENCRYPTION='N' */; + +USE `gaia`; + +-- +-- Table structure for table `genes` +-- + +DROP TABLE IF EXISTS `genes`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `genes` ( + `id` int NOT NULL AUTO_INCREMENT, + `species` varchar(64) NOT NULL, + `locus` varchar(64) DEFAULT NULL, + `geneid` varchar(32) DEFAULT NULL, + PRIMARY KEY (`id`), + KEY
`idx_locus` (`locus`), + KEY `idx_geneid` (`geneid`) +) ENGINE=InnoDB AUTO_INCREMENT=3 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `genes` +-- + +LOCK TABLES `genes` WRITE; +/*!40000 ALTER TABLE `genes` DISABLE KEYS */; +INSERT INTO `genes` VALUES (1,'Arabidopsis_thaliana','At3g24650','822061'),(2,'Arabidopsis_thaliana','At1g01010','000001'); +/*!40000 ALTER TABLE `genes` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `aliases` +-- + +DROP TABLE IF EXISTS `aliases`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `aliases` ( + `id` int NOT NULL AUTO_INCREMENT, + `genes_id` int NOT NULL, + `alias` varchar(256) NOT NULL, + PRIMARY KEY (`id`), + KEY `FK_genes` (`genes_id`), + KEY `idx_aliases` (`alias`,`genes_id`) +) ENGINE=InnoDB AUTO_INCREMENT=8 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `aliases` +-- + +LOCK TABLES `aliases` WRITE; +/*!40000 ALTER TABLE `aliases` DISABLE KEYS */; +INSERT INTO `aliases` VALUES (1,1,'ABI3'),(2,1,'SIS10'),(3,1,'AtABI3'),(4,1,'ABA INSENSITIVE 3'),(5,1,'ABSCISIC ACID INSENSITIVE 3'),(6,1,'SUGAR INSENSITIVE 10'),(7,2,'NOMATCH4'); +/*!40000 ALTER TABLE `aliases` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `publication_figures` +-- + +DROP TABLE IF EXISTS `publication_figures`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `publication_figures` ( + `id` int NOT NULL AUTO_INCREMENT, + `title` varchar(512) DEFAULT NULL, + `abstract` text, + PRIMARY KEY (`id`) +) ENGINE=InnoDB AUTO_INCREMENT=61 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table 
`publication_figures` +-- +-- NOTE: pub 60 (PMC6403161) title/abstract are stored TRUNCATED in the prod corpus +-- (title ends "... Responses in ", abstract ends "... selected family members in ") — kept VERBATIM. + +LOCK TABLES `publication_figures` WRITE; +/*!40000 ALTER TABLE `publication_figures` DISABLE KEYS */; +INSERT INTO `publication_figures` VALUES (10,'Abscisic acid signaling in seeds and seedlings.','Review of ABA signaling pathways in seeds and seedlings.'),(20,'Unrelated paper A (collision partner 1).','Abstract A.'),(30,'Unrelated paper B (collision partner 2 + null url).','Abstract B.'),(40,'Unrelated paper C (false-positive container).','Abstract C.'),(60,'AP2/ERF Transcription Factor Regulatory Networks in Hormone and Abiotic Stress Responses in ','Dynamic environmental changes such as extreme temperature, water scarcity and high salinity affect plant growth, survival, and reproduction. Plants have evolved sophisticated regulatory mechanisms to adapt to these unfavorable conditions, many of which interface with plant hormone signaling pathways. Abiotic stresses alter the production and distribution of phytohormones that in turn mediate stress responses at least in part through hormone- and stress-responsive transcription factors. Among these, the APETALA2/ETHYLENE RESPONSIVE FACTOR (AP2/ERF) family transcription factors (AP2/ERFs) have emerged as key regulators of various stress responses, in which they also respond to hormones with improved plant survival during stress conditions. Apart from participation in specific stresses, AP2/ERFs are involved in a wide range of stress tolerance, enabling them to form an interconnected stress regulatory network. Additionally, many AP2/ERFs respond to the plant hormones abscisic acid (ABA) and ethylene (ET) to help activate ABA and ET dependent and independent stress-responsive genes. 
While some AP2/ERFs are implicated in growth and developmental processes mediated by gibberellins (GAs), cytokinins (CTK), and brassinosteroids (BRs). The involvement of AP2/ERFs in hormone signaling adds the complexity of stress regulatory network. In this review, we summarize recent studies on AP2/ERF transcription factors in hormonal and abiotic stress responses with an emphasis on selected family members in '); +/*!40000 ALTER TABLE `publication_figures` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `figures` +-- + +DROP TABLE IF EXISTS `figures`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `figures` ( + `id` int NOT NULL AUTO_INCREMENT, + `publication_figures_id` int NOT NULL, + `img_name` varchar(64) NOT NULL, + `caption` text, + `img_url` varchar(265) DEFAULT NULL, + PRIMARY KEY (`id`), + KEY `FK_publication_figures_figures` (`publication_figures_id`) +) ENGINE=InnoDB AUTO_INCREMENT=601 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `figures` +-- + +LOCK TABLES `figures` WRITE; +/*!40000 ALTER TABLE `figures` DISABLE KEYS */; +INSERT INTO `figures` VALUES (100,10,'01-0441f1.jpg','Domain Structure of B3 and bZIP Domain Transcription Factors','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/82d7/151246/8e4f6ebe4139/01-0441f1.jpg'),(101,10,'01-0441f2.jpg','Scheme of Signaling Pathways in Seed Development.','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/82d7/151246/a56d4d6d5560/01-0441f2.jpg'),(102,10,'01-0441f3.jpg','Regulation of ABA-Responsive Promoter Activity in a Rice Embryo Protoplast','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/82d7/151246/67ad8027f184/01-0441f3.jpg'),(103,10,'01-0441f4.jpg','Scheme of Signaling Pathways That Interact with the ABA Regulation of 
Germination.','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/82d7/151246/7be9f20adc7b/01-0441f4.jpg'),(104,10,'01-0441f5.jpg','Sensitivity of Seedlings of Wild-Type, abi, and ABI Overexpressing Lines','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/82d7/151246/3c7ab6e933bb/01-0441f5.jpg'),(200,20,'gr1.jpg','Collision figure (pub A).','https://example.org/PMC900001/gr1.jpg'),(300,30,'gr1.jpg','Collision figure (pub B).','https://example.org/PMC900002/gr1.jpg'),(301,30,'nullfig.jpg','Figure with no URL.',NULL),(400,40,'gabitest.jpg','GABI line figure (false-positive bait).','https://example.org/PMC900003/gabitest.jpg'),(600,60,'fpls-10-00228-g003.jpg','AP2/ERFs roles in hormone pathways. Abiotic stresses alter the production and distribution of phytohormones that in turn mediate stresses responses through hormone signaling components and AP2/ERFs. Arrows and bar ends indicate activation and repression effect, respectively. Figure is created with BioRender.','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/9ceb/6403161/d18acaa7332e/fpls-10-00228-g003.jpg'); +/*!40000 ALTER TABLE `figures` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `pub_ids` +-- + +DROP TABLE IF EXISTS `pub_ids`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `pub_ids` ( + `id` int NOT NULL AUTO_INCREMENT, + `publication_figures_id` int NOT NULL, + `pubmed` varchar(16) DEFAULT NULL, + `pmc` varchar(16) DEFAULT NULL, + PRIMARY KEY (`id`), + KEY `FK_publication_figures_pub_ids` (`publication_figures_id`) +) ENGINE=InnoDB AUTO_INCREMENT=61 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `pub_ids` +-- + +LOCK TABLES `pub_ids` WRITE; +/*!40000 ALTER TABLE `pub_ids` DISABLE KEYS */; +INSERT INTO `pub_ids` VALUES 
(10,10,'12045268','PMC151246'),(20,20,'30000001','PMC900001'),(30,30,'30000002','PMC900002'),(40,40,'30000003','PMC900003'),(60,60,'30873200','PMC6403161'); +/*!40000 ALTER TABLE `pub_ids` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `author_list` +-- + +DROP TABLE IF EXISTS `author_list`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `author_list` ( + `id` int NOT NULL AUTO_INCREMENT, + `publication_figures_id` int NOT NULL, + `author` varchar(128) NOT NULL, + PRIMARY KEY (`id`), + KEY `FK_publication_figures_author_list` (`publication_figures_id`) +) ENGINE=InnoDB AUTO_INCREMENT=9 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `author_list` +-- + +LOCK TABLES `author_list` WRITE; +/*!40000 ALTER TABLE `author_list` DISABLE KEYS */; +INSERT INTO `author_list` VALUES (1,10,'Finkelstein RR'),(2,10,'Gampala SSL'),(3,10,'Rock CD'),(5,60,'Zhouli Xie'),(6,60,'Trevor M Nolan'),(7,60,'Hao Jiang'),(8,60,'Yanhai Yin'); +/*!40000 ALTER TABLE `author_list` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `figure_models` +-- + +DROP TABLE IF EXISTS `figure_models`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `figure_models` ( + `id` int NOT NULL AUTO_INCREMENT, + `data` json DEFAULT NULL, + PRIMARY KEY (`id`) +) ENGINE=InnoDB AUTO_INCREMENT=5 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `figure_models` +-- + +LOCK TABLES `figure_models` WRITE; +/*!40000 ALTER TABLE `figure_models` DISABLE KEYS */; +INSERT INTO `figure_models` VALUES (1,'{"gene": true, "word": "abi3", "image": [{"bbox": [[[268,108],[317,108],[317,118],[268,118]],[[45,110],[72,111],[72,120],[45,119]]], "imageName": 
"/01-0441f2.jpg"},{"bbox": [[[274,96],[298,96],[298,105],[274,105]]], "imageName": "/01-0441f4.jpg"},{"bbox": [[[10,10],[40,10],[40,20],[10,20]]], "imageName": "/gr1.jpg"},{"bbox": [[[5,5],[25,5],[25,15],[5,15]]], "imageName": "/nullfig.jpg"}]}'),(3,'{"gene": false, "word": "gabi390_r", "image": [{"bbox": [[[200,200],[260,200],[260,212],[200,212]]], "imageName": "/gabitest.jpg"}]}'),(4,'{"gene": false, "word": "abi3/vp1", "image": [{"bbox": [[[34, 357], [96, 357], [96, 369], [34, 369]]], "imageName": "/fpls-10-00228-g003.jpg"}]}'); +/*!40000 ALTER TABLE `figure_models` ENABLE KEYS */; +UNLOCK TABLES; +/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; + +/*!40101 SET SQL_MODE=@OLD_SQL_MODE */; +/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; +/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */; +/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; +/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; +/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; +/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; + +-- Dump completed \ No newline at end of file diff --git a/config/init.sh b/config/init.sh index c3e4a06b..2002159f 100755 --- a/config/init.sh +++ b/config/init.sh @@ -48,6 +48,7 @@ mysql -u $DB_USER -p$DB_PASS < ./config/databases/striga.sql mysql -u $DB_USER -p$DB_PASS < ./config/databases/tomato_nssnp.sql mysql -u $DB_USER -p$DB_PASS < ./config/databases/tomato_sequence.sql mysql -u $DB_USER -p$DB_PASS < ./config/databases/triphysaria.sql +mysql -u $DB_USER -p$DB_PASS < ./config/databases/gaia.sql echo "Data are now loaded. Preparing API config" echo "Please manually edit config file!" 
diff --git a/tests/resources/test_gaia.py b/tests/resources/test_gaia.py new file mode 100644 index 00000000..6de90de3 --- /dev/null +++ b/tests/resources/test_gaia.py @@ -0,0 +1,81 @@ +from api import app, db +from unittest import TestCase +from sqlalchemy.exc import UnboundExecutionError + + +class TestGaiaPublicationFiguresByGene(TestCase): + def setUp(self): + self.app_client = app.test_client() + with app.app_context(): + try: + self.gaia_is_mysql = db.engines["gaia"].dialect.name == "mysql" + except (KeyError, UnboundExecutionError): + self.gaia_is_mysql = False + + def _require_mysql(self): + if not self.gaia_is_mysql: + self.skipTest("requires MySQL gaia bind; skipped under SQLite harness") + + def test_publication_figures_by_gene_abi3(self): + """ABI3 should surface OCR-detected figures grouped by PMC, exercising every + fixture rule (match, word-boundary catch/reject, bare-name guard, null-url skip). + :return: + """ + self._require_mysql() + response = self.app_client.get("/gaia/publication_figures_by_gene/ABI3") + self.assertEqual(response.status_code, 200) + + body = response.json + self.assertTrue(body["wasSuccessful"]) + data = body["data"] + figures = data["figures"] + + # ABI3 returns two REAL pubs -> assert by set membership, not position. + self.assertLessEqual({"PMC6403161", "PMC151246"}, set(figures)) + + # PMC151246: abi3 OCR'd on f2 + f4 only (NOT f1/f3/f5). + self.assertIn("PMC151246", figures) + pmc151246_names = {f["img_name"] for f in figures["PMC151246"]["figures"]} + self.assertIn("01-0441f2.jpg", pmc151246_names) + self.assertIn("01-0441f4.jpg", pmc151246_names) + self.assertNotIn("01-0441f1.jpg", pmc151246_names) + self.assertNotIn("01-0441f3.jpg", pmc151246_names) + self.assertNotIn("01-0441f5.jpg", pmc151246_names) + + # PMC6403161: word-boundary match must CATCH the gene:false word abi3/vp1 -> + # fpls-10-00228-g003.jpg. Exact IN(...) would miss it -> this pins boundary-vs-exact. 
+ self.assertIn("PMC6403161", figures) + pmc6403161_names = {f["img_name"] for f in figures["PMC6403161"]["figures"]} + self.assertIn("fpls-10-00228-g003.jpg", pmc6403161_names) + + # Across all PMCs: gr1.jpg dropped (bare-name guard), nullfig.jpg dropped + # (null img_url), gabitest.jpg dropped (gabi390_r boundary-rejected). + all_names = {f["img_name"] for pmc in figures.values() for f in pmc["figures"]} + self.assertNotIn("gr1.jpg", all_names) + self.assertNotIn("nullfig.jpg", all_names) + self.assertNotIn("gabitest.jpg", all_names) + + # allImageWords powers the gene-name filter. + self.assertIn("abi3", data["allImageWords"]) + + # Authors are attached per publication. + self.assertIn("Finkelstein RR", figures["PMC151246"]["authors"]) + + def test_publication_figures_by_gene_empty_payload(self): + """A valid gene with no OCR-matched figures returns 200 with an empty payload, + not a 404. + :return: + """ + self._require_mysql() + response = self.app_client.get("/gaia/publication_figures_by_gene/NOMATCH4") + self.assertEqual(response.status_code, 200) + expected = {"wasSuccessful": True, "data": {"figures": {}, "allImageWords": {}}} + self.assertEqual(response.json, expected) + + def test_publication_figures_by_gene_invalid_identifier(self): + """An identifier failing the gaia alias check returns a 400 error. + :return: + """ + response = self.app_client.get("/gaia/publication_figures_by_gene/abc!def") + expected = {"wasSuccessful": False, "error": "Invalid identifier"} + self.assertEqual(response.json, expected) From be250419f20c9324e38d0ed35cc0c664cbac3938 Mon Sep 17 00:00:00 2001 From: Vin Date: Tue, 23 Jun 2026 03:29:33 -0400 Subject: [PATCH 2/2] gaia: cover short-alias exact-IN branch + harden null bbox handling - gaia.sql: add gene 34G (3-char alias) test group with the exact OCR word 34g, a boundary decoy 34g/x and a substring decoy x34gy on their own figures, plus a malformed OCR entry (imageName with no bbox). 
Pins that short aliases (<=3 chars) match exact-only, never via the word-boundary regex or a LIKE. This is because the prod db only has gene aliases that are at least 3 characters long. - resources/gaia.py: unpack records the image name even when an OCR entry has no bbox (so the figure still displays) but skips the None, keeping each figure's bbox list clean for the frontend's box-drawing. - test_gaia.py: add the 34G short-alias test (gated MySQL-only via the skip guard) asserting exact match, decoy exclusion, and a None-free bbox list. --- api/resources/gaia.py | 59 ++++++++++++++++++------------------ config/databases/gaia.sql | 28 ++++++++--------- tests/resources/test_gaia.py | 39 ++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 43 deletions(-) diff --git a/api/resources/gaia.py b/api/resources/gaia.py index aae863e8..7fcaa5df 100644 --- a/api/resources/gaia.py +++ b/api/resources/gaia.py @@ -182,16 +182,19 @@ def post(self): class GaiaPublicationFiguresByGene(Resource): @gaia.param("identifier", _in="path", default="ABI3") def get(self, identifier=""): - # Escape input and validate + + # Escape input identifier = escape(identifier) + + # Is it valid if not BARUtils.is_gaia_alias(identifier): return BARUtils.error_exit("Invalid identifier"), 400 - # --- Resolve identifier -> gene id(s) -> the gene's full alias set --- + # Resolve to gene ids: try alias first, then locus / ncbi id rows = db.session.execute(db.select(Aliases.genes_id).filter(Aliases.alias == identifier)).fetchall() gene_ids = [r.genes_id for r in rows] - if not gene_ids: # not an alias - try locus / ncbi geneid + if not gene_ids: rows = db.session.execute( db.select(Genes.id).filter(or_(Genes.locus == identifier, Genes.geneid == identifier)) ).fetchall() @@ -200,22 +203,20 @@ def get(self, identifier=""): if not gene_ids: return BARUtils.error_exit("Nothing found"), 404 + # Get the gene's full alias set aliases = [ r.alias.lower() for r in
db.session.execute(db.select(Aliases.alias).filter(Aliases.genes_id.in_(gene_ids))).fetchall() ] - # --- Build the OCR word-boundary match (V1-resolved policy, no gene:true filter) --- - # Aliases >= 4 chars: whole-token regex match (catches "abi3/vp1", rejects "gabi390_r"). - # Aliases <= 3 chars: exact IN(...) - a boundary match on a short token over-matches. + # Match OCR words: word-boundary regex for long aliases, exact match for short ones long_aliases = sorted({re.escape(a) for a in aliases if len(a) >= 4}) short_aliases = sorted({a for a in aliases if len(a) < 4}) - # Gene resolved but no usable aliases -> empty payload (frontend falls back to PubMed feed). + # No usable aliases, nothing to match on if not long_aliases and not short_aliases: return BARUtils.success_exit({"figures": {}, "allImageWords": {}}) - # --- Step 1: match on the scalar OCR word (pure Core; no array explosion) --- word_expr = func.lower(func.json_unquote(func.json_extract(FigureModels.data, "$.word"))) match_conds = [] if long_aliases: @@ -228,26 +229,31 @@ def get(self, identifier=""): if not matched_rows: return BARUtils.success_exit({"figures": {}, "allImageWords": {}}) - # --- Step 2: unpack matched rows' image[] in Python -> name -> [bbox] (data: dict or str) --- + # Collect each matched image and its boxes (keep the image even if a box is missing) bbox_by_name = {} for row in matched_rows: d = row.data if isinstance(row.data, dict) else json.loads(row.data) for img in d.get("image", []): name = (img.get("imageName") or "").lstrip("/") - if name: - bbox_by_name.setdefault(name, []).append(img.get("bbox")) + if not name: + continue + bbox_list = bbox_by_name.setdefault(name, []) + bbox = img.get("bbox") + if bbox is not None: + bbox_list.append(bbox) stripped_names = list(bbox_by_name.keys()) if not stripped_names: return BARUtils.success_exit({"figures": {}, "allImageWords": {}}) - # --- Step 3: figures + publication metadata (pure Core; bbox now comes from Python) --- - # 
Bare-name guard: exclude any img_name mapping to >1 publication (un-attributable detections). + # Drop image names used by more than one publication, we can't attribute those collision = ( db.select(Figures.img_name) .group_by(Figures.img_name) .having(func.count(func.distinct(Figures.publication_figures_id)) > 1) ) + + # Pull the figures and their publication info, skip null urls, newest pubmed first core_stmt = ( db.select( PubIds.pmc, @@ -269,21 +275,16 @@ def get(self, identifier=""): ) fig_rows = db.session.execute(core_stmt).fetchall() - # Valid gene, but no OCR-matched figures -> empty payload (NOT 404). if not fig_rows: return BARUtils.success_exit({"figures": {}, "allImageWords": {}}) - # --- Step 4: group by PMC; dedupe figures by img_name; attach the full accumulated bbox list --- - # Dedup guard: the collision guard only excludes names spanning >1 publication, so two figures - # rows with the same name under one pf_id could still duplicate. Surviving img_name -> pmc is - # 1:1, so a seen-set on img_name is sufficient; bbox_by_name[name] already holds the full list. 
+ # Group figures by PMC, one entry per image name figures_by_pmc, pmc_to_pf, pf_ids, seen_names = {}, {}, set(), set() for r in fig_rows: - pmc = r.pmc pf_ids.add(r.pf_id) - pmc_to_pf[pmc] = r.pf_id - if pmc not in figures_by_pmc: - figures_by_pmc[pmc] = { + pmc_to_pf[r.pmc] = r.pf_id + if r.pmc not in figures_by_pmc: + figures_by_pmc[r.pmc] = { "title": r.title, "abstract": r.abstract, "authors": [], @@ -293,7 +294,7 @@ def get(self, identifier=""): if r.img_name in seen_names: continue seen_names.add(r.img_name) - figures_by_pmc[pmc]["figures"].append( + figures_by_pmc[r.pmc]["figures"].append( { "img_name": r.img_name, "img_url": r.img_url, @@ -302,7 +303,7 @@ def get(self, identifier=""): } ) - # --- Step 5: authors per publication (one bulk query; db.select infers the gaia bind) --- + # Attach authors to each publication authors_by_pf = {} for r in db.session.execute( db.select(AuthorList.publication_figures_id, AuthorList.author).filter( @@ -313,13 +314,11 @@ def get(self, identifier=""): for pmc, pf_id in pmc_to_pf.items(): figures_by_pmc[pmc]["authors"] = authors_by_pf.get(pf_id, []) - # --- Step 6: allImageWords (pure Core JSON_OVERLAPS pre-filter + Python unpack) --- - # gene:true words detected on the displayed figures. JSON_OVERLAPS pre-filters rows that share - # any displayed image; the Python pass keeps only the displayed names. All gaia-bound -> bind inferred. 
+ # allImageWords: gene words detected on the shown figures, for the gene-name filter displayed_names = list({r.img_name for r in fig_rows}) all_image_words = {} if displayed_names: - displayed_slashed = json.dumps(["/" + n for n in displayed_names]) # re-add slash to match stored + displayed_slashed = json.dumps(["/" + n for n in displayed_names]) # stored names keep a leading / words_rows = db.session.execute( db.select(FigureModels.data) .where(func.json_unquote(func.json_extract(FigureModels.data, "$.gene")) == "true") @@ -337,6 +336,8 @@ def get(self, identifier=""): for img in d.get("image", []): name = (img.get("imageName") or "").lstrip("/") if name in displayed_set: - all_image_words.setdefault(word, {})[name] = img.get("bbox") + bbox = img.get("bbox") + all_image_words.setdefault(word, {})[name] = bbox if bbox is not None else [] + # Return final data return BARUtils.success_exit({"figures": figures_by_pmc, "allImageWords": all_image_words}) diff --git a/config/databases/gaia.sql b/config/databases/gaia.sql index c15a7b44..b3f9a858 100644 --- a/config/databases/gaia.sql +++ b/config/databases/gaia.sql @@ -51,7 +51,7 @@ CREATE TABLE `genes` ( PRIMARY KEY (`id`), KEY `idx_locus` (`locus`), KEY `idx_geneid` (`geneid`) -) ENGINE=InnoDB AUTO_INCREMENT=3 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; +) ENGINE=InnoDB AUTO_INCREMENT=4 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; /*!40101 SET character_set_client = @saved_cs_client */; -- @@ -60,7 +60,7 @@ CREATE TABLE `genes` ( LOCK TABLES `genes` WRITE; /*!40000 ALTER TABLE `genes` DISABLE KEYS */; -INSERT INTO `genes` VALUES (1,'Arabidopsis_thaliana','At3g24650','822061'),(2,'Arabidopsis_thaliana','At1g01010','000001'); +INSERT INTO `genes` VALUES (1,'Arabidopsis_thaliana','At3g24650','822061'),(2,'Arabidopsis_thaliana','At1g01010','000001'),(3,'Arabidopsis_thaliana','At5g12340','999340'); /*!40000 ALTER TABLE `genes` ENABLE KEYS */; UNLOCK TABLES; @@ -78,7 +78,7 @@ CREATE TABLE `aliases` ( PRIMARY 
KEY (`id`), KEY `FK_genes` (`genes_id`), KEY `idx_aliases` (`alias`,`genes_id`) -) ENGINE=InnoDB AUTO_INCREMENT=8 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; +) ENGINE=InnoDB AUTO_INCREMENT=9 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; /*!40101 SET character_set_client = @saved_cs_client */; -- @@ -87,7 +87,7 @@ CREATE TABLE `aliases` ( LOCK TABLES `aliases` WRITE; /*!40000 ALTER TABLE `aliases` DISABLE KEYS */; -INSERT INTO `aliases` VALUES (1,1,'ABI3'),(2,1,'SIS10'),(3,1,'AtABI3'),(4,1,'ABA INSENSITIVE 3'),(5,1,'ABSCISIC ACID INSENSITIVE 3'),(6,1,'SUGAR INSENSITIVE 10'),(7,2,'NOMATCH4'); +INSERT INTO `aliases` VALUES (1,1,'ABI3'),(2,1,'SIS10'),(3,1,'AtABI3'),(4,1,'ABA INSENSITIVE 3'),(5,1,'ABSCISIC ACID INSENSITIVE 3'),(6,1,'SUGAR INSENSITIVE 10'),(7,2,'NOMATCH4'),(8,3,'34G'); /*!40000 ALTER TABLE `aliases` ENABLE KEYS */; UNLOCK TABLES; @@ -103,7 +103,7 @@ CREATE TABLE `publication_figures` ( `title` varchar(512) DEFAULT NULL, `abstract` text, PRIMARY KEY (`id`) -) ENGINE=InnoDB AUTO_INCREMENT=61 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; +) ENGINE=InnoDB AUTO_INCREMENT=71 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; /*!40101 SET character_set_client = @saved_cs_client */; -- @@ -114,7 +114,7 @@ CREATE TABLE `publication_figures` ( LOCK TABLES `publication_figures` WRITE; /*!40000 ALTER TABLE `publication_figures` DISABLE KEYS */; -INSERT INTO `publication_figures` VALUES (10,'Abscisic acid signaling in seeds and seedlings.','Review of ABA signaling pathways in seeds and seedlings.'),(20,'Unrelated paper A (collision partner 1).','Abstract A.'),(30,'Unrelated paper B (collision partner 2 + null url).','Abstract B.'),(40,'Unrelated paper C (false-positive container).','Abstract C.'),(60,'AP2/ERF Transcription Factor Regulatory Networks in Hormone and Abiotic Stress Responses in ','Dynamic environmental changes such as extreme temperature, water scarcity and high salinity affect plant growth, survival, and reproduction. 
Plants have evolved sophisticated regulatory mechanisms to adapt to these unfavorable conditions, many of which interface with plant hormone signaling pathways. Abiotic stresses alter the production and distribution of phytohormones that in turn mediate stress responses at least in part through hormone- and stress-responsive transcription factors. Among these, the APETALA2/ETHYLENE RESPONSIVE FACTOR (AP2/ERF) family transcription factors (AP2/ERFs) have emerged as key regulators of various stress responses, in which they also respond to hormones with improved plant survival during stress conditions. Apart from participation in specific stresses, AP2/ERFs are involved in a wide range of stress tolerance, enabling them to form an interconnected stress regulatory network. Additionally, many AP2/ERFs respond to the plant hormones abscisic acid (ABA) and ethylene (ET) to help activate ABA and ET dependent and independent stress-responsive genes. While some AP2/ERFs are implicated in growth and developmental processes mediated by gibberellins (GAs), cytokinins (CTK), and brassinosteroids (BRs). The involvement of AP2/ERFs in hormone signaling adds the complexity of stress regulatory network. In this review, we summarize recent studies on AP2/ERF transcription factors in hormonal and abiotic stress responses with an emphasis on selected family members in '); +INSERT INTO `publication_figures` VALUES (10,'Abscisic acid signaling in seeds and seedlings.','Review of ABA signaling pathways in seeds and seedlings.'),(20,'Unrelated paper A (collision partner 1).','Abstract A.'),(30,'Unrelated paper B (collision partner 2 + null url).','Abstract B.'),(40,'Unrelated paper C (false-positive container).','Abstract C.'),(60,'AP2/ERF Transcription Factor Regulatory Networks in Hormone and Abiotic Stress Responses in ','Dynamic environmental changes such as extreme temperature, water scarcity and high salinity affect plant growth, survival, and reproduction. 
Plants have evolved sophisticated regulatory mechanisms to adapt to these unfavorable conditions, many of which interface with plant hormone signaling pathways. Abiotic stresses alter the production and distribution of phytohormones that in turn mediate stress responses at least in part through hormone- and stress-responsive transcription factors. Among these, the APETALA2/ETHYLENE RESPONSIVE FACTOR (AP2/ERF) family transcription factors (AP2/ERFs) have emerged as key regulators of various stress responses, in which they also respond to hormones with improved plant survival during stress conditions. Apart from participation in specific stresses, AP2/ERFs are involved in a wide range of stress tolerance, enabling them to form an interconnected stress regulatory network. Additionally, many AP2/ERFs respond to the plant hormones abscisic acid (ABA) and ethylene (ET) to help activate ABA and ET dependent and independent stress-responsive genes. While some AP2/ERFs are implicated in growth and developmental processes mediated by gibberellins (GAs), cytokinins (CTK), and brassinosteroids (BRs). The involvement of AP2/ERFs in hormone signaling adds the complexity of stress regulatory network. 
In this review, we summarize recent studies on AP2/ERF transcription factors in hormonal and abiotic stress responses with an emphasis on selected family members in '),(70,'Short-code gene 34G test figure.','Abstract for the 34G short-alias exact-match test.'); /*!40000 ALTER TABLE `publication_figures` ENABLE KEYS */; UNLOCK TABLES; @@ -133,7 +133,7 @@ CREATE TABLE `figures` ( `img_url` varchar(265) DEFAULT NULL, PRIMARY KEY (`id`), KEY `FK_publication_figures_figures` (`publication_figures_id`) -) ENGINE=InnoDB AUTO_INCREMENT=601 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; +) ENGINE=InnoDB AUTO_INCREMENT=704 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; /*!40101 SET character_set_client = @saved_cs_client */; -- @@ -142,7 +142,7 @@ CREATE TABLE `figures` ( LOCK TABLES `figures` WRITE; /*!40000 ALTER TABLE `figures` DISABLE KEYS */; -INSERT INTO `figures` VALUES (100,10,'01-0441f1.jpg','Domain Structure of B3 and bZIP Domain Transcription Factors','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/82d7/151246/8e4f6ebe4139/01-0441f1.jpg'),(101,10,'01-0441f2.jpg','Scheme of Signaling Pathways in Seed Development.','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/82d7/151246/a56d4d6d5560/01-0441f2.jpg'),(102,10,'01-0441f3.jpg','Regulation of ABA-Responsive Promoter Activity in a Rice Embryo Protoplast','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/82d7/151246/67ad8027f184/01-0441f3.jpg'),(103,10,'01-0441f4.jpg','Scheme of Signaling Pathways That Interact with the ABA Regulation of Germination.','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/82d7/151246/7be9f20adc7b/01-0441f4.jpg'),(104,10,'01-0441f5.jpg','Sensitivity of Seedlings of Wild-Type, abi, and ABI Overexpressing Lines','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/82d7/151246/3c7ab6e933bb/01-0441f5.jpg'),(200,20,'gr1.jpg','Collision figure (pub A).','https://example.org/PMC900001/gr1.jpg'),(300,30,'gr1.jpg','Collision figure (pub B).','https://example.org/PMC900002/gr1.jpg'),(301,30,'nullfig.jpg','Figure with no 
URL.',NULL),(400,40,'gabitest.jpg','GABI line figure (false-positive bait).','https://example.org/PMC900003/gabitest.jpg'),(600,60,'fpls-10-00228-g003.jpg','AP2/ERFs roles in hormone pathways. Abiotic stresses alter the production and distribution of phytohormones that in turn mediate stresses responses through hormone signaling components and AP2/ERFs. Arrows and bar ends indicate activation and repression effect, respectively. Figure is created with BioRender.','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/9ceb/6403161/d18acaa7332e/fpls-10-00228-g003.jpg'); +INSERT INTO `figures` VALUES (100,10,'01-0441f1.jpg','Domain Structure of B3 and bZIP Domain Transcription Factors','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/82d7/151246/8e4f6ebe4139/01-0441f1.jpg'),(101,10,'01-0441f2.jpg','Scheme of Signaling Pathways in Seed Development.','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/82d7/151246/a56d4d6d5560/01-0441f2.jpg'),(102,10,'01-0441f3.jpg','Regulation of ABA-Responsive Promoter Activity in a Rice Embryo Protoplast','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/82d7/151246/67ad8027f184/01-0441f3.jpg'),(103,10,'01-0441f4.jpg','Scheme of Signaling Pathways That Interact with the ABA Regulation of Germination.','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/82d7/151246/7be9f20adc7b/01-0441f4.jpg'),(104,10,'01-0441f5.jpg','Sensitivity of Seedlings of Wild-Type, abi, and ABI Overexpressing Lines','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/82d7/151246/3c7ab6e933bb/01-0441f5.jpg'),(200,20,'gr1.jpg','Collision figure (pub A).','https://example.org/PMC900001/gr1.jpg'),(300,30,'gr1.jpg','Collision figure (pub B).','https://example.org/PMC900002/gr1.jpg'),(301,30,'nullfig.jpg','Figure with no URL.',NULL),(400,40,'gabitest.jpg','GABI line figure (false-positive bait).','https://example.org/PMC900003/gabitest.jpg'),(600,60,'fpls-10-00228-g003.jpg','AP2/ERFs roles in hormone pathways. 
Abiotic stresses alter the production and distribution of phytohormones that in turn mediate stresses responses through hormone signaling components and AP2/ERFs. Arrows and bar ends indicate activation and repression effect, respectively. Figure is created with BioRender.','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/9ceb/6403161/d18acaa7332e/fpls-10-00228-g003.jpg'),(700,70,'fpls-11-01234-g002.jpg','Real 34G figure (valid bbox).','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/aaaa/7000001/aaaa1111aaaa/fpls-11-01234-g002.jpg'),(701,70,'fpls-11-01234-g005.jpg','Boundary decoy figure (word 34g/x).','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/aaaa/7000001/bbbb2222bbbb/fpls-11-01234-g005.jpg'),(702,70,'fpls-11-01234-g009.jpg','Substring decoy figure (word x34gy).','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/aaaa/7000001/cccc3333cccc/fpls-11-01234-g009.jpg'),(703,70,'fpls-11-01234-g003.jpg','Malformed-bbox figure (OCR entry has no bbox).','https://cdn.ncbi.nlm.nih.gov/pmc/blobs/aaaa/7000001/dddd4444dddd/fpls-11-01234-g003.jpg'); /*!40000 ALTER TABLE `figures` ENABLE KEYS */; UNLOCK TABLES; @@ -160,7 +160,7 @@ CREATE TABLE `pub_ids` ( `pmc` varchar(16) DEFAULT NULL, PRIMARY KEY (`id`), KEY `FK_publication_figures_pub_ids` (`publication_figures_id`) -) ENGINE=InnoDB AUTO_INCREMENT=61 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; +) ENGINE=InnoDB AUTO_INCREMENT=71 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; /*!40101 SET character_set_client = @saved_cs_client */; -- @@ -169,7 +169,7 @@ CREATE TABLE `pub_ids` ( LOCK TABLES `pub_ids` WRITE; /*!40000 ALTER TABLE `pub_ids` DISABLE KEYS */; -INSERT INTO `pub_ids` VALUES (10,10,'12045268','PMC151246'),(20,20,'30000001','PMC900001'),(30,30,'30000002','PMC900002'),(40,40,'30000003','PMC900003'),(60,60,'30873200','PMC6403161'); +INSERT INTO `pub_ids` VALUES 
(10,10,'12045268','PMC151246'),(20,20,'30000001','PMC900001'),(30,30,'30000002','PMC900002'),(40,40,'30000003','PMC900003'),(60,60,'30873200','PMC6403161'),(70,70,'31000000','PMC7000001'); /*!40000 ALTER TABLE `pub_ids` ENABLE KEYS */; UNLOCK TABLES; @@ -186,7 +186,7 @@ CREATE TABLE `author_list` ( `author` varchar(128) NOT NULL, PRIMARY KEY (`id`), KEY `FK_publication_figures_author_list` (`publication_figures_id`) -) ENGINE=InnoDB AUTO_INCREMENT=9 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; +) ENGINE=InnoDB AUTO_INCREMENT=10 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; /*!40101 SET character_set_client = @saved_cs_client */; -- @@ -195,7 +195,7 @@ CREATE TABLE `author_list` ( LOCK TABLES `author_list` WRITE; /*!40000 ALTER TABLE `author_list` DISABLE KEYS */; -INSERT INTO `author_list` VALUES (1,10,'Finkelstein RR'),(2,10,'Gampala SSL'),(3,10,'Rock CD'),(5,60,'Zhouli Xie'),(6,60,'Trevor M Nolan'),(7,60,'Hao Jiang'),(8,60,'Yanhai Yin'); +INSERT INTO `author_list` VALUES (1,10,'Finkelstein RR'),(2,10,'Gampala SSL'),(3,10,'Rock CD'),(5,60,'Zhouli Xie'),(6,60,'Trevor M Nolan'),(7,60,'Hao Jiang'),(8,60,'Yanhai Yin'),(9,70,'Test Author A'); /*!40000 ALTER TABLE `author_list` ENABLE KEYS */; UNLOCK TABLES; @@ -210,7 +210,7 @@ CREATE TABLE `figure_models` ( `id` int NOT NULL AUTO_INCREMENT, `data` json DEFAULT NULL, PRIMARY KEY (`id`) -) ENGINE=InnoDB AUTO_INCREMENT=5 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; +) ENGINE=InnoDB AUTO_INCREMENT=8 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; /*!40101 SET character_set_client = @saved_cs_client */; -- @@ -219,7 +219,7 @@ CREATE TABLE `figure_models` ( LOCK TABLES `figure_models` WRITE; /*!40000 ALTER TABLE `figure_models` DISABLE KEYS */; -INSERT INTO `figure_models` VALUES (1,'{"gene": true, "word": "abi3", "image": [{"bbox": [[[268,108],[317,108],[317,118],[268,118]],[[45,110],[72,111],[72,120],[45,119]]], "imageName": "/01-0441f2.jpg"},{"bbox": [[[274,96],[298,96],[298,105],[274,105]]], 
"imageName": "/01-0441f4.jpg"},{"bbox": [[[10,10],[40,10],[40,20],[10,20]]], "imageName": "/gr1.jpg"},{"bbox": [[[5,5],[25,5],[25,15],[5,15]]], "imageName": "/nullfig.jpg"}]}'),(3,'{"gene": false, "word": "gabi390_r", "image": [{"bbox": [[[200,200],[260,200],[260,212],[200,212]]], "imageName": "/gabitest.jpg"}]}'),(4,'{"gene": false, "word": "abi3/vp1", "image": [{"bbox": [[[34, 357], [96, 357], [96, 369], [34, 369]]], "imageName": "/fpls-10-00228-g003.jpg"}]}'); +INSERT INTO `figure_models` VALUES (1,'{"gene": true, "word": "abi3", "image": [{"bbox": [[[268,108],[317,108],[317,118],[268,118]],[[45,110],[72,111],[72,120],[45,119]]], "imageName": "/01-0441f2.jpg"},{"bbox": [[[274,96],[298,96],[298,105],[274,105]]], "imageName": "/01-0441f4.jpg"},{"bbox": [[[10,10],[40,10],[40,20],[10,20]]], "imageName": "/gr1.jpg"},{"bbox": [[[5,5],[25,5],[25,15],[5,15]]], "imageName": "/nullfig.jpg"}]}'),(3,'{"gene": false, "word": "gabi390_r", "image": [{"bbox": [[[200,200],[260,200],[260,212],[200,212]]], "imageName": "/gabitest.jpg"}]}'),(4,'{"gene": false, "word": "abi3/vp1", "image": [{"bbox": [[[34, 357], [96, 357], [96, 369], [34, 369]]], "imageName": "/fpls-10-00228-g003.jpg"}]}'),(5,'{"gene": true, "word": "34g", "image": [{"bbox": [[[10,10],[40,10],[40,20],[10,20]]], "imageName": "/fpls-11-01234-g002.jpg"},{"imageName": "/fpls-11-01234-g003.jpg"}]}'),(6,'{"gene": false, "word": "34g/x", "image": [{"bbox": [[[50,50],[80,50],[80,60],[50,60]]], "imageName": "/fpls-11-01234-g005.jpg"}]}'),(7,'{"gene": false, "word": "x34gy", "image": [{"bbox": [[[70,70],[90,70],[90,80],[70,80]]], "imageName": "/fpls-11-01234-g009.jpg"}]}'); /*!40000 ALTER TABLE `figure_models` ENABLE KEYS */; UNLOCK TABLES; /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; diff --git a/tests/resources/test_gaia.py b/tests/resources/test_gaia.py index 6de90de3..dd5e7e60 100644 --- a/tests/resources/test_gaia.py +++ b/tests/resources/test_gaia.py @@ -72,6 +72,45 @@ def 
test_publication_figures_by_gene_empty_payload(self): expected = {"wasSuccessful": True, "data": {"figures": {}, "allImageWords": {}}} self.assertEqual(response.json, expected) + def test_publication_figures_by_gene_short_alias_34g(self): + """A short (<=3 char) alias uses EXACT-IN matching, not the word-boundary regex. + 34G must match the OCR word '34g' exactly and reject the boundary-delimited decoy + '34g/x' and the substring decoy 'x34gy'. Also covers the malformed (no-bbox) OCR + entry -> the figure still returns with a clean bbox list (no None). + :return: + """ + self._require_mysql() + response = self.app_client.get("/gaia/publication_figures_by_gene/34G") + self.assertEqual(response.status_code, 200) + + body = response.json + self.assertTrue(body["wasSuccessful"]) + figures = body["data"]["figures"] + + self.assertIn("PMC7000001", figures) + names = {f["img_name"] for f in figures["PMC7000001"]["figures"]} + + # Exact match on '34g' returns its real figure (g002) and the malformed-bbox figure (g003). + self.assertIn("fpls-11-01234-g002.jpg", names) + self.assertIn("fpls-11-01234-g003.jpg", names) + + # Exact-IN must NOT match the boundary decoy '34g/x' (the regex WOULD have) nor the + # substring decoy 'x34gy' (a LIKE would have) -> both figures absent. + self.assertNotIn("fpls-11-01234-g005.jpg", names) + self.assertNotIn("fpls-11-01234-g009.jpg", names) + + # Malformed OCR entry (imageName but no bbox) -> figure returns with a clean, None-free + # bbox list (here empty, since its only OCR entry had no box). + by_name = {f["img_name"]: f for f in figures["PMC7000001"]["figures"]} + self.assertEqual(by_name["fpls-11-01234-g003.jpg"]["bbox"], []) + for fig in figures["PMC7000001"]["figures"]: + self.assertNotIn(None, fig["bbox"]) + + # allImageWords keeps the malformed image's key but maps it to [] (not null, not absent), + # so the gene-name filter still lists it without the frontend choking on a null box. 
+ words_34g = body["data"]["allImageWords"]["34g"] + self.assertEqual(words_34g["fpls-11-01234-g003.jpg"], []) + def test_publication_figures_by_gene_invalid_identifier(self): """An identifier failing the gaia alias check returns a 400 error. :return: