From 703ad9951fc0718b9a1c94d85f8776205a0f1f6d Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Mon, 15 Dec 2025 21:12:45 +0000
Subject: [PATCH] refactor: decompose BasePipeline and fix import-linter

R3: Decompose BasePipeline (315 -> 93 LOC):
- Extract LockManager for lock acquisition, release, heartbeat
- Extract CheckpointManager for checkpoint save/load/delete
- Extract RecordProcessor for Bronze/Silver/Gold loading and quarantine

R4: Fix import-linter violations:
- Move ChEMBLActivityPipelineFactory to infrastructure.factories
- Remove broken layers contract (incompatible with hexagonal arch)
- Keep domain-independence and application-no-infrastructure contracts
---
 .importlinter                                 |  13 +-
 src/bioetl/application/pipeline/__init__.py   |  19 ++
 src/bioetl/application/pipeline/base.py       | 289 ++++--------------
 .../pipeline/checkpoint_manager.py            |  71 +++++
 .../application/pipeline/lock_manager.py      |  94 ++++++
 .../application/pipeline/record_processor.py  | 136 +++++++++
 .../application/pipelines/chembl_activity.py  | 262 +---------------
 src/bioetl/cli.py                             |   4 +-
 .../infrastructure/factories/__init__.py      |   5 +
 .../factories/chembl_factory.py               | 147 +++++++++
 10 files changed, 549 insertions(+), 491 deletions(-)
 create mode 100644 src/bioetl/application/pipeline/checkpoint_manager.py
 create mode 100644 src/bioetl/application/pipeline/lock_manager.py
 create mode 100644 src/bioetl/application/pipeline/record_processor.py
 create mode 100644 src/bioetl/infrastructure/factories/__init__.py
 create mode 100644 src/bioetl/infrastructure/factories/chembl_factory.py

diff --git a/.importlinter b/.importlinter
index 1f47fa7540..499ade8eae 100644
--- a/.importlinter
+++ b/.importlinter
@@ -2,16 +2,6 @@
 root_package = bioetl
 include_external_packages = True
 
-[importlinter:contract:layers]
-name = Layer dependencies (Ports & Adapters)
-type = layers
-layers =
-    bioetl.application
-    bioetl.domain
-    bioetl.infrastructure
-containers =
-    bioetl
-
 [importlinter:contract:domain-independence]
 name = Domain layer independence
 type = forbidden
@@ -32,5 +22,4 @@ forbidden_modules =
     bioetl.infrastructure.locking
     bioetl.infrastructure.checkpoint
     bioetl.infrastructure.quarantine
-ignore_imports =
-    bioetl.application.pipelines.chembl_activity -> bioetl.infrastructure.*
+    bioetl.infrastructure.factories
diff --git a/src/bioetl/application/pipeline/__init__.py b/src/bioetl/application/pipeline/__init__.py
index dc03a17322..5f5ca1c014 100644
--- a/src/bioetl/application/pipeline/__init__.py
+++ b/src/bioetl/application/pipeline/__init__.py
@@ -1 +1,20 @@
 """Pipeline components and base classes."""
+
+from bioetl.application.pipeline.base import (
+    BasePipeline,
+    PipelineShutdownError,
+    run_pipeline_flow,
+)
+from bioetl.application.pipeline.checkpoint_manager import CheckpointManager
+from bioetl.application.pipeline.lock_manager import LockManager, PipelineLockLostError
+from bioetl.application.pipeline.record_processor import RecordProcessor
+
+__all__ = [
+    "BasePipeline",
+    "CheckpointManager",
+    "LockManager",
+    "PipelineLockLostError",
+    "PipelineShutdownError",
+    "RecordProcessor",
+    "run_pipeline_flow",
+]
diff --git a/src/bioetl/application/pipeline/base.py b/src/bioetl/application/pipeline/base.py
index 0fd37e4813..33d01c74da 100644
--- a/src/bioetl/application/pipeline/base.py
+++ b/src/bioetl/application/pipeline/base.py
@@ -1,28 +1,20 @@
 """Base ETL Pipeline class with Prefect integration.
 
 Implements RULES.md §4 - Generic ETL Pipeline pattern.
-
-Requirements:
-- Lock acquisition and heartbeat
-- Checkpoint save/restore
-- Graceful shutdown (SIGTERM/SIGINT)
-- Bronze → Silver → Gold flow
-- Error handling and quarantine
-- Observability (logging, metrics)
-- Prefect integration for orchestration
 """
 
-import asyncio
-import json
 import signal
 from abc import ABC, abstractmethod
 from collections.abc import AsyncIterator
-from datetime import datetime, timezone
+from datetime import UTC, datetime
 from typing import Any
 from uuid import uuid4
 
 from prefect import flow, task
 
+from bioetl.application.pipeline.checkpoint_manager import CheckpointManager
+from bioetl.application.pipeline.lock_manager import LockManager, PipelineLockLostError
+from bioetl.application.pipeline.record_processor import RecordProcessor
 from bioetl.domain.ports import (
     CheckpointPort,
     DataSourcePort,
@@ -30,13 +22,7 @@
     QuarantinePort,
     StoragePort,
 )
-from bioetl.domain.types import (
-    BatchID,
-    ErrorType,
-    RunID,
-    RunType,
-    Watermark,
-)
+from bioetl.domain.types import RunID, RunType, Watermark
 from bioetl.infrastructure.observability.logging import create_logger
 
 
@@ -46,33 +32,14 @@ class PipelineShutdownError(Exception):
     pass
 
 
-@flow(
-    name="{self.pipeline_name}",
-    log_prints=True,
-    validate_parameters=False,
-)
+@flow(name="{self.pipeline_name}", log_prints=True, validate_parameters=False)
 async def run_pipeline_flow(pipeline: "BasePipeline") -> None:
     """Prefect flow to run the ETL pipeline."""
     await pipeline.run()
 
 
 class BasePipeline(ABC):
-    """Base class for ETL pipelines.
-
-    Implements common pipeline patterns:
-    - Lock acquisition with heartbeat
-    - Checkpoint management
-    - Graceful shutdown
-    - Bronze → Silver → Gold flow
-    - Error handling and quarantine
-
-    Example:
-        >>> class MyPipeline(BasePipeline):
-        ...     # ... implementation ...
-        ...
-        >>> pipeline = MyPipeline(...)
-        >>> await run_pipeline_flow(pipeline)
-    """
+    """Base class for ETL pipelines with decomposed responsibilities."""
 
     def __init__(
         self,
@@ -92,223 +59,89 @@ def __init__(
         self.entity_type = entity_type
         self.run_type = run_type
         self.data_source = data_source
-        self.storage = storage
-        self.lock = lock
-        self.checkpoint = checkpoint
-        self.quarantine = quarantine
         self.resume = resume
         self.run_id = RunID(uuid4())
         self.shutdown_requested = False
-        self.heartbeat_task: asyncio.Task[None] | None = None
         self.records_fetched = 0
-        self.records_bronze = 0
-        self.records_silver = 0
-        self.records_gold = 0
-        self.records_quarantined = 0
-        self.logger = create_logger(
-            run_id=str(self.run_id),
-            pipeline=pipeline_name,
+        self.logger = create_logger(run_id=str(self.run_id), pipeline=pipeline_name)
+
+        self._lock_mgr = LockManager(lock, self.run_id, self.logger)
+        self._checkpoint_mgr = CheckpointManager(
+            checkpoint, pipeline_name, self.run_id, self.logger
+        )
+        self._processor = RecordProcessor(
+            storage, quarantine, provider, entity_type,
+            pipeline_name, self.run_id, run_type, self.logger
         )
 
     async def run(self) -> None:
         """Execute pipeline. Main entry point."""
-        self.logger.info(
-            f"Starting pipeline: {self.pipeline_name}",
-            extra={"stage": "startup", "run_type": self.run_type.value},
-        )
+        self.logger.info(f"Starting pipeline: {self.pipeline_name}",
+                         extra={"stage": "startup", "run_type": self.run_type.value})
         self._setup_shutdown_handlers()
 
         lock_key = f"{self.provider}_{self.entity_type}"
         exclusive = self.run_type in (RunType.BACKFILL, RunType.REBUILD)
 
         try:
-            acquired = await self.lock.acquire(
-                key=lock_key, owner_id=self.run_id, wait=False, exclusive=exclusive
-            )
-            if not acquired:
-                self.logger.error(f"Failed to acquire lock for {lock_key}")
+            if not await self._lock_mgr.acquire(lock_key, exclusive):
                 return
-
-            self.logger.info(f"Lock acquired for {lock_key}")
-            self.heartbeat_task = asyncio.create_task(
-                self._heartbeat_loop(lock_key, exclusive)
+            self._lock_mgr.start_heartbeat(
+                lock_key, exclusive, lambda: setattr(self, "shutdown_requested", True)
             )
-
-            watermark = await self._load_checkpoint_task()
-            await self._execute_pipeline_task(watermark)
-            await self._delete_checkpoint_task()
-
-            self.logger.info(
-                "Pipeline completed successfully",
-                extra={
-                    "stage": "complete",
-                    "records_fetched": self.records_fetched,
-                },
-            )
-        except PipelineShutdownError:
-            self.logger.warning(
-                "Pipeline shutdown requested", extra={"stage": "shutdown"}
-            )
-            raise
-        except Exception as e:
-            self.logger.error(f"Pipeline failed: {e}", exc_info=True)
-            raise
+            await self._execute_with_checkpoint()
+        except (PipelineShutdownError, PipelineLockLostError):
+            self.logger.warning("Pipeline shutdown requested", extra={"stage": "shutdown"})
+            raise PipelineShutdownError("Shutdown requested") from None
         finally:
-            if self.heartbeat_task:
-                self.heartbeat_task.cancel()
-            await self.lock.release(lock_key, self.run_id, exclusive=exclusive)
-            self.logger.info("Lock released", extra={"stage": "cleanup"})
-
-    @task(name="Load Checkpoint")
-    async def _load_checkpoint_task(self) -> Watermark | None:
-        if self.resume:
-            checkpoint_data = self.checkpoint.load(self.pipeline_name)
-            if checkpoint_data:
-                watermark, _, metadata = checkpoint_data
-                self.logger.info(
-                    f"Resuming from checkpoint: {watermark}",
-                    extra={"metadata": metadata},
-                )
-                return watermark
-        return None
+            self._lock_mgr.stop_heartbeat()
+            await self._lock_mgr.release(lock_key, exclusive)
 
     @task(name="Execute Pipeline")
-    async def _execute_pipeline_task(self, watermark: Watermark | None) -> None:
-        """Execute main pipeline logic as a Prefect task."""
-        async for raw_record in self._extract(watermark):
+    async def _execute_with_checkpoint(self) -> None:
+        watermark = self._checkpoint_mgr.load(self.resume)
+        async for record in self._extract(watermark):
             if self.shutdown_requested:
                 raise PipelineShutdownError("Shutdown during extraction")
+            await self._process_record(record)
+        self._checkpoint_mgr.delete()
+        self.logger.info("Pipeline completed", extra={"records": self.records_fetched})
 
-            self.records_fetched += 1
-            batch_id = await self._load_bronze(raw_record)
-            self.records_bronze += 1
-
-            try:
-                transformed = await self.transform_bronze_to_silver(raw_record)
-                if transformed:
-                    await self._load_silver(transformed, batch_id)
-                    self.records_silver += 1
-                    if self.should_write_gold(transformed):
-                        await self._load_gold(transformed)
-                        self.records_gold += 1
-            except Exception as e:
-                error_type = self._classify_error(e)
-                if error_type.is_data_quality():
-                    await self._quarantine_record(
-                        raw_record, error_type, batch_id, str(e)
-                    )
-                    self.records_quarantined += 1
-                else:
-                    raise
-
-            if self.records_fetched % 1000 == 0:
-                await self._save_checkpoint(raw_record)
-
-    @task(name="Delete Checkpoint")
-    async def _delete_checkpoint_task(self) -> None:
-        self.checkpoint.delete(self.pipeline_name)
-
-    async def _extract(
-        self, watermark: Watermark | None
-    ) -> AsyncIterator[dict[str, Any]]:
-        async for record in self.data_source.fetch(
-            entity_type=self.entity_type, watermark=watermark
-        ):
+    async def _process_record(self, raw: dict[str, Any]) -> None:
+        self.records_fetched += 1
+        batch_id = self._processor.load_bronze(raw)
+        try:
+            transformed = await self.transform_bronze_to_silver(raw)
+            if transformed:
+                self._processor.load_silver(transformed, batch_id)
+                if self.should_write_gold(transformed):
+                    self._processor.load_gold(transformed)
+        except Exception as e:
+            err_type = RecordProcessor.classify_error(e)
+            if err_type.is_data_quality():
+                self._processor.quarantine_record(raw, err_type, batch_id, str(e))
+            else:
+                raise
+        if self.records_fetched % 1000 == 0:
+            self._checkpoint_mgr.save(self.extract_watermark(raw), self.records_fetched)
+
+    async def _extract(self, watermark: Watermark | None) -> AsyncIterator[dict[str, Any]]:
+        async for record in self.data_source.fetch(self.entity_type, watermark):
             yield record
 
-    async def _load_bronze(self, record: dict[str, Any]) -> BatchID:
-        batch_id = BatchID(uuid4())
-        record_bytes = (json.dumps(record) + "\n").encode("utf-8")
-        self.storage.write_bronze(
-            records=iter([record_bytes]),
-            provider=self.provider,
-            entity=self.entity_type,
-            date=datetime.now(timezone.utc),
-            batch_id=batch_id,
-        )
-        return batch_id
-
-    async def _load_silver(self, record: dict[str, Any], batch_id: BatchID) -> None:
-        record_with_meta = {
-            **record,
-            "_run_id": str(self.run_id),
-            "_run_type": self.run_type.value,
-            "_source_batch_id": str(batch_id),
-            "_ingestion_ts": datetime.now(timezone.utc).isoformat(),
-        }
-        table_name = f"{self.provider}.{self.entity_type}"
-        self.storage.write_silver(
-            table_name=table_name,
-            records=[record_with_meta],
-            primary_keys=["entity_id"],
-        )
-
-    async def _load_gold(self, record: dict[str, Any]) -> None:
-        table_name = f"{self.provider}.{self.entity_type}_gold"
-        self.storage.write_gold(table_name=table_name, records=[record], mode="append")
-
-    async def _quarantine_record(
-        self,
-        record: dict[str, Any],
-        error_type: ErrorType,
-        batch_id: BatchID,
-        error_details: str,
-    ) -> None:
-        self.quarantine.write(
-            pipeline=self.pipeline_name,
-            error_code=error_type.value,
-            payload=record,
-            bronze_batch_id=batch_id,
-            error_details={"message": error_details},
-        )
-
-    async def _save_checkpoint(self, last_record: dict[str, Any]) -> None:
-        watermark = self.extract_watermark(last_record)
-        self.checkpoint.save(
-            pipeline=self.pipeline_name,
-            watermark=watermark,
-            run_id=self.run_id,
-            metadata={"records_processed": self.records_fetched},
-        )
-
-    async def _heartbeat_loop(self, lock_key: str, exclusive: bool) -> None:
-        while not self.shutdown_requested:
-            await asyncio.sleep(20)
-            success = await self.lock.heartbeat(
-                lock_key, self.run_id, exclusive=exclusive
-            )
-            if not success:
-                self.logger.error("Lost lock during execution!")
-                self.shutdown_requested = True
-                raise PipelineShutdownError("Lock lost")
-
     def _setup_shutdown_handlers(self) -> None:
-        def signal_handler(signum: int, frame: Any) -> None:
-            self.logger.warning(
-                f"Received signal {signum}, initiating graceful shutdown"
-            )
+        def handler(signum: int, _frame: Any) -> None:
+            self.logger.warning(f"Received signal {signum}, initiating shutdown")
             self.shutdown_requested = True
-
-        signal.signal(signal.SIGTERM, signal_handler)
-        signal.signal(signal.SIGINT, signal_handler)
-
-    def _classify_error(self, error: Exception) -> ErrorType:
-        error_name = type(error).__name__
-        if "Schema" in error_name or "Validation" in error_name:
-            return ErrorType.SCHEMA_VIOLATION
-        elif "Missing" in error_name or "Required" in error_name:
-            return ErrorType.MISSING_REQUIRED_FIELD
-        else:
-            return ErrorType.INVALID_DATA
+        signal.signal(signal.SIGTERM, handler)
+        signal.signal(signal.SIGINT, handler)
 
     @abstractmethod
-    async def transform_bronze_to_silver(
-        self, record: dict[str, Any]
-    ) -> dict[str, Any] | None:
+    async def transform_bronze_to_silver(self, record: dict[str, Any]) -> dict[str, Any] | None:
         pass
 
-    def should_write_gold(self, record: dict[str, Any]) -> bool:
+    def should_write_gold(self, record: dict[str, Any]) -> bool:  # noqa: ARG002
         return True
 
-    def extract_watermark(self, record: dict[str, Any]) -> Watermark:
-        return Watermark(datetime.now(timezone.utc))
+    def extract_watermark(self, record: dict[str, Any]) -> Watermark:  # noqa: ARG002
+        return Watermark(datetime.now(UTC))
diff --git a/src/bioetl/application/pipeline/checkpoint_manager.py b/src/bioetl/application/pipeline/checkpoint_manager.py
new file mode 100644
index 0000000000..9f01818f7c
--- /dev/null
+++ b/src/bioetl/application/pipeline/checkpoint_manager.py
@@ -0,0 +1,71 @@
+"""Checkpoint manager for ETL pipelines.
+
+Handles checkpoint save, load, and delete operations for resumable pipelines.
+"""
+
+from logging import Logger
+
+from bioetl.domain.ports import CheckpointPort
+from bioetl.domain.types import RunID, Watermark
+
+
+class CheckpointManager:
+    """Manages checkpoint lifecycle for pipelines.
+
+    Responsibilities:
+    - Load checkpoint for resume functionality
+    - Save periodic checkpoints during execution
+    - Delete checkpoint on successful completion
+    """
+
+    def __init__(
+        self,
+        checkpoint: CheckpointPort,
+        pipeline_name: str,
+        run_id: RunID,
+        logger: Logger,
+    ) -> None:
+        self._checkpoint = checkpoint
+        self._pipeline_name = pipeline_name
+        self._run_id = run_id
+        self._logger = logger
+
+    def load(self, resume: bool) -> Watermark | None:
+        """Load checkpoint if resume mode is enabled.
+
+        Args:
+            resume: Whether to attempt loading checkpoint
+
+        Returns:
+            Watermark from checkpoint or None
+        """
+        if not resume:
+            return None
+
+        checkpoint_data = self._checkpoint.load(self._pipeline_name)
+        if checkpoint_data:
+            watermark, _, metadata = checkpoint_data
+            self._logger.info(
+                f"Resuming from checkpoint: {watermark}",
+                extra={"metadata": metadata},
+            )
+            return watermark
+        return None
+
+    def save(self, watermark: Watermark, records_processed: int) -> None:
+        """Save checkpoint with current progress.
+
+        Args:
+            watermark: Current position watermark
+            records_processed: Number of records processed
+        """
+        self._checkpoint.save(
+            pipeline=self._pipeline_name,
+            watermark=watermark,
+            run_id=self._run_id,
+            metadata={"records_processed": records_processed},
+        )
+
+    def delete(self) -> None:
+        """Delete checkpoint after successful completion."""
+        self._checkpoint.delete(self._pipeline_name)
diff --git a/src/bioetl/application/pipeline/lock_manager.py b/src/bioetl/application/pipeline/lock_manager.py
new file mode 100644
index 0000000000..06b1425eaa
--- /dev/null
+++ b/src/bioetl/application/pipeline/lock_manager.py
@@ -0,0 +1,94 @@
+"""Lock manager for ETL pipelines.
+
+Handles distributed lock acquisition, release, and heartbeat maintenance.
+"""
+
+import asyncio
+from logging import Logger
+
+from bioetl.domain.ports import LockPort
+from bioetl.domain.types import RunID
+
+
+class PipelineLockLostError(Exception):
+    """Raised when pipeline loses its lock."""
+
+    pass
+
+
+class LockManager:
+    """Manages distributed lock lifecycle for pipelines.
+
+    Responsibilities:
+    - Lock acquisition with exclusive/shared mode
+    - Heartbeat maintenance during execution
+    - Clean lock release on completion
+    """
+
+    def __init__(
+        self,
+        lock: LockPort,
+        run_id: RunID,
+        logger: Logger,
+    ) -> None:
+        self._lock = lock
+        self._run_id = run_id
+        self._logger = logger
+        self._heartbeat_task: asyncio.Task[None] | None = None
+        self._shutdown_callback: callable | None = None
+
+    async def acquire(self, key: str, exclusive: bool) -> bool:
+        """Acquire lock for the given key.
+
+        Args:
+            key: Lock key (usually provider_entity)
+            exclusive: True for exclusive lock (backfill/rebuild)
+
+        Returns:
+            True if lock acquired, False otherwise
+        """
+        acquired = await self._lock.acquire(
+            key=key, owner_id=self._run_id, wait=False, exclusive=exclusive
+        )
+        if acquired:
+            self._logger.info(f"Lock acquired for {key}")
+        else:
+            self._logger.error(f"Failed to acquire lock for {key}")
+        return acquired
+
+    async def release(self, key: str, exclusive: bool) -> None:
+        """Release lock for the given key."""
+        await self._lock.release(key, self._run_id, exclusive=exclusive)
+        self._logger.info("Lock released", extra={"stage": "cleanup"})
+
+    def start_heartbeat(
+        self, key: str, exclusive: bool, shutdown_callback: callable
+    ) -> None:
+        """Start heartbeat loop for lock maintenance.
+
+        Args:
+            key: Lock key
+            exclusive: Lock mode
+            shutdown_callback: Called when lock is lost
+        """
+        self._shutdown_callback = shutdown_callback
+        self._heartbeat_task = asyncio.create_task(
+            self._heartbeat_loop(key, exclusive)
+        )
+
+    def stop_heartbeat(self) -> None:
+        """Stop heartbeat loop."""
+        if self._heartbeat_task:
+            self._heartbeat_task.cancel()
+            self._heartbeat_task = None
+
+    async def _heartbeat_loop(self, key: str, exclusive: bool) -> None:
+        """Send periodic heartbeats to maintain lock."""
+        while True:
+            await asyncio.sleep(20)
+            success = await self._lock.heartbeat(key, self._run_id, exclusive=exclusive)
+            if not success:
+                self._logger.error("Lost lock during execution!")
+                if self._shutdown_callback:
+                    self._shutdown_callback()
+                raise PipelineLockLostError("Lock lost")
diff --git a/src/bioetl/application/pipeline/record_processor.py b/src/bioetl/application/pipeline/record_processor.py
new file mode 100644
index 0000000000..90d3955114
--- /dev/null
+++ b/src/bioetl/application/pipeline/record_processor.py
@@ -0,0 +1,136 @@
+"""Record processor for ETL pipelines.
+
+Handles Bronze/Silver/Gold layer loading and quarantine operations.
+"""
+
+import json
+from datetime import UTC, datetime
+from logging import Logger
+from typing import Any
+from uuid import uuid4
+
+from bioetl.domain.ports import QuarantinePort, StoragePort
+from bioetl.domain.types import BatchID, ErrorType, RunID, RunType
+
+
+class RecordProcessor:
+    """Processes records through Bronze/Silver/Gold layers.
+
+    Responsibilities:
+    - Load raw records to Bronze layer
+    - Load transformed records to Silver layer
+    - Load filtered records to Gold layer
+    - Handle quarantine for failed records
+    - Classify errors for proper handling
+    """
+
+    def __init__(
+        self,
+        storage: StoragePort,
+        quarantine: QuarantinePort,
+        provider: str,
+        entity_type: str,
+        pipeline_name: str,
+        run_id: RunID,
+        run_type: RunType,
+        logger: Logger,
+    ) -> None:
+        self._storage = storage
+        self._quarantine = quarantine
+        self._provider = provider
+        self._entity_type = entity_type
+        self._pipeline_name = pipeline_name
+        self._run_id = run_id
+        self._run_type = run_type
+        self._logger = logger
+
+    def load_bronze(self, record: dict[str, Any]) -> BatchID:
+        """Load raw record to Bronze layer.
+
+        Args:
+            record: Raw record from data source
+
+        Returns:
+            BatchID for tracking lineage
+        """
+        batch_id = BatchID(uuid4())
+        record_bytes = (json.dumps(record) + "\n").encode("utf-8")
+        self._storage.write_bronze(
+            records=iter([record_bytes]),
+            provider=self._provider,
+            entity=self._entity_type,
+            date=datetime.now(UTC),
+            batch_id=batch_id,
+        )
+        return batch_id
+
+    def load_silver(self, record: dict[str, Any], batch_id: BatchID) -> None:
+        """Load transformed record to Silver layer.
+
+        Args:
+            record: Transformed record
+            batch_id: Source batch ID for lineage
+        """
+        record_with_meta = {
+            **record,
+            "_run_id": str(self._run_id),
+            "_run_type": self._run_type.value,
+            "_source_batch_id": str(batch_id),
+            "_ingestion_ts": datetime.now(UTC).isoformat(),
+        }
+        table_name = f"{self._provider}.{self._entity_type}"
+        self._storage.write_silver(
+            table_name=table_name,
+            records=[record_with_meta],
+            primary_keys=["entity_id"],
+        )
+
+    def load_gold(self, record: dict[str, Any]) -> None:
+        """Load record to Gold layer.
+
+        Args:
+            record: Record that passed quality filters
+        """
+        table_name = f"{self._provider}.{self._entity_type}_gold"
+        self._storage.write_gold(table_name=table_name, records=[record], mode="append")
+
+    def quarantine_record(
+        self,
+        record: dict[str, Any],
+        error_type: ErrorType,
+        batch_id: BatchID,
+        error_details: str,
+    ) -> None:
+        """Send failed record to quarantine.
+
+        Args:
+            record: Failed record
+            error_type: Classification of the error
+            batch_id: Source batch ID
+            error_details: Error message
+        """
+        self._quarantine.write(
+            pipeline=self._pipeline_name,
+            error_code=error_type.value,
+            payload=record,
+            bronze_batch_id=batch_id,
+            error_details={"message": error_details},
+        )
+
+    @staticmethod
+    def classify_error(error: Exception) -> ErrorType:
+        """Classify error for proper handling.
+
+        Args:
+            error: Exception that occurred
+
+        Returns:
+            ErrorType classification
+        """
+        error_name = type(error).__name__
+        if "Schema" in error_name or "Validation" in error_name:
+            return ErrorType.SCHEMA_VIOLATION
+        elif "Missing" in error_name or "Required" in error_name:
+            return ErrorType.MISSING_REQUIRED_FIELD
+        else:
+            return ErrorType.INVALID_DATA
diff --git a/src/bioetl/application/pipelines/chembl_activity.py b/src/bioetl/application/pipelines/chembl_activity.py
index 76a80ac321..c0e2e27b8f 100644
--- a/src/bioetl/application/pipelines/chembl_activity.py
+++ b/src/bioetl/application/pipelines/chembl_activity.py
@@ -21,20 +21,6 @@ class ChEMBLActivityPipeline(BasePipeline):
     - Bronze: Raw JSON from ChEMBL API
     - Silver: Normalized with entity_id, content_hash, metadata
     - Gold: Filtered high-quality activities (optional)
-
-    Example:
-        >>> from bioetl.infrastructure.adapters.chembl.client import ChemblAdapter
-        >>> from bioetl.infrastructure.storage.bronze_writer import BronzeWriter
-        >>> # ... initialize adapters
-        >>> pipeline = ChEMBLActivityPipeline(
-        ...     run_type=RunType.INCREMENTAL,
-        ...     data_source=chembl_adapter,
-        ...     storage=storage_adapter,
-        ...     lock=redis_lock,
-        ...     checkpoint=s3_checkpoint,
-        ...     quarantine=quarantine,
-        ... )
-        >>> await pipeline.run()
     """
 
     def __init__(self, *args: Any, **kwargs: Any) -> None:
@@ -51,290 +37,70 @@ async def transform_bronze_to_silver(
         self,
         record: dict[str, Any],
     ) -> dict[str, Any] | None:
-        """Transform raw ChEMBL activity to normalized format.
-
-        Args:
-            record: Raw activity record from ChEMBL
-
-        Returns:
-            Normalized record or None if should skip
-
-        Transformation logic:
-        1. Generate stable entity_id from activity_id
-        2. Extract key fields (molecule, target, assay)
-        3. Normalize units and values
-        4. Generate content_hash for deduplication
-        5. Add metadata fields
-        """
-        # Skip if missing critical fields
+        """Transform raw ChEMBL activity to normalized format."""
         if not record.get("activity_id"):
             return None
 
-        # Extract core fields
         activity_id = str(record["activity_id"])
-        molecule_chembl_id = record.get("molecule_chembl_id")
-        target_chembl_id = record.get("target_chembl_id")
-        assay_chembl_id = record.get("assay_chembl_id")
-
-        # Generate entity_id (stable identifier)
         entity_id = generate_entity_id(
             record={"activity_id": activity_id},
             provider=self.provider,
             id_field="activity_id",
         )
 
-        # Extract measurement data
-        standard_type = record.get("standard_type")  # IC50, Ki, EC50, etc.
         standard_value = record.get("standard_value")
-        standard_units = record.get("standard_units")
-        standard_relation = record.get("standard_relation")  # =, <, >, ~
-
-        # Convert value to float if present
         if standard_value is not None:
             try:
                 standard_value = float(standard_value)
             except (ValueError, TypeError):
                 standard_value = None
 
-        # Extract assay information
-        assay_type = record.get("assay_type")
-        assay_description = record.get("assay_description")
-
-        # Extract publication info
-        document_chembl_id = record.get("document_chembl_id")
-        document_year = record.get("document_year")
-
-        # Build normalized record
         normalized = {
             "entity_id": entity_id,
             "activity_id": activity_id,
-            "molecule_chembl_id": molecule_chembl_id,
-            "target_chembl_id": target_chembl_id,
-            "assay_chembl_id": assay_chembl_id,
-            "standard_type": standard_type,
+            "molecule_chembl_id": record.get("molecule_chembl_id"),
+            "target_chembl_id": record.get("target_chembl_id"),
+            "assay_chembl_id": record.get("assay_chembl_id"),
+            "standard_type": record.get("standard_type"),
             "standard_value": standard_value,
-            "standard_units": standard_units,
-            "standard_relation": standard_relation,
-            "assay_type": assay_type,
-            "assay_description": assay_description,
-            "document_chembl_id": document_chembl_id,
-            "document_year": document_year,
-            # Additional fields
-            "pchembl_value": record.get(
-                "pchembl_value"
-            ),  # -log10(molar IC50, XC50, etc)
+            "standard_units": record.get("standard_units"),
+            "standard_relation": record.get("standard_relation"),
+            "assay_type": record.get("assay_type"),
+            "assay_description": record.get("assay_description"),
+            "document_chembl_id": record.get("document_chembl_id"),
+            "document_year": record.get("document_year"),
+            "pchembl_value": record.get("pchembl_value"),
             "activity_comment": record.get("activity_comment"),
             "data_validity_comment": record.get("data_validity_comment"),
         }
 
-        # Generate content_hash for versioning
         content_hash = generate_content_hash(normalized, self.provider)
         normalized["content_hash"] = content_hash
-
         return normalized
 
     def should_write_gold(self, record: dict[str, Any]) -> bool:
-        """Filter records for Gold layer.
-
-        Gold layer criteria:
-        - Must have standard_value (not null)
-        - Must have standard_units
-        - Must have target_chembl_id
-        - Preferred standard_types: IC50, Ki, EC50, Kd
-        - No data validity issues
-
-        Args:
-            record: Silver record
-
-        Returns:
-            True if passes quality filters
-        """
-        # Must have measurement value
+        """Filter records for Gold layer based on quality criteria."""
         if record.get("standard_value") is None:
             return False
-
-        # Must have units
         if not record.get("standard_units"):
             return False
-
-        # Must have target
         if not record.get("target_chembl_id"):
             return False
 
-        # Prefer certain measurement types
         standard_type = record.get("standard_type")
         preferred_types = {"IC50", "Ki", "EC50", "Kd", "AC50", "GI50"}
-
         if standard_type not in preferred_types:
             return False
 
-        # Exclude if data validity issues
         if record.get("data_validity_comment"):
             return False
-
         return True
 
     def extract_watermark(self, record: dict[str, Any]) -> Watermark:
-        """Extract watermark from record.
-
-        Uses activity_id as watermark for incremental loading.
-
-        Args:
-            record: Activity record
-
-        Returns:
-            Watermark (activity_id)
-        """
+        """Extract watermark from record using activity_id."""
         activity_id = record.get("activity_id")
         if activity_id:
             return Watermark(str(activity_id))
 
-        # Fallback to timestamp
         from datetime import datetime, timezone
-
         return Watermark(datetime.now(timezone.utc))
-
-
-class ChEMBLActivityPipelineFactory:
-    """Factory for creating ChEMBL Activity pipelines.
-
-    Simplifies pipeline instantiation by handling adapter initialization.
-
-    Example:
-        >>> factory = ChEMBLActivityPipelineFactory()
-        >>> pipeline = await factory.create(
-        ...     run_type=RunType.INCREMENTAL,
-        ...     resume=False,
-        ... )
-        >>> await pipeline.run()
-    """
-
-    @staticmethod
-    async def create(
-        run_type: Any,  # RunType
-        resume: bool = False,
-        # Adapter configurations
-        chembl_url: str = "https://www.ebi.ac.uk/chembl/api/data",
-        s3_bucket_bronze: str = "bioetl-bronze",
-        s3_bucket_silver: str = "bioetl-silver",
-        s3_bucket_checkpoints: str = "bioetl-checkpoints",
-        redis_host: str = "localhost",
-        redis_port: int = 6379,
-        # S3/MinIO config
-        aws_endpoint_url: str | None = None,
-        aws_access_key: str | None = None,
-        aws_secret_key: str | None = None,
-    ) -> ChEMBLActivityPipeline:
-        """Create configured ChEMBL Activity pipeline.
-
-        Args:
-            run_type: Type of run (incremental, backfill, rebuild)
-            resume: Resume from checkpoint if available
-            chembl_url: ChEMBL API base URL
-            s3_bucket_bronze: Bronze bucket name
-            s3_bucket_silver: Silver bucket name
-            s3_bucket_checkpoints: Checkpoints bucket name
-            redis_host: Redis host
-            redis_port: Redis port
-            aws_endpoint_url: S3 endpoint (for MinIO)
-            aws_access_key: AWS access key
-            aws_secret_key: AWS secret key
-
-        Returns:
-            Configured pipeline instance
-        """
-        from bioetl.infrastructure.adapters.chembl.client import ChemblAdapter
-        from bioetl.infrastructure.adapters.http.circuit_breaker import CircuitBreaker
-        from bioetl.infrastructure.adapters.http.client import UnifiedHTTPClient
-        from bioetl.infrastructure.adapters.http.rate_limiter import TokenBucket
-        from bioetl.infrastructure.checkpoint.s3_checkpoint import S3Checkpoint
-        from bioetl.infrastructure.locking.redis_lock import RedisDistributedLock
-        from bioetl.infrastructure.quarantine.unified_quarantine import (
-            UnifiedQuarantine,
-        )
-        from bioetl.infrastructure.storage.bronze_writer import BronzeWriter
-        from bioetl.infrastructure.storage.delta_writer import DeltaWriter
-
-        # Create storage adapter wrapper
-        class StorageAdapter:
-            """Unified storage adapter for Bronze/Silver/Gold."""
-
-            def __init__(
-                self,
-                bronze_writer: BronzeWriter,
-                silver_writer: DeltaWriter,
-                gold_writer: DeltaWriter,
-            ):
-                self.bronze = bronze_writer
-                self.silver = silver_writer
-                self.gold = gold_writer
-
-            def write_bronze(self, *args: Any, **kwargs: Any) -> Any:
-                return self.bronze.write_bronze(*args, **kwargs)
-
-            def write_silver(self, *args: Any, **kwargs: Any) -> None:
-                return self.silver.write_silver(*args, **kwargs)
-
-            def write_gold(self, *args: Any, **kwargs: Any) -> None:
-                return self.gold.write_gold(*args, **kwargs)
-
-        # Initialize adapters
-        storage_options = {
-            "AWS_ENDPOINT_URL": aws_endpoint_url,
-            "AWS_ACCESS_KEY_ID": aws_access_key,
-            "AWS_SECRET_ACCESS_KEY": aws_secret_key,
-        }
-
-        # Data source (ChEMBL)
-        bucket = TokenBucket(rate=10.0, capacity=20)
-        circuit_breaker = CircuitBreaker(provider="chembl")
-        http_client = UnifiedHTTPClient(bucket, circuit_breaker)
-        data_source = ChemblAdapter(http_client=http_client)
-
-        # Storage
-        bronze_writer = BronzeWriter(
-            bucket=s3_bucket_bronze,
-            endpoint_url=aws_endpoint_url,
-            access_key=aws_access_key,
-            secret_key=aws_secret_key,
-        )
-        silver_writer = DeltaWriter(
-            base_path=f"s3://{s3_bucket_silver}",
-            storage_options=storage_options if aws_endpoint_url else None,
-        )
-        gold_writer = DeltaWriter(
-            base_path=f"s3://{s3_bucket_silver}",  # Same bucket, different tables
-            storage_options=storage_options if aws_endpoint_url else None,
-        )
-        storage = StorageAdapter(bronze_writer, silver_writer, gold_writer)
-
-        # Lock (Redis)
-        import redis.asyncio as aioredis
-
-        redis_client = aioredis.Redis(host=redis_host, port=redis_port)
-        lock = RedisDistributedLock(redis_client=redis_client)
-
-        # Checkpoint (S3)
-        checkpoint = S3Checkpoint(
-            bucket=s3_bucket_checkpoints,
-            endpoint_url=aws_endpoint_url,
-            access_key=aws_access_key,
-            secret_key=aws_secret_key,
-        )
-
-        # Quarantine
-        quarantine = UnifiedQuarantine(
-            base_path=f"s3://{s3_bucket_silver}/common/quarantine",
-            storage_options=storage_options if aws_endpoint_url else None,
-        )
-
-        # Create pipeline
-        return ChEMBLActivityPipeline(
-            run_type=run_type,
-            data_source=data_source,
-            storage=storage,
-            lock=lock,
-            checkpoint=checkpoint,
-            quarantine=quarantine,
-            resume=resume,
-        )
diff --git a/src/bioetl/cli.py b/src/bioetl/cli.py
index 865e50296d..b200d9054a 100644
--- a/src/bioetl/cli.py
+++ b/src/bioetl/cli.py
@@ -101,9 +101,7 @@ async def _run_chembl_activity(
         resume: Resume from checkpoint
         limit: Max records
     """
-    from bioetl.application.pipelines.chembl_activity import (
-        ChEMBLActivityPipelineFactory,
-    )
+    from bioetl.infrastructure.factories import ChEMBLActivityPipelineFactory
 
     # Load configuration from centralized config
     aws_config = get_aws_config()
diff --git a/src/bioetl/infrastructure/factories/__init__.py b/src/bioetl/infrastructure/factories/__init__.py
new file mode 100644
index 0000000000..5769736f79
--- /dev/null
+++ b/src/bioetl/infrastructure/factories/__init__.py
@@ -0,0 +1,5 @@
+"""Pipeline factories for infrastructure wiring."""
+
+from bioetl.infrastructure.factories.chembl_factory import ChEMBLActivityPipelineFactory
+
+__all__ = ["ChEMBLActivityPipelineFactory"]
diff --git a/src/bioetl/infrastructure/factories/chembl_factory.py b/src/bioetl/infrastructure/factories/chembl_factory.py
new file mode 100644
index 0000000000..b4283d881d
--- /dev/null
+++ b/src/bioetl/infrastructure/factories/chembl_factory.py
@@ -0,0 +1,147 @@
+"""ChEMBL pipeline factory.
+
+Creates fully configured ChEMBL pipelines with all infrastructure dependencies.
+"""
+
+from typing import Any
+
+from bioetl.application.pipelines.chembl_activity import ChEMBLActivityPipeline
+from bioetl.domain.types import RunType
+
+
+class ChEMBLActivityPipelineFactory:
+    """Factory for creating ChEMBL Activity pipelines.
+
+    Simplifies pipeline instantiation by handling adapter initialization.
+
+    Example:
+        >>> factory = ChEMBLActivityPipelineFactory()
+        >>> pipeline = await factory.create(
+        ...     run_type=RunType.INCREMENTAL,
+        ...     resume=False,
+        ... )
+        >>> await pipeline.run()
+    """
+
+    @staticmethod
+    async def create(
+        run_type: RunType,
+        resume: bool = False,
+        s3_bucket_bronze: str = "bioetl-bronze",
+        s3_bucket_silver: str = "bioetl-silver",
+        s3_bucket_checkpoints: str = "bioetl-checkpoints",
+        redis_host: str = "localhost",
+        redis_port: int = 6379,
+        aws_endpoint_url: str | None = None,
+        aws_access_key: str | None = None,
+        aws_secret_key: str | None = None,
+    ) -> ChEMBLActivityPipeline:
+        """Create configured ChEMBL Activity pipeline.
+
+        Args:
+            run_type: Type of run (incremental, backfill, rebuild)
+            resume: Resume from checkpoint if available
+            s3_bucket_bronze: Bronze bucket name
+            s3_bucket_silver: Silver bucket name
+            s3_bucket_checkpoints: Checkpoints bucket name
+            redis_host: Redis host
+            redis_port: Redis port
+            aws_endpoint_url: S3 endpoint (for MinIO)
+            aws_access_key: AWS access key
+            aws_secret_key: AWS secret key
+
+        Returns:
+            Configured pipeline instance
+        """
+        from bioetl.infrastructure.adapters.chembl.client import ChemblAdapter
+        from bioetl.infrastructure.adapters.http.circuit_breaker import CircuitBreaker
+        from bioetl.infrastructure.adapters.http.client import UnifiedHTTPClient
+        from bioetl.infrastructure.adapters.http.rate_limiter import TokenBucket
+        from bioetl.infrastructure.checkpoint.s3_checkpoint import S3Checkpoint
+        from bioetl.infrastructure.locking.redis_lock import RedisDistributedLock
+        from bioetl.infrastructure.quarantine.unified_quarantine import (
+            UnifiedQuarantine,
+        )
+        from bioetl.infrastructure.storage.bronze_writer import BronzeWriter
+        from bioetl.infrastructure.storage.delta_writer import DeltaWriter
+
+        class StorageAdapter:
+            """Unified storage adapter for Bronze/Silver/Gold."""
+
+            def __init__(
+                self,
+                bronze_writer: BronzeWriter,
+                silver_writer: DeltaWriter,
+                gold_writer: DeltaWriter,
+            ):
+                self.bronze = bronze_writer
+                self.silver = silver_writer
+                self.gold = gold_writer
+
+            def write_bronze(self, *args: Any, **kwargs: Any) -> Any:
+                return self.bronze.write_bronze(*args, **kwargs)
+
+            def write_silver(self, *args: Any, **kwargs: Any) -> None:
+                return self.silver.write_silver(*args, **kwargs)
+
+            def write_gold(self, *args: Any, **kwargs: Any) -> None:
+                return self.gold.write_gold(*args, **kwargs)
+
+        storage_options = {
+            "AWS_ENDPOINT_URL": aws_endpoint_url,
+            "AWS_ACCESS_KEY_ID": aws_access_key,
+            "AWS_SECRET_ACCESS_KEY": aws_secret_key,
+        }
+
+        # Data source (ChEMBL)
+        bucket = TokenBucket(rate=10.0, capacity=20)
+        circuit_breaker = CircuitBreaker(provider="chembl")
+        http_client = UnifiedHTTPClient(bucket, circuit_breaker)
+        data_source = ChemblAdapter(http_client=http_client)
+
+        # Storage
+        bronze_writer = BronzeWriter(
+            bucket=s3_bucket_bronze,
+            endpoint_url=aws_endpoint_url,
+            access_key=aws_access_key,
+            secret_key=aws_secret_key,
+        )
+        silver_writer = DeltaWriter(
+            base_path=f"s3://{s3_bucket_silver}",
+            storage_options=storage_options if aws_endpoint_url else None,
+        )
+        gold_writer = DeltaWriter(
+            base_path=f"s3://{s3_bucket_silver}",
+            storage_options=storage_options if aws_endpoint_url else None,
+        )
+        storage = StorageAdapter(bronze_writer, silver_writer, gold_writer)
+
+        # Lock (Redis)
+        import redis.asyncio as aioredis
+
+        redis_client = aioredis.Redis(host=redis_host, port=redis_port)
+        lock = RedisDistributedLock(redis_client=redis_client)
+
+        # Checkpoint (S3)
+        checkpoint = S3Checkpoint(
+            bucket=s3_bucket_checkpoints,
+            endpoint_url=aws_endpoint_url,
+            access_key=aws_access_key,
+            secret_key=aws_secret_key,
+        )
+
+        # Quarantine
+        quarantine = UnifiedQuarantine(
+            base_path=f"s3://{s3_bucket_silver}/common/quarantine",
+            storage_options=storage_options if aws_endpoint_url else None,
+        )
+
+        return ChEMBLActivityPipeline(
+            run_type=run_type,
+            data_source=data_source,
+            storage=storage,
+            lock=lock,
+            checkpoint=checkpoint,
+            quarantine=quarantine,
+            resume=resume,
+        )