"""Document processing pipeline for end-to-end document workflows."""

from collections.abc import Callable
import logging
from pathlib import Path
from typing import Any

from ..agents.document_agent import DocumentAgent
from ..memory.short_term import ShortTermMemory
from ..utils.file_utils import get_file_hash
from .base_pipeline import BasePipeline

logger = logging.getLogger(__name__)


class DocumentPipeline(BasePipeline):
    """Complete pipeline for document processing workflows."""

    def __init__(self, config: dict[str, Any] | None = None):
        super().__init__(config)
        self.document_agent = DocumentAgent(config.get("agent", {}) if config else {})
        self.memory = ShortTermMemory()
        self.processors = []
        self.output_handlers = []

    def process(self, input_data: Any) -> dict[str, Any]:
        """Process input through the pipeline (implements BasePipeline interface)."""
        if isinstance(input_data, str | Path):
            return self.process_single_document(input_data)
        elif isinstance(input_data, list):
            return self.process_batch(input_data)
        else:
            return {"error": "Unsupported input type for document pipeline"}

    def add_processor(self, processor: Callable[[dict[str, Any]], dict[str, Any]]) -> "DocumentPipeline":
        """Add a custom processor to the pipeline."""
        self.processors.append(processor)
        return self

    def add_output_handler(self, handler: Callable[[dict[str, Any]], None]) -> "DocumentPipeline":
        """Add an output handler to the pipeline."""
        self.output_handlers.append(handler)
        return self

    def process_single_document(self, file_path: str | Path) -> dict[str, Any]:
        """Process a single document through the complete pipeline."""
        file_path = Path(file_path)
        logger.info(f"Starting document pipeline for: {file_path}")

        try:
            # Check if already processed (using file hash)
            file_hash = get_file_hash(file_path)
            cached_result = self.memory.get(f"doc_{file_hash}")

            if cached_result and self.config.get("use_cache", True):
                logger.info(f"Using cached result for {file_path}")
                return cached_result

            # Process with document agent
            result = self.document_agent.extract_requirements(str(file_path))

            if not result["success"]:
                return result

            # Apply custom processors
            for processor in self.processors:
                try:
                    result = processor(result)
                except Exception as e:
                    logger.error(f"Processor failed: {e}")
                    result["processing_errors"] = result.get("processing_errors", []) + [str(e)]

            # Store in memory
            if self.config.get("use_cache", True):
                self.memory.store(f"doc_{file_hash}", result)

            # Apply output handlers
            for handler in self.output_handlers:
                try:
                    handler(result)
                except Exception as e:
                    logger.error(f"Output handler failed: {e}")

            logger.info(f"Successfully processed document: {file_path}")
            return result

        except Exception as e:
            logger.error(f"Document pipeline failed for {file_path}: {e}")
            return {
                "success": False,
                "file_path": str(file_path),
                "error": str(e),
                "pipeline": "DocumentPipeline"
            }

    def process_batch(self, file_paths: list[str | Path]) -> dict[str, Any]:
        """Process multiple documents."""
        logger.info(f"Starting batch processing for {len(file_paths)} documents")

        results = []
        success_count = 0

        for file_path in file_paths:
            try:
                result = self.process_single_document(file_path)
                results.append(result)

                if result["success"]:
                    success_count += 1

            except Exception as e:
                logger.error(f"Batch item failed {file_path}: {e}")
                results.append({
                    "success": False,
                    "file_path": str(file_path),
                    "error": str(e)
                })

        batch_result = {
            "success": success_count > 0,
            "total_documents": len(file_paths),
            "successful_documents": success_count,
            "failed_documents": len(file_paths) - success_count,
            "results": results,
            "pipeline": "DocumentPipeline"
        }

        logger.info(f"Batch processing complete: {success_count}/{len(file_paths)} successful")
        return batch_result

    def process_directory(self, directory_path: str | Path,
                          pattern: str = "**/*",
                          recursive: bool = True) -> dict[str, Any]:
        """Process all documents in a directory."""
        directory_path = Path(directory_path)

        if not directory_path.exists():
            raise FileNotFoundError(f"Directory not found: {directory_path}")

        # Find all supported files (Docling supports these formats)
        supported_formats = [".pdf", ".docx", ".pptx", ".html", ".md"]
        file_paths = []

        # Honor the recursive flag: drop the recursive glob prefix when a
        # non-recursive scan is requested.
        if not recursive and pattern.startswith("**/"):
            pattern = pattern.removeprefix("**/")

        for file_path in directory_path.glob(pattern):
            if file_path.is_file() and file_path.suffix.lower() in supported_formats:
                file_paths.append(file_path)

        logger.info(f"Found {len(file_paths)} documents in {directory_path}")

        if not file_paths:
            return {
                "success": False,
                "error": "No supported documents found",
                "directory": str(directory_path),
                "supported_formats": supported_formats
            }

        return self.process_batch(file_paths)

    def extract_requirements(self, processed_docs: list[dict[str, Any]]) -> dict[str, Any]:
        """Extract and consolidate requirements from processed documents."""
        logger.info(f"Extracting requirements from {len(processed_docs)} documents")

        requirements = {
            "functional": [],
            "non_functional": [],
            "business": [],
            "technical": [],
            "constraints": [],
            "assumptions": []
        }

        sources = []

        for doc in processed_docs:
            if not doc.get("success"):
                continue

            content = doc.get("processed_content", {})

            # Extract from AI analysis if available
            if "ai_analysis" in content:
                ai_analysis = content["ai_analysis"]
                if "key_info" in ai_analysis:
                    # Parse requirements from key information
                    self._parse_requirements_from_text(ai_analysis["key_info"], requirements)

            # Extract from structured content
            if "content" in content:
                self._parse_requirements_from_text(content["content"], requirements)

            sources.append({
                "file": doc.get("file_path"),
                "title": content.get("metadata", {}).get("title", "Unknown")
            })

        return {
            "requirements": requirements,
            "sources": sources,
            "extraction_method": "DocumentPipeline",
            "total_documents": len(processed_docs),
            "timestamp": self._get_timestamp()
        }

    def _parse_requirements_from_text(self, text: str, requirements: dict[str, list]) -> None:
        """Parse requirements from text content (basic implementation)."""
        # This is a basic implementation - it can be enhanced with NLP/LLM.

        # Simple keyword-based classification
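        # For example, "The system shall let users export reports" contains a modal
        # keyword ("shall") and a subject keyword ("system"/"user"), so it is filed
        # under 'functional'; a line such as "Performance: pages load in under two
        # seconds" has no modal keyword and falls through to 'non_functional'.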
        lines = text.split('\n')

        for line in lines:
            line = line.strip()
            if not line:
                continue

            line_lower = line.lower()

            # Functional requirements
            if any(keyword in line_lower for keyword in ['shall', 'must', 'will', 'should']):
                if any(keyword in line_lower for keyword in ['system', 'user', 'function', 'feature']):
                    requirements['functional'].append(line)

            # Non-functional requirements
            elif any(keyword in line_lower for keyword in ['performance', 'security', 'usability', 'reliability']):
                requirements['non_functional'].append(line)

            # Business requirements
            elif any(keyword in line_lower for keyword in ['business', 'stakeholder', 'goal', 'objective']):
                requirements['business'].append(line)

            # Technical requirements
            elif any(keyword in line_lower for keyword in ['technical', 'architecture', 'platform', 'technology']):
                requirements['technical'].append(line)

            # Constraints
            elif any(keyword in line_lower for keyword in ['constraint', 'limitation', 'restriction']):
                requirements['constraints'].append(line)

            # Assumptions
            elif any(keyword in line_lower for keyword in ['assumption', 'assume', 'presume']):
                requirements['assumptions'].append(line)

    def get_pipeline_info(self) -> dict[str, Any]:
        """Get information about the pipeline configuration."""
        return {
            "name": "DocumentPipeline",
            "agent": "DocumentAgent",
            "processors_count": len(self.processors),
            "output_handlers_count": len(self.output_handlers),
            "caching_enabled": self.config.get("use_cache", True)
        }
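

if __name__ == "__main__":
    # Minimal usage sketch, illustrative only: it assumes the package context for
    # the relative imports above (run with ``python -m`` from the containing
    # package), and the "docs/" path, "**/*.pdf" pattern, and tag_source processor
    # are made-up examples rather than part of the pipeline's API.
    def tag_source(result: dict[str, Any]) -> dict[str, Any]:
        # Custom processors receive and return the per-document result dict.
        result["source"] = "intake"
        return result

    pipeline = (
        DocumentPipeline({"use_cache": True})
        .add_processor(tag_source)
        .add_output_handler(lambda r: print(r.get("file_path"), r.get("success")))
    )
    summary = pipeline.process_directory("docs/", pattern="**/*.pdf")
    print(f"{summary.get('successful_documents', 0)}/{summary.get('total_documents', 0)} documents processed")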