
Commit 40dbf68

feat: add multi-provider LLM support and specialized agents
This commit adds support for multiple LLM providers (Ollama, Gemini, Cerebras) and introduces specialized document processing agents with enhanced capabilities.

## LLM Platform Integrations (3 files)

- src/llm/platforms/ollama.py:
  * Ollama local LLM integration
  * Support for Llama, Mistral, and other open models
  * Streaming response handling
  * Resource-efficient local processing
- src/llm/platforms/gemini.py:
  * Google Gemini API integration
  * Multi-modal support (text + images)
  * Advanced generation configuration
  * Safety settings management
- src/llm/platforms/cerebras.py:
  * Cerebras ultra-fast inference integration
  * High-throughput processing
  * Enterprise-grade performance
  * Custom endpoint support

## Specialized Agents (2 files)

- src/agents/ai_document_agent.py:
  * AI-enhanced DocumentAgent with advanced LLM integration
  * Multi-stage quality improvement
  * Vision-based document analysis
  * Intelligent requirement enhancement
- src/agents/tag_aware_agent.py:
  * Tag-aware document processing
  * Automatic document classification
  * Tag-based routing and prioritization
  * Custom tag hierarchy support

## Enhanced Parser (1 file)

- src/parsers/enhanced_document_parser.py:
  * Extended DocumentParser with additional capabilities
  * Layout analysis and structure preservation
  * Table extraction and formatting
  * Advanced element classification

## Key Features

1. **Multi-Provider LLM**: Ollama (local), Gemini (cloud), Cerebras (fast)
2. **Flexible Deployment**: Local-first with cloud fallback options
3. **Specialized Processing**: AI-enhanced and tag-aware agents
4. **Enhanced Parsing**: Advanced document structure analysis
5. **Performance Options**: Trade-off between speed, quality, and cost

## Provider Comparison

| Provider | Speed | Cost | Local | Multimodal |
|----------|-------|------|-------|------------|
| Ollama   | Fast  | Free | Yes   | Limited    |
| Gemini   | Fast  | Low  | No    | Yes        |
| Cerebras | Ultra | Med  | No    | No         |

## Integration

These components integrate seamlessly with:
- DocumentAgent for LLM-based enhancements
- RequirementsExtractor for multi-provider support
- Pipelines for flexible processing workflows
- Configuration system for easy provider switching

Enables Phase 2 multi-provider LLM capabilities and specialized processing.
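As a quick orientation for reviewers, here is a minimal sketch of how the new AIDocumentAgent in this commit is configured: it reads an optional `ai_processing` section with `nlp`, `vision`, and `semantic` sub-configurations (see the diff below). The import path and any option names inside those sub-configurations are illustrative assumptions, not part of this commit's documented API.

```python
# Minimal sketch. The 'ai_processing' / 'nlp' / 'vision' / 'semantic' layout matches
# what AIDocumentAgent reads in this commit; the import path and the contents of
# each sub-config are illustrative placeholders.
from src.agents.ai_document_agent import AIDocumentAgent  # assumed package layout

config = {
    "ai_processing": {
        "nlp": {},       # forwarded to AIDocumentProcessor
        "vision": {},    # forwarded to VisionProcessor
        "semantic": {},  # forwarded to SemanticAnalyzer
    }
}

agent = AIDocumentAgent(config)
print(agent.ai_capabilities)  # reports which AI processors are actually available
```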
1 parent e97442c commit 40dbf68

File tree

6 files changed (+2082 lines, -0 lines)


src/agents/ai_document_agent.py

Lines changed: 348 additions & 0 deletions
@@ -0,0 +1,348 @@

"""AI-enhanced document agent with advanced processing capabilities."""

import logging
from pathlib import Path
from typing import Any

from .document_agent import DocumentAgent

try:
    from ..analyzers.semantic_analyzer import SemanticAnalyzer
    from ..processors.ai_document_processor import AIDocumentProcessor
    from ..processors.vision_processor import VisionProcessor
    AI_PROCESSORS_AVAILABLE = True
except ImportError:
    AI_PROCESSORS_AVAILABLE = False

logger = logging.getLogger(__name__)


class AIDocumentAgent(DocumentAgent):
    """Enhanced document agent with AI-powered analysis capabilities."""

    def __init__(self, config: dict[str, Any] | None = None):
        # Initialize base document agent
        super().__init__(config)

        # AI-specific configuration
        self.ai_config = self.config.get('ai_processing', {})

        # Initialize AI processors if available
        self._ai_processors = {}
        if AI_PROCESSORS_AVAILABLE:
            self._initialize_ai_processors()
        else:
            logger.warning(
                "AI processors not available. Install with: "
                "pip install 'unstructuredDataHandler[ai-processing]'"
            )

    def _initialize_ai_processors(self):
        """Initialize AI processing components."""
        try:
            # AI Document Processor for NLP
            ai_config = self.ai_config.get('nlp', {})
            self._ai_processors['nlp'] = AIDocumentProcessor(ai_config)

            # Vision Processor for images and layout
            vision_config = self.ai_config.get('vision', {})
            self._ai_processors['vision'] = VisionProcessor(vision_config)

            # Semantic Analyzer for understanding
            semantic_config = self.ai_config.get('semantic', {})
            self._ai_processors['semantic'] = SemanticAnalyzer(semantic_config)

            logger.info("AI processors initialized successfully")

        except Exception as e:
            logger.error(f"Error initializing AI processors: {e}")

    def process_document_with_ai(self, file_path: str | Path,
                                 enable_vision: bool = True,
                                 enable_nlp: bool = True,
                                 enable_semantic: bool = False) -> dict[str, Any]:
        """Process document with full AI enhancement."""
        try:
            # Start with base document processing
            base_result = self.process_document(file_path)

            if not AI_PROCESSORS_AVAILABLE:
                base_result["ai_message"] = "AI processing not available. Install with pip install 'unstructuredDataHandler[ai-processing]'"
                return base_result

            # Extract content for AI analysis
            content = base_result.get('content', '')
            if not content:
                logger.warning(f"No content extracted from {file_path}")
                return base_result

            # AI Analysis Results
            ai_results = {
                "ai_available": True,
                "processors_used": []
            }

            # NLP Analysis
            if enable_nlp and 'nlp' in self._ai_processors:
                try:
                    nlp_processor = self._ai_processors['nlp']
                    if nlp_processor.is_available:
                        nlp_results = nlp_processor.process_document_advanced(content)
                        ai_results["nlp_analysis"] = nlp_results
                        ai_results["processors_used"].append("nlp")
                        logger.info("NLP analysis completed")
                    else:
                        ai_results["nlp_analysis"] = {"error": "NLP processor not available"}
                except Exception as e:
                    logger.error(f"NLP analysis failed: {e}")
                    ai_results["nlp_analysis"] = {"error": str(e)}

            # Vision Analysis (if document has images or is image-based)
            if enable_vision and 'vision' in self._ai_processors:
                try:
                    vision_processor = self._ai_processors['vision']
                    if vision_processor.is_available:
                        # For PDF files, try to analyze layout
                        file_ext = Path(file_path).suffix.lower()
                        if file_ext in ['.pdf', '.png', '.jpg', '.jpeg']:
                            # Note: This would need document-to-image conversion for PDFs
                            # For now, we'll skip direct image analysis
                            ai_results["vision_analysis"] = {
                                "message": "Vision analysis available but requires image conversion",
                                "supported_formats": [".png", ".jpg", ".jpeg"]
                            }
                            ai_results["processors_used"].append("vision")
                except Exception as e:
                    logger.error(f"Vision analysis failed: {e}")
                    ai_results["vision_analysis"] = {"error": str(e)}

            # Semantic Analysis
            if enable_semantic and 'semantic' in self._ai_processors:
                try:
                    semantic_processor = self._ai_processors['semantic']
                    if semantic_processor.is_available:
                        # Prepare document for semantic analysis
                        documents = [{
                            'content': content,
                            'source': str(file_path),
                            'metadata': base_result.get('metadata', {})
                        }]

                        semantic_results = semantic_processor.extract_semantic_structure(documents)
                        ai_results["semantic_analysis"] = semantic_results
                        ai_results["processors_used"].append("semantic")
                        logger.info("Semantic analysis completed")
                    else:
                        ai_results["semantic_analysis"] = {"error": "Semantic processor not available"}
                except Exception as e:
                    logger.error(f"Semantic analysis failed: {e}")
                    ai_results["semantic_analysis"] = {"error": str(e)}

            # Combine results
            base_result["ai_analysis"] = ai_results

            logger.info(f"AI-enhanced processing completed for {file_path}")
            return base_result

        except Exception as e:
            logger.error(f"Error in AI-enhanced document processing: {e}")
            result = self.process_document(file_path)  # Fallback to base processing
            result["ai_error"] = str(e)
            return result

    def analyze_document_similarity(self, file_paths: list[str | Path]) -> dict[str, Any]:
        """Analyze semantic similarity between multiple documents."""
        if not AI_PROCESSORS_AVAILABLE or 'semantic' not in self._ai_processors:
            return {"error": "Semantic analysis not available"}

        try:
            # Process all documents first
            documents = []
            for file_path in file_paths:
                result = self.process_document(file_path)
                content = result.get('content', '')
                if content:
                    documents.append({
                        'content': content,
                        'source': str(file_path),
                        'metadata': result.get('metadata', {})
                    })

            if len(documents) < 2:
                return {"error": "Need at least 2 documents for similarity analysis"}

            # Perform semantic analysis
            semantic_processor = self._ai_processors['semantic']
            similarity_results = semantic_processor.extract_semantic_structure(documents)

            # Add document paths for reference
            similarity_results["analyzed_files"] = [str(path) for path in file_paths]
            similarity_results["analysis_type"] = "multi_document_similarity"

            return similarity_results

        except Exception as e:
            logger.error(f"Error in document similarity analysis: {e}")
            return {"error": str(e)}

    def extract_key_insights(self, file_path: str | Path) -> dict[str, Any]:
        """Extract key insights and summaries from a document."""
        try:
            # Process with AI enhancement
            result = self.process_document_with_ai(file_path, enable_nlp=True, enable_semantic=True)

            # Extract key insights from AI analysis
            insights = {
                "document_path": str(file_path),
                "processing_timestamp": result.get('timestamp'),
                "content_summary": {}
            }

            # Basic content info
            content = result.get('content', '')
            insights["content_summary"].update({
                "character_count": len(content),
                "word_count": len(content.split()),
                "estimated_reading_time_minutes": len(content.split()) / 200  # Average reading speed
            })

            # AI-generated insights
            ai_analysis = result.get('ai_analysis', {})

            # NLP insights
            nlp_analysis = ai_analysis.get('nlp_analysis', {})
            if 'summary' in nlp_analysis and not nlp_analysis.get('summary', {}).get('error'):
                insights["ai_summary"] = nlp_analysis['summary']

            if 'entities' in nlp_analysis:
                insights["key_entities"] = nlp_analysis['entities'][:10]  # Top 10 entities

            if 'classification' in nlp_analysis:
                insights["document_sentiment"] = nlp_analysis['classification']

            # Semantic insights
            semantic_analysis = ai_analysis.get('semantic_analysis', {})
            if 'semantic_analysis' in semantic_analysis:
                semantic_data = semantic_analysis['semantic_analysis']

                # Topics
                if 'topics' in semantic_data:
                    topics = semantic_data['topics']
                    if 'topics' in topics and topics['topics']:
                        insights["main_topics"] = topics['topics'][:3]  # Top 3 topics

                # TF-IDF keywords
                if 'tfidf' in semantic_data:
                    tfidf = semantic_data['tfidf']
                    if 'global_top_terms' in tfidf:
                        insights["key_terms"] = tfidf['global_top_terms'][:10]  # Top 10 terms

            return insights

        except Exception as e:
            logger.error(f"Error extracting key insights: {e}")
            return {"error": str(e), "document_path": str(file_path)}

    def batch_process_with_ai(self, file_paths: list[str | Path],
                              enable_similarity_analysis: bool = True) -> dict[str, Any]:
        """Process multiple documents with AI analysis and cross-document insights."""
        try:
            results = {
                "total_documents": len(file_paths),
                "processed_documents": [],
                "batch_insights": {},
                "processing_summary": {}
            }

            # Process each document individually
            all_contents = []
            successful_processes = 0

            for i, file_path in enumerate(file_paths):
                logger.info(f"Processing document {i+1}/{len(file_paths)}: {file_path}")

                try:
                    doc_result = self.process_document_with_ai(
                        file_path,
                        enable_nlp=True,
                        enable_semantic=False  # We'll do batch semantic analysis
                    )

                    # Extract key insights
                    insights = self.extract_key_insights(file_path)
                    doc_result["key_insights"] = insights

                    results["processed_documents"].append(doc_result)

                    # Collect content for batch analysis
                    content = doc_result.get('content', '')
                    if content:
                        all_contents.append({
                            'content': content,
                            'source': str(file_path),
                            'index': i
                        })

                    successful_processes += 1

                except Exception as e:
                    logger.error(f"Error processing {file_path}: {e}")
                    results["processed_documents"].append({
                        "file_path": str(file_path),
                        "error": str(e)
                    })

            results["processing_summary"] = {
                "successful": successful_processes,
                "failed": len(file_paths) - successful_processes,
                "success_rate": successful_processes / len(file_paths) if file_paths else 0
            }

            # Cross-document analysis
            if enable_similarity_analysis and len(all_contents) > 1:
                try:
                    if AI_PROCESSORS_AVAILABLE and 'semantic' in self._ai_processors:
                        semantic_processor = self._ai_processors['semantic']
                        batch_semantic = semantic_processor.extract_semantic_structure(all_contents)
                        results["batch_insights"]["semantic_analysis"] = batch_semantic

                        # Add cross-document insights
                        results["batch_insights"]["cross_document_insights"] = {
                            "total_analyzed": len(all_contents),
                            "similarity_matrix_available": "embeddings" in batch_semantic.get("semantic_analysis", {}),
                            "topics_identified": len(batch_semantic.get("semantic_analysis", {}).get("topics", {}).get("topics", [])),
                            "clusters_found": batch_semantic.get("semantic_analysis", {}).get("clusters", {}).get("n_clusters", 0)
                        }

                except Exception as e:
                    logger.error(f"Error in batch semantic analysis: {e}")
                    results["batch_insights"]["semantic_error"] = str(e)

            logger.info(f"Batch AI processing completed: {successful_processes}/{len(file_paths)} successful")
            return results

        except Exception as e:
            logger.error(f"Error in batch AI processing: {e}")
            return {"error": str(e)}

    @property
    def ai_capabilities(self) -> dict[str, bool]:
        """Return available AI capabilities."""
        if not AI_PROCESSORS_AVAILABLE:
            return {
                "ai_available": False,
                "message": "Install with: pip install 'unstructuredDataHandler[ai-processing]'"
            }

        capabilities = {"ai_available": True}

        for name, processor in self._ai_processors.items():
            capabilities[f"{name}_available"] = processor.is_available

            # Specific features for each processor
            if hasattr(processor, 'available_features'):
                capabilities[f"{name}_features"] = processor.available_features
            elif hasattr(processor, 'available_models'):
                capabilities[f"{name}_models"] = processor.available_models

        return capabilities
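A short usage sketch of the methods defined above; the method names come from this file, while the sample file paths and the import path are placeholders.

```python
# Usage sketch based on the methods in ai_document_agent.py above.
# File paths and the import path are illustrative assumptions.
from src.agents.ai_document_agent import AIDocumentAgent

agent = AIDocumentAgent()

# Single document: base parsing plus optional NLP / vision / semantic passes.
result = agent.process_document_with_ai("docs/spec.pdf", enable_semantic=True)
print(result.get("ai_analysis", {}).get("processors_used", []))

# Condensed view: summary, key entities, topics, and key terms.
insights = agent.extract_key_insights("docs/spec.pdf")

# Batch mode with cross-document semantic analysis.
report = agent.batch_process_with_ai(["docs/spec.pdf", "docs/design.docx"])
print(report["processing_summary"])
```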

0 commit comments
