ModelEngine-Group · yzAiden · Mar 24, 2026 · Mar 30, 2026 · Mar 31, 2026 · Mar 31, 2026
@@ -38,7 +38,10 @@ jobs:
       - name: Check if model is cached locally
         id: check-model
         run: |
-          if [ -f ~/model-assets/clip-vit-base-patch32/config.json ] && [ -d ~/model-assets/nltk_data ]; then
+          if [ -f ~/model-assets/clip-vit-base-patch32/config.json ] && \
+             [ -d ~/model-assets/nltk_data ] && \
+             [ -d ~/model-assets/table-transformer-structure-recognition ] && \
+             [ -d ~/model-assets/yolox ]; then
             echo "cache-hit=true" >> "$GITHUB_OUTPUT"
             cp -r ~/model-assets ./
           else
@@ -105,4 +108,4 @@ jobs:
             ./deploy.sh --mode 3 --is-mainland N --enable-terminal N --version 2 --root-dir "$HOME/nexent-production-data"
           else
             ./deploy.sh --mode 1 --is-mainland N --enable-terminal N --version 2 --root-dir "$HOME/nexent-development-data"
-          fi
+          fi
@@ -1,4 +1,4 @@
-import threading
+import threading
 import logging
 from typing import List, Optional
 from urllib.parse import urljoin
@@ -469,6 +469,7 @@
             rerank = param_dict.get("rerank", False)
             rerank_model_name = param_dict.get("rerank_model_name", "")
             rerank_model = None
+            is_multimodal = bool(tool_config.params.pop("multimodal", False))
             if rerank and rerank_model_name:
                 rerank_model = get_rerank_model(
                     tenant_id=tenant_id, model_name=rerank_model_name

@@ -126,12 +126,13 @@ async def upload_files(
 
 @file_management_config_router.post("/process")
 async def process_files(
-        files: List[dict] = Body(
-            ..., description="List of file details to process, including path_or_url and filename"),
-        chunking_strategy: Optional[str] = Body("basic"),
-        index_name: str = Body(...),
-        destination: str = Body(...),
-        authorization: Optional[str] = Header(None)
+        files: Annotated[List[dict], Body(
+            ..., description="List of file details to process, including path_or_url and filename")],
+        index_name: Annotated[str, Body(...)],
+        destination: Annotated[str, Body(...)],
+        chunking_strategy: Annotated[Optional[str], Body(...)] = "basic",
+        model_id: Annotated[Optional[int], Body(...)] = None,
+        authorization: Annotated[Optional[str], Header()] = None
 ):
     """
     Trigger data processing for a list of uploaded files.
@@ -144,7 +145,8 @@ async def process_files(
         chunking_strategy=chunking_strategy,
         source_type=destination,
         index_name=index_name,
-        authorization=authorization
+        authorization=authorization,
+        model_id=model_id
     )
 
     process_result = await trigger_data_process(files, process_params)

@@ -33,7 +33,7 @@
 from fastapi.responses import JSONResponse
 from fastapi.encoders import jsonable_encoder
 from http import HTTPStatus
-from typing import List, Optional
+from typing import Annotated, List, Optional
 from services.model_health_service import (
     check_model_connectivity,
     verify_model_config_connectivity,
@@ -297,7 +297,8 @@ async def get_llm_model_list(authorization: Optional[str] = Header(None)):
 
 @router.post("/healthcheck")
 async def check_model_health(
-        display_name: str = Query(..., description="Display name to check"),
+        display_name: Annotated[str, Query(..., description="Display name to check")],
+        model_type: Annotated[str, Query(..., description="...")],
         authorization: Optional[str] = Header(None)
 ):
     """Check and update model connectivity, returning the latest status.
@@ -308,7 +309,7 @@ async def check_model_health(
     """
     try:
         _, tenant_id = get_current_user_id(authorization)
-        result = await check_model_connectivity(display_name, tenant_id)
+        result = await check_model_connectivity(display_name, tenant_id, model_type)
         return JSONResponse(status_code=HTTPStatus.OK, content={
             "message": "Successfully checked model connectivity",
             "data": result

@@ -82,11 +82,13 @@ def create_new_index(
         # Extract optional fields from request body
         ingroup_permission = None
         group_ids = None
-        embedding_model_name = None
+        embedding_model_name: Optional[str] = None
+        is_multimodal: Optional[bool] = None
         if request:
             ingroup_permission = request.get("ingroup_permission")
             group_ids = request.get("group_ids")
-            embedding_model_name = request.get("embedding_model_name")
+            embedding_model_name = request.get("embeddingModel")
+            is_multimodal = request.get("is_multimodal")
 
         # Treat path parameter as user-facing knowledge base name for new creations
         return ElasticSearchService.create_knowledge_base(
@@ -98,6 +100,7 @@ def create_new_index(
             ingroup_permission=ingroup_permission,
             group_ids=group_ids,
             embedding_model_name=embedding_model_name,
+            is_multimodal=is_multimodal,
         )
     except Exception as e:
         raise HTTPException(
@@ -664,6 +667,7 @@ def update_chunk(
             chunk_request=payload,
             vdb_core=vdb_core,
             user_id=user_id,
+            tenant_id=tenant_id,
         )
         return JSONResponse(status_code=HTTPStatus.OK, content=result)
     except ValueError as e:
@@ -730,8 +734,17 @@ async def hybrid_search(
     """Run a hybrid (accurate + semantic) search across indices."""
     try:
         _, tenant_id = get_current_user_id(authorization)
+        resolved_index_names: List[str] = []
+        for requested_name in payload.index_names:
+            try:
+                resolved_name = get_index_name_by_knowledge_name(
+                    requested_name, tenant_id
+                )
+            except Exception:
+                resolved_name = requested_name
+            resolved_index_names.append(resolved_name)
         result = ElasticSearchService.search_hybrid(
-            index_names=payload.index_names,
+            index_names=resolved_index_names,
             query=payload.query,
             tenant_id=tenant_id,
             top_k=payload.top_k,

@@ -31,6 +31,10 @@ class VectorDatabaseType(str, Enum):
 # Data Processing Service Configuration
 DATA_PROCESS_SERVICE = os.getenv("DATA_PROCESS_SERVICE")
 CLIP_MODEL_PATH = os.getenv("CLIP_MODEL_PATH")
+TABLE_TRANSFORMER_MODEL_PATH = os.getenv("TABLE_TRANSFORMER_MODEL_PATH")
+UNSTRUCTURED_DEFAULT_MODEL_INITIALIZE_PARAMS_JSON_PATH = os.getenv(
+    "UNSTRUCTURED_DEFAULT_MODEL_INITIALIZE_PARAMS_JSON_PATH"
+)
 
 
 # Upload Configuration
@@ -129,6 +133,7 @@ class VectorDatabaseType(str, Enum):
 MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY")
 MINIO_REGION = os.getenv("MINIO_REGION")
 MINIO_DEFAULT_BUCKET = os.getenv("MINIO_DEFAULT_BUCKET")
+S3_URL_PREFIX = "s3://"
 
 
 # Postgres Configuration

@@ -300,6 +300,7 @@ class ProcessParams(BaseModel):
     source_type: str
     index_name: str
     authorization: Optional[str] = None
+    model_id: Optional[int] = None
 
 
 class OpinionRequest(BaseModel):

@@ -1,12 +1,20 @@
+from io import BytesIO
 import logging
 import json
 import time
 from typing import Any, Dict, List, Optional
 
 import ray
 
-from consts.const import RAY_ACTOR_NUM_CPUS, REDIS_BACKEND_URL, DEFAULT_EXPECTED_CHUNK_SIZE, DEFAULT_MAXIMUM_CHUNK_SIZE
-from database.attachment_db import get_file_stream
+from consts.const import (
+    RAY_ACTOR_NUM_CPUS,
+    REDIS_BACKEND_URL,
+    DEFAULT_EXPECTED_CHUNK_SIZE,
+    DEFAULT_MAXIMUM_CHUNK_SIZE,
+    TABLE_TRANSFORMER_MODEL_PATH,
+    UNSTRUCTURED_DEFAULT_MODEL_INITIALIZE_PARAMS_JSON_PATH,
+)
+from database.attachment_db import build_s3_url, get_file_stream, upload_fileobj
 from database.model_management_db import get_model_by_model_id
 from nexent.data_process import DataProcessCore
 
@@ -43,35 +51,16 @@
         Normalize task/model-related processing params.
         """
         process_params = dict(params)
+        self._apply_model_paths(process_params)
         if task_id:
             process_params["task_id"] = task_id
 
-        if not (model_id and tenant_id):
-            return process_params
-
-        try:
-            model_record = get_model_by_model_id(
-                model_id=model_id, tenant_id=tenant_id)
-            if not model_record:
-                logger.warning(
-                    f"[RayActor] Embedding model with ID {model_id} not found for tenant '{tenant_id}', using default chunk sizes")
-                return process_params
-
-            expected_chunk_size = model_record.get(
-                "expected_chunk_size", DEFAULT_EXPECTED_CHUNK_SIZE)
-            maximum_chunk_size = model_record.get(
-                "maximum_chunk_size", DEFAULT_MAXIMUM_CHUNK_SIZE)
-            model_name = model_record.get("display_name")
-
-            process_params["max_characters"] = maximum_chunk_size
-            process_params["new_after_n_chars"] = expected_chunk_size
-
-            logger.info(
-                f"[RayActor] Using chunk sizes from embedding model '{model_name}' (ID: {model_id}): "
-                f"max_characters={maximum_chunk_size}, new_after_n_chars={expected_chunk_size}")
-        except Exception as e:
-            logger.warning(
-                f"[RayActor] Failed to retrieve chunk sizes from embedding model ID {model_id}: {e}. Using default chunk sizes")
+        # Reuse shared model param logic so we also keep extra fields
+        self._apply_model_chunk_sizes(
+            model_id=model_id,
+            tenant_id=tenant_id,
+            params=process_params,
+        )
         return process_params
 
     def _run_file_process(
@@ -82,24 +71,19 @@
         process_params: Dict[str, Any],
         log_subject: str,
     ) -> List[Dict[str, Any]]:
-        chunks = self._processor.file_process(
+        result = self._processor.file_process(
             file_data=file_data,
             filename=filename,
             chunking_strategy=chunking_strategy,
             **process_params
         )
-
-        if chunks is None:
-            logger.warning(
-                f"[RayActor] file_process returned None for {log_subject}='{filename}'")
-            return []
-        if not isinstance(chunks, list):
-            logger.error(
-                f"[RayActor] file_process returned non-list type {type(chunks)} for {log_subject}='{filename}'")
-            return []
-        if len(chunks) == 0:
-            logger.warning(
-                f"[RayActor] file_process returned empty list for {log_subject}='{filename}'")
+
+        chunks, images_info = self._normalize_processor_result(result)
+        if images_info:
+            self._append_image_chunks(
+                source=filename, chunks=chunks, images_info=images_info)
+        chunks = self._validate_chunks(chunks, filename)
+        if not chunks:
             return []
 
         logger.info(
@@ -161,8 +145,129 @@
             chunking_strategy=chunking_strategy,
             process_params=process_params,
             log_subject="source",
-        )
+        ) 
+
+    def _apply_model_paths(self, params: Dict[str, Any]) -> None:
+        """
+        Store the configured model path settings in ``params``.
+
+        Both keys are always assigned, replacing any value already present,
+        with whatever the two module-level constants currently hold.
+
+        Args:
+            params: Processing parameters, mutated in place.
+        """
+        params.update({
+            "table_transformer_model_path": TABLE_TRANSFORMER_MODEL_PATH,
+            "unstructured_default_model_initialize_params_json_path":
+                UNSTRUCTURED_DEFAULT_MODEL_INITIALIZE_PARAMS_JSON_PATH,
+        })
+
+    def _apply_model_chunk_sizes(
+        self,
+        model_id: Optional[int],
+        tenant_id: Optional[str],
+        params: Dict[str, Any],
+    ) -> None:
+        """
+        Copy chunk-size settings from a model record into ``params``.
+
+        Nothing happens when either ``model_id`` or ``tenant_id`` is falsy, or
+        when no record is found (a warning is logged in the latter case).
+        Otherwise ``max_characters`` and ``new_after_n_chars`` are written, and
+        ``model_type`` is written too when the record carries a truthy value.
+        Any exception raised along the way is logged as a warning and
+        swallowed, leaving ``params`` with whatever was set before the failure.
+
+        Args:
+            model_id: Identifier passed to ``get_model_by_model_id``.
+            tenant_id: Tenant passed to ``get_model_by_model_id``.
+            params: Processing parameters, mutated in place.
+        """
+        if not model_id or not tenant_id:
+            return
+
+        try:
+            record = get_model_by_model_id(
+                model_id=model_id, tenant_id=tenant_id)
+            if not record:
+                logger.warning(
+                    f"[RayActor] Embedding model with ID {model_id} not found for tenant '{tenant_id}', using default chunk sizes")
+                return
+
+            # Missing size fields fall back to the module-level defaults.
+            expected_chunk_size = record.get(
+                'expected_chunk_size', DEFAULT_EXPECTED_CHUNK_SIZE)
+            maximum_chunk_size = record.get(
+                'maximum_chunk_size', DEFAULT_MAXIMUM_CHUNK_SIZE)
+            model_name = record.get('display_name')
+            record_model_type = record.get('model_type')
+
+            params['max_characters'] = maximum_chunk_size
+            params['new_after_n_chars'] = expected_chunk_size
+            if record_model_type:
+                params['model_type'] = record_model_type
+
+            logger.info(
+                f"[RayActor] Using chunk sizes from embedding model '{model_name}' (ID: {model_id}): "
+                f"max_characters={maximum_chunk_size}, new_after_n_chars={expected_chunk_size}")
+        except Exception as e:
+            # Best effort: a failed lookup must not abort processing.
+            logger.warning(
+                f"[RayActor] Failed to retrieve chunk sizes from embedding model ID {model_id}: {e}. Using default chunk sizes")
+
+    def _read_file_bytes(self, source: str) -> bytes:
+        """
+        Fetch ``source`` through ``get_file_stream`` and return its content.
+
+        The stream is closed before returning, whether or not the read
+        succeeded, so the underlying handle is not left open.
+
+        Args:
+            source: Location handed to ``get_file_stream`` unchanged.
+
+        Returns:
+            Everything a single ``read()`` call yields from the stream.
+
+        Raises:
+            FileNotFoundError: If ``get_file_stream`` returns ``None``.
+            Exception: Any error raised while opening or reading the stream
+                is logged and re-raised unchanged.
+        """
+        file_stream = None
+        try:
+            file_stream = get_file_stream(source)
+            if file_stream is None:
+                raise FileNotFoundError(
+                    f"Unable to fetch file from URL: {source}")
+            return file_stream.read()
+        except Exception as e:
+            logger.error(f"Failed to fetch file from {source}: {e}")
+            raise
+        finally:
+            # close() is looked up defensively because the stream type is
+            # defined outside this block (presumably a file-like object --
+            # confirm against get_file_stream).
+            close = getattr(file_stream, "close", None)
+            if callable(close):
+                try:
+                    close()
+                except Exception as close_error:
+                    # A failing close must not replace the bytes already read
+                    # or mask the original error.
+                    logger.warning(
+                        f"Failed to close file stream for {source}: {close_error}")
 
+    def _normalize_processor_result(
+        self, result: Any
+    ) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+        """
+        Coerce a processor return value into a ``(chunks, images_info)`` pair.
+
+        A two-element tuple is unpacked as ``(chunks, images_info)``; anything
+        else is treated as the chunks alone, with no images. Falsy parts are
+        replaced by an empty list.
+
+        Args:
+            result: Whatever the processor returned.
+
+        Returns:
+            The chunks and the image descriptors, in that order.
+        """
+        is_pair = isinstance(result, tuple) and len(result) == 2
+        if not is_pair:
+            return (result if result else []), []
+
+        raw_chunks, raw_images = result
+        chunk_list = raw_chunks if raw_chunks else []
+        image_list = raw_images if raw_images else []
+        return chunk_list, image_list
+
+    def _append_image_chunks(
+        self,
+        source: str,
+        chunks: List[Dict[str, Any]],
+        images_info: List[Dict[str, Any]],
+    ) -> None:
+        """
+        Upload extracted images and append one chunk per image to ``chunks``.
+
+        Each usable entry of ``images_info`` is uploaded with
+        ``upload_fileobj`` under the ``images_in_attachments`` prefix,
+        annotated in place with ``source_file`` and ``image_url``, and
+        represented by a new chunk whose content is a JSON object holding the
+        source file, position and image URL.
+
+        Entries that are not dicts, or that lack ``image_bytes`` or
+        ``image_format``, are skipped with a warning so that one malformed
+        entry does not abort the whole file.
+
+        Args:
+            source: Name of the file the images came from.
+            chunks: Chunk list, extended in place.
+            images_info: Image descriptors; each used entry is mutated.
+        """
+        folder = "images_in_attachments"
+        for index, image_data in enumerate(images_info):
+            if not isinstance(image_data, dict):
+                logger.warning(
+                    f"[RayActor] Skipping image entry at index {index}: unexpected type {type(image_data)}"
+                )
+                continue
+            if "image_bytes" not in image_data:
+                logger.warning(
+                    f"[RayActor] Skipping image entry at index {index}: missing image_bytes"
+                )
+                continue
+            # Previously read with image_data['image_format'], which raised
+            # KeyError for an entry lacking the key; skip it like the other
+            # malformed entries instead.
+            image_format = image_data.get("image_format")
+            if not image_format:
+                logger.warning(
+                    f"[RayActor] Skipping image entry at index {index}: missing image_format"
+                )
+                continue
+
+            img_obj = BytesIO(image_data["image_bytes"])
+            result = upload_fileobj(
+                file_obj=img_obj,
+                file_name=f"{index}.{image_format}",
+                prefix=folder)
+            # NOTE(review): a result without "object_name" still produces a
+            # chunk built from an empty object name -- confirm whether failed
+            # uploads should be skipped instead.
+            image_url = build_s3_url(result.get("object_name", ""))
+
+            image_data["source_file"] = source
+            image_data["image_url"] = image_url
+
+            chunks.append({
+                "content": json.dumps({
+                    "source_file": source,
+                    # A missing position is serialised as null rather than
+                    # raising KeyError.
+                    "position": image_data.get("position"),
+                    "image_url": image_url,
+                }),
+                "filename": source,
+                "metadata": {
+                    # len(chunks) is evaluated before the append, so it is the
+                    # next free position. The former len(chunks) + index
+                    # counted every earlier image twice and left gaps.
+                    "chunk_index": len(chunks),
+                    "process_source": "UniversalImageExtractor",
+                    "image_url": image_url,
+                }
+            })
+
+    def _validate_chunks(
+        self, chunks: Any, source: str
+    ) -> List[Dict[str, Any]]:
+        """
+        Return ``chunks`` when it is a non-empty list, otherwise an empty list.
+
+        ``None`` and an empty list are reported as warnings; any other
+        non-list value is reported as an error.
+
+        Args:
+            chunks: Candidate chunk collection produced by the processor.
+            source: Name used to identify the input in log messages.
+
+        Returns:
+            The same list object when it is usable, else a new empty list.
+        """
+        # Happy path first: a populated list passes through untouched.
+        if isinstance(chunks, list) and chunks:
+            return chunks
+
+        if chunks is None:
+            logger.warning(
+                f"[RayActor] file_process returned None for source='{source}'")
+        elif not isinstance(chunks, list):
+            logger.error(
+                f"[RayActor] file_process returned non-list type {type(chunks)} for source='{source}'")
+        else:
+            logger.warning(
+                f"[RayActor] file_process returned empty list for source='{source}'")
+        return []
+
     def process_bytes(
         self,
         file_bytes: bytes,