Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions backend/apps/data_process_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
ConvertStateRequest,
TaskRequest,
)
from consts.exceptions import OfficeConversionException
from data_process.tasks import process_and_forward, process_sync
from services.data_process_service import get_data_process_service

Expand Down Expand Up @@ -311,3 +312,35 @@ async def convert_state(request: ConvertStateRequest):
status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
detail=f"Error converting state: {str(e)}"
)


@router.post("/convert_to_pdf")
async def convert_office_to_pdf(
    object_name: str = Form(...),
    pdf_object_name: str = Form(...)
):
    """
    Convert an Office document stored in MinIO to PDF.

    Parameters:
        object_name: Source Office file path in MinIO
        pdf_object_name: Destination PDF path in MinIO

    Returns:
        JSONResponse with status 200 and body {"success": True} when the
        service call completes without raising.

    Raises:
        HTTPException: status 500 when the service raises
            OfficeConversionException (detail is the exception text) or any
            other exception (detail is prefixed with "Office conversion failed").
    """
    try:
        await service.convert_office_to_pdf_impl(
            object_name=object_name,
            pdf_object_name=pdf_object_name,
        )
    except OfficeConversionException as exc:
        logger.error("Office conversion failed for '%s': %s", object_name, exc)
        # Chain the cause so the original failure stays visible in tracebacks.
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
            detail=str(exc)
        ) from exc
    except Exception as exc:
        # Unexpected failure: logger.exception records the full traceback,
        # which a plain error() call would drop.
        logger.exception("Unexpected error during conversion for '%s': %s", object_name, exc)
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
            detail=f"Office conversion failed: {exc}"
        ) from exc

    return JSONResponse(status_code=HTTPStatus.OK, content={"success": True})
85 changes: 79 additions & 6 deletions backend/apps/file_management_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,29 @@
from fastapi import APIRouter, Body, File, Form, Header, HTTPException, Path as PathParam, Query, UploadFile
from fastapi.responses import JSONResponse, RedirectResponse, StreamingResponse

from consts.exceptions import FileTooLargeException, NotFoundException, OfficeConversionException, UnsupportedFileTypeException
from consts.model import ProcessParams
from services.file_management_service import upload_to_minio, upload_files_impl, \
get_file_url_impl, get_file_stream_impl, delete_file_impl, list_files_impl
get_file_url_impl, get_file_stream_impl, delete_file_impl, list_files_impl, \
preview_file_impl
from utils.file_management_utils import trigger_data_process

logger = logging.getLogger("file_management_app")


def build_content_disposition_header(filename: Optional[str]) -> str:
def build_content_disposition_header(filename: Optional[str], inline: bool = False) -> str:
"""
Build a Content-Disposition header that keeps the original filename.

Args:
filename: Original filename to include in header
inline: If True, use 'inline' disposition (for preview); otherwise 'attachment' (for download)

- ASCII filenames are returned directly.
- Non-ASCII filenames include both an ASCII fallback and RFC 5987 encoded value
so modern browsers keep the original name.
"""
disposition = "inline" if inline else "attachment"
safe_name = (filename or "download").strip() or "download"

def _sanitize_ascii(value: str) -> str:
Expand All @@ -40,26 +47,26 @@ def _sanitize_ascii(value: str) -> str:

try:
safe_name.encode("ascii")
return f'attachment; filename="{_sanitize_ascii(safe_name)}"'
return f'{disposition}; filename="{_sanitize_ascii(safe_name)}"'
except UnicodeEncodeError:
try:
encoded = quote(safe_name, safe="")
except Exception:
# quote failure, fallback to sanitized ASCII only
logger.warning("Failed to encode filename '%s', using fallback", safe_name)
return f'attachment; filename="{_sanitize_ascii(safe_name)}"'
return f'{disposition}; filename="{_sanitize_ascii(safe_name)}"'

fallback = _sanitize_ascii(
safe_name.encode("ascii", "ignore").decode("ascii") or "download"
)
return f'attachment; filename="{fallback}"; filename*=UTF-8\'\'{encoded}'
return f'{disposition}; filename="{fallback}"; filename*=UTF-8\'\'{encoded}'
except Exception as exc: # pragma: no cover
logger.warning(
"Failed to encode filename '%s': %s. Using fallback.",
safe_name,
exc,
)
return 'attachment; filename="download"'
return f'{disposition}; filename="download"'

# Create API router
file_management_runtime_router = APIRouter(prefix="/file")
Expand Down Expand Up @@ -567,3 +574,69 @@ async def get_storage_file_batch_urls(
"failed_count": sum(1 for r in results if not r.get("success", False)),
"results": results
}

@file_management_config_router.get("/preview/{object_name:path}")
async def preview_file(
    object_name: str = PathParam(..., description="File object name to preview"),
    filename: Optional[str] = Query(None, description="Original filename for display (optional)")
):
    """
    Preview file inline in browser

    - **object_name**: File object name in storage
    - **filename**: Original filename for Content-Disposition header (optional)

    Returns file stream with Content-Disposition: inline for browser preview

    Error responses:

    - **413**: FileTooLargeException raised by the preview service
    - **404**: NotFoundException raised by the preview service
    - **400**: UnsupportedFileTypeException raised by the preview service
    - **500**: OfficeConversionException or any other unexpected error
    """
    try:
        # Get file stream from preview service
        file_stream, content_type = await preview_file_impl(object_name=object_name)

        # Use provided filename, otherwise the last path segment of object_name
        display_filename = filename or object_name.rsplit("/", 1)[-1]

        # Build Content-Disposition header for inline display
        content_disposition = build_content_disposition_header(display_filename, inline=True)

        # Percent-encode the object name before placing it in the ETag: a raw
        # non-ASCII name cannot be encoded as a header value, and a raw double
        # quote would break the quoted entity-tag syntax. Plain ASCII paths
        # (including "/") come out unchanged.
        # NOTE(review): this tag identifies the object name, not its content,
        # so it does not change when the object is overwritten.
        etag_token = quote(object_name)

        return StreamingResponse(
            file_stream,
            media_type=content_type,
            headers={
                "Content-Disposition": content_disposition,
                "Cache-Control": "public, max-age=3600",
                "ETag": f'"{etag_token}"',
            }
        )

    except FileTooLargeException as e:
        logger.warning(f"[preview_file] File too large: object_name={object_name}, error={str(e)}")
        raise HTTPException(
            status_code=HTTPStatus.REQUEST_ENTITY_TOO_LARGE,
            detail=str(e)
        ) from e
    except NotFoundException as e:
        logger.error(f"[preview_file] File not found: object_name={object_name}, error={str(e)}")
        raise HTTPException(
            status_code=HTTPStatus.NOT_FOUND,
            detail=f"File not found: {object_name}"
        ) from e
    except UnsupportedFileTypeException as e:
        logger.error(f"[preview_file] Unsupported file type: object_name={object_name}, error={str(e)}")
        raise HTTPException(
            status_code=HTTPStatus.BAD_REQUEST,
            detail=f"File format not supported for preview: {str(e)}"
        ) from e
    except OfficeConversionException as e:
        logger.error(f"[preview_file] Conversion failed: object_name={object_name}, error={str(e)}")
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
            detail=f"Failed to preview file: {str(e)}"
        ) from e
    except Exception as e:
        logger.error(f"[preview_file] Unexpected error: object_name={object_name}, error={str(e)}")
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
            detail=f"Failed to preview file: {str(e)}"
        ) from e
15 changes: 15 additions & 0 deletions backend/consts/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,21 @@ class VectorDatabaseType(str, Enum):
ROOT_DIR = os.getenv("ROOT_DIR")


# Preview Configuration
# Preview size limit, expressed in bytes (100 MB)
FILE_PREVIEW_SIZE_LIMIT = 100 * 1024 ** 2
# Limit concurrent Office-to-PDF conversions
MAX_CONCURRENT_CONVERSIONS = 5
# Supported Office file MIME types, grouped by document family
OFFICE_MIME_TYPES = [
    # Word: .doc, .docx
    "application/msword",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    # Excel: .xls, .xlsx
    "application/vnd.ms-excel",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    # PowerPoint: .ppt, .pptx
    "application/vnd.ms-powerpoint",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
]


# Supabase Configuration
SUPABASE_URL = os.getenv('SUPABASE_URL')
SUPABASE_KEY = os.getenv('SUPABASE_KEY')
Expand Down
15 changes: 15 additions & 0 deletions backend/consts/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,21 @@ class IncorrectInviteCodeException(Exception):
pass


class OfficeConversionException(Exception):
    """Signal that an Office-to-PDF conversion via the data-process service failed."""


class UnsupportedFileTypeException(Exception):
    """Signal that the requested operation does not support the given file type."""


class FileTooLargeException(Exception):
    """Signal that a file is larger than the maximum size allowed for the requested operation."""


class UserRegistrationException(Exception):
    """Signal that registering a user did not succeed."""
Expand Down
37 changes: 37 additions & 0 deletions backend/database/attachment_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,42 @@ def get_file_size_from_minio(object_name: str, bucket: Optional[str] = None) ->
return minio_client.get_file_size(object_name, bucket)


def file_exists(object_name: str, bucket: Optional[str] = None) -> bool:
    """
    Check if a file exists in the bucket.

    This is a best-effort check: any exception raised by the lookup is
    logged and reported as "does not exist" rather than propagated.

    Args:
        object_name: Object name in storage
        bucket: Bucket name, if not specified will use default bucket

    Returns:
        bool: True if file exists, False otherwise (including when the
        lookup itself fails)
    """
    # Local import keeps this block self-contained regardless of which
    # names the module imports at the top of the file.
    import logging

    try:
        return minio_client.file_exists(object_name, bucket)
    except Exception as exc:
        # Keep the best-effort False result, but leave a trace so a storage
        # failure is not silently mistaken for a missing file.
        logging.getLogger("attachment_db").warning(
            "Failed to check existence of '%s' (bucket=%s): %s",
            object_name,
            bucket,
            exc,
        )
        return False


def copy_file(source_object: str, dest_object: str, bucket: Optional[str] = None) -> Dict[str, Any]:
    """
    Copy an object to a new name within one bucket through the MinIO client.

    Args:
        source_object: Name of the object to copy from
        dest_object: Name of the object to copy to
        bucket: Bucket name; passed through as-is (None lets the client pick its default)

    Returns:
        Dict[str, Any]: {"success": True, "object_name": ...} when the client
        reports success, otherwise {"success": False, "error": ...}
    """
    ok, payload = minio_client.copy_file(source_object, dest_object, bucket)
    # The second tuple element is the destination name on success and the
    # error message on failure, so it is mapped to a different key per case.
    if not ok:
        return {"success": False, "error": payload}
    return {"success": True, "object_name": payload}


def list_files(prefix: str = "", bucket: Optional[str] = None) -> List[Dict[str, Any]]:
"""
List files in bucket
Expand Down Expand Up @@ -269,6 +305,7 @@ def get_content_type(file_path: str) -> str:
'.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
'.txt': 'text/plain',
'.csv': 'text/csv',
'.md': 'text/markdown',
'.html': 'text/html',
'.htm': 'text/html',
'.json': 'application/json',
Expand Down
27 changes: 27 additions & 0 deletions backend/database/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,33 @@ def get_file_stream(self, object_name: str, bucket: Optional[str] = None) -> Tup
"""
return self._storage_client.get_file_stream(object_name, bucket)

def file_exists(self, object_name: str, bucket: Optional[str] = None) -> bool:
    """
    Report whether an object is present in MinIO.

    Delegates to the underlying storage client's ``exists`` call.

    Args:
        object_name: Object name
        bucket: Bucket name, if not specified use default bucket

    Returns:
        bool: True if file exists, False otherwise
    """
    storage = self._storage_client
    found = storage.exists(object_name, bucket)
    return found

def copy_file(self, source_object: str, dest_object: str, bucket: Optional[str] = None) -> Tuple[bool, str]:
    """
    Copy an object to another name inside the same bucket.

    Delegates to the underlying storage client's ``copy_file`` call.

    Args:
        source_object: Source object name
        dest_object: Destination object name
        bucket: Bucket name, if not specified use default bucket

    Returns:
        Tuple[bool, str]: (Success status, Destination object name or error message)
    """
    storage = self._storage_client
    outcome = storage.copy_file(source_object, dest_object, bucket)
    return outcome


# Create global database and MinIO client instances
db_client = PostgresClient()
Expand Down
Loading
Loading