diff --git a/backend/src/main/java/com/storage/engine/service/adapter/StorageUtils.java b/backend/src/main/java/com/storage/engine/service/adapter/StorageUtils.java index 6de7d80..4eddcb8 100644 --- a/backend/src/main/java/com/storage/engine/service/adapter/StorageUtils.java +++ b/backend/src/main/java/com/storage/engine/service/adapter/StorageUtils.java @@ -125,7 +125,6 @@ public static String toFileLeafPath(String iginxPath, String originalFileName) { if (slash >= 0 && slash < base.length() - 1) { base = base.substring(slash + 1); } - base = base.replaceAll("[^a-zA-Z0-9._-]", "_"); if (base.isEmpty()) { base = "content.bin"; } diff --git a/backend/src/main/resources/udf/metadata/extractors/document_extractor.py b/backend/src/main/resources/udf/metadata/extractors/document_extractor.py index d8556e7..a0909e8 100644 --- a/backend/src/main/resources/udf/metadata/extractors/document_extractor.py +++ b/backend/src/main/resources/udf/metadata/extractors/document_extractor.py @@ -6,6 +6,7 @@ class DocumentMetadataExtractor(BaseMetadataExtractor): USER_PROMPT_TEMPLATE = ( "请从输入的文档文本中抽取语义三元组。 " + "尽可能提取代表元数据的语义,而非具体的数值或事实。 " "请严格只返回 JSON,不要输出 Markdown 或解释。 " "返回格式要求:" "{{\"entities\":[\"entity\"],\"triples\":[{{\"subject\":\"entityA\",\"predicate\":\"relation\",\"object\":\"entityB\"}}]}}. " @@ -15,7 +16,7 @@ class DocumentMetadataExtractor(BaseMetadataExtractor): ) USER_RETRY_PROMPT_TEMPLATE = ( - "你上一轮可能没有返回可用三元组。请再次检查内容并尽量抽取核心语义关系;若确实无关系,triples 返回空数组。 " + "你上一轮可能没有返回可用三元组。请再次检查内容并尽量抽取核心语义关系;尽可能提取代表元数据的语义,而非具体的数值或事实;若确实无关系,triples 返回空数组。 " "请严格只返回 JSON,不要输出 Markdown 或解释。 " "返回格式要求:" "{{\"entities\":[\"entity\"],\"triples\":[{{\"subject\":\"entityA\",\"predicate\":\"relation\",\"object\":\"entityB\"}}]}}. " diff --git a/frontend/app.js b/frontend/app.js index efdf641..13e76a4 100644 --- a/frontend/app.js +++ b/frontend/app.js @@ -113,7 +113,7 @@ function renderPagination(containerId, stateKey, totalPages, total, onPageChange function formatBytes(bytes) { const value = Number(bytes || 0); if (!Number.isFinite(value) || value <= 0) return '0 B'; - const units = ['B', 'KB', 'MB', 'GB', 'TB']; + const units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']; let size = value; let idx = 0; while (size >= 1024 && idx < units.length - 1) { @@ -2982,7 +2982,7 @@ function escapeHtml(str) { function formatFileSize(bytes) { if (!bytes || bytes === 0) return '0 B'; - const units = ['B', 'KB', 'MB', 'GB', 'TB']; + const units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']; const i = Math.floor(Math.log(bytes) / Math.log(1024)); return (bytes / Math.pow(1024, i)).toFixed(1) + ' ' + units[i]; } diff --git a/test/batch_add_datasource.py b/test/batch_add_datasource.py index 8961a8e..b70c08b 100644 --- a/test/batch_add_datasource.py +++ b/test/batch_add_datasource.py @@ -25,6 +25,8 @@ SSH_PORT = 22 REQUEST_DELAY = 0.1 # 请求间隔(秒) +BATCH_SIZE = 10 # 每批请求数量 +BATCH_REQUEST_DELAY = 0 # 每批请求间隔(秒) TIMEOUT = 60 # 请求超时(秒) # ==================== 颜色输出 ==================== @@ -124,8 +126,8 @@ def main(): Colors.RED ) - # 每10个打印进度 - if i % 10 == 0: + # 每 BATCH_SIZE 个打印进度 + if i % BATCH_SIZE == 0: progress = (i * 100.0) / TOTAL_COUNT print() print_colored( @@ -134,6 +136,7 @@ def main(): Colors.YELLOW ) print() + time.sleep(BATCH_REQUEST_DELAY) # 延迟 time.sleep(REQUEST_DELAY)