Skip to content

Commit 470db47

Browse files
committed
hotfix: OCR 처리 최적화
1 parent e8aa843 commit 470db47

2 files changed

Lines changed: 12 additions & 15 deletions

File tree

src/main/java/com/bigpicture/moonrabbit/domain/fine/service/FineTuningService.java

Lines changed: 2 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
11
package com.bigpicture.moonrabbit.domain.fine.service;
22

33
import lombok.extern.slf4j.Slf4j;
4-
import org.springframework.core.io.ByteArrayResource;
54
import org.springframework.core.io.FileSystemResource;
65
import org.springframework.http.MediaType;
76
import org.springframework.stereotype.Service;
@@ -12,7 +11,6 @@
1211
import org.springframework.web.reactive.function.client.WebClientResponseException;
1312

1413
import java.io.File;
15-
import java.nio.file.Files;
1614
import java.util.Map;
1715

1816
@Service
@@ -52,17 +50,8 @@ public String startFineTuningWithBaseModel(String jsonlPath, String baseModel) {
5250
// Step 1: Upload JSONL file
5351
MultiValueMap<String, Object> multipartBody = new LinkedMultiValueMap<>();
5452

55-
// [수정된 로직 시작: ByteArrayResource를 사용하여 filename 명시]
56-
// 1. 파일 내용을 byte 배열로 읽기 (IOException 발생 가능)
57-
byte[] fileContent = Files.readAllBytes(jsonlFile.toPath());
58-
59-
// 2. ByteArrayResource를 생성하고, getFilename()을 오버라이드하여 파일 이름(.jsonl)을 강제 주입
60-
ByteArrayResource resource = new ByteArrayResource(fileContent) {
61-
@Override
62-
public String getFilename() {
63-
return jsonlFile.getName(); // "fine_dataset_....jsonl" 파일명 사용
64-
}
65-
};
53+
// FileSystemResource를 사용하여 스트리밍 방식으로 전송 (메모리 효율성 향상)
54+
FileSystemResource resource = new FileSystemResource(jsonlFile);
6655

6756
multipartBody.add("file", resource);
6857
multipartBody.add("purpose", "fine-tune");

src/main/java/com/bigpicture/moonrabbit/domain/ocr/service/OcrPdfProcessor.java

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -55,12 +55,16 @@ public String extractTextFromPdf(File pdfFile) {
5555
for (int page = 0; page < totalPages; page++) {
5656
long pageStart = System.currentTimeMillis();
5757

58-
BufferedImage image = renderer.renderImageWithDPI(page, 300);
58+
BufferedImage image = renderer.renderImageWithDPI(page, 150); // DPI 300 → 150으로 최적화 (필요시 72로 변경 가능)
5959
Tesseract t = borrowTesseract();
6060
try {
6161
String text = t.doOCR(image);
6262
result.append("\n--- Page ").append(page + 1).append(" ---\n").append(text);
6363
} finally {
64+
// 명시적 메모리 해제
65+
if (image != null) {
66+
image.flush();
67+
}
6468
returnTesseract(t);
6569
}
6670

@@ -85,7 +89,7 @@ public String extractPageText(File pdfFile, int pageIndex) throws Exception {
8589
long start = System.currentTimeMillis();
8690
try (PDDocument document = PDDocument.load(pdfFile)) {
8791
PDFRenderer renderer = new PDFRenderer(document);
88-
BufferedImage image = renderer.renderImageWithDPI(pageIndex, 300);
92+
BufferedImage image = renderer.renderImageWithDPI(pageIndex, 150); // DPI 300 → 150으로 최적화 (필요시 72로 변경 가능)
8993

9094
Tesseract t = borrowTesseract();
9195
try {
@@ -94,6 +98,10 @@ public String extractPageText(File pdfFile, int pageIndex) throws Exception {
9498
System.out.println("[DEBUG] Single page OCR completed (" + (end - start) + "ms)");
9599
return text;
96100
} finally {
101+
// 명시적 메모리 해제
102+
if (image != null) {
103+
image.flush();
104+
}
97105
returnTesseract(t);
98106
}
99107
}

0 commit comments

Comments
 (0)