From b17f0736af23b83dbab2baf6e83ff8894398bca8 Mon Sep 17 00:00:00 2001 From: Balearica Date: Mon, 30 Mar 2026 08:09:53 -0700 Subject: [PATCH] Added support for Textract as supp data; Added support for Textract multi-file imports --- js/import/import.js | 2 +- js/recognizeConvert.js | 19 +++++++-- tests/module/evaluate.spec.js | 73 +++++++++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+), 4 deletions(-) diff --git a/js/import/import.js b/js/import/import.js index df24acfc..76f5ea73 100644 --- a/js/import/import.js +++ b/js/import/import.js @@ -586,5 +586,5 @@ export async function importFilesSupp(files, ocrName) { const format = /** @type {("hocr" | "abbyy" | "stext" | "textract" | "text")} */ (ocrData.format); - await convertOCR(ocrData.hocrRaw, false, format, ocrName, ocrData.reimportHocrMode); + await convertOCR(ocrData.hocrRaw, false, format, ocrName, ocrData.reimportHocrMode, pageMetricsAll); } diff --git a/js/recognizeConvert.js b/js/recognizeConvert.js index 9f0fde84..d8dd9e1d 100644 --- a/js/recognizeConvert.js +++ b/js/recognizeConvert.js @@ -393,9 +393,22 @@ export async function convertOCR(ocrRawArr, mainData, format, engineName, scribe if (format === 'textract') { if (!pageMetrics || !pageMetrics[0]?.dims) throw new Error('Page metrics must be provided for Textract data.'); const pageDims = pageMetrics.map((metrics) => (metrics.dims)); - const res = await gs.convertDocTextract({ ocrStr: ocrRawArr, pageDims }); - for (let n = 0; n < res.length; n++) { - await convertPageCallback(res[n], n, mainData, engineName); + + // When multiple Textract entries exist (per-page files), each file contains + // blocks with Page=1. Process each individually with the correct pageNum + // to avoid merging all pages into page 0. + if (ocrRawArr.length > 1) { + for (let i = 0; i < ocrRawArr.length; i++) { + const res = await gs.convertDocTextract({ ocrStr: [ocrRawArr[i]], pageDims: [pageDims[i]], pageNum: i }); + if (res.length > 0) { + await convertPageCallback(res[0], i, mainData, engineName); + } + } + } else { + const res = await gs.convertDocTextract({ ocrStr: ocrRawArr, pageDims }); + for (let n = 0; n < res.length; n++) { + await convertPageCallback(res[n], n, mainData, engineName); + } } return; } diff --git a/tests/module/evaluate.spec.js b/tests/module/evaluate.spec.js index 9c702753..dc9d2cd3 100644 --- a/tests/module/evaluate.spec.js +++ b/tests/module/evaluate.spec.js @@ -35,3 +35,76 @@ describe('Check evaluate function.', function () { await scribe.terminate(); }); }).timeout(120000); + +describe('Check importFilesSupp works with Textract data.', function () { + this.timeout(20000); + + it('Should import Textract data as supplementary OCR without error', async () => { + // Import image first to populate page metrics (required by Textract format). + await scribe.importFiles([`${ASSETS_PATH_KARMA}/ascenders_descenders_test.png`, + `${ASSETS_PATH_KARMA}/ascenders_descenders_test.abbyy.xml`]); + + // Import Textract as supplementary data — this would throw + // "Page metrics must be provided for Textract data." before the fix. + await scribe.importFilesSupp( + [`${ASSETS_PATH_KARMA}/ascenders_descenders_test_AwsTextractLayout.json`], + 'Textract', + ); + + // Verify the imported data matches expected content exactly. + const textractPage = scribe.data.ocr.Textract[0]; + assert.strictEqual(textractPage.lines.length, 3); + assert.strictEqual(textractPage.lines[0].words.map((x) => x.text).join(' '), 'Ascenders On'); + assert.strictEqual(textractPage.lines[1].words.map((x) => x.text).join(' '), 'query png'); + assert.strictEqual(textractPage.lines[2].words.map((x) => x.text).join(' '), 'we can'); + + + }).timeout(20000); + + after(async () => { + await scribe.terminate(); + }); +}).timeout(120000); + +describe('Check importFilesSupp works with per-page Textract files.', function () { + this.timeout(30000); + + it('Should import per-page Textract files and assign each to the correct page', async () => { + const subDir = `${ASSETS_PATH_KARMA}/trident_v_connecticut_general`; + + // Import the multi-page PDF first to populate page metrics. + await scribe.importFiles([`${ASSETS_PATH_KARMA}/trident_v_connecticut_general.pdf`]); + + // Import 7 per-page Textract JSON files as supplementary data. + const textractFiles = []; + for (let i = 0; i < 7; i++) { + textractFiles.push(`${subDir}/awsTextract/trident_v_connecticut_general_${String(i).padStart(3, '0')}-AwsTextractLayoutSync.json`); + } + await scribe.importFilesSupp(textractFiles, 'Textract'); + + // Verify each page has the expected line/word counts and correct first line. + // Before the fix, all data merged into page 0 (~4500 words) and pages 1-6 were empty. + const expected = [ + { lines: 98, words: 614, firstLine: '564' }, + { lines: 33, words: 192, firstLine: '565' }, + { lines: 102, words: 674, firstLine: '566' }, + { lines: 118, words: 834, firstLine: '567' }, + { lines: 120, words: 831, firstLine: '568' }, + { lines: 109, words: 732, firstLine: '569' }, + { lines: 100, words: 659, firstLine: '570' }, + ]; + + for (let i = 0; i < 7; i++) { + const page = scribe.data.ocr.Textract[i]; + const wordCount = page.lines.reduce((sum, l) => sum + l.words.length, 0); + const firstLine = page.lines[0].words.map((w) => w.text).join(' '); + assert.strictEqual(page.lines.length, expected[i].lines, `Page ${i} line count`); + assert.strictEqual(wordCount, expected[i].words, `Page ${i} word count`); + assert.strictEqual(firstLine, expected[i].firstLine, `Page ${i} first line text`); + } + }).timeout(30000); + + after(async () => { + await scribe.terminate(); + }); +}).timeout(120000);