From b17f0736af23b83dbab2baf6e83ff8894398bca8 Mon Sep 17 00:00:00 2001
From: Balearica <admin@scribeocr.com>
Date: Mon, 30 Mar 2026 08:09:53 -0700
Subject: [PATCH] Added support for Textract as supp data; Added support for
 Textract multi-file imports

---
 js/import/import.js           |  2 +-
 js/recognizeConvert.js        | 19 +++++++--
 tests/module/evaluate.spec.js | 73 +++++++++++++++++++++++++++++++++++
 3 files changed, 90 insertions(+), 4 deletions(-)

diff --git a/js/import/import.js b/js/import/import.js
index df24acfc..76f5ea73 100644
--- a/js/import/import.js
+++ b/js/import/import.js
@@ -586,5 +586,5 @@ export async function importFilesSupp(files, ocrName) {
 
   const format = /** @type {("hocr" | "abbyy" | "stext" | "textract" | "text")} */ (ocrData.format);
 
-  await convertOCR(ocrData.hocrRaw, false, format, ocrName, ocrData.reimportHocrMode);
+  await convertOCR(ocrData.hocrRaw, false, format, ocrName, ocrData.reimportHocrMode, pageMetricsAll);
 }
diff --git a/js/recognizeConvert.js b/js/recognizeConvert.js
index 9f0fde84..d8dd9e1d 100644
--- a/js/recognizeConvert.js
+++ b/js/recognizeConvert.js
@@ -393,9 +393,22 @@ export async function convertOCR(ocrRawArr, mainData, format, engineName, scribe
   if (format === 'textract') {
     if (!pageMetrics || !pageMetrics[0]?.dims) throw new Error('Page metrics must be provided for Textract data.');
     const pageDims = pageMetrics.map((metrics) => (metrics.dims));
-    const res = await gs.convertDocTextract({ ocrStr: ocrRawArr, pageDims });
-    for (let n = 0; n < res.length; n++) {
-      await convertPageCallback(res[n], n, mainData, engineName);
+
+    // When multiple Textract entries exist (per-page files), each file contains
+    // blocks with Page=1. Process each individually with the correct pageNum
+    // to avoid merging all pages into page 0.
+    if (ocrRawArr.length > 1) {
+      for (let i = 0; i < ocrRawArr.length; i++) {
+        const res = await gs.convertDocTextract({ ocrStr: [ocrRawArr[i]], pageDims: [pageDims[i]], pageNum: i });
+        if (res.length > 0) {
+          await convertPageCallback(res[0], i, mainData, engineName);
+        }
+      }
+    } else {
+      const res = await gs.convertDocTextract({ ocrStr: ocrRawArr, pageDims });
+      for (let n = 0; n < res.length; n++) {
+        await convertPageCallback(res[n], n, mainData, engineName);
+      }
     }
     return;
   }
diff --git a/tests/module/evaluate.spec.js b/tests/module/evaluate.spec.js
index 9c702753..dc9d2cd3 100644
--- a/tests/module/evaluate.spec.js
+++ b/tests/module/evaluate.spec.js
@@ -35,3 +35,76 @@ describe('Check evaluate function.', function () {
     await scribe.terminate();
   });
 }).timeout(120000);
+
+describe('Check importFilesSupp works with Textract data.', function () {
+  this.timeout(20000);
+  // Regression test: convertOCR throws for Textract input when no page metrics are supplied.
+  it('Should import Textract data as supplementary OCR without error', async () => {
+    // Import image first to populate page metrics (required by Textract format).
+    await scribe.importFiles([`${ASSETS_PATH_KARMA}/ascenders_descenders_test.png`,
+      `${ASSETS_PATH_KARMA}/ascenders_descenders_test.abbyy.xml`]);
+
+    // Import Textract as supplementary data — this would throw
+    // "Page metrics must be provided for Textract data." before the fix.
+    await scribe.importFilesSupp(
+      [`${ASSETS_PATH_KARMA}/ascenders_descenders_test_AwsTextractLayout.json`],
+      'Textract',
+    );
+
+    // Verify page 0 of the imported Textract data has exactly 3 lines with the expected words.
+    const textractPage = scribe.data.ocr.Textract[0];
+    assert.strictEqual(textractPage.lines.length, 3);
+    assert.strictEqual(textractPage.lines[0].words.map((x) => x.text).join(' '), 'Ascenders On');
+    assert.strictEqual(textractPage.lines[1].words.map((x) => x.text).join(' '), 'query png');
+    assert.strictEqual(textractPage.lines[2].words.map((x) => x.text).join(' '), 'we can');
+    // NOTE(review): this.timeout(20000) above, .timeout(20000) on this test, and
+    // .timeout(120000) chained on describe all set timeouts; confirm which is intended.
+  }).timeout(20000);
+
+  after(async () => {
+    await scribe.terminate();
+  });
+}).timeout(120000);
+
+describe('Check importFilesSupp works with per-page Textract files.', function () {
+  this.timeout(30000);
+  // Regression test: one Textract file per page must map to its own page index, not all to page 0.
+  it('Should import per-page Textract files and assign each to the correct page', async () => {
+    const subDir = `${ASSETS_PATH_KARMA}/trident_v_connecticut_general`;
+
+    // Import the multi-page PDF first to populate page metrics.
+    await scribe.importFiles([`${ASSETS_PATH_KARMA}/trident_v_connecticut_general.pdf`]);
+
+    // Import 7 per-page Textract JSON files (zero-padded suffixes 000 to 006) as supplementary data.
+    const textractFiles = [];
+    for (let i = 0; i < 7; i++) {
+      textractFiles.push(`${subDir}/awsTextract/trident_v_connecticut_general_${String(i).padStart(3, '0')}-AwsTextractLayoutSync.json`);
+    }
+    await scribe.importFilesSupp(textractFiles, 'Textract');
+
+    // Verify each page has the expected line/word counts and correct first line.
+    // Before the fix, all data merged into page 0 (~4500 words) and pages 1-6 were empty.
+    const expected = [
+      { lines: 98, words: 614, firstLine: '564' },
+      { lines: 33, words: 192, firstLine: '565' },
+      { lines: 102, words: 674, firstLine: '566' },
+      { lines: 118, words: 834, firstLine: '567' },
+      { lines: 120, words: 831, firstLine: '568' },
+      { lines: 109, words: 732, firstLine: '569' },
+      { lines: 100, words: 659, firstLine: '570' },
+    ];
+    // Compare page i against expected[i]; the third assert argument labels which page failed.
+    for (let i = 0; i < 7; i++) {
+      const page = scribe.data.ocr.Textract[i];
+      const wordCount = page.lines.reduce((sum, l) => sum + l.words.length, 0);
+      const firstLine = page.lines[0].words.map((w) => w.text).join(' ');
+      assert.strictEqual(page.lines.length, expected[i].lines, `Page ${i} line count`);
+      assert.strictEqual(wordCount, expected[i].words, `Page ${i} word count`);
+      assert.strictEqual(firstLine, expected[i].firstLine, `Page ${i} first line text`);
+    }
+  }).timeout(30000);
+
+  after(async () => {
+    await scribe.terminate();
+  });
+}).timeout(120000);