Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion js/import/import.js
Original file line number Diff line number Diff line change
Expand Up @@ -586,5 +586,5 @@ export async function importFilesSupp(files, ocrName) {

const format = /** @type {("hocr" | "abbyy" | "stext" | "textract" | "text")} */ (ocrData.format);

await convertOCR(ocrData.hocrRaw, false, format, ocrName, ocrData.reimportHocrMode);
await convertOCR(ocrData.hocrRaw, false, format, ocrName, ocrData.reimportHocrMode, pageMetricsAll);
}
19 changes: 16 additions & 3 deletions js/recognizeConvert.js
Original file line number Diff line number Diff line change
Expand Up @@ -393,9 +393,22 @@ export async function convertOCR(ocrRawArr, mainData, format, engineName, scribe
if (format === 'textract') {
if (!pageMetrics || !pageMetrics[0]?.dims) throw new Error('Page metrics must be provided for Textract data.');
const pageDims = pageMetrics.map((metrics) => (metrics.dims));
const res = await gs.convertDocTextract({ ocrStr: ocrRawArr, pageDims });
for (let n = 0; n < res.length; n++) {
await convertPageCallback(res[n], n, mainData, engineName);

// When multiple Textract entries exist (per-page files), each file contains
// blocks with Page=1. Process each individually with the correct pageNum
// to avoid merging all pages into page 0.
if (ocrRawArr.length > 1) {
for (let i = 0; i < ocrRawArr.length; i++) {
const res = await gs.convertDocTextract({ ocrStr: [ocrRawArr[i]], pageDims: [pageDims[i]], pageNum: i });
if (res.length > 0) {
await convertPageCallback(res[0], i, mainData, engineName);
}
}
} else {
const res = await gs.convertDocTextract({ ocrStr: ocrRawArr, pageDims });
for (let n = 0; n < res.length; n++) {
await convertPageCallback(res[n], n, mainData, engineName);
}
}
return;
}
Expand Down
73 changes: 73 additions & 0 deletions tests/module/evaluate.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,76 @@ describe('Check evaluate function.', function () {
await scribe.terminate();
});
}).timeout(120000);

/**
 * Suite: supplementary import of a single Textract JSON file.
 *
 * An image (with an Abbyy OCR file) is imported first, and the Textract JSON is
 * then imported through `importFilesSupp`. The first page of the resulting
 * `Textract` OCR data is compared against the known text of the fixture.
 */
describe('Check importFilesSupp works with Textract data.', function () {
  this.timeout(20000);

  it('Should import Textract data as supplementary OCR without error', async () => {
    const fixtureBase = `${ASSETS_PATH_KARMA}/ascenders_descenders_test`;

    // Joins the text of every word in a line with single spaces.
    const lineText = (line) => line.words.map((word) => word.text).join(' ');

    // The main import runs first so that page metrics exist before the Textract
    // data is converted.
    const mainFiles = [`${fixtureBase}.png`, `${fixtureBase}.abbyy.xml`];
    await scribe.importFiles(mainFiles);

    // Supplementary import of Textract data. Prior to the fix under test this
    // call threw "Page metrics must be provided for Textract data."
    const suppFiles = [`${fixtureBase}_AwsTextractLayout.json`];
    await scribe.importFilesSupp(suppFiles, 'Textract');

    // Expected text of each line on the first page, in order.
    const expectedLineText = ['Ascenders On', 'query png', 'we can'];

    const firstPage = scribe.data.ocr.Textract[0];
    assert.strictEqual(firstPage.lines.length, 3);
    for (const [lineIdx, expectedText] of expectedLineText.entries()) {
      assert.strictEqual(lineText(firstPage.lines[lineIdx]), expectedText);
    }
  }).timeout(20000);

  after(async () => {
    await scribe.terminate();
  });
}).timeout(120000);

/**
 * Suite: supplementary import of per-page Textract JSON files.
 *
 * A multi-page PDF is imported first, followed by one Textract JSON file per
 * page through `importFilesSupp`. Each page of the resulting `Textract` OCR data
 * is then checked for its line count, word count, and the text of its first line.
 */
describe('Check importFilesSupp works with per-page Textract files.', function () {
  this.timeout(30000);

  it('Should import per-page Textract files and assign each to the correct page', async () => {
    // Expected statistics for each page, indexed by page number.
    // Before the fix under test, everything was merged into page 0 (~4500 words)
    // and pages 1-6 were left empty.
    const expected = [
      { lines: 98, words: 614, firstLine: '564' },
      { lines: 33, words: 192, firstLine: '565' },
      { lines: 102, words: 674, firstLine: '566' },
      { lines: 118, words: 834, firstLine: '567' },
      { lines: 120, words: 831, firstLine: '568' },
      { lines: 109, words: 732, firstLine: '569' },
      { lines: 100, words: 659, firstLine: '570' },
    ];

    const docBase = `${ASSETS_PATH_KARMA}/trident_v_connecticut_general`;

    // The PDF is imported first so page metrics are populated for every page.
    await scribe.importFiles([`${docBase}.pdf`]);

    // One Textract JSON per page; file names carry a zero-padded 3-digit page index.
    const perPageFiles = Array.from({ length: expected.length }, (_, pageIdx) => {
      const paddedIdx = String(pageIdx).padStart(3, '0');
      return `${docBase}/awsTextract/trident_v_connecticut_general_${paddedIdx}-AwsTextractLayoutSync.json`;
    });
    await scribe.importFilesSupp(perPageFiles, 'Textract');

    for (const [pageIdx, pageExpectation] of expected.entries()) {
      const ocrPage = scribe.data.ocr.Textract[pageIdx];

      // Both derived values are computed before any assertion runs.
      let totalWords = 0;
      for (const line of ocrPage.lines) {
        totalWords += line.words.length;
      }
      const firstLineText = ocrPage.lines[0].words.map((word) => word.text).join(' ');

      assert.strictEqual(ocrPage.lines.length, pageExpectation.lines, `Page ${pageIdx} line count`);
      assert.strictEqual(totalWords, pageExpectation.words, `Page ${pageIdx} word count`);
      assert.strictEqual(firstLineText, pageExpectation.firstLine, `Page ${pageIdx} first line text`);
    }
  }).timeout(30000);

  after(async () => {
    await scribe.terminate();
  });
}).timeout(120000);
Loading