Skip to content

Commit 8b0bc85

Browse files
committed
Fixed bug where importing data with missing page data caused a crash
1 parent 5033ebb commit 8b0bc85

6 files changed

Lines changed: 51 additions & 8 deletions

File tree

js/import/import.js

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ import { calcSuppFontInfo } from '../fontSupp.js';
2323
import { gs } from '../generalWorkerMain.js';
2424
import { imageUtils, ImageWrapper } from '../objects/imageObjects.js';
2525
import { addCircularRefsDataTables, LayoutDataTablePage, LayoutPage } from '../objects/layoutObjects.js';
26-
import { addCircularRefsOcr, updateOcrFormat } from '../objects/ocrObjects.js';
26+
import { OcrPage, addCircularRefsOcr, updateOcrFormat } from '../objects/ocrObjects.js';
2727
import { PageMetrics } from '../objects/pageMetricsObjects.js';
2828
import { checkCharWarn, convertOCR } from '../recognizeConvert.js';
2929
import { importImageFileToBase64 } from '../utils/imageUtils.js';
@@ -199,10 +199,11 @@ const restoreSessionFromFile = async (scribeFile) => {
199199
ocrAll.active = ocrAll[oemName];
200200

201201
for (let i = 0; i < ocrAll[oemName].length; i++) {
202-
inputData.xmlMode[i] = true;
203-
if (ocrAll[oemName][i].dims.height && ocrAll[oemName][i].dims.width) {
204-
pageMetricsAll[i] = new PageMetrics(ocrAll[oemName][i].dims);
202+
if (!ocrAll[oemName][i]) {
203+
ocrAll[oemName][i] = new OcrPage(i, { height: 1920, width: 1080 });
205204
}
205+
inputData.xmlMode[i] = true;
206+
pageMetricsAll[i] = new PageMetrics(ocrAll[oemName][i].dims);
206207
pageMetricsAll[i].angle = ocrAll[oemName][i].angle;
207208
}
208209
};

js/objects/ocrObjects.js

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -868,6 +868,7 @@ export const removeCircularRefsOcr = (pages, options = {}) => {
868868
const { includeText = false } = options;
869869
const pagesClone = structuredClone(pages);
870870
pagesClone.forEach((page) => {
871+
if (!page) return;
871872
// Add page-level text if requested (must be done before modifying lines)
872873
if (includeText) {
873874
// @ts-ignore
@@ -937,6 +938,7 @@ export const removeCircularRefsOcr = (pages, options = {}) => {
937938
*/
938939
export const addCircularRefsOcr = (pages) => {
939940
pages.forEach((page) => {
941+
if (!page) return;
940942
// Remove text property if present (added during export with includeText option)
941943
// @ts-ignore
942944
delete page.text;
@@ -1019,6 +1021,7 @@ export const addCircularRefsOcr = (pages) => {
10191021
*/
10201022
export const updateOcrFormat = (pages) => {
10211023
pages.forEach((page) => {
1024+
if (!page) return;
10221025
page.lines.forEach((line) => {
10231026
if (!line.debug) {
10241027
line.debug = new LineDebugInfo();

js/recognizeConvert.js

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import { calcCharMetricsFromPages } from './fontStatistics.js';
1313
import { gs } from './generalWorkerMain.js';
1414
import { ImageWrapper } from './objects/imageObjects.js';
1515
import { LayoutDataTablePage, LayoutPage } from './objects/layoutObjects.js';
16+
import { OcrPage } from './objects/ocrObjects.js';
1617
import { PageMetrics } from './objects/pageMetricsObjects.js';
1718
import { clearObjectProperties } from './utils/miscUtils.js';
1819

@@ -659,6 +660,7 @@ async function recognizeCustomModel(options) {
659660
const nativeN = await ImageCache.getNative(n);
660661
if (!nativeN) {
661662
opt.warningHandler(`No image found for page ${n}, skipping.`);
663+
ocrAll[engineName][n] = new OcrPage(n, pageMetricsAll[n].dims);
662664
return;
663665
}
664666

@@ -715,6 +717,7 @@ async function recognizeCustomModel(options) {
715717
const errMsg = result.error ? result.error.message : 'Unknown error';
716718
failedPages.push(n);
717719
opt.warningHandler(`Recognition failed for page ${n}: ${errMsg}`);
720+
ocrAll[engineName][n] = new OcrPage(n, pageMetricsAll[n].dims);
718721
consecutiveFailures++;
719722
lastErrorMessage = errMsg;
720723
if (consecutiveFailures >= maxConsecutiveFailures) {

tests/module/exportMarkdown.spec.js

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,7 @@ describe('Check markdown table export.', function () {
7979
this.timeout(10000);
8080

8181
it('Should export tables as markdown pipe tables', async () => {
82-
await scribe.importFiles([`${ASSETS_PATH_KARMA}/border_patrol_tables.pdf`,
83-
`${ASSETS_PATH_KARMA}/border_patrol_tables.abbyy.xml`]);
82+
await scribe.importFiles([`${ASSETS_PATH_KARMA}/border_patrol_tables.abbyy.xml`]);
8483

8584
const exportedMd = await scribe.exportData('md', { pageArr: [2] });
8685

tests/module/importAbbyy.spec.js

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -191,8 +191,7 @@ describe('Check Abbyy XML table import.', function () {
191191
this.timeout(20000);
192192

193193
it('Should import Abbyy XML with PDF document', async () => {
194-
await scribe.importFiles([`${ASSETS_PATH_KARMA}/border_patrol_tables.pdf`,
195-
`${ASSETS_PATH_KARMA}/border_patrol_tables.abbyy.xml`]);
194+
await scribe.importFiles([`${ASSETS_PATH_KARMA}/border_patrol_tables.abbyy.xml`]);
196195

197196
assert.isTrue(scribe.data.ocr.active[0].lines.length > 0);
198197
}).timeout(20000);

tests/module/importAwsTextract.spec.js

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,44 @@ describe('Check AWS Textract JSON import correctly handles angle brackets.', fun
110110
});
111111
}).timeout(120000);
112112

113+
describe('Check scribe JSON import handles null OCR pages (blank pages).', function () {
114+
this.timeout(10000);
115+
116+
it('Should import scribe JSON that has null entries in the OCR array without crashing', async () => {
117+
// Import a known-good PDF, export it as uncompressed scribe JSON, then inject null OCR entries to simulate blank pages.
118+
await scribe.importFiles([`${ASSETS_PATH_KARMA}/E.D.Mich._2_12-cv-13821-AC-DRG_1_0.pdf`]);
119+
120+
scribe.opt.compressScribe = false;
121+
const scribeStr = await scribe.exportData('scribe');
122+
const scribeObj = JSON.parse(scribeStr);
123+
124+
// Inject null at the beginning and end to simulate blank/cover pages.
125+
scribeObj.ocr.unshift(null);
126+
scribeObj.ocr.push(null);
127+
128+
const modified = JSON.stringify(scribeObj);
129+
const encoder = new TextEncoder();
130+
const buffer = encoder.encode(modified).buffer;
131+
132+
await scribe.terminate();
133+
await scribe.importFiles({ scribeFiles: [buffer] });
134+
135+
// Page 0 was null in the input, so it should be an empty placeholder page with default dims.
136+
assert.strictEqual(scribe.data.ocr.active[0].lines.length, 0);
137+
assert.strictEqual(scribe.data.ocr.active[0].dims.width, 1080);
138+
assert.strictEqual(scribe.data.ocr.active[0].dims.height, 1920);
139+
// Page 1 should be the original first page with real OCR data.
140+
assert.isTrue(scribe.data.ocr.active[1].lines.length > 0);
141+
assert.strictEqual(scribe.data.ocr.active[1].lines[0].words[0].text, 'UNITED');
142+
}).timeout(10000);
143+
144+
after(async () => {
145+
scribe.opt.compressScribe = true;
146+
await scribe.clear();
147+
await scribe.terminate();
148+
});
149+
}).timeout(120000);
150+
113151
describe('Check AWS Textract properly splits unicode superscript footnotes.', function () {
114152
this.timeout(10000);
115153

0 commit comments

Comments
 (0)