@@ -110,6 +110,44 @@ describe('Check AWS Textract JSON import correctly handles angle brackets.', fun
110110 } ) ;
111111} ) . timeout ( 120000 ) ;
112112
/**
 * Regression suite: a scribe JSON file whose `ocr` array contains `null`
 * entries must import without throwing.
 *
 * The fixture is produced at runtime: a PDF is imported, exported in the
 * 'scribe' format, and `null` entries are injected at both ends of the
 * exported `ocr` array before the result is re-imported.
 */
describe('Check scribe JSON import handles null OCR pages (blank pages).', function () {
  this.timeout(10000);

  it('Should import scribe JSON that has null entries in the OCR array without crashing', async () => {
    // Import a known-good file and export it to obtain a baseline scribe document.
    await scribe.importFiles([`${ASSETS_PATH_KARMA}/E.D.Mich._2_12-cv-13821-AC-DRG_1_0.pdf`]);

    // The export is fed to JSON.parse below, so compression is switched off first.
    // NOTE(review): presumably a compressed export is not plain JSON text — confirm.
    // The `after` hook restores this option.
    scribe.opt.compressScribe = false;
    const scribeStr = await scribe.exportData('scribe');
    const scribeObj = JSON.parse(scribeStr);

    // Inject null at the beginning and end to simulate blank/cover pages.
    scribeObj.ocr.unshift(null);
    scribeObj.ocr.push(null);

    // Re-serialize and hand the document over as an ArrayBuffer via `scribeFiles`.
    const modified = JSON.stringify(scribeObj);
    const buffer = new TextEncoder().encode(modified).buffer;

    await scribe.terminate();
    await scribe.importFiles({ scribeFiles: [buffer] });

    const pages = scribe.data.ocr.active;

    // Page 0 was null in the input, so it should be an empty placeholder page with default dims.
    assert.strictEqual(pages[0].lines.length, 0);
    assert.strictEqual(pages[0].dims.width, 1080);
    assert.strictEqual(pages[0].dims.height, 1920);

    // Page 1 should be the original first page with real OCR data.
    assert.isTrue(pages[1].lines.length > 0);
    assert.strictEqual(pages[1].lines[0].words[0].text, 'UNITED');

    // The trailing null entry is only covered implicitly: the import above must not throw.
  }).timeout(10000);

  after(async () => {
    // Undo the option change made by the test, then release scribe state.
    scribe.opt.compressScribe = true;
    await scribe.clear();
    await scribe.terminate();
  });
}).timeout(120000);
150+
113151describe ( 'Check AWS Textract properly splits unicode superscript footnotes.' , function ( ) {
114152 this . timeout ( 10000 ) ;
115153
0 commit comments