Skip to content

Commit 19d00db

Browse files
committed
Updated
1 parent 4c6ae80 commit 19d00db

1 file changed

Lines changed: 113 additions & 13 deletions

File tree

mcp/index.js

Lines changed: 113 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,14 @@ const scribe = scribeModule.default;
2929
const writeTextModule = await import(resolve(__dirname, '..', 'js', 'export', 'writeText.js'));
3030
const { writeText } = writeTextModule;
3131

32+
// Import assignParagraphs for paragraph detection when not already assigned.
33+
const reflowParsModule = await import(resolve(__dirname, '..', 'js', 'utils', 'reflowPars.js'));
34+
const { assignParagraphs } = reflowParsModule;
35+
36+
// Import pageMetricsAll for angle data needed by assignParagraphs.
37+
const dataContainerModule = await import(resolve(__dirname, '..', 'js', 'containers', 'dataContainer.js'));
38+
const { pageMetricsAll } = dataContainerModule;
39+
3240
const SUPPORTED_EXTENSIONS = ['.pdf', '.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp', '.tiff', '.tif'];
3341
const DATA_EXTENSIONS = ['.scribe.json', '.json', '.json.gz', '.hocr', '.xml', '.stext', '.txt', '.docx'];
3442

@@ -168,6 +176,7 @@ async function listDocuments({ directory, dataDir }) {
168176

169177
async function extractDocumentText({
170178
file, startPage, maxChars, preserveSpacing, dataFile,
179+
parAnnots, footnoteAnnots,
171180
}) {
172181
let filePath;
173182
let dataFilePath;
@@ -196,17 +205,27 @@ async function extractDocumentText({
196205

197206
let text = '';
198207
let endPage = start;
199-
for (let p = start; p < pageCount; p++) {
200-
// Call writeText directly with lineNumbers param instead of mutating scribe.opt.
201-
const pageText = writeText({
202-
ocrCurrent: scribe.data.ocr.active,
203-
pageArr: [p],
204-
lineNumbers: true,
205-
preserveSpacing: preserveSpacing || false,
206-
});
207-
if (text.length > 0 && text.length + pageText.length > limit) break;
208-
text += pageText;
209-
endPage = p;
208+
209+
if (parAnnots || footnoteAnnots) {
210+
for (let p = start; p < pageCount; p++) {
211+
const pageText = buildStructuredPageText(p, { parAnnots, footnoteAnnots });
212+
if (text.length > 0 && text.length + pageText.length > limit) break;
213+
text += pageText;
214+
endPage = p;
215+
}
216+
} else {
217+
for (let p = start; p < pageCount; p++) {
218+
// Call writeText directly with lineNumbers param instead of mutating scribe.opt.
219+
const pageText = writeText({
220+
ocrCurrent: scribe.data.ocr.active,
221+
pageArr: [p],
222+
lineNumbers: true,
223+
preserveSpacing: preserveSpacing || false,
224+
});
225+
if (text.length > 0 && text.length + pageText.length > limit) break;
226+
text += pageText;
227+
endPage = p;
228+
}
210229
}
211230

212231
return {
@@ -218,6 +237,73 @@ async function extractDocumentText({
218237
};
219238
}
220239

240+
/**
 * Build structured text for a single page, with optional paragraph boundaries and footnote annotations.
 *
 * Each non-empty line is emitted as `\n{pageIdx}:{lineIdx} {words joined by a space}`.
 * With `parAnnots`, a header `\n--- par:{id} [{type}] ---` is inserted before the first emitted
 * line and whenever the paragraph ID changes; lines without a paragraph are labelled
 * `par:unknown` with type `body`.
 * With `footnoteAnnots`, words carrying `footnoteParId` are annotated after their line, and
 * (only when `parAnnots` is also set) footnote paragraph headers show the referencing word.
 *
 * @param {number} pageIdx - 0-based page index
 * @param {Object} opts
 * @param {boolean} [opts.parAnnots]
 * @param {boolean} [opts.footnoteAnnots]
 * @returns {string} Structured page text, or an empty string when the page is missing or has no lines.
 */
function buildStructuredPageText(pageIdx, { parAnnots, footnoteAnnots }) {
  const pageObj = scribe.data.ocr.active[pageIdx];
  if (!pageObj || pageObj.lines.length === 0) return '';

  // Ensure paragraphs are assigned if not already present.
  const hasPars = pageObj.pars && pageObj.pars.length > 0;
  if (!hasPars && parAnnots) {
    const angle = pageMetricsAll[pageIdx]?.angle || 0;
    assignParagraphs(pageObj, angle);
  }

  // Word ID -> { lineIdx, text }, built on first use so pages without footnote
  // paragraphs pay nothing and pages with several are scanned only once.
  let wordIndex = null;
  const locateWord = (wordId) => {
    if (wordIndex === null) {
      wordIndex = new Map();
      for (let li = 0; li < pageObj.lines.length; li++) {
        const candidate = pageObj.lines[li];
        // Skip missing line entries, mirroring the guard used in the main loop below.
        if (!candidate) continue;
        for (const word of candidate.words) {
          // Keep the first occurrence so the result matches a top-to-bottom search.
          if (!wordIndex.has(word.id)) wordIndex.set(word.id, { lineIdx: li, text: word.text });
        }
      }
    }
    return wordIndex.get(wordId);
  };

  let out = '';
  // `undefined` means "no header emitted yet". It is distinct from `null` ("line has no
  // paragraph") so that leading lines without a paragraph still receive a header.
  let currentParId;

  for (let h = 0; h < pageObj.lines.length; h++) {
    const line = pageObj.lines[h];
    if (!line || line.words.length === 0) continue;

    // Insert paragraph header when the paragraph changes.
    const par = line.par || null;
    const parId = par?.id || null;

    if (parAnnots && parId !== currentParId) {
      let header = `\n--- par:${parId || 'unknown'} [${par?.type || 'body'}]`;
      // For footnote paragraphs, show what word/line they reference.
      if (footnoteAnnots && par?.type === 'footnote' && par.footnoteRefId) {
        const refWordId = par.footnoteRefId;
        const found = locateWord(refWordId);
        // Fall back to the raw word ID when the referenced word is not on this page.
        const refInfo = found ? `${pageIdx}:${found.lineIdx} "${found.text}"` : refWordId;
        header += ` ref:${refInfo}`;
      }
      header += ' ---';
      out += header;
      currentParId = parId;
    }

    // Build line text.
    const lineText = line.words.map((w) => w.text).join(' ');
    out += `\n${pageIdx}:${h} ${lineText}`;

    // Annotate footnote references on this line.
    if (footnoteAnnots) {
      for (const w of line.words) {
        if (w.footnoteParId) out += ` [footnote "${w.text}" → par:${w.footnoteParId}]`;
      }
    }
  }

  return out;
}
306+
221307
async function recognizeDocument({ file, langs, dataFile }) {
222308
let filePath;
223309
let dataFilePath;
@@ -561,7 +647,8 @@ const TOOLS = [
561647
name: 'extract_document_text',
562648
description: 'Extract text from a PDF or image document. Returns text with page:line number prefixes (e.g. "0:5 some text") so lines can be referenced for highlighting. '
563649
+ 'Handles text-native and image-based PDFs (via OCR). For large documents, returns text in chunks — check "hasMore" and use "startPage" to get the next chunk. '
564-
+ 'A companion data file can be provided to use existing OCR data instead of re-running recognition.',
650+
+ 'A companion data file can be provided to use existing OCR data instead of re-running recognition. '
651+
+ 'Use parAnnots and/or footnoteAnnots to add document structure annotations to the output.',
565652
inputSchema: {
566653
type: 'object',
567654
properties: {
@@ -583,7 +670,20 @@ const TOOLS = [
583670
},
584671
preserveSpacing: {
585672
type: 'boolean',
586-
description: 'Preserve horizontal spacing from the document layout by padding words with spaces based on their position. Makes table columns visually aligned in the output. Default: false.',
673+
description: 'Preserve horizontal spacing from the document layout by padding words with spaces based on their position. '
674+
+ 'Makes table columns visually aligned in the output. Default: false.',
675+
},
676+
parAnnots: {
677+
type: 'boolean',
678+
description: 'Annotate each group of lines with its paragraph ID and type '
679+
+ '(body, title, or footnote), e.g. "--- par:abc123 [body] ---". '
680+
+ 'Use this to identify which lines belong to the same paragraph. Default: false.',
681+
},
682+
footnoteAnnots: {
683+
type: 'boolean',
684+
description: 'Include footnote cross-reference annotations. Words that reference a footnote are annotated with '
685+
+ '[footnote "word" → par:ID], and footnote paragraphs show which line/word they are linked from. '
686+
+ 'Best used together with parAnnots. Default: false.',
587687
},
588688
},
589689
required: [],

0 commit comments

Comments
 (0)