Skip to content

Commit 2f98e40

Browse files
committed
fix reference offsets, fix missing start/end offsets, show files not overridden without --verbose
1 parent 7deac6d commit 2f98e40

13 files changed

Lines changed: 22405 additions & 55 deletions

grobid_client/format/TEI2LossyJSON.py

Lines changed: 68 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -139,14 +139,7 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO], stream: bool = False
139139
box_to_dict(coord.split(","))
140140
for coord in sentence['coords'].split(";")
141141
] if sentence.has_attr("coords") else [],
142-
"refs": [
143-
{
144-
"type": ref["type"],
145-
"target": ref["target"] if "target" in ref.attrs else "",
146-
"text": ref.text
147-
}
148-
for ref in sentence.find_all("ref", type="bibr")
149-
]
142+
"refs": get_refs_with_offsets(sentence)
150143
}
151144
for id, sentence in enumerate(paragraph.find_all("s"))
152145
]
@@ -161,14 +154,7 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO], stream: bool = False
161154
box_to_dict(coord.split(","))
162155
for coord in paragraph['coords'].split(";")
163156
] if paragraph.has_attr("coords") else [],
164-
"refs": [
165-
{
166-
"type": ref["type"],
167-
"target": ref["target"] if "target" in ref.attrs else "",
168-
"text": ref.text
169-
}
170-
for ref in paragraph.find_all("ref", type="bibr")
171-
]
157+
"refs": get_refs_with_offsets(paragraph)
172158
}
173159
for id, paragraph in enumerate(abstract_paragraph_nodes)
174160
]
@@ -826,51 +812,89 @@ def get_random_id(prefix=""):
826812
def get_refs_with_offsets(element):
827813
"""Extract references with their text offsets from an element."""
828814
refs = []
829-
text = element.get_text()
830815

831-
# Use BeautifulSoup's position tracking by traversing the DOM tree
832-
# and keeping track of character positions as we encounter text
816+
# Apply the same text cleaning as get_formatted_passage
817+
def _clean_text(text: str) -> str:
818+
if not text:
819+
return ""
820+
import re
821+
import html
822+
text = re.sub(r'\s+', ' ', text.strip())
823+
text = html.unescape(text)
824+
return text
833825

834-
def find_ref_positions(node, start_pos=0):
826+
# Now extract references with offsets based on the cleaned text
827+
def traverse_and_collect(node, current_pos=0):
835828
"""
836-
Recursively traverse the DOM tree to find reference positions.
837-
Returns the next character position after processing this node.
829+
Recursively traverse the DOM tree, building cleaned text content and tracking exact positions.
830+
Returns tuple: (text_content, next_position)
838831
"""
839832
if hasattr(node, 'name') and node.name:
840833
# This is an element node
841834
if node.name == "ref" and node.get("type") == "bibr":
842-
# Found a reference - get its text and calculate position
843-
ref_text = node.get_text()
844-
# Find this reference text starting from current position
845-
actual_start = text.find(ref_text, start_pos)
846-
if actual_start != -1:
847-
actual_end = actual_start + len(ref_text)
835+
# Found a reference - get its cleaned text and record its exact position
836+
ref_text = _clean_text(node.get_text())
837+
if ref_text: # Only record non-empty references
848838
refs.append({
849839
"type": node.get("type", ""),
850840
"target": node.get("target", ""),
851841
"text": ref_text,
852-
"offset_start": actual_start,
853-
"offset_end": actual_end
842+
"offset_start": current_pos,
843+
"offset_end": current_pos + len(ref_text)
854844
})
855-
return actual_end
856-
else:
857-
logger.warning(f"Could not find reference text '{ref_text}' at position {start_pos}")
858-
return start_pos
845+
# Return the cleaned reference text and advance position
846+
return ref_text, current_pos + len(ref_text)
859847
else:
860-
# Process children in document order
861-
current_pos = start_pos
848+
# Process children in document order and accumulate their cleaned text
849+
text_parts = []
850+
pos = current_pos
862851
for child in node.children:
863-
current_pos = find_ref_positions(child, current_pos)
864-
return current_pos
852+
child_text, new_pos = traverse_and_collect(child, pos)
853+
if child_text is not None:
854+
text_parts.append(child_text)
855+
pos = new_pos
856+
return "".join(text_parts), pos
865857
else:
866-
# This is a text node (NavigableString) - advance position by text length
858+
# This is a text node (NavigableString) - be more careful with cleaning
867859
text_content = str(node)
868-
return start_pos + len(text_content)
869-
870-
# Start traversal from the beginning
871-
find_ref_positions(element, 0)
872860

873-
return refs
861+
# For text nodes, we need to be more careful about whitespace
862+
# Only apply the full cleaning at the end for the complete text
863+
return text_content, current_pos + len(text_content)
864+
865+
# Build raw text with accurate positions first
866+
raw_text, _ = traverse_and_collect(element, 0)
867+
868+
# Now apply the same cleaning as get_formatted_passage to the complete text
869+
final_text = _clean_text(raw_text)
870+
871+
# Adjust all reference offsets to match the cleaned text
872+
final_refs = []
873+
for ref in refs:
874+
# Find the reference text in the cleaned text to get correct offsets
875+
ref_text = ref['text']
876+
877+
# The reference text was also cleaned, so we need to find it in the final cleaned text
878+
# We can search around the original position to find the correct occurrence
879+
search_start = max(0, ref['offset_start'] - 10) # Look a bit before the original position
880+
search_end = min(len(final_text), ref['offset_start'] + 10) # Look a bit after
881+
search_area = final_text[search_start:search_end]
882+
883+
# Find the reference in the search area
884+
relative_pos = search_area.find(ref_text)
885+
if relative_pos != -1:
886+
final_start = search_start + relative_pos
887+
final_end = final_start + len(ref_text)
888+
889+
final_refs.append({
890+
"type": ref["type"],
891+
"target": ref["target"],
892+
"text": ref_text,
893+
"offset_start": final_start,
894+
"offset_end": final_end
895+
})
896+
897+
return final_refs
874898

875899

876900
def get_formatted_passage(head_paragraph, head_section, paragraph_id, element):

grobid_client/grobid_client.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -362,11 +362,12 @@ def process(
362362
self.logger.warning(f"No eligible files found in {input_path}")
363363
return
364364

365-
print(f"Found {total_files} file(s) to process")
366-
367-
# Counter for actually processed files
365+
# Counters for processing statistics (initialize before early return)
368366
processed_files_count = 0
369367
errors_files_count = 0
368+
skipped_files_count = 0
369+
370+
print(f"Found {total_files} file(s) to process")
370371
input_files = []
371372

372373
for input_file in all_input_files:
@@ -383,7 +384,7 @@ def process(
383384
input_files.append(input_file)
384385

385386
if len(input_files) == batch_size_pdf:
386-
batch_processed, batch_errors = self.process_batch(
387+
batch_processed, batch_errors, batch_skipped = self.process_batch(
387388
service,
388389
input_files,
389390
input_path,
@@ -404,11 +405,12 @@ def process(
404405
)
405406
processed_files_count += batch_processed
406407
errors_files_count += batch_errors
408+
skipped_files_count += batch_skipped
407409
input_files = []
408410

409411
# last batch
410412
if len(input_files) > 0:
411-
batch_processed, batch_errors = self.process_batch(
413+
batch_processed, batch_errors, batch_skipped = self.process_batch(
412414
service,
413415
input_files,
414416
input_path,
@@ -429,10 +431,13 @@ def process(
429431
)
430432
processed_files_count += batch_processed
431433
errors_files_count += batch_errors
434+
skipped_files_count += batch_skipped
432435

433436
# Log final statistics - always visible
434437
print(f"Processing completed: {processed_files_count} out of {total_files} files processed")
435438
print(f"Errors: {errors_files_count} out of {total_files} files processed")
439+
if skipped_files_count > 0:
440+
print(f"Skipped: {skipped_files_count} out of {total_files} files (already existed, use --force to reprocess)")
436441

437442
def process_batch(
438443
self,
@@ -459,6 +464,7 @@ def process_batch(
459464

460465
processed_count = 0
461466
error_count = 0
467+
skipped_count = 0
462468

463469
# we use ThreadPoolExecutor and not ProcessPoolExecutor because it is an I/O intensive process
464470
with concurrent.futures.ThreadPoolExecutor(max_workers=n) as executor:
@@ -470,6 +476,7 @@ def process_batch(
470476
if not force and os.path.isfile(filename):
471477
self.logger.info(
472478
f"{filename} already exists, skipping... (use --force to reprocess pdf input files)")
479+
skipped_count += 1
473480

474481
# Check if JSON output is needed but JSON file doesn't exist
475482
if json_output:
@@ -606,7 +613,7 @@ def process_batch(
606613
except OSError as e:
607614
self.logger.error(f"Failed to write TEI XML file {filename}: {str(e)}")
608615

609-
return processed_count, error_count
616+
return processed_count, error_count, skipped_count
610617

611618
def process_pdf(
612619
self,

0 commit comments

Comments
 (0)