Skip to content

Commit 22b0785

Browse files
committed
Offset fixes and new tests
1 parent 8abe2fc commit 22b0785

13 files changed

Lines changed: 11653 additions & 54 deletions

Readme.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@ A simple, efficient Python client for [GROBID](https://github.com/kermitt2/grobi
4343

4444

4545
> [!IMPORTANT]
46-
> GROBID supports Windows only through Docker containers. See the [Docker documentation](https://grobid.readthedocs.io/en/latest/Grobid-docker/) for details.
46+
> GROBID supports Windows only through Docker containers. See
47+
> the [Docker documentation](https://grobid.readthedocs.io/en/latest/Grobid-docker/) for details.
4748
4849
## 🚀 Installation
4950

@@ -239,7 +240,8 @@ Configuration can be provided via a JSON file. When using the CLI, the `--server
239240
| `logging` | Logging configuration (level, format, file output) | See Logging section |
240241

241242
> [!TIP]
242-
> Since version 0.0.12, the config file is optional. The client will use default localhost settings if no configuration is provided.
243+
> Since version 0.0.12, the config file is optional. The client will use default localhost settings if no configuration
244+
> is provided.
243245
244246
### Logging Configuration
245247

grobid_client/format/TEI2LossyJSON.py

Lines changed: 68 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -139,14 +139,7 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO], stream: bool = False
139139
box_to_dict(coord.split(","))
140140
for coord in sentence['coords'].split(";")
141141
] if sentence.has_attr("coords") else [],
142-
"refs": [
143-
{
144-
"type": ref["type"],
145-
"target": ref["target"] if "target" in ref.attrs else "",
146-
"text": ref.text
147-
}
148-
for ref in sentence.find_all("ref", type="bibr")
149-
]
142+
"refs": get_refs_with_offsets(sentence)
150143
}
151144
for id, sentence in enumerate(paragraph.find_all("s"))
152145
]
@@ -161,14 +154,7 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO], stream: bool = False
161154
box_to_dict(coord.split(","))
162155
for coord in paragraph['coords'].split(";")
163156
] if paragraph.has_attr("coords") else [],
164-
"refs": [
165-
{
166-
"type": ref["type"],
167-
"target": ref["target"] if "target" in ref.attrs else "",
168-
"text": ref.text
169-
}
170-
for ref in paragraph.find_all("ref", type="bibr")
171-
]
157+
"refs": get_refs_with_offsets(paragraph)
172158
}
173159
for id, paragraph in enumerate(abstract_paragraph_nodes)
174160
]
@@ -826,51 +812,89 @@ def get_random_id(prefix=""):
826812
def get_refs_with_offsets(element):
827813
"""Extract references with their text offsets from an element."""
828814
refs = []
829-
text = element.get_text()
830815

831-
# Use BeautifulSoup's position tracking by traversing the DOM tree
832-
# and keeping track of character positions as we encounter text
816+
# Apply the same text cleaning as get_formatted_passage
817+
def _clean_text(text: str) -> str:
818+
if not text:
819+
return ""
820+
import re
821+
import html
822+
text = re.sub(r'\s+', ' ', text.strip())
823+
text = html.unescape(text)
824+
return text
833825

834-
def find_ref_positions(node, start_pos=0):
826+
# Now extract references with offsets based on the cleaned text
827+
def traverse_and_collect(node, current_pos=0):
835828
"""
836-
Recursively traverse the DOM tree to find reference positions.
837-
Returns the next character position after processing this node.
829+
Recursively traverse the DOM tree, building cleaned text content and tracking exact positions.
830+
Returns tuple: (text_content, next_position)
838831
"""
839832
if hasattr(node, 'name') and node.name:
840833
# This is an element node
841834
if node.name == "ref" and node.get("type") == "bibr":
842-
# Found a reference - get its text and calculate position
843-
ref_text = node.get_text()
844-
# Find this reference text starting from current position
845-
actual_start = text.find(ref_text, start_pos)
846-
if actual_start != -1:
847-
actual_end = actual_start + len(ref_text)
835+
# Found a reference - get its cleaned text and record its exact position
836+
ref_text = _clean_text(node.get_text())
837+
if ref_text: # Only record non-empty references
848838
refs.append({
849839
"type": node.get("type", ""),
850840
"target": node.get("target", ""),
851841
"text": ref_text,
852-
"offset_start": actual_start,
853-
"offset_end": actual_end
842+
"offset_start": current_pos,
843+
"offset_end": current_pos + len(ref_text)
854844
})
855-
return actual_end
856-
else:
857-
logger.warning(f"Could not find reference text '{ref_text}' at position {start_pos}")
858-
return start_pos
845+
# Return the cleaned reference text and advance position
846+
return ref_text, current_pos + len(ref_text)
859847
else:
860-
# Process children in document order
861-
current_pos = start_pos
848+
# Process children in document order and accumulate their cleaned text
849+
text_parts = []
850+
pos = current_pos
862851
for child in node.children:
863-
current_pos = find_ref_positions(child, current_pos)
864-
return current_pos
852+
child_text, new_pos = traverse_and_collect(child, pos)
853+
if child_text is not None:
854+
text_parts.append(child_text)
855+
pos = new_pos
856+
return "".join(text_parts), pos
865857
else:
866-
# This is a text node (NavigableString) - advance position by text length
858+
# This is a text node (NavigableString) - keep its raw text here; cleaning is applied later to the full string
867859
text_content = str(node)
868-
return start_pos + len(text_content)
869-
870-
# Start traversal from the beginning
871-
find_ref_positions(element, 0)
872860

873-
return refs
861+
# For text nodes, we need to be more careful about whitespace
862+
# Only apply the full cleaning at the end for the complete text
863+
return text_content, current_pos + len(text_content)
864+
865+
# Build raw text with accurate positions first
866+
raw_text, _ = traverse_and_collect(element, 0)
867+
868+
# Now apply the same cleaning as get_formatted_passage to the complete text
869+
final_text = _clean_text(raw_text)
870+
871+
# Adjust all reference offsets to match the cleaned text
872+
final_refs = []
873+
for ref in refs:
874+
# Find the reference text in the cleaned text to get correct offsets
875+
ref_text = ref['text']
876+
877+
# The reference text was also cleaned, so we need to find it in the final cleaned text
878+
# We can search around the original position to find the correct occurrence
879+
search_start = max(0, ref['offset_start'] - 10) # Look a bit before the original position
880+
search_end = min(len(final_text), ref['offset_start'] + 10) # Look a bit after
881+
search_area = final_text[search_start:search_end]
882+
883+
# Find the reference in the search area
884+
relative_pos = search_area.find(ref_text)
885+
if relative_pos != -1:
886+
final_start = search_start + relative_pos
887+
final_end = final_start + len(ref_text)
888+
889+
final_refs.append({
890+
"type": ref["type"],
891+
"target": ref["target"],
892+
"text": ref_text,
893+
"offset_start": final_start,
894+
"offset_end": final_end
895+
})
896+
897+
return final_refs
874898

875899

876900
def get_formatted_passage(head_paragraph, head_section, paragraph_id, element):

grobid_client/grobid_client.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -361,11 +361,12 @@ def process(
361361
self.logger.warning(f"No eligible files found in {input_path}")
362362
return
363363

364-
print(f"Found {total_files} file(s) to process")
365-
366-
# Counter for actually processed files
364+
# Counters for processing statistics (initialized before any batch is processed)
367365
processed_files_count = 0
368366
errors_files_count = 0
367+
skipped_files_count = 0
368+
369+
print(f"Found {total_files} file(s) to process")
369370
input_files = []
370371

371372
for input_file in all_input_files:
@@ -382,7 +383,7 @@ def process(
382383
input_files.append(input_file)
383384

384385
if len(input_files) == batch_size_pdf:
385-
batch_processed, batch_errors = self.process_batch(
386+
batch_processed, batch_errors, batch_skipped = self.process_batch(
386387
service,
387388
input_files,
388389
input_path,
@@ -406,7 +407,7 @@ def process(
406407

407408
# last batch
408409
if len(input_files) > 0:
409-
batch_processed, batch_errors = self.process_batch(
410+
batch_processed, batch_errors, batch_skipped = self.process_batch(
410411
service,
411412
input_files,
412413
input_path,
@@ -426,10 +427,13 @@ def process(
426427
)
427428
processed_files_count += batch_processed
428429
errors_files_count += batch_errors
430+
skipped_files_count += batch_skipped
429431

430432
# Log final statistics - always visible
431433
print(f"Processing completed: {processed_files_count} out of {total_files} files processed")
432434
print(f"Errors: {errors_files_count} out of {total_files} files processed")
435+
if skipped_files_count > 0:
436+
print(f"Skipped: {skipped_files_count} out of {total_files} files (already existed, use --force to reprocess)")
433437

434438
def process_batch(
435439
self,
@@ -455,6 +459,7 @@ def process_batch(
455459

456460
processed_count = 0
457461
error_count = 0
462+
skipped_count = 0
458463

459464
# we use ThreadPoolExecutor and not ProcessPoolExecutor because it is an I/O intensive process
460465
with concurrent.futures.ThreadPoolExecutor(max_workers=n) as executor:
@@ -466,6 +471,7 @@ def process_batch(
466471
if not force and os.path.isfile(filename):
467472
self.logger.info(
468473
f"{filename} already exists, skipping... (use --force to reprocess pdf input files)")
474+
skipped_count += 1
469475

470476
# Check if JSON output is needed but JSON file doesn't exist
471477
if json_output:
@@ -562,7 +568,7 @@ def process_batch(
562568
except OSError as e:
563569
self.logger.error(f"Failed to write TEI XML file {filename}: {str(e)}")
564570

565-
return processed_count, error_count
571+
return processed_count, error_count, skipped_count
566572

567573
def process_pdf(
568574
self,

pytest.ini

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,4 @@ markers =
1919
filterwarnings =
2020
ignore::DeprecationWarning
2121
ignore::PendingDeprecationWarning
22+
ignore::UserWarning:asyncio

resources/test_out/0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml renamed to tests/resources/0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml

File renamed without changes.

tests/resources/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
import os
2+
3+
TEST_DATA_PATH = os.path.dirname(__file__)

0 commit comments

Comments
 (0)