Skip to content

Commit 7deac6d

Browse files
committed
update tests
1 parent 60ac656 commit 7deac6d

6 files changed

Lines changed: 601 additions & 1096 deletions

File tree

pytest.ini

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,4 @@ markers =
1919
filterwarnings =
2020
ignore::DeprecationWarning
2121
ignore::PendingDeprecationWarning
22+
ignore::UserWarning:asyncio

resources/test_out/test_1/0f318087-b211-4ace-8ac9-3d9372a73c1c.tei.xml

Lines changed: 0 additions & 1094 deletions
This file was deleted.

resources/test_out/0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml renamed to tests/resources/0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml

File renamed without changes.

tests/resources/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
import os
2+
3+
TEST_DATA_PATH = os.path.dirname(__file__)

tests/test_conversions.py

Lines changed: 362 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,362 @@
1+
"""
2+
Unit tests for TEI to JSON and TEI to Markdown conversion functionality.
3+
"""
4+
import pytest
5+
from unittest.mock import Mock, patch, MagicMock, mock_open
6+
import json
7+
import os
8+
import tempfile
9+
from grobid_client.grobid_client import GrobidClient
10+
from tests.resources import TEST_DATA_PATH
11+
12+
13+
class TestTEIConversions:
14+
"""Test cases for TEI to JSON and Markdown conversions."""
15+
16+
def setup_method(self):
    """Prepare the fixtures shared by the tests in this class.

    Stores a client configuration dictionary on ``self.test_config`` and a
    small TEI document (one title, one publisher, one dated publication
    statement, one body division with a bibliographic reference) on
    ``self.sample_tei_content``.
    """
    logging_settings = {
        'level': 'WARNING',
        'format': '%(asctime)s - %(levelname)s - %(message)s',
        'console': True,
        'file': None,
    }
    self.test_config = {
        'grobid_server': 'http://localhost:8070',
        'batch_size': 10,
        'sleep_time': 5,
        'timeout': 180,
        'logging': logging_settings,
    }

    self.sample_tei_content = """<?xml version="1.0" encoding="UTF-8"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
<teiHeader>
<fileDesc>
<titleStmt>
<title>Sample Document Title</title>
</titleStmt>
<publicationStmt>
<publisher>Sample Publisher</publisher>
<date when="2023-01-01">2023-01-01</date>
</publicationStmt>
</fileDesc>
</teiHeader>
<text>
<body>
<div>
<head>Introduction</head>
<p>This is a sample paragraph with a citation <ref type="bibr" target="#b1">[1]</ref>.</p>
</div>
</body>
</text>
</TEI>"""
53+
54+
@patch('grobid_client.grobid_client.GrobidClient._test_server_connection')
@patch('grobid_client.grobid_client.GrobidClient._configure_logging')
def test_json_conversion_with_existing_tei_file(self, mock_configure_logging, mock_test_server):
    """Convert the sample TEI document to JSON and inspect the result.

    The sample TEI is written to a temporary ``.tei.xml`` file, converted
    with ``TEI2LossyJSONConverter`` and the temporary file is removed
    afterwards whether or not the assertions pass. The server connection
    check and logging configuration of ``GrobidClient`` are patched out.
    """
    mock_test_server.return_value = (True, 200)

    grobid = GrobidClient(check_server=False)
    grobid.logger = Mock()

    # delete=False: the file has to outlive the handle so that the
    # converter can open it again by path.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.tei.xml', delete=False) as sample_file:
        sample_file.write(self.sample_tei_content)
        tei_path = sample_file.name

    try:
        from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter
        json_data = TEI2LossyJSONConverter().convert_tei_file(tei_path, stream=False)

        # Basic shape of the result
        assert json_data is not None, "JSON conversion should not return None"
        assert isinstance(json_data, dict), "JSON conversion should return a dictionary"

        # The title is only checked when a 'biblio' section was produced
        if 'biblio' in json_data:
            assert 'title' in json_data['biblio'], "Converted JSON should have title in biblio"

        # A non-empty title must carry the title text of the sample TEI
        extracted_title = json_data.get('biblio', {}).get('title')
        if extracted_title:
            assert 'Sample Document Title' in extracted_title
    finally:
        # Remove the temporary TEI file
        os.unlink(tei_path)
89+
90+
@patch('grobid_client.grobid_client.GrobidClient._test_server_connection')
@patch('grobid_client.grobid_client.GrobidClient._configure_logging')
def test_json_conversion_with_empty_tei(self, mock_configure_logging, mock_test_server):
    """Convert a TEI document that has only an empty root element to JSON.

    The conversion is expected to return a dictionary rather than ``None``.
    The temporary file used as input is always removed at the end.
    """
    mock_test_server.return_value = (True, 200)

    grobid = GrobidClient(check_server=False)
    grobid.logger = Mock()

    # A TEI root element with no header and no text
    empty_tei = """<?xml version="1.0" encoding="UTF-8"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
</TEI>"""

    # delete=False: the converter reopens the file by path after the
    # handle has been closed.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.tei.xml', delete=False) as empty_file:
        empty_file.write(empty_tei)
        tei_path = empty_file.name

    try:
        from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter
        json_data = TEI2LossyJSONConverter().convert_tei_file(tei_path, stream=False)

        assert json_data is not None, "Even empty TEI should produce some JSON structure"
        assert isinstance(json_data, dict), "Result should still be a dictionary"
    finally:
        # Remove the temporary TEI file
        os.unlink(tei_path)
122+
123+
def test_json_conversion_with_nonexistent_file(self):
    """Check how the JSON converter reacts to a TEI path that does not exist.

    Two outcomes are accepted: the converter raises an exception, or it
    returns ``None``. Any other return value fails the test.
    """
    from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter
    converter = TEI2LossyJSONConverter()

    # Only the conversion call belongs inside the ``try``. AssertionError
    # derives from Exception, so an assertion placed inside it would be
    # swallowed by the handler and the test could never fail.
    try:
        json_data = converter.convert_tei_file('/nonexistent/file.xml', stream=False)
    except Exception:
        # Raising is an acceptable way to report a missing file. The
        # concrete exception type is not visible from this test module,
        # so the handler stays broad on purpose.
        return

    assert json_data is None, "Nonexistent file should return None"
138+
139+
@patch('grobid_client.grobid_client.GrobidClient._test_server_connection')
@patch('grobid_client.grobid_client.GrobidClient._configure_logging')
def test_markdown_conversion_with_existing_tei_file(self, mock_configure_logging, mock_test_server):
    """Convert the sample TEI document to Markdown and inspect the result.

    The sample TEI is written to a temporary ``.tei.xml`` file, converted
    with ``TEI2MarkdownConverter`` and the temporary file is removed
    afterwards whether or not the assertions pass.
    """
    mock_test_server.return_value = (True, 200)

    grobid = GrobidClient(check_server=False)
    grobid.logger = Mock()

    # delete=False: the converter reopens the file by path after the
    # handle has been closed.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.tei.xml', delete=False) as sample_file:
        sample_file.write(self.sample_tei_content)
        tei_path = sample_file.name

    try:
        from grobid_client.format.TEI2Markdown import TEI2MarkdownConverter
        markdown_data = TEI2MarkdownConverter().convert_tei_file(tei_path)

        # Basic shape of the result
        assert markdown_data is not None, "Markdown conversion should not return None"
        assert isinstance(markdown_data, str), "Markdown conversion should return a string"
        assert len(markdown_data) > 0, "Markdown conversion should produce non-empty content"

        # Either a header marker or the title text is enough here
        title_markers = ('#', 'Sample Document Title')
        assert any(marker in markdown_data for marker in title_markers), "Markdown should contain title"
    finally:
        # Remove the temporary TEI file
        os.unlink(tei_path)
170+
171+
def test_markdown_conversion_with_empty_tei(self):
    """Convert a TEI document that has only an empty root element to Markdown.

    The conversion is expected to return a string rather than ``None``.
    The temporary file used as input is always removed at the end.
    """
    # A TEI root element with no header and no text
    empty_tei = """<?xml version="1.0" encoding="UTF-8"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
</TEI>"""

    # delete=False: the converter reopens the file by path after the
    # handle has been closed.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.tei.xml', delete=False) as empty_file:
        empty_file.write(empty_tei)
        tei_path = empty_file.name

    try:
        from grobid_client.format.TEI2Markdown import TEI2MarkdownConverter
        markdown_data = TEI2MarkdownConverter().convert_tei_file(tei_path)

        assert markdown_data is not None, "Even empty TEI should produce some markdown content"
        assert isinstance(markdown_data, str), "Result should be a string"
    finally:
        # Remove the temporary TEI file
        os.unlink(tei_path)
197+
198+
@patch('grobid_client.grobid_client.GrobidClient._test_server_connection')
@patch('grobid_client.grobid_client.GrobidClient._configure_logging')
def test_both_conversions_same_tei_file(self, mock_configure_logging, mock_test_server):
    """Run the JSON and the Markdown converter on one and the same TEI file.

    Both results are checked for type, the Markdown for non-emptiness, and,
    when the JSON carries a title, that title (or the sample title text)
    has to show up in the Markdown as well.
    """
    mock_test_server.return_value = (True, 200)

    grobid = GrobidClient(check_server=False)
    grobid.logger = Mock()

    # delete=False: both converters reopen the file by path after the
    # handle has been closed.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.tei.xml', delete=False) as sample_file:
        sample_file.write(self.sample_tei_content)
        tei_path = sample_file.name

    try:
        # JSON first ...
        from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter
        json_converter = TEI2LossyJSONConverter()
        json_data = json_converter.convert_tei_file(tei_path, stream=False)

        # ... then Markdown from the same path
        from grobid_client.format.TEI2Markdown import TEI2MarkdownConverter
        md_converter = TEI2MarkdownConverter()
        markdown_data = md_converter.convert_tei_file(tei_path)

        assert json_data is not None, "JSON conversion should not return None"
        assert isinstance(json_data, dict), "JSON conversion should return a dictionary"

        assert markdown_data is not None, "Markdown conversion should not return None"
        assert isinstance(markdown_data, str), "Markdown conversion should return a string"
        assert len(markdown_data) > 0, "Markdown should have content"

        # Cross-check the title only when the JSON side extracted one
        if 'biblio' in json_data:
            biblio = json_data['biblio']
            if 'title' in biblio:
                title = biblio['title']
                assert title in markdown_data or 'Sample Document Title' in markdown_data, "Title should appear in markdown"
    finally:
        # Remove the temporary TEI file
        os.unlink(tei_path)
240+
241+
def test_process_batch_with_json_output(self):
    """Convert the bundled TEI resource to JSON and check title and authors.

    The converter is invoked directly on the TEI file stored under
    ``TEST_DATA_PATH``; the test also checks that swapping the ``.tei.xml``
    suffix for ``.json`` yields a name ending in ``.json``.
    """
    # Real TEI document shipped with the test resources
    tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')
    assert os.path.exists(tei_file), f"Test TEI file should exist at {tei_file}"

    from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter
    json_data = TEI2LossyJSONConverter().convert_tei_file(tei_file, stream=False)

    assert json_data is not None, "JSON conversion should succeed"
    assert isinstance(json_data, dict), "Should return dictionary"

    # Bibliographic checks apply only when a 'biblio' section was produced
    if 'biblio' in json_data:
        biblio = json_data['biblio']
        assert 'title' in biblio, "Should extract title"
        assert 'Multi-contact functional electrical stimulation' in biblio['title']

        if 'authors' in biblio:
            assert len(biblio['authors']) > 0, "Should extract authors"

    # Output name derived from the TEI name by replacing the suffix
    json_filename = tei_file.replace('.tei.xml', '.json')
    assert json_filename.endswith('.json'), "Should generate .json filename"
271+
272+
def test_real_tei_json_conversion_integration(self):
    """Convert the bundled TEI resource to JSON and inspect it in depth.

    Checks, each only when the corresponding key is present: the title,
    the first author's name, the publication date, that the body text has
    at least one paragraph with text, and that the first few in-text
    references carry a type, a text and a consistent offset pair.
    """
    # Real TEI document shipped with the test resources
    tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')
    assert os.path.exists(tei_file), f"Test TEI file should exist at {tei_file}"

    from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter
    json_data = TEI2LossyJSONConverter().convert_tei_file(tei_file, stream=False)

    assert json_data is not None, "Conversion should not return None"
    assert isinstance(json_data, dict), "Result should be a dictionary"

    # --- bibliographic section ---
    if 'biblio' in json_data:
        biblio = json_data['biblio']

        if 'title' in biblio:
            assert 'Multi-contact functional electrical stimulation' in biblio['title']

        if 'authors' in biblio and len(biblio['authors']) > 0:
            assert isinstance(biblio['authors'], list)
            # Either part of the first author's name is accepted
            first_author = biblio['authors'][0]
            if 'name' in first_author:
                assert 'De Marchis' in first_author['name'] or 'Cristiano' in first_author['name']

        if 'publication_date' in biblio:
            assert biblio['publication_date'] == '2016-03-08'

    # --- body text section ---
    # Nothing further to check when no body text was extracted
    if 'body_text' not in json_data or len(json_data['body_text']) <= 0:
        return

    paragraphs = [entry for entry in json_data['body_text'] if entry.get('text')]
    assert len(paragraphs) > 0, "Should extract at least one paragraph"

    # Gather the references of every paragraph that has a non-empty list
    refs_found = [
        ref
        for paragraph in paragraphs
        if 'refs' in paragraph and paragraph['refs']
        for ref in paragraph['refs']
    ]

    if refs_found:
        ref_types = {ref.get('type') for ref in refs_found}
        assert len(ref_types) > 0, "Should find some reference types"

        # Structural check on the first three references only
        for ref in refs_found[:3]:
            assert 'type' in ref, "Reference should have type"
            assert 'text' in ref, "Reference should have text"
            assert 'offset_start' in ref, "Reference should have offset_start"
            assert 'offset_end' in ref, "Reference should have offset_end"
            assert ref['offset_start'] < ref['offset_end'], "offset_start should be less than offset_end"
337+
338+
def test_markdown_conversion_with_real_tei_file(self):
    """Convert the bundled TEI resource to Markdown and check its content.

    The Markdown has to be a non-empty string that contains a ``#`` header
    marker, the paper title and at least one part of the first author's
    name.
    """
    # Real TEI document shipped with the test resources
    tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')
    assert os.path.exists(tei_file), f"Test TEI file should exist at {tei_file}"

    from grobid_client.format.TEI2Markdown import TEI2MarkdownConverter
    markdown_data = TEI2MarkdownConverter().convert_tei_file(tei_file)

    # Basic shape of the result
    assert markdown_data is not None, "Markdown conversion should not return None"
    assert isinstance(markdown_data, str), "Markdown conversion should return a string"
    assert len(markdown_data) > 0, "Markdown conversion should produce non-empty content"

    # Content taken from the real TEI document
    assert '#' in markdown_data, "Markdown should contain headers"
    assert 'Multi-contact functional electrical stimulation' in markdown_data, "Markdown should contain the paper title"

    # Either part of the author's name is accepted
    author_markers = ('De Marchis', 'Cristiano')
    assert any(marker in markdown_data for marker in author_markers), "Markdown should contain author information"

0 commit comments

Comments
 (0)