1+ """
2+ Unit tests for TEI to JSON and TEI to Markdown conversion functionality.
3+ """
4+ import pytest
5+ from unittest .mock import Mock , patch , MagicMock , mock_open
6+ import json
7+ import os
8+ import tempfile
9+ from grobid_client .grobid_client import GrobidClient
10+ from tests .resources import TEST_DATA_PATH
11+
12+
13+ class TestTEIConversions :
14+ """Test cases for TEI to JSON and Markdown conversions."""
15+
def setup_method(self):
    """Prepare the fixtures shared by the conversion tests.

    Stores a small TEI document (a header with title, publisher and date,
    and a body with one section holding a paragraph and a bibliographic
    reference) in ``self.sample_tei_content``, and a client configuration
    dictionary in ``self.test_config``.
    """
    # The document is assembled line by line so the XML layout stays
    # readable without depending on the indentation of this method.
    tei_lines = (
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<TEI xmlns="http://www.tei-c.org/ns/1.0">',
        '<teiHeader>',
        '<fileDesc>',
        '<titleStmt>',
        '<title>Sample Document Title</title>',
        '</titleStmt>',
        '<publicationStmt>',
        '<publisher>Sample Publisher</publisher>',
        '<date when="2023-01-01">2023-01-01</date>',
        '</publicationStmt>',
        '</fileDesc>',
        '</teiHeader>',
        '<text>',
        '<body>',
        '<div>',
        '<head>Introduction</head>',
        '<p>This is a sample paragraph with a citation '
        '<ref type="bibr" target="#b1">[1]</ref>.</p>',
        '</div>',
        '</body>',
        '</text>',
        '</TEI>',
    )
    self.sample_tei_content = "\n".join(tei_lines)

    # Logging options are kept in their own mapping and nested below.
    logging_options = {
        'level': 'WARNING',
        'format': '%(asctime)s - %(levelname)s - %(message)s',
        'console': True,
        'file': None,
    }
    self.test_config = {
        'grobid_server': 'http://localhost:8070',
        'batch_size': 10,
        'sleep_time': 5,
        'timeout': 180,
        'logging': logging_options,
    }
53+
@patch('grobid_client.grobid_client.GrobidClient._test_server_connection')
@patch('grobid_client.grobid_client.GrobidClient._configure_logging')
def test_json_conversion_with_existing_tei_file(self, mock_configure_logging, mock_test_server):
    """Convert the sample TEI document to JSON and check the result.

    The sample TEI is written to a temporary ``.tei.xml`` file, converted
    with ``TEI2LossyJSONConverter`` and the result is checked to be a
    dictionary. Title checks only run when the converter emitted a
    ``biblio`` entry. The temporary file is always removed afterwards.
    """
    mock_test_server.return_value = (True, 200)

    # Build a client with the server check and logging setup patched out.
    client = GrobidClient(check_server=False)
    client.logger = Mock()

    # delete=False keeps the file on disk after the handle is closed so the
    # converter can open it by path; it is removed in the finally clause.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.tei.xml', delete=False) as handle:
        handle.write(self.sample_tei_content)
        tei_path = handle.name

    try:
        from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter

        json_data = TEI2LossyJSONConverter().convert_tei_file(tei_path, stream=False)

        # Basic shape of the conversion result.
        assert json_data is not None, "JSON conversion should not return None"
        assert isinstance(json_data, dict), "JSON conversion should return a dictionary"

        # Structural check, only when bibliographic data is present.
        if 'biblio' in json_data:
            assert 'title' in json_data['biblio'], "Converted JSON should have title in biblio"

        # Content check, only when a non-empty title was extracted.
        extracted_title = json_data.get('biblio', {}).get('title')
        if extracted_title:
            assert 'Sample Document Title' in extracted_title
    finally:
        os.unlink(tei_path)
89+
@patch('grobid_client.grobid_client.GrobidClient._test_server_connection')
@patch('grobid_client.grobid_client.GrobidClient._configure_logging')
def test_json_conversion_with_empty_tei(self, mock_configure_logging, mock_test_server):
    """Convert a TEI document that has a root element but no content.

    The converter is expected to still hand back a dictionary for a TEI
    file consisting only of the XML declaration and an empty ``<TEI>``
    element. The temporary file is always removed afterwards.
    """
    mock_test_server.return_value = (True, 200)

    # Build a client with the server check and logging setup patched out.
    client = GrobidClient(check_server=False)
    client.logger = Mock()

    # A TEI root element with nothing inside it.
    empty_tei = "\n".join((
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<TEI xmlns="http://www.tei-c.org/ns/1.0">',
        '</TEI>',
    ))

    # delete=False keeps the file on disk after the handle is closed so the
    # converter can open it by path; it is removed in the finally clause.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.tei.xml', delete=False) as handle:
        handle.write(empty_tei)
        tei_path = handle.name

    try:
        from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter

        json_data = TEI2LossyJSONConverter().convert_tei_file(tei_path, stream=False)

        assert json_data is not None, "Even empty TEI should produce some JSON structure"
        assert isinstance(json_data, dict), "Result should still be a dictionary"
    finally:
        os.unlink(tei_path)
122+
def test_json_conversion_with_nonexistent_file(self):
    """Check how the JSON converter reacts to a path that does not exist.

    Two outcomes are accepted: the converter raises an exception, or it
    returns ``None``. Any other return value fails the test.
    """
    from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter

    converter = TEI2LossyJSONConverter()

    # Only the converter call is guarded. The assertion must live outside
    # the try block: AssertionError is a subclass of Exception, so a broad
    # handler wrapped around the assertion would swallow its failure and
    # the test could never fail.
    try:
        json_data = converter.convert_tei_file('/nonexistent/file.xml', stream=False)
    except Exception:
        # Raising is an acceptable way to report a missing file.
        # NOTE(review): the concrete exception type raised by the converter
        # is not visible here, so the handler is intentionally kept broad.
        return

    assert json_data is None, "Nonexistent file should return None"
138+
@patch('grobid_client.grobid_client.GrobidClient._test_server_connection')
@patch('grobid_client.grobid_client.GrobidClient._configure_logging')
def test_markdown_conversion_with_existing_tei_file(self, mock_configure_logging, mock_test_server):
    """Convert the sample TEI document to Markdown and check the result.

    The sample TEI is written to a temporary ``.tei.xml`` file, converted
    with ``TEI2MarkdownConverter`` and the output is checked to be a
    non-empty string containing either a ``#`` character or the sample
    title. The temporary file is always removed afterwards.
    """
    mock_test_server.return_value = (True, 200)

    # Build a client with the server check and logging setup patched out.
    client = GrobidClient(check_server=False)
    client.logger = Mock()

    # delete=False keeps the file on disk after the handle is closed so the
    # converter can open it by path; it is removed in the finally clause.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.tei.xml', delete=False) as handle:
        handle.write(self.sample_tei_content)
        tei_path = handle.name

    try:
        from grobid_client.format.TEI2Markdown import TEI2MarkdownConverter

        markdown_data = TEI2MarkdownConverter().convert_tei_file(tei_path)

        # Basic shape of the conversion result.
        assert markdown_data is not None, "Markdown conversion should not return None"
        assert isinstance(markdown_data, str), "Markdown conversion should return a string"
        assert len(markdown_data) > 0, "Markdown conversion should produce non-empty content"

        # Either a '#' character or the literal title must be present.
        has_title_marker = '#' in markdown_data or 'Sample Document Title' in markdown_data
        assert has_title_marker, "Markdown should contain title"
    finally:
        os.unlink(tei_path)
170+
def test_markdown_conversion_with_empty_tei(self):
    """Convert a TEI document that has a root element but no content to Markdown.

    The converter is expected to still return a string for a TEI file
    consisting only of the XML declaration and an empty ``<TEI>`` element.
    The temporary file is always removed afterwards.
    """
    # A TEI root element with nothing inside it.
    empty_tei = "\n".join((
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<TEI xmlns="http://www.tei-c.org/ns/1.0">',
        '</TEI>',
    ))

    # delete=False keeps the file on disk after the handle is closed so the
    # converter can open it by path; it is removed in the finally clause.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.tei.xml', delete=False) as handle:
        handle.write(empty_tei)
        tei_path = handle.name

    try:
        from grobid_client.format.TEI2Markdown import TEI2MarkdownConverter

        markdown_data = TEI2MarkdownConverter().convert_tei_file(tei_path)

        assert markdown_data is not None, "Even empty TEI should produce some markdown content"
        assert isinstance(markdown_data, str), "Result should be a string"
    finally:
        os.unlink(tei_path)
197+
@patch('grobid_client.grobid_client.GrobidClient._test_server_connection')
@patch('grobid_client.grobid_client.GrobidClient._configure_logging')
def test_both_conversions_same_tei_file(self, mock_configure_logging, mock_test_server):
    """Run the JSON and the Markdown converter on the same TEI file.

    Both results are checked for their basic type, and when the JSON
    output carries a title, the Markdown output must contain that title
    (or the literal sample title). The temporary file is always removed
    afterwards.
    """
    mock_test_server.return_value = (True, 200)

    # Build a client with the server check and logging setup patched out.
    client = GrobidClient(check_server=False)
    client.logger = Mock()

    # delete=False keeps the file on disk after the handle is closed so both
    # converters can open it by path; it is removed in the finally clause.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.tei.xml', delete=False) as handle:
        handle.write(self.sample_tei_content)
        tei_path = handle.name

    try:
        # JSON first ...
        from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter
        json_data = TEI2LossyJSONConverter().convert_tei_file(tei_path, stream=False)

        # ... then Markdown, from the very same file.
        from grobid_client.format.TEI2Markdown import TEI2MarkdownConverter
        markdown_data = TEI2MarkdownConverter().convert_tei_file(tei_path)

        assert json_data is not None, "JSON conversion should not return None"
        assert isinstance(json_data, dict), "JSON conversion should return a dictionary"

        assert markdown_data is not None, "Markdown conversion should not return None"
        assert isinstance(markdown_data, str), "Markdown conversion should return a string"
        assert len(markdown_data) > 0, "Markdown should have content"

        # Cross-check: a title found by the JSON converter should also show
        # up in the Markdown rendering of the same document.
        if 'biblio' in json_data and 'title' in json_data['biblio']:
            title = json_data['biblio']['title']
            title_present = title in markdown_data or 'Sample Document Title' in markdown_data
            assert title_present, "Title should appear in markdown"
    finally:
        os.unlink(tei_path)
240+
def test_process_batch_with_json_output(self):
    """Convert the bundled TEI resource to JSON and check title, authors and file name.

    The test calls ``TEI2LossyJSONConverter`` directly on the TEI file
    shipped under ``TEST_DATA_PATH`` and then checks the ``.tei.xml`` to
    ``.json`` file-name substitution.
    """
    from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter

    tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')

    # Fail early with a clear message if the resource is missing.
    assert os.path.exists(tei_file), f"Test TEI file should exist at {tei_file}"

    json_data = TEI2LossyJSONConverter().convert_tei_file(tei_file, stream=False)

    assert json_data is not None, "JSON conversion should succeed"
    assert isinstance(json_data, dict), "Should return dictionary"

    # Bibliographic checks only apply when the converter produced a biblio entry.
    if 'biblio' in json_data:
        biblio = json_data['biblio']
        assert 'title' in biblio, "Should extract title"
        assert 'Multi-contact functional electrical stimulation' in biblio['title']

        if 'authors' in biblio:
            assert len(biblio['authors']) > 0, "Should extract authors"

    # Output file name derived from the TEI file name.
    json_filename = tei_file.replace('.tei.xml', '.json')
    assert json_filename.endswith('.json'), "Should generate .json filename"
271+
def test_real_tei_json_conversion_integration(self):
    """Convert the bundled TEI resource to JSON and inspect the output in depth.

    Checks are conditional on what the converter emitted: title, first
    author name and publication date under ``biblio``, and paragraph text
    plus the structure of inline references under ``body_text``.
    """
    from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter

    tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')

    # Fail early with a clear message if the resource is missing.
    assert os.path.exists(tei_file), f"Test TEI file should exist at {tei_file}"

    json_data = TEI2LossyJSONConverter().convert_tei_file(tei_file, stream=False)

    assert json_data is not None, "Conversion should not return None"
    assert isinstance(json_data, dict), "Result should be a dictionary"

    # --- Bibliographic metadata ------------------------------------------
    if 'biblio' in json_data:
        biblio = json_data['biblio']

        if 'title' in biblio:
            assert 'Multi-contact functional electrical stimulation' in biblio['title']

        if 'authors' in biblio and len(biblio['authors']) > 0:
            assert isinstance(biblio['authors'], list)
            lead_author = biblio['authors'][0]
            if 'name' in lead_author:
                author_name = lead_author['name']
                assert 'De Marchis' in author_name or 'Cristiano' in author_name

        if 'publication_date' in biblio:
            assert biblio['publication_date'] == '2016-03-08'

    # --- Body text and inline references ---------------------------------
    if 'body_text' in json_data and len(json_data['body_text']) > 0:
        # Keep only entries that actually carry text.
        paragraphs = [entry for entry in json_data['body_text'] if entry.get('text')]
        assert len(paragraphs) > 0, "Should extract at least one paragraph"

        # Flatten the references of every paragraph that has a non-empty list.
        refs_found = [
            ref
            for entry in paragraphs
            if 'refs' in entry and entry['refs']
            for ref in entry['refs']
        ]

        if refs_found:
            ref_types = {ref.get('type') for ref in refs_found}
            assert len(ref_types) > 0, "Should find some reference types"

            # Only the first few references are checked for their fields.
            for ref in refs_found[:3]:
                for field in ('type', 'text', 'offset_start', 'offset_end'):
                    assert field in ref, f"Reference should have {field}"
                assert ref['offset_start'] < ref['offset_end'], "offset_start should be less than offset_end"
337+
def test_markdown_conversion_with_real_tei_file(self):
    """Convert the bundled TEI resource to Markdown and check its content.

    The output must be a non-empty string containing a ``#`` character,
    the paper title and one of the expected author name fragments.
    """
    from grobid_client.format.TEI2Markdown import TEI2MarkdownConverter

    tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')

    # Fail early with a clear message if the resource is missing.
    assert os.path.exists(tei_file), f"Test TEI file should exist at {tei_file}"

    markdown_data = TEI2MarkdownConverter().convert_tei_file(tei_file)

    # Basic shape of the conversion result.
    assert markdown_data is not None, "Markdown conversion should not return None"
    assert isinstance(markdown_data, str), "Markdown conversion should return a string"
    assert len(markdown_data) > 0, "Markdown conversion should produce non-empty content"

    # Content expected from this particular document.
    assert '#' in markdown_data, "Markdown should contain headers"
    assert 'Multi-contact functional electrical stimulation' in markdown_data, "Markdown should contain the paper title"

    mentions_author = 'De Marchis' in markdown_data or 'Cristiano' in markdown_data
    assert mentions_author, "Markdown should contain author information"
0 commit comments