@@ -42,19 +42,22 @@ def test_read_data(self) -> None:
4242 """
4343 Test the _read_data method with a SMILES string to ensure it correctly tokenizes the string.
4444 """
45- raw_data = "CC(=O)NC1 [Mg-2]"
45+ raw_data = "CC(=O)NC1CC1 [Mg-2]"
4646 # Expected output as per the tokens already in the cache, with ")" and "[Mg-2]" getting added to it as new tokens.
4747 expected_output : List [int ] = [
4848 EMBEDDING_OFFSET + 0 , # C
4949 EMBEDDING_OFFSET + 0 , # C
50- EMBEDDING_OFFSET + 5 , # =
51- EMBEDDING_OFFSET + 3 , # O
52- EMBEDDING_OFFSET + 1 , # N
53- EMBEDDING_OFFSET + len (self .reader .cache ), # (
54- EMBEDDING_OFFSET + 2 , # C
50+ EMBEDDING_OFFSET + 5 , # (
51+ EMBEDDING_OFFSET + 3 , # =
52+ EMBEDDING_OFFSET + 1 , # O
53+ EMBEDDING_OFFSET + len (self .reader .cache ), # ) - new token
54+ EMBEDDING_OFFSET + 2 , # N
5555 EMBEDDING_OFFSET + 0 , # C
5656 EMBEDDING_OFFSET + 4 , # 1
57- EMBEDDING_OFFSET + len (self .reader .cache ) + 1 , # [Mg-2]
57+ EMBEDDING_OFFSET + 0 , # C
58+ EMBEDDING_OFFSET + 0 , # C
59+ EMBEDDING_OFFSET + 4 , # 1
60+ EMBEDDING_OFFSET + len (self .reader .cache ) + 1 , # [Mg-2] - new token
5861 ]
5962 result = self .reader ._read_data (raw_data )
6063 self .assertEqual (
@@ -99,13 +102,29 @@ def test_read_data_with_invalid_input(self) -> None:
99102 Test the _read_data method with an invalid input.
100103 Each invalid input should prompt a return value of None.
101104 """
102- raw_data = "%INVALID%"
103-
104- result = self .reader ._read_data (raw_data )
105- self .assertIsNone (
106- result ,
107- "The output for invalid token '%INVALID%' should be None." ,
108- )
105+ # see https://github.com/ChEB-AI/python-chebai/issues/137
106+ raw_datas = ["%INVALID%" , "ADADAD" , "ADASDAD" , "CC(=O)NC1[Mg-2]" ]
107+ for raw_data in raw_datas :
108+ result = self .reader ._read_data (raw_data )
109+ self .assertIsNone (
110+ result ,
111+ f"The output for invalid token '{ raw_data } ' should be None." ,
112+ )
113+
114+ def test_read_data_with_invalid_input_with_no_canonicalize (self ) -> None :
115+ """
116+ Test the _read_data method with invalid inputs while SMILES canonicalization is disabled.
117+ Each invalid input should prompt a return value of None.
118+ """
119+ self .reader .canonicalize_smiles = False
120+ raw_datas = ["%INVALID%" , "ADADAD" , "ADASDAD" , "CC(=O)NC1[Mg-2]" ]
121+ for raw_data in raw_datas :
122+ result = self .reader ._read_data (raw_data )
123+ self .assertIsNone (
124+ result ,
125+ f"The output for invalid token '{ raw_data } ' should be None." ,
126+ )
127+ self .reader .canonicalize_smiles = True # Reset to original state
109128
110129 @patch ("builtins.open" , new_callable = mock_open )
111130 def test_finish_method_for_new_tokens (self , mock_file : mock_open ) -> None :
0 commit comments