@@ -67,7 +67,7 @@ def _id_to_text(self, input_ids, input_json, image_prefix="images"):
6767
6868 def _convert_response (self , input_response , input_json_path , image_prefix = "images" ):
6969 qa_list = []
70- with open (input_json_path , 'r' ) as infile :
70+ with open (input_json_path , 'r' , encoding = 'utf-8' ) as infile :
7171 input_json = list (json .load (infile ))
7272 # 提取title
7373 for chapter_block in re .findall (r'<chapter>(.*?)</chapter>' , input_response , flags = re .DOTALL ):
@@ -116,18 +116,18 @@ def run(self, storage: DataFlowStorage,
116116 qa_list = self ._convert_response (response , converted_json_path , image_prefix )
117117 output_qalist_path = os .path .join (self .output_dir , name , f"extracted_vqa.jsonl" )
118118 os .makedirs (os .path .dirname (output_qalist_path ), exist_ok = True )
119- with open (output_qalist_path , 'w' ) as outfile :
119+ with open (output_qalist_path , 'w' , encoding = 'utf-8' ) as outfile :
120120 for qa in qa_list :
121121 json .dump (qa , outfile , ensure_ascii = False )
122122 outfile .write ('\n ' )
123123
124124 # 复制图片
125- src_dir = converted_json_path . rpartition ( '/' )[ 0 ]
125+ src_dir = os . path . dirname ( converted_json_path )
126126 src_images = os .path .join (src_dir , 'vlm' , 'images' )
127127 if not os .path .exists (src_images ):
128128 src_images = os .path .join (src_dir , 'images' )
129129 if not os .path .exists (src_images ):
130- raise ValueError ("Images directory not found! There might be a change in Mineru API!" )
130+ raise ValueError (f "Images directory { src_images } not found! There might be a change in Mineru API!" )
131131 dst_images = os .path .join (self .output_dir , image_prefix )
132132
133133 try :
0 commit comments