diff --git a/twitter_data_ingestion.py b/twitter_data_ingestion.py index 2f02c7f..2546bde 100644 --- a/twitter_data_ingestion.py +++ b/twitter_data_ingestion.py @@ -300,6 +300,10 @@ def _save_to_excel(json_filename, output_filename="data/data.xlsx"): # Drop duplicates & save to Excel cur_df.drop_duplicates(subset=["url"], inplace=True) + # Define the regular expression matching illegal characters + ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]') + # Replace illegal characters using the regular expression + cur_df = cur_df.replace(ILLEGAL_CHARACTERS_RE, '', regex=True) cur_df.to_excel(output_filename, index=False) logger.info( f"\n\nDone saving to {output_filename}. Total of {len(cur_df)} unique tweets."