diff --git a/config/xhs_config.py b/config/xhs_config.py index 02cc96645..efc98f8a2 100644 --- a/config/xhs_config.py +++ b/config/xhs_config.py @@ -23,6 +23,10 @@ # Sorting method, the specific enumeration value is in media_platform/xhs/field.py SORT_TYPE = "popularity_descending" +# Stop pagination when search results contain notes older than the configured hours. +# Set to 24 for "within one day", or 0 to disable this cutoff. +XHS_NOTE_MAX_AGE_HOURS = 24 + # Specify the note URL list, which must carry the xsec_token parameter XHS_SPECIFIED_NOTE_URL_LIST = [ "https://www.xiaohongshu.com/explore/64b95d01000000000c034587?xsec_token=AB0EFqJvINCkj6xOCKCQgfNNh8GdnBC_6XecG4QOddo3Q=&xsec_source=pc_cfeed" diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py index f797e791b..5b91cae0c 100644 --- a/media_platform/xhs/core.py +++ b/media_platform/xhs/core.py @@ -122,11 +122,38 @@ async def start(self) -> None: utils.logger.info("[XiaoHongShuCrawler.start] Xhs Crawler finished ...") + @staticmethod + def _normalize_timestamp_to_milliseconds(timestamp: Optional[int]) -> int: + """Normalize 10/13-digit unix timestamps to milliseconds.""" + if timestamp is None: + return 0 + + try: + normalized_timestamp = int(timestamp) + except (TypeError, ValueError): + return 0 + + if normalized_timestamp < 1000000000000: + return normalized_timestamp * 1000 + return normalized_timestamp + async def search(self) -> None: """Search for notes and retrieve their comment information.""" utils.logger.info("[XiaoHongShuCrawler.search] Begin search Xiaohongshu keywords") xhs_limit_count = 20 # Xiaohongshu limit page fixed value - if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count: + max_note_age_hours = int(getattr(config, "XHS_NOTE_MAX_AGE_HOURS", 0) or 0) + enable_time_cutoff = ( + max_note_age_hours > 0 + and config.SORT_TYPE == SearchSortType.LATEST.value + ) + + if max_note_age_hours > 0 and not enable_time_cutoff: + utils.logger.warning( + "[XiaoHongShuCrawler.search] XHS_NOTE_MAX_AGE_HOURS is configured, " + "but SORT_TYPE is not 'time_descending'. Skip auto-stop to avoid incorrect early termination." + ) + + if not enable_time_cutoff and config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count: config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count start_page = config.START_PAGE for keyword in config.KEYWORDS.split(","): @@ -172,7 +199,35 @@ async def search(self) -> None: xsec_tokens.append(note_detail.get("xsec_token")) page += 1 utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}") + # 时间筛选:检查是否有笔记早于阈值 + # see https://github.com/NanmiCoder/MediaCrawler/issues/848#issuecomment-4081551779 + should_stop = False + if enable_time_cutoff: + threshold_timestamp = ( + utils.get_current_timestamp() + - max_note_age_hours * 60 * 60 * 1000 + ) + valid_note_count = 0 + for note_detail in note_details: + if not note_detail: + continue + + note_time = self._normalize_timestamp_to_milliseconds(note_detail.get("time")) + if note_time and note_time < threshold_timestamp: + should_stop = True + note_ids = note_ids[:valid_note_count] + xsec_tokens = xsec_tokens[:valid_note_count] + utils.logger.info( + f"[XiaoHongShuCrawler.search] Found note older than {max_note_age_hours} hours, " + f"stop after current page. note_id: {note_detail.get('note_id')}, " + f"note_time: {utils.get_time_str_from_unix_time(note_time)}" + ) + break + + valid_note_count += 1 await self.batch_get_note_comments(note_ids, xsec_tokens) + if should_stop: + break # 跳出 while 循环 # Sleep after each page navigation await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)