From 4ebfe3eed72a2d16c855db096581266e209392c8 Mon Sep 17 00:00:00 2001 From: soulmachine Date: Wed, 18 Mar 2026 17:58:21 -0700 Subject: [PATCH 1/3] Add time cutoff for XHS search --- config/xhs_config.py | 6 +++- media_platform/xhs/core.py | 59 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 62 insertions(+), 3 deletions(-) diff --git a/config/xhs_config.py b/config/xhs_config.py index 02cc96645..9ff6e2375 100644 --- a/config/xhs_config.py +++ b/config/xhs_config.py @@ -21,7 +21,11 @@ # Xiaohongshu platform configuration # Sorting method, the specific enumeration value is in media_platform/xhs/field.py -SORT_TYPE = "popularity_descending" +SORT_TYPE = "time_descending" + +# Stop pagination when search results contain notes older than the configured hours. +# Set to 24 for "within one day", or 0 to disable this cutoff. +XHS_NOTE_MAX_AGE_HOURS = 24 # Specify the note URL list, which must carry the xsec_token parameter XHS_SPECIFIED_NOTE_URL_LIST = [ diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py index f797e791b..497ac96b2 100644 --- a/media_platform/xhs/core.py +++ b/media_platform/xhs/core.py @@ -122,11 +122,38 @@ async def start(self) -> None: utils.logger.info("[XiaoHongShuCrawler.start] Xhs Crawler finished ...") + @staticmethod + def _normalize_timestamp_to_milliseconds(timestamp: Optional[int]) -> int: + """Normalize 10/13-digit unix timestamps to milliseconds.""" + if timestamp is None: + return 0 + + try: + normalized_timestamp = int(timestamp) + except (TypeError, ValueError): + return 0 + + if normalized_timestamp < 1000000000000: + return normalized_timestamp * 1000 + return normalized_timestamp + async def search(self) -> None: """Search for notes and retrieve their comment information.""" utils.logger.info("[XiaoHongShuCrawler.search] Begin search Xiaohongshu keywords") xhs_limit_count = 20 # Xiaohongshu limit page fixed value - if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count: + max_note_age_hours = int(getattr(config, "XHS_NOTE_MAX_AGE_HOURS", 0) or 0) + enable_time_cutoff = ( + max_note_age_hours > 0 + and config.SORT_TYPE == SearchSortType.LATEST.value + ) + + if max_note_age_hours > 0 and not enable_time_cutoff: + utils.logger.warning( + "[XiaoHongShuCrawler.search] XHS_NOTE_MAX_AGE_HOURS is configured, " + "but SORT_TYPE is not 'time_descending'. Skip auto-stop to avoid incorrect early termination." + ) + + if not enable_time_cutoff and config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count: config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count start_page = config.START_PAGE for keyword in config.KEYWORDS.split(","): @@ -134,7 +161,7 @@ async def search(self) -> None: utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}") page = 1 search_id = get_search_id() - while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: + while enable_time_cutoff or (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: if page < start_page: utils.logger.info(f"[XiaoHongShuCrawler.search] Skip page {page}") page += 1 @@ -172,7 +199,35 @@ async def search(self) -> None: xsec_tokens.append(note_detail.get("xsec_token")) page += 1 utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}") + # 时间筛选:检查是否有笔记早于阈值 + # see https://github.com/NanmiCoder/MediaCrawler/issues/848#issuecomment-4081551779 + should_stop = False + if enable_time_cutoff: + threshold_timestamp = ( + utils.get_current_timestamp() + - max_note_age_hours * 60 * 60 * 1000 + ) + valid_note_count = 0 + for note_detail in note_details: + if not note_detail: + continue + + note_time = self._normalize_timestamp_to_milliseconds(note_detail.get("time")) + if note_time and note_time < threshold_timestamp: + should_stop = True + note_ids = note_ids[:valid_note_count] + xsec_tokens = xsec_tokens[:valid_note_count] + utils.logger.info( + f"[XiaoHongShuCrawler.search] Found note older than {max_note_age_hours} hours, " + f"stop after current page. note_id: {note_detail.get('note_id')}, " + f"note_time: {utils.get_time_str_from_unix_time(note_time)}" + ) + break + + valid_note_count += 1 await self.batch_get_note_comments(note_ids, xsec_tokens) + if should_stop: + break # 跳出 while 循环 # Sleep after each page navigation await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC) From 529a730cc1863d3f832b9ea9b312baf33262c7c4 Mon Sep 17 00:00:00 2001 From: soulmachine Date: Wed, 18 Mar 2026 17:59:57 -0700 Subject: [PATCH 2/3] Restored SORT_TYPE --- config/xhs_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/xhs_config.py b/config/xhs_config.py index 9ff6e2375..efc98f8a2 100644 --- a/config/xhs_config.py +++ b/config/xhs_config.py @@ -21,7 +21,7 @@ # Xiaohongshu platform configuration # Sorting method, the specific enumeration value is in media_platform/xhs/field.py -SORT_TYPE = "time_descending" +SORT_TYPE = "popularity_descending" # Stop pagination when search results contain notes older than the configured hours. # Set to 24 for "within one day", or 0 to disable this cutoff. From 2728b5db0e8e7e15caa921dd16600aa8e0eddb80 Mon Sep 17 00:00:00 2001 From: soulmachine Date: Wed, 18 Mar 2026 18:07:20 -0700 Subject: [PATCH 3/3] revert --- media_platform/xhs/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py index 497ac96b2..5b91cae0c 100644 --- a/media_platform/xhs/core.py +++ b/media_platform/xhs/core.py @@ -161,7 +161,7 @@ async def search(self) -> None: utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}") page = 1 search_id = get_search_id() - while enable_time_cutoff or (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: + while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: if page < start_page: utils.logger.info(f"[XiaoHongShuCrawler.search] Skip page {page}") page += 1