From 4ebfe3eed72a2d16c855db096581266e209392c8 Mon Sep 17 00:00:00 2001
From: soulmachine <soulmachine@gmail.com>
Date: Wed, 18 Mar 2026 17:58:21 -0700
Subject: [PATCH 1/3] Add time cutoff for XHS search

---
 config/xhs_config.py       |  6 +++-
 media_platform/xhs/core.py | 59 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 62 insertions(+), 3 deletions(-)

diff --git a/config/xhs_config.py b/config/xhs_config.py
index 02cc96645..9ff6e2375 100644
--- a/config/xhs_config.py
+++ b/config/xhs_config.py
@@ -21,7 +21,11 @@
 # Xiaohongshu platform configuration
 
 # Sorting method, the specific enumeration value is in media_platform/xhs/field.py
-SORT_TYPE = "popularity_descending"
+SORT_TYPE = "time_descending"
+
+# Stop pagination when search results contain notes older than the configured hours.
+# Set to 24 for "within one day", or 0 to disable this cutoff.
+XHS_NOTE_MAX_AGE_HOURS = 24
 
 # Specify the note URL list, which must carry the xsec_token parameter
 XHS_SPECIFIED_NOTE_URL_LIST = [
diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py
index f797e791b..497ac96b2 100644
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@@ -122,11 +122,38 @@ async def start(self) -> None:
 
             utils.logger.info("[XiaoHongShuCrawler.start] Xhs Crawler finished ...")
 
+    @staticmethod
+    def _normalize_timestamp_to_milliseconds(timestamp: Optional[int]) -> int:
+        """Normalize unix timestamps to milliseconds (values below 1e12 are treated as seconds); return 0 if missing or invalid."""
+        if timestamp is None:
+            return 0
+
+        try:
+            normalized_timestamp = int(timestamp)
+        except (TypeError, ValueError):
+            return 0
+
+        if normalized_timestamp < 1000000000000:
+            return normalized_timestamp * 1000
+        return normalized_timestamp
+
     async def search(self) -> None:
         """Search for notes and retrieve their comment information."""
         utils.logger.info("[XiaoHongShuCrawler.search] Begin search Xiaohongshu keywords")
         xhs_limit_count = 20  # Xiaohongshu limit page fixed value
-        if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count:
+        max_note_age_hours = int(getattr(config, "XHS_NOTE_MAX_AGE_HOURS", 0) or 0)
+        enable_time_cutoff = (
+            max_note_age_hours > 0
+            and config.SORT_TYPE == SearchSortType.LATEST.value
+        )
+
+        if max_note_age_hours > 0 and not enable_time_cutoff:
+            utils.logger.warning(
+                "[XiaoHongShuCrawler.search] XHS_NOTE_MAX_AGE_HOURS is configured, "
+                "but SORT_TYPE is not 'time_descending'. Skip auto-stop to avoid incorrect early termination."
+            )
+
+        if not enable_time_cutoff and config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count:
             config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count
         start_page = config.START_PAGE
         for keyword in config.KEYWORDS.split(","):
@@ -134,7 +161,7 @@ async def search(self) -> None:
             utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
             page = 1
             search_id = get_search_id()
-            while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+            while enable_time_cutoff or (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                 if page < start_page:
                     utils.logger.info(f"[XiaoHongShuCrawler.search] Skip page {page}")
                     page += 1
@@ -172,7 +199,35 @@ async def search(self) -> None:
                             xsec_tokens.append(note_detail.get("xsec_token"))
                     page += 1
                     utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
+                    # Time filter: check whether any note on this page is older than the threshold
+                    # see https://github.com/NanmiCoder/MediaCrawler/issues/848#issuecomment-4081551779
+                    should_stop = False
+                    if enable_time_cutoff:
+                        threshold_timestamp = (
+                            utils.get_current_timestamp()
+                            - max_note_age_hours * 60 * 60 * 1000
+                        )
+                        valid_note_count = 0
+                        for note_detail in note_details:
+                            if not note_detail:
+                                continue
+
+                            note_time = self._normalize_timestamp_to_milliseconds(note_detail.get("time"))
+                            if note_time and note_time < threshold_timestamp:
+                                should_stop = True
+                                note_ids = note_ids[:valid_note_count]
+                                xsec_tokens = xsec_tokens[:valid_note_count]
+                                utils.logger.info(
+                                    f"[XiaoHongShuCrawler.search] Found note older than {max_note_age_hours} hours, "
+                                    f"stop after current page. note_id: {note_detail.get('note_id')}, "
+                                    f"note_time: {utils.get_time_str_from_unix_time(note_time)}"
+                                )
+                                break
+
+                            valid_note_count += 1
                     await self.batch_get_note_comments(note_ids, xsec_tokens)
+                    if should_stop:
+                        break  # exit the while loop
 
                     # Sleep after each page navigation
                     await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)

From 529a730cc1863d3f832b9ea9b312baf33262c7c4 Mon Sep 17 00:00:00 2001
From: soulmachine <soulmachine@gmail.com>
Date: Wed, 18 Mar 2026 17:59:57 -0700
Subject: [PATCH 2/3] Restore SORT_TYPE default to popularity_descending

---
 config/xhs_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/xhs_config.py b/config/xhs_config.py
index 9ff6e2375..efc98f8a2 100644
--- a/config/xhs_config.py
+++ b/config/xhs_config.py
@@ -21,7 +21,7 @@
 # Xiaohongshu platform configuration
 
 # Sorting method, the specific enumeration value is in media_platform/xhs/field.py
-SORT_TYPE = "time_descending"
+SORT_TYPE = "popularity_descending"
 
 # Stop pagination when search results contain notes older than the configured hours.
 # Set to 24 for "within one day", or 0 to disable this cutoff.

From 2728b5db0e8e7e15caa921dd16600aa8e0eddb80 Mon Sep 17 00:00:00 2001
From: soulmachine <soulmachine@gmail.com>
Date: Wed, 18 Mar 2026 18:07:20 -0700
Subject: [PATCH 3/3] Revert unbounded pagination loop condition in XHS search

---
 media_platform/xhs/core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py
index 497ac96b2..5b91cae0c 100644
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@@ -161,7 +161,7 @@ async def search(self) -> None:
             utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
             page = 1
             search_id = get_search_id()
-            while enable_time_cutoff or (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+            while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                 if page < start_page:
                     utils.logger.info(f"[XiaoHongShuCrawler.search] Skip page {page}")
                     page += 1