Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions config/base_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,10 @@
# 爬取间隔时间
CRAWLER_MAX_SLEEP_SEC = 2

# 自动重试机制开关(仅针对小红书和抖音平台)
# 开启后,网络相关异常(超时、连接失败等)会自动重试3次,采用指数退避策略(1s、2s、4s)
ENABLE_AUTO_RETRY = True

from .bilibili_config import *
from .xhs_config import *
from .dy_config import *
Expand Down
3 changes: 3 additions & 0 deletions media_platform/douyin/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from base.base_crawler import AbstractApiClient
from proxy.proxy_mixin import ProxyRefreshMixin
from tools import utils
from tools.retry_decorator import auto_retry
from var import request_keyword_var

if TYPE_CHECKING:
Expand Down Expand Up @@ -112,6 +113,7 @@ async def __process_req_params(
a_bogus = await get_a_bogus(uri, query_string, post_data, headers["User-Agent"], self.playwright_page)
params["a_bogus"] = a_bogus

@auto_retry(max_retries=3, base_delay=1.0)
async def request(self, method, url, **kwargs):
# 每次请求前检测代理是否过期
await self._refresh_proxy_if_expired()
Expand Down Expand Up @@ -332,6 +334,7 @@ async def get_all_user_aweme_posts(self, sec_user_id: str, callback: Optional[Ca
result.extend(aweme_list)
return result

@auto_retry(max_retries=3, base_delay=1.0)
async def get_aweme_media(self, url: str) -> Union[bytes, None]:
async with httpx.AsyncClient(proxy=self.proxy) as client:
try:
Expand Down
6 changes: 3 additions & 3 deletions media_platform/xhs/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,12 @@

import httpx
from playwright.async_api import BrowserContext, Page
from tenacity import retry, stop_after_attempt, wait_fixed

import config
from base.base_crawler import AbstractApiClient
from proxy.proxy_mixin import ProxyRefreshMixin
from tools import utils
from tools.retry_decorator import auto_retry

if TYPE_CHECKING:
from proxy.proxy_ip_pool import ProxyIpPool
Expand Down Expand Up @@ -109,7 +109,7 @@ async def _pre_headers(self, url: str, params: Optional[Dict] = None, payload: O
self.headers.update(headers)
return self.headers

@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
@auto_retry(max_retries=3, base_delay=1.0)
async def request(self, method, url, **kwargs) -> Union[str, Any]:
"""
Wrapper for httpx common request method, processes request response
Expand Down Expand Up @@ -613,7 +613,7 @@ async def get_note_short_url(self, note_id: str) -> Dict:
data = {"original_url": f"{self._domain}/discovery/item/{note_id}"}
return await self.post(uri, data=data, return_response=True)

@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
@auto_retry(max_retries=3, base_delay=1.0)
async def get_note_by_id_from_html(
self,
note_id: str,
Expand Down
5 changes: 2 additions & 3 deletions media_platform/xhs/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,14 @@
Playwright,
async_playwright,
)
from tenacity import RetryError

import config
from base.base_crawler import AbstractCrawler
from model.m_xiaohongshu import NoteUrlInfo, CreatorUrlInfo
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
from store import xhs as xhs_store
from tools import utils
from tools.cdp_browser import CDPBrowserManager
from tools.retry_decorator import RetryExhaustedError
from var import crawler_type_var, source_keyword_var

from .client import XiaoHongShuClient
Expand Down Expand Up @@ -291,7 +290,7 @@ async def get_note_detail_async_task(
try:
try:
note_detail = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
except RetryError:
except RetryExhaustedError:
pass

if not note_detail:
Expand Down
Loading