From 9c60a964e065e6680392a69864bf6441c0426219 Mon Sep 17 00:00:00 2001 From: soaringk Date: Thu, 26 Mar 2026 21:36:36 +0800 Subject: [PATCH] Goal: add an upstreamable static HTTP proxy option that fits the existing provider abstraction and can be exercised directly from the CLI. Primary changes: introduce a static proxy provider and enum entry, add --static_proxy, automatically select the static provider when the flag is supplied --- cmd_arg/arg.py | 16 ++++-- config/base_config.py | 2 +- proxy/providers/__init__.py | 1 + proxy/providers/static_http_proxy.py | 78 ++++++++++++++++++++++++++++ proxy/proxy_ip_pool.py | 2 + proxy/types.py | 1 + test/test_static_http_proxy.py | 60 +++++++++++++++++++++ 7 files changed, 156 insertions(+), 4 deletions(-) create mode 100644 proxy/providers/static_http_proxy.py create mode 100644 test/test_static_http_proxy.py diff --git a/cmd_arg/arg.py b/cmd_arg/arg.py index cbbcc0383..3168f873a 100644 --- a/cmd_arg/arg.py +++ b/cmd_arg/arg.py @@ -235,6 +235,14 @@ def main( rich_help_panel="Account Configuration", ), ] = config.COOKIES, + static_proxy: Annotated[ + str, + typer.Option( + "--static_proxy", + help="Static HTTP proxy URL list (comma-separated, for example http://user:pass@host:port,http://host2:port). When set, overrides provider-based proxy rotation.", + rich_help_panel="Proxy Configuration", + ), + ] = "", specified_id: Annotated[ str, typer.Option( @@ -296,7 +304,7 @@ def main( str, typer.Option( "--ip_proxy_provider_name", - help="IP proxy provider name (kuaidaili | wandouhttp)", + help="IP proxy provider name (kuaidaili | wandouhttp | static)", rich_help_panel="Proxy Configuration", ), ] = config.IP_PROXY_PROVIDER_NAME, @@ -325,12 +333,13 @@ def main( config.CDP_HEADLESS = enable_headless config.SAVE_DATA_OPTION = save_data_option.value config.COOKIES = cookies + config.STATIC_PROXY_URL = static_proxy.strip() config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = max_comments_count_singlenotes config.MAX_CONCURRENCY_NUM = max_concurrency_num config.SAVE_DATA_PATH = save_data_path - config.ENABLE_IP_PROXY = enable_ip_proxy_value + config.ENABLE_IP_PROXY = enable_ip_proxy_value or bool(config.STATIC_PROXY_URL) config.IP_PROXY_POOL_COUNT = ip_proxy_pool_count - config.IP_PROXY_PROVIDER_NAME = ip_proxy_provider_name + config.IP_PROXY_PROVIDER_NAME = "static" if config.STATIC_PROXY_URL else ip_proxy_provider_name # Set platform-specific ID lists for detail/creator mode if specified_id_list: @@ -369,6 +378,7 @@ def main( save_data_option=config.SAVE_DATA_OPTION, init_db=init_db_value, cookies=config.COOKIES, + static_proxy=static_proxy.strip(), specified_id=specified_id, creator_id=creator_id, ) diff --git a/config/base_config.py b/config/base_config.py index 83571e06b..1b76a7977 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -32,7 +32,7 @@ IP_PROXY_POOL_COUNT = 2 # Proxy IP provider name -IP_PROXY_PROVIDER_NAME = "kuaidaili" # kuaidaili | wandouhttp +IP_PROXY_PROVIDER_NAME = "kuaidaili" # kuaidaili | wandouhttp | static # Setting to True will not open the browser (headless browser) # Setting False will open a browser diff --git a/proxy/providers/__init__.py b/proxy/providers/__init__.py index ed6b1943a..8c301b2ee 100644 --- a/proxy/providers/__init__.py +++ b/proxy/providers/__init__.py @@ -24,4 +24,5 @@ # @Desc : from .jishu_http_proxy import new_jisu_http_proxy from .kuaidl_proxy import new_kuai_daili_proxy +from .static_http_proxy import new_static_http_proxy from .wandou_http_proxy import new_wandou_http_proxy diff --git a/proxy/providers/static_http_proxy.py b/proxy/providers/static_http_proxy.py new file mode 100644 index 000000000..cce71f2ea --- /dev/null +++ b/proxy/providers/static_http_proxy.py @@ -0,0 +1,78 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2025 relakkes@gmail.com +# +# This file is part of MediaCrawler project. +# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/proxy/providers/static_http_proxy.py +# GitHub: https://github.com/NanmiCoder +# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1 +# + +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + + +# -*- coding: utf-8 -*- +# @Author : relakkes@gmail.com +# @Time : 2026/3/26 +# @Desc : Static HTTP proxy provider implementation +from typing import List +from urllib.parse import urlparse + +import config + +from proxy import ProxyProvider +from proxy.types import IpInfoModel, ProviderNameEnum + + +def parse_static_http_proxy(proxy_url: str) -> IpInfoModel: + normalized = proxy_url.strip() + if not normalized: + raise ValueError("--static_proxy is empty") + if "://" not in normalized: + normalized = f"http://{normalized}" + + parsed = urlparse(normalized) + if parsed.scheme not in ("http",): + raise ValueError("--static_proxy must use the http scheme") + if not parsed.hostname or not parsed.port: + raise ValueError("--static_proxy must include host and port") + + return IpInfoModel( + ip=parsed.hostname, + port=parsed.port, + user=parsed.username or "", + password=parsed.password or "", + protocol="http://", + expired_time_ts=None, + ) + + +def parse_static_http_proxies(proxy_urls: str) -> List[IpInfoModel]: + proxies = [ + parse_static_http_proxy(item) + for item in proxy_urls.split(",") + if item.strip() + ] + if not proxies: + raise ValueError("--static_proxy is empty") + return proxies + + +class StaticHttpProxy(ProxyProvider): + def __init__(self): + self.proxy_brand_name = ProviderNameEnum.STATIC_HTTP_PROVIDER.value + + async def get_proxy(self, num: int) -> List[IpInfoModel]: + del num + return parse_static_http_proxies(getattr(config, "STATIC_PROXY_URL", "")) + + +def new_static_http_proxy() -> StaticHttpProxy: + return StaticHttpProxy() diff --git a/proxy/proxy_ip_pool.py b/proxy/proxy_ip_pool.py index 8f4a12969..854c1aa6a 100644 --- a/proxy/proxy_ip_pool.py +++ b/proxy/proxy_ip_pool.py @@ -31,6 +31,7 @@ import config from proxy.providers import ( new_kuai_daili_proxy, + new_static_http_proxy, new_wandou_http_proxy, ) from tools import utils @@ -153,6 +154,7 @@ async def _reload_proxies(self): IpProxyProvider: Dict[str, ProxyProvider] = { ProviderNameEnum.KUAI_DAILI_PROVIDER.value: new_kuai_daili_proxy(), ProviderNameEnum.WANDOU_HTTP_PROVIDER.value: new_wandou_http_proxy(), + ProviderNameEnum.STATIC_HTTP_PROVIDER.value: new_static_http_proxy(), } diff --git a/proxy/types.py b/proxy/types.py index e20314105..cd75c7813 100644 --- a/proxy/types.py +++ b/proxy/types.py @@ -32,6 +32,7 @@ class ProviderNameEnum(Enum): KUAI_DAILI_PROVIDER: str = "kuaidaili" WANDOU_HTTP_PROVIDER: str = "wandouhttp" + STATIC_HTTP_PROVIDER: str = "static" class IpInfoModel(BaseModel): diff --git a/test/test_static_http_proxy.py b/test/test_static_http_proxy.py new file mode 100644 index 000000000..ba09c794e --- /dev/null +++ b/test/test_static_http_proxy.py @@ -0,0 +1,60 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2025 relakkes@gmail.com +# +# This file is part of MediaCrawler project. +# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/test/test_static_http_proxy.py +# GitHub: https://github.com/NanmiCoder +# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1 +# + +from unittest import IsolatedAsyncioTestCase +from unittest.mock import patch + +from proxy.providers.static_http_proxy import ( + new_static_http_proxy, + parse_static_http_proxies, + parse_static_http_proxy, +) + + +class TestStaticHttpProxy(IsolatedAsyncioTestCase): + def test_parse_static_http_proxy_with_auth(self): + proxy = parse_static_http_proxy("http://user:pass@127.0.0.1:8899") + + self.assertEqual(proxy.ip, "127.0.0.1") + self.assertEqual(proxy.port, 8899) + self.assertEqual(proxy.user, "user") + self.assertEqual(proxy.password, "pass") + self.assertIsNone(proxy.expired_time_ts) + + def test_parse_static_http_proxy_accepts_bare_host_port(self): + proxy = parse_static_http_proxy("127.0.0.1:8899") + + self.assertEqual(proxy.ip, "127.0.0.1") + self.assertEqual(proxy.port, 8899) + self.assertEqual(proxy.user, "") + self.assertEqual(proxy.password, "") + + def test_parse_static_http_proxies_accepts_comma_separated_list(self): + proxies = parse_static_http_proxies( + "http://user:pass@127.0.0.1:8899,127.0.0.2:9900", + ) + + self.assertEqual(len(proxies), 2) + self.assertEqual(proxies[0].ip, "127.0.0.1") + self.assertEqual(proxies[0].port, 8899) + self.assertEqual(proxies[0].user, "user") + self.assertEqual(proxies[1].ip, "127.0.0.2") + self.assertEqual(proxies[1].port, 9900) + + async def test_provider_reads_static_proxy_from_config(self): + provider = new_static_http_proxy() + + with patch("config.STATIC_PROXY_URL", "http://127.0.0.1:8899,http://127.0.0.2:9900", create=True): + proxies = await provider.get_proxy(3) + + self.assertEqual(len(proxies), 2) + self.assertEqual(proxies[0].ip, "127.0.0.1") + self.assertEqual(proxies[0].port, 8899) + self.assertEqual(proxies[1].ip, "127.0.0.2") + self.assertEqual(proxies[1].port, 9900)