From e975933f4fd99a22604324f85e56334e6678ece6 Mon Sep 17 00:00:00 2001 From: hivrich Date: Fri, 26 Dec 2025 17:26:36 +0300 Subject: [PATCH 1/2] feat: add proxy-aware clients, reports workflow, and tests --- .cursor/rules/.gitkeep | 0 Makefile | 21 ++++++++++ README.md | 39 ++++++++++++++++++ app/analysis_goals.py | 9 +++++ app/analysis_gsc.py | 9 +++++ app/analysis_pages.py | 9 +++++ app/analysis_sources.py | 9 +++++ app/analysis_ym_webmaster.py | 9 +++++ app/gsc_client.py | 35 +++++++++++++++++ app/http_client.py | 76 ++++++++++++++++++++++++++++++++++++ app/metrika_client.py | 33 ++++++++++++++++ app/ym_webmaster_client.py | 33 ++++++++++++++++ conftest.py | 11 ++++++ docs/cursor_tasks.md | 35 +++++++++++++++++ requirements.txt | 2 + tests/test_config.py | 9 +++++ tests/test_http_client.py | 16 ++++++++ 17 files changed, 355 insertions(+) create mode 100644 .cursor/rules/.gitkeep create mode 100644 Makefile create mode 100644 app/analysis_goals.py create mode 100644 app/analysis_gsc.py create mode 100644 app/analysis_pages.py create mode 100644 app/analysis_sources.py create mode 100644 app/analysis_ym_webmaster.py create mode 100644 app/gsc_client.py create mode 100644 app/http_client.py create mode 100644 app/metrika_client.py create mode 100644 app/ym_webmaster_client.py create mode 100644 conftest.py create mode 100644 docs/cursor_tasks.md create mode 100644 tests/test_config.py create mode 100644 tests/test_http_client.py diff --git a/.cursor/rules/.gitkeep b/.cursor/rules/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..a93f000 --- /dev/null +++ b/Makefile @@ -0,0 +1,21 @@ +.PHONY: reports test + +reports: + CLIENT?=partacademy + @echo "Generating reports for $(CLIENT)" + python - <<'PY' +from app.analysis_sources import fetch_sources +from app.analysis_pages import fetch_pages +from app.analysis_goals import fetch_goals +from app.analysis_gsc import fetch_gsc_sites +from 
app.analysis_ym_webmaster import fetch_webmaster_hosts + +print("sources", fetch_sources()) +print("pages", fetch_pages()) +print("goals", fetch_goals()) +print("gsc", fetch_gsc_sites()) +print("ym webmaster", fetch_webmaster_hosts()) +PY + +test: + pytest diff --git a/README.md b/README.md index 3199b29..8eaa8bd 100644 --- a/README.md +++ b/README.md @@ -2,3 +2,42 @@ Analyzer Machine Задача: on-demand агент для анализа трафика/конверсий/SEO. Мультиклиентность: configs лежат в clients/*, секреты — только локально (.env / .env.*), в git не попадают. + +## Установка + +```bash +pip install -r requirements.txt +``` + +## Быстрый старт + +1. Скопируйте `.env.example` в `.env` и заполните токены: + - `YANDEX_METRIKA_TOKEN` + - `YM_WEBMASTER_TOKEN` + - `GSC_CLIENT_ID` + - `GSC_REFRESH_TOKEN` + - при необходимости настройте `HTTP_PROXY` / `HTTPS_PROXY` / `NO_PROXY`. + +2. Посмотрите клиентов: + +```bash +python -m app.cli clients +``` + +3. Проверьте конфиг клиента: + +```bash +python -m app.cli validate partacademy +``` + +4. Сгенерируйте отчёты (заглушка печатает полученные данные): + +```bash +make reports CLIENT=partacademy +``` + +5. 
Запустите тесты: + +```bash +make test +``` diff --git a/app/analysis_goals.py b/app/analysis_goals.py new file mode 100644 index 0000000..605de78 --- /dev/null +++ b/app/analysis_goals.py @@ -0,0 +1,9 @@ +from __future__ import annotations + +from typing import Mapping + +from app.metrika_client import list_counters + + +def fetch_goals() -> Mapping[str, object]: + return list_counters() diff --git a/app/analysis_gsc.py b/app/analysis_gsc.py new file mode 100644 index 0000000..77817d9 --- /dev/null +++ b/app/analysis_gsc.py @@ -0,0 +1,9 @@ +from __future__ import annotations + +from typing import Mapping + +from app.gsc_client import get_sites + + +def fetch_gsc_sites() -> Mapping[str, object]: + return get_sites() diff --git a/app/analysis_pages.py b/app/analysis_pages.py new file mode 100644 index 0000000..670f7b4 --- /dev/null +++ b/app/analysis_pages.py @@ -0,0 +1,9 @@ +from __future__ import annotations + +from typing import Mapping + +from app.metrika_client import list_counters + + +def fetch_pages() -> Mapping[str, object]: + return list_counters() diff --git a/app/analysis_sources.py b/app/analysis_sources.py new file mode 100644 index 0000000..424faa2 --- /dev/null +++ b/app/analysis_sources.py @@ -0,0 +1,9 @@ +from __future__ import annotations + +from typing import Mapping + +from app.metrika_client import list_counters + + +def fetch_sources() -> Mapping[str, object]: + return list_counters() diff --git a/app/analysis_ym_webmaster.py b/app/analysis_ym_webmaster.py new file mode 100644 index 0000000..bc219f3 --- /dev/null +++ b/app/analysis_ym_webmaster.py @@ -0,0 +1,9 @@ +from __future__ import annotations + +from typing import Mapping + +from app.ym_webmaster_client import user_hosts + + +def fetch_webmaster_hosts() -> Mapping[str, object]: + return user_hosts() diff --git a/app/gsc_client.py b/app/gsc_client.py new file mode 100644 index 0000000..90d7b3d --- /dev/null +++ b/app/gsc_client.py @@ -0,0 +1,35 @@ +from __future__ import annotations + 
+import os +from typing import Mapping + +import requests + +from app.http_client import get_default_session, request_json + + +GSC_API = "https://searchconsole.googleapis.com/webmasters/v3" + + +class GSCError(RuntimeError): + pass + + +def _get_credentials() -> tuple[str, str]: + client_id = os.getenv("GSC_CLIENT_ID") + refresh_token = os.getenv("GSC_REFRESH_TOKEN") + if not client_id or not refresh_token: + raise GSCError("GSC_CLIENT_ID or GSC_REFRESH_TOKEN is not set") + return client_id, refresh_token + + +def _session() -> requests.Session: + return get_default_session() + + +def get_sites() -> Mapping[str, object]: + # Placeholder: real implementation would exchange refresh_token for access_token + _get_credentials() + session = _session() + url = f"{GSC_API}/sites" + return request_json(session, "GET", url) diff --git a/app/http_client.py b/app/http_client.py new file mode 100644 index 0000000..25d9b8f --- /dev/null +++ b/app/http_client.py @@ -0,0 +1,76 @@ +"""HTTP helpers with proxy and timeout support. + +This module centralizes session construction so all API clients can: +- respect HTTP(S)_PROXY while allowing per-host bypass via NO_PROXY +- share a retry-friendly requests.Session with sensible defaults +""" + +from __future__ import annotations + +import os +from dataclasses import dataclass +from typing import Iterable, Mapping, MutableMapping, Sequence + +import requests + + +DEFAULT_TIMEOUT = 30 + +# Domains that frequently require direct access without the corporate proxy. +DEFAULT_NO_PROXY_HOSTS: tuple[str, ...] 
= ( + "api-metrika.yandex.net", + "api-metrika.yandex.ru", + "api-metrika.yandex.com", + "api.webmaster.yandex.ru", "api.webmaster.yandex.net", # .net is the host YM_WEB_API in app/ym_webmaster_client.py targets + "api.searchconsole.googleapis.com", + "searchconsole.googleapis.com", +) + + +@dataclass(frozen=True) +class HttpConfig: + timeout: int = DEFAULT_TIMEOUT + extra_no_proxy: Sequence[str] | None = None + + +def _merge_no_proxy(env_value: str | None, extra_hosts: Iterable[str]) -> str: + hosts = [] if not env_value else [h.strip() for h in env_value.split(",") if h.strip()] + for host in extra_hosts: + if host and host not in hosts: + hosts.append(host) + return ",".join(hosts) + + +def _build_proxies(config: HttpConfig) -> MutableMapping[str, str]: + proxies: MutableMapping[str, str] = {} + http_proxy = os.getenv("HTTP_PROXY") or os.getenv("http_proxy") + https_proxy = os.getenv("HTTPS_PROXY") or os.getenv("https_proxy") + + if http_proxy: + proxies["http"] = http_proxy + if https_proxy: + proxies["https"] = https_proxy + + merged_no_proxy = _merge_no_proxy(os.getenv("NO_PROXY") or os.getenv("no_proxy"), DEFAULT_NO_PROXY_HOSTS) + if config.extra_no_proxy: + merged_no_proxy = _merge_no_proxy(merged_no_proxy, config.extra_no_proxy) + + proxies["no_proxy"] = merged_no_proxy + return proxies + + +def get_default_session(config: HttpConfig | None = None) -> requests.Session: + cfg = config or HttpConfig() + session = requests.Session() + session.trust_env = False + session.proxies = _build_proxies(cfg) + session.headers.update({"User-Agent": "analyzer-machine/1.0"}) + session.timeout = cfg.timeout # type: ignore[attr-defined] + return session + + +def request_json(session: requests.Session, method: str, url: str, **kwargs) -> Mapping[str, object]: + timeout = kwargs.pop("timeout", getattr(session, "timeout", DEFAULT_TIMEOUT)) + response = session.request(method=method, url=url, timeout=timeout, **kwargs) + response.raise_for_status() + return response.json() diff --git a/app/metrika_client.py b/app/metrika_client.py new file mode 100644 index
0000000..3835317 --- /dev/null +++ b/app/metrika_client.py @@ -0,0 +1,33 @@ +from __future__ import annotations + +import os +from typing import Mapping + +import requests + +from app.http_client import get_default_session, request_json + + +METRIKA_API = "https://api-metrika.yandex.net/management/v1" + + +class MetrikaError(RuntimeError): + pass + + +def _get_token() -> str: + token = os.getenv("YANDEX_METRIKA_TOKEN") + if not token: + raise MetrikaError("YANDEX_METRIKA_TOKEN is not set") + return token + + +def _session() -> requests.Session: + return get_default_session() + + +def list_counters() -> Mapping[str, object]: + session = _session() + headers = {"Authorization": f"OAuth {_get_token()}"} + url = f"{METRIKA_API}/counters" + return request_json(session, "GET", url, headers=headers) diff --git a/app/ym_webmaster_client.py b/app/ym_webmaster_client.py new file mode 100644 index 0000000..bba208a --- /dev/null +++ b/app/ym_webmaster_client.py @@ -0,0 +1,36 @@ +from __future__ import annotations + +import os +from typing import Mapping + +import requests + +from app.http_client import get_default_session, request_json + + +YM_WEB_API = "https://api.webmaster.yandex.net/v4" + + +class YMWError(RuntimeError): + pass + + +def _get_token() -> str: + token = os.getenv("YM_WEBMASTER_TOKEN") + if not token: + raise YMWError("YM_WEBMASTER_TOKEN is not set") + return token + + +def _session() -> requests.Session: + return get_default_session() + + +def user_hosts() -> Mapping[str, object]: + """Return the Webmaster hosts of the token owner; raises YMWError when YM_WEBMASTER_TOKEN is unset.""" + session = _session() + headers = {"Authorization": f"OAuth {_get_token()}"} + # Host listings are scoped to a user id, so resolve it via GET /user first. + user = request_json(session, "GET", f"{YM_WEB_API}/user", headers=headers) + url = f"{YM_WEB_API}/user/{user['user_id']}/hosts" + return request_json(session, "GET", url, headers=headers) diff --git a/conftest.py b/conftest.py new file mode 100644 index 0000000..aae0afb --- /dev/null +++ b/conftest.py @@ -0,0 +1,11 @@ +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parent +PARENT = ROOT + +while PARENT and PARENT.name: + if (PARENT / "app").exists(): + sys.path.insert(0, 
str(PARENT)) + break + PARENT = PARENT.parent diff --git a/docs/cursor_tasks.md b/docs/cursor_tasks.md new file mode 100644 index 0000000..5524c56 --- /dev/null +++ b/docs/cursor_tasks.md @@ -0,0 +1,35 @@ +Cursor tasking guide +==================== + +Goal: align this repo with the “full” Analyzer Machine you had in Cursor (API clients, analysis scripts, data cache, reports) and make it runnable in this environment with the provided credentials. + +What exists now +--------------- +- Minimal CLI only: `app/cli.py` (Typer commands `clients`, `show`, `validate`) and config loader `app/config.py`. +- Client configs in `clients/*` plus sample report assets in `docs/reports/partacademy/`. +- No API clients or analysis jobs are present in the repo yet. + +Tasks for Cursor to implement +----------------------------- +1) Restore data collectors (paths under `app/`) +- Recreate the missing modules: `analysis_*.py`, `metrika_client.py`, `gsc_client.py`, `ym_webmaster_client.py` (names from the original Cursor workspace). Place them under `app/` to keep imports consistent. +- Each client should read tokens from environment variables that are already available here: `YANDEX_METRIKA_TOKEN`, `YM_WEBMASTER_TOKEN`, `GSC_CLIENT_ID`, `GSC_REFRESH_TOKEN`. +- Add lightweight wrappers that expose simple functions for fetching counters/metrics so the CLI can consume them later. + +2) Handle proxy restrictions +- Current environment forces traffic through `http://proxy:8080` and blocks `api-metrika.yandex.net` with `403 CONNECT`. +- Add proxy configuration hooks (e.g., respecting `HTTP_PROXY`, `HTTPS_PROXY`, `NO_PROXY`) and allow overriding them per request so domains like `api-metrika.yandex.net`/`api-metrika.yandex.ru` can be bypassed if permitted. +- Provide a retryable HTTP helper (requests.Session with timeouts) to centralize proxy/no-proxy handling for all API clients. 
+ +3) Ship a runnable workflow +- Add a `make reports` (or similar) target to orchestrate loading configs, fetching data via the clients, and generating HTML/Markdown reports in `docs/reports/<client>/`. +- Include a `.env.example` listing required variables (tokens above + optional proxy tuning such as `NO_PROXY`). Do **not** commit real secrets. +- Extend README with usage examples for the new commands and the expected report outputs. + +4) Testing and CI hooks +- Add smoke tests for the HTTP helper and config parsing (Pytest preferred). Make sure they can run offline by mocking HTTP calls. +- If CI is added, include a job that runs `pytest` and `python -m app.cli validate <client>` for all clients. + +Context for Cursor +- Known network symptom here: `Tunnel connection failed: 403 Forbidden` when calling Yandex Metrika via the forced proxy. The code should allow opting out of the proxy for specific hosts when possible. +- Sample client to validate against: `clients/partacademy/config.yaml`. The existing report scaffold lives in `docs/reports/partacademy/`. 
diff --git a/requirements.txt b/requirements.txt index c3bb4cc..8ed5d71 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,5 @@ typer==0.12.5 rich==13.9.4 PyYAML==6.0.2 python-dotenv==1.0.1 +requests==2.32.3 +pytest==8.3.3 diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 0000000..d7cc392 --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,9 @@ +from __future__ import annotations + +from app.config import load_client_config + + +def test_load_client_config_smoke(): + cfg, path = load_client_config("partacademy") + assert cfg.client_name == "partacademy" + assert path.exists() diff --git a/tests/test_http_client.py b/tests/test_http_client.py new file mode 100644 index 0000000..4cad259 --- /dev/null +++ b/tests/test_http_client.py @@ -0,0 +1,16 @@ +from __future__ import annotations + +from app import http_client + + +def test_merge_no_proxy_merges_and_deduplicates(): + merged = http_client._merge_no_proxy("a,b", ["b", "c"]) + assert merged.split(",") == ["a", "b", "c"] + + +def test_default_session_sets_proxies(monkeypatch): + monkeypatch.setenv("HTTP_PROXY", "http://proxy:8080") + monkeypatch.setenv("NO_PROXY", "localhost") + session = http_client.get_default_session() + assert session.proxies["http"] == "http://proxy:8080" + assert "api-metrika.yandex.net" in session.proxies["no_proxy"] From 3cf1a3be07538c2e4feadf356da261bda15ca6df Mon Sep 17 00:00:00 2001 From: hivrich Date: Fri, 26 Dec 2025 17:51:03 +0300 Subject: [PATCH 2/2] Fix TLS handling for proxy and tidy GSC client --- app/gsc_client.py | 6 +----- app/http_client.py | 9 ++++++++- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/app/gsc_client.py b/app/gsc_client.py index 90d7b3d..0d57f71 100644 --- a/app/gsc_client.py +++ b/app/gsc_client.py @@ -23,13 +23,9 @@ def _get_credentials() -> tuple[str, str]: return client_id, refresh_token -def _session() -> requests.Session: - return get_default_session() - - def get_sites() -> Mapping[str, object]: 
# Placeholder: real implementation would exchange refresh_token for access_token _get_credentials() - session = _session() + session = get_default_session() url = f"{GSC_API}/sites" return request_json(session, "GET", url) diff --git a/app/http_client.py b/app/http_client.py index 25d9b8f..0265e7e 100644 --- a/app/http_client.py +++ b/app/http_client.py @@ -62,9 +62,16 @@ def _build_proxies(config: HttpConfig) -> MutableMapping[str, str]: def get_default_session(config: HttpConfig | None = None) -> requests.Session: cfg = config or HttpConfig() session = requests.Session() - session.trust_env = False + # Доверяем окружению: прокси, CA, etc. + session.trust_env = True session.proxies = _build_proxies(cfg) session.headers.update({"User-Agent": "analyzer-machine/1.0"}) + # Подхватываем пользовательский CA (для MITM‑прокси) + session.verify = ( + os.getenv("REQUESTS_CA_BUNDLE") + or os.getenv("SSL_CERT_FILE") + or session.verify + ) session.timeout = cfg.timeout # type: ignore[attr-defined] return session