# ── Keyword generation ────────────────────────────────────────────────────────

_CORP_SUFFIXES: frozenset[str] = frozenset({
    "inc", "corp", "corporation", "ltd", "limited", "plc", "llc", "lp",
    "group", "holdings", "co", "company", "technologies", "technology",
    "systems", "services", "solutions", "international", "global",
})

# Function words that appear inside company names ("The Home Depot") but carry
# no signal on their own; as stand-alone keywords they would match almost any
# article, so they are never emitted as individual tokens.
_NAME_STOPWORDS: frozenset[str] = frozenset({"the", "and", "for"})


def _name_tokens(name: str) -> list[str]:
    """Split a company name into the individual tokens worth using as keywords."""
    tokens: list[str] = []
    for raw in re.split(r"[\s,./&]+", name):
        # str.isalnum() is Unicode-aware, so accented letters survive the
        # clean-up ("Nestlé" stays "Nestlé" instead of becoming "Nestl").
        clean = "".join(ch for ch in raw if ch.isalnum())
        lowered = clean.lower()
        if clean and lowered not in _CORP_SUFFIXES and lowered not in _NAME_STOPWORDS:
            tokens.append(clean)
    return tokens


def generate_keywords(ticker: str) -> list[str]:
    """
    Build a keyword list for news correlation from Yahoo Finance metadata.

    The metadata lookup runs in a daemon thread and is abandoned if it has not
    finished within REQUEST_TIMEOUT, so a hung request cannot block the caller.

    Candidates, in order: the upper-cased ticker, the full ``longName`` and
    ``shortName`` plus their individual tokens (corporate suffixes and bare
    function words removed), and the surname of each of the first five
    entries in ``companyOfficers``.

    Args:
        ticker: Ticker symbol; surrounding whitespace and case are normalised.

    Returns:
        Keywords of at least three characters, deduplicated
        case-insensitively with first-seen order preserved. Falls back to
        ``[ticker]`` when the fetch fails or times out, when the metadata has
        no ``longName``, or when no candidate survives filtering.
    """
    ticker = ticker.upper().strip()

    # Filled in by the worker thread; only read after join() has returned.
    outcome: dict = {}

    def _fetch() -> None:
        try:
            outcome["info"] = yf.Ticker(ticker).info
        except Exception as exc:  # any provider/network error means "fall back"
            outcome["error"] = exc

    worker = threading.Thread(target=_fetch, daemon=True)
    worker.start()
    worker.join(timeout=REQUEST_TIMEOUT)

    if worker.is_alive():
        log.warning("generate_keywords(%r): metadata fetch timed out", ticker)
        return [ticker]

    if "error" in outcome:
        log.warning("generate_keywords(%r) failed: %s", ticker, outcome["error"])
        return [ticker]

    info = outcome.get("info")
    if not info or not info.get("longName"):
        return [ticker]

    candidates: list[str] = [ticker]

    for field in ("longName", "shortName"):
        val = (info.get(field) or "").strip()
        if not val:
            continue
        candidates.append(val)
        candidates.extend(_name_tokens(val))

    for officer in (info.get("companyOfficers") or [])[:5]:
        if not isinstance(officer, dict):
            continue
        name = (officer.get("name") or "").strip()
        if not name:
            continue
        # Last whitespace-separated part is taken as the surname; keep letters
        # only, including non-ASCII ones ("Muñoz" must not become "Muoz").
        surname = "".join(ch for ch in name.split()[-1] if ch.isalpha())
        if surname:
            candidates.append(surname)

    seen: set[str] = set()
    result: list[str] = []
    for kw in candidates:
        kw = kw.strip()
        key = kw.lower()
        # Very short terms are dropped; the rest are deduplicated ignoring case.
        if len(kw) >= 3 and key not in seen:
            seen.add(key)
            result.append(kw)

    return result if result else [ticker]
# Compiled keyword patterns, built once and reused across all correlate_news calls.
# A word boundary (\b) is added only on an end of the keyword that is
# alphanumeric. This prevents substring false-positives (e.g. "gold" matching
# "goldman", "oil" matching "broil") while still letting keywords that start
# or end with punctuation (".net", "c++") match next to whitespace.
_KW_PATTERN_CACHE: dict[str, re.Pattern] = {}

# Pattern that can never match; used for an empty keyword.
_KW_NEVER_MATCH: re.Pattern = re.compile(r"(?!)")


def _kw_re(kw: str) -> re.Pattern:
    """
    Return a compiled regex that matches *kw* as a whole token.

    The keyword is escaped, so it is always matched literally. Matching is
    case-sensitive: callers are expected to pass text in the same case as the
    keyword (correlate_news searches a lower-cased title + summary blob).

    Args:
        kw: Keyword to match. An empty string yields a pattern that never
            matches instead of raising.

    Returns:
        The compiled pattern; repeated calls with the same keyword return the
        same cached object.
    """
    pattern = _KW_PATTERN_CACHE.get(kw)
    if pattern is None:
        if not kw:
            pattern = _KW_NEVER_MATCH
        else:
            # \b only makes sense next to a word character: in front of "."
            # or "$" it would require a preceding word character and so
            # reject the keyword after whitespace or at the start of the text.
            prefix = r"\b" if kw[0].isalnum() else ""
            suffix = r"\b" if kw[-1].isalnum() else ""
            pattern = re.compile(prefix + re.escape(kw) + suffix)
        _KW_PATTERN_CACHE[kw] = pattern
    return pattern
def test_generate_keywords_known_ticker(mocker):
    """A resolvable ticker yields symbol, name tokens and officer surnames, but no sector/industry."""
    nvidia_info = {
        "longName": "NVIDIA Corporation",
        "shortName": "NVIDIA",
        "symbol": "NVDA",
        "industry": "Semiconductors",
        "sector": "Technology",
        "companyOfficers": [
            {"name": "Jensen Huang"},
            {"name": "Colette Kress"},
        ],
    }
    mocker.patch("src.news.yf.Ticker", return_value=mocker.Mock(info=nvidia_info))

    keywords = generate_keywords("NVDA")

    # Symbol, company name and executive surname are all present.
    for expected in ("NVDA", "NVIDIA", "Huang"):
        assert expected in keywords
    # Industry and sector are too broad for keyword matching and must be absent.
    for excluded in ("Semiconductors", "Technology"):
        assert excluded not in keywords
    assert all(len(kw) >= 3 for kw in keywords)
    lowered = {kw.lower() for kw in keywords}
    assert len(keywords) == len(lowered), "result must be deduplicated"


def test_generate_keywords_unknown_ticker(mocker):
    """Empty metadata (unknown ticker) falls back to [ticker] without raising."""
    mocker.patch("src.news.yf.Ticker", return_value=mocker.Mock(info={}))

    assert generate_keywords("NOTREAL") == ["NOTREAL"]


def test_generate_keywords_network_failure(mocker):
    """An exception raised by the metadata fetch falls back to [ticker] without raising."""
    mocker.patch("src.news.yf.Ticker", side_effect=Exception("connection refused"))

    assert generate_keywords("NVDA") == ["NVDA"]


def test_generate_keywords_timeout(mocker):
    """A fetch thread that is still alive after join() makes the call return [ticker]."""
    # is_alive() staying True simulates a hung worker thread.
    hung_thread = mocker.MagicMock(**{"is_alive.return_value": True})
    mocker.patch("src.news.threading.Thread", return_value=hung_thread)

    keywords = generate_keywords("NVDA")

    assert keywords == ["NVDA"]
    hung_thread.start.assert_called_once()
    hung_thread.join.assert_called_once()


# ── correlate_news word-boundary matching ─────────────────────────────────────

def test_correlate_news_no_substring_false_positive():
    """The 'gold' keyword must ignore articles whose only hit is a substring such as 'goldman'."""
    published = dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=1)
    goldman_article = {
        "title": "goldman sachs raises forecast for major banks",
        "summary": "Goldman Sachs analysts upgraded their outlook for the banking sector.",
        "link": "https://example.com/gs",
        "source": "Reuters Business",
        "published": published,
    }
    gold_article = {
        "title": "gold prices hit record as safe haven demand surges",
        "summary": "Bullion climbs on central bank buying.",
        "link": "https://example.com/gold",
        "source": "Reuters Business",
        "published": published,
    }

    matched_titles = set()
    for match in correlate_news("Gold", [goldman_article, gold_article]):
        matched_titles.add(match["title"])

    assert goldman_article["title"] not in matched_titles, (
        "Goldman Sachs article must not match Gold — 'gold' is a substring of 'goldman'"
    )
    assert gold_article["title"] in matched_titles, (
        "Gold article must match Gold"
    )