Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions Docs/CONTRIBUTORS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Contributors

Thank you to everyone who has contributed to PulseEngine.

| Name | GitHub |
|------|--------|
| Bhargavaram Krishnapur | [@Codex-Crusader](https://github.com/Codex-Crusader) |
| Anshul Yadav | [@SudoMayo](https://github.com/SudoMayo) |
| Anshul Khandelwal | [@anshul](https://github.com/anshul) |
| Shruthikha | [@shruthikha](https://github.com/shruthikha) |
80 changes: 80 additions & 0 deletions src/news.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
- deduplicate_articles : remove near-duplicates via Jaccard title similarity
- cluster_articles : group articles by dominant detected event type
- get_display_clusters : filtered, summarised cluster view for UI consumption
- generate_keywords : auto-build a keyword list for any ticker from Yahoo Finance metadata

This module does NOT score sentiment or match articles to assets — those
responsibilities belong to src/sentiment.py and src/signals.py respectively.
Expand All @@ -18,12 +19,14 @@
import datetime as dt
import logging
import re
import threading
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Optional
from urllib.parse import urlparse

import feedparser
import yfinance as yf

from config.settings import (
DEDUP_SIMILARITY_THRESHOLD,
Expand Down Expand Up @@ -216,6 +219,83 @@ def get_display_clusters(
}


# ── Keyword generation ────────────────────────────────────────────────────────

_CORP_SUFFIXES: frozenset[str] = frozenset({
"inc", "corp", "corporation", "ltd", "limited", "plc", "llc", "lp",
"group", "holdings", "co", "company", "technologies", "technology",
"systems", "services", "solutions", "international", "global",
})


def generate_keywords(ticker: str) -> list[str]:
    """
    Build a keyword list for news correlation from Yahoo Finance metadata.

    The result always starts with the upper-cased ticker symbol, followed by
    company-name tokens (generic corporate suffixes such as "Inc"/"Corp"
    removed) and the surnames of up to five listed company officers.  Entries
    shorter than three characters are dropped — except the ticker itself, so
    short symbols like "GM" or "F" are never lost — and the list is
    deduplicated case-insensitively while preserving order.

    Falls back to [ticker] when the metadata fetch fails, times out, or
    returns no usable company name.
    """
    ticker = ticker.upper().strip()

    # yfinance offers no per-call timeout, so run the fetch in a daemon
    # thread and abandon it if it exceeds REQUEST_TIMEOUT.
    _result: list = [None]
    _exc: list = [None]

    def _fetch() -> None:
        try:
            _result[0] = yf.Ticker(ticker).info
        except Exception as exc:
            _exc[0] = exc

    thread = threading.Thread(target=_fetch, daemon=True)
    thread.start()
    thread.join(timeout=REQUEST_TIMEOUT)

    if thread.is_alive():
        log.warning("generate_keywords(%r): metadata fetch timed out", ticker)
        return [ticker]

    if _exc[0] is not None:
        log.warning("generate_keywords(%r) failed: %s", ticker, _exc[0])
        return [ticker]

    info = _result[0]

    # No longName means Yahoo has no usable metadata for this symbol.
    if not info or not info.get("longName"):
        return [ticker]

    candidates: list[str] = []

    # Company names: keep the full name plus each meaningful token of it.
    for field in ("longName", "shortName"):
        val = (info.get(field) or "").strip()
        if not val:
            continue
        candidates.append(val)
        for token in re.split(r"[\s,./&]+", val):
            clean = re.sub(r"[^a-zA-Z0-9]", "", token)
            if clean and clean.lower() not in _CORP_SUFFIXES:
                candidates.append(clean)

    # Executive surnames (first five officers) often appear in headlines.
    for officer in (info.get("companyOfficers") or [])[:5]:
        name = (officer.get("name") or "").strip()
        if not name:
            continue
        parts = name.split()
        if parts:
            surname = re.sub(r"[^a-zA-Z]", "", parts[-1])
            if surname:
                candidates.append(surname)

    # Seed the result with the ticker and exempt it from the length filter.
    # (Bug fix: previously a 1-2 char ticker was filtered out by the
    # `len(kw) >= 3` check whenever any name token survived, so e.g. "GM"
    # never appeared in its own keyword list.)
    result: list[str] = [ticker]
    seen: set[str] = {ticker.lower()}
    for kw in candidates:
        kw = kw.strip()
        key = kw.lower()
        if len(kw) >= 3 and key not in seen:
            seen.add(key)
            result.append(kw)

    return result


# ── Private helpers ───────────────────────────────────────────────────────────

def _parse_pub_date(entry) -> Optional[dt.datetime]:
Expand Down
18 changes: 17 additions & 1 deletion src/signals.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

import datetime as dt
import logging
import re
from typing import Optional

from config.settings import (
Expand All @@ -28,6 +29,21 @@

log = logging.getLogger(__name__)

# Compiled keyword patterns, built once and reused across all correlate_news calls.
# Word boundaries (\b) on each alphanumeric end prevent substring false-positives
# (e.g. "gold" matching "goldman", "oil" matching "broil").
_KW_PATTERN_CACHE: dict[str, re.Pattern] = {}


def _kw_re(kw: str) -> re.Pattern:
"""Return a compiled regex that matches *kw* as a whole token in lowercase text."""
if kw not in _KW_PATTERN_CACHE:
escaped = re.escape(kw)
prefix = r'\b'
suffix = r'\b' if kw[-1].isalnum() else ''
_KW_PATTERN_CACHE[kw] = re.compile(prefix + escaped + suffix)
return _KW_PATTERN_CACHE[kw]


# ── News-asset correlation ────────────────────────────────────────────────────

Expand All @@ -47,7 +63,7 @@ def correlate_news(asset_name: str, articles: list[dict]) -> list[dict]:
matched: list[dict] = []
for article in articles:
blob = (article["title"] + " " + article["summary"]).lower()
score = sum(w for kw, w in kw_pairs if kw in blob)
score = sum(w for kw, w in kw_pairs if _kw_re(kw).search(blob))

if score <= 0:
continue
Expand Down
96 changes: 94 additions & 2 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,10 @@
# Canonical imports — new code should use these
from src.price import _compute_rsi as _src_rsi, _compute_roc as _src_roc
from src.sentiment import score_sentiment as src_score_sentiment
from src.news import deduplicate_articles as src_dedup
from src.signals import compute_signal_score as src_signal_score
import datetime as dt

from src.news import deduplicate_articles as src_dedup, generate_keywords
from src.signals import compute_signal_score as src_signal_score, correlate_news


# ── RSI ───────────────────────────────────────────────────────────────────────
Expand Down Expand Up @@ -159,6 +161,96 @@ def test_fetch_price_history_raises_on_fetch_failure(mocker):
fetch_price_history("TEST", days=1)


# ── generate_keywords ─────────────────────────────────────────────────────────

def test_generate_keywords_known_ticker(mocker):
    """Known ticker returns symbol, company name tokens, and officer surnames — not sector/industry."""
    fake_metadata = {
        "longName": "NVIDIA Corporation",
        "shortName": "NVIDIA",
        "symbol": "NVDA",
        "industry": "Semiconductors",
        "sector": "Technology",
        "companyOfficers": [
            {"name": "Jensen Huang"},
            {"name": "Colette Kress"},
        ],
    }
    fake_ticker = mocker.Mock(info=fake_metadata)
    mocker.patch("src.news.yf.Ticker", return_value=fake_ticker)

    keywords = generate_keywords("NVDA")

    # Symbol, name tokens, and executive surnames are included …
    assert "NVDA" in keywords
    assert "NVIDIA" in keywords
    assert "Huang" in keywords  # executive surname included
    # … but broad classifiers are not — too generic for keyword matching.
    assert "Semiconductors" not in keywords  # industry dropped — too broad for keyword matching
    assert "Technology" not in keywords  # sector dropped — too broad for keyword matching
    assert all(len(kw) >= 3 for kw in keywords)
    assert len({kw.lower() for kw in keywords}) == len(keywords), "result must be deduplicated"


def test_generate_keywords_unknown_ticker(mocker):
    """Unknown ticker (empty info) should return [ticker] without raising."""
    empty_ticker = mocker.Mock(info={})
    mocker.patch("src.news.yf.Ticker", return_value=empty_ticker)

    assert generate_keywords("NOTREAL") == ["NOTREAL"]


def test_generate_keywords_network_failure(mocker):
    """Network failure should return [ticker] without raising."""
    mocker.patch(
        "src.news.yf.Ticker",
        side_effect=Exception("connection refused"),
    )

    assert generate_keywords("NVDA") == ["NVDA"]


def test_generate_keywords_timeout(mocker):
    """When the fetch thread does not finish within REQUEST_TIMEOUT, return [ticker]."""
    hung_thread = mocker.MagicMock()
    hung_thread.is_alive.return_value = True  # simulates a hung thread
    mocker.patch("src.news.threading.Thread", return_value=hung_thread)

    assert generate_keywords("NVDA") == ["NVDA"]
    # The fetch must still have been started and awaited exactly once.
    hung_thread.start.assert_called_once()
    hung_thread.join.assert_called_once()


# ── correlate_news word-boundary matching ─────────────────────────────────────

def test_correlate_news_no_substring_false_positive():
    """'gold' keyword must not match articles whose only hit is a substring like 'goldman'."""
    published = dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=1)

    def _article(title, summary, link):
        # Minimal article dict in the shape correlate_news expects.
        return {
            "title": title,
            "summary": summary,
            "link": link,
            "source": "Reuters Business",
            "published": published,
        }

    goldman_article = _article(
        "goldman sachs raises forecast for major banks",
        "Goldman Sachs analysts upgraded their outlook for the banking sector.",
        "https://example.com/gs",
    )
    gold_article = _article(
        "gold prices hit record as safe haven demand surges",
        "Bullion climbs on central bank buying.",
        "https://example.com/gold",
    )

    matched_titles = {
        a["title"] for a in correlate_news("Gold", [goldman_article, gold_article])
    }

    assert goldman_article["title"] not in matched_titles, (
        "Goldman Sachs article must not match Gold — 'gold' is a substring of 'goldman'"
    )
    assert gold_article["title"] in matched_titles, (
        "Gold article must match Gold"
    )


def test_fetch_news_articles_uses_explicit_timeout(mocker):
"""RSS fetches should be bounded by an explicit timeout."""
mocker.patch("src.news.NEWS_FEEDS", [("Test Feed", "https://example.com/feed")])
Expand Down
Loading