Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions Docs/CONTRIBUTORS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Contributors

Thank you to everyone who has contributed to PulseEngine.

| Name | GitHub |
|------|--------|
| Bhargavaram Krishnapur | [@Codex-Crusader](https://github.com/Codex-Crusader) |
| Anshul Yadav | [@SudoMayo](https://github.com/SudoMayo) |
| Anshul Khandelwal | [@anshul](https://github.com/anshul) |
| Shruthikha | [@shruthikha](https://github.com/shruthikha) |
80 changes: 80 additions & 0 deletions src/news.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
- deduplicate_articles : remove near-duplicates via Jaccard title similarity
- cluster_articles : group articles by dominant detected event type
- get_display_clusters : filtered, summarised cluster view for UI consumption
- generate_keywords : auto-build a keyword list for any ticker from Yahoo Finance metadata

This module does NOT score sentiment or match articles to assets — those
responsibilities belong to src/sentiment.py and src/signals.py respectively.
Expand All @@ -18,12 +19,14 @@
import datetime as dt
import logging
import re
import threading
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Optional
from urllib.parse import urlparse

import feedparser
import yfinance as yf

from config.settings import (
DEDUP_SIMILARITY_THRESHOLD,
Expand Down Expand Up @@ -216,6 +219,83 @@ def get_display_clusters(
}


# ── Keyword generation ────────────────────────────────────────────────────────

_CORP_SUFFIXES: frozenset[str] = frozenset({
"inc", "corp", "corporation", "ltd", "limited", "plc", "llc", "lp",
"group", "holdings", "co", "company", "technologies", "technology",
"systems", "services", "solutions", "international", "global",
})


def generate_keywords(ticker: str) -> list[str]:
    """
    Build a keyword list for news correlation from Yahoo Finance metadata.

    The result always starts with the upper-cased ticker symbol, followed by
    company-name tokens (generic corporate suffixes such as "Inc"/"Corp"
    removed) and the surnames of up to five listed company officers.  Entries
    shorter than three characters are dropped — except the ticker itself, so
    short symbols like "GM" or "F" are never lost — and the list is
    deduplicated case-insensitively while preserving order.

    Falls back to [ticker] when the metadata fetch fails, times out, or
    returns no usable company name.
    """
    ticker = ticker.upper().strip()

    # yfinance offers no per-call timeout, so run the fetch in a daemon
    # thread and abandon it if it exceeds REQUEST_TIMEOUT.
    _result: list = [None]
    _exc: list = [None]

    def _fetch() -> None:
        try:
            _result[0] = yf.Ticker(ticker).info
        except Exception as exc:
            _exc[0] = exc

    thread = threading.Thread(target=_fetch, daemon=True)
    thread.start()
    thread.join(timeout=REQUEST_TIMEOUT)

    if thread.is_alive():
        log.warning("generate_keywords(%r): metadata fetch timed out", ticker)
        return [ticker]

    if _exc[0] is not None:
        log.warning("generate_keywords(%r) failed: %s", ticker, _exc[0])
        return [ticker]

    info = _result[0]

    # No longName means Yahoo has no usable metadata for this symbol.
    if not info or not info.get("longName"):
        return [ticker]

    candidates: list[str] = []

    # Company names: keep the full name plus each meaningful token of it.
    for field in ("longName", "shortName"):
        val = (info.get(field) or "").strip()
        if not val:
            continue
        candidates.append(val)
        for token in re.split(r"[\s,./&]+", val):
            clean = re.sub(r"[^a-zA-Z0-9]", "", token)
            if clean and clean.lower() not in _CORP_SUFFIXES:
                candidates.append(clean)

    # Executive surnames (first five officers) often appear in headlines.
    for officer in (info.get("companyOfficers") or [])[:5]:
        name = (officer.get("name") or "").strip()
        if not name:
            continue
        parts = name.split()
        if parts:
            surname = re.sub(r"[^a-zA-Z]", "", parts[-1])
            if surname:
                candidates.append(surname)

    # Seed the result with the ticker and exempt it from the length filter.
    # (Bug fix: previously a 1-2 char ticker was filtered out by the
    # `len(kw) >= 3` check whenever any name token survived, so e.g. "GM"
    # never appeared in its own keyword list.)
    result: list[str] = [ticker]
    seen: set[str] = {ticker.lower()}
    for kw in candidates:
        kw = kw.strip()
        key = kw.lower()
        if len(kw) >= 3 and key not in seen:
            seen.add(key)
            result.append(kw)

    return result


# ── Private helpers ───────────────────────────────────────────────────────────

def _parse_pub_date(entry) -> Optional[dt.datetime]:
Expand Down
18 changes: 17 additions & 1 deletion src/signals.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

import datetime as dt
import logging
import re
from typing import Optional

from config.settings import (
Expand All @@ -28,6 +29,21 @@

log = logging.getLogger(__name__)

# Compiled keyword patterns, built once and reused across all correlate_news calls.
# Word boundaries (\b) on each alphanumeric end prevent substring false-positives
# (e.g. "gold" matching "goldman", "oil" matching "broil").
_KW_PATTERN_CACHE: dict[str, re.Pattern] = {}


def _kw_re(kw: str) -> re.Pattern:
"""Return a compiled regex that matches *kw* as a whole token in lowercase text."""
if kw not in _KW_PATTERN_CACHE:
escaped = re.escape(kw)
prefix = r'\b'
suffix = r'\b' if kw[-1].isalnum() else ''
_KW_PATTERN_CACHE[kw] = re.compile(prefix + escaped + suffix)
return _KW_PATTERN_CACHE[kw]


# ── News-asset correlation ────────────────────────────────────────────────────

Expand All @@ -47,7 +63,7 @@ def correlate_news(asset_name: str, articles: list[dict]) -> list[dict]:
matched: list[dict] = []
for article in articles:
blob = (article["title"] + " " + article["summary"]).lower()
score = sum(w for kw, w in kw_pairs if kw in blob)
score = sum(w for kw, w in kw_pairs if _kw_re(kw).search(blob))

if score <= 0:
continue
Expand Down
96 changes: 94 additions & 2 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,10 @@
# Canonical imports — new code should use these
from src.price import _compute_rsi as _src_rsi, _compute_roc as _src_roc
from src.sentiment import score_sentiment as src_score_sentiment
from src.news import deduplicate_articles as src_dedup
from src.signals import compute_signal_score as src_signal_score
import datetime as dt

from src.news import deduplicate_articles as src_dedup, generate_keywords
from src.signals import compute_signal_score as src_signal_score, correlate_news


# ── RSI ───────────────────────────────────────────────────────────────────────
Expand Down Expand Up @@ -159,6 +161,96 @@ def test_fetch_price_history_raises_on_fetch_failure(mocker):
fetch_price_history("TEST", days=1)


# ── generate_keywords ─────────────────────────────────────────────────────────

def test_generate_keywords_known_ticker(mocker):
    """Known ticker returns symbol, company name tokens, and officer surnames — not sector/industry."""
    fake_metadata = {
        "longName": "NVIDIA Corporation",
        "shortName": "NVIDIA",
        "symbol": "NVDA",
        "industry": "Semiconductors",
        "sector": "Technology",
        "companyOfficers": [
            {"name": "Jensen Huang"},
            {"name": "Colette Kress"},
        ],
    }
    fake_ticker = mocker.Mock(info=fake_metadata)
    mocker.patch("src.news.yf.Ticker", return_value=fake_ticker)

    keywords = generate_keywords("NVDA")

    # Symbol, name tokens, and executive surnames are included …
    assert "NVDA" in keywords
    assert "NVIDIA" in keywords
    assert "Huang" in keywords  # executive surname included
    # … but broad classifiers are not — too generic for keyword matching.
    assert "Semiconductors" not in keywords  # industry dropped — too broad for keyword matching
    assert "Technology" not in keywords  # sector dropped — too broad for keyword matching
    assert all(len(kw) >= 3 for kw in keywords)
    assert len({kw.lower() for kw in keywords}) == len(keywords), "result must be deduplicated"


def test_generate_keywords_unknown_ticker(mocker):
    """Unknown ticker (empty info) should return [ticker] without raising."""
    empty_ticker = mocker.Mock(info={})
    mocker.patch("src.news.yf.Ticker", return_value=empty_ticker)

    assert generate_keywords("NOTREAL") == ["NOTREAL"]


def test_generate_keywords_network_failure(mocker):
    """Network failure should return [ticker] without raising."""
    mocker.patch(
        "src.news.yf.Ticker",
        side_effect=Exception("connection refused"),
    )

    assert generate_keywords("NVDA") == ["NVDA"]


def test_generate_keywords_timeout(mocker):
    """When the fetch thread does not finish within REQUEST_TIMEOUT, return [ticker]."""
    hung_thread = mocker.MagicMock()
    hung_thread.is_alive.return_value = True  # simulates a hung thread
    mocker.patch("src.news.threading.Thread", return_value=hung_thread)

    assert generate_keywords("NVDA") == ["NVDA"]
    # The fetch must still have been started and awaited exactly once.
    hung_thread.start.assert_called_once()
    hung_thread.join.assert_called_once()


# ── correlate_news word-boundary matching ─────────────────────────────────────

def test_correlate_news_no_substring_false_positive():
    """'gold' keyword must not match articles whose only hit is a substring like 'goldman'."""
    published = dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=1)

    def _article(title, summary, link):
        # Minimal article dict in the shape correlate_news expects.
        return {
            "title": title,
            "summary": summary,
            "link": link,
            "source": "Reuters Business",
            "published": published,
        }

    goldman_article = _article(
        "goldman sachs raises forecast for major banks",
        "Goldman Sachs analysts upgraded their outlook for the banking sector.",
        "https://example.com/gs",
    )
    gold_article = _article(
        "gold prices hit record as safe haven demand surges",
        "Bullion climbs on central bank buying.",
        "https://example.com/gold",
    )

    matched_titles = {
        a["title"] for a in correlate_news("Gold", [goldman_article, gold_article])
    }

    assert goldman_article["title"] not in matched_titles, (
        "Goldman Sachs article must not match Gold — 'gold' is a substring of 'goldman'"
    )
    assert gold_article["title"] in matched_titles, (
        "Gold article must match Gold"
    )


def test_fetch_news_articles_uses_explicit_timeout(mocker):
"""RSS fetches should be bounded by an explicit timeout."""
mocker.patch("src.news.NEWS_FEEDS", [("Test Feed", "https://example.com/feed")])
Expand Down
Loading