diff --git a/api/analyze-text.ts b/api/analyze-text.ts index 1bd40da..0aaf82c 100644 --- a/api/analyze-text.ts +++ b/api/analyze-text.ts @@ -2,6 +2,7 @@ import type { VercelRequest, VercelResponse } from '@vercel/node'; import { KeywordMatcher } from '../src/analysis/keyword-matcher'; import { generateSignal, TradingSignal } from '../src/analysis/signal-generator'; import { getMarkets, getArbitrage, getMarketMetadata } from './lib/market-cache'; +import { checkRateLimit } from './lib/rate-limit'; function isMalformedJsonError(error: unknown): boolean { if (!(error instanceof Error)) { @@ -48,6 +49,8 @@ export default async function handler( return; } + if (!await checkRateLimit(req, res)) return; + const startTime = Date.now(); try { diff --git a/api/cron/collect-tweets.ts b/api/cron/collect-tweets.ts index 20b5018..350ee71 100644 --- a/api/cron/collect-tweets.ts +++ b/api/cron/collect-tweets.ts @@ -61,8 +61,9 @@ export default async function handler( } // Verify cron secret (Vercel sends this header for authenticated cron calls) - const cronSecret = req.headers.authorization?.replace('Bearer ', ''); - if (cronSecret !== process.env.CRON_SECRET) { + const cronSecret = req.headers.authorization?.replace('Bearer ', '') ?? ''; + const expectedSecret = process.env.CRON_SECRET ?? ''; + if (!expectedSecret || cronSecret !== expectedSecret) { console.error('[Cron] Unauthorized: Invalid CRON_SECRET'); res.status(401).json({ success: false, diff --git a/api/feed.ts b/api/feed.ts index 4250c14..393651e 100644 --- a/api/feed.ts +++ b/api/feed.ts @@ -2,6 +2,7 @@ import type { VercelRequest, VercelResponse } from '@vercel/node'; import type { AnalyzedTweet, FeedResponse, AccountCategory } from '../src/types/feed'; import { batchGetFromKV, setFeedCache, getFeedCache, getFeedCacheTimestamp } from './lib/cache-helper'; import { kv } from './lib/vercel-kv'; +import { checkRateLimit } from './lib/rate-limit'; // ─── KV Storage Keys ─────────────────────────────────────────────────────── @@ -76,6 +77,8 @@ export default async function handler( return; } + if (!await checkRateLimit(req, res)) return; + try { // Parse query parameters const limit = Math.min(parseInt(req.query.limit as string) || 20, 100); @@ -221,51 +224,44 @@ export default async function handler( const errorMessage = error instanceof Error ? error.message : 'Unknown error'; console.error('[Feed API] Error:', errorMessage); - const isQuotaError = errorMessage.includes('quota') || errorMessage.includes('max requests limit'); - - // Fallback to in-memory cache on quota error - if (isQuotaError) { - // Parse query parameters again (they're in try block scope) - const limit = Math.min(parseInt(req.query.limit as string) || 20, 100); - const category = req.query.category as AccountCategory | undefined; - const minUrgency = req.query.minUrgency as string | undefined; - const cacheKey = `${FEED_CACHE_KEY_PREFIX}${category || 'all'}_${minUrgency || 'all'}_${limit}`; - - const cachedResponse = getFeedCache(cacheKey); - const cachedAt = getFeedCacheTimestamp(cacheKey); - - if (cachedResponse) { - // Modify response to indicate it's cached - const fallbackResponse = { - ...cachedResponse, - data: { - ...cachedResponse.data, - metadata: { - ...cachedResponse.data.metadata, - cached: true, - cached_at: cachedAt ? new Date(cachedAt).toISOString() : null, - cache_age_seconds: cachedAt ? Math.floor((Date.now() - cachedAt) / 1000) : null, - }, + // Attempt stale cache fallback for ALL KV errors — not just quota errors. + // A network timeout, auth failure, or Upstash outage should still serve + // the last known-good feed rather than returning a hard 500 that breaks + // the bot's polling loop. cacheKey is re-derived here because it is + // scoped inside the try block above. + const limit = Math.min(parseInt(req.query.limit as string) || 20, 100); + const category = req.query.category as AccountCategory | undefined; + const minUrgency = req.query.minUrgency as string | undefined; + const cacheKey = `${FEED_CACHE_KEY_PREFIX}${category || 'all'}_${minUrgency || 'all'}_${limit}`; + + const cachedResponse = getFeedCache(cacheKey); + const cachedAt = getFeedCacheTimestamp(cacheKey); + + if (cachedResponse) { + const fallbackResponse = { + ...cachedResponse, + data: { + ...cachedResponse.data, + metadata: { + ...cachedResponse.data.metadata, + stale: true, + cached: true, + cached_at: cachedAt ? new Date(cachedAt).toISOString() : null, + cache_age_seconds: cachedAt ? Math.floor((Date.now() - cachedAt) / 1000) : null, }, - }; + }, + }; - console.log(`[Feed API] Serving cached feed (age: ${fallbackResponse.data.metadata.cache_age_seconds}s)`); - res.setHeader('Cache-Control', FEED_CACHE_CONTROL); - res.status(200).json(fallbackResponse); - return; - } + console.log(`[Feed API] Serving stale cache (age: ${fallbackResponse.data.metadata.cache_age_seconds}s) after error: ${errorMessage}`); + res.setHeader('Cache-Control', FEED_CACHE_CONTROL); + res.status(200).json(fallbackResponse); + return; } - const sanitized = getSanitizedFeedError(errorMessage); res.setHeader('Cache-Control', FEED_CACHE_CONTROL); - res.status(isQuotaError ? 503 : sanitized.status).json({ + res.status(503).json({ success: false, - error: isQuotaError - ? 'Service temporarily unavailable due to quota limits. No cached data available.' - : sanitized.error, - ...(sanitized.note && { - note: sanitized.note, - }), + error: 'Feed unavailable. No cached data available.', }); } } diff --git a/api/ground-probability.ts b/api/ground-probability.ts index dc2933f..f9e55f2 100644 --- a/api/ground-probability.ts +++ b/api/ground-probability.ts @@ -2,6 +2,7 @@ import type { VercelRequest, VercelResponse } from '@vercel/node'; import { KeywordMatcher } from '../src/analysis/keyword-matcher'; import { getMarkets, getMarketMetadata } from './lib/market-cache'; import { Market, MarketMatch } from '../src/types/market'; +import { checkRateLimit } from './lib/rate-limit'; /** * Ground Probability Endpoint @@ -166,6 +167,8 @@ export default async function handler( return; } + if (!await checkRateLimit(req, res)) return; + const startTime = Date.now(); try { @@ -181,10 +184,10 @@ export default async function handler( } // Validate claim - if (!body.claim || typeof body.claim !== 'string') { + if (!body.claim?.trim() || typeof body.claim !== 'string') { res.status(400).json({ success: false, - error: 'Missing or invalid "claim" field. Must be a string.', + error: 'Missing or invalid "claim" field. Must be a non-empty string.', }); return; } @@ -217,9 +220,14 @@ export default async function handler( claim, llm_estimate = null, min_confidence = 0.3, - max_markets = 5, } = body; + // Clamp max_markets to [1, 20] rather than rejecting — any numeric input + // produces a valid result; NaN/undefined/out-of-range all fall back to 5. + const max_markets = Number.isFinite(body.max_markets) + ? Math.max(1, Math.min(20, body.max_markets as number)) + : 5; + // Validate numeric parameters if ( typeof min_confidence !== 'number' || @@ -234,19 +242,6 @@ export default async function handler( return; } - if ( - typeof max_markets !== 'number' || - !Number.isFinite(max_markets) || - max_markets < 1 || - max_markets > 20 - ) { - res.status(400).json({ - success: false, - error: 'max_markets must be between 1 and 20.', - }); - return; - } - // Get markets const markets = await getMarkets(); diff --git a/api/lib/cache-helper.ts b/api/lib/cache-helper.ts index 6857bbd..ae77969 100644 --- a/api/lib/cache-helper.ts +++ b/api/lib/cache-helper.ts @@ -135,10 +135,17 @@ const feedCache = new Map(); +const MAX_FEED_CACHE_ENTRIES = 50; + /** - * Store feed data in memory for fallback + * Store feed data in memory for fallback. + * Evicts the oldest entry (insertion order) when the cap is reached so + * the Map cannot grow unboundedly inside a warm lambda. */ export function setFeedCache(key: string, data: any, ttlMs: number): void { + if (feedCache.size >= MAX_FEED_CACHE_ENTRIES) { + feedCache.delete(feedCache.keys().next().value!); + } feedCache.set(key, { data, timestamp: Date.now(), diff --git a/api/lib/market-cache.ts b/api/lib/market-cache.ts index 8b41d67..89579b2 100644 --- a/api/lib/market-cache.ts +++ b/api/lib/market-cache.ts @@ -30,13 +30,16 @@ let cachedArbitrage: ArbitrageOpportunity[] = []; let arbCacheTimestamp = 0; const ARB_CACHE_TTL_MS = (parseInt(process.env.ARBITRAGE_CACHE_TTL_SECONDS || '15', 10)) * 1000; -const POLYMARKET_TARGET_COUNT = parsePositiveInt(process.env.MUSASHI_POLYMARKET_TARGET_COUNT, 1200); +// Keyword filter drops Polymarket yield to ~15 matches/page; 300 topic-relevant +// markets across 20 pages is enough to cover Kalshi's 651 targeted markets. +const POLYMARKET_TARGET_COUNT = parsePositiveInt(process.env.MUSASHI_POLYMARKET_TARGET_COUNT, 300); const POLYMARKET_MAX_PAGES = parsePositiveInt(process.env.MUSASHI_POLYMARKET_MAX_PAGES, 20); const KALSHI_TARGET_COUNT = parsePositiveInt(process.env.MUSASHI_KALSHI_TARGET_COUNT, 1000); const KALSHI_MAX_PAGES = parsePositiveInt(process.env.MUSASHI_KALSHI_MAX_PAGES, 20); -// Stage 0 Session 2: Per-source timeout (5 seconds) -const SOURCE_TIMEOUT_MS = 5000; +// Polymarket: 20 pages × ~1.5s = ~30s. Kalshi: 24 series × 500ms delay = ~15s. +// 60s gives both enough headroom on cold start. +const SOURCE_TIMEOUT_MS = parsePositiveInt(process.env.MUSASHI_SOURCE_TIMEOUT_MS, 60_000); function parsePositiveInt(value: string | undefined, fallback: number): number { const parsed = Number.parseInt(value ?? '', 10); diff --git a/api/lib/rate-limit.ts b/api/lib/rate-limit.ts new file mode 100644 index 0000000..36d0dbe --- /dev/null +++ b/api/lib/rate-limit.ts @@ -0,0 +1,47 @@ +import type { VercelRequest, VercelResponse } from '@vercel/node'; + +const RATE_LIMIT_MAX_REQUESTS = 30; +const RATE_LIMIT_WINDOW_SECONDS = 60; + +/** + * IP-based fixed-window rate limiter backed by Vercel KV (Upstash Redis). + * + * Uses a dynamic import of @vercel/kv so that local development without + * KV credentials degrades gracefully — the catch block returns true, + * allowing all requests through rather than blocking legitimate traffic. + * + * Returns true if the request is within the rate limit, false if it was + * rejected with a 429 response (caller should return immediately). + */ +export async function checkRateLimit( + req: VercelRequest, + res: VercelResponse +): Promise { + try { + const rawIp = req.headers['x-forwarded-for']; + const ip = (Array.isArray(rawIp) ? rawIp[0] : rawIp) ?? 'unknown'; + const key = `ratelimit:${ip}`; + + // Dynamic import matches the pattern in vercel-kv.ts and allows the + // module to load even when @vercel/kv credentials are not configured. + const { kv } = await import('@vercel/kv'); + const count: number = await kv.incr(key); + // Set the TTL only on the first increment so the window is fixed, not + // sliding — subsequent increments within the window do not reset it. + if (count === 1) await kv.expire(key, RATE_LIMIT_WINDOW_SECONDS); + + if (count > RATE_LIMIT_MAX_REQUESTS) { + res.status(429).json({ + success: false, + error: `Rate limit exceeded. Max ${RATE_LIMIT_MAX_REQUESTS} requests per ${RATE_LIMIT_WINDOW_SECONDS}s window.`, + }); + return false; + } + + return true; + } catch { + // KV unavailable (local dev, missing credentials, or Upstash outage). + // Fail open — allow the request rather than blocking legitimate traffic. + return true; + } +} diff --git a/api/markets/arbitrage.ts b/api/markets/arbitrage.ts index 26a2f2a..668ea36 100644 --- a/api/markets/arbitrage.ts +++ b/api/markets/arbitrage.ts @@ -1,5 +1,6 @@ import type { VercelRequest, VercelResponse } from '@vercel/node'; import { getMarkets, getArbitrage, getMarketMetadata } from '../lib/market-cache'; +import { checkRateLimit } from '../lib/rate-limit'; export default async function handler( req: VercelRequest, @@ -26,6 +27,8 @@ export default async function handler( return; } + if (!await checkRateLimit(req, res)) return; + const startTime = Date.now(); try { diff --git a/src/analysis/keyword-matcher.ts b/src/analysis/keyword-matcher.ts index d2da844..628fbe1 100644 --- a/src/analysis/keyword-matcher.ts +++ b/src/analysis/keyword-matcher.ts @@ -955,8 +955,6 @@ function extractNumericContexts(text: string): Set { return contexts; } -const DENOMINATOR_CAP = 5; - interface MatchCounts { exactMatches: number; // tweet token directly matches market keyword synonymMatches: number; // tweet token matched via synonym expansion @@ -978,7 +976,7 @@ function computeScore(r: MatchCounts, market: Market, matchedKeywords: string[]) // Normalize by keyword list length, capped to avoid penalizing markets // that happen to have many keywords from description extraction. - const denominator = Math.min(r.totalChecked, DENOMINATOR_CAP); + const denominator = Math.max(1, Math.log10(r.totalChecked) * 2); const normalized = weighted / denominator; const totalMatched = r.exactMatches + r.synonymMatches + r.titleMatches + r.entityMatches; @@ -1128,24 +1126,20 @@ export class KeywordMatcher { for (const mk of explicitKeywords) { if (mk.includes(' ')) { if (hasWordBoundaryMatch(Array.from(rawTokenSet).join(' '), mk)) { - exactMatches++; multiWordMatches++; - - // Check if this is an entity match if (isEntity(mk, entities)) { entityMatches++; + } else { + exactMatches++; } - matchedKeywords.push(mk); } else if (hasWordBoundaryMatch(Array.from(expandedTokenSet).join(' '), mk)) { - synonymMatches++; multiWordMatches++; - - // Check if this is an entity match if (isEntity(mk, entities)) { entityMatches++; + } else { + synonymMatches++; } - matchedKeywords.push(mk); } } @@ -1155,17 +1149,13 @@ export class KeywordMatcher { for (const mk of explicitKeywords) { if (!mk.includes(' ') && !matchedKeywords.includes(mk)) { if (expandedTokenSet.has(mk)) { - if (rawTokenSet.has(mk)) { + if (isEntity(mk, entities)) { + entityMatches++; + } else if (rawTokenSet.has(mk)) { exactMatches++; } else { synonymMatches++; } - - // Check if this is an entity match (people, tickers, orgs) - if (isEntity(mk, entities)) { - entityMatches++; - } - matchedKeywords.push(mk); } } @@ -1177,13 +1167,11 @@ export class KeywordMatcher { const titleTokens = extractTitleTokens(market.title); for (const tt of titleTokens) { if (!matchedKeywords.includes(tt) && expandedTokenSet.has(tt)) { - titleMatches++; - - // Check if this is an entity match if (isEntity(tt, entities)) { entityMatches++; + } else { + titleMatches++; } - matchedKeywords.push(tt); } } diff --git a/src/analysis/signal-generator.ts b/src/analysis/signal-generator.ts index 45c1eaf..db2a9af 100644 --- a/src/analysis/signal-generator.ts +++ b/src/analysis/signal-generator.ts @@ -69,20 +69,32 @@ function calculateImpliedProbability(sentiment: SentimentResult): number { } /** - * Calculate trading edge for a market given sentiment - * Edge = how much the sentiment-implied probability differs from market price + * Calculate trading edge for a market given sentiment and keyword match quality. + * + * Edge = effective confidence × price gap between implied and market price. + * + * Previously only sentiment.confidence was used, which collapsed to 0 for + * factual news (e.g. "Trump announces peace deal") because those phrases don't + * appear in the sentiment keyword lists. A confidence-1.0 keyword match IS + * itself an edge signal — the text is unambiguously about this market. + * We now take the stronger of the two confidence signals. */ -function calculateEdge(market: Market, sentiment: SentimentResult): number { +function calculateEdge( + market: Market, + sentiment: SentimentResult, + matchConfidence: number = 0 +): number { const impliedProb = calculateImpliedProbability(sentiment); const currentPrice = market.yesPrice; - // Raw difference between implied and actual price const priceDiff = Math.abs(impliedProb - currentPrice); - // Weight by sentiment confidence - const edge = sentiment.confidence * priceDiff; + // Use whichever confidence signal is stronger: + // - matchConfidence: how precisely the text is about this market + // - sentiment.confidence: how strongly directional the language is + const effectiveConfidence = Math.max(sentiment.confidence, matchConfidence); - return edge; + return effectiveConfidence * priceDiff; } /** @@ -170,7 +182,8 @@ function generateSuggestedAction( market: Market, sentiment: SentimentResult, edge: number, - urgency: UrgencyLevel + urgency: UrgencyLevel, + matchConfidence: number = 0 ): SuggestedAction { // Don't suggest action if edge is too low if (edge < 0.10) { @@ -189,8 +202,17 @@ function generateSuggestedAction( let reasoning: string; if (sentiment.sentiment === 'neutral') { - direction = 'HOLD'; - reasoning = 'Neutral sentiment, no clear directional bias'; + // Neutral sentiment (e.g. factual news): use price gap + match confidence as the signal + if (impliedProb > currentPrice) { + direction = 'YES'; + reasoning = `Match confidence ${(matchConfidence * 100).toFixed(0)}%: YES underpriced at ${(currentPrice * 100).toFixed(0)}¢ vs neutral implied ${(impliedProb * 100).toFixed(0)}¢`; + } else if (impliedProb < currentPrice) { + direction = 'NO'; + reasoning = `Match confidence ${(matchConfidence * 100).toFixed(0)}%: YES overpriced at ${(currentPrice * 100).toFixed(0)}¢ vs neutral implied ${(impliedProb * 100).toFixed(0)}¢`; + } else { + direction = 'HOLD'; + reasoning = 'Neutral sentiment with no significant price gap'; + } } else if (sentiment.sentiment === 'bullish') { // Bullish sentiment if (impliedProb > currentPrice) { @@ -274,7 +296,7 @@ export function generateSignal( const topMarket = topMatch.market; // Calculate edge - const edge = calculateEdge(topMarket, sentiment); + const edge = calculateEdge(topMarket, sentiment, topMatch.confidence); // Compute urgency const urgency = computeUrgency( @@ -288,7 +310,7 @@ export function generateSignal( const signal_type = computeSignalType(tweetText, sentiment, edge, !!arbitrageOpportunity); // Generate suggested action - const suggested_action = generateSuggestedAction(topMarket, sentiment, edge, urgency); + const suggested_action = generateSuggestedAction(topMarket, sentiment, edge, urgency, topMatch.confidence); return { event_id: generateEventId(tweetText), diff --git a/src/api/arbitrage-detector.ts b/src/api/arbitrage-detector.ts index 0f2317f..fc1086d 100644 --- a/src/api/arbitrage-detector.ts +++ b/src/api/arbitrage-detector.ts @@ -3,142 +3,274 @@ import { Market, ArbitrageOpportunity } from '../types/market'; -/** - * Normalize a title for fuzzy matching - * Removes punctuation, dates, common question words, normalizes spacing - */ -function normalizeTitle(title: string): string { - return title - .toLowerCase() - .replace(/\?/g, '') // Remove question marks - .replace(/\b(will|before|after|by|in|on|at|the|a|an)\b/g, '') // Remove filler words - .replace(/\b(2024|2025|2026|2027|2028)\b/g, '') // Remove years - .replace(/[^a-z0-9\s]/g, ' ') // Remove all punctuation - .replace(/\s+/g, ' ') // Normalize whitespace - .trim(); -} +// Words that carry no information about what a bet is actually about. +// Stripping these before comparison prevents "Fed no change April" from +// matching "Fed above 5% June" on generic finance words alone. +const STOP_WORDS = new Set([ + 'will','the','a','an','of','in','on','by','at','to','is','be','for', + 'that','this','are','was','were','been','have','has','had', + 'above','below','over','under','more','less','than','or','and', + 'next','new','first','last','no','not','any','all','its', + 'following','after','before','between','within','until','since', + 'federal','funds','fed','rate','rates','upper','bound','price','market', + 'change','meeting','interest','united','states','kingdom', + 'there','what','when','who','which','where','how','does', + 'from','with','out','end','ago','get','got','let','set','put', + // Role/title words — too generic across different markets + 'prime','minister','president','senator','governor','secretary', + // Economic modifiers that add no topic specificity + 'real','rise','fall','increase','decrease','grow','growth', + // Structural prediction-market words + 'least','most','many','much','some','such','only','also', + 'dollars','percent','dollar','amount','total','number','level', + 'happen','occur','take','place','round','second','third','fourth', + // "Election" / "electoral" are too generic — every election market has them. + // Country/candidate names carry the actual specificity. + 'election','electoral', + // "Win" is too generic across prediction markets — sports, elections, awards. + // Genuine matches use the subject (candidate, team) as the shared anchor, not "win". + 'win','wins','winner','winning', +]); + +// Common prediction market rephrasing patterns. +// Each key and its synonyms are treated as equivalent during word overlap +// comparison so "trump out president" matches "trump resign president". +// Keys must be single tokens (no spaces) — multi-word phrases like +// "peace deal" are handled by normalizing to the first token ("ceasefire"). +const EQUIVALENCES: Record = { + // Leaving office + resign: ['removed', 'leave', 'step', 'cease', 'exit', 'fired', 'ousted'], + removed: ['resign', 'leave', 'fired', 'ousted'], + // Price direction + exceed: ['surpass', 'hit', 'reach', 'pass'], + surpass: ['exceed', 'hit', 'reach', 'pass'], + below: ['under', 'fall', 'drop', 'miss', 'fail'], + // Agreements / peace + ceasefire: ['truce', 'armistice', 'peace'], + truce: ['ceasefire', 'armistice', 'peace'], + // Elections / confirmation + win: ['elected', 'chosen', 'confirmed', 'nominated', 'appointed'], + elected: ['win', 'chosen', 'confirmed', 'nominated', 'appointed'], + confirmed: ['win', 'elected', 'chosen', 'appointed', 'nominated'], + // Conflict + war: ['conflict', 'invasion', 'attack', 'strike'], + attack: ['war', 'conflict', 'strike', 'invasion'], + // Price movement direction + fall: ['drop', 'decline', 'decrease', 'sink', 'plunge'], + drop: ['fall', 'decline', 'decrease', 'sink', 'plunge'], + // Crypto abbreviations — same asset, different naming conventions + btc: ['bitcoin'], + bitcoin: ['btc'], + eth: ['ethereum'], + ethereum: ['eth'], + xrp: ['ripple'], + ripple: ['xrp'], + // Government / policy + shutdown: ['shut'], + shut: ['shutdown'], + // Central bank body abbreviations — intentionally NOT equivalenced: + // fed↔fomc would cause "11 Fed cuts" to match "FOMC rate upper bound" bets, + // which are related but different propositions. Genuine Fed/FOMC pairs share + // other meaningful words (e.g. "raise rates March 2025") and don't need this bridge. + // Economic cycles — recession is defined by GDP contraction + recession: ['contraction', 'downturn', 'gdp', 'shrink'], + gdp: ['recession', 'contraction', 'downturn'], +}; /** - * Extract key entities from a market title - * Looks for: names, tickers, numbers, organizations + * Expand a set of meaningful words to include synonyms from EQUIVALENCES. + * "out" → adds "resign", "removed", "leave" etc. to the set so that + * "trump out president" overlaps with "trump resign president". */ -function extractEntities(title: string): Set { - const normalized = normalizeTitle(title); - const words = normalized.split(' '); - const entities = new Set(); - - // Extract significant words (3+ chars, not in stop list) - const stopWords = new Set(['will', 'hit', 'reach', 'win', 'lose', 'pass', 'than', 'over', 'under']); - +function expandWithEquivalences(words: Set): Set { + const expanded = new Set(words); for (const word of words) { - if (word.length >= 3 && !stopWords.has(word)) { - entities.add(word); + const synonyms = EQUIVALENCES[word]; + if (synonyms) { + for (const syn of synonyms) expanded.add(syn); } } - - return entities; + return expanded; } /** - * Calculate similarity score between two titles - * Returns 0-1 based on shared entities + * Normalize a single word token: expand magnitude suffixes so that + * "100k" and "100000" produce the same string for comparison. */ -function calculateTitleSimilarity(title1: string, title2: string): number { - const entities1 = extractEntities(title1); - const entities2 = extractEntities(title2); - - if (entities1.size === 0 || entities2.size === 0) return 0; - - // Count shared entities - let sharedCount = 0; - for (const entity of entities1) { - if (entities2.has(entity)) { - sharedCount++; - } +function normalizeToken(word: string): string { + if (/^\d/.test(word)) { + const lower = word.toLowerCase(); + if (lower.endsWith('k')) return String(Math.round(parseFloat(lower) * 1_000)); + if (lower.endsWith('m')) return String(Math.round(parseFloat(lower) * 1_000_000)); + if (lower.endsWith('b')) return String(Math.round(parseFloat(lower) * 1_000_000_000)); } + return word; +} - // Jaccard similarity: intersection / union - const union = entities1.size + entities2.size - sharedCount; - return union > 0 ? sharedCount / union : 0; +/** + * Extract meaningful content words from a title — everything that + * actually describes what the bet is about, with stop words removed. + * Numbers are magnitude-normalized so "100k" and "100000" compare equal. + */ +function meaningfulWords(title: string): Set { + return new Set( + title + .toLowerCase() + .replace(/[^a-z0-9\s]/g, ' ') + .split(/\s+/) + .filter(w => w.length >= 3 && !STOP_WORDS.has(w)) + .map(normalizeToken) + ); } +const MONTH_MAP: Record = { + january: 'jan', february: 'feb', march: 'mar', april: 'apr', + may: 'may', june: 'jun', july: 'jul', august: 'aug', + september: 'sep', october: 'oct', november: 'nov', december: 'dec', +}; + /** - * Calculate keyword overlap between two markets - * Returns the number of shared keywords + * Extract month names from a title for timeframe comparison, normalized + * to 3-letter abbreviations so "april" and "apr" compare as equal. + * "April 2026" and "June 2026" are different bets even on the same topic. */ -function calculateKeywordOverlap(market1: Market, market2: Market): number { - const keywords1 = new Set(market1.keywords); - const keywords2 = new Set(market2.keywords); - - let overlap = 0; - for (const kw of keywords1) { - if (keywords2.has(kw)) { - overlap++; - } - } +function extractTimeframe(title: string): string[] { + const matches = title.toLowerCase().match( + /\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|january|february|march|april|june|july|august|september|october|november|december|q[1-4])\b/g + ); + return (matches ?? []).map(m => MONTH_MAP[m] ?? m); +} - return overlap; +/** + * Extract and normalize numeric values for strike/threshold comparison. + * All variants of the same number normalize to the same string: + * "$70,000" → "70000", "70k" → "70000", "70K" → "70000", "1.5m" → "1500000" + * This prevents "BTC above $100K" and "BTC above $100,000" from being treated + * as different bets simply due to formatting differences. + */ +function extractNumbers(title: string): string[] { + const matches = title.match(/[\d,]+(?:\.\d+)?[kKmMbB%]?/g) ?? []; + return matches.map(n => { + const clean = n.replace(/,/g, '').toLowerCase(); + // Expand magnitude suffixes to full integers for comparison + if (clean.endsWith('k')) return String(Math.round(parseFloat(clean) * 1_000)); + if (clean.endsWith('m')) return String(Math.round(parseFloat(clean) * 1_000_000)); + if (clean.endsWith('b')) return String(Math.round(parseFloat(clean) * 1_000_000_000)); + return clean; + }).filter(n => n !== 'nan'); } /** - * Check if two markets refer to the same event - * Uses title similarity + keyword overlap + category matching + * Check if two markets refer to the same event and the same bet. + * + * All five gates must pass: + * 1. Same category + * 2. Same timeframe (month) — "April" vs "June" = different bet + * 3. Same strike/threshold — "$70K" vs "$60K" = different bet + * 4. At least 3 shared meaningful content words + * 5. Jaccard similarity ≥ 0.6 on meaningful words + * + * Previously: OR logic on title similarity OR keyword count → false positives + * Now: AND logic across all five gates → only genuine same-event pairs pass */ function areMarketsSimilar(poly: Market, kalshi: Market): { isSimilar: boolean; confidence: number; reason: string; } { - // Must be in the same category (or one is 'other') + // Gate 1: category must match (or one is 'other') const categoryMatch = poly.category === kalshi.category || - poly.category === 'other' || - kalshi.category === 'other'; - + poly.category === 'other' || + kalshi.category === 'other'; if (!categoryMatch) { return { isSimilar: false, confidence: 0, reason: 'Different categories' }; } - // Calculate title similarity - const titleSim = calculateTitleSimilarity(poly.title, kalshi.title); + // Gate 2: timeframe — if both titles contain month names, at least one must overlap + const polyTime = extractTimeframe(poly.title); + const kalshiTime = extractTimeframe(kalshi.title); + if (polyTime.length > 0 && kalshiTime.length > 0) { + const sharedTime = polyTime.filter(t => kalshiTime.includes(t)); + if (sharedTime.length === 0) { + return { isSimilar: false, confidence: 0, reason: `Different timeframes (${polyTime[0]} vs ${kalshiTime[0]})` }; + } + } - // Calculate keyword overlap - const keywordOverlap = calculateKeywordOverlap(poly, kalshi); + // Gate 3: strike/threshold — if both titles contain numbers, at least one must overlap + const polyNums = extractNumbers(poly.title); + const kalshiNums = extractNumbers(kalshi.title); + if (polyNums.length > 0 && kalshiNums.length > 0) { + const sharedNums = polyNums.filter(n => kalshiNums.includes(n)); + if (sharedNums.length === 0) { + return { isSimilar: false, confidence: 0, reason: `Different strikes (${polyNums[0]} vs ${kalshiNums[0]})` }; + } + } + + // Gate 4 & 5: count shared words using raw direct matches + synonym bridges. + // Crucially we do NOT expand both sides and intersect — that inflates shared + // counts when both titles share the same generic word (e.g. "win" → 7 synonyms + // in common, making sports "win" match political "win"). + // Instead: a word counts as shared only if it appears directly in both raw sets, + // OR it has a synonym (via EQUIVALENCES) that appears in the other raw set AND + // that synonym is a DIFFERENT word. Each poly word contributes at most once. + const polyWords = meaningfulWords(poly.title); + const kalshiWords = meaningfulWords(kalshi.title); + + const directMatches = new Set([...polyWords].filter(w => kalshiWords.has(w))); + + // Synonym bridges: poly word whose synonym (a DIFFERENT word) is in kalshi raw set + const synonymBridges = new Set(); + for (const pw of polyWords) { + if (directMatches.has(pw)) continue; + const syns = EQUIVALENCES[pw] ?? []; + if (syns.some(syn => kalshiWords.has(syn) && !directMatches.has(syn))) { + synonymBridges.add(pw); + } + } - // Matching criteria (needs at least one strong signal): - // 1. High title similarity (>0.5) OR - // 2. Strong keyword overlap (3+ shared keywords) + const sharedCount = directMatches.size + synonymBridges.size; - if (titleSim > 0.5) { - return { - isSimilar: true, - confidence: titleSim, - reason: `High title similarity (${(titleSim * 100).toFixed(0)}%)` - }; + // Gate 4a: at least 2 genuinely shared content words + if (sharedCount < 2) { + return { isSimilar: false, confidence: 0, reason: `Only ${sharedCount} shared words (need 2)` }; } - if (keywordOverlap >= 3) { - const confidence = Math.min(keywordOverlap / 10, 0.9); // Cap at 0.9 - return { - isSimilar: true, - confidence, - reason: `${keywordOverlap} shared keywords` - }; + // Gate 4b: at least 1 shared word must be a topic word — not a year (2024, 2025) + // and not a month abbreviation (apr, jan, etc.). Sharing only timeframe tokens + // ("April 2026") causes Fed-rate markets to match CPI markets, or Elon tweet + // markets to match CPI markets — they're in the same month but different bets. + const TIMEFRAME_TOKENS = new Set([ + 'jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec', + 'january','february','march','april','june','july','august','september', + 'october','november','december','q1','q2','q3','q4', + ]); + const sharedTopicWords = [...directMatches, ...synonymBridges] + .filter(w => !/^\d{4}$/.test(w) && !TIMEFRAME_TOKENS.has(w)); + if (sharedTopicWords.length < 2) { + return { isSimilar: false, confidence: 0, reason: `Only ${sharedTopicWords.length} shared topic word(s) (need 2)` }; } - // Check for exact entity matches (strong signal even with low overall similarity) - const polyEntities = extractEntities(poly.title); - const kalshiEntities = extractEntities(kalshi.title); - const sharedEntities = Array.from(polyEntities).filter(e => kalshiEntities.has(e)); - - if (sharedEntities.length >= 2 && titleSim > 0.3) { - return { - isSimilar: true, - confidence: 0.7, - reason: `Shared entities: ${sharedEntities.slice(0, 3).join(', ')}` - }; + // Gate 5: Dice coefficient (F1 / harmonic-mean overlap) ≥ 0.60. + // Dice = 2 × sharedCount / (|polyWords| + |kalshiWords|). + // + // Why Dice over plain overlap coefficient: + // Overlap = shared / min(A, B). When one title is very short (e.g. Kalshi's + // "Who will win the next presidential election?" → only 2 meaningful words + // after stop-word removal), ANY Poly market containing those 2 words reaches + // 100% overlap — Peru elections, sports finals, US elections all look the same. + // Dice penalises that imbalance: a Peru market (6 meaningful words) sharing 2 + // generic words gets Dice 2*2/(6+2) = 50%, well below the 0.60 threshold. + const totalSize = polyWords.size + kalshiWords.size; + const dice = totalSize > 0 ? (2 * sharedCount) / totalSize : 0; + if (dice < 0.60) { + return { isSimilar: false, confidence: 0, reason: `Similarity ${(dice * 100).toFixed(0)}% (need 60%)` }; } - return { isSimilar: false, confidence: 0, reason: 'Insufficient similarity' }; + return { + isSimilar: true, + confidence: dice, + reason: `${sharedCount} shared words, dice=${(dice * 100).toFixed(0)}%`, + }; } /** diff --git a/src/api/kalshi-client.ts b/src/api/kalshi-client.ts index f1653a6..91746ca 100644 --- a/src/api/kalshi-client.ts +++ b/src/api/kalshi-client.ts @@ -7,6 +7,11 @@ import { generateKeywords } from './keyword-generator'; const KALSHI_API = 'https://api.elections.kalshi.com/trade-api/v2'; const FETCH_TIMEOUT_MS = 10000; // 10s timeout to prevent hanging on cold starts +const INTER_PAGE_DELAY_MS = 500; // throttle: wait 500ms between page requests +const RATE_LIMIT_RETRY_DELAY_MS = 5000; // on 429: wait 5s and retry once +const KALSHI_CACHE_TTL_MS = 60_000; // cache Kalshi results for 60 seconds + +let kalshiCache: { markets: Market[]; fetchedAt: number } | null = null; // Shape of a market object returned by the Kalshi REST API interface KalshiMarket { @@ -16,14 +21,14 @@ interface KalshiMarket { title: string; market_type?: string; mve_collection_ticker?: string; // present only on multi-variable event (parlay) markets - yes_ask: number; // cents (0–100) - yes_ask_dollars?: number; // same in dollars (0–1), prefer this if present + yes_ask: number; // cents (0–100) + yes_ask_dollars?: number | string; // same in dollars (0–1), may be returned as string yes_bid: number; - yes_bid_dollars?: number; + yes_bid_dollars?: number | string; no_ask: number; no_bid: number; - last_price?: number; // last trade price for YES in cents - last_price_dollars?: number; + last_price?: number; // last trade price for YES in cents + last_price_dollars?: number | string; volume?: number; volume_24h?: number; open_interest?: number; @@ -58,28 +63,13 @@ function isSimpleMarket(km: KalshiMarket): boolean { return true; } +const sleep = (ms: number) => new Promise(resolve => setTimeout(resolve, ms)); + /** - * Fetch open markets from Kalshi's public API using cursor pagination. - * - * The default API ordering puts thousands of MVE (parlay/sports) markets first. - * isSimpleMarket() filters those out, so we must page through until we have - * enough simple binary markets for meaningful tweet matching. - * - * Stops when we reach `targetSimpleCount` simple markets or exhaust `maxPages`. + * Fetch a single page from Kalshi with one 429-retry before giving up. */ -export async function fetchKalshiMarkets( - targetSimpleCount = 400, - maxPages = 15, -): Promise { - const PAGE_SIZE = 200; - const allSimple: Market[] = []; - let cursor: string | undefined; - - for (let page = 0; page < maxPages; page++) { - const url = cursor - ? `${KALSHI_API}/markets?status=open&mve_filter=exclude&limit=${PAGE_SIZE}&cursor=${encodeURIComponent(cursor)}` - : `${KALSHI_API}/markets?status=open&mve_filter=exclude&limit=${PAGE_SIZE}`; - +async function fetchKalshiPage(url: string): Promise { + for (let attempt = 0; attempt < 2; attempt++) { const controller = new AbortController(); const timeoutId = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS); @@ -87,6 +77,15 @@ export async function fetchKalshiMarkets( const resp = await fetch(url, { signal: controller.signal }); clearTimeout(timeoutId); + if (resp.status === 429) { + if (attempt === 0) { + console.warn(`[Kalshi] Rate limited (429) — waiting ${RATE_LIMIT_RETRY_DELAY_MS}ms before retry`); + await sleep(RATE_LIMIT_RETRY_DELAY_MS); + continue; + } + throw new Error('Kalshi API rate limit exceeded after retry'); + } + if (!resp.ok) { console.error(`[Musashi SW] Kalshi HTTP ${resp.status} — declarativeNetRequest header stripping may not be active yet`); throw new Error(`Kalshi API responded with ${resp.status}`); @@ -97,21 +96,7 @@ export async function fetchKalshiMarkets( throw new Error('Unexpected Kalshi API response shape'); } - const pageSimple = data.markets - .filter(isSimpleMarket) - .map(toMarket) - .filter(m => m.yesPrice > 0 && m.yesPrice < 1); - - allSimple.push(...pageSimple); - - console.log( - `[Musashi] Page ${page + 1}: ${data.markets.length} raw → ` + - `${pageSimple.length} simple (total simple: ${allSimple.length})` - ); - - // Stop early once we have enough, or when the API has no more pages - if (allSimple.length >= targetSimpleCount || !data.cursor) break; - cursor = data.cursor; + return data; } catch (error) { clearTimeout(timeoutId); if ((error as Error).name === 'AbortError') { @@ -121,20 +106,123 @@ export async function fetchKalshiMarkets( } } - console.log(`[Musashi] Fetched ${allSimple.length} live markets from Kalshi`); - return allSimple; + throw new Error('Kalshi fetch failed after all attempts'); +} + +// Series tickers that overlap with Polymarket categories. +// Fetching by series_ticker skips the thousands of sports/baseball markets +// that dominate blind pagination and returns only topic-relevant markets. +const OVERLAP_SERIES = [ + // Crypto + 'KXBTC', // Bitcoin price + 'KXETH', // Ethereum price + 'KXXRP', // XRP price + // Economics / Fed + 'KXFED', // Federal funds rate + 'KXCPI', // CPI inflation + 'KXGDP', // GDP growth + // US Politics + 'KXTRUMPRESIGN', // Trump resignation + 'KXTRUMPPARDONS', // Trump pardons + 'KXNEXTSPEAKER', // House Speaker + 'KXPRESPERSON', // Next president + 'KXPRESPARTY', // Presidential party + 'KXNEXTPRESSEC', // Next press secretary + 'KXAMEND22', // 22nd Amendment + // Geopolitics + 'KXZELENSKYPUTIN', // Russia-Ukraine + 'KXTAIWANLVL4', // Taiwan conflict + 'KXNEXTISRAELPM', // Israel PM + 'KXWITHDRAW', // US troop withdrawal + 'KXUSTAKEOVER', // US foreign policy + // Tech / AI + 'KXOAIANTH', // OpenAI vs Anthropic IPO + 'KXAGICO', // AGI company + 'KXDATACENTER', // Data center + // Elections + 'KXNEWPOPE', // New Pope + 'KXNEXTUKPM', // UK PM + 'KXUKPARTY', // UK party +]; + +/** + * Fetch all open markets for a single series_ticker. + * Returns an empty array if the series has no active markets. + */ +async function fetchSeriesMarkets(seriesTicker: string): Promise { + const url = `${KALSHI_API}/markets?status=open&mve_filter=exclude&limit=100&series_ticker=${seriesTicker}`; + + try { + const data = await fetchKalshiPage(url); + const markets = data.markets + .filter(isSimpleMarket) + .map(toMarket) + .filter(m => m.yesPrice > 0 && m.yesPrice < 1); + + if (markets.length > 0) { + console.log(`[Kalshi] ${seriesTicker}: ${markets.length} markets`); + } + return markets; + } catch (error) { + console.warn(`[Kalshi] ${seriesTicker} fetch failed: ${(error as Error).message}`); + return []; + } +} + +/** + * Fetch Kalshi markets using targeted series_ticker fetches instead of + * blind pagination. Blind pagination puts thousands of sports games first — + * targeted fetches go directly to crypto, economics, and politics markets + * that actually overlap with Polymarket. + * + * Keeps 500ms delay between series fetches and a 60-second result cache. + */ +export async function fetchKalshiMarkets( + _targetSimpleCount = 400, + _maxPages = 15, +): Promise { + // Return cached result if still fresh + if (kalshiCache && (Date.now() - kalshiCache.fetchedAt) < KALSHI_CACHE_TTL_MS) { + console.log(`[Kalshi] Returning cached ${kalshiCache.markets.length} markets (age: ${Date.now() - kalshiCache.fetchedAt}ms)`); + return kalshiCache.markets; + } + + const seen = new Set(); // deduplicate by ticker + const allMarkets: Market[] = []; + + for (let i = 0; i < OVERLAP_SERIES.length; i++) { + if (i > 0) await sleep(INTER_PAGE_DELAY_MS); + + const markets = await fetchSeriesMarkets(OVERLAP_SERIES[i]); + for (const m of markets) { + if (!seen.has(m.id)) { + seen.add(m.id); + allMarkets.push(m); + } + } + } + + console.log(`[Kalshi] Fetched ${allMarkets.length} targeted markets across ${OVERLAP_SERIES.length} series`); + + kalshiCache = { markets: allMarkets, fetchedAt: Date.now() }; + return allMarkets; } /** Map a raw Kalshi market object to our Market interface */ function toMarket(km: KalshiMarket): Market { + // Kalshi API returns _dollars fields as strings in some responses — coerce to number + const yesBidDollars = km.yes_bid_dollars != null ? Number(km.yes_bid_dollars) : null; + const yesAskDollars = km.yes_ask_dollars != null ? Number(km.yes_ask_dollars) : null; + const lastPriceDollars = km.last_price_dollars != null ? Number(km.last_price_dollars) : null; + // Prefer the _dollars variant (already 0–1); fall back to /100 conversion let yesPrice: number; - if (km.yes_bid_dollars != null && km.yes_ask_dollars != null && km.yes_ask_dollars > 0) { - yesPrice = (km.yes_bid_dollars + km.yes_ask_dollars) / 2; + if (yesBidDollars != null && yesAskDollars != null && yesAskDollars > 0) { + yesPrice = (yesBidDollars + yesAskDollars) / 2; } else if (km.yes_bid != null && km.yes_ask != null && km.yes_ask > 0) { yesPrice = ((km.yes_bid + km.yes_ask) / 2) / 100; - } else if (km.last_price_dollars != null && km.last_price_dollars > 0) { - yesPrice = km.last_price_dollars; + } else if (lastPriceDollars != null && lastPriceDollars > 0) { + yesPrice = lastPriceDollars; } else if (km.last_price != null && km.last_price > 0) { yesPrice = km.last_price / 100; } else { diff --git a/src/api/polymarket-client.ts b/src/api/polymarket-client.ts index a0b9ce4..a7dd0c0 100644 --- a/src/api/polymarket-client.ts +++ b/src/api/polymarket-client.ts @@ -32,6 +32,33 @@ interface PolymarketMarket { endDateIso?: string; // ISO date e.g. "2026-03-31" } +// Keywords mirroring Kalshi's 24 targeted series (KXBTC, KXFED, KXTRUMPRESIGN, etc.) +// A Polymarket market must match at least one group to be included. +// This replaces pure volume-sorted pagination with topic-targeted fetching. +const KALSHI_TOPIC_PATTERNS: RegExp[] = [ + // Crypto — KXBTC, KXETH, KXXRP + /\b(bitcoin|btc|ethereum|eth|xrp|ripple|crypto)\b/i, + // Economics / Fed — KXFED, KXCPI, KXGDP + /\b(federal funds|fed rate|interest rate|fomc|cpi|inflation|gdp|recession)\b/i, + // US Politics — KXTRUMPRESIGN, KXTRUMPPARDONS, KXNEXTSPEAKER, KXPRESPERSON, KXPRESPARTY, KXNEXTPRESSEC, KXAMEND22 + /\b(trump|pardon|speaker of the house|next president|22nd amendment|press secretary)\b/i, + // Geopolitics — KXZELENSKYPUTIN, KXTAIWANLVL4, KXNEXTISRAELPM, KXWITHDRAW, KXUSTAKEOVER + /\b(zelensky|putin|russia|ukraine|taiwan|israel|prime minister|troop withdrawal)\b/i, + // Tech / AI — KXOAIANTH, KXAGICO, KXDATACENTER + /\b(openai|anthropic|agi|artificial general intelligence|data center)\b/i, + // Elections / leadership — KXNEWPOPE, KXNEXTUKPM, KXUKPARTY + /\b(pope|uk (election|prime minister|party)|labour|conservative|reform party)\b/i, +]; + +/** + * Returns true if the market question matches at least one Kalshi topic area. + * This keeps Polymarket fetches aligned with the topics Kalshi covers so + * the arbitrage matcher sees comparable markets on both sides. + */ +function matchesKalshiTopics(question: string): boolean { + return KALSHI_TOPIC_PATTERNS.some(re => re.test(question)); +} + /** * Returns true only for simple binary Yes/No markets. * Filters out multi-outcome and non-binary markets. @@ -92,6 +119,7 @@ export async function fetchPolymarkets( const pageBinary = data .filter(isBinaryMarket) + .filter(pm => matchesKalshiTopics(pm.question)) .map(toMarket) .filter(m => m.yesPrice > 0 && m.yesPrice < 1); @@ -99,7 +127,7 @@ export async function fetchPolymarkets( console.log( `[Musashi] Polymarket page ${page + 1}: ${data.length} raw → ` + - `${pageBinary.length} binary (total: ${allMarkets.length})` + `${pageBinary.length} topic-matched (total: ${allMarkets.length})` ); if (allMarkets.length >= targetCount || data.length < PAGE_SIZE) break;