Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 28 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,39 @@ A small MCP server that turns a YouTube, Spotify, or Apple Podcasts URL into a m

## What it does

One MCP tool:
Three MCP tools:

```
get_transcript(url, language="en", include_timestamps=False) -> markdown
get_episode_info(url) -> markdown
list_recent_episodes(url, limit=10) -> markdown
```

### `get_transcript` — full transcript

- **YouTube URLs** (`youtube.com/watch`, `youtu.be`, `/shorts/`, `/embed/`, `/live/`) — fetches manual or auto-generated captions and renders coalesced paragraphs with YAML frontmatter (title, channel, duration, language, fetched_at, etc.).
- **Spotify episode URLs** (`open.spotify.com/episode/...`) — Spotify has no public transcript API, so the server resolves the episode title via Spotify's oEmbed endpoint, finds the same upload on YouTube via search, and fetches captions there. Works for podcasts that double-post. Spotify-exclusives return a clear error naming the best guess so you can decide whether to find an alternate source.
- **Apple Podcasts URLs** (`podcasts.apple.com/.../id<show>?i=<episode>`) — same pattern as Spotify but via the public iTunes Lookup API to get the episode title. Apple-exclusives won't work; very old episodes (outside the most recent 200 for a show) return a clear "out of lookup window" error.
- Optional `include_timestamps=true` adds `## [MM:SS]` section markers every 5 minutes.

### `get_episode_info` — metadata only, no transcript

Returns YAML frontmatter (title, show/channel, duration, release date, etc.) and a brief description body when one is available. Useful when you want to confirm "is this the right episode?" or just reference an episode in conversation without spending tokens on a full transcript.

- **YouTube** — full metadata via yt-dlp (title, channel, duration, upload date, description).
- **Apple** — full metadata via iTunes Lookup (title, show, duration, release date, description, audio URL).
- **Spotify** — limited metadata via oEmbed (title and show only — Spotify does not expose duration or release date publicly without authenticated API access).

### `list_recent_episodes` — discovery

Returns a numbered markdown list of recent episodes/videos with titles, dates, and durations.

- **Apple Podcasts show URL** (`podcasts.apple.com/.../id<show>` without `?i=`) — recent episodes via iTunes Lookup. If you pass an episode URL, the `?i=` is ignored and the show is listed.
- **YouTube channel URL** (`youtube.com/@name`, `/channel/...`, `/c/...`, `/user/...`) or **playlist URL** (`youtube.com/playlist?list=...`) — recent videos via yt-dlp. Bare channel URLs are auto-resolved to the `/videos` tab.
- **Spotify show URLs are not supported** — Spotify has no public episodes API. Returns a clear error message; try the same show on Apple Podcasts or YouTube.

`limit` is capped at 50.

## Why a proxy is required on Railway

YouTube aggressively blocks transcript requests from cloud-provider IPs. On a residential connection it works without a proxy; on Railway it will fail without one. The server has built-in support for [Webshare](https://www.webshare.io) residential proxies (cheap, ~$1/GB; transcripts are tiny so even the smallest plan lasts forever). Set the proxy env vars below and the underlying library handles rotation.
Expand Down Expand Up @@ -250,12 +272,12 @@ Claude Code should call `get_transcript` and ground the summary in the returned
└── src/tldl/
├── __init__.py
├── config.py # env-var loader, fail-closed
├── markdown.py # frontmatter + paragraph coalescing
├── youtube.py # video_id parse + proxy-aware transcript fetch + yt-dlp metadata
├── markdown.py # frontmatter + paragraph coalescing + episode-info / list formatters
├── youtube.py # video_id parse + proxy-aware transcript fetch + yt-dlp metadata + channel listing
├── resolver.py # shared YouTube search + rapidfuzz scoring (used by Spotify + Apple)
├── spotify.py # oEmbed → resolver
├── apple.py # iTunes Lookup → resolver
└── server.py # FastMCP app, get_transcript tool, /healthz route
├── spotify.py # oEmbed → resolver / episode-info
├── apple.py # iTunes Lookup → resolver / episode-info / show-listing
└── server.py # FastMCP app: get_transcript + get_episode_info + list_recent_episodes + /healthz
```

## Credits
Expand Down
119 changes: 102 additions & 17 deletions src/tldl/apple.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,18 @@
_SHOW_ID_RE = re.compile(r"/id(\d+)")


def _parse_apple_url(url: str) -> tuple[str, str]:
"""Returns (show_id, episode_track_id). Raises ResolutionError on a malformed URL."""
def _parse_show_id(url: str) -> str:
m = _SHOW_ID_RE.search(url)
if not m:
raise ResolutionError(
"Apple Podcasts URL is missing the show id (expected '/id<digits>')."
)
show_id = m.group(1)
return m.group(1)


def _parse_apple_url(url: str) -> tuple[str, str]:
"""Returns (show_id, episode_track_id). Raises ResolutionError on a malformed URL."""
show_id = _parse_show_id(url)
qs = parse_qs(urlparse(url).query)
i_vals = qs.get("i", [])
if not i_vals or not i_vals[0].isdigit():
Expand All @@ -33,30 +37,61 @@ def _parse_apple_url(url: str) -> tuple[str, str]:
return show_id, i_vals[0]


def _itunes_lookup(show_id: str, episode_track_id: str) -> dict[str, Any]:
"""
Hit the iTunes Lookup API for the show and find the target episode by trackId.
Returns {"title": str, "show": str|None}.
"""
def _itunes_fetch(show_id: str, limit: int) -> tuple[list[dict[str, Any]], int]:
"""Hit iTunes Lookup. Returns (results, requested_limit) so callers can detect cap-hit."""
try:
r = httpx.get(
ITUNES_LOOKUP_URL,
params={"id": show_id, "entity": "podcastEpisode", "limit": LOOKUP_LIMIT},
params={"id": show_id, "entity": "podcastEpisode", "limit": limit},
timeout=10.0,
follow_redirects=True,
)
except httpx.HTTPError as e:
raise ResolutionError(f"Failed to reach iTunes Lookup: {e}") from e
r.raise_for_status()
data = r.json()
results = data.get("results") or []
target_id = int(episode_track_id)
return r.json().get("results") or [], limit

show_name: str | None = None

def _show_name_from_results(
results: list[dict[str, Any]], show_id: str
) -> str | None:
show_id_int = int(show_id)
for item in results:
if item.get("wrapperType") == "track" and item.get("trackId") == int(show_id):
show_name = item.get("collectionName") or item.get("trackName")
break
if item.get("wrapperType") == "track" and item.get("trackId") == show_id_int:
return item.get("collectionName") or item.get("trackName")
return None


def _episode_to_info(item: dict[str, Any], show_name: str | None) -> dict[str, Any]:
"""Normalize an iTunes podcastEpisode dict into our shape."""
release = (item.get("releaseDate") or "").split("T")[0] or None
duration_ms = item.get("trackTimeMillis")
duration_s = int(duration_ms / 1000) if duration_ms else None
show_id = item.get("collectionId")
track_id = item.get("trackId")
episode_url = (
f"https://podcasts.apple.com/us/podcast/id{show_id}?i={track_id}"
if show_id and track_id
else None
)
return {
"title": (item.get("trackName") or "").strip(),
"show": (item.get("collectionName") or show_name or "").strip() or None,
"release_date": release,
"duration": duration_s,
"description": (item.get("description") or "").strip() or None,
"audio_url": item.get("episodeUrl"),
"episode_url": episode_url,
"track_id": track_id,
"show_id": show_id,
}


def _itunes_lookup(show_id: str, episode_track_id: str) -> dict[str, Any]:
"""Find a single episode by trackId. Returns {"title", "show"}."""
results, _ = _itunes_fetch(show_id, LOOKUP_LIMIT)
target_id = int(episode_track_id)
show_name = _show_name_from_results(results, show_id)

for item in results:
if item.get("wrapperType") != "podcastEpisode":
Expand All @@ -69,7 +104,7 @@ def _itunes_lookup(show_id: str, episode_track_id: str) -> dict[str, Any]:
).strip() or None,
}

if data.get("resultCount", 0) >= LOOKUP_LIMIT:
if len(results) >= LOOKUP_LIMIT:
raise ResolutionError(
f"Episode not found in the most recent {LOOKUP_LIMIT} episodes for this show. "
"It may be too old to look up via the iTunes API."
Expand All @@ -92,3 +127,53 @@ def resolve_apple_to_youtube(
if not info["title"]:
raise ResolutionError("iTunes Lookup returned no episode title.")
return find_youtube_match(info["title"], info["show"])


def fetch_apple_episode_info(apple_url: str) -> dict[str, Any]:
    """
    Look up a single Apple Podcasts episode and return its normalized info dict.

    No YouTube resolution happens here: the URL is parsed, the show's episodes
    are fetched from iTunes Lookup, and the record whose ``trackId`` matches the
    URL's episode id is normalized. The original URL is attached under
    ``"source_url"``.

    Raises ResolutionError when the episode is not among the returned results;
    the message differs depending on whether the lookup window was filled.
    """
    show_id, episode_track_id = _parse_apple_url(apple_url)
    wanted_id = int(episode_track_id)
    results, _ = _itunes_fetch(show_id, LOOKUP_LIMIT)
    show_name = _show_name_from_results(results, show_id)

    # First episode record whose trackId matches, or None when absent.
    match = next(
        (
            entry
            for entry in results
            if entry.get("wrapperType") == "podcastEpisode"
            and entry.get("trackId") == wanted_id
        ),
        None,
    )
    if match is not None:
        info = _episode_to_info(match, show_name)
        info["source_url"] = apple_url
        return info

    # A short result set means the lookup window was not exhausted.
    if len(results) < LOOKUP_LIMIT:
        raise ResolutionError("Episode not found in iTunes Lookup results.")
    raise ResolutionError(
        f"Episode not found in the most recent {LOOKUP_LIMIT} episodes for this show."
    )


def list_apple_episodes(show_url: str, limit: int) -> dict[str, Any]:
    """
    Return the most recent episodes of the show behind an Apple Podcasts URL.

    The result has the shape
    {"show", "show_id", "source_url", "episodes": [info_dict, ...]} with at most
    ``limit`` episodes. Raises ResolutionError when the lookup yields no
    episode records.
    """
    show_id = _parse_show_id(show_url)

    # One more than `limit` is requested (presumably to leave room for the
    # show's own record in the results), never fewer than 1 and never more
    # than LOOKUP_LIMIT.
    fetch_count = max(limit + 1, 1)
    if fetch_count > LOOKUP_LIMIT:
        fetch_count = LOOKUP_LIMIT

    results, _ = _itunes_fetch(show_id, fetch_count)
    show_name = _show_name_from_results(results, show_id)

    episodes: list[dict[str, Any]] = []
    for entry in results:
        if entry.get("wrapperType") != "podcastEpisode":
            continue
        episodes.append(_episode_to_info(entry, show_name))

    if not episodes:
        raise ResolutionError("No episodes found for this show.")

    return {
        "show": show_name,
        "show_id": int(show_id),
        "source_url": show_url,
        "episodes": episodes[:limit],
    }
92 changes: 92 additions & 0 deletions src/tldl/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,98 @@ def _frontmatter(fields: dict[str, Any]) -> str:
return "\n".join(lines)


def _now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    current_utc = datetime.now(tz=timezone.utc)
    return f"{current_utc:%Y-%m-%dT%H:%M:%SZ}"


def render_episode_info(info: dict[str, Any], source: str) -> str:
    """
    Render an episode-info dict as markdown: frontmatter, an H1 title, and the
    description as body text when one is present.

    `source` is the platform key ("youtube" / "spotify" / "apple") and is
    written into the frontmatter as-is.
    """
    # Durations of an hour or more are formatted with an hours component.
    seconds = info.get("duration") or 0
    pretty_duration = None
    if seconds:
        pretty_duration = _fmt_ts(seconds, force_hours=seconds >= 3600)

    # Compact YYYYMMDD upload dates are rewritten as YYYY-MM-DD.
    upload_day = info.get("upload_date")
    if upload_day and len(upload_day) == 8 and upload_day.isdigit():
        upload_day = "-".join((upload_day[:4], upload_day[4:6], upload_day[6:]))

    fields: dict[str, Any] = {
        "title": info.get("title"),
        "channel": info.get("channel"),
        "show": info.get("show"),
        "source": source,
        "source_url": info.get("source_url") or info.get("webpage_url"),
        "duration": pretty_duration,
        "release_date": info.get("release_date") or upload_day,
        "video_id": info.get("video_id"),
        "show_id": info.get("show_id"),
        "track_id": info.get("track_id"),
        "audio_url": info.get("audio_url"),
        "fetched_at": _now_iso(),
    }

    heading = info.get("title") or "Episode info"
    sections = [_frontmatter(fields), f"# {heading}"]

    body = info.get("description")
    if body:
        sections.append(body)

    return "\n\n".join(sections) + "\n"


def render_episode_list(payload: dict[str, Any], source: str) -> str:
    """
    Format a list of recent episodes as markdown frontmatter + numbered list.

    `payload` shape:
    {"show" or "channel": str, "source_url": str, "episodes" or "videos": [...]}
    `source` is "apple" / "youtube"; "youtube" selects the "channel"/"videos"
    keys, anything else selects "show"/"episodes".

    Each list line is ``N. **title**`` (linked when the item has a "url" or
    "episode_url"), followed by the date and duration when available.
    """
    is_youtube = source == "youtube"
    items_key = "videos" if is_youtube else "episodes"
    name_key = "channel" if is_youtube else "show"
    items: list[dict[str, Any]] = payload.get(items_key) or []

    fm: dict[str, Any] = {
        name_key: payload.get(name_key),
        "source": source,
        "source_url": payload.get("source_url"),
        "count": len(items),
        "fetched_at": _now_iso(),
    }
    if "show_id" in payload:
        fm["show_id"] = payload.get("show_id")

    parts = [_frontmatter(fm)]
    heading_name = payload.get(name_key) or f"this {name_key}"
    parts.append(f"# Recent from {heading_name}")

    body_lines: list[str] = []
    for i, item in enumerate(items, 1):
        title = item.get("title") or "(untitled)"
        url = item.get("url") or item.get("episode_url") or ""
        meta_bits: list[str] = []

        date = item.get("upload_date") or item.get("release_date")
        if date:
            # Normalize to text once so a numeric YYYYMMDD value is sliced
            # safely instead of raising TypeError on the raw value.
            date_text = str(date)
            if len(date_text) == 8 and date_text.isdigit():
                date_text = f"{date_text[0:4]}-{date_text[4:6]}-{date_text[6:8]}"
            meta_bits.append(date_text)

        duration_s = item.get("duration") or 0
        if duration_s:
            whole_seconds = int(duration_s)
            meta_bits.append(_fmt_ts(whole_seconds, force_hours=whole_seconds >= 3600))

        meta = " — ".join(meta_bits)
        line = f"{i}. **{title}**"
        if url:
            line = f"{i}. **[{title}]({url})**"
        if meta:
            line += f" — {meta}"
        body_lines.append(line)

    parts.append("\n".join(body_lines))
    return "\n\n".join(parts) + "\n"


def _coalesce_paragraphs(snippets: list[Any]) -> list[tuple[float, str]]:
"""
Group raw caption snippets into (paragraph_start_seconds, paragraph_text) tuples.
Expand Down
Loading