diff --git a/app-store/arxiv/arxiv_background.py b/app-store/arxiv/arxiv_background.py
new file mode 100644
index 0000000..821f431
--- /dev/null
+++ b/app-store/arxiv/arxiv_background.py
@@ -0,0 +1,77 @@
+from __future__ import annotations
+
+import atexit
+import logging
+import os
+import sys
+
+from app_runtime.background import BackgroundRunContext, run_background
+from truffle.app.background_pb2 import BackgroundContext
+
+from arxiv_bg_worker import ArxivBackgroundWorker
+
+logger = logging.getLogger("arxiv.background")
+logger.setLevel(logging.INFO)
+
+_worker: ArxivBackgroundWorker | None = None
+_PRIORITY_DEFAULT = getattr(
+    BackgroundContext,
+    "PRIORITY_DEFAULT",
+    getattr(BackgroundContext, "PRIORITY_HIGH", 1),
+)
+
+
+def _is_verify_mode() -> bool:
+    return bool(sys.argv and len(sys.argv) > 1 and "verify" in sys.argv[1].lower())
+
+
+def _ensure_worker() -> ArxivBackgroundWorker:
+    global _worker
+    if _worker is None:
+        interests = str(os.getenv("ARXIV_RESEARCH_INTERESTS", "")).strip()
+        _worker = ArxivBackgroundWorker(interests_raw=interests)
+    return _worker
+
+
+def _submit(ctx: BackgroundRunContext, content: str) -> None:
+    ctx.bg.submit_context(
+        content=content,
+        uris=[],
+        priority=_PRIORITY_DEFAULT,
+    )
+
+
+def arxiv_ambient(ctx: BackgroundRunContext) -> None:
+    worker = _ensure_worker()
+    result = worker.run_cycle()
+
+    if result.error:
+        logger.error("ArXiv background cycle failed", extra={"error": result.error})
+        return
+    if not result.content:
+        logger.info("ArXiv background cycle produced no new recommendations")
+        return
+
+    _submit(ctx, result.content)
+
+
+def verify() -> int:
+    worker = _ensure_worker()
+    ok, message = worker.verify()
+    if ok:
+        logger.info(message)
+        return 0
+    logger.error(message)
+    return 1
+
+
+def _cleanup() -> None:
+    global _worker
+    _worker = None
+
+
+if __name__ == "__main__":
+    atexit.register(_cleanup)
+    if _is_verify_mode():
+        sys.exit(verify())
+    run_background(arxiv_ambient)
diff --git a/app-store/arxiv/arxiv_bg_worker.py b/app-store/arxiv/arxiv_bg_worker.py
new file mode 100644
index 0000000..3a2c0da
--- /dev/null
+++ b/app-store/arxiv/arxiv_bg_worker.py
@@ -0,0 +1,134 @@
+from __future__ import annotations
+
+import json
+import logging
+from dataclasses import dataclass
+from datetime import timezone
+from typing import Any
+
+import arxiv
+
+from arxiv_common import get_bg_state_path, parse_research_interests
+
+logger = logging.getLogger("arxiv.bg_worker")
+logger.setLevel(logging.INFO)
+
+
+@dataclass
+class ArxivRecommendation:
+    interest: str
+    paper_id: str
+    title: str
+    published: str
+    abs_url: str
+    summary: str
+
+
+@dataclass
+class BgRunResult:
+    content: str | None = None
+    error: str | None = None
+
+
+class ArxivBackgroundWorker:
+    def __init__(self, interests_raw: str) -> None:
+        self._interests_raw = interests_raw
+        self._client = arxiv.Client()
+        self._state_path = get_bg_state_path()
+
+    @property
+    def interests(self) -> list[str]:
+        return parse_research_interests(self._interests_raw)
+
+    def verify(self) -> tuple[bool, str]:
+        interests = self.interests
+        if not interests:
+            return False, "No research interests configured. Provide at least one interest."
+        return True, f"ArXiv background configured with {len(interests)} interest(s)."
+
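+    # One ambient pass: query each configured interest newest-first, skip
+    # paper IDs already recorded in the state file, and stop after three
+    # fresh recommendations.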
+    def run_cycle(self) -> BgRunResult:
+        interests = self.interests
+        if not interests:
+            return BgRunResult(error="no_interests")
+
+        state = self._load_state()
+        seen_ids: set[str] = set(state.get("seen_ids") or [])
+        recommendations: list[ArxivRecommendation] = []
+
+        for interest in interests:
+            for paper in self._search_interest(interest, max_results=8):
+                paper_id = paper.get_short_id()
+                if not paper_id or paper_id in seen_ids:
+                    continue
+                published_iso = paper.published.astimezone(timezone.utc).date().isoformat()
+                recommendations.append(
+                    ArxivRecommendation(
+                        interest=interest,
+                        paper_id=paper_id,
+                        title=paper.title.strip(),
+                        published=published_iso,
+                        abs_url=f"https://arxiv.org/abs/{paper_id}",
+                        summary=" ".join((paper.summary or "").split())[:450],
+                    )
+                )
+                seen_ids.add(paper_id)
+                if len(recommendations) >= 3:
+                    break
+            if len(recommendations) >= 3:
+                break
+
+        if not recommendations:
+            return BgRunResult(content=None)
+
+        state["seen_ids"] = sorted(seen_ids)[-1500:]
+        self._save_state(state)
+        return BgRunResult(content=self._build_context(recommendations))
+
+    def _search_interest(self, interest: str, *, max_results: int) -> list[arxiv.Result]:
+        try:
+            search = arxiv.Search(
+                query=interest,
+                max_results=max_results,
+                sort_by=arxiv.SortCriterion.SubmittedDate,
+            )
+            return list(self._client.results(search))
+        except Exception as exc:
+            logger.warning("arXiv search failed for interest '%s': %s", interest, exc)
+            return []
+
+    def _load_state(self) -> dict[str, Any]:
+        path = self._state_path
+        try:
+            if not path.exists():
+                return {"seen_ids": []}
+            data = json.loads(path.read_text(encoding="utf-8"))
+            if isinstance(data, dict):
+                return data
+        except Exception as exc:
+            logger.warning("Failed to load BG state: %s", exc)
+        return {"seen_ids": []}
+
+    def _save_state(self, state: dict[str, Any]) -> None:
+        path = self._state_path
+        try:
+            path.parent.mkdir(parents=True, exist_ok=True)
+            path.write_text(json.dumps(state, indent=2, sort_keys=True), encoding="utf-8")
+        except Exception as exc:
+            logger.warning("Failed to save BG state: %s", exc)
+
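+    # Format the recommendations as a plain-text context block that asks the
+    # model to fetch and summarize each paper.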
+    def _build_context(self, items: list[ArxivRecommendation]) -> str:
+        lines: list[str] = [
+            "These are newly published arXiv papers matching the user's research interests.",
+            "Please use a research tool like Exa or web search to read each paper and provide the user a summary and notes.",
+            "",
+            "Recommended papers:",
+        ]
+        for idx, item in enumerate(items, start=1):
+            lines.append(
+                f"{idx}. {item.title} (arXiv:{item.paper_id}, published {item.published})"
+            )
+            lines.append(f"   Interest match: {item.interest}")
+            lines.append(f"   URL: {item.abs_url}")
+            if item.summary:
+                lines.append(f"   Abstract snippet: {item.summary}")
+        return "\n".join(lines)
diff --git a/app-store/arxiv/arxiv_common.py b/app-store/arxiv/arxiv_common.py
new file mode 100644
index 0000000..7ef1bd8
--- /dev/null
+++ b/app-store/arxiv/arxiv_common.py
@@ -0,0 +1,42 @@
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+MAX_RESULTS = 50
+DEFAULT_STORAGE_PATH = Path.home() / ".arxiv-mcp-server" / "papers"
+DEFAULT_BG_STATE_PATH = Path("/root/.arxiv-truffle/arxiv_bg_state.json")
+
+
+def get_storage_path() -> Path:
+    raw = str(os.getenv("ARXIV_STORAGE_PATH", "")).strip()
+    path = Path(raw) if raw else DEFAULT_STORAGE_PATH
+    path = path.resolve()
+    path.mkdir(parents=True, exist_ok=True)
+    return path
+
+
+def get_bg_state_path() -> Path:
+    raw = str(os.getenv("ARXIV_BG_STATE_PATH", "")).strip()
+    path = Path(raw) if raw else DEFAULT_BG_STATE_PATH
+    path.parent.mkdir(parents=True, exist_ok=True)
+    return path
+
+
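+# Interests arrive as one comma- or newline-separated string (from the
+# ARXIV_RESEARCH_INTERESTS env var); normalize newlines to commas and dedupe
+# case-insensitively while preserving order.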
+def parse_research_interests(raw: str | None) -> list[str]:
+    if not raw:
+        return []
+    normalized = raw.replace("\n", ",")
+    out: list[str] = []
+    seen: set[str] = set()
+    for part in normalized.split(","):
+        interest = part.strip()
+        if not interest:
+            continue
+        key = interest.lower()
+        if key in seen:
+            continue
+        seen.add(key)
+        out.append(interest)
+    return out
+
diff --git a/app-store/arxiv/arxiv_foreground.py b/app-store/arxiv/arxiv_foreground.py
new file mode 100644
index 0000000..aaf48d3
--- /dev/null
+++ b/app-store/arxiv/arxiv_foreground.py
@@ -0,0 +1,92 @@
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from app_runtime.mcp import create_mcp_server, run_mcp_server
+from mcp.types import Icon
+
+from arxiv_tools import (
+    build_deep_analysis_prompt,
+    download_paper,
+    list_papers,
+    read_paper,
+    search_papers,
+)
+
+logger = logging.getLogger("arxiv.foreground")
+logger.setLevel(logging.INFO)
+
+mcp = create_mcp_server("arxiv")
+
+
+@mcp.tool(
+    "search_papers",
+    description=(
+        "Search arXiv papers with optional date/category filters. "
+        "Use quoted phrases for exact matches (for example: \"multi-agent systems\") "
+        "and categories for precision (for example: cs.AI, cs.LG, cs.CL)."
+    ),
+    icons=[Icon(src="https://raw.githubusercontent.com/phosphor-icons/core/main/assets/regular/magnifying-glass.svg")],
+)
+async def tool_search_papers(
+    query: str,
+    max_results: int = 10,
+    date_from: str | None = None,
+    date_to: str | None = None,
+    categories: list[str] | None = None,
+    sort_by: str = "relevance",
+) -> dict[str, Any]:
+    return await search_papers(
+        query=query,
+        max_results=max_results,
+        date_from=date_from,
+        date_to=date_to,
+        categories=categories,
+        sort_by=sort_by,
+    )
+
+
+@mcp.tool(
+    "download_paper",
+    description=(
+        "Download an arXiv paper by ID and convert it to markdown for local reading. "
+        "Use check_status=true to poll conversion progress."
+    ),
+    icons=[Icon(src="https://raw.githubusercontent.com/phosphor-icons/core/main/assets/regular/download-simple.svg")],
+)
+async def tool_download_paper(
+    paper_id: str,
+    check_status: bool = False,
+) -> dict[str, Any]:
+    return await download_paper(paper_id=paper_id, check_status=check_status)
+
+
+@mcp.tool(
+    "list_papers",
+    description="List papers currently downloaded to local storage.",
+    icons=[Icon(src="https://raw.githubusercontent.com/phosphor-icons/core/main/assets/regular/list.svg")],
+)
+async def tool_list_papers() -> dict[str, Any]:
+    return await list_papers()
+
+
+@mcp.tool(
+    "read_paper",
+    description="Read full markdown content for a downloaded paper by arXiv ID.",
+    icons=[Icon(src="https://raw.githubusercontent.com/phosphor-icons/core/main/assets/regular/book-open-text.svg")],
+)
+async def tool_read_paper(paper_id: str) -> dict[str, Any]:
+    return await read_paper(paper_id=paper_id)
+
+
+@mcp.prompt(
+    "deep-paper-analysis",
+    description="Generate a structured deep analysis instruction set for a paper ID.",
+)
+async def prompt_deep_paper_analysis(paper_id: str) -> list[dict[str, str]]:
+    return [{"role": "user", "content": build_deep_analysis_prompt(paper_id)}]
+
+
+if __name__ == "__main__":
+    run_mcp_server(mcp, logger)
diff --git a/app-store/arxiv/arxiv_tools.py b/app-store/arxiv/arxiv_tools.py
new file mode 100644
index 0000000..c263d59
--- /dev/null
+++ b/app-store/arxiv/arxiv_tools.py
@@ -0,0 +1,435 @@
+from __future__ import annotations
+
+import asyncio
+import logging
+import re
+import xml.etree.ElementTree as ET
+from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+import arxiv
+import httpx
+from dateutil import parser as date_parser
+from pypdf import PdfReader
+
+from arxiv_common import MAX_RESULTS, get_storage_path
+
+logger = logging.getLogger("arxiv.tools")
+logger.setLevel(logging.INFO)
+
+ARXIV_API_URL = "https://export.arxiv.org/api/query"
+ARXIV_NS = {
+    "atom": "http://www.w3.org/2005/Atom",
+    "arxiv": "http://arxiv.org/schemas/atom",
+}
+
+VALID_CATEGORIES = {
+    "cs",
+    "econ",
+    "eess",
+    "math",
+    "physics",
+    "q-bio",
+    "q-fin",
+    "stat",
+    "astro-ph",
+    "cond-mat",
+    "gr-qc",
+    "hep-ex",
+    "hep-lat",
+    "hep-ph",
+    "hep-th",
+    "math-ph",
+    "nlin",
+    "nucl-ex",
+    "nucl-th",
+    "quant-ph",
+}
+
+
+@dataclass
+class ConversionStatus:
+    paper_id: str
+    status: str
+    started_at: datetime
+    completed_at: datetime | None = None
+    error: str | None = None
+
+
+_conversion_statuses: dict[str, ConversionStatus] = {}
+# Hold references to fire-and-forget conversion tasks so the event loop does
+# not garbage-collect them before they finish.
+_conversion_tasks: set[asyncio.Task] = set()
+_SLASH_TOKEN = "__arxiv_slash__"
+
+
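+# Old-style arXiv IDs (e.g. "cs/0112017") contain a slash; swap it for a
+# token so every paper maps to a flat filename in the storage directory.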
+def _safe_paper_filename(paper_id: str) -> str:
+    return paper_id.replace("/", _SLASH_TOKEN)
+
+
+def _get_paper_path(paper_id: str, suffix: str = ".md") -> Path:
+    storage = get_storage_path()
+    storage.mkdir(parents=True, exist_ok=True)
+    return storage / f"{_safe_paper_filename(paper_id)}{suffix}"
+
+
+def list_stored_paper_ids() -> list[str]:
+    storage = get_storage_path()
+    ids: list[str] = []
+    for path in storage.glob("*.md"):
+        ids.append(path.stem.replace(_SLASH_TOKEN, "/"))
+    return ids
+
+
+def _validate_categories(categories: list[str]) -> bool:
+    for category in categories:
+        prefix = category.split(".", 1)[0] if "." in category else category
+        if prefix not in VALID_CATEGORIES:
+            logger.warning("Unknown category prefix: %s", prefix)
+            return False
+    return True
+
+
+def _optimize_query(query: str) -> str:
+    # Currently a pass-through: queries that already use arXiv field prefixes,
+    # boolean operators, or exact-phrase quotes are recognized but returned
+    # unchanged, leaving room for smarter rewriting later.
+    if any(field in query for field in ["ti:", "au:", "abs:", "cat:", "AND", "OR", "ANDNOT"]):
+        return query
+    if query.startswith('"') and query.endswith('"'):
+        return query
+    return query
+
+
+def _parse_arxiv_atom_response(xml_text: str) -> list[dict[str, Any]]:
+    results: list[dict[str, Any]] = []
+    try:
+        root = ET.fromstring(xml_text)
+    except ET.ParseError as exc:
+        raise ValueError(f"Failed to parse arXiv API response: {exc}") from exc
+
+    for entry in root.findall("atom:entry", ARXIV_NS):
+        id_elem = entry.find("atom:id", ARXIV_NS)
+        if id_elem is None or not id_elem.text:
+            continue
+
+        paper_id = id_elem.text.split("/abs/")[-1]
+        # Strip a trailing version suffix (e.g. "v2") rather than splitting on
+        # the first "v", which would mangle IDs containing a "v" elsewhere.
+        short_id = re.sub(r"v\d+$", "", paper_id)
+
+        title_elem = entry.find("atom:title", ARXIV_NS)
+        title = title_elem.text.strip().replace("\n", " ") if title_elem is not None and title_elem.text else ""
+
+        authors: list[str] = []
+        for author in entry.findall("atom:author", ARXIV_NS):
+            name_elem = author.find("atom:name", ARXIV_NS)
+            if name_elem is not None and name_elem.text:
+                authors.append(name_elem.text)
+
+        summary_elem = entry.find("atom:summary", ARXIV_NS)
+        abstract = summary_elem.text.strip().replace("\n", " ") if summary_elem is not None and summary_elem.text else ""
+
+        categories: list[str] = []
+        for cat in entry.findall("arxiv:primary_category", ARXIV_NS):
+            term = cat.get("term")
+            if term:
+                categories.append(term)
+        for cat in entry.findall("atom:category", ARXIV_NS):
+            term = cat.get("term")
+            if term and term not in categories:
+                categories.append(term)
+
+        published_elem = entry.find("atom:published", ARXIV_NS)
+        published = published_elem.text if published_elem is not None and published_elem.text else ""
+
+        pdf_url: str | None = None
+        for link in entry.findall("atom:link", ARXIV_NS):
+            if link.get("title") == "pdf":
+                pdf_url = link.get("href")
+                break
+        if not pdf_url:
+            pdf_url = f"https://arxiv.org/pdf/{paper_id}"
+
+        results.append(
+            {
+                "id": short_id,
+                "title": title,
+                "authors": authors,
+                "abstract": abstract,
+                "categories": categories,
+                "published": published,
+                "url": pdf_url,
+                "resource_uri": f"arxiv://{short_id}",
+            }
+        )
+
+    return results
+
+
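+# Date-filtered searches bypass the arxiv client library and call the export
+# API directly, encoding a submittedDate range into the query string.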
+async def _raw_arxiv_search(
+    *,
+    query: str,
+    max_results: int = 10,
+    sort_by: str = "relevance",
+    date_from: str | None = None,
+    date_to: str | None = None,
+    categories: list[str] | None = None,
+) -> list[dict[str, Any]]:
+    query_parts: list[str] = []
+
+    if query.strip():
+        query_parts.append(f"({query})")
+
+    if categories:
+        category_filter = " OR ".join(f"cat:{cat}" for cat in categories)
+        query_parts.append(f"({category_filter})")
+
+    if date_from or date_to:
+        try:
+            start_date = date_parser.parse(date_from).strftime("%Y%m%d0000") if date_from else "199107010000"
+            end_date = date_parser.parse(date_to).strftime("%Y%m%d2359") if date_to else datetime.now().strftime("%Y%m%d2359")
+        except (ValueError, TypeError) as exc:
+            raise ValueError(f"Invalid date format. Use YYYY-MM-DD format: {exc}") from exc
+        query_parts.append(f"submittedDate:[{start_date}+TO+{end_date}]")
+
+    if not query_parts:
+        raise ValueError("No search criteria provided")
+
+    final_query = " AND ".join(query_parts)
+    sort_map = {"relevance": "relevance", "date": "submittedDate"}
+    encoded_query = final_query.replace(" AND ", "+AND+").replace(" OR ", "+OR+").replace(" ", "+")
+    url = (
+        f"{ARXIV_API_URL}?search_query={encoded_query}"
+        f"&max_results={max_results}"
+        f"&sortBy={sort_map.get(sort_by, 'relevance')}"
+        "&sortOrder=descending"
+    )
+
+    async with httpx.AsyncClient(timeout=30.0) as client:
+        response = await client.get(url)
+        response.raise_for_status()
+        return _parse_arxiv_atom_response(response.text)
+
+
+def _process_paper(paper: arxiv.Result) -> dict[str, Any]:
+    return {
+        "id": paper.get_short_id(),
+        "title": paper.title,
+        "authors": [author.name for author in paper.authors],
+        "abstract": paper.summary,
+        "categories": paper.categories,
+        "published": paper.published.isoformat(),
+        "url": paper.pdf_url,
+        "resource_uri": f"arxiv://{paper.get_short_id()}",
+    }
+
+
+async def search_papers(
+    *,
+    query: str,
+    max_results: int = 10,
+    date_from: str | None = None,
+    date_to: str | None = None,
+    categories: list[str] | None = None,
+    sort_by: str = "relevance",
+) -> dict[str, Any]:
+    try:
+        bounded_max = min(int(max_results), MAX_RESULTS)
+        if categories and not _validate_categories(categories):
+            return {"status": "error", "message": "Invalid category provided. Please check arXiv category names."}
+
+        if date_from or date_to:
+            results = await _raw_arxiv_search(
+                query=_optimize_query(query) if query.strip() else "",
+                max_results=bounded_max,
+                sort_by=sort_by,
+                date_from=date_from,
+                date_to=date_to,
+                categories=categories,
+            )
+            return {"status": "success", "total_results": len(results), "papers": results}
+
+        client = arxiv.Client()
+        query_parts: list[str] = []
+        if query.strip():
+            query_parts.append(f"({_optimize_query(query)})")
+        if categories:
+            query_parts.append("(" + " OR ".join(f"cat:{cat}" for cat in categories) + ")")
+        if not query_parts:
+            return {"status": "error", "message": "No search criteria provided"}
+
+        final_query = " ".join(query_parts)
+        sort_criterion = arxiv.SortCriterion.SubmittedDate if sort_by == "date" else arxiv.SortCriterion.Relevance
+        search = arxiv.Search(query=final_query, max_results=bounded_max, sort_by=sort_criterion)
+
+        papers: list[dict[str, Any]] = []
+        for paper in client.results(search):
+            if len(papers) >= bounded_max:
+                break
+            papers.append(_process_paper(paper))
+        return {"status": "success", "total_results": len(papers), "papers": papers}
+    except arxiv.ArxivError as exc:
+        return {"status": "error", "message": f"ArXiv API error - {exc}"}
+    except Exception as exc:
+        return {"status": "error", "message": str(exc)}
+
+
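+# Runs in a worker thread: extract text page by page with pypdf, write the
+# markdown alongside the PDF, then delete the PDF to reclaim space.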
+def _convert_pdf_to_markdown(paper_id: str, pdf_path: Path) -> None:
+    try:
+        reader = PdfReader(str(pdf_path))
+        chunks: list[str] = []
+        for index, page in enumerate(reader.pages, start=1):
+            text = page.extract_text() or ""
+            chunks.append(f"# Page {index}\n\n{text.strip()}\n")
+        markdown = "\n".join(chunks).strip()
+        if not markdown:
+            markdown = "No extractable text found in PDF."
+        md_path = _get_paper_path(paper_id, ".md")
+        md_path.write_text(markdown, encoding="utf-8")
+        status = _conversion_statuses.get(paper_id)
+        if status:
+            status.status = "success"
+            status.completed_at = datetime.now()
+        try:
+            pdf_path.unlink(missing_ok=True)
+        except Exception:
+            pass
+    except Exception as exc:
+        status = _conversion_statuses.get(paper_id)
+        if status:
+            status.status = "error"
+            status.completed_at = datetime.now()
+            status.error = str(exc)
+
+
+async def download_paper(*, paper_id: str, check_status: bool = False) -> dict[str, Any]:
+    try:
+        if check_status:
+            status = _conversion_statuses.get(paper_id)
+            if not status:
+                if _get_paper_path(paper_id, ".md").exists():
+                    return {
+                        "status": "success",
+                        "message": "Paper is ready",
+                        "resource_uri": f"file://{_get_paper_path(paper_id, '.md')}",
+                    }
+                return {"status": "unknown", "message": "No download or conversion in progress"}
+            return {
+                "status": status.status,
+                "started_at": status.started_at.isoformat(),
+                "completed_at": status.completed_at.isoformat() if status.completed_at else None,
+                "error": status.error,
+                "message": f"Paper conversion {status.status}",
+            }
+
+        if _get_paper_path(paper_id, ".md").exists():
+            return {
+                "status": "success",
+                "message": "Paper already available",
+                "resource_uri": f"file://{_get_paper_path(paper_id, '.md')}",
+            }
+
+        if paper_id in _conversion_statuses:
+            status = _conversion_statuses[paper_id]
+            return {
+                "status": status.status,
+                "message": f"Paper conversion {status.status}",
+                "started_at": status.started_at.isoformat(),
+            }
+
+        pdf_path = _get_paper_path(paper_id, ".pdf")
+        client = arxiv.Client()
+        _conversion_statuses[paper_id] = ConversionStatus(
+            paper_id=paper_id,
+            status="downloading",
+            started_at=datetime.now(),
+        )
+
+        paper = next(client.results(arxiv.Search(id_list=[paper_id])))
+        paper.download_pdf(dirpath=pdf_path.parent, filename=pdf_path.name)
+
+        status = _conversion_statuses[paper_id]
+        status.status = "converting"
+        # Keep a reference to the task so it is not garbage-collected mid-flight.
+        task = asyncio.create_task(asyncio.to_thread(_convert_pdf_to_markdown, paper_id, pdf_path))
+        _conversion_tasks.add(task)
+        task.add_done_callback(_conversion_tasks.discard)
+        return {
+            "status": "converting",
+            "message": "Paper downloaded, conversion started",
+            "started_at": status.started_at.isoformat(),
+        }
+    except StopIteration:
+        # Drop the stale "downloading" status so a retry can start fresh.
+        _conversion_statuses.pop(paper_id, None)
+        return {"status": "error", "message": f"Paper {paper_id} not found on arXiv"}
+    except Exception as exc:
+        _conversion_statuses.pop(paper_id, None)
+        return {"status": "error", "message": f"Error: {exc}"}
+
+
+async def list_papers() -> dict[str, Any]:
+    try:
+        paper_ids = list_stored_paper_ids()
+        if not paper_ids:
+            return {"status": "success", "total_papers": 0, "papers": []}
+
+        client = arxiv.Client()
+        results = client.results(arxiv.Search(id_list=paper_ids))
+        papers: list[dict[str, Any]] = []
+        for result in results:
+            papers.append(
+                {
+                    "id": result.get_short_id(),
+                    "title": result.title,
+                    "summary": result.summary,
+                    "authors": [author.name for author in result.authors],
+                    "links": [link.href for link in result.links],
+                    "pdf_url": result.pdf_url,
+                }
+            )
+        return {"status": "success", "total_papers": len(paper_ids), "papers": papers}
+    except Exception as exc:
+        return {"status": "error", "message": f"Error: {exc}"}
+
+
+async def read_paper(*, paper_id: str) -> dict[str, Any]:
+    try:
+        paper_path = _get_paper_path(paper_id, ".md")
+        if not paper_path.exists():
+            return {
+                "status": "error",
+                "message": f"Paper {paper_id} not found in storage. You may need to download it first using download_paper.",
+            }
+        content = paper_path.read_text(encoding="utf-8")
+        return {"status": "success", "paper_id": paper_id, "content": content}
+    except Exception as exc:
+        return {"status": "error", "message": f"Error reading paper: {exc}"}
+
+
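+# Shared instruction block appended to every deep-analysis prompt; the tool
+# list mirrors the four tools registered in arxiv_foreground.py.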
+DEEP_PAPER_ANALYSIS_PROMPT = """
+You are an AI research assistant tasked with analyzing academic papers from arXiv.
+
+AVAILABLE TOOLS:
+1. read_paper: retrieve full content of a downloaded paper
+2. download_paper: download and convert a paper if not present
+3. search_papers: find related papers
+4. list_papers: inspect which papers are already downloaded
+
+WORKFLOW:
+- First list_papers, then download_paper if needed, then read_paper.
+- If the paper is unavailable, use search_papers for related work and context.
+
+ANALYSIS STRUCTURE:
+1) Executive Summary
+2) Research Context
+3) Methodology Analysis
+4) Results Analysis
+5) Practical Implications
+6) Theoretical Implications
+7) Future Directions
+8) Broader Impact
+
+Keep the analysis technically rigorous, concise, and actionable.
+"""
+
+
+def build_deep_analysis_prompt(paper_id: str) -> str:
+    return (
+        f"Analyze paper {paper_id}.\n\n"
+        "Present your analysis with:\n"
+        "1. Executive Summary (3-5 sentences)\n"
+        "2. Detailed Analysis\n"
+        "3. Visual Breakdown (figures/tables)\n"
+        "4. Related Work Map\n"
+        "5. Implementation Notes\n\n"
+        f"{DEEP_PAPER_ANALYSIS_PROMPT}"
+    )
diff --git a/app-store/arxiv/icon.png b/app-store/arxiv/icon.png
new file mode 100644
index 0000000..903f6b5
Binary files /dev/null and b/app-store/arxiv/icon.png differ
diff --git a/app-store/arxiv/truffile.yaml b/app-store/arxiv/truffile.yaml
new file mode 100644
index 0000000..902dfae
--- /dev/null
+++ b/app-store/arxiv/truffile.yaml
@@ -0,0 +1,87 @@
+metadata:
+  name: ArXiv
+  bundle_id: org.deepshard.arxiv
+  description: |
+    Search arXiv research papers, read downloaded papers, and get background recommendations.
+  icon_file: ./icon.png
+background:
+  process:
+    cmd:
+      - python
+      - arxiv_background.py
+    working_directory: /
+    environment:
+      PYTHONUNBUFFERED: "1"
+  default_schedule:
+    type: interval
+    interval:
+      duration: 30m
+      prod_duration: 60m
+    daily_window: "00:00-23:59"
+foreground:
+  process:
+    cmd:
+      - python
+      - arxiv_foreground.py
+    working_directory: /
+    environment:
+      PYTHONUNBUFFERED: "1"
+
+steps:
+  - name: Install Python dependencies
+    type: bash
+    run: |
+      pip install --no-cache-dir \
+        "arxiv>=2.1.0" \
+        "httpx>=0.24.0" \
+        "python-dateutil>=2.8.2" \
+        "pypdf>=5.0.0"
+
+  - name: Copy application files
+    type: files
+    files:
+      - source: ./arxiv_common.py
+        destination: ./arxiv_common.py
+      - source: ./arxiv_tools.py
+        destination: ./arxiv_tools.py
+      - source: ./arxiv_bg_worker.py
+        destination: ./arxiv_bg_worker.py
+      - source: ./arxiv_foreground.py
+        destination: ./arxiv_foreground.py
+      - source: ./arxiv_background.py
+        destination: ./arxiv_background.py
+
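+  # The field validator runs arxiv_background.py in verify mode, which fails
+  # until ARXIV_RESEARCH_INTERESTS parses to at least one interest.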
+  - name: Configure research interests
+    type: text
+    ui_state_on_show: user_interaction_ready
+    ui_state_on_complete: move_to_background
+    content: |
+      Tell Truffle what research areas you care about.
+
+      The background worker will use these interests to find relevant papers and
+      submit context recommending which papers the model should summarize.
+
+      Examples:
+        - multi-agent systems, reinforcement learning, LLM safety
+        - cs.AI, cs.LG, retrieval augmented generation
+        - diffusion models, interpretability, robustness
+    fields:
+      - name: research_interests
+        label: Research Interests (comma-separated)
+        type: text
+        placeholder: multi-agent systems, reinforcement learning, LLM safety
+        env: ARXIV_RESEARCH_INTERESTS
+        validator:
+          type: bash
+          run: |
+            python ./arxiv_background.py --verify
+          timeout: 60
+          error_message: |
+            Please enter at least one research interest (comma-separated).
+
+  - name: Finalize installation
+    type: bash
+    run: |
+      mkdir -p /root/.arxiv-mcp-server/papers
+      mkdir -p /root/.arxiv-truffle
+      echo "ArXiv MCP installation complete!"
diff --git a/tests/test_transport_client.py b/tests/test_transport_client.py
new file mode 100644
index 0000000..db2700d
--- /dev/null
+++ b/tests/test_transport_client.py
@@ -0,0 +1,44 @@
+import asyncio
+import sys
+from pathlib import Path
+from unittest.mock import Mock
+
+import truffile
+from truffile.transport.client import GRPC_MAX_MESSAGE_BYTES, TruffleClient
+
+
+class _ReadyChannel:
+    async def channel_ready(self) -> None:
+        return None
+
+
+def test_connect_sets_grpc_message_size_limits(monkeypatch):
+    captured = {}
+    fake_channel = _ReadyChannel()
+
+    def fake_insecure_channel(address, options=None):
+        captured["address"] = address
+        captured["options"] = options
+        return fake_channel
+
+    monkeypatch.setattr("truffile.transport.client.aio.insecure_channel", fake_insecure_channel)
+    monkeypatch.setattr("truffile.transport.client.TruffleOSStub", Mock(return_value="stub"))
+
+    client = TruffleClient("127.0.0.1:80", token="token")
+    asyncio.run(client.connect())
+
+    assert client.channel is fake_channel
+    assert client.stub == "stub"
+    assert captured["address"] == "127.0.0.1:80"
+    assert captured["options"] == [
+        ("grpc.max_receive_message_length", GRPC_MAX_MESSAGE_BYTES),
+        ("grpc.max_send_message_length", GRPC_MAX_MESSAGE_BYTES),
+    ]
+
+
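+# _ensure_bundled_truffle_on_path() should insert the repo root ahead of any
+# existing entries so the bundled `truffle` package shadows an installed copy.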
os.environ["GRPC_ENABLE_FORK_SUPPORT"] = "false" + +def _ensure_bundled_truffle_on_path() -> None: + repo_root = Path(__file__).resolve().parent.parent + bundled_truffle = repo_root / "truffle" + if not bundled_truffle.is_dir(): + return + + repo_root_str = str(repo_root) + if repo_root_str not in sys.path: + sys.path.insert(0, repo_root_str) + + +_ensure_bundled_truffle_on_path() + try: from ._version import __version__ except ImportError: diff --git a/truffile/_version.py b/truffile/_version.py index d9a7fcc..8565e31 100644 --- a/truffile/_version.py +++ b/truffile/_version.py @@ -28,7 +28,7 @@ commit_id: COMMIT_ID __commit_id__: COMMIT_ID -__version__ = version = '0.1.15.dev20' -__version_tuple__ = version_tuple = (0, 1, 15, 'dev20') +__version__ = version = '0.1.9.dev38' +__version_tuple__ = version_tuple = (0, 1, 9, 'dev38') -__commit_id__ = commit_id = 'g13b0fc12b' +__commit_id__ = commit_id = 'g2aac4c451' diff --git a/truffile/transport/client.py b/truffile/transport/client.py index e88b123..41d6715 100644 --- a/truffile/transport/client.py +++ b/truffile/transport/client.py @@ -27,6 +27,8 @@ from truffle.app.background_pb2 import BackgroundApp, BackgroundAppRuntimePolicy from truffile.schedule import parse_runtime_policy +GRPC_MAX_MESSAGE_BYTES = 32 * 1024 * 1024 + def get_client_metadata() -> ClientMetadata: from truffile import __version__ @@ -82,7 +84,13 @@ def _metadata(self) -> list: return [("session", self.token)] async def connect(self, timeout: float = 15.0): - self.channel = aio.insecure_channel(self.address) + self.channel = aio.insecure_channel( + self.address, + options=[ + ("grpc.max_receive_message_length", GRPC_MAX_MESSAGE_BYTES), + ("grpc.max_send_message_length", GRPC_MAX_MESSAGE_BYTES), + ], + ) await asyncio.wait_for(self.channel.channel_ready(), timeout=timeout) self.stub = TruffleOSStub(self.channel)