From cab4a989390923270803e50d0fa79537af86ac9b Mon Sep 17 00:00:00 2001 From: Oscar Valdez Date: Wed, 26 Feb 2025 14:03:47 -0600 Subject: [PATCH 1/3] Add obsidian option and updated gitignore --- .gitignore | 8 ++++++++ obsidian_utils.py | 39 +++++++++++++++++++++++++++++++++++++++ paperstack.py | 43 +++++++++++++++++++++++++++---------------- 3 files changed, 74 insertions(+), 16 deletions(-) create mode 100644 obsidian_utils.py diff --git a/.gitignore b/.gitignore index 68bc17f..fce4bf2 100644 --- a/.gitignore +++ b/.gitignore @@ -158,3 +158,11 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ + + +# Obsidian +/obsidian/ + +# Papers +papers.csv +papers.md diff --git a/obsidian_utils.py b/obsidian_utils.py new file mode 100644 index 0000000..0bd7f35 --- /dev/null +++ b/obsidian_utils.py @@ -0,0 +1,39 @@ +import os +from datetime import datetime +from typing import List +from _types import Paper + +def write_papers_to_obsidian(output_dir: str, papers: List[Paper]) -> None: + """Write papers to Obsidian markdown files.""" + os.makedirs(output_dir, exist_ok=True) + + for paper in papers: + if not paper.title: + continue + + # Create a safe filename from the title + safe_title = "".join(c if c.isalnum() or c in (' ', '-') else '_' for c in paper.title) + filename = os.path.join(output_dir, f"{safe_title}.md") + + with open(filename, 'w', encoding='utf-8') as f: + # Write frontmatter + f.write("---\n") + f.write(f"title: {paper.title}\n") + if paper.url: + f.write(f"url: {paper.url}\n") + if paper.published: + f.write(f"date: {paper.published.strftime('%Y-%m-%d')}\n") + if paper.authors: + f.write(f"authors: {', '.join(paper.authors)}\n") + if paper.focus: + f.write(f"focus: {paper.focus.value}\n") + f.write("---\n\n") + + # Write content + if paper.summary: + f.write("## Summary\n") + f.write(f"{paper.summary}\n\n") 
+ + if paper.abstract: + f.write("## Abstract\n") + f.write(f"{paper.abstract}\n") \ No newline at end of file diff --git a/paperstack.py b/paperstack.py index 2423d51..24abeee 100644 --- a/paperstack.py +++ b/paperstack.py @@ -29,18 +29,29 @@ def main(): parser.add_argument( "--openai-token", type=str, - default=os.environ.get("OPENAI_API_TOKEN"), - help="OpenAI token", + default=None, + help="OpenAI token (optional)", ) parser.add_argument("--arxiv-search-query", type=str, default=ARXIV_SEARCH) parser.add_argument("--search-arxiv", action="store_true", default=False) parser.add_argument("--search-semantic-scholar", action="store_true", default=False) + parser.add_argument( + "--output-obsidian", + type=str, + default="papers.md", + help="Path to output Obsidian folder" + ) args = parser.parse_args() print("[+] Paperstack") - openai_client = get_openai_client(args.openai_token) + openai_client = None + if args.openai_token: + openai_client = get_openai_client(args.openai_token) + print(" |- OpenAI client initialized") + else: + print(" |- No OpenAI token provided; skipping OpenAI operations") print(f" |- Reading existing papers from CSV [{args.output_csv}]") papers = get_papers_from_csv(args.output_csv) @@ -69,24 +80,24 @@ def main(): else: print(" |- All papers have been explored") - if not all([paper.summary for paper in papers]): - print(" |- Building summaries with OpenAI") - for paper in [p for p in papers if not p.summary and p.abstract]: - print(f" |- {paper.title[:50]}...") - paper.summary = summarize_abstract_with_openai( - openai_client, paper.abstract - ) - - if not all([paper.focus for paper in papers]): - print(" |- Assigning focus labels with OpenAI") - for paper in [p for p in papers if not p.focus and p.abstract]: - paper.focus = get_focus_label_from_abstract(openai_client, paper.abstract) - print(f" |- {paper.focus}") + if openai_client: + print(" |- Generating summaries and focus labels using OpenAI") + for paper in papers: + if paper.abstract: + 
paper.summary = summarize_abstract_with_openai(openai_client, paper.abstract) + paper.focus = get_focus_label_from_abstract(openai_client, paper.abstract) + else: + print(" |- Skipping summary generation as no OpenAI token was provided") print(f" |- Writing papers to CSV [{args.output_csv}]") write_papers_to_csv(args.output_csv, papers) print(f" |- Done! Saved {len(papers)} papers to {args.output_csv}") + if args.output_obsidian: + print(f" |- Writing papers to Obsidian format [{args.output_obsidian}]") + from obsidian_utils import write_papers_to_obsidian + write_papers_to_obsidian(args.output_obsidian, papers) + if __name__ == "__main__": main() From 219e956f979389a04541cca3d4bcf56454bdcb26 Mon Sep 17 00:00:00 2001 From: Oscar Valdez Date: Wed, 26 Feb 2025 16:26:00 -0600 Subject: [PATCH 2/3] Added table with obsdian option --- obsidian_utils.py | 19 ++++++++++++++++++- paperstack.py | 6 ++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/obsidian_utils.py b/obsidian_utils.py index 0bd7f35..3674ad7 100644 --- a/obsidian_utils.py +++ b/obsidian_utils.py @@ -36,4 +36,21 @@ def write_papers_to_obsidian(output_dir: str, papers: List[Paper]) -> None: if paper.abstract: f.write("## Abstract\n") - f.write(f"{paper.abstract}\n") \ No newline at end of file + f.write(f"{paper.abstract}\n") + +def write_papers_table_to_markdown(output_file: str, papers: List[Paper]) -> None: + """Write a Markdown table of papers to a file.""" + with open(output_file, 'w', encoding='utf-8') as f: + f.write("# Arxivist Papers\n\n") + f.write("| Title | Authors | Published | URL | Summary | Focus |\n") + f.write("|-------|---------|-----------|-----|---------|-------|\n") + + for paper in papers: + # Create an internal link for Obsidian + title_link = f"[[{paper.title}]]" if paper.title else "N/A" + authors = ", ".join(paper.authors) if paper.authors else "N/A" + published_date = paper.published.strftime('%Y-%m-%d') if paper.published else "N/A" + summary = paper.summary if 
paper.summary else "N/A" + focus = paper.focus.value if paper.focus else "N/A" + + f.write(f"| {title_link} | {authors} | {published_date} | {paper.url} | {summary} | {focus} |\n") \ No newline at end of file diff --git a/paperstack.py b/paperstack.py index 24abeee..1dee09f 100644 --- a/paperstack.py +++ b/paperstack.py @@ -9,6 +9,7 @@ summarize_abstract_with_openai, ) from scholar_utils import get_recommended_arxiv_ids_from_semantic_scholar +from obsidian_utils import write_papers_table_to_markdown ARXIV_SEARCH = """\ "adversarial attacks" OR "language model attacks" OR "LLM vulnerabilities" OR \ @@ -98,6 +99,11 @@ def main(): from obsidian_utils import write_papers_to_obsidian write_papers_to_obsidian(args.output_obsidian, papers) + # Create the arxivist-papers.md file + arxivist_file_path = os.path.join(args.output_obsidian, "arxivist-papers.md") + print(f" |- Writing papers table to Markdown file [{arxivist_file_path}]") + write_papers_table_to_markdown(arxivist_file_path, papers) + if __name__ == "__main__": main() From 5f87feb3cb60cabe21a17c83eccd507dbaf0db05 Mon Sep 17 00:00:00 2001 From: Oscar Valdez Date: Wed, 26 Feb 2025 16:31:24 -0600 Subject: [PATCH 3/3] Renaming this to Arxivist --- .github/workflows/core.yml | 6 +++--- .github/workflows/csv.yml | 4 ++-- .github/workflows/long.yml | 2 +- .github/workflows/manual.yml | 2 +- .github/workflows/short.yml | 2 +- README.md | 4 ++-- paperstack.py | 2 +- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml index 3f351fa..a8f0b85 100644 --- a/.github/workflows/core.yml +++ b/.github/workflows/core.yml @@ -1,4 +1,4 @@ -name: Paperstack Core +name: Arxivist Core on: workflow_call: @@ -29,11 +29,11 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt - - name: Run paperstack + - name: Run arxivist env: OPENAI_API_TOKEN: ${{ secrets.OPENAI_API_TOKEN }} run: | - python paperstack.py \ + python arxivist.py \ --output-csv 
"papers.csv" \ ${{ inputs.search-arxiv == true && '--search-arxiv' || '' }} \ ${{ inputs.search-scholar == true && '--search-semantic-scholar' || '' }} diff --git a/.github/workflows/csv.yml b/.github/workflows/csv.yml index a78bdcf..fe52b32 100644 --- a/.github/workflows/csv.yml +++ b/.github/workflows/csv.yml @@ -1,4 +1,4 @@ -name: Paperstack (CSV) +name: Arxivist (CSV) on: workflow_dispatch: @@ -36,6 +36,6 @@ jobs: # Create new release gh release create latest-papers \ --title "Latest Research Papers" \ - --notes "Latest research papers from PaperStack" \ + --notes "Latest research papers from Arxivist" \ research-papers/papers.csv \ --latest diff --git a/.github/workflows/long.yml b/.github/workflows/long.yml index d738784..55863fd 100644 --- a/.github/workflows/long.yml +++ b/.github/workflows/long.yml @@ -1,4 +1,4 @@ -name: Paperstack (Long) +name: Arxivist (Long) on: schedule: diff --git a/.github/workflows/manual.yml b/.github/workflows/manual.yml index 4294884..268f8d6 100644 --- a/.github/workflows/manual.yml +++ b/.github/workflows/manual.yml @@ -1,4 +1,4 @@ -name: Paperstack (Manual) +name: Arxivist (Manual) on: workflow_dispatch: diff --git a/.github/workflows/short.yml b/.github/workflows/short.yml index 31b8b43..f681561 100644 --- a/.github/workflows/short.yml +++ b/.github/workflows/short.yml @@ -1,4 +1,4 @@ -name: Paperstack (Short) +name: Arxivist (Short) on: schedule: diff --git a/README.md b/README.md index e4f586e..83db9d9 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -# paperstack +# Arxivist -Paperstack uses ArXiv and Semantic Scholar (relational) to sync academic paper information into a Notion DB. It also has some lightweight uses of OpenAI models for summarization and categorization. It was built for gathering machine learning and security related papers, but could be adapted easily to any other subject (`ARXIV_SEARCH`/`--arxiv-search-query`). 
It's deplyoment is focused on Github actions, but can be executed on the command line directly. It can also detect partial entries (ArXiv link or title) in the Notion DB and fill in the remaining information. +Arxivist uses ArXiv and Semantic Scholar (relational) to sync academic paper information into a Notion DB. It also has some lightweight uses of OpenAI models for summarization and categorization. It was built for gathering machine learning and security related papers, but could be adapted easily to any other subject (`ARXIV_SEARCH`/`--arxiv-search-query`). Its deployment is focused on GitHub Actions, but can be executed on the command line directly. It can also detect partial entries (ArXiv link or title) in the Notion DB and fill in the remaining information. The Notion DB requires a semi-fixed structure as a function of the syncing logic (`notion_utils.py`), and you're free to add columns and custom syncing behavior as needed. Here is the mininmum database layout the tool currently expects: diff --git a/paperstack.py b/paperstack.py index 1dee09f..94f43f9 100644 --- a/paperstack.py +++ b/paperstack.py @@ -45,7 +45,7 @@ def main(): args = parser.parse_args() - print("[+] Paperstack") + print("[+] Arxivist") openai_client = None if args.openai_token: