Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/core.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Paperstack Core
name: Arxivist Core

on:
workflow_call:
Expand Down Expand Up @@ -29,11 +29,11 @@ jobs:
python -m pip install --upgrade pip
pip install -r requirements.txt

- name: Run paperstack
- name: Run arxivist
env:
OPENAI_API_TOKEN: ${{ secrets.OPENAI_API_TOKEN }}
run: |
python paperstack.py \
python arxivist.py \
--output-csv "papers.csv" \
${{ inputs.search-arxiv == true && '--search-arxiv' || '' }} \
${{ inputs.search-scholar == true && '--search-semantic-scholar' || '' }}
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/csv.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Paperstack (CSV)
name: Arxivist (CSV)

on:
workflow_dispatch:
Expand Down Expand Up @@ -36,6 +36,6 @@ jobs:
# Create new release
gh release create latest-papers \
--title "Latest Research Papers" \
--notes "Latest research papers from PaperStack" \
--notes "Latest research papers from Arxivist" \
research-papers/papers.csv \
--latest
2 changes: 1 addition & 1 deletion .github/workflows/long.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Paperstack (Long)
name: Arxivist (Long)

on:
schedule:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/manual.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Paperstack (Manual)
name: Arxivist (Manual)

on:
workflow_dispatch:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/short.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Paperstack (Short)
name: Arxivist (Short)

on:
schedule:
Expand Down
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,11 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/


# Obsidian
/obsidian/

# Papers
papers.csv
papers.md
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# paperstack
# Arxivist

Paperstack uses ArXiv and Semantic Scholar (relational) to sync academic paper information into a Notion DB. It also has some lightweight uses of OpenAI models for summarization and categorization. It was built for gathering machine learning and security related papers, but could be adapted easily to any other subject (`ARXIV_SEARCH`/`--arxiv-search-query`). It's deplyoment is focused on Github actions, but can be executed on the command line directly. It can also detect partial entries (ArXiv link or title) in the Notion DB and fill in the remaining information.
Arxivist uses ArXiv and Semantic Scholar (relational) to sync academic paper information into a Notion DB. It also has some lightweight uses of OpenAI models for summarization and categorization. It was built for gathering machine learning and security related papers, but could be adapted easily to any other subject (`ARXIV_SEARCH`/`--arxiv-search-query`). Its deployment is focused on GitHub Actions, but it can be executed on the command line directly. It can also detect partial entries (ArXiv link or title) in the Notion DB and fill in the remaining information.

The Notion DB requires a semi-fixed structure as a function of the syncing logic (`notion_utils.py`), and you're free to add columns and custom syncing behavior as needed. Here is the minimum database layout the tool currently expects:

Expand Down
56 changes: 56 additions & 0 deletions obsidian_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import os
from datetime import datetime
from typing import List
from _types import Paper

def _yaml_quote(value: object) -> str:
    """Render *value* as a double-quoted YAML scalar.

    A JSON string literal is also a valid YAML double-quoted scalar, so
    ``json.dumps`` gives correct escaping for quotes, backslashes and
    newlines without requiring a YAML library.
    """
    import json

    return json.dumps(str(value), ensure_ascii=False)


def write_papers_to_obsidian(output_dir: str, papers: "List[Paper]") -> None:
    """Write one Obsidian markdown note per paper into *output_dir*.

    Each note starts with a YAML frontmatter block (title, and when present
    url, date, authors, focus) followed by optional ``## Summary`` and
    ``## Abstract`` sections. Papers without a title are skipped because the
    title is what the note's filename is derived from.

    Args:
        output_dir: Directory to write notes into; created if missing.
        papers: Papers to export. Each is read for ``title``, ``url``,
            ``published``, ``authors``, ``focus``, ``summary`` and
            ``abstract``.
    """
    os.makedirs(output_dir, exist_ok=True)

    for paper in papers:
        if not paper.title:
            continue

        # Create a safe filename from the title: anything that is not
        # alphanumeric, a space or a hyphen becomes an underscore.
        safe_title = "".join(
            c if c.isalnum() or c in (" ", "-") else "_" for c in paper.title
        )
        filename = os.path.join(output_dir, f"{safe_title}.md")

        # Free-text values are quoted: paper titles routinely contain ": ",
        # quotes or line breaks, which would otherwise corrupt the YAML.
        frontmatter = ["---", f"title: {_yaml_quote(paper.title)}"]
        if paper.url:
            frontmatter.append(f"url: {_yaml_quote(paper.url)}")
        if paper.published:
            # Left unquoted so YAML consumers can read it as a date.
            frontmatter.append(f"date: {paper.published.strftime('%Y-%m-%d')}")
        if paper.authors:
            frontmatter.append(f"authors: {_yaml_quote(', '.join(paper.authors))}")
        if paper.focus:
            frontmatter.append(f"focus: {_yaml_quote(paper.focus.value)}")
        frontmatter.append("---")

        with open(filename, "w", encoding="utf-8") as f:
            # Write frontmatter, followed by one blank line
            f.write("\n".join(frontmatter) + "\n\n")

            # Write content
            if paper.summary:
                f.write("## Summary\n")
                f.write(f"{paper.summary}\n\n")

            if paper.abstract:
                f.write("## Abstract\n")
                f.write(f"{paper.abstract}\n")

def _table_cell(value: object) -> str:
    """Return *value* as text that is safe inside one Markdown table cell.

    Whitespace runs (including newlines) are collapsed to single spaces
    because a table row must stay on one line, and pipes are escaped because
    an unescaped pipe starts a new column. Missing or empty values become
    ``"N/A"``.
    """
    text = " ".join(str(value).split()) if value else ""
    return text.replace("|", "\\|") or "N/A"


def write_papers_table_to_markdown(output_file: str, papers: "List[Paper]") -> None:
    """Write a Markdown table of papers to *output_file*.

    The table has the columns Title, Authors, Published, URL, Summary and
    Focus. Titles are emitted as Obsidian wiki-links that target the note
    filename produced by ``write_papers_to_obsidian`` (non-alphanumeric
    characters other than space and hyphen replaced by ``_``), with the real
    title as the link's display text when the two differ.

    Args:
        output_file: Path of the Markdown file to (over)write.
        papers: Papers to list, one table row each. Each is read for
            ``title``, ``authors``, ``published``, ``url``, ``summary`` and
            ``focus``.
    """
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("# Arxivist Papers\n\n")
        f.write("| Title | Authors | Published | URL | Summary | Focus |\n")
        f.write("|-------|---------|-----------|-----|---------|-------|\n")

        for paper in papers:
            if paper.title:
                # Mirror the note-filename rule so the internal link resolves.
                note_name = "".join(
                    c if c.isalnum() or c in (" ", "-") else "_"
                    for c in paper.title
                )
                label = _table_cell(paper.title)
                if label == note_name:
                    title_link = f"[[{note_name}]]"
                else:
                    # Inside a table the alias separator must be written
                    # as "\|" or it would split the cell.
                    title_link = f"[[{note_name}\\|{label}]]"
            else:
                title_link = "N/A"

            authors = _table_cell(", ".join(paper.authors) if paper.authors else None)
            published_date = (
                paper.published.strftime("%Y-%m-%d") if paper.published else "N/A"
            )
            url = _table_cell(paper.url)
            summary = _table_cell(paper.summary)
            focus = _table_cell(paper.focus.value if paper.focus else None)

            f.write(
                f"| {title_link} | {authors} | {published_date} "
                f"| {url} | {summary} | {focus} |\n"
            )
51 changes: 34 additions & 17 deletions paperstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
summarize_abstract_with_openai,
)
from scholar_utils import get_recommended_arxiv_ids_from_semantic_scholar
from obsidian_utils import write_papers_table_to_markdown

ARXIV_SEARCH = """\
"adversarial attacks" OR "language model attacks" OR "LLM vulnerabilities" OR \
Expand All @@ -29,18 +30,29 @@ def main():
parser.add_argument(
"--openai-token",
type=str,
default=os.environ.get("OPENAI_API_TOKEN"),
help="OpenAI token",
default=None,
help="OpenAI token (optional)",
)
parser.add_argument("--arxiv-search-query", type=str, default=ARXIV_SEARCH)
parser.add_argument("--search-arxiv", action="store_true", default=False)
parser.add_argument("--search-semantic-scholar", action="store_true", default=False)
parser.add_argument(
"--output-obsidian",
type=str,
default="papers.md",
help="Path to output Obsidian folder"
)

args = parser.parse_args()

print("[+] Paperstack")
print("[+] Arxivist")

openai_client = get_openai_client(args.openai_token)
openai_client = None
if args.openai_token:
openai_client = get_openai_client(args.openai_token)
print(" |- OpenAI client initialized")
else:
print(" |- No OpenAI token provided; skipping OpenAI operations")

print(f" |- Reading existing papers from CSV [{args.output_csv}]")
papers = get_papers_from_csv(args.output_csv)
Expand Down Expand Up @@ -69,24 +81,29 @@ def main():
else:
print(" |- All papers have been explored")

if not all([paper.summary for paper in papers]):
print(" |- Building summaries with OpenAI")
for paper in [p for p in papers if not p.summary and p.abstract]:
print(f" |- {paper.title[:50]}...")
paper.summary = summarize_abstract_with_openai(
openai_client, paper.abstract
)

if not all([paper.focus for paper in papers]):
print(" |- Assigning focus labels with OpenAI")
for paper in [p for p in papers if not p.focus and p.abstract]:
paper.focus = get_focus_label_from_abstract(openai_client, paper.abstract)
print(f" |- {paper.focus}")
if openai_client:
print(" |- Generating summaries and focus labels using OpenAI")
for paper in papers:
if paper.abstract:
paper.summary = summarize_abstract_with_openai(openai_client, paper.abstract)
paper.focus = get_focus_label_from_abstract(openai_client, paper.abstract)
else:
print(" |- Skipping summary generation as no OpenAI token was provided")

print(f" |- Writing papers to CSV [{args.output_csv}]")
write_papers_to_csv(args.output_csv, papers)
print(f" |- Done! Saved {len(papers)} papers to {args.output_csv}")

if args.output_obsidian:
print(f" |- Writing papers to Obsidian format [{args.output_obsidian}]")
from obsidian_utils import write_papers_to_obsidian
write_papers_to_obsidian(args.output_obsidian, papers)

# Create the arxivist-papers.md file
arxivist_file_path = os.path.join(args.output_obsidian, "arxivist-papers.md")
print(f" |- Writing papers table to Markdown file [{arxivist_file_path}]")
write_papers_table_to_markdown(arxivist_file_path, papers)


if __name__ == "__main__":
main()