Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 38 additions & 7 deletions adapters/zhihu/download.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Adapter logic derived from OpenCLI (https://github.com/jackwener/opencli)
# Original author: jackwener | License: Apache-2.0
# Override: fix author extraction (issue #40) and strip zhida.zhihu.com links, keeping only their display text
site: zhihu
name: download
description: 导出知乎文章为 Markdown 格式
Expand All @@ -17,21 +18,45 @@ args:
default: ./zhihu-articles
description: Output directory

columns: [title, author, status, size]
columns: [title, author, status, path, size]

pipeline:
- navigate:
url: "${{ args.url }}"
settleMs: 3000
settleMs: 5000

- evaluate: |
(() => {
// Detect redirect to homepage (article not found / removed)
if (location.hostname === 'www.zhihu.com' && !location.pathname.startsWith('/p/') && !location.pathname.startsWith('/question/')) {
return { title: 'error', author: '', status: 'failed', size: 'Article not found (redirected to homepage)', filename: 'error.md', imageUrls: [], content: 'Article not found or removed', output: args.output || './zhihu-articles' };
}

const title = document.querySelector('.Post-Title, h1.ContentItem-title, .ArticleTitle, .QuestionHeader-title')?.textContent?.trim() || 'untitled';
const author = document.querySelector('.AuthorInfo-name, .UserLink-link')?.textContent?.trim() || 'unknown';

// Fixed author extraction with fallbacks (issue #40)
// Resolve the author's display name (issue #40) by trying several sources in
// order; the first non-empty value wins:
//   1. text of the `.AuthorInfo-name` element
//   2. text of the `.UserLink-link` element
//   3. `content` of <meta itemprop="author">, then <meta name="author">
//   4. `name` of the first entry under initialState.entities.users in the
//      JSON held by <script id="js-initialData">
// Falls back to the literal 'unknown' when every source is missing or blank.
const author =
  document.querySelector('.AuthorInfo-name')?.textContent?.trim() ||
  document.querySelector('.UserLink-link')?.textContent?.trim() ||
  // Trim the meta values too: an untrimmed whitespace-only `content` is truthy
  // and would otherwise be returned as the author instead of falling through.
  document.querySelector('meta[itemprop="author"]')?.getAttribute('content')?.trim() ||
  document.querySelector('meta[name="author"]')?.getAttribute('content')?.trim() ||
  (() => {
    try {
      const s = document.querySelector('script#js-initialData');
      if (s) {
        const data = JSON.parse(s.textContent);
        const users = data?.initialState?.entities?.users;
        // NOTE(review): this takes the first user entity; presumably that is
        // the article author rather than another user on the page — confirm.
        if (users) return Object.values(users)[0]?.name || '';
      }
    } catch {
      // Deliberate best-effort: a missing or malformed initial-data payload
      // must not abort extraction, so fall through to the empty result below.
    }
    return '';
  })() ||
  'unknown';

const timeEl = document.querySelector('.ContentItem-time, .Post-Time');
const publishTime = timeEl?.textContent?.trim() || '';
const contentEl = document.querySelector('.Post-RichTextContainer, .RichContent-inner, .RichText');
if (!contentEl) return [{ title, author, status: 'failed', size: 'No content found' }];
if (!contentEl) return { title, author, status: 'failed', size: 'No content found', filename: 'error.md', imageUrls: [], content: 'Failed to extract content', output: args.output || './zhihu-articles' };

function htmlToMd(el) {
let md = '';
Expand All @@ -46,7 +71,15 @@ pipeline:
else if (tag === 'br') md += '\n';
else if (tag === 'strong' || tag === 'b') md += '**' + node.textContent + '**';
else if (tag === 'em' || tag === 'i') md += '*' + node.textContent + '*';
else if (tag === 'a') md += '[' + node.textContent + '](' + (node.href || '') + ')';
else if (tag === 'a') {
const href = node.href || '';
// Strip zhida.zhihu.com links, keep display text only
if (href.includes('zhida.zhihu.com')) {
md += node.textContent;
} else {
md += '[' + node.textContent + '](' + href + ')';
}
}
else if (tag === 'img') {
const src = node.getAttribute('data-original') || node.getAttribute('data-actualsrc') || node.getAttribute('data-src') || node.getAttribute('data-lazy-src') || node.src || '';
if (src && !src.includes('data:image')) {
Expand All @@ -56,7 +89,6 @@ pipeline:
}
}
else if (tag === 'noscript') {
// zhihu wraps real img inside noscript for lazy loading
const inner = node.textContent || node.innerHTML || '';
const srcMatch = inner.match(/src=["']([^"']+)["']/);
if (srcMatch) {
Expand All @@ -83,7 +115,6 @@ pipeline:

const markdown = htmlToMd(contentEl);

// Extract image URLs from the generated markdown to ensure exact match
const imageUrls = [];
const imgRe = /!\[([^\]]*)\]\(([^)]+)\)/g;
let m;
Expand Down