# Patch summary: overrides adapters/zhihu/download.yaml (author extraction fix, issue #40, plus zhida link stripping).
#
# What the hunks below change:
#   * Header: adds an "Override" comment line under the OpenCLI attribution.
#   * Config: `columns` gains `path`; the navigate step's `settleMs` goes from 3000 to 5000.
#   * evaluate, redirect guard: returns a failure object early when the hostname is
#     www.zhihu.com and the path starts with neither /p/ nor /question/.
#   * evaluate, author: the single combined selector is replaced by an ordered fallback chain:
#     .AuthorInfo-name, then .UserLink-link, then meta[itemprop="author"], then meta[name="author"],
#     then the `name` of the first entry of initialState.entities.users parsed from
#     script#js-initialData, and finally the literal 'unknown'. A JSON.parse failure in the
#     last fallback is caught and ignored so the chain falls through to 'unknown'.
#   * evaluate, missing content: the failure return changes from a one-element array to a plain
#     object and gains the keys filename, imageUrls, content and output.
#   * htmlToMd, <a> handling: anchors whose href contains 'zhida.zhihu.com' are emitted as their
#     text only; all other anchors keep the [text](href) form.
#   * The hunks at old lines 56 and 83 only delete one explanatory comment each.
#
# NOTE(review): both failure returns read `args.output` inside the evaluate body, but no declaration
#   of `args` is visible in these hunks. Confirm it is in scope where this script runs; otherwise
#   the failure paths would throw instead of returning.
# NOTE(review): the failure return shape changes from array to object. Confirm the downstream
#   pipeline steps (not shown here) accept a bare object.
# NOTE(review): `columns` now lists `path`, yet neither failure object visible here sets a `path`
#   key. Presumably a later step adds it; verify.
# NOTE(review): the initialData fallback takes the first value of `entities.users`. Confirm that
#   entry is the article author and not another user present in the page state.
# NOTE(review): the zhida check is a substring match on the whole href, so it would also match a
#   URL that merely carries 'zhida.zhihu.com' in its path or query. Confirm that is acceptable.
# NOTE(review): in this copy the patch's line breaks are collapsed onto two lines. Restore the
#   original newlines before trying to apply it.
diff --git a/adapters/zhihu/download.yaml b/adapters/zhihu/download.yaml index 7944c58..bd7799b 100644 --- a/adapters/zhihu/download.yaml +++ b/adapters/zhihu/download.yaml @@ -1,5 +1,6 @@ # Adapter logic derived from OpenCLI (https://github.com/jackwener/opencli) # Original author: jackwener | License: Apache-2.0 +# Override: fix author extraction (issue #40) + strip zhida links site: zhihu name: download description: 导出知乎文章为 Markdown 格式 @@ -17,21 +18,45 @@ args: default: ./zhihu-articles description: Output directory -columns: [title, author, status, size] +columns: [title, author, status, path, size] pipeline: - navigate: url: "${{ args.url }}" - settleMs: 3000 + settleMs: 5000 - evaluate: | (() => { + // Detect redirect to homepage (article not found / removed) + if (location.hostname === 'www.zhihu.com' && !location.pathname.startsWith('/p/') && !location.pathname.startsWith('/question/')) { + return { title: 'error', author: '', status: 'failed', size: 'Article not found (redirected to homepage)', filename: 'error.md', imageUrls: [], content: 'Article not found or removed', output: args.output || './zhihu-articles' }; + } + const title = document.querySelector('.Post-Title, h1.ContentItem-title, .ArticleTitle, .QuestionHeader-title')?.textContent?.trim() || 'untitled'; - const author = document.querySelector('.AuthorInfo-name, .UserLink-link')?.textContent?.trim() || 'unknown'; + + // Fixed author extraction with fallbacks (issue #40) + const author = + document.querySelector('.AuthorInfo-name')?.textContent?.trim() || + document.querySelector('.UserLink-link')?.textContent?.trim() || + document.querySelector('meta[itemprop="author"]')?.getAttribute('content') || + document.querySelector('meta[name="author"]')?.getAttribute('content') || + (() => { + try { + const s = document.querySelector('script#js-initialData'); + if (s) { + const data = JSON.parse(s.textContent); + const users = data?.initialState?.entities?.users; + if (users) return 
Object.values(users)[0]?.name || ''; + } + } catch(e) {} + return ''; + })() || + 'unknown'; + const timeEl = document.querySelector('.ContentItem-time, .Post-Time'); const publishTime = timeEl?.textContent?.trim() || ''; const contentEl = document.querySelector('.Post-RichTextContainer, .RichContent-inner, .RichText'); - if (!contentEl) return [{ title, author, status: 'failed', size: 'No content found' }]; + if (!contentEl) return { title, author, status: 'failed', size: 'No content found', filename: 'error.md', imageUrls: [], content: 'Failed to extract content', output: args.output || './zhihu-articles' }; function htmlToMd(el) { let md = ''; @@ -46,7 +71,15 @@ pipeline: else if (tag === 'br') md += '\n'; else if (tag === 'strong' || tag === 'b') md += '**' + node.textContent + '**'; else if (tag === 'em' || tag === 'i') md += '*' + node.textContent + '*'; - else if (tag === 'a') md += '[' + node.textContent + '](' + (node.href || '') + ')'; + else if (tag === 'a') { + const href = node.href || ''; + // Strip zhida.zhihu.com links, keep display text only + if (href.includes('zhida.zhihu.com')) { + md += node.textContent; + } else { + md += '[' + node.textContent + '](' + href + ')'; + } + } else if (tag === 'img') { const src = node.getAttribute('data-original') || node.getAttribute('data-actualsrc') || node.getAttribute('data-src') || node.getAttribute('data-lazy-src') || node.src || ''; if (src && !src.includes('data:image')) { @@ -56,7 +89,6 @@ pipeline: } } else if (tag === 'noscript') { - // zhihu wraps real img inside noscript for lazy loading const inner = node.textContent || node.innerHTML || ''; const srcMatch = inner.match(/src=["']([^"']+)["']/); if (srcMatch) { @@ -83,7 +115,6 @@ pipeline: const markdown = htmlToMd(contentEl); - // Extract image URLs from the generated markdown to ensure exact match const imageUrls = []; const imgRe = /!\[([^\]]*)\]\(([^)]+)\)/g; let m;