diff --git a/.aiignore b/.aiignore
new file mode 100644
index 00000000..c880c25b
--- /dev/null
+++ b/.aiignore
@@ -0,0 +1,15 @@
+node_modules
+coverage
+coverage.lcov
+
+package-lock.json
+pnpm-lock.yaml
+bun.lock
+
+.env
+
+dist
+storage
+
+# AI Session Files (Private Context)
+.sessions
diff --git a/.github/workflows/ci-test.yml b/.github/workflows/ci-test.yml
index 60c2913d..d6c6a5fa 100644
--- a/.github/workflows/ci-test.yml
+++ b/.github/workflows/ci-test.yml
@@ -12,27 +12,25 @@ jobs:
     strategy:
       matrix:
-        node_version: [20.x, 22.x, 24.x]
+        node_version: [22.x, 24.x, 25.x]
 
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
       - name: setup Node.js v${{ matrix.node_version }}
-        uses: actions/setup-node@v4
+        uses: actions/setup-node@v6
         with:
           node-version: ${{ matrix.node_version }}
       - name: run npm scripts
-        env:
-          PROXY_SERVER: ${{ secrets.PROXY_SERVER }}
         run: |
           npm install
           npm run lint
-          npm run build --if-present
+          #npm run build --if-present
           npm run test
       - name: cache node modules
-        uses: actions/cache@v4
+        uses: actions/cache@v5
        with:
          path: ~/.npm
          key: ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') }}
diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index a77d776a..5547051d 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -38,7 +38,7 @@ jobs:
     steps:
     - name: Checkout repository
-      uses: actions/checkout@v4
+      uses: actions/checkout@v6
 
     # Initializes the CodeQL tools for scanning.
     - name: Initialize CodeQL
diff --git a/.gitignore b/.gitignore
index 5b416c34..971acbc7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,7 +16,9 @@ yarn.lock
 coverage.lcov
 pnpm-lock.yaml
 lcov.info
-
+bun.lock
 deno.lock
 evaluation
+
+.sessions
diff --git a/.npmignore b/.npmignore
index f2f3c65a..a0f6f850 100644
--- a/.npmignore
+++ b/.npmignore
@@ -5,3 +5,6 @@ pnpm-lock.yaml
 examples
 test-data
 lcov.info
+
+.aiignore
+.sessions
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 00000000..bfc44e1a
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,113 @@
+# AI Agent Instructions
+
+Coding guidelines for AI agents working in this project.
+
+## Philosophy
+
+- Minimalism. Simple is better. KISS (Keep It Simple, Stupid).
+- Clean code, easy to read, easy to delete.
+- Functional Programming — pure functions, immutability, no side effects.
+- MVP mindset — deliver the smallest thing that works, then iterate.
+
+## Security Rules (CRITICAL — no exceptions)
+
+- NEVER output or request .env and example.env file contents
+- NEVER hardcode API credentials, secret tokens, private keys or passwords in source code
+- NEVER send sensitive data to external AI services
+- Follow `.aiignore` and `.gitignore` for excluded files — do not read or reference them
+- When asking for help, sanitize data (replace real IDs, emails, tokens with placeholders)
+- Do not log sensitive information
+
+## Coding Standards (Strict)
+- Language: JavaScript (ESM syntax). No TypeScript.
+- Style: No semicolons, single quotes, 2-space indentation.
+- Respect `eslint.config.js` — do not suggest rule changes
+- Patterns:
+  - Functional Programming only. No Classes or OOP.
+  - Arrow functions are preferred.
+  - Maximum 3 parameters per function. Use objects for more.
+- Naming: camelCase for variables/functions, SNAKE_CASE for constants.
+- Documentation:
+  - Add JSDoc comments before all functions and exported variables.
+  - Language: Use American English for all comments and JSDoc.
+  - Constraint: NEVER use Vietnamese or other languages in the source code.
+
+### Error Handling
+
+- Handle errors explicitly — never swallow silently
+- Use try/catch with proper logging
+- Return null or throw meaningful errors
+
+```javascript
+export const send = async (params) => {
+  try {
+    const response = await ai.ask(params)
+    logger.info(`send() -> success: ${response.id}`)
+    return response
+  } catch (err) {
+    logger.error(`send() -> failed: ${err.message}`)
+    console.error(err)
+    return null
+  }
+}
+```
+
+## Testing Standards
+
+- Write tests for critical business logic, all error cases
+- Use simple test runners (node:test, bun:test, vitest)
+- No complex mocking frameworks unless necessary
+- Tests live alongside source: `[module].test.js` next to `[module].js`
+
+## Dependency Rules
+
+- Prefer built-in APIs over external packages
+- Before adding a dependency, explain:
+  - Why it is needed
+  - Alternatives considered
+  - Bundle size impact
+- Never add a dependency for trivial utilities
+- Avoid packages with large dependency trees
+
+## Architecture Rules
+
+- Do NOT change existing project architecture without explicit approval
+- Do NOT move or rename core modules unless requested
+- Respect module boundaries
+- Avoid cross-module coupling
+- New modules must follow existing folder structure
+
+## When Making Changes
+
+1. Read existing patterns first
+2. Follow current coding style strictly
+3. Keep dependencies minimal
+4. Handle errors explicitly
+5. Add JSDoc comments for new functions
+6. Run `npm run lint` before committing
+7. Do NOT refactor unrelated code
+8. Do NOT modify working code outside task scope
+9. Prefer minimal diff changes
+10. Preserve existing behavior unless explicitly requested
+
+## When in Doubt
+
+- Ask for clarification before generating code
+- State your assumption explicitly if proceeding without confirmation
+- Prefer doing less and asking over doing more and guessing
+
+## Git Workflow
+
+- Work only inside the current branch
+- Do NOT create or delete branches
+- Do NOT rewrite git history
+- Do NOT modify commit messages
+- Changes must correspond to the current issue
+
+## Agent References
+
+Reference these URLs when working on related topics:
+
+- Bun: https://bun.sh/llms-full.txt
+
+---
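To make the testing convention above concrete, a colocated test for a hypothetical `sum.js` module could look like this (module and assertion are illustrative, not part of this changeset):

```javascript
// sum.test.js — lives next to sum.js, per the colocation rule
import { test } from 'node:test'
import assert from 'node:assert'

import { sum } from './sum.js'

test('sum() adds two numbers', () => {
  assert.equal(sum(2, 3), 5)
})
```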
diff --git a/README.md b/README.md
index 746693fa..70020954 100644
--- a/README.md
+++ b/README.md
@@ -10,16 +10,22 @@ Extract main article, main image and meta data from URL.
 
 ## Demo
 
-- [Give it a try!](https://extractus-demo.vercel.app/article)
+- [Give it a try!](https://extractus.pwshub.com/article)
 
 ## Install
 
 ```bash
-# npm, pnpm, yarn
-npm i @extractus/article-extractor
-
 # bun
 bun add @extractus/article-extractor
+
+# npm
+npm i @extractus/article-extractor
+
+# pnpm
+pnpm install @extractus/article-extractor
+
+# yarn
+yarn add @extractus/article-extractor
 ```
 
 ## Usage
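With any of the installs above, basic usage follows the package's documented API (the URL is a placeholder):

```javascript
import { extract } from '@extractus/article-extractor'

try {
  const article = await extract('https://example.com/any-article')
  console.log(article)
} catch (err) {
  console.error(err)
}
```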
diff --git a/index.d.ts b/index.d.ts
index 3a495780..9cb82521 100644
--- a/index.d.ts
+++ b/index.d.ts
@@ -1,90 +1,140 @@
 // Type definitions
 
-import { IOptions as SanitizeOptions } from "sanitize-html";
+import { IOptions as SanitizeOptions } from 'sanitize-html'
 
+/**
+ * Transformation for per-site HTML pre/post processing.
+ */
 export interface Transformation {
-  patterns: Array<RegExp>,
+  /** URL regex patterns to match */
+  patterns: Array<RegExp>
+  /** Function to pre-process raw HTML before extraction */
   pre?: (document: Document) => Document
+  /** Function to post-process extracted article content */
   post?: (document: Document) => Document
 }
 
-export function addTransformations(transformations: Array<Transformation>): Number;
-export function removeTransformations(options: Array<RegExp>): Number;
-
-export function getSanitizeHtmlOptions(): SanitizeOptions;
-export function setSanitizeHtmlOptions(options: SanitizeOptions): void;
-
 /**
- * @param input url or html
+ * Options for the article extraction process.
  */
-
 export interface ParserOptions {
-  /**
-   * to estimate time to read.
-   * Default: 300
-   */
+  /** Words per minute for time-to-read estimation. Default: 300 */
   wordsPerMinute?: number
-  /**
-   * max num of chars generated for description
-   * Default: 210
-   */
+  /** Max chars for generated description. Default: 210 */
   descriptionTruncateLen?: number
-  /**
-   * min num of chars required for description
-   * Default: 180
-   */
+  /** Min chars required for description. Default: 180 */
   descriptionLengthThreshold?: number
-  /**
-   * min num of chars required for content
-   * Default: 200
-   */
+  /** Min chars required for content. Default: 200 */
   contentLengthThreshold?: number
 }
 
+/**
+ * Proxy configuration for fetching articles.
+ */
 export interface ProxyConfig {
-  target?: string;
-  headers?: Record<string, string>;
+  /** Proxy endpoint URL */
+  target?: string
+  /** Headers for proxy request */
+  headers?: Record<string, string>
 }
 
+/**
+ * Options for the HTTP fetch request.
+ */
 export interface FetchOptions {
-  /**
-   * list of request headers
-   * default: null
-   */
-  headers?: Record<string, string>;
-  /**
-   * the values to configure proxy
-   * default: null
-   */
-  proxy?: ProxyConfig;
-
-  /**
-   * http proxy agent
-   * default: null
-   */
-  agent?: object;
-  /**
-   * signal to terminate request
-   * default: null
-   */
-  signal?: object;
+  /** Custom request headers */
+  headers?: Record<string, string>
+  /** Proxy configuration */
+  proxy?: ProxyConfig
+  /** HTTP proxy agent (e.g. HttpsProxyAgent) */
+  agent?: object
+  /** AbortSignal to cancel the request */
+  signal?: object
 }
 
+/**
+ * Extracted article data structure.
+ */
 export interface ArticleData {
-  url?: string;
-  links?: string[];
-  title?: string;
-  description?: string;
-  image?: string;
-  favicon?: string;
-  author?: string;
-  content?: string;
-  source?: string;
-  published?: string;
-  ttr?: number;
-  type?: string;
+  /** Best resolved URL of the article */
+  url?: string
+  /** Alternative URLs (canonical, shortlink, etc.) */
+  links?: string[]
+  /** Article title */
+  title?: string
+  /** Short description or excerpt */
+  description?: string
+  /** Main image URL */
+  image?: string
+  /** Site favicon URL */
+  favicon?: string
+  /** Author name */
+  author?: string
+  /** Extracted article HTML content */
+  content?: string
+  /** Original publisher/source domain */
+  source?: string
+  /** Publication date string */
+  published?: string
+  /** Estimated time to read in seconds (0 = unknown) */
+  ttr?: number
+  /** Page type (e.g. article) */
+  type?: string
 }
 
-export function extract(input: string, parserOptions?: ParserOptions, fetchOptions?: FetchOptions): Promise<ArticleData>;
+/**
+ * Register one or more transformations for per-site HTML processing.
+ *
+ * @param transformations - Single transformation or array of transformations
+ * @returns Number of transformations successfully added
+ */
+export function addTransformations (transformations: Transformation | Array<Transformation>): number
+
+/**
+ * Remove transformations matching the given patterns.
+ * Calling without arguments removes all transformations.
+ *
+ * @param patterns - URL patterns to match for removal
+ * @returns Number of transformations removed
+ */
+export function removeTransformations (patterns?: Array<RegExp>): number
+
+/**
+ * Get a copy of the current sanitize-html options.
+ */
+export function getSanitizeHtmlOptions (): SanitizeOptions
+
+/**
+ * Update sanitize-html options by merging with the current ones.
+ *
+ * @param options - Partial sanitize options to merge
+ */
+export function setSanitizeHtmlOptions (options: SanitizeOptions): void
 
-export function extractFromHtml(html: string, url?: string, parserOptions?: ParserOptions): Promise<ArticleData>;
+/**
+ * Load and extract article data from a URL or HTML string.
+ *
+ * @param input - URL or HTML string to extract from
+ * @param parserOptions - Options for parsing
+ * @param fetchOptions - Options for HTTP fetch
+ * @returns Extracted article data or null
+ */
+export function extract (
+  input: string,
+  parserOptions?: ParserOptions,
+  fetchOptions?: FetchOptions,
+): Promise<ArticleData | null>
+
+/**
+ * Extract article data from an HTML string directly.
+ *
+ * @param html - Raw HTML content
+ * @param url - Source URL for resolving relative links
+ * @param parserOptions - Options for parsing
+ * @returns Extracted article data or null
+ */
+export function extractFromHtml (
+  html: string,
+  url?: string,
+  parserOptions?: ParserOptions,
+): Promise<ArticleData | null>
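A quick sketch of how `ParserOptions` and `FetchOptions` combine in practice; the 5-second timeout is an arbitrary illustration:

```javascript
import { extract } from '@extractus/article-extractor'

// cancel a slow fetch via AbortSignal, per the FetchOptions typing
const controller = new AbortController()
const timer = setTimeout(() => controller.abort(), 5000)

const article = await extract(
  'https://example.com/some-article',
  { wordsPerMinute: 250 },
  { signal: controller.signal },
)
clearTimeout(timer)
console.log(article?.ttr)
```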
diff --git a/package.json b/package.json
index 8dc8f20c..14fb9893 100644
--- a/package.json
+++ b/package.json
@@ -1,5 +1,5 @@
 {
-  "version": "8.0.20",
+  "version": "8.1.0",
   "name": "@extractus/article-extractor",
   "description": "To extract main article from given URL",
   "homepage": "https://github.com/extractus/article-extractor",
@@ -10,12 +10,12 @@
   "author": "@extractus",
   "main": "./src/main.js",
   "type": "module",
-  "imports": {
-    "cross-fetch": "./src/deno/cross-fetch.js"
-  },
-  "browser": {
-    "cross-fetch": "./src/deno/cross-fetch.js",
-    "linkedom": "./src/browser/linkedom.js"
+  "exports": {
+    ".": {
+      "types": "./index.d.ts",
+      "import": "./src/main.js",
+      "default": "./src/main.js"
+    }
   },
   "types": "./index.d.ts",
   "engines": {
@@ -31,19 +31,23 @@
   },
   "dependencies": {
     "@mozilla/readability": "^0.6.0",
-    "@ndaidong/bellajs": "^12.0.1",
-    "cross-fetch": "^4.1.0",
+    "@pwshub/bellajs": "^13.0.2",
     "linkedom": "^0.18.12",
-    "sanitize-html": "2.17.0"
+    "sanitize-html": "^2.17.3"
   },
   "devDependencies": {
-    "@eslint/js": "^9.34.0",
-    "@types/sanitize-html": "^2.16.0",
-    "eslint": "^9.34.0",
-    "globals": "^16.3.0",
-    "https-proxy-agent": "^7.0.6",
-    "nock": "^14.0.10"
+    "@eslint/js": "^10.0.1",
+    "@types/sanitize-html": "^2.16.1",
+    "eslint": "^10.3.0",
+    "globals": "^17.6.0",
+    "https-proxy-agent": "^9.0.0",
+    "nock": "^14.0.14"
   },
+  "files": [
+    "src",
+    "index.d.ts"
+  ],
+  "sideEffects": false,
   "keywords": [
     "article",
     "extractor",
diff --git a/src/browser/linkedom.js b/src/browser/linkedom.js
deleted file mode 100644
index 6d5be046..00000000
--- a/src/browser/linkedom.js
+++ /dev/null
@@ -1 +0,0 @@
-export const DOMParser = window.DOMParser
diff --git a/src/config.js b/src/config.js
index 50d5d75a..1554625f 100644
--- a/src/config.js
+++ b/src/config.js
@@ -1,7 +1,13 @@
 // config.js
 
-import { clone } from '@ndaidong/bellajs'
+import { clone } from '@pwshub/bellajs'
 
+/**
+ * Default sanitize-html options for cleaning extracted article content.
+ * Defines allowed HTML tags, attributes, and iframe domains.
+ *
+ * @type {SanitizeOptions}
+ */
 const sanitizeHtmlOptions = {
   allowedTags: [
     'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
@@ -10,7 +16,7 @@ const sanitizeHtmlOptions = {
     'details', 'summary',
     'pre', 'code',
     'ul', 'ol', 'li', 'dd', 'dl',
-    'table', 'th', 'tr', 'td', 'thead', 'tbody', 'tfood',
+    'table', 'th', 'tr', 'td', 'thead', 'tbody', 'tfoot',
     'fieldset', 'legend',
     'figure', 'figcaption', 'img', 'picture',
     'video', 'audio', 'source',
@@ -53,12 +59,20 @@
 }
 
 /**
- * @returns {SanitizeOptions}
+ * Get a clone of the current sanitize-html options.
+ *
+ * @returns {SanitizeOptions} Cloned sanitize options
  */
 export const getSanitizeHtmlOptions = () => {
   return clone(sanitizeHtmlOptions)
 }
 
+/**
+ * Update sanitize-html options by merging with the current ones.
+ *
+ * @param {SanitizeOptions} [opts={}] - Partial options to merge
+ * @returns {void}
+ */
 export const setSanitizeHtmlOptions = (opts = {}) => {
   Object.keys(opts).forEach((key) => {
     sanitizeHtmlOptions[key] = clone(opts[key])
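Downstream, these two exports are typically used together to extend rather than replace the defaults, since each provided key overwrites the stored one:

```javascript
import {
  getSanitizeHtmlOptions,
  setSanitizeHtmlOptions
} from '@extractus/article-extractor'

// read the current options first, then merge the extended list back in
const { allowedTags } = getSanitizeHtmlOptions()
setSanitizeHtmlOptions({
  allowedTags: [...allowedTags, 'mark'],
})
```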
diff --git a/src/deno/cross-fetch.js b/src/deno/cross-fetch.js
deleted file mode 100644
index d084f98d..00000000
--- a/src/deno/cross-fetch.js
+++ /dev/null
@@ -1,2 +0,0 @@
-// cross-fetch.js
-export default fetch
diff --git a/src/main.js b/src/main.js
index 7b65fba2..bffaf4a7 100644
--- a/src/main.js
+++ b/src/main.js
@@ -2,20 +2,28 @@
 
 import {
   isString
-} from '@ndaidong/bellajs'
+} from '@pwshub/bellajs'
 
 import retrieve from './utils/retrieve.js'
 import parseFromHtml from './utils/parseFromHtml.js'
 import { getCharset } from './utils/html.js'
 import { isValid as isValidUrl } from './utils/linker.js'
 
+/**
+ * Load and extract article data from a URL or HTML string.
+ *
+ * @param {string} input - URL or HTML string to extract from
+ * @param {ParserOptions} [parserOptions={}] - Options for parsing
+ * @param {FetchOptions} [fetchOptions={}] - Options for HTTP fetch
+ * @returns {Promise<ArticleData|null>} Extracted article data or null
+ */
 export const extract = async (input, parserOptions = {}, fetchOptions = {}) => {
   if (!isString(input)) {
     throw new Error('Input must be a string')
   }
   if (!isValidUrl(input)) {
-    return parseFromHtml(input, null, parserOptions || {})
+    return parseFromHtml(input, null, parserOptions)
   }
   const buffer = await retrieve(input, fetchOptions)
   const text = buffer ? Buffer.from(buffer).toString().trim() : ''
@@ -25,9 +33,17 @@ export const extract = async (input, parserOptions = {}, fetchOptions = {}) => {
   const charset = getCharset(text)
   const decoder = new TextDecoder(charset)
   const html = decoder.decode(buffer)
-  return parseFromHtml(html, input, parserOptions || {})
+  return parseFromHtml(html, input, parserOptions)
 }
 
+/**
+ * Extract article data from an HTML string directly.
+ *
+ * @param {string} html - Raw HTML content
+ * @param {string} [url] - Source URL for resolving relative links
+ * @param {ParserOptions} [parserOptions={}] - Options for parsing
+ * @returns {Promise<ArticleData|null>} Extracted article data or null
+ */
 export const extractFromHtml = async (html, url, parserOptions = {}) => {
   return parseFromHtml(html, url, parserOptions)
 }
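For HTML that is already in hand, `extractFromHtml` skips the network step entirely. A minimal sketch with sample markup; the result may be `null` if the body falls below `contentLengthThreshold`:

```javascript
import { extractFromHtml } from '@extractus/article-extractor'

const html = `
<html>
  <head><title>Hello world</title></head>
  <body><article><p>Long enough body text goes here...</p></article></body>
</html>`

// the url argument is only used to resolve relative links in the output
const article = await extractFromHtml(html, 'https://example.com/hello')
console.log(article)
```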
diff --git a/src/utils/extractLdSchema.js b/src/utils/extractLdSchema.js
index 0c082045..2f9584b5 100644
--- a/src/utils/extractLdSchema.js
+++ b/src/utils/extractLdSchema.js
@@ -1,7 +1,12 @@
 // utils -> extractLdSchema.js
 
-import { isArray, isObject, isString } from '@ndaidong/bellajs'
+import { isArray, isObject, isString } from '@pwshub/bellajs'
 
+/**
+ * Allowed JSON-LD schema types that indicate an article or webpage.
+ *
+ * @type {string[]}
+ */
 const typeSchemas = [
   'aboutpage',
   'checkoutpage',
@@ -31,6 +36,11 @@ const typeSchemas = [
   'medicalscholarlyarticle',
 ]
 
+/**
+ * Mapping from entry keys to JSON-LD attribute names.
+ *
+ * @type {Object}
+ */
 const attributeLists = {
   description: 'description',
   image: 'image',
@@ -39,6 +49,12 @@
   type: '@type',
 }
 
+/**
+ * Safely parse a JSON string, returning an empty object on failure.
+ *
+ * @param {string} text - JSON string to parse
+ * @returns {Object} Parsed object or empty object
+ */
 const parseJson = (text) => {
   try {
     return JSON.parse(text)
@@ -47,6 +63,12 @@
   }
 }
 
+/**
+ * Check if the given JSON-LD object has an allowed schema type.
+ *
+ * @param {Object} ldJson - Parsed JSON-LD object
+ * @returns {boolean} True if type is in the allowed list
+ */
 const isAllowedLdJsonType = (ldJson) => {
   const rootLdJsonType = ldJson['@type'] || ''
   const arr = isArray(rootLdJsonType) ? rootLdJsonType : [rootLdJsonType]
@@ -67,9 +89,9 @@ export default (document, entry) => {
   ldSchemas.forEach(ldSchema => {
     const ldJson = parseJson(ldSchema.textContent.replace(/[\n\r\t]/g, ''))
     if (ldJson && isAllowedLdJsonType(ldJson)) {
-      Object.entries(attributeLists).forEach(([key, attr]) => {
+      for (const [key, attr] of Object.entries(attributeLists)) {
         if (!entry[key] || !ldJson[attr]) {
-          return
+          continue
         }
 
         const keyValue = ldJson[attr]
@@ -77,7 +99,7 @@ export default (document, entry) => {
         if (isString(val) && val !== '') {
           entry[key] = val.trim()
         }
-      })
+      }
     }
   })
diff --git a/src/utils/extractMetaData.test.js b/src/utils/extractMetaData.test.js
index 1e2ec63f..00fec7c5 100644
--- a/src/utils/extractMetaData.test.js
+++ b/src/utils/extractMetaData.test.js
@@ -4,7 +4,7 @@
 import assert from 'node:assert'
 import { readFileSync } from 'node:fs'
 
-import { isObject, hasProperty } from '@ndaidong/bellajs'
+import { isObject, hasProperty } from '@pwshub/bellajs'
 
 import extractMetaData from './extractMetaData.js'
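For reference, this module targets schema.org blocks of roughly this shape; the sample markup is illustrative, and `NewsArticle` is assumed to be in the (partially elided) `typeSchemas` whitelist:

```javascript
// a page carrying a JSON-LD block like this feeds extractLdSchema;
// '@type' may be a string or an array, and must match the whitelist
const sampleLd = `
<script type="application/ld+json">
{
  "@type": "NewsArticle",
  "description": "A short summary of the story",
  "image": "https://example.com/cover.jpg"
}
</script>`
```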
diff --git a/src/utils/extractWithReadability.js b/src/utils/extractWithReadability.js
index 0e6582e6..56833d9b 100644
--- a/src/utils/extractWithReadability.js
+++ b/src/utils/extractWithReadability.js
@@ -2,8 +2,15 @@
 
 import { Readability } from '@mozilla/readability'
 import { DOMParser } from 'linkedom'
-import { isString } from '@ndaidong/bellajs'
+import { isString } from '@pwshub/bellajs'
 
+/**
+ * Extract main article content from HTML using Mozilla Readability.
+ *
+ * @param {string} html - Raw HTML content
+ * @param {string} [url=''] - Source URL for resolving relative paths
+ * @returns {string|null} Extracted article HTML or null
+ */
 export default (html, url = '') => {
   if (!isString(html)) {
     return null
@@ -19,6 +26,12 @@ export default (html, url = '') => {
   return result.textContent ? result.content : null
 }
 
+/**
+ * Extract article title from HTML using Mozilla Readability.
+ *
+ * @param {string} html - Raw HTML content
+ * @returns {string|null} Extracted title or null
+ */
 export function extractTitleWithReadability (html) {
   if (!isString(html)) {
     return null
diff --git a/src/utils/extractWithReadability.test.js b/src/utils/extractWithReadability.test.js
index bdcc64cb..80f4bdec 100644
--- a/src/utils/extractWithReadability.test.js
+++ b/src/utils/extractWithReadability.test.js
@@ -5,7 +5,7 @@
 import assert from 'node:assert'
 import { readFileSync } from 'node:fs'
 
-import { isString } from '@ndaidong/bellajs'
+import { isString } from '@pwshub/bellajs'
 
 import extractWithReadability, { extractTitleWithReadability } from './extractWithReadability.js'
diff --git a/src/utils/findDate.js b/src/utils/findDate.js
index 3a666e02..242eaa68 100644
--- a/src/utils/findDate.js
+++ b/src/utils/findDate.js
@@ -11,7 +11,9 @@ function convertDateFormat (dateString) {
 
   let year, month, day
 
-  if (parseInt(parts[0]) > 12) {
+  if (parts[0].length === 4 || parseInt(parts[0]) > 31) {
+    [year, month, day] = parts
+  } else if (parseInt(parts[0]) > 12) {
     [day, month, year] = parts
   } else {
     [month, day, year] = parts
diff --git a/src/utils/getTimeToRead.js b/src/utils/getTimeToRead.js
deleted file mode 100644
index 7d8cef37..00000000
--- a/src/utils/getTimeToRead.js
+++ /dev/null
@@ -1,8 +0,0 @@
-// utils -> getTimeToRead
-
-export default (text, wordsPerMinute) => {
-  const words = text.trim().split(/\s+/g).length
-  const minToRead = words / wordsPerMinute
-  const secToRead = Math.ceil(minToRead * 60)
-  return secToRead
-}
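The new first branch in `convertDateFormat` keeps ISO-style dates intact instead of running them through the day/month heuristics. Roughly, as a standalone sketch of the branch logic (the helper is internal; separator handling is omitted):

```javascript
// which field order the three branches assign, by first part
const pickOrder = (parts) => {
  if (parts[0].length === 4 || parseInt(parts[0]) > 31) return 'YMD'
  if (parseInt(parts[0]) > 12) return 'DMY'
  return 'MDY'
}

console.log(pickOrder('2025-01-15'.split('-'))) // 'YMD' — new branch
console.log(pickOrder('28-01-2025'.split('-'))) // 'DMY'
console.log(pickOrder('01-28-2025'.split('-'))) // 'MDY'
```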
diff --git a/src/utils/html.js b/src/utils/html.js
index 25b23ba4..33bbaa6f 100644
--- a/src/utils/html.js
+++ b/src/utils/html.js
@@ -2,10 +2,17 @@
 
 import { DOMParser } from 'linkedom'
 import sanitize from 'sanitize-html'
-import { pipe } from '@ndaidong/bellajs'
+import { pipe } from '@pwshub/bellajs'
 
 import { getSanitizeHtmlOptions } from '../config.js'
 
+/**
+ * Lightweight HTML sanitization that fixes structural issues
+ * without stripping any tags or attributes.
+ *
+ * @param {string} html - Raw HTML input
+ * @returns {string} Sanitized HTML (all tags/attributes preserved)
+ */
 export const purify = (html) => {
   return sanitize(html, {
     allowedTags: false,
@@ -14,8 +21,20 @@
   })
 }
 
+/**
+ * Regex matching strings that consist entirely of whitespace characters.
+ *
+ * @type {RegExp}
+ */
 const WS_REGEXP = /^[\s\f\n\r\t\u1680\u180e\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000\ufeff\x09\x0a\x0b\x0c\x0d\x20\xa0]+$/ // eslint-disable-line
 
+/**
+ * Collapse multiple consecutive line breaks into single newlines,
+ * and remove lines that are entirely whitespace.
+ *
+ * @param {string} str - Input string
+ * @returns {string} Cleaned string
+ */
 const stripMultiLinebreaks = (str) => {
   return str.replace(/(\r\n|\n|\u2424){2,}/g, '\n').split('\n').map((line) => {
     return WS_REGEXP.test(line) ? line.trim() : line
@@ -24,10 +43,22 @@
   }).join('\n')
 }
 
+/**
+ * Trim the input; since WS_REGEXP is anchored, a string that is
+ * entirely whitespace collapses to an empty string.
+ *
+ * @param {string} str - Input string
+ * @returns {string} Cleaned string
+ */
 const stripMultispaces = (str) => {
   return str.replace(WS_REGEXP, ' ').trim()
 }
 
+/**
+ * Detect HTML character encoding from meta tags.
+ *
+ * @param {string} html - Raw HTML content
+ * @returns {string} Charset name (defaults to 'utf8')
+ */
 export const getCharset = (html) => {
   const doc = new DOMParser().parseFromString(html, 'text/html')
   const m = doc.querySelector('meta[charset]') || null
@@ -39,6 +70,13 @@ export const getCharset = (html) => {
   return charset?.toLowerCase() || 'utf8'
 }
 
+/**
+ * Final cleanup of extracted article content:
+ * sanitize to allowed tags, collapse whitespace.
+ *
+ * @param {string} inputHtml - Extracted article HTML
+ * @returns {string} Cleaned HTML string
+ */
 export const cleanify = (inputHtml) => {
   const doc = new DOMParser().parseFromString(inputHtml, 'text/html')
   const html = doc.documentElement.innerHTML
@@ -48,3 +86,15 @@
     input => stripMultispaces(input)
   )(html)
 }
+
+/**
+ * Count the number of img tags in HTML content.
+ *
+ * @param {string} html - HTML content
+ * @returns {number} Number of img elements
+ */
+export const countImages = (html) => {
+  const doc = new DOMParser().parseFromString(html, 'text/html')
+  const imgTags = doc.querySelectorAll('img') || []
+  return imgTags.length
+}
diff --git a/src/utils/html.test.js b/src/utils/html.test.js
index 00fc2263..f60eb71b 100644
--- a/src/utils/html.test.js
+++ b/src/utils/html.test.js
@@ -4,7 +4,7 @@
 import assert from 'node:assert'
 import { readFileSync } from 'node:fs'
 
-import { isString } from '@ndaidong/bellajs'
+import { isString } from '@pwshub/bellajs'
 
 import {
   cleanify
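Module-internal usage of two of the helpers documented above, assuming relative imports from a sibling file:

```javascript
import { getCharset, countImages } from './html.js'

// getCharset falls back to 'utf8' when no charset meta tag is present
console.log(getCharset('<html><head><meta charset="ISO-8859-1"></head></html>')) // 'iso-8859-1'

// countImages feeds the image-aware time-to-read estimate in parseFromHtml.js
console.log(countImages('<p>hi</p><img src="a.jpg"><img src="b.jpg">')) // 2
```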
diff --git a/src/utils/linker.js b/src/utils/linker.js
index 3c1a70f0..59ac04fa 100644
--- a/src/utils/linker.js
+++ b/src/utils/linker.js
@@ -4,6 +4,12 @@
 import { DOMParser } from 'linkedom'
 
 import { findBestMatch } from './similarity.js'
 
+/**
+ * Check if a string is a valid HTTP or HTTPS URL.
+ *
+ * @param {string} [url=''] - URL string to validate
+ * @returns {boolean} True if valid HTTP(S) URL
+ */
 export const isValid = (url = '') => {
   try {
     const ourl = new URL(url)
@@ -13,11 +19,25 @@
   }
 }
 
+/**
+ * Pick the URL that best matches the article title using string similarity.
+ *
+ * @param {string[]} [candidates=[]] - Candidate URLs
+ * @param {string} [title=''] - Article title for comparison
+ * @returns {string} Best matching URL
+ */
 export const chooseBestUrl = (candidates = [], title = '') => {
   const ranking = findBestMatch(title, candidates)
   return ranking.bestMatch.target
 }
 
+/**
+ * Resolve a relative URL against a base URL.
+ *
+ * @param {string} [fullUrl=''] - Base URL
+ * @param {string} [relativeUrl=''] - Relative URL to resolve
+ * @returns {string} Absolute URL or empty string on failure
+ */
 export const absolutify = (fullUrl = '', relativeUrl = '') => {
   try {
     const result = new URL(relativeUrl, fullUrl)
@@ -27,6 +47,11 @@
   }
 }
 
+/**
+ * Tracking and analytics query parameters to strip from URLs.
+ *
+ * @type {string[]}
+ */
 const blacklistKeys = [
   'CNDID',
   '__twitter_impression',
@@ -87,6 +112,12 @@
   'pk_campaign',
 ]
 
+/**
+ * Remove tracking parameters and hash fragment from a URL.
+ *
+ * @param {string} url - URL to clean
+ * @returns {string|null} Cleaned URL or null if invalid
+ */
 export const purify = (url) => {
   try {
     const pureUrl = new URL(url)
@@ -106,6 +137,14 @@
  * @param url {string}
  * @returns article {string}
  */
+/**
+ * Normalize all links, images, and source elements in HTML
+ * by resolving relative URLs to absolute and adding target=_blank to links.
+ *
+ * @param {string} html - HTML content to normalize
+ * @param {string} url - Base URL for resolving relative paths
+ * @returns {string} Normalized HTML string
+ */
 export const normalize = (html, url) => {
   const doc = new DOMParser().parseFromString(html, 'text/html')
 
@@ -124,9 +163,22 @@ export const normalize = (html, url) => {
     }
   })
 
+  Array.from(doc.getElementsByTagName('source')).forEach((element) => {
+    const src = element.getAttribute('src')
+    if (src) {
+      element.setAttribute('src', absolutify(url, src))
+    }
+  })
+
   return Array.from(doc.childNodes).map(element => element.outerHTML).join('')
 }
 
+/**
+ * Extract the domain from a URL, stripping the www. prefix.
+ *
+ * @param {string} url - Full URL
+ * @returns {string} Domain name
+ */
 export const getDomain = (url) => {
   const host = (new URL(url)).host
   return host.replace('www.', '')
diff --git a/src/utils/linker.test.js b/src/utils/linker.test.js
index 6ed89d2e..89eac3cb 100644
--- a/src/utils/linker.test.js
+++ b/src/utils/linker.test.js
@@ -4,7 +4,7 @@
 import assert from 'node:assert'
 import { readFileSync } from 'node:fs'
 
-import { isString } from '@ndaidong/bellajs'
+import { isString } from '@pwshub/bellajs'
 
 import {
   chooseBestUrl,
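Based on the documented behavior of `purify` here, using a blacklisted key that is visible in this hunk:

```javascript
import { purify } from './linker.js'

// blacklisted tracking params and the hash fragment are stripped,
// while ordinary query params survive
purify('https://example.com/read?id=123&__twitter_impression=true#top')
// -> 'https://example.com/read?id=123'
```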
diff --git a/src/utils/parseFromHtml.js b/src/utils/parseFromHtml.js
index 406d0777..2df173b2 100644
--- a/src/utils/parseFromHtml.js
+++ b/src/utils/parseFromHtml.js
@@ -1,8 +1,18 @@
 // utils -> parseFromHtml
 
-import { stripTags, truncate, unique, pipe } from '@ndaidong/bellajs'
+import {
+  stripTags,
+  truncateByChar,
+  unique,
+  pipe,
+  getTTR
+} from '@pwshub/bellajs'
 
-import { purify, cleanify } from './html.js'
+import {
+  purify,
+  cleanify,
+  countImages
+} from './html.js'
 
 import {
   isValid as isValidUrl,
@@ -21,14 +31,32 @@ import extractWithReadability, {
 
 import { execPreParser, execPostParser } from './transformation.js'
 
-import getTimeToRead from './getTimeToRead.js'
-
-const summarize = (desc, txt, threshold, maxlen) => { // eslint-disable-line
+/**
+ * Build article description from meta description or text content.
+ *
+ * @param {Object} params
+ * @param {string} params.desc - Meta description
+ * @param {string} params.text - Stripped text content
+ * @param {number} params.threshold - Min length to use meta description
+ * @param {number} params.maxlen - Max chars for truncated description
+ * @returns {string} Final description string
+ */
+const summarize = ({ desc, text, threshold, maxlen }) => {
   return desc.length > threshold
     ? desc
-    : truncate(txt, maxlen).replace(/\n/g, ' ')
+    : truncateByChar(text, maxlen).replace(/\n/g, ' ')
 }
 
+/**
+ * Parse HTML content and extract article data.
+ * Orchestrates metadata extraction, URL normalization, transformations,
+ * Readability extraction, and content sanitization.
+ *
+ * @param {string} inputHtml - Raw HTML content
+ * @param {string} [inputUrl=''] - Source URL for resolving relative links
+ * @param {ParserOptions} [parserOptions={}] - Parsing options
+ * @returns {Promise<ArticleData|null>} Extracted article data or null
+ */
 export default async (inputHtml, inputUrl = '', parserOptions = {}) => {
   const pureHtml = purify(inputHtml)
   const meta = extractMetaData(pureHtml)
@@ -106,16 +134,18 @@ export default async (inputHtml, inputUrl = '', parserOptions = {}) => {
     return null
   }
 
-  const description = summarize(
-    metaDesc,
-    textContent,
-    descriptionLengthThreshold,
-    descriptionTruncateLen
-  )
+  const description = summarize({
+    desc: metaDesc,
+    text: textContent,
+    threshold: descriptionLengthThreshold,
+    maxlen: descriptionTruncateLen,
+  })
 
   const image = metaImg ? absolutifyUrl(bestUrl, metaImg) : ''
   const favicon = metaFav ? absolutifyUrl(bestUrl, metaFav) : ''
 
+  const imgcount = countImages(content)
+
   return {
     url: bestUrl,
     title,
@@ -127,7 +157,7 @@ export default async (inputHtml, inputUrl = '', parserOptions = {}) => {
     favicon,
     source: getDomain(bestUrl),
     published,
-    ttr: getTimeToRead(textContent, wordsPerMinute),
+    ttr: getTTR(textContent, imgcount, wordsPerMinute),
     type,
   }
 }
diff --git a/src/utils/parseFromHtml.test.js b/src/utils/parseFromHtml.test.js
index e9965038..54512ce4 100644
--- a/src/utils/parseFromHtml.test.js
+++ b/src/utils/parseFromHtml.test.js
@@ -4,7 +4,7 @@
 import assert from 'node:assert'
 import { readFileSync } from 'node:fs'
 
-import { isFunction } from '@ndaidong/bellajs'
+import { isFunction } from '@pwshub/bellajs'
 
 import { extractFromHtml as parseFromHtml } from '../main.js'
 import { addTransformations } from './transformation.js'
diff --git a/src/utils/retrieve.js b/src/utils/retrieve.js
index 6922a752..06f1e596 100644
--- a/src/utils/retrieve.js
+++ b/src/utils/retrieve.js
@@ -1,7 +1,16 @@
 // utils -> retrieve
 
-import fetch from 'cross-fetch'
-
+/**
+ * Fetch content through a proxy endpoint.
+ *
+ * @param {string} url - Target URL to fetch
+ * @param {Object} [options={}] - Proxy options
+ * @param {Object} [options.proxy={}] - Proxy configuration
+ * @param {string} options.proxy.target - Proxy endpoint URL
+ * @param {Object} [options.proxy.headers={}] - Headers for proxy request
+ * @param {AbortSignal} [options.signal] - Optional abort signal
+ * @returns {Promise<Response>} Fetch response object
+ */
 const profetch = async (url, options = {}) => {
   const { proxy = {}, signal = null } = options
   const {
@@ -15,6 +24,14 @@ const profetch = async (url, options = {}) => {
   return res
 }
 
+/**
+ * Retrieve raw HTML content from a URL.
+ * Supports direct fetch, proxy, custom headers, agent, and abort signal.
+ *
+ * @param {string} url - URL to fetch
+ * @param {FetchOptions} [options={}] - Fetch configuration
+ * @returns {Promise<ArrayBuffer>} Response body as ArrayBuffer
+ */
 export default async (url, options = {}) => {
   const {
     headers = {
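Putting `retrieve`'s options together from the caller's side; the proxy endpoint and token are placeholders, and the exact proxy URL shape follows the `ProxyConfig` typing in `index.d.ts`:

```javascript
import { extract } from '@extractus/article-extractor'

// route the request through a proxy endpoint (values illustrative)
const article = await extract('https://example.com/article', {}, {
  headers: {
    'user-agent': 'Mozilla/5.0',
  },
  proxy: {
    target: 'https://your-proxy.example.org/?url=',
    headers: {
      authorization: 'Bearer <token>',
    },
  },
})
```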
diff --git a/src/utils/similarity.js b/src/utils/similarity.js
index 7e291cbc..4201ca91 100644
--- a/src/utils/similarity.js
+++ b/src/utils/similarity.js
@@ -1,46 +1,30 @@
 // similarity.js
-// https://github.com/aceakash/string-similarity
-
-import { isArray, isString } from '@ndaidong/bellajs'
+import {
+  isString,
+  compareTwoStrings,
+  isArray
+} from '@pwshub/bellajs'
+
+/**
+ * Validate arguments for findBestMatch.
+ *
+ * @param {string} mainString - Reference string
+ * @param {string[]} targetStrings - Strings to compare against
+ * @returns {boolean} True if arguments are valid
+ */
 const areArgsValid = (mainString, targetStrings) => {
   return isString(mainString) && isArray(targetStrings) && targetStrings.length > 0 && targetStrings.every(s => isString(s))
 }
 
-export const compareTwoStrings = (first, second) => {
-  first = first.replace(/\s+/g, '')
-  second = second.replace(/\s+/g, '')
-
-  if (first === second) return 1 // identical or empty
-  if (first.length < 2 || second.length < 2) return 0 // if either is a 0-letter or 1-letter string
-
-  let firstBigrams = new Map()
-  for (let i = 0; i < first.length - 1; i++) {
-    const bigram = first.substring(i, i + 2)
-    const count = firstBigrams.has(bigram)
-      ? firstBigrams.get(bigram) + 1
-      : 1
-
-    firstBigrams.set(bigram, count)
-  }
-
-  let intersectionSize = 0
-  for (let i = 0; i < second.length - 1; i++) {
-    const bigram = second.substring(i, i + 2)
-    const count = firstBigrams.has(bigram)
-      ? firstBigrams.get(bigram)
-      : 0
-
-    if (count > 0) {
-      firstBigrams.set(bigram, count - 1)
-      intersectionSize++
-    }
-  }
-
-  return (2.0 * intersectionSize) / (first.length + second.length - 2)
-}
-
+/**
+ * Find the best matching string from a list using Dice coefficient.
+ *
+ * @param {string} mainString - Reference string to match against
+ * @param {string[]} targetStrings - Candidate strings
+ * @returns {{ratings: Array, bestMatch: {target: string, rating: number}, bestMatchIndex: number}} Match results with rankings
+ */
 export const findBestMatch = (mainString, targetStrings) => {
   if (!areArgsValid(mainString, targetStrings)) {
     throw new Error('Bad arguments: First argument should be a string, second should be an array of strings')
diff --git a/src/utils/transformation.js b/src/utils/transformation.js
index 5bb872b1..06aff829 100644
--- a/src/utils/transformation.js
+++ b/src/utils/transformation.js
@@ -1,10 +1,21 @@
 // utils --> transformation.js
 
-import { isArray, isFunction, clone } from '@ndaidong/bellajs'
+import { isArray, isFunction } from '@pwshub/bellajs'
 
 import { DOMParser } from 'linkedom'
 
+/**
+ * Registered transformation rules for per-site HTML pre/post processing.
+ *
+ * @type {Transformation[]}
+ */
 const transformations = []
 
+/**
+ * Add a single transformation to the registry.
+ *
+ * @param {Transformation} tn - Transformation object with patterns and handlers
+ * @returns {number} 1 if added, 0 if invalid
+ */
 const add = (tn) => {
   const { patterns } = tn
   if (!patterns || !isArray(patterns) || !patterns.length) {
@@ -14,6 +25,12 @@ const add = (tn) => {
   return 1
 }
 
+/**
+ * Register one or more transformations for per-site HTML processing.
+ *
+ * @param {Transformation|Transformation[]} tfms - Transformation(s) to add
+ * @returns {number} Number of transformations successfully added
+ */
 export const addTransformations = (tfms) => {
   if (isArray(tfms)) {
     return tfms.map(tfm => add(tfm)).filter(result => result === 1).length
@@ -21,6 +38,13 @@ export const addTransformations = (tfms) => {
   return add(tfms)
 }
 
+/**
+ * Remove transformations matching the given patterns.
+ * Calling without arguments removes all transformations.
+ *
+ * @param {RegExp[]} [patterns] - URL patterns to match for removal
+ * @returns {number} Number of transformations removed
+ */
 export const removeTransformations = (patterns) => {
   if (!patterns) {
     const removed = transformations.length
@@ -28,7 +52,7 @@ export const removeTransformations = (patterns) => {
     return removed
   }
   let removing = 0
-  for (let i = transformations.length - 1; i > 0; i--) {
+  for (let i = transformations.length - 1; i >= 0; i--) {
     const { patterns: ipatterns } = transformations[i]
     const matched = ipatterns.some((ptn) => patterns.some((pattern) => String(pattern) === String(ptn)))
     if (matched) {
@@ -39,10 +63,21 @@ export const removeTransformations = (patterns) => {
   return removing
 }
 
+/**
+ * Get a copy of all registered transformations.
+ *
+ * @returns {Transformation[]} Copy of transformations array
+ */
 export const getTransformations = () => {
-  return clone(transformations)
+  return [...transformations]
 }
 
+/**
+ * Find all transformations whose patterns match any of the given URLs.
+ *
+ * @param {string|string[]} links - URL(s) to match against transformation patterns
+ * @returns {Transformation[]} Matching transformations
+ */
 export const findTransformations = (links) => {
   const urls = !isArray(links) ? [links] : links
   const tfms = []
@@ -50,20 +85,38 @@ export const findTransformations = (links) => {
     const { patterns } = transformation
     const matched = urls.some((url) => patterns.some((pattern) => pattern.test(url)))
     if (matched) {
-      tfms.push(clone(transformation))
+      tfms.push({
+        ...transformation,
+      })
     }
   }
   return tfms
 }
 
+/**
+ * Run pre-extraction transformations on raw HTML.
+ * Mutates the DOM in place through registered pre-processor functions.
+ *
+ * @param {string} html - Raw HTML content
+ * @param {string[]} links - URLs to match against transformation patterns
+ * @returns {string} Transformed HTML string
+ */
 export const execPreParser = (html, links) => {
   const doc = new DOMParser().parseFromString(html, 'text/html')
-  findTransformations(links).map(tfm => tfm.pre).filter(fn => isFunction(fn)).map(fn => fn(doc))
+  findTransformations(links).map(tfm => tfm.pre).filter(fn => isFunction(fn)).forEach(fn => fn(doc))
   return Array.from(doc.childNodes).map(it => it.outerHTML).join('')
 }
 
+/**
+ * Run post-extraction transformations on extracted article content.
+ * Mutates the DOM in place through registered post-processor functions.
+ *
+ * @param {string} html - Extracted article HTML
+ * @param {string[]} links - URLs to match against transformation patterns
+ * @returns {string} Transformed HTML string
+ */
 export const execPostParser = (html, links) => {
   const doc = new DOMParser().parseFromString(html, 'text/html')
-  findTransformations(links).map(tfm => tfm.post).filter(fn => isFunction(fn)).map(fn => fn(doc))
+  findTransformations(links).map(tfm => tfm.post).filter(fn => isFunction(fn)).forEach(fn => fn(doc))
  return Array.from(doc.childNodes).map(it => it.outerHTML).join('')
}
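End to end, the transformation registry above is driven like this; domain and selectors are hypothetical:

```javascript
import { addTransformations, extract } from '@extractus/article-extractor'

addTransformations([{
  patterns: [/news\.example\.com\/.*/],
  pre: (document) => {
    // runs on the raw DOM before Readability
    document.querySelectorAll('aside').forEach(el => el.remove())
    return document
  },
  post: (document) => {
    // runs on the extracted article content
    document.querySelectorAll('a').forEach(a => a.setAttribute('rel', 'nofollow'))
    return document
  },
}])

const article = await extract('https://news.example.com/some-story')
console.log(article?.content)
```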