From 5a952dd57b62d94d30b6076ed6ae524e4e0e27e0 Mon Sep 17 00:00:00 2001 From: Dong Nguyen Date: Sun, 3 May 2026 19:27:04 +0700 Subject: [PATCH 1/6] v8.1.0 - Add AI Agents guide - Fix some potential issues - Update packages --- .aiignore | 15 ++ .gitignore | 2 + .npmignore | 3 + AGENTS.md | 113 +++++++++++ bun.lock | 242 +++++++++++++++++++++++ package.json | 26 +-- src/browser/linkedom.js | 1 - src/config.js | 2 +- src/deno/cross-fetch.js | 2 - src/main.js | 2 +- src/utils/extractLdSchema.js | 2 +- src/utils/extractMetaData.test.js | 2 +- src/utils/extractWithReadability.js | 2 +- src/utils/extractWithReadability.test.js | 2 +- src/utils/getTimeToRead.js | 8 - src/utils/html.js | 8 +- src/utils/html.test.js | 2 +- src/utils/linker.test.js | 2 +- src/utils/parseFromHtml.js | 36 ++-- src/utils/parseFromHtml.test.js | 2 +- src/utils/retrieve.js | 2 - src/utils/similarity.js | 40 +--- src/utils/transformation.js | 8 +- 23 files changed, 433 insertions(+), 91 deletions(-) create mode 100644 .aiignore create mode 100644 AGENTS.md create mode 100644 bun.lock delete mode 100644 src/browser/linkedom.js delete mode 100644 src/deno/cross-fetch.js delete mode 100644 src/utils/getTimeToRead.js diff --git a/.aiignore b/.aiignore new file mode 100644 index 00000000..c880c25b --- /dev/null +++ b/.aiignore @@ -0,0 +1,15 @@ +node_modules +coverage +coverage.lcov + +package-lock.json +pnpm-lock.yaml +bun.lock + +.env + +dist +storage + +# AI Session Files (Private Context) +.sessions diff --git a/.gitignore b/.gitignore index 5b416c34..a22d411c 100644 --- a/.gitignore +++ b/.gitignore @@ -20,3 +20,5 @@ lcov.info deno.lock evaluation + +.sessions diff --git a/.npmignore b/.npmignore index f2f3c65a..a0f6f850 100644 --- a/.npmignore +++ b/.npmignore @@ -5,3 +5,6 @@ pnpm-lock.yaml examples test-data lcov.info + +.aiignore +.sessions diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 00000000..bfc44e1a --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,113 @@ +# AI Agent Instructions + 
+Coding guidelines for AI agents working in this project. + +## Philosophy + +- Minimalism. Simple is better. KISS (Keep It Simple, Stupid). +- Clean code, easy to read, easy to delete. +- Functional Programming — pure functions, immutability, no side effects. +- MVP mindset — deliver the smallest thing that works, then iterate. + +## Security Rules (CRITICAL — no exceptions) + +- NEVER output or request .env and example.env file contents +- NEVER hardcode API credentials, secret tokens, private keys or passwords in source code +- NEVER send sensitive data to external AI services +- Follow `.aiignore` and `.gitignore` for excluded files — do not read or reference them +- When asking for help, sanitize data (replace real IDs, emails, tokens with placeholders) +- Do not log sensitive information + +## Coding Standards (Strict) +- Language: JavaScript (ESM syntax). No TypeScript. +- Style: No semicolons, single quotes, 2-space indentation. +- Respect `eslint.config.js` — do not suggest rule changes +- Patterns: + - Functional Programming only. No Classes or OOP. + - Arrow functions are preferred. + - Maximum 3 parameters per function. Use objects for more. +- Naming: camelCase for variables/functions, SNAKE_CASE for constants. +- Documentation: + - Add JSDocs before all functions and exported variables. + - Language: Use American English for all comments and JSDocs. + - Constraint: NEVER use Vietnamese or other languages in the source code. 
+ +### Error Handling + +- Handle errors explicitly — never swallow them silently +- Use try/catch with proper logging +- Return null or throw meaningful errors + +```javascript +export const send = async (params) => { + try { + const response = await ai.ask(params) + logger.info(`send() -> success: ${response.id}`) + return response + } catch (err) { + logger.error(`send() -> failed: ${err.message}`) + console.error(err) + return null + } +} +``` + +## Testing Standards + +- Write tests for critical business logic and all error cases +- Use simple test runners (node:test, bun:test, vitest) +- No complex mocking frameworks unless necessary +- Tests live alongside source: `[module].test.js` next to `[module].js` + +## Dependency Rules + +- Prefer built-in APIs over external packages +- Before adding a dependency, explain: + - Why it is needed + - Alternatives considered + - Bundle size impact +- Never add a dependency for trivial utilities +- Avoid packages with large dependency trees + +## Architecture Rules + +- Do NOT change existing project architecture without explicit approval +- Do NOT move or rename core modules unless requested +- Respect module boundaries +- Avoid cross-module coupling +- New modules must follow existing folder structure + +## When Making Changes + +1. Read existing patterns first +2. Follow current coding style strictly +3. Keep dependencies minimal +4. Handle errors explicitly +5. Add JSDoc comments for new functions +6. Run `npm run lint` before committing +7. Do NOT refactor unrelated code +8. Do NOT modify working code outside task scope +9. Prefer minimal diff changes +10. 
Preserve existing behavior unless explicitly requested + +## When in Doubt + +- Ask for clarification before generating code +- State your assumption explicitly if proceeding without confirmation +- Prefer doing less and asking over doing more and guessing + +## Git Workflow + +- Work only inside the current branch +- Do NOT create or delete branches +- Do NOT rewrite git history +- Do NOT modify commit messages +- Changes must correspond to the current issue + +## Agent References + +Reference these URLs when working on related topics: + +- Bun: https://bun.sh/llms-full.txt + +--- diff --git a/bun.lock b/bun.lock new file mode 100644 index 00000000..a168741b --- /dev/null +++ b/bun.lock @@ -0,0 +1,242 @@ +{ + "lockfileVersion": 1, + "configVersion": 1, + "workspaces": { + "": { + "name": "@extractus/article-extractor", + "dependencies": { + "@mozilla/readability": "^0.6.0", + "@pwshub/bellajs": "^13.0.2", + "linkedom": "^0.18.12", + "sanitize-html": "2.17.3", + }, + "devDependencies": { + "@eslint/js": "^10.0.1", + "@types/sanitize-html": "^2.16.1", + "eslint": "^10.3.0", + "globals": "^17.6.0", + "https-proxy-agent": "^9.0.0", + "nock": "^14.0.14", + }, + }, + }, + "packages": { + "@eslint-community/eslint-utils": ["@eslint-community/eslint-utils@4.9.1", "", { "dependencies": { "eslint-visitor-keys": "^3.4.3" }, "peerDependencies": { "eslint": "^6.0.0 || ^7.0.0 || >=8.0.0" } }, "sha512-phrYmNiYppR7znFEdqgfWHXR6NCkZEK7hwWDHZUjit/2/U0r6XvkDl0SYnoM51Hq7FhCGdLDT6zxCCOY1hexsQ=="], + + "@eslint-community/regexpp": ["@eslint-community/regexpp@4.12.2", "", {}, "sha512-EriSTlt5OC9/7SXkRSCAhfSxxoSUgBm33OH+IkwbdpgoqsSsUg7y3uh+IICI/Qg4BBWr3U2i39RpmycbxMq4ew=="], + + "@eslint/config-array": ["@eslint/config-array@0.23.5", "", { "dependencies": { "@eslint/object-schema": "^3.0.5", "debug": "^4.3.1", "minimatch": "^10.2.4" } }, "sha512-Y3kKLvC1dvTOT+oGlqNQ1XLqK6D1HU2YXPc52NmAlJZbMMWDzGYXMiPRJ8TYD39muD/OTjlZmNJ4ib7dvSrMBA=="], + + "@eslint/config-helpers": 
["@eslint/config-helpers@0.5.5", "", { "dependencies": { "@eslint/core": "^1.2.1" } }, "sha512-eIJYKTCECbP/nsKaaruF6LW967mtbQbsw4JTtSVkUQc9MneSkbrgPJAbKl9nWr0ZeowV8BfsarBmPpBzGelA2w=="], + + "@eslint/core": ["@eslint/core@1.2.1", "", { "dependencies": { "@types/json-schema": "^7.0.15" } }, "sha512-MwcE1P+AZ4C6DWlpin/OmOA54mmIZ/+xZuJiQd4SyB29oAJjN30UW9wkKNptW2ctp4cEsvhlLY/CsQ1uoHDloQ=="], + + "@eslint/js": ["@eslint/js@10.0.1", "", { "peerDependencies": { "eslint": "^10.0.0" }, "optionalPeers": ["eslint"] }, "sha512-zeR9k5pd4gxjZ0abRoIaxdc7I3nDktoXZk2qOv9gCNWx3mVwEn32VRhyLaRsDiJjTs0xq/T8mfPtyuXu7GWBcA=="], + + "@eslint/object-schema": ["@eslint/object-schema@3.0.5", "", {}, "sha512-vqTaUEgxzm+YDSdElad6PiRoX4t8VGDjCtt05zn4nU810UIx/uNEV7/lZJ6KwFThKZOzOxzXy48da+No7HZaMw=="], + + "@eslint/plugin-kit": ["@eslint/plugin-kit@0.7.1", "", { "dependencies": { "@eslint/core": "^1.2.1", "levn": "^0.4.1" } }, "sha512-rZAP3aVgB9ds9KOeUSL+zZ21hPmo8dh6fnIFwRQj5EAZl9gzR7wxYbYXYysAM8CTqGmUGyp2S4kUdV17MnGuWQ=="], + + "@humanfs/core": ["@humanfs/core@0.19.2", "", { "dependencies": { "@humanfs/types": "^0.15.0" } }, "sha512-UhXNm+CFMWcbChXywFwkmhqjs3PRCmcSa/hfBgLIb7oQ5HNb1wS0icWsGtSAUNgefHeI+eBrA8I1fxmbHsGdvA=="], + + "@humanfs/node": ["@humanfs/node@0.16.8", "", { "dependencies": { "@humanfs/core": "^0.19.2", "@humanfs/types": "^0.15.0", "@humanwhocodes/retry": "^0.4.0" } }, "sha512-gE1eQNZ3R++kTzFUpdGlpmy8kDZD/MLyHqDwqjkVQI0JMdI1D51sy1H958PNXYkM2rAac7e5/CnIKZrHtPh3BQ=="], + + "@humanfs/types": ["@humanfs/types@0.15.0", "", {}, "sha512-ZZ1w0aoQkwuUuC7Yf+7sdeaNfqQiiLcSRbfI08oAxqLtpXQr9AIVX7Ay7HLDuiLYAaFPu8oBYNq/QIi9URHJ3Q=="], + + "@humanwhocodes/module-importer": ["@humanwhocodes/module-importer@1.0.1", "", {}, "sha512-bxveV4V8v5Yb4ncFTT3rPSgZBOpCkjfK0y4oVVVJwIuDVBRMDXrPyXRL988i5ap9m9bnyEEjWfm5WkBmtffLfA=="], + + "@humanwhocodes/retry": ["@humanwhocodes/retry@0.4.3", "", {}, "sha512-bV0Tgo9K4hfPCek+aMAn81RppFKv2ySDQeMoSZuvTASywNTnVJCArCZE2FWqpvIatKu7VMRLWlR1EazvVhDyhQ=="], + + 
"@mozilla/readability": ["@mozilla/readability@0.6.0", "", {}, "sha512-juG5VWh4qAivzTAeMzvY9xs9HY5rAcr2E4I7tiSSCokRFi7XIZCAu92ZkSTsIj1OPceCifL3cpfteP3pDT9/QQ=="], + + "@mswjs/interceptors": ["@mswjs/interceptors@0.41.8", "", { "dependencies": { "@open-draft/deferred-promise": "^2.2.0", "@open-draft/logger": "^0.3.0", "@open-draft/until": "^2.0.0", "is-node-process": "^1.2.0", "outvariant": "^1.4.3", "strict-event-emitter": "^0.5.1" } }, "sha512-pRLMNKTSGRoLq+KnEB/7OY5vijw1XmcheAAOiv6pj7W1FG32kAGqj1C/RK/cqxRGr1Fh+zBi8sDur8kj3EQv6A=="], + + "@open-draft/deferred-promise": ["@open-draft/deferred-promise@2.2.0", "", {}, "sha512-CecwLWx3rhxVQF6V4bAgPS5t+So2sTbPgAzafKkVizyi7tlwpcFpdFqq+wqF2OwNBmqFuu6tOyouTuxgpMfzmA=="], + + "@open-draft/logger": ["@open-draft/logger@0.3.0", "", { "dependencies": { "is-node-process": "^1.2.0", "outvariant": "^1.4.0" } }, "sha512-X2g45fzhxH238HKO4xbSr7+wBS8Fvw6ixhTDuvLd5mqh6bJJCFAPwU9mPDxbcrRtfxv4u5IHCEH77BmxvXmmxQ=="], + + "@open-draft/until": ["@open-draft/until@2.1.0", "", {}, "sha512-U69T3ItWHvLwGg5eJ0n3I62nWuE6ilHlmz7zM0npLBRvPRd7e6NYmg54vvRtP5mZG7kZqZCFVdsTWo7BPtBujg=="], + + "@pwshub/bellajs": ["@pwshub/bellajs@13.0.2", "", {}, "sha512-OWOaE7Ieufo7VMv957iahOzv/w5oXyyXO28Jh2x6rLU96ZLqC508kAKTgdMJRXgVjda2s+/0C6Kno+k16OmJsQ=="], + + "@types/esrecurse": ["@types/esrecurse@4.3.1", "", {}, "sha512-xJBAbDifo5hpffDBuHl0Y8ywswbiAp/Wi7Y/GtAgSlZyIABppyurxVueOPE8LUQOxdlgi6Zqce7uoEpqNTeiUw=="], + + "@types/estree": ["@types/estree@1.0.8", "", {}, "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w=="], + + "@types/json-schema": ["@types/json-schema@7.0.15", "", {}, "sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA=="], + + "@types/sanitize-html": ["@types/sanitize-html@2.16.1", "", { "dependencies": { "htmlparser2": "^10.1" } }, "sha512-n9wjs8bCOTyN/ynwD8s/nTcTreIHB1vf31vhLMGqUPNHaweKC4/fAl4Dj+hUlCTKYgm4P3k83fmiFfzkZ6sgMA=="], + + "acorn": ["acorn@8.16.0", 
"", { "bin": { "acorn": "bin/acorn" } }, "sha512-UVJyE9MttOsBQIDKw1skb9nAwQuR5wuGD3+82K6JgJlm/Y+KI92oNsMNGZCYdDsVtRHSak0pcV5Dno5+4jh9sw=="], + + "acorn-jsx": ["acorn-jsx@5.3.2", "", { "peerDependencies": { "acorn": "^6.0.0 || ^7.0.0 || ^8.0.0" } }, "sha512-rq9s+JNhf0IChjtDXxllJ7g41oZk5SlXtp0LHwyA5cejwn7vKmKp4pPri6YEePv2PU65sAsegbXtIinmDFDXgQ=="], + + "agent-base": ["agent-base@9.0.0", "", {}, "sha512-TQf59BsZnytt8GdJKLPfUZ54g/iaUL2OWDSFCCvMOhsHduDQxO8xC4PNeyIkVcA5KwL2phPSv0douC0fgWzmnA=="], + + "ajv": ["ajv@6.15.0", "", { "dependencies": { "fast-deep-equal": "^3.1.1", "fast-json-stable-stringify": "^2.0.0", "json-schema-traverse": "^0.4.1", "uri-js": "^4.2.2" } }, "sha512-fgFx7Hfoq60ytK2c7DhnF8jIvzYgOMxfugjLOSMHjLIPgenqa7S7oaagATUq99mV6IYvN2tRmC0wnTYX6iPbMw=="], + + "balanced-match": ["balanced-match@4.0.4", "", {}, "sha512-BLrgEcRTwX2o6gGxGOCNyMvGSp35YofuYzw9h1IMTRmKqttAZZVU67bdb9Pr2vUHA8+j3i2tJfjO6C6+4myGTA=="], + + "boolbase": ["boolbase@1.0.0", "", {}, "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww=="], + + "brace-expansion": ["brace-expansion@5.0.5", "", { "dependencies": { "balanced-match": "^4.0.2" } }, "sha512-VZznLgtwhn+Mact9tfiwx64fA9erHH/MCXEUfB/0bX/6Fz6ny5EGTXYltMocqg4xFAQZtnO3DHWWXi8RiuN7cQ=="], + + "cross-spawn": ["cross-spawn@7.0.6", "", { "dependencies": { "path-key": "^3.1.0", "shebang-command": "^2.0.0", "which": "^2.0.1" } }, "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA=="], + + "css-select": ["css-select@5.2.2", "", { "dependencies": { "boolbase": "^1.0.0", "css-what": "^6.1.0", "domhandler": "^5.0.2", "domutils": "^3.0.1", "nth-check": "^2.0.1" } }, "sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw=="], + + "css-what": ["css-what@6.2.2", "", {}, "sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA=="], + + "cssom": ["cssom@0.5.0", "", {}, 
"sha512-iKuQcq+NdHqlAcwUY0o/HL69XQrUaQdMjmStJ8JFmUaiiQErlhrmuigkg/CU4E2J0IyUKUrMAgl36TvN67MqTw=="], + + "debug": ["debug@4.4.3", "", { "dependencies": { "ms": "^2.1.3" } }, "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA=="], + + "deep-is": ["deep-is@0.1.4", "", {}, "sha512-oIPzksmTg4/MriiaYGO+okXDT7ztn/w3Eptv/+gSIdMdKsJo0u4CfYNFJPy+4SKMuCqGw2wxnA+URMg3t8a/bQ=="], + + "deepmerge": ["deepmerge@4.3.1", "", {}, "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A=="], + + "dom-serializer": ["dom-serializer@2.0.0", "", { "dependencies": { "domelementtype": "^2.3.0", "domhandler": "^5.0.2", "entities": "^4.2.0" } }, "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg=="], + + "domelementtype": ["domelementtype@2.3.0", "", {}, "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw=="], + + "domhandler": ["domhandler@5.0.3", "", { "dependencies": { "domelementtype": "^2.3.0" } }, "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w=="], + + "domutils": ["domutils@3.2.2", "", { "dependencies": { "dom-serializer": "^2.0.0", "domelementtype": "^2.3.0", "domhandler": "^5.0.3" } }, "sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw=="], + + "entities": ["entities@7.0.1", "", {}, "sha512-TWrgLOFUQTH994YUyl1yT4uyavY5nNB5muff+RtWaqNVCAK408b5ZnnbNAUEWLTCpum9w6arT70i1XdQ4UeOPA=="], + + "escape-string-regexp": ["escape-string-regexp@4.0.0", "", {}, "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA=="], + + "eslint": ["eslint@10.3.0", "", { "dependencies": { "@eslint-community/eslint-utils": "^4.8.0", "@eslint-community/regexpp": "^4.12.2", "@eslint/config-array": "^0.23.5", "@eslint/config-helpers": "^0.5.5", "@eslint/core": "^1.2.1", "@eslint/plugin-kit": "^0.7.1", "@humanfs/node": 
"^0.16.6", "@humanwhocodes/module-importer": "^1.0.1", "@humanwhocodes/retry": "^0.4.2", "@types/estree": "^1.0.6", "ajv": "^6.14.0", "cross-spawn": "^7.0.6", "debug": "^4.3.2", "escape-string-regexp": "^4.0.0", "eslint-scope": "^9.1.2", "eslint-visitor-keys": "^5.0.1", "espree": "^11.2.0", "esquery": "^1.7.0", "esutils": "^2.0.2", "fast-deep-equal": "^3.1.3", "file-entry-cache": "^8.0.0", "find-up": "^5.0.0", "glob-parent": "^6.0.2", "ignore": "^5.2.0", "imurmurhash": "^0.1.4", "is-glob": "^4.0.0", "json-stable-stringify-without-jsonify": "^1.0.1", "minimatch": "^10.2.4", "natural-compare": "^1.4.0", "optionator": "^0.9.3" }, "peerDependencies": { "jiti": "*" }, "optionalPeers": ["jiti"], "bin": { "eslint": "bin/eslint.js" } }, "sha512-XbEXaRva5cF0ZQB8w6MluHA0kZZfV2DuCMJ3ozyEOHLwDpZX2Lmm/7Pp0xdJmI0GL1W05VH5VwIFHEm1Vcw2gw=="], + + "eslint-scope": ["eslint-scope@9.1.2", "", { "dependencies": { "@types/esrecurse": "^4.3.1", "@types/estree": "^1.0.8", "esrecurse": "^4.3.0", "estraverse": "^5.2.0" } }, "sha512-xS90H51cKw0jltxmvmHy2Iai1LIqrfbw57b79w/J7MfvDfkIkFZ+kj6zC3BjtUwh150HsSSdxXZcsuv72miDFQ=="], + + "eslint-visitor-keys": ["eslint-visitor-keys@5.0.1", "", {}, "sha512-tD40eHxA35h0PEIZNeIjkHoDR4YjjJp34biM0mDvplBe//mB+IHCqHDGV7pxF+7MklTvighcCPPZC7ynWyjdTA=="], + + "espree": ["espree@11.2.0", "", { "dependencies": { "acorn": "^8.16.0", "acorn-jsx": "^5.3.2", "eslint-visitor-keys": "^5.0.1" } }, "sha512-7p3DrVEIopW1B1avAGLuCSh1jubc01H2JHc8B4qqGblmg5gI9yumBgACjWo4JlIc04ufug4xJ3SQI8HkS/Rgzw=="], + + "esquery": ["esquery@1.7.0", "", { "dependencies": { "estraverse": "^5.1.0" } }, "sha512-Ap6G0WQwcU/LHsvLwON1fAQX9Zp0A2Y6Y/cJBl9r/JbW90Zyg4/zbG6zzKa2OTALELarYHmKu0GhpM5EO+7T0g=="], + + "esrecurse": ["esrecurse@4.3.0", "", { "dependencies": { "estraverse": "^5.2.0" } }, "sha512-KmfKL3b6G+RXvP8N1vr3Tq1kL/oCFgn2NYXEtqP8/L3pKapUA4G8cFVaoF3SU323CD4XypR/ffioHmkti6/Tag=="], + + "estraverse": ["estraverse@5.3.0", "", {}, 
"sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA=="], + + "esutils": ["esutils@2.0.3", "", {}, "sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g=="], + + "fast-deep-equal": ["fast-deep-equal@3.1.3", "", {}, "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q=="], + + "fast-json-stable-stringify": ["fast-json-stable-stringify@2.1.0", "", {}, "sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw=="], + + "fast-levenshtein": ["fast-levenshtein@2.0.6", "", {}, "sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw=="], + + "file-entry-cache": ["file-entry-cache@8.0.0", "", { "dependencies": { "flat-cache": "^4.0.0" } }, "sha512-XXTUwCvisa5oacNGRP9SfNtYBNAMi+RPwBFmblZEF7N7swHYQS6/Zfk7SRwx4D5j3CH211YNRco1DEMNVfZCnQ=="], + + "find-up": ["find-up@5.0.0", "", { "dependencies": { "locate-path": "^6.0.0", "path-exists": "^4.0.0" } }, "sha512-78/PXT1wlLLDgTzDs7sjq9hzz0vXD+zn+7wypEe4fXQxCmdmqfGsEPQxmiCSQI3ajFV91bVSsvNtrJRiW6nGng=="], + + "flat-cache": ["flat-cache@4.0.1", "", { "dependencies": { "flatted": "^3.2.9", "keyv": "^4.5.4" } }, "sha512-f7ccFPK3SXFHpx15UIGyRJ/FJQctuKZ0zVuN3frBo4HnK3cay9VEW0R6yPYFHC0AgqhukPzKjq22t5DmAyqGyw=="], + + "flatted": ["flatted@3.4.2", "", {}, "sha512-PjDse7RzhcPkIJwy5t7KPWQSZ9cAbzQXcafsetQoD7sOJRQlGikNbx7yZp2OotDnJyrDcbyRq3Ttb18iYOqkxA=="], + + "glob-parent": ["glob-parent@6.0.2", "", { "dependencies": { "is-glob": "^4.0.3" } }, "sha512-XxwI8EOhVQgWp6iDL+3b0r86f4d6AX6zSU55HfB4ydCEuXLXc5FcYeOu+nnGftS4TEju/11rt4KJPTMgbfmv4A=="], + + "globals": ["globals@17.6.0", "", {}, "sha512-sepffkT8stwnIYbsMBpoCHJuJM5l98FUF2AnE07hfvE0m/qp3R586hw4jF4uadbhvg1ooIdzuu7CsfD2jzCaNA=="], + + "html-escaper": ["html-escaper@3.0.3", "", {}, "sha512-RuMffC89BOWQoY0WKGpIhn5gX3iI54O6nRA0yC124NYVtzjmFWBIiFd8M0x+ZdX0P9R4lADg1mgP8C7PxGOWuQ=="], + + "htmlparser2": 
["htmlparser2@10.1.0", "", { "dependencies": { "domelementtype": "^2.3.0", "domhandler": "^5.0.3", "domutils": "^3.2.2", "entities": "^7.0.1" } }, "sha512-VTZkM9GWRAtEpveh7MSF6SjjrpNVNNVJfFup7xTY3UpFtm67foy9HDVXneLtFVt4pMz5kZtgNcvCniNFb1hlEQ=="], + + "https-proxy-agent": ["https-proxy-agent@9.0.0", "", { "dependencies": { "agent-base": "9.0.0", "debug": "^4.3.4" } }, "sha512-/MVmHp58WkOypgFhCLk4fzpPcFQvTJ/e6LBI7irpIO2HfxUbpmYoHF+KzipzJpxxzJu7aJNWQ0xojJ/dzV2G5g=="], + + "ignore": ["ignore@5.3.2", "", {}, "sha512-hsBTNUqQTDwkWtcdYI2i06Y/nUBEsNEDJKjWdigLvegy8kDuJAS8uRlpkkcQpyEXL0Z/pjDy5HBmMjRCJ2gq+g=="], + + "imurmurhash": ["imurmurhash@0.1.4", "", {}, "sha512-JmXMZ6wuvDmLiHEml9ykzqO6lwFbof0GG4IkcGaENdCRDDmMVnny7s5HsIgHCbaq0w2MyPhDqkhTUgS2LU2PHA=="], + + "is-extglob": ["is-extglob@2.1.1", "", {}, "sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ=="], + + "is-glob": ["is-glob@4.0.3", "", { "dependencies": { "is-extglob": "^2.1.1" } }, "sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg=="], + + "is-node-process": ["is-node-process@1.2.0", "", {}, "sha512-Vg4o6/fqPxIjtxgUH5QLJhwZ7gW5diGCVlXpuUfELC62CuxM1iHcRe51f2W1FDy04Ai4KJkagKjx3XaqyfRKXw=="], + + "is-plain-object": ["is-plain-object@5.0.0", "", {}, "sha512-VRSzKkbMm5jMDoKLbltAkFQ5Qr7VDiTFGXxYFXXowVj387GeGNOCsOH6Msy00SGZ3Fp84b1Naa1psqgcCIEP5Q=="], + + "isexe": ["isexe@2.0.0", "", {}, "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw=="], + + "json-buffer": ["json-buffer@3.0.1", "", {}, "sha512-4bV5BfR2mqfQTJm+V5tPPdf+ZpuhiIvTuAB5g8kcrXOZpTT/QwwVRWBywX1ozr6lEuPdbHxwaJlm9G6mI2sfSQ=="], + + "json-schema-traverse": ["json-schema-traverse@0.4.1", "", {}, "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg=="], + + "json-stable-stringify-without-jsonify": ["json-stable-stringify-without-jsonify@1.0.1", "", {}, 
"sha512-Bdboy+l7tA3OGW6FjyFHWkP5LuByj1Tk33Ljyq0axyzdk9//JSi2u3fP1QSmd1KNwq6VOKYGlAu87CisVir6Pw=="], + + "json-stringify-safe": ["json-stringify-safe@5.0.1", "", {}, "sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA=="], + + "keyv": ["keyv@4.5.4", "", { "dependencies": { "json-buffer": "3.0.1" } }, "sha512-oxVHkHR/EJf2CNXnWxRLW6mg7JyCCUcG0DtEGmL2ctUo1PNTin1PUil+r/+4r5MpVgC/fn1kjsx7mjSujKqIpw=="], + + "levn": ["levn@0.4.1", "", { "dependencies": { "prelude-ls": "^1.2.1", "type-check": "~0.4.0" } }, "sha512-+bT2uH4E5LGE7h/n3evcS/sQlJXCpIp6ym8OWJ5eV6+67Dsql/LaaT7qJBAt2rzfoa/5QBGBhxDix1dMt2kQKQ=="], + + "linkedom": ["linkedom@0.18.12", "", { "dependencies": { "css-select": "^5.1.0", "cssom": "^0.5.0", "html-escaper": "^3.0.3", "htmlparser2": "^10.0.0", "uhyphen": "^0.2.0" }, "peerDependencies": { "canvas": ">= 2" }, "optionalPeers": ["canvas"] }, "sha512-jalJsOwIKuQJSeTvsgzPe9iJzyfVaEJiEXl+25EkKevsULHvMJzpNqwvj1jOESWdmgKDiXObyjOYwlUqG7wo1Q=="], + + "locate-path": ["locate-path@6.0.0", "", { "dependencies": { "p-locate": "^5.0.0" } }, "sha512-iPZK6eYjbxRu3uB4/WZ3EsEIMJFMqAoopl3R+zuq0UjcAm/MO6KCweDgPfP3elTztoKP3KtnVHxTn2NHBSDVUw=="], + + "minimatch": ["minimatch@10.2.5", "", { "dependencies": { "brace-expansion": "^5.0.5" } }, "sha512-MULkVLfKGYDFYejP07QOurDLLQpcjk7Fw+7jXS2R2czRQzR56yHRveU5NDJEOviH+hETZKSkIk5c+T23GjFUMg=="], + + "ms": ["ms@2.1.3", "", {}, "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="], + + "nanoid": ["nanoid@3.3.12", "", { "bin": { "nanoid": "bin/nanoid.cjs" } }, "sha512-ZB9RH/39qpq5Vu6Y+NmUaFhQR6pp+M2Xt76XBnEwDaGcVAqhlvxrl3B2bKS5D3NH3QR76v3aSrKaF/Kiy7lEtQ=="], + + "natural-compare": ["natural-compare@1.4.0", "", {}, "sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw=="], + + "nock": ["nock@14.0.14", "", { "dependencies": { "@mswjs/interceptors": "^0.41.0", "json-stringify-safe": "^5.0.1", "propagate": "^2.0.0" } }, 
"sha512-PKk7tex0O3RRXUZC5XDKJ9yM3rYRPS13myduT85VIIYDBnib42Fpxoe6KxRSzqB4iL2NDxkcJ2yiskZ18hGLEQ=="], + + "nth-check": ["nth-check@2.1.1", "", { "dependencies": { "boolbase": "^1.0.0" } }, "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w=="], + + "optionator": ["optionator@0.9.4", "", { "dependencies": { "deep-is": "^0.1.3", "fast-levenshtein": "^2.0.6", "levn": "^0.4.1", "prelude-ls": "^1.2.1", "type-check": "^0.4.0", "word-wrap": "^1.2.5" } }, "sha512-6IpQ7mKUxRcZNLIObR0hz7lxsapSSIYNZJwXPGeF0mTVqGKFIXj1DQcMoT22S3ROcLyY/rz0PWaWZ9ayWmad9g=="], + + "outvariant": ["outvariant@1.4.3", "", {}, "sha512-+Sl2UErvtsoajRDKCE5/dBz4DIvHXQQnAxtQTF04OJxY0+DyZXSo5P5Bb7XYWOh81syohlYL24hbDwxedPUJCA=="], + + "p-limit": ["p-limit@3.1.0", "", { "dependencies": { "yocto-queue": "^0.1.0" } }, "sha512-TYOanM3wGwNGsZN2cVTYPArw454xnXj5qmWF1bEoAc4+cU/ol7GVh7odevjp1FNHduHc3KZMcFduxU5Xc6uJRQ=="], + + "p-locate": ["p-locate@5.0.0", "", { "dependencies": { "p-limit": "^3.0.2" } }, "sha512-LaNjtRWUBY++zB5nE/NwcaoMylSPk+S+ZHNB1TzdbMJMny6dynpAGt7X/tl/QYq3TIeE6nxHppbo2LGymrG5Pw=="], + + "parse-srcset": ["parse-srcset@1.0.2", "", {}, "sha512-/2qh0lav6CmI15FzA3i/2Bzk2zCgQhGMkvhOhKNcBVQ1ldgpbfiNTVslmooUmWJcADi1f1kIeynbDRVzNlfR6Q=="], + + "path-exists": ["path-exists@4.0.0", "", {}, "sha512-ak9Qy5Q7jYb2Wwcey5Fpvg2KoAc/ZIhLSLOSBmRmygPsGwkVVt0fZa0qrtMz+m6tJTAHfZQ8FnmB4MG4LWy7/w=="], + + "path-key": ["path-key@3.1.1", "", {}, "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q=="], + + "picocolors": ["picocolors@1.1.1", "", {}, "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA=="], + + "postcss": ["postcss@8.5.13", "", { "dependencies": { "nanoid": "^3.3.11", "picocolors": "^1.1.1", "source-map-js": "^1.2.1" } }, "sha512-qif0+jGGZoLWdHey3UFHHWP0H7Gbmsk8T5VEqyYFbWqPr1XqvLGBbk/sl8V5exGmcYJklJOhOQq1pV9IcsiFag=="], + + "prelude-ls": ["prelude-ls@1.2.1", "", {}, 
"sha512-vkcDPrRZo1QZLbn5RLGPpg/WmIQ65qoWWhcGKf/b5eplkkarX0m9z8ppCat4mlOqUsWpyNuYgO3VRyrYHSzX5g=="], + + "propagate": ["propagate@2.0.1", "", {}, "sha512-vGrhOavPSTz4QVNuBNdcNXePNdNMaO1xj9yBeH1ScQPjk/rhg9sSlCXPhMkFuaNNW/syTvYqsnbIJxMBfRbbag=="], + + "punycode": ["punycode@2.3.1", "", {}, "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg=="], + + "sanitize-html": ["sanitize-html@2.17.3", "", { "dependencies": { "deepmerge": "^4.2.2", "escape-string-regexp": "^4.0.0", "htmlparser2": "^10.1.0", "is-plain-object": "^5.0.0", "parse-srcset": "^1.0.2", "postcss": "^8.3.11" } }, "sha512-Kn4srCAo2+wZyvCNKCSyB2g8RQ8IkX/gQs2uqoSRNu5t9I2qvUyAVvRDiFUVAiX3N3PNuwStY0eNr+ooBHVWEg=="], + + "shebang-command": ["shebang-command@2.0.0", "", { "dependencies": { "shebang-regex": "^3.0.0" } }, "sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA=="], + + "shebang-regex": ["shebang-regex@3.0.0", "", {}, "sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A=="], + + "source-map-js": ["source-map-js@1.2.1", "", {}, "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA=="], + + "strict-event-emitter": ["strict-event-emitter@0.5.1", "", {}, "sha512-vMgjE/GGEPEFnhFub6pa4FmJBRBVOLpIII2hvCZ8Kzb7K0hlHo7mQv6xYrBvCL2LtAIBwFUK8wvuJgTVSQ5MFQ=="], + + "type-check": ["type-check@0.4.0", "", { "dependencies": { "prelude-ls": "^1.2.1" } }, "sha512-XleUoc9uwGXqjWwXaUTZAmzMcFZ5858QA2vvx1Ur5xIcixXIP+8LnFDgRplU30us6teqdlskFfu+ae4K79Ooew=="], + + "uhyphen": ["uhyphen@0.2.0", "", {}, "sha512-qz3o9CHXmJJPGBdqzab7qAYuW8kQGKNEuoHFYrBwV6hWIMcpAmxDLXojcHfFr9US1Pe6zUswEIJIbLI610fuqA=="], + + "uri-js": ["uri-js@4.4.1", "", { "dependencies": { "punycode": "^2.1.0" } }, "sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg=="], + + "which": ["which@2.0.2", "", { "dependencies": { "isexe": "^2.0.0" }, "bin": { 
"node-which": "./bin/node-which" } }, "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA=="], + + "word-wrap": ["word-wrap@1.2.5", "", {}, "sha512-BN22B5eaMMI9UMtjrGd5g5eCYPpCPDUy0FJXbYsaT5zYxjFOckS53SQDE3pWkVoWpHXVb3BrYcEN4Twa55B5cA=="], + + "yocto-queue": ["yocto-queue@0.1.0", "", {}, "sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q=="], + + "@eslint-community/eslint-utils/eslint-visitor-keys": ["eslint-visitor-keys@3.4.3", "", {}, "sha512-wpc+LXeiyiisxPlEkUzU6svyS1frIO3Mgxj1fdy7Pm8Ygzguax2N3Fa/D/ag1WqbOprdI+uY6wMUl8/a2G+iag=="], + + "dom-serializer/entities": ["entities@4.5.0", "", {}, "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw=="], + } +} diff --git a/package.json b/package.json index 8dc8f20c..785c7a51 100644 --- a/package.json +++ b/package.json @@ -1,5 +1,5 @@ { - "version": "8.0.20", + "version": "8.1.0", "name": "@extractus/article-extractor", "description": "To extract main article from given URL", "homepage": "https://github.com/extractus/article-extractor", @@ -10,13 +10,6 @@ "author": "@extractus", "main": "./src/main.js", "type": "module", - "imports": { - "cross-fetch": "./src/deno/cross-fetch.js" - }, - "browser": { - "cross-fetch": "./src/deno/cross-fetch.js", - "linkedom": "./src/browser/linkedom.js" - }, "types": "./index.d.ts", "engines": { "node": ">= 20" @@ -31,18 +24,17 @@ }, "dependencies": { "@mozilla/readability": "^0.6.0", - "@ndaidong/bellajs": "^12.0.1", - "cross-fetch": "^4.1.0", + "@pwshub/bellajs": "^13.0.2", "linkedom": "^0.18.12", - "sanitize-html": "2.17.0" + "sanitize-html": "2.17.3" }, "devDependencies": { - "@eslint/js": "^9.34.0", - "@types/sanitize-html": "^2.16.0", - "eslint": "^9.34.0", - "globals": "^16.3.0", - "https-proxy-agent": "^7.0.6", - "nock": "^14.0.10" + "@eslint/js": "^10.0.1", + "@types/sanitize-html": "^2.16.1", + "eslint": "^10.3.0", + "globals": "^17.6.0", + 
"https-proxy-agent": "^9.0.0", + "nock": "^14.0.14" }, "keywords": [ "article", diff --git a/src/browser/linkedom.js b/src/browser/linkedom.js deleted file mode 100644 index 6d5be046..00000000 --- a/src/browser/linkedom.js +++ /dev/null @@ -1 +0,0 @@ -export const DOMParser = window.DOMParser diff --git a/src/config.js b/src/config.js index 50d5d75a..f3702fcd 100644 --- a/src/config.js +++ b/src/config.js @@ -1,6 +1,6 @@ // config.js -import { clone } from '@ndaidong/bellajs' +import { clone } from '@pwshub/bellajs' const sanitizeHtmlOptions = { allowedTags: [ diff --git a/src/deno/cross-fetch.js b/src/deno/cross-fetch.js deleted file mode 100644 index d084f98d..00000000 --- a/src/deno/cross-fetch.js +++ /dev/null @@ -1,2 +0,0 @@ -// cross-fetch.js -export default fetch diff --git a/src/main.js b/src/main.js index 7b65fba2..66b045dd 100644 --- a/src/main.js +++ b/src/main.js @@ -2,7 +2,7 @@ import { isString -} from '@ndaidong/bellajs' +} from '@pwshub/bellajs' import retrieve from './utils/retrieve.js' import parseFromHtml from './utils/parseFromHtml.js' diff --git a/src/utils/extractLdSchema.js b/src/utils/extractLdSchema.js index 0c082045..9d038db6 100644 --- a/src/utils/extractLdSchema.js +++ b/src/utils/extractLdSchema.js @@ -1,6 +1,6 @@ // utils -> extractLdSchema.js -import { isArray, isObject, isString } from '@ndaidong/bellajs' +import { isArray, isObject, isString } from '@pwshub/bellajs' const typeSchemas = [ 'aboutpage', diff --git a/src/utils/extractMetaData.test.js b/src/utils/extractMetaData.test.js index 1e2ec63f..00fec7c5 100644 --- a/src/utils/extractMetaData.test.js +++ b/src/utils/extractMetaData.test.js @@ -4,7 +4,7 @@ import assert from 'node:assert' import { readFileSync } from 'node:fs' -import { isObject, hasProperty } from '@ndaidong/bellajs' +import { isObject, hasProperty } from '@pwshub/bellajs' import extractMetaData from './extractMetaData.js' diff --git a/src/utils/extractWithReadability.js b/src/utils/extractWithReadability.js index 
0e6582e6..4575ac6d 100644 --- a/src/utils/extractWithReadability.js +++ b/src/utils/extractWithReadability.js @@ -2,7 +2,7 @@ import { Readability } from '@mozilla/readability' import { DOMParser } from 'linkedom' -import { isString } from '@ndaidong/bellajs' +import { isString } from '@pwshub/bellajs' export default (html, url = '') => { if (!isString(html)) { diff --git a/src/utils/extractWithReadability.test.js b/src/utils/extractWithReadability.test.js index bdcc64cb..80f4bdec 100644 --- a/src/utils/extractWithReadability.test.js +++ b/src/utils/extractWithReadability.test.js @@ -5,7 +5,7 @@ import assert from 'node:assert' import { readFileSync } from 'node:fs' -import { isString } from '@ndaidong/bellajs' +import { isString } from '@pwshub/bellajs' import extractWithReadability, { extractTitleWithReadability } from './extractWithReadability.js' diff --git a/src/utils/getTimeToRead.js b/src/utils/getTimeToRead.js deleted file mode 100644 index 7d8cef37..00000000 --- a/src/utils/getTimeToRead.js +++ /dev/null @@ -1,8 +0,0 @@ -// utils -> getTimeToRead - -export default (text, wordsPerMinute) => { - const words = text.trim().split(/\s+/g).length - const minToRead = words / wordsPerMinute - const secToRead = Math.ceil(minToRead * 60) - return secToRead -} diff --git a/src/utils/html.js b/src/utils/html.js index 25b23ba4..fe87f90e 100644 --- a/src/utils/html.js +++ b/src/utils/html.js @@ -2,7 +2,7 @@ import { DOMParser } from 'linkedom' import sanitize from 'sanitize-html' -import { pipe } from '@ndaidong/bellajs' +import { pipe } from '@pwshub/bellajs' import { getSanitizeHtmlOptions } from '../config.js' @@ -48,3 +48,9 @@ export const cleanify = (inputHtml) => { input => stripMultispaces(input) )(html) } + +export const countImages = (html) => { + const doc = new DOMParser().parseFromString(html, 'text/html') + const imgTags = doc.querySelectorAll('img') || [] + return imgTags.length +} diff --git a/src/utils/html.test.js b/src/utils/html.test.js index 
00fc2263..f60eb71b 100644 --- a/src/utils/html.test.js +++ b/src/utils/html.test.js @@ -4,7 +4,7 @@ import assert from 'node:assert' import { readFileSync } from 'node:fs' -import { isString } from '@ndaidong/bellajs' +import { isString } from '@pwshub/bellajs' import { cleanify diff --git a/src/utils/linker.test.js b/src/utils/linker.test.js index 6ed89d2e..89eac3cb 100644 --- a/src/utils/linker.test.js +++ b/src/utils/linker.test.js @@ -4,7 +4,7 @@ import assert from 'node:assert' import { readFileSync } from 'node:fs' -import { isString } from '@ndaidong/bellajs' +import { isString } from '@pwshub/bellajs' import { chooseBestUrl, diff --git a/src/utils/parseFromHtml.js b/src/utils/parseFromHtml.js index 406d0777..0c6a493d 100644 --- a/src/utils/parseFromHtml.js +++ b/src/utils/parseFromHtml.js @@ -1,8 +1,18 @@ // utils -> parseFromHtml -import { stripTags, truncate, unique, pipe } from '@ndaidong/bellajs' +import { + stripTags, + truncateByChar, + unique, + pipe, + getTTR +} from '@pwshub/bellajs' -import { purify, cleanify } from './html.js' +import { + purify, + cleanify, + countImages +} from './html.js' import { isValid as isValidUrl, @@ -21,12 +31,10 @@ import extractWithReadability, { import { execPreParser, execPostParser } from './transformation.js' -import getTimeToRead from './getTimeToRead.js' - -const summarize = (desc, txt, threshold, maxlen) => { // eslint-disable-line +const summarize = ({ desc, text, threshold, maxlen }) => { return desc.length > threshold ? 
desc - : truncate(txt, maxlen).replace(/\n/g, ' ') + : truncateByChar(text, maxlen).replace(/\n/g, ' ') } export default async (inputHtml, inputUrl = '', parserOptions = {}) => { @@ -106,16 +114,18 @@ export default async (inputHtml, inputUrl = '', parserOptions = {}) => { return null } - const description = summarize( - metaDesc, - textContent, - descriptionLengthThreshold, - descriptionTruncateLen - ) + const description = summarize({ + desc: metaDesc, + text: textContent, + threshold: descriptionLengthThreshold, + maxlen: descriptionTruncateLen, + }) const image = metaImg ? absolutifyUrl(bestUrl, metaImg) : '' const favicon = metaFav ? absolutifyUrl(bestUrl, metaFav) : '' + const imgcount = countImages(content) + return { url: bestUrl, title, @@ -127,7 +137,7 @@ export default async (inputHtml, inputUrl = '', parserOptions = {}) => { favicon, source: getDomain(bestUrl), published, - ttr: getTimeToRead(textContent, wordsPerMinute), + ttr: getTTR(textContent, imgcount, wordsPerMinute), type, } } diff --git a/src/utils/parseFromHtml.test.js b/src/utils/parseFromHtml.test.js index e9965038..54512ce4 100644 --- a/src/utils/parseFromHtml.test.js +++ b/src/utils/parseFromHtml.test.js @@ -4,7 +4,7 @@ import assert from 'node:assert' import { readFileSync } from 'node:fs' -import { isFunction } from '@ndaidong/bellajs' +import { isFunction } from '@pwshub/bellajs' import { extractFromHtml as parseFromHtml } from '../main.js' import { addTransformations } from './transformation.js' diff --git a/src/utils/retrieve.js b/src/utils/retrieve.js index 6922a752..18fbd2bc 100644 --- a/src/utils/retrieve.js +++ b/src/utils/retrieve.js @@ -1,7 +1,5 @@ // utils -> retrieve -import fetch from 'cross-fetch' - const profetch = async (url, options = {}) => { const { proxy = {}, signal = null } = options const { diff --git a/src/utils/similarity.js b/src/utils/similarity.js index 7e291cbc..39d4cef5 100644 --- a/src/utils/similarity.js +++ b/src/utils/similarity.js @@ -1,46 +1,16 @@ // 
similarity.js -// https://github.com/aceakash/string-similarity -import { isArray, isString } from '@ndaidong/bellajs' +import { + isString, + compareTwoStrings, + isArray +} from '@pwshub/bellajs' const areArgsValid = (mainString, targetStrings) => { return isString(mainString) && isArray(targetStrings) && targetStrings.length > 0 && targetStrings.every(s => isString(s)) } -export const compareTwoStrings = (first, second) => { - first = first.replace(/\s+/g, '') - second = second.replace(/\s+/g, '') - - if (first === second) return 1 // identical or empty - if (first.length < 2 || second.length < 2) return 0 // if either is a 0-letter or 1-letter string - - let firstBigrams = new Map() - for (let i = 0; i < first.length - 1; i++) { - const bigram = first.substring(i, i + 2) - const count = firstBigrams.has(bigram) - ? firstBigrams.get(bigram) + 1 - : 1 - - firstBigrams.set(bigram, count) - } - - let intersectionSize = 0 - for (let i = 0; i < second.length - 1; i++) { - const bigram = second.substring(i, i + 2) - const count = firstBigrams.has(bigram) - ? 
firstBigrams.get(bigram) - : 0 - - if (count > 0) { - firstBigrams.set(bigram, count - 1) - intersectionSize++ - } - } - - return (2.0 * intersectionSize) / (first.length + second.length - 2) -} - export const findBestMatch = (mainString, targetStrings) => { if (!areArgsValid(mainString, targetStrings)) { throw new Error('Bad arguments: First argument should be a string, second should be an array of strings') diff --git a/src/utils/transformation.js b/src/utils/transformation.js index 5bb872b1..ea091389 100644 --- a/src/utils/transformation.js +++ b/src/utils/transformation.js @@ -1,6 +1,6 @@ // utils --> transformation.js -import { isArray, isFunction, clone } from '@ndaidong/bellajs' +import { isArray, isFunction } from '@pwshub/bellajs' import { DOMParser } from 'linkedom' const transformations = [] @@ -40,7 +40,7 @@ export const removeTransformations = (patterns) => { } export const getTransformations = () => { - return clone(transformations) + return [...transformations] } export const findTransformations = (links) => { @@ -50,7 +50,9 @@ export const findTransformations = (links) => { const { patterns } = transformation const matched = urls.some((url) => patterns.some((pattern) => pattern.test(url))) if (matched) { - tfms.push(clone(transformation)) + tfms.push({ + ...transformation, + }) } } return tfms From 45955c30fbcaf08a5edf5056140380500498a8cb Mon Sep 17 00:00:00 2001 From: Dong Nguyen Date: Sun, 3 May 2026 19:28:14 +0700 Subject: [PATCH 2/6] Update .gitignore --- .gitignore | 2 +- bun.lock | 242 ----------------------------------------------------- 2 files changed, 1 insertion(+), 243 deletions(-) delete mode 100644 bun.lock diff --git a/.gitignore b/.gitignore index a22d411c..971acbc7 100644 --- a/.gitignore +++ b/.gitignore @@ -16,7 +16,7 @@ yarn.lock coverage.lcov pnpm-lock.yaml lcov.info - +bun.lock deno.lock evaluation diff --git a/bun.lock b/bun.lock deleted file mode 100644 index a168741b..00000000 --- a/bun.lock +++ /dev/null @@ -1,242 +0,0 @@ 
-{ - "lockfileVersion": 1, - "configVersion": 1, - "workspaces": { - "": { - "name": "@extractus/article-extractor", - "dependencies": { - "@mozilla/readability": "^0.6.0", - "@pwshub/bellajs": "^13.0.2", - "linkedom": "^0.18.12", - "sanitize-html": "2.17.3", - }, - "devDependencies": { - "@eslint/js": "^10.0.1", - "@types/sanitize-html": "^2.16.1", - "eslint": "^10.3.0", - "globals": "^17.6.0", - "https-proxy-agent": "^9.0.0", - "nock": "^14.0.14", - }, - }, - }, - "packages": { - "@eslint-community/eslint-utils": ["@eslint-community/eslint-utils@4.9.1", "", { "dependencies": { "eslint-visitor-keys": "^3.4.3" }, "peerDependencies": { "eslint": "^6.0.0 || ^7.0.0 || >=8.0.0" } }, "sha512-phrYmNiYppR7znFEdqgfWHXR6NCkZEK7hwWDHZUjit/2/U0r6XvkDl0SYnoM51Hq7FhCGdLDT6zxCCOY1hexsQ=="], - - "@eslint-community/regexpp": ["@eslint-community/regexpp@4.12.2", "", {}, "sha512-EriSTlt5OC9/7SXkRSCAhfSxxoSUgBm33OH+IkwbdpgoqsSsUg7y3uh+IICI/Qg4BBWr3U2i39RpmycbxMq4ew=="], - - "@eslint/config-array": ["@eslint/config-array@0.23.5", "", { "dependencies": { "@eslint/object-schema": "^3.0.5", "debug": "^4.3.1", "minimatch": "^10.2.4" } }, "sha512-Y3kKLvC1dvTOT+oGlqNQ1XLqK6D1HU2YXPc52NmAlJZbMMWDzGYXMiPRJ8TYD39muD/OTjlZmNJ4ib7dvSrMBA=="], - - "@eslint/config-helpers": ["@eslint/config-helpers@0.5.5", "", { "dependencies": { "@eslint/core": "^1.2.1" } }, "sha512-eIJYKTCECbP/nsKaaruF6LW967mtbQbsw4JTtSVkUQc9MneSkbrgPJAbKl9nWr0ZeowV8BfsarBmPpBzGelA2w=="], - - "@eslint/core": ["@eslint/core@1.2.1", "", { "dependencies": { "@types/json-schema": "^7.0.15" } }, "sha512-MwcE1P+AZ4C6DWlpin/OmOA54mmIZ/+xZuJiQd4SyB29oAJjN30UW9wkKNptW2ctp4cEsvhlLY/CsQ1uoHDloQ=="], - - "@eslint/js": ["@eslint/js@10.0.1", "", { "peerDependencies": { "eslint": "^10.0.0" }, "optionalPeers": ["eslint"] }, "sha512-zeR9k5pd4gxjZ0abRoIaxdc7I3nDktoXZk2qOv9gCNWx3mVwEn32VRhyLaRsDiJjTs0xq/T8mfPtyuXu7GWBcA=="], - - "@eslint/object-schema": ["@eslint/object-schema@3.0.5", "", {}, 
"sha512-vqTaUEgxzm+YDSdElad6PiRoX4t8VGDjCtt05zn4nU810UIx/uNEV7/lZJ6KwFThKZOzOxzXy48da+No7HZaMw=="], - - "@eslint/plugin-kit": ["@eslint/plugin-kit@0.7.1", "", { "dependencies": { "@eslint/core": "^1.2.1", "levn": "^0.4.1" } }, "sha512-rZAP3aVgB9ds9KOeUSL+zZ21hPmo8dh6fnIFwRQj5EAZl9gzR7wxYbYXYysAM8CTqGmUGyp2S4kUdV17MnGuWQ=="], - - "@humanfs/core": ["@humanfs/core@0.19.2", "", { "dependencies": { "@humanfs/types": "^0.15.0" } }, "sha512-UhXNm+CFMWcbChXywFwkmhqjs3PRCmcSa/hfBgLIb7oQ5HNb1wS0icWsGtSAUNgefHeI+eBrA8I1fxmbHsGdvA=="], - - "@humanfs/node": ["@humanfs/node@0.16.8", "", { "dependencies": { "@humanfs/core": "^0.19.2", "@humanfs/types": "^0.15.0", "@humanwhocodes/retry": "^0.4.0" } }, "sha512-gE1eQNZ3R++kTzFUpdGlpmy8kDZD/MLyHqDwqjkVQI0JMdI1D51sy1H958PNXYkM2rAac7e5/CnIKZrHtPh3BQ=="], - - "@humanfs/types": ["@humanfs/types@0.15.0", "", {}, "sha512-ZZ1w0aoQkwuUuC7Yf+7sdeaNfqQiiLcSRbfI08oAxqLtpXQr9AIVX7Ay7HLDuiLYAaFPu8oBYNq/QIi9URHJ3Q=="], - - "@humanwhocodes/module-importer": ["@humanwhocodes/module-importer@1.0.1", "", {}, "sha512-bxveV4V8v5Yb4ncFTT3rPSgZBOpCkjfK0y4oVVVJwIuDVBRMDXrPyXRL988i5ap9m9bnyEEjWfm5WkBmtffLfA=="], - - "@humanwhocodes/retry": ["@humanwhocodes/retry@0.4.3", "", {}, "sha512-bV0Tgo9K4hfPCek+aMAn81RppFKv2ySDQeMoSZuvTASywNTnVJCArCZE2FWqpvIatKu7VMRLWlR1EazvVhDyhQ=="], - - "@mozilla/readability": ["@mozilla/readability@0.6.0", "", {}, "sha512-juG5VWh4qAivzTAeMzvY9xs9HY5rAcr2E4I7tiSSCokRFi7XIZCAu92ZkSTsIj1OPceCifL3cpfteP3pDT9/QQ=="], - - "@mswjs/interceptors": ["@mswjs/interceptors@0.41.8", "", { "dependencies": { "@open-draft/deferred-promise": "^2.2.0", "@open-draft/logger": "^0.3.0", "@open-draft/until": "^2.0.0", "is-node-process": "^1.2.0", "outvariant": "^1.4.3", "strict-event-emitter": "^0.5.1" } }, "sha512-pRLMNKTSGRoLq+KnEB/7OY5vijw1XmcheAAOiv6pj7W1FG32kAGqj1C/RK/cqxRGr1Fh+zBi8sDur8kj3EQv6A=="], - - "@open-draft/deferred-promise": ["@open-draft/deferred-promise@2.2.0", "", {}, 
"sha512-CecwLWx3rhxVQF6V4bAgPS5t+So2sTbPgAzafKkVizyi7tlwpcFpdFqq+wqF2OwNBmqFuu6tOyouTuxgpMfzmA=="], - - "@open-draft/logger": ["@open-draft/logger@0.3.0", "", { "dependencies": { "is-node-process": "^1.2.0", "outvariant": "^1.4.0" } }, "sha512-X2g45fzhxH238HKO4xbSr7+wBS8Fvw6ixhTDuvLd5mqh6bJJCFAPwU9mPDxbcrRtfxv4u5IHCEH77BmxvXmmxQ=="], - - "@open-draft/until": ["@open-draft/until@2.1.0", "", {}, "sha512-U69T3ItWHvLwGg5eJ0n3I62nWuE6ilHlmz7zM0npLBRvPRd7e6NYmg54vvRtP5mZG7kZqZCFVdsTWo7BPtBujg=="], - - "@pwshub/bellajs": ["@pwshub/bellajs@13.0.2", "", {}, "sha512-OWOaE7Ieufo7VMv957iahOzv/w5oXyyXO28Jh2x6rLU96ZLqC508kAKTgdMJRXgVjda2s+/0C6Kno+k16OmJsQ=="], - - "@types/esrecurse": ["@types/esrecurse@4.3.1", "", {}, "sha512-xJBAbDifo5hpffDBuHl0Y8ywswbiAp/Wi7Y/GtAgSlZyIABppyurxVueOPE8LUQOxdlgi6Zqce7uoEpqNTeiUw=="], - - "@types/estree": ["@types/estree@1.0.8", "", {}, "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w=="], - - "@types/json-schema": ["@types/json-schema@7.0.15", "", {}, "sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA=="], - - "@types/sanitize-html": ["@types/sanitize-html@2.16.1", "", { "dependencies": { "htmlparser2": "^10.1" } }, "sha512-n9wjs8bCOTyN/ynwD8s/nTcTreIHB1vf31vhLMGqUPNHaweKC4/fAl4Dj+hUlCTKYgm4P3k83fmiFfzkZ6sgMA=="], - - "acorn": ["acorn@8.16.0", "", { "bin": { "acorn": "bin/acorn" } }, "sha512-UVJyE9MttOsBQIDKw1skb9nAwQuR5wuGD3+82K6JgJlm/Y+KI92oNsMNGZCYdDsVtRHSak0pcV5Dno5+4jh9sw=="], - - "acorn-jsx": ["acorn-jsx@5.3.2", "", { "peerDependencies": { "acorn": "^6.0.0 || ^7.0.0 || ^8.0.0" } }, "sha512-rq9s+JNhf0IChjtDXxllJ7g41oZk5SlXtp0LHwyA5cejwn7vKmKp4pPri6YEePv2PU65sAsegbXtIinmDFDXgQ=="], - - "agent-base": ["agent-base@9.0.0", "", {}, "sha512-TQf59BsZnytt8GdJKLPfUZ54g/iaUL2OWDSFCCvMOhsHduDQxO8xC4PNeyIkVcA5KwL2phPSv0douC0fgWzmnA=="], - - "ajv": ["ajv@6.15.0", "", { "dependencies": { "fast-deep-equal": "^3.1.1", "fast-json-stable-stringify": "^2.0.0", 
"json-schema-traverse": "^0.4.1", "uri-js": "^4.2.2" } }, "sha512-fgFx7Hfoq60ytK2c7DhnF8jIvzYgOMxfugjLOSMHjLIPgenqa7S7oaagATUq99mV6IYvN2tRmC0wnTYX6iPbMw=="], - - "balanced-match": ["balanced-match@4.0.4", "", {}, "sha512-BLrgEcRTwX2o6gGxGOCNyMvGSp35YofuYzw9h1IMTRmKqttAZZVU67bdb9Pr2vUHA8+j3i2tJfjO6C6+4myGTA=="], - - "boolbase": ["boolbase@1.0.0", "", {}, "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww=="], - - "brace-expansion": ["brace-expansion@5.0.5", "", { "dependencies": { "balanced-match": "^4.0.2" } }, "sha512-VZznLgtwhn+Mact9tfiwx64fA9erHH/MCXEUfB/0bX/6Fz6ny5EGTXYltMocqg4xFAQZtnO3DHWWXi8RiuN7cQ=="], - - "cross-spawn": ["cross-spawn@7.0.6", "", { "dependencies": { "path-key": "^3.1.0", "shebang-command": "^2.0.0", "which": "^2.0.1" } }, "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA=="], - - "css-select": ["css-select@5.2.2", "", { "dependencies": { "boolbase": "^1.0.0", "css-what": "^6.1.0", "domhandler": "^5.0.2", "domutils": "^3.0.1", "nth-check": "^2.0.1" } }, "sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw=="], - - "css-what": ["css-what@6.2.2", "", {}, "sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA=="], - - "cssom": ["cssom@0.5.0", "", {}, "sha512-iKuQcq+NdHqlAcwUY0o/HL69XQrUaQdMjmStJ8JFmUaiiQErlhrmuigkg/CU4E2J0IyUKUrMAgl36TvN67MqTw=="], - - "debug": ["debug@4.4.3", "", { "dependencies": { "ms": "^2.1.3" } }, "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA=="], - - "deep-is": ["deep-is@0.1.4", "", {}, "sha512-oIPzksmTg4/MriiaYGO+okXDT7ztn/w3Eptv/+gSIdMdKsJo0u4CfYNFJPy+4SKMuCqGw2wxnA+URMg3t8a/bQ=="], - - "deepmerge": ["deepmerge@4.3.1", "", {}, "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A=="], - - "dom-serializer": ["dom-serializer@2.0.0", "", { "dependencies": { 
"domelementtype": "^2.3.0", "domhandler": "^5.0.2", "entities": "^4.2.0" } }, "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg=="], - - "domelementtype": ["domelementtype@2.3.0", "", {}, "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw=="], - - "domhandler": ["domhandler@5.0.3", "", { "dependencies": { "domelementtype": "^2.3.0" } }, "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w=="], - - "domutils": ["domutils@3.2.2", "", { "dependencies": { "dom-serializer": "^2.0.0", "domelementtype": "^2.3.0", "domhandler": "^5.0.3" } }, "sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw=="], - - "entities": ["entities@7.0.1", "", {}, "sha512-TWrgLOFUQTH994YUyl1yT4uyavY5nNB5muff+RtWaqNVCAK408b5ZnnbNAUEWLTCpum9w6arT70i1XdQ4UeOPA=="], - - "escape-string-regexp": ["escape-string-regexp@4.0.0", "", {}, "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA=="], - - "eslint": ["eslint@10.3.0", "", { "dependencies": { "@eslint-community/eslint-utils": "^4.8.0", "@eslint-community/regexpp": "^4.12.2", "@eslint/config-array": "^0.23.5", "@eslint/config-helpers": "^0.5.5", "@eslint/core": "^1.2.1", "@eslint/plugin-kit": "^0.7.1", "@humanfs/node": "^0.16.6", "@humanwhocodes/module-importer": "^1.0.1", "@humanwhocodes/retry": "^0.4.2", "@types/estree": "^1.0.6", "ajv": "^6.14.0", "cross-spawn": "^7.0.6", "debug": "^4.3.2", "escape-string-regexp": "^4.0.0", "eslint-scope": "^9.1.2", "eslint-visitor-keys": "^5.0.1", "espree": "^11.2.0", "esquery": "^1.7.0", "esutils": "^2.0.2", "fast-deep-equal": "^3.1.3", "file-entry-cache": "^8.0.0", "find-up": "^5.0.0", "glob-parent": "^6.0.2", "ignore": "^5.2.0", "imurmurhash": "^0.1.4", "is-glob": "^4.0.0", "json-stable-stringify-without-jsonify": "^1.0.1", "minimatch": "^10.2.4", "natural-compare": "^1.4.0", "optionator": "^0.9.3" 
}, "peerDependencies": { "jiti": "*" }, "optionalPeers": ["jiti"], "bin": { "eslint": "bin/eslint.js" } }, "sha512-XbEXaRva5cF0ZQB8w6MluHA0kZZfV2DuCMJ3ozyEOHLwDpZX2Lmm/7Pp0xdJmI0GL1W05VH5VwIFHEm1Vcw2gw=="], - - "eslint-scope": ["eslint-scope@9.1.2", "", { "dependencies": { "@types/esrecurse": "^4.3.1", "@types/estree": "^1.0.8", "esrecurse": "^4.3.0", "estraverse": "^5.2.0" } }, "sha512-xS90H51cKw0jltxmvmHy2Iai1LIqrfbw57b79w/J7MfvDfkIkFZ+kj6zC3BjtUwh150HsSSdxXZcsuv72miDFQ=="], - - "eslint-visitor-keys": ["eslint-visitor-keys@5.0.1", "", {}, "sha512-tD40eHxA35h0PEIZNeIjkHoDR4YjjJp34biM0mDvplBe//mB+IHCqHDGV7pxF+7MklTvighcCPPZC7ynWyjdTA=="], - - "espree": ["espree@11.2.0", "", { "dependencies": { "acorn": "^8.16.0", "acorn-jsx": "^5.3.2", "eslint-visitor-keys": "^5.0.1" } }, "sha512-7p3DrVEIopW1B1avAGLuCSh1jubc01H2JHc8B4qqGblmg5gI9yumBgACjWo4JlIc04ufug4xJ3SQI8HkS/Rgzw=="], - - "esquery": ["esquery@1.7.0", "", { "dependencies": { "estraverse": "^5.1.0" } }, "sha512-Ap6G0WQwcU/LHsvLwON1fAQX9Zp0A2Y6Y/cJBl9r/JbW90Zyg4/zbG6zzKa2OTALELarYHmKu0GhpM5EO+7T0g=="], - - "esrecurse": ["esrecurse@4.3.0", "", { "dependencies": { "estraverse": "^5.2.0" } }, "sha512-KmfKL3b6G+RXvP8N1vr3Tq1kL/oCFgn2NYXEtqP8/L3pKapUA4G8cFVaoF3SU323CD4XypR/ffioHmkti6/Tag=="], - - "estraverse": ["estraverse@5.3.0", "", {}, "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA=="], - - "esutils": ["esutils@2.0.3", "", {}, "sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g=="], - - "fast-deep-equal": ["fast-deep-equal@3.1.3", "", {}, "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q=="], - - "fast-json-stable-stringify": ["fast-json-stable-stringify@2.1.0", "", {}, "sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw=="], - - "fast-levenshtein": ["fast-levenshtein@2.0.6", "", {}, 
"sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw=="], - - "file-entry-cache": ["file-entry-cache@8.0.0", "", { "dependencies": { "flat-cache": "^4.0.0" } }, "sha512-XXTUwCvisa5oacNGRP9SfNtYBNAMi+RPwBFmblZEF7N7swHYQS6/Zfk7SRwx4D5j3CH211YNRco1DEMNVfZCnQ=="], - - "find-up": ["find-up@5.0.0", "", { "dependencies": { "locate-path": "^6.0.0", "path-exists": "^4.0.0" } }, "sha512-78/PXT1wlLLDgTzDs7sjq9hzz0vXD+zn+7wypEe4fXQxCmdmqfGsEPQxmiCSQI3ajFV91bVSsvNtrJRiW6nGng=="], - - "flat-cache": ["flat-cache@4.0.1", "", { "dependencies": { "flatted": "^3.2.9", "keyv": "^4.5.4" } }, "sha512-f7ccFPK3SXFHpx15UIGyRJ/FJQctuKZ0zVuN3frBo4HnK3cay9VEW0R6yPYFHC0AgqhukPzKjq22t5DmAyqGyw=="], - - "flatted": ["flatted@3.4.2", "", {}, "sha512-PjDse7RzhcPkIJwy5t7KPWQSZ9cAbzQXcafsetQoD7sOJRQlGikNbx7yZp2OotDnJyrDcbyRq3Ttb18iYOqkxA=="], - - "glob-parent": ["glob-parent@6.0.2", "", { "dependencies": { "is-glob": "^4.0.3" } }, "sha512-XxwI8EOhVQgWp6iDL+3b0r86f4d6AX6zSU55HfB4ydCEuXLXc5FcYeOu+nnGftS4TEju/11rt4KJPTMgbfmv4A=="], - - "globals": ["globals@17.6.0", "", {}, "sha512-sepffkT8stwnIYbsMBpoCHJuJM5l98FUF2AnE07hfvE0m/qp3R586hw4jF4uadbhvg1ooIdzuu7CsfD2jzCaNA=="], - - "html-escaper": ["html-escaper@3.0.3", "", {}, "sha512-RuMffC89BOWQoY0WKGpIhn5gX3iI54O6nRA0yC124NYVtzjmFWBIiFd8M0x+ZdX0P9R4lADg1mgP8C7PxGOWuQ=="], - - "htmlparser2": ["htmlparser2@10.1.0", "", { "dependencies": { "domelementtype": "^2.3.0", "domhandler": "^5.0.3", "domutils": "^3.2.2", "entities": "^7.0.1" } }, "sha512-VTZkM9GWRAtEpveh7MSF6SjjrpNVNNVJfFup7xTY3UpFtm67foy9HDVXneLtFVt4pMz5kZtgNcvCniNFb1hlEQ=="], - - "https-proxy-agent": ["https-proxy-agent@9.0.0", "", { "dependencies": { "agent-base": "9.0.0", "debug": "^4.3.4" } }, "sha512-/MVmHp58WkOypgFhCLk4fzpPcFQvTJ/e6LBI7irpIO2HfxUbpmYoHF+KzipzJpxxzJu7aJNWQ0xojJ/dzV2G5g=="], - - "ignore": ["ignore@5.3.2", "", {}, "sha512-hsBTNUqQTDwkWtcdYI2i06Y/nUBEsNEDJKjWdigLvegy8kDuJAS8uRlpkkcQpyEXL0Z/pjDy5HBmMjRCJ2gq+g=="], - - "imurmurhash": 
["imurmurhash@0.1.4", "", {}, "sha512-JmXMZ6wuvDmLiHEml9ykzqO6lwFbof0GG4IkcGaENdCRDDmMVnny7s5HsIgHCbaq0w2MyPhDqkhTUgS2LU2PHA=="], - - "is-extglob": ["is-extglob@2.1.1", "", {}, "sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ=="], - - "is-glob": ["is-glob@4.0.3", "", { "dependencies": { "is-extglob": "^2.1.1" } }, "sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg=="], - - "is-node-process": ["is-node-process@1.2.0", "", {}, "sha512-Vg4o6/fqPxIjtxgUH5QLJhwZ7gW5diGCVlXpuUfELC62CuxM1iHcRe51f2W1FDy04Ai4KJkagKjx3XaqyfRKXw=="], - - "is-plain-object": ["is-plain-object@5.0.0", "", {}, "sha512-VRSzKkbMm5jMDoKLbltAkFQ5Qr7VDiTFGXxYFXXowVj387GeGNOCsOH6Msy00SGZ3Fp84b1Naa1psqgcCIEP5Q=="], - - "isexe": ["isexe@2.0.0", "", {}, "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw=="], - - "json-buffer": ["json-buffer@3.0.1", "", {}, "sha512-4bV5BfR2mqfQTJm+V5tPPdf+ZpuhiIvTuAB5g8kcrXOZpTT/QwwVRWBywX1ozr6lEuPdbHxwaJlm9G6mI2sfSQ=="], - - "json-schema-traverse": ["json-schema-traverse@0.4.1", "", {}, "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg=="], - - "json-stable-stringify-without-jsonify": ["json-stable-stringify-without-jsonify@1.0.1", "", {}, "sha512-Bdboy+l7tA3OGW6FjyFHWkP5LuByj1Tk33Ljyq0axyzdk9//JSi2u3fP1QSmd1KNwq6VOKYGlAu87CisVir6Pw=="], - - "json-stringify-safe": ["json-stringify-safe@5.0.1", "", {}, "sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA=="], - - "keyv": ["keyv@4.5.4", "", { "dependencies": { "json-buffer": "3.0.1" } }, "sha512-oxVHkHR/EJf2CNXnWxRLW6mg7JyCCUcG0DtEGmL2ctUo1PNTin1PUil+r/+4r5MpVgC/fn1kjsx7mjSujKqIpw=="], - - "levn": ["levn@0.4.1", "", { "dependencies": { "prelude-ls": "^1.2.1", "type-check": "~0.4.0" } }, "sha512-+bT2uH4E5LGE7h/n3evcS/sQlJXCpIp6ym8OWJ5eV6+67Dsql/LaaT7qJBAt2rzfoa/5QBGBhxDix1dMt2kQKQ=="], - - "linkedom": 
["linkedom@0.18.12", "", { "dependencies": { "css-select": "^5.1.0", "cssom": "^0.5.0", "html-escaper": "^3.0.3", "htmlparser2": "^10.0.0", "uhyphen": "^0.2.0" }, "peerDependencies": { "canvas": ">= 2" }, "optionalPeers": ["canvas"] }, "sha512-jalJsOwIKuQJSeTvsgzPe9iJzyfVaEJiEXl+25EkKevsULHvMJzpNqwvj1jOESWdmgKDiXObyjOYwlUqG7wo1Q=="], - - "locate-path": ["locate-path@6.0.0", "", { "dependencies": { "p-locate": "^5.0.0" } }, "sha512-iPZK6eYjbxRu3uB4/WZ3EsEIMJFMqAoopl3R+zuq0UjcAm/MO6KCweDgPfP3elTztoKP3KtnVHxTn2NHBSDVUw=="], - - "minimatch": ["minimatch@10.2.5", "", { "dependencies": { "brace-expansion": "^5.0.5" } }, "sha512-MULkVLfKGYDFYejP07QOurDLLQpcjk7Fw+7jXS2R2czRQzR56yHRveU5NDJEOviH+hETZKSkIk5c+T23GjFUMg=="], - - "ms": ["ms@2.1.3", "", {}, "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="], - - "nanoid": ["nanoid@3.3.12", "", { "bin": { "nanoid": "bin/nanoid.cjs" } }, "sha512-ZB9RH/39qpq5Vu6Y+NmUaFhQR6pp+M2Xt76XBnEwDaGcVAqhlvxrl3B2bKS5D3NH3QR76v3aSrKaF/Kiy7lEtQ=="], - - "natural-compare": ["natural-compare@1.4.0", "", {}, "sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw=="], - - "nock": ["nock@14.0.14", "", { "dependencies": { "@mswjs/interceptors": "^0.41.0", "json-stringify-safe": "^5.0.1", "propagate": "^2.0.0" } }, "sha512-PKk7tex0O3RRXUZC5XDKJ9yM3rYRPS13myduT85VIIYDBnib42Fpxoe6KxRSzqB4iL2NDxkcJ2yiskZ18hGLEQ=="], - - "nth-check": ["nth-check@2.1.1", "", { "dependencies": { "boolbase": "^1.0.0" } }, "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w=="], - - "optionator": ["optionator@0.9.4", "", { "dependencies": { "deep-is": "^0.1.3", "fast-levenshtein": "^2.0.6", "levn": "^0.4.1", "prelude-ls": "^1.2.1", "type-check": "^0.4.0", "word-wrap": "^1.2.5" } }, "sha512-6IpQ7mKUxRcZNLIObR0hz7lxsapSSIYNZJwXPGeF0mTVqGKFIXj1DQcMoT22S3ROcLyY/rz0PWaWZ9ayWmad9g=="], - - "outvariant": ["outvariant@1.4.3", "", {}, 
"sha512-+Sl2UErvtsoajRDKCE5/dBz4DIvHXQQnAxtQTF04OJxY0+DyZXSo5P5Bb7XYWOh81syohlYL24hbDwxedPUJCA=="], - - "p-limit": ["p-limit@3.1.0", "", { "dependencies": { "yocto-queue": "^0.1.0" } }, "sha512-TYOanM3wGwNGsZN2cVTYPArw454xnXj5qmWF1bEoAc4+cU/ol7GVh7odevjp1FNHduHc3KZMcFduxU5Xc6uJRQ=="], - - "p-locate": ["p-locate@5.0.0", "", { "dependencies": { "p-limit": "^3.0.2" } }, "sha512-LaNjtRWUBY++zB5nE/NwcaoMylSPk+S+ZHNB1TzdbMJMny6dynpAGt7X/tl/QYq3TIeE6nxHppbo2LGymrG5Pw=="], - - "parse-srcset": ["parse-srcset@1.0.2", "", {}, "sha512-/2qh0lav6CmI15FzA3i/2Bzk2zCgQhGMkvhOhKNcBVQ1ldgpbfiNTVslmooUmWJcADi1f1kIeynbDRVzNlfR6Q=="], - - "path-exists": ["path-exists@4.0.0", "", {}, "sha512-ak9Qy5Q7jYb2Wwcey5Fpvg2KoAc/ZIhLSLOSBmRmygPsGwkVVt0fZa0qrtMz+m6tJTAHfZQ8FnmB4MG4LWy7/w=="], - - "path-key": ["path-key@3.1.1", "", {}, "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q=="], - - "picocolors": ["picocolors@1.1.1", "", {}, "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA=="], - - "postcss": ["postcss@8.5.13", "", { "dependencies": { "nanoid": "^3.3.11", "picocolors": "^1.1.1", "source-map-js": "^1.2.1" } }, "sha512-qif0+jGGZoLWdHey3UFHHWP0H7Gbmsk8T5VEqyYFbWqPr1XqvLGBbk/sl8V5exGmcYJklJOhOQq1pV9IcsiFag=="], - - "prelude-ls": ["prelude-ls@1.2.1", "", {}, "sha512-vkcDPrRZo1QZLbn5RLGPpg/WmIQ65qoWWhcGKf/b5eplkkarX0m9z8ppCat4mlOqUsWpyNuYgO3VRyrYHSzX5g=="], - - "propagate": ["propagate@2.0.1", "", {}, "sha512-vGrhOavPSTz4QVNuBNdcNXePNdNMaO1xj9yBeH1ScQPjk/rhg9sSlCXPhMkFuaNNW/syTvYqsnbIJxMBfRbbag=="], - - "punycode": ["punycode@2.3.1", "", {}, "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg=="], - - "sanitize-html": ["sanitize-html@2.17.3", "", { "dependencies": { "deepmerge": "^4.2.2", "escape-string-regexp": "^4.0.0", "htmlparser2": "^10.1.0", "is-plain-object": "^5.0.0", "parse-srcset": "^1.0.2", "postcss": "^8.3.11" } }, 
"sha512-Kn4srCAo2+wZyvCNKCSyB2g8RQ8IkX/gQs2uqoSRNu5t9I2qvUyAVvRDiFUVAiX3N3PNuwStY0eNr+ooBHVWEg=="], - - "shebang-command": ["shebang-command@2.0.0", "", { "dependencies": { "shebang-regex": "^3.0.0" } }, "sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA=="], - - "shebang-regex": ["shebang-regex@3.0.0", "", {}, "sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A=="], - - "source-map-js": ["source-map-js@1.2.1", "", {}, "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA=="], - - "strict-event-emitter": ["strict-event-emitter@0.5.1", "", {}, "sha512-vMgjE/GGEPEFnhFub6pa4FmJBRBVOLpIII2hvCZ8Kzb7K0hlHo7mQv6xYrBvCL2LtAIBwFUK8wvuJgTVSQ5MFQ=="], - - "type-check": ["type-check@0.4.0", "", { "dependencies": { "prelude-ls": "^1.2.1" } }, "sha512-XleUoc9uwGXqjWwXaUTZAmzMcFZ5858QA2vvx1Ur5xIcixXIP+8LnFDgRplU30us6teqdlskFfu+ae4K79Ooew=="], - - "uhyphen": ["uhyphen@0.2.0", "", {}, "sha512-qz3o9CHXmJJPGBdqzab7qAYuW8kQGKNEuoHFYrBwV6hWIMcpAmxDLXojcHfFr9US1Pe6zUswEIJIbLI610fuqA=="], - - "uri-js": ["uri-js@4.4.1", "", { "dependencies": { "punycode": "^2.1.0" } }, "sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg=="], - - "which": ["which@2.0.2", "", { "dependencies": { "isexe": "^2.0.0" }, "bin": { "node-which": "./bin/node-which" } }, "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA=="], - - "word-wrap": ["word-wrap@1.2.5", "", {}, "sha512-BN22B5eaMMI9UMtjrGd5g5eCYPpCPDUy0FJXbYsaT5zYxjFOckS53SQDE3pWkVoWpHXVb3BrYcEN4Twa55B5cA=="], - - "yocto-queue": ["yocto-queue@0.1.0", "", {}, "sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q=="], - - "@eslint-community/eslint-utils/eslint-visitor-keys": ["eslint-visitor-keys@3.4.3", "", {}, 
"sha512-wpc+LXeiyiisxPlEkUzU6svyS1frIO3Mgxj1fdy7Pm8Ygzguax2N3Fa/D/ag1WqbOprdI+uY6wMUl8/a2G+iag=="], - - "dom-serializer/entities": ["entities@4.5.0", "", {}, "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw=="], - } -} From 00be40fc76fd19c3f7ba2fccb2a64ffc87353cf8 Mon Sep 17 00:00:00 2001 From: Dong Nguyen Date: Sun, 3 May 2026 19:43:46 +0700 Subject: [PATCH 3/6] Add JSDocs --- index.d.ts | 176 ++++++++++++++++++---------- src/config.js | 18 ++- src/main.js | 20 +++- src/utils/extractLdSchema.js | 28 ++++- src/utils/extractWithReadability.js | 13 ++ src/utils/findDate.js | 4 +- src/utils/html.js | 44 +++++++ src/utils/linker.js | 52 ++++++++ src/utils/parseFromHtml.js | 20 ++++ src/utils/retrieve.js | 19 +++ src/utils/similarity.js | 14 +++ src/utils/transformation.js | 57 ++++++++- 12 files changed, 391 insertions(+), 74 deletions(-) diff --git a/index.d.ts b/index.d.ts index 3a495780..9cb82521 100644 --- a/index.d.ts +++ b/index.d.ts @@ -1,90 +1,140 @@ // Type definitions -import { IOptions as SanitizeOptions } from "sanitize-html"; +import { IOptions as SanitizeOptions } from 'sanitize-html' +/** + * Transformation for per-site HTML pre/post processing. + */ export interface Transformation { - patterns: Array, + /** URL regex patterns to match */ + patterns: Array + /** Function to pre-process raw HTML before extraction */ pre?: (document: Document) => Document + /** Function to post-process extracted article content */ post?: (document: Document) => Document } -export function addTransformations(transformations: Array): Number; -export function removeTransformations(options: Array): Number; - -export function getSanitizeHtmlOptions(): SanitizeOptions; -export function setSanitizeHtmlOptions(options: SanitizeOptions): void; - /** - * @param input url or html + * Options for the article extraction process. */ - export interface ParserOptions { - /** - * to estimate time to read. 
- * Default: 300 - */ + /** Words per minute for time-to-read estimation. Default: 300 */ wordsPerMinute?: number - /** - * max num of chars generated for description - * Default: 210 - */ + /** Max chars for generated description. Default: 210 */ descriptionTruncateLen?: number - /** - * min num of chars required for description - * Default: 180 - */ + /** Min chars required for description. Default: 180 */ descriptionLengthThreshold?: number - /** - * min num of chars required for content - * Default: 200 - */ + /** Min chars required for content. Default: 200 */ contentLengthThreshold?: number } +/** + * Proxy configuration for fetching articles. + */ export interface ProxyConfig { - target?: string; - headers?: Record<string, string>; + /** Proxy endpoint URL */ + target?: string + /** Headers for proxy request */ + headers?: Record<string, string> } +/** + * Options for the HTTP fetch request. + */ export interface FetchOptions { - /** - * list of request headers - * default: null - */ - headers?: Record<string, string>; - /** - * the values to configure proxy - * default: null - */ - proxy?: ProxyConfig; - - /** - * http proxy agent - * default: null - */ - agent?: object; - /** - * signal to terminate request - * default: null - */ - signal?: object; + /** Custom request headers */ + headers?: Record<string, string> + /** Proxy configuration */ + proxy?: ProxyConfig + /** HTTP proxy agent (e.g. HttpsProxyAgent) */ + agent?: object + /** AbortSignal to cancel the request */ + signal?: object } +/** + * Extracted article data structure. + */ export interface ArticleData { - url?: string; - links?: string[]; - title?: string; - description?: string; - image?: string; - favicon?: string; - author?: string; - content?: string; - source?: string; - published?: string; - ttr?: number; - type?: string; + /** Best resolved URL of the article */ + url?: string + /** Alternative URLs (canonical, shortlink, etc.)
*/ + links?: string[] + /** Article title */ + title?: string + /** Short description or excerpt */ + description?: string + /** Main image URL */ + image?: string + /** Site favicon URL */ + favicon?: string + /** Author name */ + author?: string + /** Extracted article HTML content */ + content?: string + /** Original publisher/source domain */ + source?: string + /** Publication date string */ + published?: string + /** Estimated time to read in seconds (0 = unknown) */ + ttr?: number + /** Page type (e.g. article) */ + type?: string } -export function extract(input: string, parserOptions?: ParserOptions, fetchOptions?: FetchOptions): Promise; +/** + * Register one or more transformations for per-site HTML processing. + * + * @param transformations - Single transformation or array of transformations + * @returns Number of transformations successfully added + */ +export function addTransformations (transformations: Transformation | Array): number + +/** + * Remove transformations matching the given patterns. + * Calling without arguments removes all transformations. + * + * @param patterns - URL patterns to match for removal + * @returns Number of transformations removed + */ +export function removeTransformations (patterns?: Array): number + +/** + * Get a copy of the current sanitize-html options. + */ +export function getSanitizeHtmlOptions (): SanitizeOptions + +/** + * Update sanitize-html options by merging with the current ones. + * + * @param options - Partial sanitize options to merge + */ +export function setSanitizeHtmlOptions (options: SanitizeOptions): void -export function extractFromHtml(html: string, url?: string, parserOptions?: ParserOptions): Promise; +/** + * Load and extract article data from a URL or HTML string. 
+ * + * @param input - URL or HTML string to extract from + * @param parserOptions - Options for parsing + * @param fetchOptions - Options for HTTP fetch + * @returns Extracted article data or null + */ +export function extract ( + input: string, + parserOptions?: ParserOptions, + fetchOptions?: FetchOptions, +): Promise + +/** + * Extract article data from an HTML string directly. + * + * @param html - Raw HTML content + * @param url - Source URL for resolving relative links + * @param parserOptions - Options for parsing + * @returns Extracted article data or null + */ +export function extractFromHtml ( + html: string, + url?: string, + parserOptions?: ParserOptions, +): Promise diff --git a/src/config.js b/src/config.js index f3702fcd..1554625f 100644 --- a/src/config.js +++ b/src/config.js @@ -2,6 +2,12 @@ import { clone } from '@pwshub/bellajs' +/** + * Default sanitize-html options for cleaning extracted article content. + * Defines allowed HTML tags, attributes, and iframe domains. + * + * @type {SanitizeOptions} + */ const sanitizeHtmlOptions = { allowedTags: [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', @@ -10,7 +16,7 @@ const sanitizeHtmlOptions = { 'details', 'summary', 'pre', 'code', 'ul', 'ol', 'li', 'dd', 'dl', - 'table', 'th', 'tr', 'td', 'thead', 'tbody', 'tfood', + 'table', 'th', 'tr', 'td', 'thead', 'tbody', 'tfoot', 'fieldset', 'legend', 'figure', 'figcaption', 'img', 'picture', 'video', 'audio', 'source', @@ -53,12 +59,20 @@ const sanitizeHtmlOptions = { } /** - * @returns {SanitizeOptions} + * Get a clone of the current sanitize-html options. + * + * @returns {SanitizeOptions} Cloned sanitize options */ export const getSanitizeHtmlOptions = () => { return clone(sanitizeHtmlOptions) } +/** + * Update sanitize-html options by merging with the current ones. 
+ * + * @param {SanitizeOptions} [opts={}] - Partial options to merge + * @returns {void} + */ export const setSanitizeHtmlOptions = (opts = {}) => { Object.keys(opts).forEach((key) => { sanitizeHtmlOptions[key] = clone(opts[key]) diff --git a/src/main.js b/src/main.js index 66b045dd..bffaf4a7 100644 --- a/src/main.js +++ b/src/main.js @@ -9,13 +9,21 @@ import parseFromHtml from './utils/parseFromHtml.js' import { getCharset } from './utils/html.js' import { isValid as isValidUrl } from './utils/linker.js' +/** + * Load and extract article data from a URL or HTML string. + * + * @param {string} input - URL or HTML string to extract from + * @param {ParserOptions} [parserOptions={}] - Options for parsing + * @param {FetchOptions} [fetchOptions={}] - Options for HTTP fetch + * @returns {Promise} Extracted article data or null + */ export const extract = async (input, parserOptions = {}, fetchOptions = {}) => { if (!isString(input)) { throw new Error('Input must be a string') } if (!isValidUrl(input)) { - return parseFromHtml(input, null, parserOptions || {}) + return parseFromHtml(input, null, parserOptions) } const buffer = await retrieve(input, fetchOptions) const text = buffer ? Buffer.from(buffer).toString().trim() : '' @@ -25,9 +33,17 @@ export const extract = async (input, parserOptions = {}, fetchOptions = {}) => { const charset = getCharset(text) const decoder = new TextDecoder(charset) const html = decoder.decode(buffer) - return parseFromHtml(html, input, parserOptions || {}) + return parseFromHtml(html, input, parserOptions) } +/** + * Extract article data from an HTML string directly. 
+ * + * @param {string} html - Raw HTML content + * @param {string} [url] - Source URL for resolving relative links + * @param {ParserOptions} [parserOptions={}] - Options for parsing + * @returns {Promise} Extracted article data or null + */ export const extractFromHtml = async (html, url, parserOptions = {}) => { return parseFromHtml(html, url, parserOptions) } diff --git a/src/utils/extractLdSchema.js b/src/utils/extractLdSchema.js index 9d038db6..2f9584b5 100644 --- a/src/utils/extractLdSchema.js +++ b/src/utils/extractLdSchema.js @@ -2,6 +2,11 @@ import { isArray, isObject, isString } from '@pwshub/bellajs' +/** + * Allowed JSON-LD schema types that indicate an article or webpage. + * + * @type {string[]} + */ const typeSchemas = [ 'aboutpage', 'checkoutpage', @@ -31,6 +36,11 @@ const typeSchemas = [ 'medicalscholarlyarticle', ] +/** + * Mapping from entry keys to JSON-LD attribute names. + * + * @type {Object} + */ const attributeLists = { description: 'description', image: 'image', @@ -39,6 +49,12 @@ const attributeLists = { type: '@type', } +/** + * Safely parse a JSON string, returning an empty object on failure. + * + * @param {string} text - JSON string to parse + * @returns {Object} Parsed object or empty object + */ const parseJson = (text) => { try { return JSON.parse(text) @@ -47,6 +63,12 @@ const parseJson = (text) => { } } +/** + * Check if the given JSON-LD object has an allowed schema type. + * + * @param {Object} ldJson - Parsed JSON-LD object + * @returns {boolean} True if type is in the allowed list + */ const isAllowedLdJsonType = (ldJson) => { const rootLdJsonType = ldJson['@type'] || '' const arr = isArray(rootLdJsonType) ? 
rootLdJsonType : [rootLdJsonType] @@ -67,9 +89,9 @@ export default (document, entry) => { ldSchemas.forEach(ldSchema => { const ldJson = parseJson(ldSchema.textContent.replace(/[\n\r\t]/g, '')) if (ldJson && isAllowedLdJsonType(ldJson)) { - Object.entries(attributeLists).forEach(([key, attr]) => { + for (const [key, attr] of Object.entries(attributeLists)) { if (!entry[key] || !ldJson[attr]) { - return + continue } const keyValue = ldJson[attr] @@ -77,7 +99,7 @@ export default (document, entry) => { if (isString(val) && val !== '') { entry[key] = val.trim() } - }) + } } }) diff --git a/src/utils/extractWithReadability.js b/src/utils/extractWithReadability.js index 4575ac6d..56833d9b 100644 --- a/src/utils/extractWithReadability.js +++ b/src/utils/extractWithReadability.js @@ -4,6 +4,13 @@ import { Readability } from '@mozilla/readability' import { DOMParser } from 'linkedom' import { isString } from '@pwshub/bellajs' +/** + * Extract main article content from HTML using Mozilla Readability. + * + * @param {string} html - Raw HTML content + * @param {string} [url=''] - Source URL for resolving relative paths + * @returns {string|null} Extracted article HTML or null + */ export default (html, url = '') => { if (!isString(html)) { return null @@ -19,6 +26,12 @@ export default (html, url = '') => { return result.textContent ? result.content : null } +/** + * Extract article title from HTML using Mozilla Readability. 
+ * + * @param {string} html - Raw HTML content + * @returns {string|null} Extracted title or null + */ export function extractTitleWithReadability (html) { if (!isString(html)) { return null diff --git a/src/utils/findDate.js b/src/utils/findDate.js index 3a666e02..242eaa68 100644 --- a/src/utils/findDate.js +++ b/src/utils/findDate.js @@ -11,7 +11,9 @@ function convertDateFormat (dateString) { let year, month, day - if (parseInt(parts[0]) > 12) { + if (parts[0].length === 4 || parseInt(parts[0]) > 31) { + [year, month, day] = parts + } else if (parseInt(parts[0]) > 12) { [day, month, year] = parts } else { [month, day, year] = parts diff --git a/src/utils/html.js b/src/utils/html.js index fe87f90e..33bbaa6f 100644 --- a/src/utils/html.js +++ b/src/utils/html.js @@ -6,6 +6,13 @@ import { pipe } from '@pwshub/bellajs' import { getSanitizeHtmlOptions } from '../config.js' +/** + * Lightweight HTML sanitization that fixes structural issues + * without stripping any tags or attributes. + * + * @param {string} html - Raw HTML input + * @returns {string} Sanitized HTML (all tags/attributes preserved) + */ export const purify = (html) => { return sanitize(html, { allowedTags: false, @@ -14,8 +21,20 @@ export const purify = (html) => { }) } +/** + * Regex matching strings that consist entirely of whitespace characters. + * + * @type {RegExp} + */ const WS_REGEXP = /^[\s\f\n\r\t\u1680\u180e\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000\ufeff\x09\x0a\x0b\x0c\x0d\x20\xa0]+$/ // eslint-disable-line +/** + * Collapse multiple consecutive line breaks into single newlines, + * and remove lines that are entirely whitespace. + * + * @param {string} str - Input string + * @returns {string} Cleaned string + */ const stripMultiLinebreaks = (str) => { return str.replace(/(\r\n|\n|\u2424){2,}/g, '\n').split('\n').map((line) => { return WS_REGEXP.test(line) ? 
line.trim() : line @@ -24,10 +43,22 @@ const stripMultiLinebreaks = (str) => { }).join('\n') } +/** + * Replace all-whitespace sequences with a single space. + * + * @param {string} str - Input string + * @returns {string} Cleaned string + */ const stripMultispaces = (str) => { return str.replace(WS_REGEXP, ' ').trim() } +/** + * Detect HTML character encoding from meta tags. + * + * @param {string} html - Raw HTML content + * @returns {string} Charset name (defaults to 'utf8') + */ export const getCharset = (html) => { const doc = new DOMParser().parseFromString(html, 'text/html') const m = doc.querySelector('meta[charset]') || null @@ -39,6 +70,13 @@ export const getCharset = (html) => { return charset?.toLowerCase() || 'utf8' } +/** + * Final cleanup of extracted article content: + * sanitize to allowed tags, collapse whitespace. + * + * @param {string} inputHtml - Extracted article HTML + * @returns {string} Cleaned HTML string + */ export const cleanify = (inputHtml) => { const doc = new DOMParser().parseFromString(inputHtml, 'text/html') const html = doc.documentElement.innerHTML @@ -49,6 +87,12 @@ export const cleanify = (inputHtml) => { )(html) } +/** + * Count the number of img tags in HTML content. + * + * @param {string} html - HTML content + * @returns {number} Number of img elements + */ export const countImages = (html) => { const doc = new DOMParser().parseFromString(html, 'text/html') const imgTags = doc.querySelectorAll('img') || [] diff --git a/src/utils/linker.js b/src/utils/linker.js index 3c1a70f0..59ac04fa 100644 --- a/src/utils/linker.js +++ b/src/utils/linker.js @@ -4,6 +4,12 @@ import { DOMParser } from 'linkedom' import { findBestMatch } from './similarity.js' +/** + * Check if a string is a valid HTTP or HTTPS URL. 
+ * + * @param {string} [url=''] - URL string to validate + * @returns {boolean} True if valid HTTP(S) URL + */ export const isValid = (url = '') => { try { const ourl = new URL(url) @@ -13,11 +19,25 @@ export const isValid = (url = '') => { } } +/** + * Pick the URL that best matches the article title using string similarity. + * + * @param {string[]} [candidates=[]] - Candidate URLs + * @param {string} [title=''] - Article title for comparison + * @returns {string} Best matching URL + */ export const chooseBestUrl = (candidates = [], title = '') => { const ranking = findBestMatch(title, candidates) return ranking.bestMatch.target } +/** + * Resolve a relative URL against a base URL. + * + * @param {string} [fullUrl=''] - Base URL + * @param {string} [relativeUrl=''] - Relative URL to resolve + * @returns {string} Absolute URL or empty string on failure + */ export const absolutify = (fullUrl = '', relativeUrl = '') => { try { const result = new URL(relativeUrl, fullUrl) @@ -27,6 +47,11 @@ export const absolutify = (fullUrl = '', relativeUrl = '') => { } } +/** + * Tracking and analytics query parameters to strip from URLs. + * + * @type {string[]} + */ const blacklistKeys = [ 'CNDID', '__twitter_impression', @@ -87,6 +112,12 @@ const blacklistKeys = [ 'pk_campaign', ] +/** + * Remove tracking parameters and hash fragment from a URL. + * + * @param {string} url - URL to clean + * @returns {string|null} Cleaned URL or null if invalid + */ export const purify = (url) => { try { const pureUrl = new URL(url) @@ -106,6 +137,14 @@ export const purify = (url) => { * @param url {string} * @returns article {string} */ +/** + * Normalize all links, images, and source elements in HTML + * by resolving relative URLs to absolute and adding target=_blank to links. 
+ * + * @param {string} html - HTML content to normalize + * @param {string} url - Base URL for resolving relative paths + * @returns {string} Normalized HTML string + */ export const normalize = (html, url) => { const doc = new DOMParser().parseFromString(html, 'text/html') @@ -124,9 +163,22 @@ export const normalize = (html, url) => { } }) + Array.from(doc.getElementsByTagName('source')).forEach((element) => { + const src = element.getAttribute('src') + if (src) { + element.setAttribute('src', absolutify(url, src)) + } + }) + return Array.from(doc.childNodes).map(element => element.outerHTML).join('') } +/** + * Extract the domain from a URL, stripping the www. prefix. + * + * @param {string} url - Full URL + * @returns {string} Domain name + */ export const getDomain = (url) => { const host = (new URL(url)).host return host.replace('www.', '') diff --git a/src/utils/parseFromHtml.js b/src/utils/parseFromHtml.js index 0c6a493d..2df173b2 100644 --- a/src/utils/parseFromHtml.js +++ b/src/utils/parseFromHtml.js @@ -31,12 +31,32 @@ import extractWithReadability, { import { execPreParser, execPostParser } from './transformation.js' +/** + * Build article description from meta description or text content. + * + * @param {Object} params + * @param {string} params.desc - Meta description + * @param {string} params.text - Stripped text content + * @param {number} params.threshold - Min length to use meta description + * @param {number} params.maxlen - Max chars for truncated description + * @returns {string} Final description string + */ const summarize = ({ desc, text, threshold, maxlen }) => { return desc.length > threshold ? desc : truncateByChar(text, maxlen).replace(/\n/g, ' ') } +/** + * Parse HTML content and extract article data. + * Orchestrates metadata extraction, URL normalization, transformations, + * Readability extraction, and content sanitization. 
+ * + * @param {string} inputHtml - Raw HTML content + * @param {string} [inputUrl=''] - Source URL for resolving relative links + * @param {ParserOptions} [parserOptions={}] - Parsing options + * @returns {Promise} Extracted article data or null + */ export default async (inputHtml, inputUrl = '', parserOptions = {}) => { const pureHtml = purify(inputHtml) const meta = extractMetaData(pureHtml) diff --git a/src/utils/retrieve.js b/src/utils/retrieve.js index 18fbd2bc..06f1e596 100644 --- a/src/utils/retrieve.js +++ b/src/utils/retrieve.js @@ -1,5 +1,16 @@ // utils -> retrieve +/** + * Fetch content through a proxy endpoint. + * + * @param {string} url - Target URL to fetch + * @param {Object} [options={}] - Proxy options + * @param {Object} [options.proxy={}] - Proxy configuration + * @param {string} options.proxy.target - Proxy endpoint URL + * @param {Object} [options.proxy.headers={}] - Headers for proxy request + * @param {AbortSignal} [options.signal] - Optional abort signal + * @returns {Promise} Fetch response object + */ const profetch = async (url, options = {}) => { const { proxy = {}, signal = null } = options const { @@ -13,6 +24,14 @@ const profetch = async (url, options = {}) => { return res } +/** + * Retrieve raw HTML content from a URL. + * Supports direct fetch, proxy, custom headers, agent, and abort signal. + * + * @param {string} url - URL to fetch + * @param {FetchOptions} [options={}] - Fetch configuration + * @returns {Promise} Response body as ArrayBuffer + */ export default async (url, options = {}) => { const { headers = { diff --git a/src/utils/similarity.js b/src/utils/similarity.js index 39d4cef5..4201ca91 100644 --- a/src/utils/similarity.js +++ b/src/utils/similarity.js @@ -6,11 +6,25 @@ import { isArray } from '@pwshub/bellajs' +/** + * Validate arguments for findBestMatch. 
+ * + * @param {string} mainString - Reference string + * @param {string[]} targetStrings - Strings to compare against + * @returns {boolean} True if arguments are valid + */ const areArgsValid = (mainString, targetStrings) => { return isString(mainString) && isArray(targetStrings) && targetStrings.length > 0 && targetStrings.every(s => isString(s)) } +/** + * Find the best matching string from a list using Dice coefficient. + * + * @param {string} mainString - Reference string to match against + * @param {string[]} targetStrings - Candidate strings + * @returns {{ratings: Array, bestMatch: {target: string, rating: number}, bestMatchIndex: number}} Match results with rankings + */ export const findBestMatch = (mainString, targetStrings) => { if (!areArgsValid(mainString, targetStrings)) { throw new Error('Bad arguments: First argument should be a string, second should be an array of strings') diff --git a/src/utils/transformation.js b/src/utils/transformation.js index ea091389..06aff829 100644 --- a/src/utils/transformation.js +++ b/src/utils/transformation.js @@ -3,8 +3,19 @@ import { isArray, isFunction } from '@pwshub/bellajs' import { DOMParser } from 'linkedom' +/** + * Registered transformation rules for per-site HTML pre/post processing. + * + * @type {Transformation[]} + */ const transformations = [] +/** + * Add a single transformation to the registry. + * + * @param {Transformation} tn - Transformation object with patterns and handlers + * @returns {number} 1 if added, 0 if invalid + */ const add = (tn) => { const { patterns } = tn if (!patterns || !isArray(patterns) || !patterns.length) { @@ -14,6 +25,12 @@ const add = (tn) => { return 1 } +/** + * Register one or more transformations for per-site HTML processing. 
+ * + * @param {Transformation|Transformation[]} tfms - Transformation(s) to add + * @returns {number} Number of transformations successfully added + */ export const addTransformations = (tfms) => { if (isArray(tfms)) { return tfms.map(tfm => add(tfm)).filter(result => result === 1).length @@ -21,6 +38,13 @@ export const addTransformations = (tfms) => { return add(tfms) } +/** + * Remove transformations matching the given patterns. + * Calling without arguments removes all transformations. + * + * @param {RegExp[]} [patterns] - URL patterns to match for removal + * @returns {number} Number of transformations removed + */ export const removeTransformations = (patterns) => { if (!patterns) { const removed = transformations.length @@ -28,7 +52,7 @@ export const removeTransformations = (patterns) => { return removed } let removing = 0 - for (let i = transformations.length - 1; i > 0; i--) { + for (let i = transformations.length - 1; i >= 0; i--) { const { patterns: ipatterns } = transformations[i] const matched = ipatterns.some((ptn) => patterns.some((pattern) => String(pattern) === String(ptn))) if (matched) { @@ -39,10 +63,21 @@ export const removeTransformations = (patterns) => { return removing } +/** + * Get a copy of all registered transformations. + * + * @returns {Transformation[]} Copy of transformations array + */ export const getTransformations = () => { return [...transformations] } +/** + * Find all transformations whose patterns match any of the given URLs. + * + * @param {string|string[]} links - URL(s) to match against transformation patterns + * @returns {Transformation[]} Matching transformations + */ export const findTransformations = (links) => { const urls = !isArray(links) ? [links] : links const tfms = [] @@ -58,14 +93,30 @@ export const findTransformations = (links) => { return tfms } +/** + * Run pre-extraction transformations on raw HTML. + * Mutates the DOM in place through registered pre-processor functions. 
+ * + * @param {string} html - Raw HTML content + * @param {string[]} links - URLs to match against transformation patterns + * @returns {string} Transformed HTML string + */ export const execPreParser = (html, links) => { const doc = new DOMParser().parseFromString(html, 'text/html') - findTransformations(links).map(tfm => tfm.pre).filter(fn => isFunction(fn)).map(fn => fn(doc)) + findTransformations(links).map(tfm => tfm.pre).filter(fn => isFunction(fn)).forEach(fn => fn(doc)) return Array.from(doc.childNodes).map(it => it.outerHTML).join('') } +/** + * Run post-extraction transformations on extracted article content. + * Mutates the DOM in place through registered post-processor functions. + * + * @param {string} html - Extracted article HTML + * @param {string[]} links - URLs to match against transformation patterns + * @returns {string} Transformed HTML string + */ export const execPostParser = (html, links) => { const doc = new DOMParser().parseFromString(html, 'text/html') - findTransformations(links).map(tfm => tfm.post).filter(fn => isFunction(fn)).map(fn => fn(doc)) + findTransformations(links).map(tfm => tfm.post).filter(fn => isFunction(fn)).forEach(fn => fn(doc)) return Array.from(doc.childNodes).map(it => it.outerHTML).join('') } From 6c89e294b6d258510df2f374b1d04ac84d69ee2e Mon Sep 17 00:00:00 2001 From: Dong Nguyen Date: Sun, 3 May 2026 19:46:22 +0700 Subject: [PATCH 4/6] Update README & exports --- README.md | 14 ++++++++++---- package.json | 7 +++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 746693fa..70020954 100644 --- a/README.md +++ b/README.md @@ -10,16 +10,22 @@ Extract main article, main image and meta data from URL. 
## Demo -- [Give it a try!](https://extractus-demo.vercel.app/article) +- [Give it a try!](https://extractus.pwshub.com/article) ## Install ```bash -# npm, pnpm, yarn -npm i @extractus/article-extractor - # bun bun add @extractus/article-extractor + +# npm +npm i @extractus/article-extractor + +# pnpm +pnpm install @extractus/article-extractor + +# yarn +yarn add @extractus/article-extractor ``` ## Usage diff --git a/package.json b/package.json index 785c7a51..ac2dfe32 100644 --- a/package.json +++ b/package.json @@ -10,6 +10,13 @@ "author": "@extractus", "main": "./src/main.js", "type": "module", + "exports": { + ".": { + "types": "./index.d.ts", + "import": "./src/main.js", + "default": "./src/main.js" + } + }, "types": "./index.d.ts", "engines": { "node": ">= 20" From 9ad2f28d883443d16d6e9c738051f31e50cc7e91 Mon Sep 17 00:00:00 2001 From: Dong Nguyen Date: Sun, 3 May 2026 19:51:47 +0700 Subject: [PATCH 5/6] Update exporting --- package.json | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/package.json b/package.json index ac2dfe32..14fb9893 100644 --- a/package.json +++ b/package.json @@ -33,7 +33,7 @@ "@mozilla/readability": "^0.6.0", "@pwshub/bellajs": "^13.0.2", "linkedom": "^0.18.12", - "sanitize-html": "2.17.3" + "sanitize-html": "^2.17.3" }, "devDependencies": { "@eslint/js": "^10.0.1", @@ -43,6 +43,11 @@ "https-proxy-agent": "^9.0.0", "nock": "^14.0.14" }, + "files": [ + "src", + "index.d.ts" + ], + "sideEffects": false, "keywords": [ "article", "extractor", From d4c53099cd90d86719f3facb4ffe2e226d07b6c1 Mon Sep 17 00:00:00 2001 From: Dong Nguyen Date: Sun, 3 May 2026 19:53:11 +0700 Subject: [PATCH 6/6] Update CI config --- .github/workflows/ci-test.yml | 12 +++++------- .github/workflows/codeql-analysis.yml | 2 +- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci-test.yml b/.github/workflows/ci-test.yml index 60c2913d..d6c6a5fa 100644 --- a/.github/workflows/ci-test.yml +++ 
b/.github/workflows/ci-test.yml @@ -12,27 +12,25 @@ jobs: strategy: matrix: - node_version: [20.x, 22.x, 24.x] + node_version: [22.x, 24.x, 25.x] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: setup Node.js v${{ matrix.node_version }} - uses: actions/setup-node@v4 + uses: actions/setup-node@v6 with: node-version: ${{ matrix.node_version }} - name: run npm scripts - env: - PROXY_SERVER: ${{ secrets.PROXY_SERVER }} run: | npm install npm run lint - npm run build --if-present + #npm run build --if-present npm run test - name: cache node modules - uses: actions/cache@v4 + uses: actions/cache@v5 with: path: ~/.npm key: ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') }} diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index a77d776a..5547051d 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -38,7 +38,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@v6 # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL