From e830a1ad6dad06cc31b56abfab1ac63a174c3794 Mon Sep 17 00:00:00 2001 From: FrancescoSaverioZuppichini Date: Tue, 17 Feb 2026 15:39:23 +0100 Subject: [PATCH 1/3] feat: replace internal API layer with scrapegraph-js SDK Remove zod, schemas, types, and the hand-rolled API client in favor of the official scrapegraph-js SDK. All commands now import directly from the package. env.ts bridges JUST_SCRAPE_* vars to SGAI_* so the SDK picks up CLI-specific config. Co-Authored-By: Claude Opus 4.6 --- README.md | 18 +- bun.lock | 6 +- package.json | 2 +- src/cli.ts | 1 + src/commands/agentic-scraper.ts | 7 +- src/commands/crawl.ts | 25 ++- src/commands/credits.ts | 2 +- src/commands/generate-schema.ts | 2 +- src/commands/history.ts | 4 +- src/commands/markdownify.ts | 2 +- src/commands/scrape.ts | 2 +- src/commands/search-scraper.ts | 2 +- src/commands/sitemap.ts | 2 +- src/commands/smart-scraper.ts | 2 +- src/commands/validate.ts | 2 +- src/lib/env.ts | 28 ++- src/lib/schemas.ts | 90 -------- src/lib/scrapegraphai.ts | 357 ------------------------------ src/types/index.ts | 125 ----------- tests/scrapegraphai.test.ts | 372 -------------------------------- tests/smoke.test.ts | 7 + 21 files changed, 55 insertions(+), 1003 deletions(-) delete mode 100644 src/lib/schemas.ts delete mode 100644 src/lib/scrapegraphai.ts delete mode 100644 src/types/index.ts delete mode 100644 tests/scrapegraphai.test.ts create mode 100644 tests/smoke.test.ts diff --git a/README.md b/README.md index f0a100f..7af3988 100644 --- a/README.md +++ b/README.md @@ -24,13 +24,9 @@ just-scrape/ ├── src/ │ ├── cli.ts # Entry point, citty main command + subcommands │ ├── lib/ -│ │ ├── env.ts # Zod-parsed env config (API key, debug, timeout) +│ │ ├── env.ts # Env config (API key, JUST_SCRAPE_* → SGAI_* bridge) │ │ ├── folders.ts # API key resolution + interactive prompt -│ │ ├── scrapegraphai.ts # SDK layer — all API functions (typed responses) -│ │ ├── schemas.ts # Zod validation schemas │ │ └── log.ts # Logger factory + syntax-highlighted JSON output -│ ├── types/ -│ │ └── index.ts # Zod-derived types + ApiResult + response types │ ├── commands/ │ │ ├── smart-scraper.ts │ │ ├── search-scraper.ts @@ -45,8 +41,6 @@ just-scrape/ │ │ └── validate.ts │ └── utils/ │ └── banner.ts # ASCII banner + version from package.json -├── tests/ -│ └── scrapegraphai.test.ts # SDK layer tests (mocked fetch) ├── dist/ # Build output (git-ignored) │ └── cli.mjs # Bundled ESM with shebang ├── package.json @@ -396,10 +390,9 @@ bun run dev --help | CLI Framework | **citty** (unjs) | | Prompts | **@clack/prompts** | | Styling | **chalk** v5 (ESM) | -| Validation | **zod** v4 | +| SDK | **scrapegraph-js** | | Env | **dotenv** | | Lint / Format | **Biome** | -| Testing | **Bun test** (built-in) | | Target | **Node.js 22+**, ESM-only | ### Scripts @@ -409,16 +402,9 @@ bun run dev # Run CLI from TS source bun run build # Bundle ESM to dist/cli.mjs bun run lint # Lint + format check bun run format # Auto-format -bun test # Run tests bun run check # Type-check + lint ``` -### Testing - -Tests mock all API calls via `spyOn(globalThis, "fetch")` — no network, no API key needed. - -Covers: success paths, polling, HTTP error mapping (401/402/422/429/500), Zod validation, timeouts, and network failures. 
- ## License ISC diff --git a/bun.lock b/bun.lock index 6c355ec..5a7bd89 100644 --- a/bun.lock +++ b/bun.lock @@ -9,7 +9,7 @@ "chalk": "^5.4.1", "citty": "^0.1.6", "dotenv": "^17.2.4", - "zod": "^4.3.6", + "scrapegraph-js": "^1.0.0", }, "devDependencies": { "@biomejs/biome": "^1.9.4", @@ -229,6 +229,8 @@ "rollup": ["rollup@4.57.1", "", { "dependencies": { "@types/estree": "1.0.8" }, "optionalDependencies": { "@rollup/rollup-android-arm-eabi": "4.57.1", "@rollup/rollup-android-arm64": "4.57.1", "@rollup/rollup-darwin-arm64": "4.57.1", "@rollup/rollup-darwin-x64": "4.57.1", "@rollup/rollup-freebsd-arm64": "4.57.1", "@rollup/rollup-freebsd-x64": "4.57.1", "@rollup/rollup-linux-arm-gnueabihf": "4.57.1", "@rollup/rollup-linux-arm-musleabihf": "4.57.1", "@rollup/rollup-linux-arm64-gnu": "4.57.1", "@rollup/rollup-linux-arm64-musl": "4.57.1", "@rollup/rollup-linux-loong64-gnu": "4.57.1", "@rollup/rollup-linux-loong64-musl": "4.57.1", "@rollup/rollup-linux-ppc64-gnu": "4.57.1", "@rollup/rollup-linux-ppc64-musl": "4.57.1", "@rollup/rollup-linux-riscv64-gnu": "4.57.1", "@rollup/rollup-linux-riscv64-musl": "4.57.1", "@rollup/rollup-linux-s390x-gnu": "4.57.1", "@rollup/rollup-linux-x64-gnu": "4.57.1", "@rollup/rollup-linux-x64-musl": "4.57.1", "@rollup/rollup-openbsd-x64": "4.57.1", "@rollup/rollup-openharmony-arm64": "4.57.1", "@rollup/rollup-win32-arm64-msvc": "4.57.1", "@rollup/rollup-win32-ia32-msvc": "4.57.1", "@rollup/rollup-win32-x64-gnu": "4.57.1", "@rollup/rollup-win32-x64-msvc": "4.57.1", "fsevents": "~2.3.2" }, "bin": { "rollup": "dist/bin/rollup" } }, "sha512-oQL6lgK3e2QZeQ7gcgIkS2YZPg5slw37hYufJ3edKlfQSGGm8ICoxswK15ntSzF/a8+h7ekRy7k7oWc3BQ7y8A=="], + "scrapegraph-js": ["scrapegraph-js@1.0.0", "", {}, "sha512-eQn8/HRfJHjCoj2yia5yHWQTYUae/bYNhLEx00ZXF+GLKpgUJT0OCGUQM13WGSX5cgw9onz5EiaDJDbzcbeYtQ=="], + "sisteransi": ["sisteransi@1.0.5", "", {}, "sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg=="], "source-map": ["source-map@0.7.6", "", {}, "sha512-i5uvt8C3ikiWeNZSVZNWcfZPItFQOsYTUAOkcUPGd8DqDy1uOUikjt5dG+uRlwyvR108Fb9DOd4GvXfT0N2/uQ=="], @@ -254,7 +256,5 @@ "ufo": ["ufo@1.6.3", "", {}, "sha512-yDJTmhydvl5lJzBmy/hyOAA0d+aqCBuwl818haVdYCRrWV84o7YyeVm4QlVHStqNrrJSTb6jKuFAVqAFsr+K3Q=="], "undici-types": ["undici-types@6.21.0", "", {}, "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ=="], - - "zod": ["zod@4.3.6", "", {}, "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg=="], } } diff --git a/package.json b/package.json index de59a49..e2ac14e 100644 --- a/package.json +++ b/package.json @@ -28,7 +28,7 @@ "chalk": "^5.4.1", "citty": "^0.1.6", "dotenv": "^17.2.4", - "zod": "^4.3.6" + "scrapegraph-js": "^1.0.0" }, "devDependencies": { "@biomejs/biome": "^1.9.4", diff --git a/src/cli.ts b/src/cli.ts index 0ff7f84..483a94c 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -1,4 +1,5 @@ import "dotenv/config"; +import "./lib/env.js"; import { defineCommand, runMain } from "citty"; import { getVersion, showBanner } from "./utils/banner.js"; diff --git a/src/commands/agentic-scraper.ts b/src/commands/agentic-scraper.ts index ce1a146..67e9b5b 100644 --- a/src/commands/agentic-scraper.ts +++ b/src/commands/agentic-scraper.ts @@ -1,7 +1,7 @@ import { defineCommand } from "citty"; +import * as scrapegraphai from "scrapegraph-js"; import { resolveApiKey } from "../lib/folders.js"; import * as log from "../lib/log.js"; -import * as scrapegraphai from "../lib/scrapegraphai.js"; export 
default defineCommand({
   meta: {
@@ -34,9 +34,8 @@ export default defineCommand({
     out.docs("https://docs.scrapegraphai.com/services/agenticscraper");
 
     const key = await resolveApiKey(!!args.json);
-    const params: scrapegraphai.AgenticScraperParams = { url: args.url };
-
-    if (args.steps) params.steps = args.steps.split(",").map((s) => s.trim());
+    const steps = args.steps ? args.steps.split(",").map((s) => s.trim()) : [];
+    const params: scrapegraphai.AgenticScraperParams = { url: args.url, steps };
     if (args.prompt) params.user_prompt = args.prompt;
     if (args.schema) params.output_schema = JSON.parse(args.schema);
     if (args["ai-extraction"]) params.ai_extraction = true;
diff --git a/src/commands/crawl.ts b/src/commands/crawl.ts
index 924ab0e..d55101d 100644
--- a/src/commands/crawl.ts
+++ b/src/commands/crawl.ts
@@ -1,7 +1,7 @@
 import { defineCommand } from "citty";
+import * as scrapegraphai from "scrapegraph-js";
 import { resolveApiKey } from "../lib/folders.js";
 import * as log from "../lib/log.js";
-import * as scrapegraphai from "../lib/scrapegraphai.js";
 
 export default defineCommand({
   meta: {
@@ -36,16 +36,21 @@ export default defineCommand({
     out.docs("https://docs.scrapegraphai.com/services/smartcrawler");
 
     const key = await resolveApiKey(!!args.json);
-    const params: scrapegraphai.CrawlParams = { url: args.url };
+    const base: Record<string, unknown> = { url: args.url };
+    if (args["max-pages"]) base.max_pages = Number(args["max-pages"]);
+    if (args.depth) base.depth = Number(args.depth);
+    if (args.rules) base.rules = JSON.parse(args.rules);
+    if (args["no-sitemap"]) base.sitemap = false;
+    if (args.stealth) base.stealth = true;
 
-    if (args.prompt) params.prompt = args.prompt;
-    if (args["no-extraction"]) params.extraction_mode = false;
-    if (args["max-pages"]) params.max_pages = Number(args["max-pages"]);
-    if (args.depth) params.depth = Number(args.depth);
-    if (args.schema) params.schema = JSON.parse(args.schema);
-    if (args.rules) params.rules = JSON.parse(args.rules);
-    if (args["no-sitemap"]) params.sitemap = false;
-    if (args.stealth) params.stealth = true;
+    if (args["no-extraction"]) {
+      base.extraction_mode = false;
+    } else {
+      if (args.prompt) base.prompt = args.prompt;
+      if (args.schema) base.schema = JSON.parse(args.schema);
+    }
+
+    const params = base as scrapegraphai.CrawlParams;
 
     out.start("Crawling");
     const result = await scrapegraphai.crawl(key, params, out.poll);
diff --git a/src/commands/credits.ts b/src/commands/credits.ts
index 720d856..0d7b75f 100644
--- a/src/commands/credits.ts
+++ b/src/commands/credits.ts
@@ -1,7 +1,7 @@
 import { defineCommand } from "citty";
+import * as scrapegraphai from "scrapegraph-js";
 import { resolveApiKey } from "../lib/folders.js";
 import * as log from "../lib/log.js";
-import * as scrapegraphai from "../lib/scrapegraphai.js";
 
 export default defineCommand({
   meta: {
diff --git a/src/commands/generate-schema.ts b/src/commands/generate-schema.ts
index b24a937..8d77e57 100644
--- a/src/commands/generate-schema.ts
+++ b/src/commands/generate-schema.ts
@@ -1,7 +1,7 @@
 import { defineCommand } from "citty";
+import * as scrapegraphai from "scrapegraph-js";
 import { resolveApiKey } from "../lib/folders.js";
 import * as log from "../lib/log.js";
-import * as scrapegraphai from "../lib/scrapegraphai.js";
 
 export default defineCommand({
   meta: {
diff --git a/src/commands/history.ts b/src/commands/history.ts
index 80a95da..99ab59e 100644
--- a/src/commands/history.ts
+++ b/src/commands/history.ts
@@ -1,10 +1,10 @@
 import * as p from "@clack/prompts";
 import chalk 
from "chalk"; import { defineCommand } from "citty"; +import { HISTORY_SERVICES } from "scrapegraph-js"; +import * as scrapegraphai from "scrapegraph-js"; import { resolveApiKey } from "../lib/folders.js"; import * as log from "../lib/log.js"; -import { HISTORY_SERVICES } from "../lib/schemas.js"; -import * as scrapegraphai from "../lib/scrapegraphai.js"; const VALID = HISTORY_SERVICES.join(", "); const LOAD_MORE = "__load_more__"; diff --git a/src/commands/markdownify.ts b/src/commands/markdownify.ts index 40e62c7..ccfc494 100644 --- a/src/commands/markdownify.ts +++ b/src/commands/markdownify.ts @@ -1,7 +1,7 @@ import { defineCommand } from "citty"; +import * as scrapegraphai from "scrapegraph-js"; import { resolveApiKey } from "../lib/folders.js"; import * as log from "../lib/log.js"; -import * as scrapegraphai from "../lib/scrapegraphai.js"; export default defineCommand({ meta: { diff --git a/src/commands/scrape.ts b/src/commands/scrape.ts index d57112a..b0517eb 100644 --- a/src/commands/scrape.ts +++ b/src/commands/scrape.ts @@ -1,7 +1,7 @@ import { defineCommand } from "citty"; +import * as scrapegraphai from "scrapegraph-js"; import { resolveApiKey } from "../lib/folders.js"; import * as log from "../lib/log.js"; -import * as scrapegraphai from "../lib/scrapegraphai.js"; export default defineCommand({ meta: { diff --git a/src/commands/search-scraper.ts b/src/commands/search-scraper.ts index 46f339e..041e32c 100644 --- a/src/commands/search-scraper.ts +++ b/src/commands/search-scraper.ts @@ -1,7 +1,7 @@ import { defineCommand } from "citty"; +import * as scrapegraphai from "scrapegraph-js"; import { resolveApiKey } from "../lib/folders.js"; import * as log from "../lib/log.js"; -import * as scrapegraphai from "../lib/scrapegraphai.js"; export default defineCommand({ meta: { diff --git a/src/commands/sitemap.ts b/src/commands/sitemap.ts index 8e1b170..2120b16 100644 --- a/src/commands/sitemap.ts +++ b/src/commands/sitemap.ts @@ -1,7 +1,7 @@ import { defineCommand } from "citty"; +import * as scrapegraphai from "scrapegraph-js"; import { resolveApiKey } from "../lib/folders.js"; import * as log from "../lib/log.js"; -import * as scrapegraphai from "../lib/scrapegraphai.js"; export default defineCommand({ meta: { diff --git a/src/commands/smart-scraper.ts b/src/commands/smart-scraper.ts index 40d3207..be3d2a4 100644 --- a/src/commands/smart-scraper.ts +++ b/src/commands/smart-scraper.ts @@ -1,7 +1,7 @@ import { defineCommand } from "citty"; +import * as scrapegraphai from "scrapegraph-js"; import { resolveApiKey } from "../lib/folders.js"; import * as log from "../lib/log.js"; -import * as scrapegraphai from "../lib/scrapegraphai.js"; export default defineCommand({ meta: { diff --git a/src/commands/validate.ts b/src/commands/validate.ts index db956d7..dd2c81d 100644 --- a/src/commands/validate.ts +++ b/src/commands/validate.ts @@ -1,7 +1,7 @@ import { defineCommand } from "citty"; +import * as scrapegraphai from "scrapegraph-js"; import { resolveApiKey } from "../lib/folders.js"; import * as log from "../lib/log.js"; -import * as scrapegraphai from "../lib/scrapegraphai.js"; export default defineCommand({ meta: { diff --git a/src/lib/env.ts b/src/lib/env.ts index e0e1f5c..8777ab0 100644 --- a/src/lib/env.ts +++ b/src/lib/env.ts @@ -1,11 +1,18 @@ import { existsSync, readFileSync } from "node:fs"; import { homedir } from "node:os"; import { join } from "node:path"; -import { z } from "zod"; export const CONFIG_DIR = join(homedir(), ".scrapegraphai"); export const CONFIG_PATH = 
join(CONFIG_DIR, "config.json"); +if (process.env.JUST_SCRAPE_API_URL && !process.env.SGAI_API_URL) + process.env.SGAI_API_URL = process.env.JUST_SCRAPE_API_URL; + +if (process.env.JUST_SCRAPE_DEBUG === "1" && !process.env.SGAI_DEBUG) process.env.SGAI_DEBUG = "1"; + +if (process.env.JUST_SCRAPE_TIMEOUT_S && !process.env.SGAI_TIMEOUT_S) + process.env.SGAI_TIMEOUT_S = process.env.JUST_SCRAPE_TIMEOUT_S; + function loadConfigFile(): Record { if (!existsSync(CONFIG_PATH)) return {}; try { @@ -15,24 +22,15 @@ function loadConfigFile(): Record { } } -const EnvSchema = z.object({ - apiKey: z.string().optional(), - debug: z.boolean().default(false), - timeoutS: z.number().int().positive().default(120), -}); - -export type Env = z.infer; +export type Env = { + apiKey?: string; +}; function resolve(): Env { const config = loadConfigFile(); - - return EnvSchema.parse({ + return { apiKey: process.env.SGAI_API_KEY || (config["api-key"] as string) || undefined, - debug: process.env.JUST_SCRAPE_DEBUG === "1", - timeoutS: process.env.JUST_SCRAPE_TIMEOUT_S - ? Number(process.env.JUST_SCRAPE_TIMEOUT_S) - : 60 * 2, - }); + }; } export const env = resolve(); diff --git a/src/lib/schemas.ts b/src/lib/schemas.ts deleted file mode 100644 index 7d8c585..0000000 --- a/src/lib/schemas.ts +++ /dev/null @@ -1,90 +0,0 @@ -import { z } from "zod"; - -const jsonObject = z.record(z.string(), z.unknown()); -const jsonStringObject = z.record(z.string(), z.string()); - -export const SmartScraperSchema = z.object({ - website_url: z.string().url().optional(), - website_html: z.string().optional(), - website_markdown: z.string().optional(), - user_prompt: z.string().min(1), - output_schema: jsonObject.optional(), - number_of_scrolls: z.number().int().min(0).max(100).optional(), - total_pages: z.number().int().min(1).max(100).optional(), - stealth: z.boolean().optional(), - cookies: jsonStringObject.optional(), - headers: jsonStringObject.optional(), - plain_text: z.boolean().optional(), - webhook_url: z.string().url().optional(), -}); - -export const SearchScraperSchema = z.object({ - user_prompt: z.string().min(1), - num_results: z.number().int().min(3).max(20).optional(), - extraction_mode: z.boolean().optional(), - output_schema: jsonObject.optional(), - stealth: z.boolean().optional(), - headers: jsonStringObject.optional(), - webhook_url: z.string().url().optional(), -}); - -export const MarkdownifySchema = z.object({ - website_url: z.string().url(), - stealth: z.boolean().optional(), - headers: jsonStringObject.optional(), - webhook_url: z.string().url().optional(), -}); - -export const CrawlSchema = z.object({ - url: z.string().url(), - prompt: z.string().optional(), - extraction_mode: z.boolean().optional(), - max_pages: z.number().int().positive().optional(), - depth: z.number().int().positive().optional(), - schema: jsonObject.optional(), - rules: jsonObject.optional(), - sitemap: z.boolean().optional(), - stealth: z.boolean().optional(), - webhook_url: z.string().url().optional(), -}); - -export const GenerateSchemaSchema = z.object({ - user_prompt: z.string().min(1), - existing_schema: jsonObject.optional(), -}); - -export const SitemapSchema = z.object({ - website_url: z.string().url(), -}); - -export const ScrapeSchema = z.object({ - website_url: z.string().url(), - stealth: z.boolean().optional(), - branding: z.boolean().optional(), - country_code: z.string().optional(), -}); - -export const AgenticScraperSchema = z.object({ - url: z.string().url(), - steps: z.array(z.string()).optional(), - user_prompt: 
z.string().optional(),
-  output_schema: jsonObject.optional(),
-  ai_extraction: z.boolean().optional(),
-  use_session: z.boolean().optional(),
-});
-
-export const HISTORY_SERVICES = [
-  "markdownify",
-  "smartscraper",
-  "searchscraper",
-  "scrape",
-  "crawl",
-  "agentic-scraper",
-  "sitemap",
-] as const;
-
-export const HistorySchema = z.object({
-  service: z.enum(HISTORY_SERVICES),
-  page: z.number().int().positive().default(1),
-  page_size: z.number().int().positive().max(100).default(10),
-});
diff --git a/src/lib/scrapegraphai.ts b/src/lib/scrapegraphai.ts
deleted file mode 100644
index 1aa06c8..0000000
--- a/src/lib/scrapegraphai.ts
+++ /dev/null
@@ -1,357 +0,0 @@
-import type {
-  AgenticScraperParams,
-  AgenticScraperResponse,
-  ApiResult,
-  CrawlParams,
-  CrawlResponse,
-  CreditsResponse,
-  GenerateSchemaParams,
-  GenerateSchemaResponse,
-  HealthResponse,
-  HistoryParams,
-  HistoryResponse,
-  MarkdownifyParams,
-  MarkdownifyResponse,
-  ScrapeParams,
-  ScrapeResponse,
-  SearchScraperParams,
-  SearchScraperResponse,
-  SitemapParams,
-  SitemapResponse,
-  SmartScraperParams,
-  SmartScraperResponse,
-} from "../types/index.js";
-import { env } from "./env.js";
-import {
-  AgenticScraperSchema,
-  CrawlSchema,
-  GenerateSchemaSchema,
-  HistorySchema,
-  MarkdownifySchema,
-  ScrapeSchema,
-  SearchScraperSchema,
-  SitemapSchema,
-  SmartScraperSchema,
-} from "./schemas.js";
-export type {
-  AgenticScraperParams,
-  AgenticScraperResponse,
-  ApiResult,
-  CrawlParams,
-  CrawlResponse,
-  CreditsResponse,
-  GenerateSchemaParams,
-  GenerateSchemaResponse,
-  HealthResponse,
-  HistoryParams,
-  HistoryResponse,
-  MarkdownifyParams,
-  MarkdownifyResponse,
-  ScrapeParams,
-  ScrapeResponse,
-  SearchScraperParams,
-  SearchScraperResponse,
-  SitemapParams,
-  SitemapResponse,
-  SmartScraperParams,
-  SmartScraperResponse,
-} from "../types/index.js";
-
-const BASE_URL = process.env.JUST_SCRAPE_API_URL || "https://api.scrapegraphai.com/v1";
-const POLL_INTERVAL_MS = 3000;
-
-function debug(label: string, data?: unknown) {
-  if (!env.debug) return;
-  const ts = new Date().toISOString();
-  if (data !== undefined) console.error(`[${ts}] ${label}`, JSON.stringify(data, null, 2));
-  else console.error(`[${ts}] ${label}`);
-}
-
-function ok<T>(data: T, elapsedMs: number): ApiResult<T> {
-  return { status: "success", data, elapsedMs };
-}
-
-function fail(err: unknown): ApiResult<never> {
-  if (err instanceof DOMException && err.name === "TimeoutError")
-    return { status: "error", data: null, error: "Request timed out", elapsedMs: 0 };
-  const message =
-    err instanceof Error ? err.message : typeof err === "string" ? err : "Unknown error";
-  return { status: "error", data: null, error: message, elapsedMs: 0 };
-}
-
-function mapHttpError(status: number): string {
-  switch (status) {
-    case 401:
-      return "Invalid or missing API key";
-    case 402:
-      return "Insufficient credits — purchase more at https://dashboard.scrapegraphai.com";
-    case 422:
-      return "Invalid parameters — check your request";
-    case 429:
-      return "Rate limited — slow down and retry";
-    case 500:
-      return "Server error — try again later";
-    default:
-      return `HTTP ${status}`;
-  }
-}
-
-type RequestResult<T> = { data: T; elapsedMs: number };
-
-async function request<T>(
-  method: "GET" | "POST",
-  path: string,
-  apiKey: string,
-  body?: object,
-): Promise<RequestResult<T>> {
-  const url = `${BASE_URL}${path}`;
-  debug(`→ ${method} ${url}`, body);
-
-  const start = performance.now();
-  const res = await fetch(url, {
-    method,
-    headers: {
-      "SGAI-APIKEY": apiKey,
-      ...(body ? 
{ "Content-Type": "application/json" } : {}), - }, - body: body ? JSON.stringify(body) : undefined, - signal: AbortSignal.timeout(env.timeoutS * 1000), - }); - - if (!res.ok) { - let detail = mapHttpError(res.status); - try { - const errBody = await res.json(); - debug(`← ${res.status}`, errBody); - if (errBody?.detail) detail = `${detail}: ${errBody.detail}`; - } catch {} - throw new Error(detail); - } - - const data = (await res.json()) as T; - const elapsedMs = Math.round(performance.now() - start); - debug(`← ${res.status} (${elapsedMs}ms)`, data); - return { data, elapsedMs }; -} - -type PollResponse = { - status: string; - error?: string; - [key: string]: unknown; -}; - -function isDone(status: string) { - return status === "completed" || status === "done"; -} - -async function pollUntilDone( - path: string, - id: string, - apiKey: string, - onPoll?: (status: string) => void, -): Promise> { - const deadline = Date.now() + env.timeoutS * 1000; - let totalMs = 0; - - while (Date.now() < deadline) { - const { data, elapsedMs } = await request("GET", `${path}/${id}`, apiKey); - totalMs += elapsedMs; - onPoll?.(data.status); - - if (isDone(data.status)) return { data, elapsedMs: totalMs }; - if (data.status === "failed") throw new Error(data.error ?? "Job failed"); - - await new Promise((r) => setTimeout(r, POLL_INTERVAL_MS)); - } - - throw new Error("Polling timed out"); -} - -async function submitAndPoll( - path: string, - apiKey: string, - body: object, - idField: string, - onPoll?: (status: string) => void, -): Promise> { - const { data: res, elapsedMs } = await request("POST", path, apiKey, body); - if (isDone(res.status)) return { data: res as unknown as T, elapsedMs }; - const id = res[idField]; - if (typeof id !== "string") throw new Error(`Missing ${idField} in response`); - const poll = await pollUntilDone(path, id, apiKey, onPoll); - return { data: poll.data as unknown as T, elapsedMs: elapsedMs + poll.elapsedMs }; -} - -export async function smartScraper( - apiKey: string, - params: SmartScraperParams, -): Promise> { - try { - SmartScraperSchema.parse(params); - const { data, elapsedMs } = await request( - "POST", - "/smartscraper", - apiKey, - params, - ); - return ok(data, elapsedMs); - } catch (err) { - return fail(err); - } -} - -export async function searchScraper( - apiKey: string, - params: SearchScraperParams, -): Promise> { - try { - SearchScraperSchema.parse(params); - const { data, elapsedMs } = await request( - "POST", - "/searchscraper", - apiKey, - params, - ); - return ok(data, elapsedMs); - } catch (err) { - return fail(err); - } -} - -export async function markdownify( - apiKey: string, - params: MarkdownifyParams, -): Promise> { - try { - MarkdownifySchema.parse(params); - const { data, elapsedMs } = await request( - "POST", - "/markdownify", - apiKey, - params, - ); - return ok(data, elapsedMs); - } catch (err) { - return fail(err); - } -} - -export async function scrape( - apiKey: string, - params: ScrapeParams, -): Promise> { - try { - ScrapeSchema.parse(params); - const { data, elapsedMs } = await request("POST", "/scrape", apiKey, params); - return ok(data, elapsedMs); - } catch (err) { - return fail(err); - } -} - -export async function crawl( - apiKey: string, - params: CrawlParams, - onPoll?: (status: string) => void, -): Promise> { - try { - CrawlSchema.parse(params); - const { data, elapsedMs } = await submitAndPoll( - "/crawl", - apiKey, - params, - "crawl_id", - onPoll, - ); - return ok(data, elapsedMs); - } catch (err) { - return fail(err); - } 
-}
-
-export async function agenticScraper(
-  apiKey: string,
-  params: AgenticScraperParams,
-): Promise<ApiResult<AgenticScraperResponse>> {
-  try {
-    AgenticScraperSchema.parse(params);
-    const { data, elapsedMs } = await request<AgenticScraperResponse>(
-      "POST",
-      "/agentic-scrapper",
-      apiKey,
-      params,
-    );
-    return ok(data, elapsedMs);
-  } catch (err) {
-    return fail(err);
-  }
-}
-
-export async function generateSchema(
-  apiKey: string,
-  params: GenerateSchemaParams,
-): Promise<ApiResult<GenerateSchemaResponse>> {
-  try {
-    GenerateSchemaSchema.parse(params);
-    const { data, elapsedMs } = await request<GenerateSchemaResponse>(
-      "POST",
-      "/generate_schema",
-      apiKey,
-      params,
-    );
-    return ok(data, elapsedMs);
-  } catch (err) {
-    return fail(err);
-  }
-}
-
-export async function sitemap(
-  apiKey: string,
-  params: SitemapParams,
-): Promise<ApiResult<SitemapResponse>> {
-  try {
-    SitemapSchema.parse(params);
-    const { data, elapsedMs } = await request<SitemapResponse>("POST", "/sitemap", apiKey, params);
-    return ok(data, elapsedMs);
-  } catch (err) {
-    return fail(err);
-  }
-}
-
-export async function getCredits(apiKey: string): Promise<ApiResult<CreditsResponse>> {
-  try {
-    const { data, elapsedMs } = await request<CreditsResponse>("GET", "/credits", apiKey);
-    return ok(data, elapsedMs);
-  } catch (err) {
-    return fail(err);
-  }
-}
-
-export async function checkHealth(apiKey: string): Promise<ApiResult<HealthResponse>> {
-  try {
-    const { data, elapsedMs } = await request<HealthResponse>("GET", "/healthz", apiKey);
-    return ok(data, elapsedMs);
-  } catch (err) {
-    return fail(err);
-  }
-}
-
-export async function history(
-  apiKey: string,
-  params: HistoryParams,
-): Promise<ApiResult<HistoryResponse>> {
-  try {
-    const parsed = HistorySchema.parse(params);
-    const qs = new URLSearchParams();
-    qs.set("page", String(parsed.page));
-    qs.set("page_size", String(parsed.page_size));
-    const { data, elapsedMs } = await request<HistoryResponse>(
-      "GET",
-      `/history/${parsed.service}?${qs}`,
-      apiKey,
-    );
-    return ok(data, elapsedMs);
-  } catch (err) {
-    return fail(err);
-  }
-}
diff --git a/src/types/index.ts b/src/types/index.ts
deleted file mode 100644
index 6e6dba6..0000000
--- a/src/types/index.ts
+++ /dev/null
@@ -1,125 +0,0 @@
-import type { z } from "zod";
-import type {
-  AgenticScraperSchema,
-  CrawlSchema,
-  GenerateSchemaSchema,
-  HistorySchema,
-  MarkdownifySchema,
-  ScrapeSchema,
-  SearchScraperSchema,
-  SitemapSchema,
-  SmartScraperSchema,
-} from "../lib/schemas.js";
-
-export type SmartScraperParams = z.infer<typeof SmartScraperSchema>;
-export type SearchScraperParams = z.infer<typeof SearchScraperSchema>;
-export type MarkdownifyParams = z.infer<typeof MarkdownifySchema>;
-export type CrawlParams = z.infer<typeof CrawlSchema>;
-export type GenerateSchemaParams = z.infer<typeof GenerateSchemaSchema>;
-export type SitemapParams = z.infer<typeof SitemapSchema>;
-export type ScrapeParams = z.infer<typeof ScrapeSchema>;
-export type AgenticScraperParams = z.infer<typeof AgenticScraperSchema>;
-export type HistoryParams = z.input<typeof HistorySchema>;
-
-export type ApiResult<T> = {
-  status: "success" | "error";
-  data: T | null;
-  error?: string;
-  elapsedMs: number;
-};
-
-export type SmartScraperResponse = {
-  request_id: string;
-  status: string;
-  website_url: string;
-  user_prompt: string;
-  result: Record<string, unknown> | null;
-  error?: string;
-};
-
-export type SearchScraperResponse = {
-  request_id: string;
-  status: string;
-  user_prompt: string;
-  result: Record<string, unknown> | null;
-  reference_urls: string[];
-  error?: string;
-};
-
-export type MarkdownifyResponse = {
-  request_id: string;
-  status: string;
-  website_url: string;
-  result: string | null;
-  error?: string;
-};
-
-export type CrawlPage = {
-  url: string;
-  markdown: string;
-};
-
-export type CrawlResponse = {
-  crawl_id: string;
-  status: string;
-  result?: Record<string, unknown> | null;
-  crawled_urls?: string[];
-  pages?: CrawlPage[];
-  error?: string;
-};
-
-export type ScrapeResponse = {
-  request_id: string;
-  status: string;
-  html: string;
-  branding?: Record<string, unknown>;
-  error?: string;
-};
-
-export type AgenticScraperResponse = {
-  request_id: string;
-  status: string;
-  result: Record<string, unknown> | null;
-  error?: string;
-};
-
-export type GenerateSchemaResponse = {
-  request_id: string;
-  status: string;
-  user_prompt: string;
-  refined_prompt?: string | null;
-  generated_schema?: Record<string, unknown> | null;
-  error?: string | null;
-  created_at?: string | null;
-  updated_at?: string | null;
-};
-
-export type SitemapResponse = {
-  request_id?: string;
-  status?: string;
-  website_url?: string;
-  urls: string[];
-  error?: string;
-};
-
-export type CreditsResponse = {
-  remaining_credits: number;
-  total_credits_used: number;
-};
-
-export type HealthResponse = {
-  status: string;
-};
-
-export type HistoryResponse = {
-  requests: HistoryEntry[];
-  total_count: number;
-  page: number;
-  page_size: number;
-};
-
-export type HistoryEntry = {
-  request_id: string;
-  status: string;
-  [key: string]: unknown;
-};
diff --git a/tests/scrapegraphai.test.ts b/tests/scrapegraphai.test.ts
deleted file mode 100644
index 3b68525..0000000
--- a/tests/scrapegraphai.test.ts
+++ /dev/null
@@ -1,372 +0,0 @@
-import { afterEach, describe, expect, mock, spyOn, test } from "bun:test";
-
-mock.module("../src/lib/env.js", () => ({
-  env: { debug: false, timeoutS: 120 },
-  CONFIG_DIR: "/tmp/test-scrapegraphai",
-  CONFIG_PATH: "/tmp/test-scrapegraphai/config.json",
-}));
-
-import * as scrapegraphai from "../src/lib/scrapegraphai.js";
-
-const API_KEY = "test-sgai-key-abc123";
-const BASE = "https://api.scrapegraphai.com/v1";
-
-function json(body: unknown, status = 200): Response {
-  return new Response(JSON.stringify(body), {
-    status,
-    headers: { "Content-Type": "application/json" },
-  });
-}
-
-let fetchSpy: ReturnType<typeof spyOn<typeof globalThis, "fetch">>;
-
-afterEach(() => {
-  fetchSpy?.mockRestore();
-});
-
-function expectPost(callIndex: number, path: string, body?: object) {
-  const [url, init] = fetchSpy.mock.calls[callIndex] as [string, RequestInit];
-  expect(url).toBe(`${BASE}${path}`);
-  expect(init.method).toBe("POST");
-  expect((init.headers as Record<string, string>)["SGAI-APIKEY"]).toBe(API_KEY);
-  expect((init.headers as Record<string, string>)["Content-Type"]).toBe("application/json");
-  if (body) expect(JSON.parse(init.body as string)).toEqual(body);
-}
-
-function expectGet(callIndex: number, path: string) {
-  const [url, init] = fetchSpy.mock.calls[callIndex] as [string, RequestInit];
-  expect(url).toBe(`${BASE}${path}`);
-  expect(init.method).toBe("GET");
-  expect((init.headers as Record<string, string>)["SGAI-APIKEY"]).toBe(API_KEY);
-}
-
-// ---------------------------------------------------------------------------
-// smartScraper — sync endpoint, tests shared HTTP internals
-// ---------------------------------------------------------------------------
-
-describe("smartScraper", () => {
-  const params = { user_prompt: "Extract prices", website_url: "https://example.com" };
-
-  test("success", async () => {
-    const body = { status: "completed", result: { prices: [10, 20] } };
-    fetchSpy = spyOn(globalThis, "fetch").mockResolvedValueOnce(json(body));
-
-    const res = await scrapegraphai.smartScraper(API_KEY, params);
-
-    expect(res.status).toBe("success");
-    expect(res.data).toEqual(body);
-    expect(res.elapsedMs).toBeGreaterThanOrEqual(0);
-    expect(fetchSpy).toHaveBeenCalledTimes(1);
-    expectPost(0, "/smartscraper", params);
-  });
-
-  test("validation failure", async () => {
-    const res = await scrapegraphai.smartScraper(API_KEY, { user_prompt: "" } as any);
-
-    expect(res.status).toBe("error");
-    
expect(res.error).toBeDefined(); - }); - - test("HTTP 401", async () => { - fetchSpy = spyOn(globalThis, "fetch").mockResolvedValueOnce( - json({ detail: "Invalid key" }, 401), - ); - const res = await scrapegraphai.smartScraper(API_KEY, params); - - expect(res.status).toBe("error"); - expect(res.error).toContain("Invalid or missing API key"); - }); - - test("HTTP 402", async () => { - fetchSpy = spyOn(globalThis, "fetch").mockResolvedValueOnce(json({}, 402)); - const res = await scrapegraphai.smartScraper(API_KEY, params); - - expect(res.status).toBe("error"); - expect(res.error).toContain("Insufficient credits"); - }); - - test("HTTP 422", async () => { - fetchSpy = spyOn(globalThis, "fetch").mockResolvedValueOnce(json({}, 422)); - const res = await scrapegraphai.smartScraper(API_KEY, params); - - expect(res.status).toBe("error"); - expect(res.error).toContain("Invalid parameters"); - }); - - test("HTTP 429", async () => { - fetchSpy = spyOn(globalThis, "fetch").mockResolvedValueOnce(json({}, 429)); - const res = await scrapegraphai.smartScraper(API_KEY, params); - - expect(res.status).toBe("error"); - expect(res.error).toContain("Rate limited"); - }); - - test("HTTP 500", async () => { - fetchSpy = spyOn(globalThis, "fetch").mockResolvedValueOnce(json({}, 500)); - const res = await scrapegraphai.smartScraper(API_KEY, params); - - expect(res.status).toBe("error"); - expect(res.error).toContain("Server error"); - }); - - test("HTTP error with detail", async () => { - fetchSpy = spyOn(globalThis, "fetch").mockResolvedValueOnce( - json({ detail: "quota exceeded" }, 402), - ); - const res = await scrapegraphai.smartScraper(API_KEY, params); - - expect(res.status).toBe("error"); - expect(res.error).toContain("quota exceeded"); - }); - - test("timeout", async () => { - fetchSpy = spyOn(globalThis, "fetch").mockRejectedValueOnce( - new DOMException("The operation was aborted", "TimeoutError"), - ); - const res = await scrapegraphai.smartScraper(API_KEY, params); - - expect(res.status).toBe("error"); - expect(res.error).toBe("Request timed out"); - }); - - test("network error", async () => { - fetchSpy = spyOn(globalThis, "fetch").mockRejectedValueOnce(new Error("fetch failed")); - const res = await scrapegraphai.smartScraper(API_KEY, params); - - expect(res.status).toBe("error"); - expect(res.error).toBe("fetch failed"); - }); -}); - -// --------------------------------------------------------------------------- -// searchScraper -// --------------------------------------------------------------------------- - -describe("searchScraper", () => { - const params = { user_prompt: "Best pizza in NYC" }; - - test("success", async () => { - const body = { status: "completed", results: [{ title: "Joe's Pizza" }] }; - fetchSpy = spyOn(globalThis, "fetch").mockResolvedValueOnce(json(body)); - - const res = await scrapegraphai.searchScraper(API_KEY, params); - - expect(res.status).toBe("success"); - expect(res.data).toEqual(body); - expectPost(0, "/searchscraper", params); - }); - - test("validation failure", async () => { - const res = await scrapegraphai.searchScraper(API_KEY, { user_prompt: "" } as any); - expect(res.status).toBe("error"); - }); -}); - -// --------------------------------------------------------------------------- -// markdownify -// --------------------------------------------------------------------------- - -describe("markdownify", () => { - const params = { website_url: "https://example.com" }; - - test("success", async () => { - const body = { status: "completed", result: "# 
Hello" }; - fetchSpy = spyOn(globalThis, "fetch").mockResolvedValueOnce(json(body)); - - const res = await scrapegraphai.markdownify(API_KEY, params); - - expect(res.status).toBe("success"); - expect(res.data).toEqual(body); - expectPost(0, "/markdownify", params); - }); - - test("validation failure", async () => { - const res = await scrapegraphai.markdownify(API_KEY, { website_url: "not-a-url" } as any); - expect(res.status).toBe("error"); - }); -}); - -// --------------------------------------------------------------------------- -// scrape -// --------------------------------------------------------------------------- - -describe("scrape", () => { - const params = { website_url: "https://example.com" }; - - test("success", async () => { - const body = { status: "completed", html: "..." }; - fetchSpy = spyOn(globalThis, "fetch").mockResolvedValueOnce(json(body)); - - const res = await scrapegraphai.scrape(API_KEY, params); - - expect(res.status).toBe("success"); - expectPost(0, "/scrape", params); - }); - - test("validation failure", async () => { - const res = await scrapegraphai.scrape(API_KEY, { website_url: "" } as any); - expect(res.status).toBe("error"); - }); -}); - -// --------------------------------------------------------------------------- -// crawl — async endpoint, uses crawl_id, tests polling internals -// --------------------------------------------------------------------------- - -describe("crawl", () => { - const params = { url: "https://example.com" }; - - test("immediate completion", async () => { - const body = { status: "done", pages: [{ url: "https://example.com", content: "data" }] }; - fetchSpy = spyOn(globalThis, "fetch").mockResolvedValueOnce(json(body)); - - const res = await scrapegraphai.crawl(API_KEY, params); - - expect(res.status).toBe("success"); - expect(res.data).toEqual(body); - expectPost(0, "/crawl", params); - }); - - test("polls with crawl_id", async () => { - fetchSpy = spyOn(globalThis, "fetch") - .mockResolvedValueOnce(json({ status: "pending", crawl_id: "crawl-99" })) - .mockResolvedValueOnce(json({ status: "done", crawl_id: "crawl-99", pages: [] })); - - const res = await scrapegraphai.crawl(API_KEY, params); - - expect(res.status).toBe("success"); - expect(fetchSpy).toHaveBeenCalledTimes(2); - expectGet(1, "/crawl/crawl-99"); - }); - - test("calls onPoll callback", async () => { - const statuses: string[] = []; - fetchSpy = spyOn(globalThis, "fetch") - .mockResolvedValueOnce(json({ status: "pending", crawl_id: "crawl-99" })) - .mockResolvedValueOnce(json({ status: "done", crawl_id: "crawl-99", pages: [] })); - - await scrapegraphai.crawl(API_KEY, params, (s) => statuses.push(s)); - - expect(statuses).toEqual(["done"]); - }); - - test("poll failure", async () => { - fetchSpy = spyOn(globalThis, "fetch") - .mockResolvedValueOnce(json({ status: "pending", crawl_id: "crawl-99" })) - .mockResolvedValueOnce(json({ status: "failed", error: "Crawl exploded" })); - - const res = await scrapegraphai.crawl(API_KEY, params); - - expect(res.status).toBe("error"); - expect(res.error).toBe("Crawl exploded"); - }); - - test("validation failure", async () => { - const res = await scrapegraphai.crawl(API_KEY, { url: "not-a-url" } as any); - expect(res.status).toBe("error"); - }); -}); - -// --------------------------------------------------------------------------- -// agenticScraper -// --------------------------------------------------------------------------- - -describe("agenticScraper", () => { - const params = { url: "https://example.com", steps: 
["Click login"] }; - - test("success", async () => { - const body = { status: "completed", result: { screenshot: "base64..." } }; - fetchSpy = spyOn(globalThis, "fetch").mockResolvedValueOnce(json(body)); - - const res = await scrapegraphai.agenticScraper(API_KEY, params); - - expect(res.status).toBe("success"); - expectPost(0, "/agentic-scrapper", params); - }); - - test("validation failure", async () => { - const res = await scrapegraphai.agenticScraper(API_KEY, { url: "nope" } as any); - expect(res.status).toBe("error"); - }); -}); - -// --------------------------------------------------------------------------- -// generateSchema -// --------------------------------------------------------------------------- - -describe("generateSchema", () => { - const params = { user_prompt: "Schema for product" }; - - test("success", async () => { - const body = { status: "completed", schema: { type: "object" } }; - fetchSpy = spyOn(globalThis, "fetch").mockResolvedValueOnce(json(body)); - - const res = await scrapegraphai.generateSchema(API_KEY, params); - - expect(res.status).toBe("success"); - expectPost(0, "/generate_schema", params); - }); - - test("validation failure", async () => { - const res = await scrapegraphai.generateSchema(API_KEY, { user_prompt: "" } as any); - expect(res.status).toBe("error"); - }); -}); - -// --------------------------------------------------------------------------- -// sitemap — sync endpoint (POST, no polling) -// --------------------------------------------------------------------------- - -describe("sitemap", () => { - const params = { website_url: "https://example.com" }; - - test("success", async () => { - const body = { urls: ["https://example.com/a", "https://example.com/b"] }; - fetchSpy = spyOn(globalThis, "fetch").mockResolvedValueOnce(json(body)); - - const res = await scrapegraphai.sitemap(API_KEY, params); - - expect(res.status).toBe("success"); - expect(res.data).toEqual(body); - expectPost(0, "/sitemap", params); - }); - - test("validation failure", async () => { - const res = await scrapegraphai.sitemap(API_KEY, { website_url: "garbage" } as any); - expect(res.status).toBe("error"); - }); -}); - -// --------------------------------------------------------------------------- -// getCredits — GET, no body -// --------------------------------------------------------------------------- - -describe("getCredits", () => { - test("success", async () => { - const body = { remaining_credits: 420, total_credits_used: 69 }; - fetchSpy = spyOn(globalThis, "fetch").mockResolvedValueOnce(json(body)); - - const res = await scrapegraphai.getCredits(API_KEY); - - expect(res.status).toBe("success"); - expect(res.data).toEqual(body); - expectGet(0, "/credits"); - }); -}); - -// --------------------------------------------------------------------------- -// checkHealth — GET, no body -// --------------------------------------------------------------------------- - -describe("checkHealth", () => { - test("success", async () => { - const body = { status: "ok" }; - fetchSpy = spyOn(globalThis, "fetch").mockResolvedValueOnce(json(body)); - - const res = await scrapegraphai.checkHealth(API_KEY); - - expect(res.status).toBe("success"); - expect(res.data).toEqual(body); - expectGet(0, "/healthz"); - }); -}); diff --git a/tests/smoke.test.ts b/tests/smoke.test.ts new file mode 100644 index 0000000..e2fab44 --- /dev/null +++ b/tests/smoke.test.ts @@ -0,0 +1,7 @@ +import { expect, test } from "bun:test"; +import { HISTORY_SERVICES, smartScraper } from "scrapegraph-js"; + 
+test("sdk exports are available", () => { + expect(typeof smartScraper).toBe("function"); + expect(HISTORY_SERVICES.length).toBeGreaterThan(0); +}); From 8f30ab1e82dd08594180bee06f97a3aa35341354 Mon Sep 17 00:00:00 2001 From: FrancescoSaverioZuppichini Date: Tue, 17 Feb 2026 15:40:19 +0100 Subject: [PATCH 2/3] chore: bump version to 0.2.0 Co-Authored-By: Claude Opus 4.6 --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index e2ac14e..de2445b 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "just-scrape", - "version": "0.1.9", + "version": "0.2.0", "description": "ScrapeGraph AI CLI tool", "type": "module", "main": "dist/cli.mjs", From 433ee742a6a6897bb957d0938fc0d85e210d0e7b Mon Sep 17 00:00:00 2001 From: FrancescoSaverioZuppichini Date: Tue, 17 Feb 2026 15:43:13 +0100 Subject: [PATCH 3/3] updated README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 7af3988..4531ebb 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,7 @@ pnpm add -g just-scrape # pnpm yarn global add just-scrape # yarn bun add -g just-scrape # bun npx just-scrape --help # or run without installing +bunx just-scrape --help # bun equivalent ``` Package: [just-scrape](https://www.npmjs.com/package/just-scrape) on npm.