diff --git a/crawl4ai/cli.py b/crawl4ai/cli.py
index 51b535002..2f8cec2fe 100644
--- a/crawl4ai/cli.py
+++ b/crawl4ai/cli.py
@@ -521,11 +521,15 @@ async def crawl_with_profile_cli(profile_path, url):
         # Run the crawler
         result = await run_crawler(url, browser_cfg, crawler_cfg, True)
 
+        # Get JSON output config
+        config = get_global_config()
+        ensure_ascii = config.get("JSON_ENSURE_ASCII", USER_SETTINGS["JSON_ENSURE_ASCII"]["default"])
+
         # Handle output
         if output_format == "all":
-            console.print(json.dumps(result.model_dump(), indent=2))
+            console.print(json.dumps(result.model_dump(), indent=2, ensure_ascii=ensure_ascii))
         elif output_format == "json":
-            console.print(json.dumps(json.loads(result.extracted_content), indent=2))
+            console.print(json.dumps(json.loads(result.extracted_content), indent=2, ensure_ascii=ensure_ascii))
         elif output_format in ["markdown", "md"]:
             console.print(result.markdown.raw_markdown)
         elif output_format == "title":
@@ -1019,9 +1023,10 @@ def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless
 @click.option("--profile", "-p", help="Use a specific browser profile (by name)")
 @click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy (bfs, dfs, or best-first)")
 @click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
+@click.option("--json-ensure-ascii/--no-json-ensure-ascii", default=None, help="Escape non-ASCII characters in JSON output (default: from global config)")
 def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str,
               extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
-              output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
+              output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int, json_ensure_ascii: Optional[bool]):
     """Crawl a website and extract content
 
     Simple Usage:
@@ -1187,6 +1192,12 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
         browser_cfg.verbose = config.get("VERBOSE", False)
         crawler_cfg.verbose = config.get("VERBOSE", False)
 
+    # Get JSON output config (priority: CLI flag > global config)
+    if json_ensure_ascii is not None:
+        ensure_ascii = json_ensure_ascii
+    else:
+        ensure_ascii = config.get("JSON_ENSURE_ASCII", USER_SETTINGS["JSON_ENSURE_ASCII"]["default"])
+
     # Run crawler
     result : CrawlResult = anyio.run(
         run_crawler,
@@ -1221,13 +1232,13 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
     if output == "all":
         if isinstance(result, list):
             output_data = [r.model_dump() for r in all_results]
-            click.echo(json.dumps(output_data, indent=2))
+            click.echo(json.dumps(output_data, indent=2, ensure_ascii=ensure_ascii))
         else:
-            click.echo(json.dumps(main_result.model_dump(), indent=2))
+            click.echo(json.dumps(main_result.model_dump(), indent=2, ensure_ascii=ensure_ascii))
     elif output == "json":
         print(main_result.extracted_content)
         extracted_items = json.loads(main_result.extracted_content)
-        click.echo(json.dumps(extracted_items, indent=2))
+        click.echo(json.dumps(extracted_items, indent=2, ensure_ascii=ensure_ascii))
     elif output in ["markdown", "md"]:
         click.echo(main_result.markdown.raw_markdown)
 
@@ -1238,9 +1249,9 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
             with open(output_file, "w") as f:
                 if isinstance(result, list):
                     output_data = [r.model_dump() for r in all_results]
-                    f.write(json.dumps(output_data, indent=2))
+                    f.write(json.dumps(output_data, indent=2, ensure_ascii=ensure_ascii))
                 else:
-                    f.write(json.dumps(main_result.model_dump(), indent=2))
+                    f.write(json.dumps(main_result.model_dump(), indent=2, ensure_ascii=ensure_ascii))
         elif output == "json":
             with open(output_file, "w") as f:
                 f.write(main_result.extracted_content)
@@ -1403,9 +1414,10 @@ def profiles_cmd():
 @click.option("--profile", "-p", help="Use a specific browser profile (by name)")
 @click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy")
 @click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
+@click.option("--json-ensure-ascii/--no-json-ensure-ascii", default=None, help="Escape non-ASCII characters in JSON output (default: from global config)")
 def default(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str,
             extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
-            output: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
+            output: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int, json_ensure_ascii: Optional[bool]):
     """Crawl4AI CLI - Web content extraction tool
 
     Simple Usage:
@@ -1457,7 +1469,8 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f
         verbose=verbose,
         profile=profile,
         deep_crawl=deep_crawl,
-        max_pages=max_pages
+        max_pages=max_pages,
+        json_ensure_ascii=json_ensure_ascii
     )
 
 def main():
diff --git a/crawl4ai/config.py b/crawl4ai/config.py
index 08f56b832..5686f633f 100644
--- a/crawl4ai/config.py
+++ b/crawl4ai/config.py
@@ -142,5 +142,10 @@
         "description": "Default user agent mode (default, random, or mobile)",
         "type": "string",
         "options": ["default", "random", "mobile"]
+    },
+    "JSON_ENSURE_ASCII": {
+        "default": True,
+        "description": "Whether to escape non-ASCII characters in JSON output (False preserves Unicode like 'š', True escapes as '\\u0161')",
+        "type": "boolean"
     }
 }
diff --git a/docs/codebase/cli.md b/docs/codebase/cli.md
index 6f5a348b1..fece7be89 100644
--- a/docs/codebase/cli.md
+++ b/docs/codebase/cli.md
@@ -10,7 +10,7 @@
 | **config get** | `key` | Prints the value of a single setting, falls back to default if unset. |
 | **config set** | `key value` | Persists a new value in the global config (stored under `~/.crawl4ai/config.yml`). |
 | **examples** | – | Just spits out real-world CLI usage samples. |
-| **crawl** | `url` *(positional)* <br>`--browser-config,-B` path <br>`--crawler-config,-C` path <br>`--filter-config,-f` path <br>`--extraction-config,-e` path <br>`--json-extract,-j` [desc]\* <br>`--schema,-s` path <br>`--browser,-b` k=v list <br>`--crawler,-c` k=v list <br>`--output,-o` all,json,markdown,md,markdown-fit,md-fit *(default all)* <br>`--output-file,-O` path <br>`--bypass-cache,-b` *(flag, default true — note flag reuse)* <br>`--question,-q` str <br>`--verbose,-v` *(flag)* <br>`--profile,-p` profile-name | One-shot crawl + extraction. Builds `BrowserConfig` and `CrawlerRunConfig` from inline flags or separate YAML/JSON files, runs `AsyncWebCrawler.run()`, can route through a named saved profile and pipe the result to stdout or a file. |
+| **crawl** | `url` *(positional)* <br>`--browser-config,-B` path <br>`--crawler-config,-C` path <br>`--filter-config,-f` path <br>`--extraction-config,-e` path <br>`--json-extract,-j` [desc]\* <br>`--schema,-s` path <br>`--browser,-b` k=v list <br>`--crawler,-c` k=v list <br>`--output,-o` all,json,markdown,md,markdown-fit,md-fit *(default all)* <br>`--output-file,-O` path <br>`--bypass-cache,-b` *(flag, default true — note flag reuse)* <br>`--question,-q` str <br>`--verbose,-v` *(flag)* <br>`--profile,-p` profile-name <br>`--json-ensure-ascii/--no-json-ensure-ascii` *(flag, default from global config)* | One-shot crawl + extraction. Builds `BrowserConfig` and `CrawlerRunConfig` from inline flags or separate YAML/JSON files, runs `AsyncWebCrawler.run()`, can route through a named saved profile and pipe the result to stdout or a file. Control Unicode escaping in JSON output with the `--json-ensure-ascii/--no-json-ensure-ascii` flags. |
 | **(default)** | Same flags as **crawl**, plus `--example` | Shortcut so you can type just `crwl https://site.com`. When first arg is not a known sub-command, it falls through to *crawl*. |
 
 \* `--json-extract/-j` with no value turns on LLM-based JSON extraction using an auto schema, supplying a string lets you prompt-engineer the field descriptions.
diff --git a/docs/md_v2/core/cli.md b/docs/md_v2/core/cli.md
index ded35f2f6..effbc306d 100644
--- a/docs/md_v2/core/cli.md
+++ b/docs/md_v2/core/cli.md
@@ -235,6 +235,21 @@ crwl https://example.com -f filter_bm25.yml -o markdown-fit
 - `markdown` / `md` - Raw markdown output
 - `markdown-fit` / `md-fit` - Filtered markdown for better readability
 
+### Unicode Handling in JSON Output
+
+By default, non-ASCII characters in JSON output are escaped (e.g., `š` becomes `\u0161`). You can control this behavior:
+
+```bash
+# Preserve Unicode characters in JSON output
+crwl https://example.com -o all --no-json-ensure-ascii
+
+# Escape non-ASCII characters (default behavior)
+crwl https://example.com -o all --json-ensure-ascii
+
+# Set global default
+crwl config set JSON_ENSURE_ASCII false # Preserve Unicode by default
+```
+
 ## Complete Examples
 
 1. Basic Extraction:
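
---

Reviewer note: the whole patch threads a single boolean through to `json.dumps`. For anyone unfamiliar with that knob, here is a minimal standalone sketch of the standard-library behavior being toggled; it uses only the stdlib `json` module, and the sample dict is invented for illustration, not data produced by crawl4ai:

```python
import json

# Hypothetical sample payload, chosen so it contains a non-ASCII character.
data = {"title": "Škoda"}

# ensure_ascii=True (Python's default): every non-ASCII character is
# escaped to a \uXXXX sequence, so the output stays 7-bit ASCII.
print(json.dumps(data, indent=2))
# {
#   "title": "\u0160koda"
# }

# ensure_ascii=False: characters are written through unchanged,
# which is what --no-json-ensure-ascii asks the CLI to do.
print(json.dumps(data, indent=2, ensure_ascii=False))
# {
#   "title": "Škoda"
# }
```

Both forms are valid JSON, and `json.loads` round-trips them to the same Python string, so the flag only changes how the text looks in a terminal or on disk. One thing worth double-checking in review: the file-output branch uses `open(output_file, "w")` with no explicit `encoding`, so unescaped Unicode will be written in the platform's default encoding.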