diff --git a/crawl4ai/cli.py b/crawl4ai/cli.py
index 51b535002..2f8cec2fe 100644
--- a/crawl4ai/cli.py
+++ b/crawl4ai/cli.py
@@ -521,11 +521,15 @@ async def crawl_with_profile_cli(profile_path, url):
# Run the crawler
result = await run_crawler(url, browser_cfg, crawler_cfg, True)
+ # Get JSON output config
+ config = get_global_config()
+ ensure_ascii = config.get("JSON_ENSURE_ASCII", USER_SETTINGS["JSON_ENSURE_ASCII"]["default"])
+
# Handle output
if output_format == "all":
- console.print(json.dumps(result.model_dump(), indent=2))
+ console.print(json.dumps(result.model_dump(), indent=2, ensure_ascii=ensure_ascii))
elif output_format == "json":
- console.print(json.dumps(json.loads(result.extracted_content), indent=2))
+ console.print(json.dumps(json.loads(result.extracted_content), indent=2, ensure_ascii=ensure_ascii))
elif output_format in ["markdown", "md"]:
console.print(result.markdown.raw_markdown)
elif output_format == "title":
@@ -1019,9 +1023,10 @@ def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy (bfs, dfs, or best-first)")
@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
+@click.option("--json-ensure-ascii/--no-json-ensure-ascii", default=None, help="Escape non-ASCII characters in JSON output (default: from global config)")
def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str,
extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
- output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
+ output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int, json_ensure_ascii: Optional[bool]):
"""Crawl a website and extract content
Simple Usage:
@@ -1187,6 +1192,12 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
browser_cfg.verbose = config.get("VERBOSE", False)
crawler_cfg.verbose = config.get("VERBOSE", False)
+ # Get JSON output config (priority: CLI flag > global config)
+ if json_ensure_ascii is not None:
+ ensure_ascii = json_ensure_ascii
+ else:
+ ensure_ascii = config.get("JSON_ENSURE_ASCII", USER_SETTINGS["JSON_ENSURE_ASCII"]["default"])
+
# Run crawler
result : CrawlResult = anyio.run(
run_crawler,
@@ -1221,13 +1232,13 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
if output == "all":
if isinstance(result, list):
output_data = [r.model_dump() for r in all_results]
- click.echo(json.dumps(output_data, indent=2))
+ click.echo(json.dumps(output_data, indent=2, ensure_ascii=ensure_ascii))
else:
- click.echo(json.dumps(main_result.model_dump(), indent=2))
+ click.echo(json.dumps(main_result.model_dump(), indent=2, ensure_ascii=ensure_ascii))
elif output == "json":
print(main_result.extracted_content)
extracted_items = json.loads(main_result.extracted_content)
- click.echo(json.dumps(extracted_items, indent=2))
+ click.echo(json.dumps(extracted_items, indent=2, ensure_ascii=ensure_ascii))
elif output in ["markdown", "md"]:
click.echo(main_result.markdown.raw_markdown)
@@ -1238,9 +1249,9 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
with open(output_file, "w") as f:
if isinstance(result, list):
output_data = [r.model_dump() for r in all_results]
- f.write(json.dumps(output_data, indent=2))
+ f.write(json.dumps(output_data, indent=2, ensure_ascii=ensure_ascii))
else:
- f.write(json.dumps(main_result.model_dump(), indent=2))
+ f.write(json.dumps(main_result.model_dump(), indent=2, ensure_ascii=ensure_ascii))
elif output == "json":
with open(output_file, "w") as f:
f.write(main_result.extracted_content)
@@ -1403,9 +1414,10 @@ def profiles_cmd():
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy")
@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
+@click.option("--json-ensure-ascii/--no-json-ensure-ascii", default=None, help="Escape non-ASCII characters in JSON output (default: from global config)")
def default(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str,
extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
- output: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
+ output: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int, json_ensure_ascii: Optional[bool]):
"""Crawl4AI CLI - Web content extraction tool
Simple Usage:
@@ -1457,7 +1469,8 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f
verbose=verbose,
profile=profile,
deep_crawl=deep_crawl,
- max_pages=max_pages
+ max_pages=max_pages,
+ json_ensure_ascii=json_ensure_ascii
)
def main():
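
Worth noting: declaring the paired option with `default=None` is what makes the precedence above work, since Click leaves the parameter as `None` when neither `--json-ensure-ascii` nor `--no-json-ensure-ascii` is passed. A minimal standalone sketch of that resolution, with names mirrored from the diff (the `resolve_ensure_ascii` helper is illustrative, not part of the patch):

```python
import click

# Mirrors the shipped default added in config.py below.
USER_SETTINGS = {"JSON_ENSURE_ASCII": {"default": True}}

def resolve_ensure_ascii(cli_value, global_config):
    # CLI flag wins when given; otherwise the saved global value; otherwise the shipped default.
    if cli_value is not None:
        return cli_value
    return global_config.get("JSON_ENSURE_ASCII", USER_SETTINGS["JSON_ENSURE_ASCII"]["default"])

@click.command()
@click.option("--json-ensure-ascii/--no-json-ensure-ascii", default=None)
def demo(json_ensure_ascii):
    # None means neither flag was passed, so the global config (empty here) decides.
    click.echo(resolve_ensure_ascii(json_ensure_ascii, global_config={}))

if __name__ == "__main__":
    demo()
```
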
diff --git a/crawl4ai/config.py b/crawl4ai/config.py
index 08f56b832..5686f633f 100644
--- a/crawl4ai/config.py
+++ b/crawl4ai/config.py
@@ -142,5 +142,10 @@
"description": "Default user agent mode (default, random, or mobile)",
"type": "string",
"options": ["default", "random", "mobile"]
+ },
+ "JSON_ENSURE_ASCII": {
+ "default": True,
+ "description": "Whether to escape non-ASCII characters in JSON output (False preserves Unicode like 'š', True escapes as '\\u0161')",
+ "type": "boolean"
}
}
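
For context, the new setting simply forwards to the standard-library `json.dumps(..., ensure_ascii=...)` parameter; the escaping difference it toggles looks like this:

```python
import json

data = {"city": "Plzeň", "note": "š"}
print(json.dumps(data, ensure_ascii=True))   # {"city": "Plze\u0148", "note": "\u0161"}
print(json.dumps(data, ensure_ascii=False))  # {"city": "Plzeň", "note": "š"}
```
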
diff --git a/docs/codebase/cli.md b/docs/codebase/cli.md
index 6f5a348b1..fece7be89 100644
--- a/docs/codebase/cli.md
+++ b/docs/codebase/cli.md
@@ -10,7 +10,7 @@
| **config get** | `key` | Prints the value of a single setting, falls back to default if unset. |
| **config set** | `key value` | Persists a new value in the global config (stored under `~/.crawl4ai/config.yml`). |
| **examples** | – | Just spits out real-world CLI usage samples. |
-| **crawl** | `url` *(positional)* <br> `--browser-config,-B` path <br> `--crawler-config,-C` path <br> `--filter-config,-f` path <br> `--extraction-config,-e` path <br> `--json-extract,-j` [desc]\* <br> `--schema,-s` path <br> `--browser,-b` k=v list <br> `--crawler,-c` k=v list <br> `--output,-o` all,json,markdown,md,markdown-fit,md-fit *(default all)* <br> `--output-file,-O` path <br> `--bypass-cache,-b` *(flag, default true — note flag reuse)* <br> `--question,-q` str <br> `--verbose,-v` *(flag)* <br> `--profile,-p` profile-name | One-shot crawl + extraction. Builds `BrowserConfig` and `CrawlerRunConfig` from inline flags or separate YAML/JSON files, runs `AsyncWebCrawler.run()`, can route through a named saved profile and pipe the result to stdout or a file. |
+| **crawl** | `url` *(positional)* <br> `--browser-config,-B` path <br> `--crawler-config,-C` path <br> `--filter-config,-f` path <br> `--extraction-config,-e` path <br> `--json-extract,-j` [desc]\* <br> `--schema,-s` path <br> `--browser,-b` k=v list <br> `--crawler,-c` k=v list <br> `--output,-o` all,json,markdown,md,markdown-fit,md-fit *(default all)* <br> `--output-file,-O` path <br> `--bypass-cache,-b` *(flag, default true — note flag reuse)* <br> `--question,-q` str <br> `--verbose,-v` *(flag)* <br> `--profile,-p` profile-name <br> `--json-ensure-ascii/--no-json-ensure-ascii` *(flag, default from global config)* | One-shot crawl + extraction. Builds `BrowserConfig` and `CrawlerRunConfig` from inline flags or separate YAML/JSON files, runs `AsyncWebCrawler.run()`, can route through a named saved profile and pipe the result to stdout or a file. Control Unicode escaping in JSON output with the `--json-ensure-ascii/--no-json-ensure-ascii` flag. |
| **(default)** | Same flags as **crawl**, plus `--example` | Shortcut so you can type just `crwl https://site.com`. When first arg is not a known sub-command, it falls through to *crawl*. |
\* `--json-extract/-j` with no value turns on LLM-based JSON extraction using an auto schema, supplying a string lets you prompt-engineer the field descriptions.
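
A quick downstream check of the documented flag (a rough sketch: assumes `crwl` is on PATH and network access is available; the URL is a placeholder) is to scan the output for `\uXXXX` escapes:

```python
import subprocess

# Hypothetical invocation of the CLI documented above.
out = subprocess.run(
    ["crwl", "https://example.com", "-o", "all", "--no-json-ensure-ascii"],
    capture_output=True, text=True,
).stdout
# Rough heuristic: escaped output is full of \uXXXX sequences, raw UTF-8 output is not.
print("contains \\uXXXX escapes:", "\\u" in out)
```
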
diff --git a/docs/md_v2/core/cli.md b/docs/md_v2/core/cli.md
index ded35f2f6..effbc306d 100644
--- a/docs/md_v2/core/cli.md
+++ b/docs/md_v2/core/cli.md
@@ -235,6 +235,21 @@ crwl https://example.com -f filter_bm25.yml -o markdown-fit
- `markdown` / `md` - Raw markdown output
- `markdown-fit` / `md-fit` - Filtered markdown for better readability
+### Unicode Handling in JSON Output
+
+By default, non-ASCII characters in JSON output are escaped (e.g., 'š' becomes `\u0161`). You can control this behavior:
+
+```bash
+# Preserve Unicode characters in JSON output
+crwl https://example.com -o all --no-json-ensure-ascii
+
+# Escape non-ASCII characters (default behavior)
+crwl https://example.com -o all --json-ensure-ascii
+
+# Set global default
+crwl config set JSON_ENSURE_ASCII false # Preserve Unicode by default
+```
+
## Complete Examples
1. Basic Extraction: