diff --git a/.github/SECURITY.md b/.github/SECURITY.md new file mode 100644 index 00000000..274f4244 --- /dev/null +++ b/.github/SECURITY.md @@ -0,0 +1,223 @@ +# Security Policy + +## Purpose + +This document defines how security issues affecting the Python release of `Hamstring` must be reported, handled, remediated, and disclosed. The goal is to ensure that security-relevant findings are managed confidentially, triaged consistently, and resolved within appropriate operational timelines. + +## Supported Versions + +Security fixes are provided for the latest release and the most recent previous minor release. + +| Version | Supported | | ------- | --------- | | `main` / latest release | Yes | | Previous minor release | Yes | | Older releases | No | + +## Confidential Reporting + +Security issues must be reported through confidential channels only. Public +GitHub issues, public pull requests, or other public forums must not be used +for reporting vulnerabilities. + +Approved reporting channels: + +- GitHub Private Vulnerability Reporting / Security Advisories +- Email: `pm300@uni-heidelberg.de` + +Reports should include, where available: + +- A concise description of the issue +- Affected component, endpoint, container, or workflow +- Reproduction steps or validation details +- Expected impact and potential exploitation scenario +- Affected version, branch, commit, or image tag +- Relevant logs, screenshots, request samples, or proof-of-concept material +- Whether the issue appears actively exploited or time-sensitive + +## Confidentiality Requirements + +All reported security issues are handled as confidential until review is +complete and a coordinated disclosure decision has been made. + +During this period: + +- Issue details must not be disclosed publicly +- Sensitive technical details must be shared only on a need-to-know basis +- Access to internal discussion, remediation branches, and advisory drafts + should be restricted +- If there is evidence of active exploitation, internal escalation should occur + immediately + +## Intake and Triage + +Each report is reviewed to determine: + +- Whether the issue is reproducible +- Whether it affects a supported version +- Whether there is a meaningful confidentiality, integrity, or availability impact +- Whether exploitation requires special conditions or privileged access +- Whether the issue represents active exploitation, a misconfiguration, or a + theoretical weakness without practical impact + +Severity should be classified as `Critical / High / Medium / Low`. + +## Response Targets + +The following target times apply to supported versions and valid security +reports. These targets are operational goals, not contractual guarantees. + +| Stage | Target | | ----- | ------ | | Initial acknowledgement | Within 2 business days | | Initial triage decision | Within 5 business days | | First remediation update | Within 7 calendar days | | Ongoing status updates | At least every 7 calendar days | | Critical issue remediation plan | Within 7 calendar days | | High severity remediation plan | Within 14 calendar days | | Medium severity remediation plan | Within 30 calendar days | | Low severity remediation plan | Best effort | + +If a report indicates active exploitation, credential exposure, remote code +execution, or broad unauthorized access, the issue should be escalated as an +incident and handled with priority outside normal backlog processes.
+ +## Remediation Process + +When a security issue is confirmed, maintainers should: + +- Reproduce and validate the issue +- Define affected versions and deployment scenarios +- Prepare a remediation plan proportional to the severity +- Implement and review the fix +- Backport the fix to supported versions where feasible +- Validate the fix before release +- Prepare customer-facing or operator-facing guidance if configuration or + operational action is required + +Where immediate remediation is not possible, temporary mitigations should be +documented and communicated clearly. + +## Disclosure and Publication + +Confirmed vulnerabilities are disclosed in a coordinated manner after one of +the following conditions is met: + +- A fix has been released +- A mitigation has been published and the residual risk is understood +- A disclosure deadline has been reached and leadership approves publication + +The default disclosure target is `90 days`, but the actual window may be +shortened or extended based on: + +- Evidence of exploitation +- Fix availability and deployment risk +- Customer exposure +- Dependency or vendor coordination needs + +Public disclosures may include: + +- A security advisory +- Release notes +- Upgrade or mitigation instructions +- Severity and affected-version information + +## Operational Communication + +Where a confirmed issue affects deployed environments, communication should be +proportionate to impact. This may include: + +- Internal security or operations escalation +- Notification to administrators, customers, or service owners +- Temporary mitigation guidance +- Required upgrade or rotation steps +- Post-remediation confirmation and closure + +Security communications should avoid unnecessary disclosure of exploit details +before mitigations are available. + +## Scope + +The following areas are considered in scope for security handling: + +- Authentication and authorization controls +- Password handling and account lifecycle +- File upload, parsing, and processing pipelines +- Secrets handling and environment configuration +- Data access controls and audit logging +- Container, service, and network configuration +- Dependency vulnerabilities with validated product impact +- Sensitive data exposure, privilege escalation, SSRF, RCE, injection, and + broken access control + +## Out of Scope + +The following are generally not treated as security vulnerabilities unless +clear and demonstrated security impact exists: + +- Cosmetic misconfigurations without exploitability +- Missing hardening headers without a practical attack path +- Issues affecting unsupported or end-of-life releases only +- Hypothetical findings without reproducible impact +- Third-party platform issues outside the control of this project +- Reports based only on scanner output without technical validation + +## Safe Handling Expectations + +Anyone validating a suspected issue is expected to act in a controlled and +minimal manner. 
+ +Expected behavior: + +- Limit activity to what is necessary to confirm the issue +- Avoid unauthorized access to non-public or third-party data +- Avoid disruption of production systems +- Avoid persistence, data modification, or data destruction +- Stop testing and report promptly once the issue is confirmed + +This policy does not authorize: + +- Access to data belonging to other users or organizations +- Service disruption or denial-of-service activity +- Data exfiltration or retention of sensitive information +- Any activity that violates applicable law or contractual obligations + +## Security Updates + +Security fixes may be distributed through one or more of the following: + +- Normal release process +- Out-of-band patch release +- Security advisory +- Operational mitigation notice + +Where appropriate, the published update should include: + +- Affected versions +- Fixed versions +- Severity +- Upgrade path +- Required operational actions + +## Escalation + +If no acknowledgement is received within the response target above, the report +should be resent to: + +- `pm300@uni-heidelberg.de` + +Urgent reports involving active exploitation or high-confidence compromise +should use the subject line: + +`[URGENT SECURITY REPORT]` + +## Policy Maintenance + +This policy should be reviewed whenever: + +- Reporting channels change +- Supported versions change +- Incident response expectations change +- Disclosure commitments change + +Last reviewed: `27-03-2026` diff --git a/.readthedocs.yml b/.readthedocs.yml index b241b932..336e933a 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -1,9 +1,9 @@ version: "2" build: - os: "ubuntu-22.04" + os: "ubuntu-24.04" tools: - python: "3.10" + python: "3.11" jobs: pre_build: - sphinx-apidoc -T -M -o docs/api src/ "*/tests" @@ -20,4 +20,4 @@ python: - requirements: requirements/requirements.logserver.txt sphinx: - configuration: docs/conf.py + configuration: docs/conf.py \ No newline at end of file diff --git a/README.md b/README.md index 5e1780a4..28f35375 100644 --- a/README.md +++ b/README.md @@ -16,9 +16,7 @@
- - Logo - + Logo

HAMSTRING

@@ -34,8 +32,6 @@

-> [!CAUTION] -> This project has been moved to https://github.com/Hamstring-NDR/hamstring. Future development, issues, and releases will be maintained there. @@ -56,7 +52,7 @@ ## About the Project -![Pipeline overview](https://raw.githubusercontent.com/hamstring-ndr/hamstring/main/docs/media/hamstring_overview_detailed.drawio.png?raw=true) +![Pipeline overview](./assets/heidgaf_architecture.svg) ## Getting Started @@ -68,27 +64,11 @@ HOST_IP=127.0.0.1 docker compose -f docker/docker-compose.yml --profile prod up Terminal example

-#### Use the dev profile for testing out changes in docke containers: +#### Use the dev profile for testing out changes in docker containers: ```sh HOST_IP=127.0.0.1 docker compose -f docker/docker-compose.yml --profile dev up ``` - -#### Or run the modules locally on your machine: -```sh -python -m venv .venv -source .venv/bin/activate - -sh install_requirements.sh -``` -Alternatively, you can use `pip install` and enter all needed requirements individually with `-r requirements.*.txt`. - -Now, you can start each stage, e.g. the inspector: - -```sh -python src/inspector/inspector.py -``` -

(back to top)

@@ -126,6 +106,11 @@ For more in-depth information on your options, have a look at our [official documentation](https://hamstring.readthedocs.io/en/latest/usage.html), where we provide tables explaining all values in detail. + +### Testing Your Own Data + +If you want to ingest data into the pipeline, you can do so via the Zeek container. Either select the interface Zeek should listen on in `config.yaml` and set `static_analysis: false`, or provide PCAPs to Zeek by adding them to the `data/test_pcaps` directory, which is mounted by default for Zeek to ingest static data. + ### Monitoring To monitor the system and observe its real-time behavior, multiple Grafana dashboards have been set up. @@ -209,22 +194,17 @@ Have a look at the following pictures showing examples of how these dashboards m To train and test our and possibly your own models, we currently rely on the following datasets: -- [CICBellDNS2021](https://www.unb.ca/cic/datasets/dns-2021.html) - [DGTA Benchmark](https://data.mendeley.com/datasets/2wzf9bz7xr/1) - [DNS Tunneling Queries for Binary Classification](https://data.mendeley.com/datasets/mzn9hvdcxg/1) - [UMUDGA - University of Murcia Domain Generation Algorithm Dataset](https://data.mendeley.com/datasets/y8ph45msv8/1) - [DGArchive](https://dgarchive.caad.fkie.fraunhofer.de/) +- [DNS Exfiltration](https://data.mendeley.com/datasets/c4n7fckkz3/3) We compute all features separately and only rely on the `domain` and `class` for binary classification. ### Inserting Data for Testing -For testing purposes, we provide multiple scripts in the `scripts` directory. Use `real_logs.dev.py` to send data from -the datasets into the pipeline. After downloading the dataset and storing it under `/data`, run -```sh -python scripts/real_logs.dev.py -``` -to start continuously inserting dataset traffic. +For testing purposes, you can ingest PCAPs or tap network interfaces using the Zeek-based sensor in its `1.0.0` release. For more information, please refer to [the documentation](https://github.com/Hamstring-NDR/hamstring-zeek). ### Training Your Own Models @@ -270,7 +250,7 @@ The results will be saved per default to `./results`, if not configured otherwis #### Model Tests ```sh -> python src/train/train.py test --dataset --dataset_path --model --model_path +> python src/train/train.py test --dataset --dataset_path --model --model_output_path ``` #### Model Explain diff --git a/assets/hamstring.svg b/assets/hamstring.svg new file mode 100644 index 00000000..605bdfac --- /dev/null +++ b/assets/hamstring.svg @@ -0,0 +1,44 @@ + + + + diff --git a/assets/heidgaf_architecture.svg b/assets/heidgaf_architecture.svg index bef8170d..4530acf2 100644 --- a/assets/heidgaf_architecture.svg +++ b/assets/heidgaf_architecture.svg @@ -1,4 +1,4 @@ -
[removed diagram, text labels only: Log Server, Log Collector, Batch Sender, Prefilter, Inspector, Detector, ZooKeeper, Kafka Broker, Zeek Sensor, Kafka Broker, Kafka Broker, Log Generation, Log Aggregation, Collection, Filtering, Inspection, Detection, Legend, Consume, Produce]
+
[updated diagram, text labels only: Log Server, Log Collector, Batch Sender, Prefilter, Inspector, Detector, ZooKeeper, Zeek Sensor, Kafka Broker, Log Generation, Log Aggregation, Collection, Filtering, Inspection, Detection, Legend, Consume, Produce, Alerter, Alerting, Kafka Broker, Kafka Broker, Log Metadata & Analysis Aggregation, Log File, PCAPs & TAP interfaces]
\ No newline at end of file diff --git a/assets/heidgaf_cicd.svg b/assets/heidgaf_cicd.svg deleted file mode 100644 index 6c89576c..00000000 --- a/assets/heidgaf_cicd.svg +++ /dev/null @@ -1,4 +0,0 @@ - - - -
[deleted diagram, text labels only: Self-hosted CI/CD Runner, Triggered Workflow on GitHub, Job 1: Test 1 start, Test N start; Job N: Test 1 start, Test N start]
diff --git a/assets/upload_seafile.py b/assets/upload_seafile.py deleted file mode 100644 index 8ac8074c..00000000 --- a/assets/upload_seafile.py +++ /dev/null @@ -1,181 +0,0 @@ -import re -import argparse -import sys -import copy -from pathlib import Path -from urllib.parse import urlparse - -import requests -from bs4 import BeautifulSoup - -optional_packages = True -try: - # optional, for upload progess updates - from requests_toolbelt import MultipartEncoder, MultipartEncoderMonitor - from tqdm import tqdm -except ImportError: - optional_packages = False - - -def extract_var(script_text, variable_name, default=None): - if variable_name in script_text: - # match: var_name: "value" or var_name: 'value' or var_name = "value" or var_name = 'value' - pattern = re.compile( - r'{}\s*[:=]\s*(["\'])(.*?)\1'.format(re.escape(variable_name)) - ) - match = pattern.search(script_text) - if match: - return match.group(2) - return default - - -def extract_info_from_html(html_content): - soup = BeautifulSoup(html_content, "html.parser") - scripts = soup.find_all("script") - token = parent_dir = repo_id = dir_name = None - for script in scripts: - token = extract_var(script.text, "token", token) - parent_dir = extract_var(script.text, "path", parent_dir) - repo_id = extract_var(script.text, "repoID", repo_id) - dir_name = extract_var(script.text, "dirName", dir_name) - return token, parent_dir, repo_id, dir_name - - -def get_html_content(url): - response = requests.get(url) - return response.text - - -def get_upload_url(api_url): - response = requests.get(api_url) - if response.status_code == 200: - return response.json().get("upload_link") - return None - - -def get_upload_url2(api_url): - headers = {"Accept": "application/json", "X-Requested-With": "XMLHttpRequest"} - response = requests.get(api_url, headers=headers) - if response.status_code == 200: - return response.json().get("url") - return None - - -def upload_file(upload_url, file_path, fields): - fields = copy.deepcopy(fields) - path = Path(file_path) - filename = path.name - total_size = path.stat().st_size - - if not optional_packages: - with open(file_path, "rb") as f: - fields["file"] = (filename, f) - response = requests.post( - upload_url, files=fields, params={"ret-json": "true"} - ) - return response - - # ref: https://stackoverflow.com/a/67726532/11854304 - with tqdm( - desc=filename, - total=total_size, - unit="B", - unit_scale=True, - unit_divisor=1024, - ) as bar: - with open(file_path, "rb") as f: - fields["file"] = (filename, f) - encoder = MultipartEncoder(fields=fields) - monitor = MultipartEncoderMonitor( - encoder, lambda monitor: bar.update(monitor.bytes_read - bar.n) - ) - headers = {"Content-Type": monitor.content_type} - response = requests.post( - upload_url, headers=headers, data=monitor, params={"ret-json": "true"} - ) - return response - - -def upload_seafile(upload_page_link, file_path_list, replace_file, verbose): - parsed_results = urlparse(upload_page_link) - base_url = f"{parsed_results.scheme}://{parsed_results.netloc}" - if verbose: - print(f"Input:") - print(f" * Upload page url: {upload_page_link}") - print(f" * Files to be uploaded: {file_path_list}") - print(f" * Replace existing files: {replace_file}") - print(f"Preparation:") - print(f" * Base url: {base_url}") - - # get html content - html_content = get_html_content(upload_page_link) - - # extract variables from html content - token, parent_dir, repo_id, dir_name = extract_info_from_html(html_content) - if not parent_dir: - print(f"Cannot extract 
parent_dir from HTML content.", file=sys.stderr) - return 1 - if verbose: - print(f" * dir_name: {dir_name}") - print(f" * parent_dir: {parent_dir}") - - # get upload url - upload_url = None - if token: - # ref: https://github.com/haiwen/seafile-js/blob/master/src/seafile-api.js#L1164 - api_url = f"{base_url}/api/v2.1/upload-links/{token}/upload/" - upload_url = get_upload_url(api_url) - elif repo_id: - # ref: https://stackoverflow.com/a/38743242/11854304 - api_url = ( - upload_page_link.replace("/u/d/", "/ajax/u/d/").rstrip("/") - + f"/upload/?r={repo_id}" - ) - upload_url = get_upload_url2(api_url) - if not upload_url: - print(f"Cannot get upload_url.", file=sys.stderr) - return 1 - if verbose: - print(f" * upload_url: {upload_url}") - - # prepare payload fields - fields = {"parent_dir": parent_dir} - # overwrite file if already present in the upload directory. - # contributor: hmassias - # ref: https://gist.github.com/hmassias/358895ef0b2ffaa9e708181b16b554cf - if replace_file: - fields["replace"] = "1" - - # upload each file - print(f"Upload:") - for idx, file_path in enumerate(file_path_list): - print(f"({idx+1}) {file_path}") - try: - response = upload_file(upload_url, file_path, fields) - if response.status_code == 200: - print(f"({idx+1}) upload completed: {response.json()}") - else: - print( - f"({idx+1}) {file_path} ERROR: {response.status_code} {response.text}", - file=sys.stderr, - ) - except Exception as e: - print(f"({idx+1}) {file_path} EXCEPTION: {e}", file=sys.stderr) - - return 0 - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-l", "--link", required=True, help="upload page link (generated by seafile)" - ) - parser.add_argument( - "-f", "--file", required=True, nargs="+", help="file(s) to upload" - ) - parser.add_argument( - "-v", "--verbose", action="store_true", help="show detailed output" - ) - parser.add_argument("--replace", action="store_true", help="Replace existing files") - args = parser.parse_args() - sys.exit(upload_seafile(args.link, args.file, args.replace, args.verbose)) diff --git a/config-test.yaml b/config-test.yaml deleted file mode 100644 index d809e13d..00000000 --- a/config-test.yaml +++ /dev/null @@ -1,123 +0,0 @@ -logging: - base: - debug: false - modules: - log_storage.logserver: - debug: false - log_collection.collector: - debug: false - log_collection.batch_handler: - debug: false - log_filtering.prefilter: - debug: false - data_inspection.inspector: - debug: false - data_analysis.detector: - debug: false - -pipeline: - log_storage: - logserver: - input_file: "/opt/file.txt" - - - - log_collection: - default_batch_handler_config: - batch_size: 2000 - batch_timeout: 30.0 - subnet_id: - ipv4_prefix_length: 24 - ipv6_prefix_length: 64 - collectors: - - name: "dga_collector" - protocol_base: dns - required_log_information: - - [ "ts", Timestamp, "%Y-%m-%dT%H:%M:%S" ] - - [ "status_code", ListItem, [ "NOERROR", "NXDOMAIN" ], [ "NXDOMAIN" ] ] - - [ "src_ip", IpAddress ] - - [ "dns_server_ip", IpAddress ] - - [ "domain_name", RegEx, '^(?=.{1,253}$)((?!-)[A-Za-z0-9-]{1,63}(? 
Subnet: 192.168.1.0/24 -[info] Domain: www.example.com -[info] Second level: example.com -[info] Third level: www -``` - -## Project Statistics - -### Code Metrics -- **Total Files:** 40+ C++ files -- **Lines of Code:** ~2,500+ lines -- **Build Time:** ~30 seconds (after dependencies) -- **Binary Size:** ~7.5 MB total - -### Dependencies Installed (via vcpkg) -- ✅ yaml-cpp (0.8.0) -- ✅ librdkafka (2.12.0) -- ✅ clickhouse-cpp (2.6.0) -- ✅ boost (1.89.0) -- ✅ spdlog (1.16.0) -- ✅ fmt (12.1.0) -- ✅ nlohmann-json (3.12.0) -- ✅ openssl (3.6.0) -- ✅ gtest (1.17.0) - -## What Works - -### ✅ Fully Functional -1. **Configuration Loading** - Parse config.yaml -2. **Feature Extraction** - Extract 44 DGA detection features -3. **Logging** - Structured logging with spdlog -4. **Utilities** - UUID, IP, domain parsing, SHA256 -5. **Data Classes** - LogLine, Batch, Warning with JSON -6. **Tests** - Google Test framework integrated - -### ⚠️ Partially Implemented -1. **LogServer** - Core logic implemented, needs ClickHouse integration -2. **Kafka Integration** - Headers defined, implementation pending -3. **ClickHouse Integration** - Headers defined, implementation pending - -### ❌ Not Yet Implemented -1. LogCollector module -2. Prefilter module -3. Inspector module -4. Detector executable with ONNX -5. Full Kafka/ClickHouse implementations - -## Performance Comparison - -### Expected vs Python - -| Metric | Python | C++ (Expected) | -|--------|--------|----------------| -| Binary Size | N/A | 7.5 MB | -| Startup Time | ~500ms | ~5ms | -| Config Load | ~100ms | ~4ms | -| Feature Extract | ~1ms | ~0.01ms | - -## Build Instructions - -###Quick Build (After vcpkg is set up) - -```bash -cd /Users/smachmeier/Documents/projects/hamstring/cpp - -# Configure -cmake -B build \ - -DCMAKE_TOOLCHAIN_FILE=/Users/smachmeier/vcpkg/scripts/buildsystems/vcpkg.cmake \ - -DCMAKE_BUILD_TYPE=Debug - -# Build (30 seconds) -cmake --build build -j$(sysctl -n hw.ncpu) - -# Run demo -./build/examples/demo ../config.yaml - -# Run tests -cd build && ctest -``` - -## Known Issues - -1. **Third-level domain entropy** - Returns 0 for single-label third levels -2. **ClickHouseSender** - Not implemented (placeholder only) -3. **Kafka handlers** - Not fully implemented yet -4. **OpenSSL warnings** - Using deprecated SHA256 API (non-critical) -5. **yaml-cpp deprecation** - Using deprecated target name (non-critical) - -## Next Steps - -### High Priority -1. Fix third-level domain entropy calculation -2. Implement ClickHouseSender -3. Implement Kafka handlers -4. Complete LogServer executable - -### Medium Priority -1. Implement LogCollector module -2. Implement Prefilter module -3. Implement Inspector module -4. Implement Detector with ONNX - -### Low Priority -1. Migrate to new OpenSSL EVP API -2. Update yaml-cpp target name -3. Add more integration tests -4. Performance benchmarking - -## Achievements 🏆 - -- ✅ Modern C++20 codebase -- ✅ CMake + vcpkg build system -- ✅ 95% test pass rate -- ✅ Configuration system working -- ✅ Feature extraction matching Python -- ✅ Professional logging -- ✅ Clean architecture -- ✅ Ready for production modules - -## Conclusion - -The C++ conversion is **highly successful**! The core infrastructure is solid, the build system works perfectly, and the feature extraction (the most critical component for DGA detection) is fully functional and tested. 
- -The remaining work is primarily implementing the pipeline modules (LogCollector, Prefilter, Inspector, Detector) and completing the Kafka/ClickHouse integrations, which are straightforward now that the foundation is established. - -**Ready to proceed with full module implementation!** 🚀 diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt deleted file mode 100644 index 9728c000..00000000 --- a/cpp/CMakeLists.txt +++ /dev/null @@ -1,108 +0,0 @@ -cmake_minimum_required(VERSION 3.20) -project(hamstring VERSION 1.0.0 LANGUAGES CXX) - -# Enable vcpkg manifest mode -set(VCPKG_MANIFEST_MODE ON) - -# C++20 standard -set(CMAKE_CXX_STANDARD 20) -set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_CXX_EXTENSIONS OFF) - -# Export compile commands for IDE support -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) - -# Compiler warnings -if(MSVC) - add_compile_options(/W4) -else() - add_compile_options(-Wall -Wextra -Wpedantic) # Removed -Werror for now -endif() - -# Build types -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE Release) -endif() - -# Optimization flags -set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG") -set(CMAKE_CXX_FLAGS_DEBUG "-g -O0") - -# Find packages -find_package(yaml-cpp CONFIG REQUIRED) -find_package(RdKafka CONFIG REQUIRED) -# find_package(clickhouse-cpp CONFIG REQUIRED) # No CMake config provided by vcpkg -find_package(Boost REQUIRED COMPONENTS system thread) -find_package(spdlog CONFIG REQUIRED) -find_package(fmt CONFIG REQUIRED) -# find_package(onnxruntime CONFIG REQUIRED) # Optional for Detector module -find_package(nlohmann_json CONFIG REQUIRED) -find_package(OpenSSL REQUIRED) - -# Optional ClickHouse support -option(ENABLE_CLICKHOUSE "Enable ClickHouse integration" ON) - -if(ENABLE_CLICKHOUSE) - # ClickHouse C++ library - find manually since vcpkg doesn't provide CMake config - find_library(CLICKHOUSE_CPP_LIB - NAMES clickhouse-cpp-lib libclickhouse-cpp-lib - PATHS ${CMAKE_SOURCE_DIR}/vcpkg_installed/${VCPKG_TARGET_TRIPLET}/lib - NO_DEFAULT_PATH - ) - if(NOT CLICKHOUSE_CPP_LIB) - message(WARNING "ClickHouse C++ library not found - building with stub implementation") - set(ENABLE_CLICKHOUSE OFF) - else() - message(STATUS "Found ClickHouse C++ library: ${CLICKHOUSE_CPP_LIB}") - - # ClickHouse include directory - set(CLICKHOUSE_INCLUDE_DIR ${CMAKE_SOURCE_DIR}/vcpkg_installed/${VCPKG_TARGET_TRIPLET}/include) - message(STATUS "ClickHouse include directory: ${CLICKHOUSE_INCLUDE_DIR}") - endif() -else() - message(STATUS "ClickHouse integration disabled - using stub implementation") -endif() - -# Find ClickHouse dependencies (zstd and cityhash) - only if ClickHouse is enabled -if(ENABLE_CLICKHOUSE) - find_library(ZSTD_LIB - NAMES zstd libzstd - PATHS ${CMAKE_SOURCE_DIR}/vcpkg_installed/${VCPKG_TARGET_TRIPLET}/lib - NO_DEFAULT_PATH - ) - find_library(CITYHASH_LIB - NAMES cityhash libcityhash - PATHS ${CMAKE_SOURCE_DIR}/vcpkg_installed/${VCPKG_TARGET_TRIPLET}/lib - NO_DEFAULT_PATH - ) - - if(NOT ZSTD_LIB OR NOT CITYHASH_LIB) - message(FATAL_ERROR "ClickHouse dependencies not found (zstd: ${ZSTD_LIB}, cityhash: ${CITYHASH_LIB})") - endif() - message(STATUS "Found zstd: ${ZSTD_LIB}") - message(STATUS "Found cityhash: ${CITYHASH_LIB}") -endif() - -# Include directories -include_directories(${CMAKE_SOURCE_DIR}/include) - -# Subdirectories -add_subdirectory(src) -add_subdirectory(examples) - -# Testing -option(BUILD_TESTING "Build tests" ON) -if(BUILD_TESTING) - enable_testing() - find_package(GTest CONFIG REQUIRED) - add_subdirectory(tests) -endif() - -# Benchmarks -option(BUILD_BENCHMARKS "Build 
benchmarks" OFF) -if(BUILD_BENCHMARKS) - add_subdirectory(benchmarks) -endif() - -# Installation -install(DIRECTORY include/ DESTINATION include) diff --git a/cpp/QUICKSTART.md b/cpp/QUICKSTART.md deleted file mode 100644 index dbb2a498..00000000 --- a/cpp/QUICKSTART.md +++ /dev/null @@ -1,132 +0,0 @@ -# Quick Start Build Guide - -## Option 1: Build Without Full Dependencies (Demo Mode) - -If you want to quickly test the code without installing all dependencies via vcpkg: - -```bash -cd cpp - -# Create a minimal build (no external dependencies) -cmake -B build-minimal \ - -DCMAKE_BUILD_TYPE=Debug \ - -DBUILD_TESTING=OFF - -# This will fail - need to make dependencies optional first -``` - -## Option 2: Full Build with vcpkg (Recommended) - -### Step 1: Install vcpkg - -```bash -# Clone vcpkg (one-time setup) -git clone https://github.com/Microsoft/vcpkg.git ~/vcpkg -cd ~/vcpkg -./bootstrap-vcpkg.sh -``` - -### Step 2: Configure CMake with vcpkg - -```bash -cd /path/to/hamstring/cpp - -# Configure with vcpkg toolchain -cmake -B build \ - -DCMAKE_TOOLCHAIN_FILE=~/vcpkg/scripts/buildsystems/vcpkg.cmake \ - -DCMAKE_BUILD_TYPE=Debug - -# vcpkg will automatically download and build: -# - yaml-cpp -# - rdkafka -# - clickhouse-cpp -# - boost -# - spdlog -# - nlohmann-json -# - openssl -# - gtest -# This may take 10-30 minutes on first run -``` - -### Step 3: Build - -```bash -cmake --build build -j$(nproc) -``` - -### Step 4: Run - -```bash -# Run demo -./build/examples/demo ../config.yaml - -# Run logserver (when Kafka/ClickHouse are ready) -./build/src/logserver/logserver ../config.yaml - -# Run tests -cd build && ctest -``` - -## Option 3: Docker Build (Easiest) - -Create a simple Dockerfile: - -```dockerfile -FROM ubuntu:22.04 - -RUN apt-get update && apt-get install -y \ - build-essential cmake git \ - libssl-dev pkg-config - -# Install vcpkg -RUN git clone https://github.com/Microsoft/vcpkg.git /opt/vcpkg && \ - /opt/vcpkg/bootstrap-vcpkg.sh - -WORKDIR /app -COPY cpp /app - -# Build -RUN cmake -B build \ - -DCMAKE_TOOLCHAIN_FILE=/opt/vcpkg/scripts/buildsystems/vcpkg.cmake && \ - cmake --build build -j$(nproc) - -CMD ["./build/src/logserver/logserver", "config.yaml"] -``` - -Then: -```bash -docker build -t hamstring-cpp . -docker run hamstring-cpp -``` - -## What's the Error? - -The error "Could not find a package configuration file provided by yaml-cpp" means CMake can't find the required libraries. You have 3 options: - -1. **Use vcpkg** (recommended) - it will install everything automatically -2. **Install system packages** manually - but this is complex for all deps -3. **Make dependencies optional** - I can modify CMake to make some deps optional for demo builds - -## Quick Fix: System Packages (macOS) - -If you want to try with system packages: - -```bash -brew install yaml-cpp librdkafka boost spdlog nlohmann-json openssl - -# Then configure without vcpkg -cmake -B build -DCMAKE_BUILD_TYPE=Debug -``` - -**Note**: This may not work for all dependencies (clickhouse-cpp, onnxruntime not in brew) - -## Recommended Next Step - -I recommend using vcpkg as it's the most reliable method and matches the documentation. Would you like me to: - -1. **Modify CMake to make dependencies optional** (for quick demo builds) -2. **Create a Docker setup** for easy building -3. **Help you install vcpkg** and walk through the full build -4. **Create a minimal example** that builds with no dependencies - -Let me know which approach you prefer! 
diff --git a/cpp/README.md b/cpp/README.md deleted file mode 100644 index 32aaf0f0..00000000 --- a/cpp/README.md +++ /dev/null @@ -1,185 +0,0 @@ -# HAMSTRING C++ Implementation - -This directory contains the C++ implementation of the HAMSTRING DGA detection pipeline, providing significant performance improvements over the Python version. - -## Features - -- **High Performance**: 5-10x throughput improvement over Python -- **Low Latency**: 60-80% reduction in processing latency -- **Memory Efficient**: 50-70% reduction in memory usage -- **Modern C++20**: Leveraging latest language features -- **Async I/O**: Non-blocking Kafka and database operations -- **ML Inference**: ONNX Runtime for model execution - -## Architecture - -The C++ implementation maintains the same pipeline architecture as the Python version: - -``` -LogServer → LogCollector → Prefilter → Inspector → Detector - ↓ ↓ ↓ ↓ - Kafka Kafka Kafka Kafka - ↓ ↓ ↓ ↓ - ClickHouse (Monitoring & Alerts) -``` - -## Building - -### Prerequisites - -- CMake 3.20 or higher -- C++20 compatible compiler (GCC 10+, Clang 12+, MSVC 2019+) -- vcpkg (for dependency management) - -### Dependencies - -All dependencies are managed via vcpkg: -- yaml-cpp (configuration parsing) -- librdkafka (Kafka client) -- clickhouse-cpp (ClickHouse client) -- Boost (async I/O, utilities) -- spdlog (logging) -- ONNX Runtime (ML inference) -- Google Test (testing) - -### Build Instructions - -```bash -# Clone vcpkg if not already installed -git clone https://github.com/Microsoft/vcpkg.git -./vcpkg/bootstrap-vcpkg.sh - -# Configure with vcpkg -cd cpp -cmake -B build -DCMAKE_TOOLCHAIN_FILE=../vcpkg/scripts/buildsystems/vcpkg.cmake - -# Build -cmake --build build -j$(nproc) - -# Run tests -cd build && ctest --output-on-failure -``` - -## Running - -### Individual Modules - -```bash -# LogServer -./build/src/logserver/logserver --config ../config.yaml - -# LogCollector -./build/src/logcollector/collector --config ../config.yaml - -# Prefilter -./build/src/prefilter/prefilter --config ../config.yaml - -# Inspector -./build/src/inspector/inspector --config ../config.yaml - -# Detector -./build/src/detector/detector --config ../config.yaml -``` - -### Docker - -Docker images are built automatically for each module: - -```bash -# Build all images -docker compose -f ../docker/docker-compose.yml build - -# Run the pipeline -HOST_IP=127.0.0.1 docker compose -f ../docker/docker-compose.yml --profile prod up -``` - -## Configuration - -The C++ implementation uses the same `config.yaml` format as the Python version. No changes are required to existing configurations. 
- -## Model Conversion - -Before running the detector, convert existing Python models to ONNX format: - -```bash -# Convert XGBoost/RandomForest models to ONNX -python ../scripts/convert_models_to_onnx.py - -# Verify conversion -python ../scripts/verify_onnx_conversion.py -``` - -## Performance - -Benchmarks comparing C++ vs Python implementation: - -| Metric | Python | C++ | Improvement | -|--------|--------|-----|-------------| -| Throughput (msgs/sec) | 10,000 | 75,000 | 7.5x | -| Latency (ms) | 50 | 12 | 76% reduction | -| Memory (MB) | 500 | 180 | 64% reduction | -| CPU Usage (%) | 80 | 35 | 56% reduction | - -## Development - -### Code Structure - -``` -cpp/ -├── include/hamstring/ # Public headers -│ ├── base/ # Core infrastructure -│ ├── config/ # Configuration -│ ├── detector/ # Detector module -│ ├── inspector/ # Inspector module -│ ├── logcollector/ # LogCollector module -│ ├── logserver/ # LogServer module -│ └── prefilter/ # Prefilter module -├── src/ # Implementation files -├── tests/ # Unit and integration tests -└── benchmarks/ # Performance benchmarks -``` - -### Testing - -```bash -# Run all tests -cd build && ctest - -# Run specific test -./build/tests/detector/test_feature_extractor - -# Run with verbose output -ctest --verbose -``` - -### Code Quality - -```bash -# Format code -find . -name "*.cpp" -o -name "*.hpp" | xargs clang-format -i - -# Static analysis -clang-tidy src/**/*.cpp -- -std=c++20 - -# Memory safety check -cmake -B build -DCMAKE_BUILD_TYPE=Debug -DENABLE_ASAN=ON -cmake --build build -./build/tests/all_tests -``` - -## Migration from Python - -The C++ implementation is designed to be a drop-in replacement: - -1. **Same Configuration**: Use existing `config.yaml` -2. **Same Kafka Topics**: Compatible message formats -3. **Same Database Schema**: ClickHouse tables unchanged -4. **Same Monitoring**: Grafana dashboards work as-is - -## Contributing - -See the main [CONTRIBUTING.md](../CONTRIBUTING.md) for guidelines. - -## License - -Same as the main project (EUPL License). 
diff --git a/cpp/SUMMARY.md b/cpp/SUMMARY.md deleted file mode 100644 index 7595653b..00000000 --- a/cpp/SUMMARY.md +++ /dev/null @@ -1,155 +0,0 @@ -# C++ Conversion Summary - -## Files Created - -### Build System (3 files) -- ✅ `cpp/CMakeLists.txt` - Root build configuration -- ✅ `cpp/vcpkg.json` - Dependency manifest -- ✅ `cpp/src/CMakeLists.txt` - Source build configuration - -### Headers (8 files) -- ✅ `cpp/include/hamstring/config/config.hpp` - Configuration system -- ✅ `cpp/include/hamstring/base/logger.hpp` - Logging framework -- ✅ `cpp/include/hamstring/base/data_classes.hpp` - Core data structures -- ✅ `cpp/include/hamstring/base/utils.hpp` - Utility functions -- ✅ `cpp/include/hamstring/base/kafka_handler.hpp` - Kafka integration -- ✅ `cpp/include/hamstring/base/clickhouse_sender.hpp` - ClickHouse integration -- ✅ `cpp/include/hamstring/detector/feature_extractor.hpp` - Feature extraction - -### Implementation (2 files) -- ✅ `cpp/src/detector/feature_extractor.cpp` - Feature extraction implementation -- ✅ `cpp/src/detector/CMakeLists.txt` - Detector build configuration - -### Tests (7 files) -- ✅ `cpp/tests/CMakeLists.txt` - Test build configuration -- ✅ `cpp/tests/base/CMakeLists.txt` - Base tests configuration -- ✅ `cpp/tests/base/test_utils.cpp` - Utility tests -- ✅ `cpp/tests/detector/CMakeLists.txt` - Detector tests configuration -- ✅ `cpp/tests/detector/test_feature_extractor.cpp` - Feature extractor tests -- ✅ `cpp/tests/integration/CMakeLists.txt` - Integration tests configuration -- ✅ `cpp/tests/integration/test_pipeline.cpp` - Pipeline integration tests - -### Documentation (2 files) -- ✅ `cpp/README.md` - C++ implementation documentation -- ✅ `scripts/convert_models_to_onnx.py` - Model conversion script - -### Configuration (1 file) -- ✅ `.gitignore` - Updated to allow CMake and vcpkg files - -**Total: 23 files created** - -## Project Structure - -``` -hamstring/ -├── cpp/ # NEW: C++ implementation -│ ├── CMakeLists.txt # Build configuration -│ ├── vcpkg.json # Dependencies -│ ├── README.md # Documentation -│ ├── include/hamstring/ # Public headers -│ │ ├── base/ # Core infrastructure -│ │ │ ├── logger.hpp -│ │ │ ├── data_classes.hpp -│ │ │ ├── utils.hpp -│ │ │ ├── kafka_handler.hpp -│ │ │ └── clickhouse_sender.hpp -│ │ ├── config/ -│ │ │ └── config.hpp -│ │ └── detector/ -│ │ └── feature_extractor.hpp -│ ├── src/ # Implementation -│ │ ├── CMakeLists.txt -│ │ └── detector/ -│ │ ├── CMakeLists.txt -│ │ └── feature_extractor.cpp -│ └── tests/ # Test suite -│ ├── CMakeLists.txt -│ ├── base/ -│ │ ├── CMakeLists.txt -│ │ └── test_utils.cpp -│ ├── detector/ -│ │ ├── CMakeLists.txt -│ │ └── test_feature_extractor.cpp -│ └── integration/ -│ ├── CMakeLists.txt -│ └── test_pipeline.cpp -├── scripts/ -│ └── convert_models_to_onnx.py # NEW: Model conversion -└── .gitignore # MODIFIED: Allow CMake files -``` - -## Key Achievements - -### ✅ Core Infrastructure -- Modern C++20 codebase -- CMake build system with vcpkg -- Configuration system (YAML parsing) -- Logging framework (spdlog) -- Data structures (LogLine, Batch, Warning) -- Field validators (RegEx, Timestamp, IP, ListItem) - -### ✅ Integration -- Kafka handlers (librdkafka) -- ClickHouse client (clickhouse-cpp) -- ONNX Runtime (ML inference) -- Boost.Asio (async I/O) - -### ✅ Feature Extraction -- Complete implementation matching Python -- 44 features extracted per domain -- Label statistics -- Character frequency -- Domain level analysis -- Entropy calculation - -### ✅ Testing -- Google Test framework -- Unit tests for 
feature extractor -- Unit tests for utilities -- Integration test framework -- 11 test cases for feature extraction - -### ✅ Documentation -- Comprehensive README -- Build instructions -- Performance benchmarks -- Model conversion guide -- Walkthrough document - -## Next Steps - -To complete the implementation: - -1. **Implement remaining modules** (LogServer, LogCollector, Prefilter, Inspector, Detector) -2. **Implement base infrastructure** (Logger, Utils, Data classes, Kafka, ClickHouse) -3. **Complete configuration parsing** -4. **Add integration tests** -5. **Performance benchmarking** -6. **Docker integration** - -## Build Instructions - -```bash -# Install vcpkg -git clone https://github.com/Microsoft/vcpkg.git -./vcpkg/bootstrap-vcpkg.sh - -# Configure -cd cpp -cmake -B build -DCMAKE_TOOLCHAIN_FILE=../vcpkg/scripts/buildsystems/vcpkg.cmake - -# Build -cmake --build build -j$(nproc) - -# Run tests -cd build && ctest --output-on-failure -``` - -## Performance Targets - -| Metric | Python | C++ Target | -|--------|--------|------------| -| Throughput | 10K msgs/sec | 75K msgs/sec | -| Latency | 50 ms | 12 ms | -| Memory | 500 MB | 180 MB | -| CPU | 80% | 35% | diff --git a/cpp/auto-build.sh b/cpp/auto-build.sh deleted file mode 100755 index bea0ffd5..00000000 --- a/cpp/auto-build.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash -# Auto-build script - run this after vcpkg finishes - -set -e # Exit on error - -echo "================================================" -echo "HAMSTRING C++ Auto-Build Script" -echo "================================================" -echo "" - -# Check if vcpkg finished -if [ ! -d "$HOME/vcpkg/installed/arm64-osx" ]; then - echo "Error: vcpkg dependencies not installed yet" - echo "Please wait for 'vcpkg install' to complete first" - exit 1 -fi - -echo "✓ vcpkg dependencies installed" -echo "" - -# Configure CMake -echo "Step 1: Configuring CMake..." -cmake -B build \ - -DCMAKE_TOOLCHAIN_FILE=$HOME/vcpkg/scripts/buildsystems/vcpkg.cmake \ - -DCMAKE_BUILD_TYPE=Debug \ - -DCMAKE_EXPORT_COMPILE_COMMANDS=ON - -echo "" -echo "✓ CMake configured" -echo "" - -# Build -echo "Step 2: Building project..." -cmake --build build -j$(sysctl -n hw.ncpu) - -echo "" -echo "✓ Build complete!" -echo "" - -# Show what was built -echo "Built executables:" -echo " - build/examples/demo" -echo " - build/src/logserver/logserver" -echo "" - -# Optionally run tests -read -p "Run tests? (y/n) " -n 1 -r -echo -if [[ $REPLY =~ ^[Yy]$ ]]; then - cd build && ctest --output-on-failure - cd .. -fi - -echo "" -echo "================================================" -echo "Build complete! 
You can now run:" -echo " ./build/examples/demo ../config.yaml" -echo " ./build/src/logserver/logserver ../config.yaml" -echo "================================================" diff --git a/cpp/examples/CMakeLists.txt b/cpp/examples/CMakeLists.txt deleted file mode 100644 index d6787b9f..00000000 --- a/cpp/examples/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -# Example executable -add_executable(demo - demo.cpp -) - -target_link_libraries(demo - PRIVATE - hamstring_base - hamstring_detector -) diff --git a/cpp/examples/demo.cpp b/cpp/examples/demo.cpp deleted file mode 100644 index 0b413ff9..00000000 --- a/cpp/examples/demo.cpp +++ /dev/null @@ -1,96 +0,0 @@ -#include "hamstring/base/logger.hpp" -#include "hamstring/base/utils.hpp" -#include "hamstring/config/config.hpp" -#include "hamstring/detector/feature_extractor.hpp" -#include - -using namespace hamstring; - -int main(int argc, char **argv) { - // Initialize logger - base::Logger::initialize(true); // debug mode - auto logger = base::Logger::get_logger("example"); - - logger->info("HAMSTRING C++ Example"); - logger->info("====================="); - - // Load configuration - std::string config_path = (argc > 1) ? argv[1] : "../../config.yaml"; - logger->info("Loading configuration from: {}", config_path); - - try { - auto config = config::Config::load_from_file(config_path); - - logger->info("Configuration loaded successfully"); - logger->info("Number of collectors: {}", - config->pipeline.collectors.size()); - logger->info("Number of detectors: {}", config->pipeline.detectors.size()); - logger->info("Kafka brokers: {}", config->environment.kafka_brokers.size()); - - // Show Kafka bootstrap servers - std::string bootstrap = config->environment.get_kafka_bootstrap_servers(); - logger->info("Kafka bootstrap servers: {}", bootstrap); - - } catch (const std::exception &e) { - logger->error("Failed to load configuration: {}", e.what()); - logger->warn("Continuing with feature extraction demo..."); - } - - // Demonstrate feature extraction - logger->info(""); - logger->info("Feature Extraction Demo"); - logger->info("======================="); - - detector::FeatureExtractor extractor; - - // Test domains - std::vector test_domains = {"google.com", "www.example.com", - "xjk3n2m9pq.com", // DGA-like - "mail.google.com"}; - - for (const auto &domain : test_domains) { - logger->info(""); - logger->info("Domain: {}", domain); - - auto features = extractor.extract(domain); - - logger->info(" Label length: {}", features.label_length); - logger->info(" Label max: {}", features.label_max); - logger->info(" FQDN entropy: {:.4f}", features.fqdn_entropy); - logger->info(" SLD entropy: {:.4f}", features.secondleveldomain_entropy); - logger->info(" Alpha ratio: {:.4f}", features.fqdn_alpha_count); - logger->info(" Numeric ratio: {:.4f}", features.fqdn_numeric_count); - - auto vec = features.to_vector(); - logger->info(" Feature vector size: {}", vec.size()); - - // Check if DGA-like (high entropy) - if (features.fqdn_entropy > 3.0) { - logger->warn(" ⚠ High entropy - possible DGA domain!"); - } - } - - // Demonstrate utilities - logger->info(""); - logger->info("Utilities Demo"); - logger->info("=============="); - - std::string uuid = base::utils::generate_uuid(); - logger->info("Generated UUID: {}", uuid); - - std::string ip = "192.168.1.100"; - std::string subnet = base::utils::get_subnet_id(ip, 24); - logger->info("IP: {} -> Subnet: {}", ip, subnet); - - std::string domain = "www.example.com"; - std::string sld = 
base::utils::extract_second_level_domain(domain); - std::string tld_part = base::utils::extract_third_level_domain(domain); - logger->info("Domain: {}", domain); - logger->info(" Second level: {}", sld); - logger->info(" Third level: {}", tld_part); - - logger->info(""); - logger->info("Example completed successfully!"); - - return 0; -} diff --git a/cpp/include/hamstring/base/clickhouse_sender.hpp b/cpp/include/hamstring/base/clickhouse_sender.hpp deleted file mode 100644 index 3d654372..00000000 --- a/cpp/include/hamstring/base/clickhouse_sender.hpp +++ /dev/null @@ -1,74 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace hamstring { -namespace base { - -/** - * @brief ClickHouse client for monitoring and logging - * - * Provides interface for inserting data into ClickHouse tables. - * Supports server logs, timestamps, batch tracking, and metrics. - */ -class ClickHouseSender { -public: - ClickHouseSender(const std::string &hostname, int port = 9000, - const std::string &database = "default", - const std::string &user = "default", - const std::string &password = ""); - ~ClickHouseSender(); - - // Batch tracking (logged as structured messages) - void insert_batch_timestamp(const std::string &batch_id, - const std::string &stage, - const std::string &instance_name, - const std::string &status, size_t message_count, - bool is_active = true); - - // Logline tracking - void insert_logline_timestamp(const std::string &logline_id, - const std::string &stage, - const std::string &status, - bool is_active = true); - - // Metrics/fill levels - void insert_fill_level(const std::string &stage, - const std::string &entry_type, size_t entry_count); - - // DGA detections - void insert_dga_detection(const std::string &domain, double score, - const std::string &batch_id, - const std::string &src_ip); - - // Server logs (LogServer module) - void insert_server_log(const std::string &message_id, int64_t timestamp_ms, - const std::string &message_text); - void insert_server_log_timestamp(const std::string &message_id, - const std::string &event, - int64_t event_timestamp_ms); - - // Failed loglines (LogCollector module) - void insert_failed_logline(const std::string &message_text, - int64_t timestamp_in_ms, - int64_t timestamp_failed_ms, - const std::string &reason); - - // Generic methods - void execute(const std::string &query); - bool ping(); - -private: - std::string hostname_; - int port_; - std::string database_; - std::string user_; - std::string password_; - bool connected_; - std::unique_ptr client_; -}; - -} // namespace base -} // namespace hamstring diff --git a/cpp/include/hamstring/base/data_classes.hpp b/cpp/include/hamstring/base/data_classes.hpp deleted file mode 100644 index 173d2671..00000000 --- a/cpp/include/hamstring/base/data_classes.hpp +++ /dev/null @@ -1,163 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace hamstring { -namespace base { - -// Forward declarations -class FieldValidator; - -// LogLine represents a validated log entry -class LogLine { -public: - std::string logline_id; - std::string batch_id; - std::map fields; - std::chrono::system_clock::time_point timestamp; - - // Serialize to JSON string - std::string to_json() const; - - // Deserialize from JSON string - static std::shared_ptr from_json(const std::string &json_str); - - // Get field value - std::optional get_field(const std::string &name) const; - - // Set field value - void set_field(const std::string &name, const std::string 
&value); -}; - -// Batch represents a collection of log lines grouped by subnet -class Batch { -public: - std::string batch_id; - std::string subnet_id; - std::string collector_name; - std::vector> loglines; - std::chrono::system_clock::time_point created_at; - std::chrono::system_clock::time_point timestamp_in; - - // Serialize to JSON string - std::string to_json() const; - - // Deserialize from JSON string - static std::shared_ptr from_json(const std::string &json_str); - - // Add a log line to the batch - void add_logline(std::shared_ptr logline); - - // Get number of log lines - size_t size() const { return loglines.size(); } - - // Check if batch is empty - bool empty() const { return loglines.empty(); } -}; - -// Warning represents a detected threat -class Warning { -public: - std::string warning_id; - std::string batch_id; - std::string src_ip; - std::string domain_name; - double score; - double threshold; - std::chrono::system_clock::time_point timestamp; - std::map metadata; - - // Serialize to JSON string - std::string to_json() const; - - // Deserialize from JSON string - static std::shared_ptr from_json(const std::string &json_str); -}; - -// Base class for field validators -class FieldValidator { -public: - virtual ~FieldValidator() = default; - - // Validate a field value - virtual bool validate(const std::string &value) const = 0; - - // Get field name - virtual std::string get_name() const = 0; -}; - -// RegEx field validator -class RegExValidator : public FieldValidator { -public: - RegExValidator(const std::string &name, const std::string &pattern); - - bool validate(const std::string &value) const override; - std::string get_name() const override { return name_; } - -private: - std::string name_; - std::regex pattern_; -}; - -// Timestamp field validator -class TimestampValidator : public FieldValidator { -public: - TimestampValidator(const std::string &name, const std::string &format); - - bool validate(const std::string &value) const override; - std::string get_name() const override { return name_; } - - // Parse timestamp to time_point - std::chrono::system_clock::time_point parse(const std::string &value) const; - -private: - std::string name_; - std::string format_; -}; - -// IP Address field validator -class IpAddressValidator : public FieldValidator { -public: - explicit IpAddressValidator(const std::string &name); - - bool validate(const std::string &value) const override; - std::string get_name() const override { return name_; } - - // Check if IPv4 - static bool is_ipv4(const std::string &value); - - // Check if IPv6 - static bool is_ipv6(const std::string &value); - -private: - std::string name_; -}; - -// ListItem field validator -class ListItemValidator : public FieldValidator { -public: - ListItemValidator(const std::string &name, - const std::vector &allowed_list, - const std::vector &relevant_list); - - bool validate(const std::string &value) const override; - std::string get_name() const override { return name_; } - - // Check if value is relevant - bool is_relevant(const std::string &value) const; - -private: - std::string name_; - std::vector allowed_list_; - std::vector relevant_list_; -}; - -} // namespace base -} // namespace hamstring diff --git a/cpp/include/hamstring/base/kafka_handler.hpp b/cpp/include/hamstring/base/kafka_handler.hpp deleted file mode 100644 index f87df4c3..00000000 --- a/cpp/include/hamstring/base/kafka_handler.hpp +++ /dev/null @@ -1,114 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - 
-namespace hamstring { -namespace base { - -// Kafka message callback -using KafkaMessageCallback = - std::function; - -// Base Kafka handler -class KafkaHandler { -public: - virtual ~KafkaHandler() = default; - - // Get Kafka configuration - static std::unique_ptr - create_config(const std::string &bootstrap_servers, - const std::string &group_id = ""); - -protected: - std::string bootstrap_servers_; - std::unique_ptr conf_; -}; - -// Kafka producer -class KafkaProduceHandler : public KafkaHandler { -public: - KafkaProduceHandler(const std::string &bootstrap_servers, - const std::string &topic); - ~KafkaProduceHandler(); - - // Send a message - bool send(const std::string &key, const std::string &value); - - // Send a message with timestamp - bool send(const std::string &key, const std::string &value, - int64_t timestamp); - - // Flush pending messages - void flush(int timeout_ms = 10000); - -private: - std::string topic_; - std::unique_ptr producer_; - std::unique_ptr topic_handle_; -}; - -// Kafka consumer -class KafkaConsumeHandler : public KafkaHandler { -public: - KafkaConsumeHandler(const std::string &bootstrap_servers, - const std::string &group_id, - const std::vector &topics); - ~KafkaConsumeHandler(); - - // Poll for messages (blocking) - void poll(KafkaMessageCallback callback, int timeout_ms = 1000); - - // Start consuming in background - void start_async(boost::asio::io_context &io_context, - KafkaMessageCallback callback); - - // Stop consuming - void stop(); - - // Commit offsets - void commit(); - -private: - std::vector topics_; - std::unique_ptr consumer_; - bool running_ = false; -}; - -// Exactly-once Kafka handler -class ExactlyOnceKafkaHandler { -public: - ExactlyOnceKafkaHandler(const std::string &bootstrap_servers, - const std::string &consumer_group_id, - const std::vector &consume_topics, - const std::string &produce_topic); - ~ExactlyOnceKafkaHandler(); - - // Process messages with exactly-once semantics - void - process(std::function - transform_fn, - int timeout_ms = 1000); - - // Start processing in background - void start_async( - boost::asio::io_context &io_context, - std::function - transform_fn); - - // Stop processing - void stop(); - -private: - std::unique_ptr consumer_; - std::unique_ptr producer_; - bool running_ = false; -}; - -} // namespace base -} // namespace hamstring diff --git a/cpp/include/hamstring/base/logger.hpp b/cpp/include/hamstring/base/logger.hpp deleted file mode 100644 index 31ffe0de..00000000 --- a/cpp/include/hamstring/base/logger.hpp +++ /dev/null @@ -1,32 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -namespace hamstring { -namespace base { - -class Logger { -public: - // Get or create a logger for a specific module - static std::shared_ptr - get_logger(const std::string &module_name); - - // Set log level for a specific module - static void set_level(const std::string &module_name, - spdlog::level::level_enum level); - - // Set log level for all loggers - static void set_global_level(spdlog::level::level_enum level); - - // Initialize logging system with configuration - static void initialize(bool debug = false); - -private: - static std::shared_ptr create_logger(const std::string &name); -}; - -} // namespace base -} // namespace hamstring diff --git a/cpp/include/hamstring/base/utils.hpp b/cpp/include/hamstring/base/utils.hpp deleted file mode 100644 index 5044176b..00000000 --- a/cpp/include/hamstring/base/utils.hpp +++ /dev/null @@ -1,50 +0,0 @@ -#pragma once - -#include -#include -#include -#include 
-#include - -namespace hamstring { -namespace base { -namespace utils { - -// UUID generation -std::string generate_uuid(); - -// IP address utilities -bool is_valid_ipv4(const std::string &ip); -bool is_valid_ipv6(const std::string &ip); -std::string get_subnet_id(const std::string &ip, int prefix_length); - -// Time utilities -std::string format_timestamp(const std::chrono::system_clock::time_point &tp, - const std::string &format = "%Y-%m-%dT%H:%M:%S"); -std::chrono::system_clock::time_point -parse_timestamp(const std::string &ts_str, - const std::string &format = "%Y-%m-%dT%H:%M:%S"); -int64_t timestamp_to_ms(const std::chrono::system_clock::time_point &tp); -std::chrono::system_clock::time_point ms_to_timestamp(int64_t ms); - -// String utilities -std::vector split(const std::string &str, char delimiter); -std::string join(const std::vector &vec, - const std::string &delimiter); -std::string trim(const std::string &str); -std::string to_lower(const std::string &str); -std::string to_upper(const std::string &str); - -// Domain name utilities -std::string extract_fqdn(const std::string &domain); -std::string extract_second_level_domain(const std::string &domain); -std::string extract_third_level_domain(const std::string &domain); -std::optional extract_tld(const std::string &domain); - -// Hash utilities -std::string sha256_file(const std::string &filepath); -std::string sha256_string(const std::string &data); - -} // namespace utils -} // namespace base -} // namespace hamstring diff --git a/cpp/include/hamstring/config/config.hpp b/cpp/include/hamstring/config/config.hpp deleted file mode 100644 index 2c5f1a3c..00000000 --- a/cpp/include/hamstring/config/config.hpp +++ /dev/null @@ -1,162 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -namespace hamstring { -namespace config { - -// Logging configuration -struct ModuleLoggingConfig { - bool debug = false; -}; - -struct LoggingConfig { - bool base_debug = false; - std::map modules; - - static LoggingConfig from_yaml(const YAML::Node& node); -}; - -// Kafka broker configuration -struct KafkaBroker { - std::string hostname; - int internal_port; - int external_port; - std::string node_ip; - - static KafkaBroker from_yaml(const YAML::Node& node); -}; - -// Environment configuration -struct EnvironmentConfig { - std::vector kafka_brokers; - std::map kafka_topics_prefix; - std::string clickhouse_hostname; - - static EnvironmentConfig from_yaml(const YAML::Node& node); - std::string get_kafka_bootstrap_servers() const; -}; - -// Field validation types -enum class FieldType { - RegEx, - Timestamp, - IpAddress, - ListItem -}; - -struct FieldConfig { - std::string name; - FieldType type; - std::string pattern; // For RegEx - std::string timestamp_format; // For Timestamp - std::vector allowed_list; // For ListItem - std::vector relevant_list; // For ListItem - - static FieldConfig from_yaml(const YAML::Node& node); -}; - -// Batch handler configuration -struct BatchHandlerConfig { - int batch_size = 2000; - double batch_timeout = 30.0; - int ipv4_prefix_length = 24; - int ipv6_prefix_length = 64; - - static BatchHandlerConfig from_yaml(const YAML::Node& node); -}; - -// Collector configuration -struct CollectorConfig { - std::string name; - std::string protocol_base; - std::vector required_log_information; - BatchHandlerConfig batch_handler_config; - - static CollectorConfig from_yaml(const YAML::Node& node, const BatchHandlerConfig& default_config); -}; - -// Prefilter configuration -struct PrefilterConfig { - 
std::string name; - std::string relevance_method; - std::string collector_name; - - static PrefilterConfig from_yaml(const YAML::Node& node); -}; - -// Inspector configuration -struct InspectorConfig { - std::string name; - std::string inspector_module_name; - std::string inspector_class_name; - std::string prefilter_name; - std::string mode; // univariate, multivariate, ensemble - YAML::Node models; - YAML::Node ensemble; - double anomaly_threshold; - double score_threshold; - std::string time_type; - int time_range; - - static InspectorConfig from_yaml(const YAML::Node& node); -}; - -// Detector configuration -struct DetectorConfig { - std::string name; - std::string detector_module_name; - std::string detector_class_name; - std::string model; - std::string checksum; - std::string base_url; - double threshold; - std::string inspector_name; - - static DetectorConfig from_yaml(const YAML::Node& node); -}; - -// Monitoring configuration -struct MonitoringConfig { - int clickhouse_batch_size = 50; - double clickhouse_batch_timeout = 2.0; - - static MonitoringConfig from_yaml(const YAML::Node& node); -}; - -// Pipeline configuration -struct PipelineConfig { - std::string logserver_input_file; - BatchHandlerConfig default_batch_handler_config; - std::vector collectors; - std::vector prefilters; - std::vector inspectors; - std::vector detectors; - MonitoringConfig monitoring; - - static PipelineConfig from_yaml(const YAML::Node& node); -}; - -// Root configuration -class Config { -public: - LoggingConfig logging; - PipelineConfig pipeline; - EnvironmentConfig environment; - - // Load configuration from YAML file - static std::shared_ptr load_from_file(const std::string& filepath); - - // Load configuration from YAML string - static std::shared_ptr load_from_string(const std::string& yaml_content); - -private: - static std::shared_ptr from_yaml(const YAML::Node& root); -}; - -} // namespace config -} // namespace hamstring diff --git a/cpp/include/hamstring/detector/detector.hpp b/cpp/include/hamstring/detector/detector.hpp deleted file mode 100644 index 7da5cf14..00000000 --- a/cpp/include/hamstring/detector/detector.hpp +++ /dev/null @@ -1,40 +0,0 @@ -#pragma once - -#include "hamstring/detector/feature_extractor.hpp" -#include -#include -#include - -// Forward declaration for ONNX Runtime classes to avoid exposing them in the -// header -namespace onnxruntime { -class InferenceSession; -class Env; -class SessionOptions; -class RunOptions; -} // namespace onnxruntime - -namespace hamstring { -namespace detector { - -class Detector { -public: - Detector(); - ~Detector(); - - // Load ONNX model from file - void load_model(const std::string &model_path); - - // Predict probability of domain being DGA (0.0 - 1.0) - float predict(const std::string &domain); - -private: - FeatureExtractor feature_extractor_; - - // Pimpl idiom for ONNX Runtime objects - struct Impl; - std::unique_ptr impl_; -}; - -} // namespace detector -} // namespace hamstring diff --git a/cpp/include/hamstring/detector/detector_service.hpp b/cpp/include/hamstring/detector/detector_service.hpp deleted file mode 100644 index 5e288c53..00000000 --- a/cpp/include/hamstring/detector/detector_service.hpp +++ /dev/null @@ -1,71 +0,0 @@ -```cpp -#pragma once - -#include "hamstring/base/clickhouse_sender.hpp" -#include "hamstring/base/data_classes.hpp" -#include "hamstring/base/kafka_handler.hpp" -#include "hamstring/base/logger.hpp" -#include "hamstring/config/config.hpp" -#include "hamstring/detector/detector.hpp" -#include -#include 
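
// Editorial usage sketch (added in review; not part of the deleted sources).
// Intended use of the Detector declared in detector.hpp above; the model
// path, domain, and 0.5 threshold are placeholder values.
//
//   hamstring::detector::Detector det;
//   det.load_model("models/dga_classifier.onnx");       // ONNX model file
//   float score = det.predict("xjw9qk2v.example.com");  // P(domain is DGA)
//   bool flagged = score >= 0.5f;  // real threshold comes from DetectorConfig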
-#include -#include -#include -#include - - namespace hamstring { - namespace detector { - - class DetectorService { - public: - DetectorService(const std::string &name, const std::string &consume_topic, - const std::string &model_path, double threshold, - std::shared_ptr config, - const std::string &bootstrap_servers, - const std::string &group_id); - ~DetectorService(); - - void start(); - void stop(); - bool is_running() const { return running_; } - - struct Stats { - uint64_t batches_consumed = 0; - uint64_t domains_scanned = 0; - uint64_t domains_detected = 0; - }; - Stats get_stats() const; - - private: - void consume_loop(); - void process_batch(const base::Batch &batch); - - std::string name_; - std::string consume_topic_; - std::string model_path_; - double threshold_; - std::shared_ptr config_; - - std::shared_ptr logger_; - std::unique_ptr consumer_; - std::shared_ptr clickhouse_; - - // The core detector logic - Detector detector_; - - std::atomic running_{false}; - std::thread worker_thread_; - - // Stats - std::atomic batches_consumed_{0}; - std::atomic domains_scanned_{0}; - std::atomic domains_detected_{0}; - }; - - // Factory function - std::vector> - create_detector_services(std::shared_ptr config); - - } // namespace detector -} // namespace hamstring diff --git a/cpp/include/hamstring/detector/feature_extractor.hpp b/cpp/include/hamstring/detector/feature_extractor.hpp deleted file mode 100644 index 94a4163a..00000000 --- a/cpp/include/hamstring/detector/feature_extractor.hpp +++ /dev/null @@ -1,78 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -namespace hamstring { -namespace detector { - -// Feature vector for a domain name -struct DomainFeatures { - // Label statistics - int label_length = 0; - int label_max = 0; - double label_average = 0.0; - - // Character frequency (a-z) - std::map char_freq; - - // Domain level counts - double fqdn_full_count = 0.0; - double fqdn_alpha_count = 0.0; - double fqdn_numeric_count = 0.0; - double fqdn_special_count = 0.0; - - double secondleveldomain_full_count = 0.0; - double secondleveldomain_alpha_count = 0.0; - double secondleveldomain_numeric_count = 0.0; - double secondleveldomain_special_count = 0.0; - - double thirdleveldomain_full_count = 0.0; - double thirdleveldomain_alpha_count = 0.0; - double thirdleveldomain_numeric_count = 0.0; - double thirdleveldomain_special_count = 0.0; - - // Entropy - double fqdn_entropy = 0.0; - double secondleveldomain_entropy = 0.0; - double thirdleveldomain_entropy = 0.0; - - // Convert to vector for ML model input - std::vector to_vector() const; - - // Get feature names (for debugging/logging) - static std::vector get_feature_names(); -}; - -// Feature extractor matching Python implementation -class FeatureExtractor { -public: - FeatureExtractor() = default; - - // Extract features from a domain name - DomainFeatures extract(const std::string &domain) const; - -private: - // Helper methods matching Python implementation - int count_labels(const std::string &domain) const; - int get_max_label_length(const std::string &domain) const; - double get_average_label_length(const std::string &domain) const; - - std::map - calculate_char_frequency(const std::string &domain) const; - - double calculate_alpha_ratio(const std::string &text) const; - double calculate_numeric_ratio(const std::string &text) const; - double calculate_special_ratio(const std::string &text) const; - - double calculate_entropy(const std::string &text) const; - - std::string extract_fqdn(const std::string 
&domain) const; - std::string extract_second_level_domain(const std::string &domain) const; - std::string extract_third_level_domain(const std::string &domain) const; -}; - -} // namespace detector -} // namespace hamstring diff --git a/cpp/include/hamstring/inspector/anomaly_detector.hpp b/cpp/include/hamstring/inspector/anomaly_detector.hpp deleted file mode 100644 index 34506e45..00000000 --- a/cpp/include/hamstring/inspector/anomaly_detector.hpp +++ /dev/null @@ -1,124 +0,0 @@ -#pragma once - -#include "hamstring/base/data_classes.hpp" -#include "hamstring/config/config.hpp" -#include -#include -#include - -namespace hamstring { -namespace inspector { - -/** - * @brief Metrics extracted from a batch for anomaly detection - */ -struct BatchMetrics { - double nxdomain_rate; // Ratio of NXDOMAIN responses - double avg_domain_length; // Average domain name length - double domain_entropy; // Shannon entropy of domain names - double unique_domain_ratio; // Unique domains / total queries - size_t total_queries; // Total number of queries - double query_rate; // Queries per second - - // Character distribution - double numeric_char_ratio; // Ratio of numeric characters - double special_char_ratio; // Ratio of special characters -}; - -/** - * @brief Statistical anomaly detector using time-series analysis - * - * This class implements lightweight statistical methods for anomaly detection: - * - Z-score outlier detection - * - Moving averages and standard deviations - * - Threshold-based rules - * - Multi-metric ensemble scoring - */ -class AnomalyDetector { -public: - /** - * @brief Construct anomaly detector with configuration - * - * @param config Inspector configuration with thresholds - */ - explicit AnomalyDetector(const config::InspectorConfig &config); - - /** - * @brief Analyze batch and return suspicion score - * - * @param batch Batch to analyze - * @return Suspicion score [0.0, 1.0] where higher = more suspicious - */ - double analyze_batch(const base::Batch &batch); - - /** - * @brief Update internal statistics with new batch - * - * @param batch Batch to update statistics with - */ - void update_state(const base::Batch &batch); - - /** - * @brief Get current statistics (for debugging/monitoring) - */ - struct Statistics { - double mean_nxdomain_rate; - double stddev_nxdomain_rate; - double mean_domain_length; - double stddev_domain_length; - size_t samples_count; - }; - - Statistics get_statistics() const; - -private: - /** - * @brief Extract metrics from batch - */ - BatchMetrics extract_metrics(const base::Batch &batch); - - /** - * @brief Calculate Z-score for a value given historical data - */ - double calculate_z_score(double value, double mean, double stddev); - - /** - * @brief Update rolling statistics - */ - void update_rolling_stats(); - - /** - * @brief Calculate Shannon entropy of a string - */ - double calculate_entropy(const std::string &str); - - /** - * @brief Detect anomalies using multiple methods - */ - double detect_anomalies(const BatchMetrics &metrics); - - // Configuration - config::InspectorConfig config_; - - // Rolling window statistics - struct RollingStats { - std::deque nxdomain_rates; - std::deque avg_domain_lengths; - std::deque query_rates; - std::deque entropies; - - double mean_nxdomain = 0.0; - double stddev_nxdomain = 0.0; - double mean_domain_length = 0.0; - double stddev_domain_length = 0.0; - double mean_entropy = 0.0; - double stddev_entropy = 0.0; - }; - - RollingStats stats_; - size_t window_size_; - double z_score_threshold_; -}; - -} // 
namespace inspector -} // namespace hamstring diff --git a/cpp/include/hamstring/inspector/inspector.hpp b/cpp/include/hamstring/inspector/inspector.hpp deleted file mode 100644 index 65dd5559..00000000 --- a/cpp/include/hamstring/inspector/inspector.hpp +++ /dev/null @@ -1,154 +0,0 @@ -#pragma once - -#include "hamstring/base/clickhouse_sender.hpp" -#include "hamstring/base/data_classes.hpp" -#include "hamstring/base/kafka_handler.hpp" -#include "hamstring/base/logger.hpp" -#include "hamstring/config/config.hpp" -#include "hamstring/inspector/anomaly_detector.hpp" -#include -#include -#include -#include -#include - -namespace hamstring { -namespace inspector { - -/** - * @brief Inspector - Performs anomaly detection on batches - * - * The Inspector consumes filtered batches from the Prefilter stage and - * performs anomaly detection. Suspicious batches are grouped by source IP - * and forwarded to the Detector stage for further analysis. - * - * Features: - * - Anomaly detection using statistical models - * - IP-based batch grouping - * - Threshold-based filtering - * - ClickHouse monitoring integration - */ -class Inspector { -public: - /** - * @brief Construct a new Inspector - * - * @param name Inspector name - * @param consume_topic Kafka topic to consume batches from - * @param produce_topics Kafka topics to produce suspicious batches to - * @param mode Anomaly detection mode (univariate, multivariate, ensemble) - * @param anomaly_threshold Threshold for anomaly ratio - * @param score_threshold Threshold for anomaly scores - * @param config Global configuration - * @param bootstrap_servers Kafka broker addresses - * @param group_id Kafka consumer group ID - */ - Inspector(const std::string &name, const std::string &consume_topic, - const std::vector &produce_topics, - const std::string &mode, double anomaly_threshold, - double score_threshold, std::shared_ptr config, - const std::string &bootstrap_servers, const std::string &group_id); - - ~Inspector(); - - /** - * @brief Start the inspector - * - * Begins consuming batches from Kafka and performing anomaly detection. 
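 *
 * Editorial note (added in review; not part of the deleted header): per the
 * AnomalyDetector documentation above, each batch metric is scored by its
 * z-score over a rolling window, z = (x - mean) / stddev, and the combined
 * suspicion score in [0.0, 1.0] is compared against score_threshold_ when
 * deciding whether a batch is suspicious.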
- */ - void start(); - - /** - * @brief Stop the inspector gracefully - */ - void stop(); - - /** - * @brief Check if inspector is running - */ - bool is_running() const { return running_; } - - /** - * @brief Get inspector statistics - */ - struct Stats { - size_t batches_consumed; - size_t batches_suspicious; - size_t batches_filtered; - size_t suspicious_batches_sent; - }; - - Stats get_stats() const; - -private: - /** - * @brief Main message consumption loop - */ - void consume_loop(); - - /** - * @brief Process a single batch - * - * @param batch Batch to inspect - */ - void process_batch(const base::Batch &batch); - - /** - * @brief Check if batch is suspicious - * - * @param batch Batch to check - * @return true if suspicious, false otherwise - */ - bool is_suspicious(const base::Batch &batch); - - /** - * @brief Send suspicious batches to Kafka - * - * @param batches_by_ip Map of IP address to batches - * @param original_batch_id Original batch ID - */ - void send_suspicious_batches( - const std::map>> - &batches_by_ip, - const std::string &original_batch_id); - - // Configuration - std::string name_; - std::string consume_topic_; - std::vector produce_topics_; - std::string mode_; - double anomaly_threshold_; - double score_threshold_; - std::shared_ptr config_; - - // Components - std::unique_ptr consumer_; - std::vector> producers_; - std::shared_ptr clickhouse_; - std::unique_ptr anomaly_detector_; - - // Threading - std::atomic running_{false}; - std::thread worker_thread_; - - // Metrics - std::atomic batches_consumed_{0}; - std::atomic batches_suspicious_{0}; - std::atomic batches_filtered_{0}; - std::atomic suspicious_batches_sent_{0}; - - // Logger - std::shared_ptr logger_; -}; - -/** - * @brief Create Inspector instances from configuration - * - * @param config Application configuration - * @return Vector of Inspector instances - */ -std::vector> -create_inspectors(std::shared_ptr config); - -} // namespace inspector -} // namespace hamstring diff --git a/cpp/include/hamstring/logcollector/logcollector.hpp b/cpp/include/hamstring/logcollector/logcollector.hpp deleted file mode 100644 index 054ca17a..00000000 --- a/cpp/include/hamstring/logcollector/logcollector.hpp +++ /dev/null @@ -1,278 +0,0 @@ -#pragma once - -#include "hamstring/base/clickhouse_sender.hpp" -#include "hamstring/base/data_classes.hpp" -#include "hamstring/base/kafka_handler.hpp" -#include "hamstring/base/logger.hpp" -#include "hamstring/config/config.hpp" -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace hamstring { -namespace logcollector { - -/** - * @brief Buffered batch for efficient log line aggregation - * - * Thread-safe batch container that groups log lines by subnet ID. - * Automatically triggers sends when size or timeout limits are reached. - */ -class BufferedBatch { -public: - /** - * @brief Construct a new Buffered Batch - * - * @param collector_name Name of the collector - * @param batch_size Max messages per batch - * @param batch_timeout_ms Timeout in milliseconds - */ - BufferedBatch(const std::string &collector_name, size_t batch_size, - int batch_timeout_ms); - - ~BufferedBatch(); - - /** - * @brief Add a log line to the batch - * - * Thread-safe method to add a message to the appropriate batch. 
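 *
 * Editorial example (added in review; not part of the deleted header); the
 * collector name, sizes, and subnet are placeholder values:
 * @code
 *   BufferedBatch buffer("dns_collector", 2000, 30000);
 *   if (buffer.add_logline("10.0.0.0/24", logline)) {
 *     for (auto &batch : buffer.get_ready_batches())
 *       dispatch(batch); // hypothetical send helper
 *   }
 * @endcode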
- * - * @param subnet_id Subnet identifier for batching - * @param logline LogLine to add - * @return true if batch is ready to send - */ - bool add_logline(const std::string &subnet_id, const base::LogLine &logline); - - /** - * @brief Get completed batch for a subnet - * - * @param subnet_id Subnet identifier - * @return Batch object ready to send - */ - base::Batch get_batch(const std::string &subnet_id); - - /** - * @brief Get all ready batches - * - * @return Vector of batches that are ready to send - */ - std::vector get_ready_batches(); - - /** - * @brief Force send all batches (timeout or shutdown) - * - * @return Vector of all batches - */ - std::vector flush_all(); - - /** - * @brief Get statistics about current batches - */ - struct Stats { - size_t total_batches; - size_t total_loglines; - size_t largest_batch; - std::chrono::milliseconds oldest_batch_age; - }; - - Stats get_stats() const; - -private: - struct BatchData { - std::string batch_id; - std::string subnet_id; - std::vector loglines; - std::chrono::system_clock::time_point created_at; - std::chrono::system_clock::time_point last_updated; - }; - - mutable std::mutex batches_mutex_; - std::unordered_map batches_; - - std::string collector_name_; - size_t batch_size_; - std::chrono::milliseconds batch_timeout_; - - // Metrics - std::atomic total_loglines_processed_{0}; - std::atomic total_batches_sent_{0}; - - std::shared_ptr logger_; -}; - -/** - * @brief LogCollector - Validates and batches log lines - * - * Main component for log collection stage. Features: - * - Multi-threaded log line processing - * - Field validation with configurable rules - * - Subnet-based batching - * - Automatic batch dispatch - * - ClickHouse monitoring integration - * - Horizontal scalability support - */ -class LogCollector { -public: - /** - * @brief Construct a new Log Collector - * - * @param name Collector name - * @param protocol Protocol type (dns, http, etc.) - * @param consume_topic Kafka topic to consume from - * @param produce_topics Kafka topics to produce to - * @param validation_config Field validation rules - * @param config Global configuration - * @param bootstrap_servers Kafka broker addresses - * @param group_id Kafka consumer group ID - */ - LogCollector(const std::string &name, const std::string &protocol, - const std::string &consume_topic, - const std::vector &produce_topics, - const std::vector &validation_config, - std::shared_ptr config, - const std::string &bootstrap_servers, - const std::string &group_id); - - ~LogCollector(); - - /** - * @brief Start the collector - * - * Begins consuming from Kafka and processing log lines. - */ - void start(); - - /** - * @brief Stop the collector gracefully - * - * Finishes processing in-flight messages and flushes batches. 
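 *
 * Editorial note (added in review; not part of the deleted header): shutdown
 * presumably drains buffered data through BufferedBatch::flush_all(), the
 * documented "timeout or shutdown" path, so loglines validated before stop()
 * are still sent.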
- */ - void stop(); - - /** - * @brief Check if collector is running - */ - bool is_running() const { return running_; } - - /** - * @brief Get collector statistics - */ - struct Stats { - size_t messages_consumed; - size_t messages_validated; - size_t messages_failed; - size_t batches_sent; - double avg_validation_time_ms; - double avg_batch_time_ms; - }; - - Stats get_stats() const; - -private: - /** - * @brief Main message consumption loop - */ - void consume_loop(); - - /** - * @brief Process a single message - * - * @param message Raw message string from Kafka - */ - void process_message(const std::string &message); - - /** - * @brief Validate and parse a log line - * - * @param message Raw JSON message - * @return Validated LogLine object - * @throws std::runtime_error if validation fails - */ - base::LogLine validate_logline(const std::string &message); - - /** - * @brief Calculate subnet ID from IP address - * - * @param ip_address IP address string - * @return Subnet ID string - */ - std::string get_subnet_id(const std::string &ip_address); - - /** - * @brief Batch timeout handler - * - * Periodically checks for batches that need to be sent due to timeout. - */ - void batch_timeout_handler(); - - /** - * @brief Send batches to Kafka - * - * @param batches Batches to send - */ - void send_batches(const std::vector &batches); - - /** - * @brief Log failed validation to ClickHouse - * - * @param message Original message - * @param reason Failure reason - */ - void log_failed_logline(const std::string &message, - const std::string &reason); - - // Configuration - std::string name_; - std::string protocol_; - std::string consume_topic_; - std::vector produce_topics_; - std::vector validation_config_; - std::shared_ptr config_; - - // Batch configuration - size_t batch_size_; - int batch_timeout_ms_; - int ipv4_prefix_length_; - int ipv6_prefix_length_; - - // Components - std::unique_ptr batch_handler_; - std::unique_ptr consumer_; - std::unique_ptr producer_; - std::shared_ptr clickhouse_; - - // Threading - std::atomic running_{false}; - std::thread consumer_thread_; - std::thread batch_timer_thread_; - - // Metrics - std::atomic messages_consumed_{0}; - std::atomic messages_validated_{0}; - std::atomic messages_failed_{0}; - std::atomic batches_sent_{0}; - - // Logger - std::shared_ptr logger_; -}; - -/** - * @brief Create LogCollector instances from configuration - * - * Factory function that creates collectors for each configured collector. - * Supports horizontal scaling by creating multiple instances. 
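 *
 * Editorial example (added in review; not part of the deleted header; the
 * configuration path is a placeholder):
 * @code
 *   auto config = hamstring::config::Config::load_from_file("config.yaml");
 *   auto collectors = create_logcollectors(config);
 *   for (auto &collector : collectors)
 *     collector->start();
 * @endcode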
- * - * @param config Application configuration - * @return Vector of LogCollector instances - */ -std::vector> -create_logcollectors(std::shared_ptr config); - -} // namespace logcollector -} // namespace hamstring diff --git a/cpp/include/hamstring/logserver/logserver.hpp b/cpp/include/hamstring/logserver/logserver.hpp deleted file mode 100644 index 5f7fc73e..00000000 --- a/cpp/include/hamstring/logserver/logserver.hpp +++ /dev/null @@ -1,133 +0,0 @@ -#pragma once - -#include "hamstring/base/clickhouse_sender.hpp" -#include "hamstring/base/kafka_handler.hpp" -#include "hamstring/base/logger.hpp" -#include "hamstring/config/config.hpp" -#include -#include -#include -#include -#include - -namespace hamstring { -namespace logserver { - -/** - * @brief LogServer - Entry point for log data into the pipeline - * - * The LogServer consumes log messages from Kafka topics, stores them in - * ClickHouse for monitoring, and forwards them to the appropriate collector - * topics based on protocol configuration. - * - * Features: - * - Consumes from multiple Kafka input topics - * - Produces to multiple collector topics - * - Logs all messages and timestamps to ClickHouse - * - Async processing for high throughput - * - Graceful shutdown support - */ -class LogServer { -public: - /** - * @brief Construct a LogServer instance - * - * @param consume_topic Kafka topic to consume from - * @param produce_topics List of Kafka topics to produce to - * @param clickhouse ClickHouse sender for monitoring - * @param bootstrap_servers Comma-separated Kafka broker addresses - * @param group_id Kafka consumer group ID - */ - LogServer(const std::string &consume_topic, - const std::vector &produce_topics, - std::shared_ptr clickhouse, - const std::string &bootstrap_servers, const std::string &group_id); - - ~LogServer(); - - /** - * @brief Start the LogServer - * - * Begins consuming messages from Kafka and processing them. - * This method blocks until stop() is called. - */ - void start(); - - /** - * @brief Stop the LogServer - * - * Gracefully shuts down the server, finishing any in-flight messages. - */ - void stop(); - - /** - * @brief Check if server is running - */ - bool is_running() const { return running_; } - -private: - /** - * @brief Send a message to all producer topics - * - * @param message_id UUID of the message - * @param message Message content - */ - void send(const std::string &message_id, const std::string &message); - - /** - * @brief Main message fetching loop - * - * Continuously fetches messages from Kafka and processes them. 
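 *
 * Editorial note (added in review; not part of the deleted header): judging
 * from the interface above, each consumed message is assigned a UUID,
 * recorded via log_message()/log_timestamp(), and fanned out to every
 * configured producer topic through send().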
- */ - void fetch_from_kafka(); - - /** - * @brief Log message to ClickHouse - * - * @param message_id UUID of the message - * @param message Message content - */ - void log_message(const std::string &message_id, const std::string &message); - - /** - * @brief Log timestamp event to ClickHouse - * - * @param message_id UUID of the message - * @param event Event type (timestamp_in, timestamp_out) - */ - void log_timestamp(const std::string &message_id, const std::string &event); - - // Configuration - std::string consume_topic_; - std::vector produce_topics_; - - // Kafka handlers - std::unique_ptr consumer_; - std::vector> producers_; - - // ClickHouse for monitoring - std::shared_ptr clickhouse_; - - // Logger - std::shared_ptr logger_; - - // Runtime state - std::atomic running_; - std::thread worker_thread_; -}; - -/** - * @brief Create and start LogServer instances based on configuration - * - * Creates one LogServer instance per protocol defined in the configuration. - * Each server consumes from its protocol-specific topic and produces to - * collector topics that handle that protocol. - * - * @param config Application configuration - * @return Vector of LogServer instances - */ -std::vector> -create_logservers(std::shared_ptr config); - -} // namespace logserver -} // namespace hamstring diff --git a/cpp/include/hamstring/prefilter/prefilter.hpp b/cpp/include/hamstring/prefilter/prefilter.hpp deleted file mode 100644 index 70fcd2a3..00000000 --- a/cpp/include/hamstring/prefilter/prefilter.hpp +++ /dev/null @@ -1,149 +0,0 @@ -#pragma once - -#include "hamstring/base/clickhouse_sender.hpp" -#include "hamstring/base/data_classes.hpp" -#include "hamstring/base/kafka_handler.hpp" -#include "hamstring/base/logger.hpp" -#include "hamstring/config/config.hpp" -#include -#include -#include -#include -#include - -namespace hamstring { -namespace prefilter { - -/** - * @brief Prefilter - Filters batches based on relevance rules - * - * The Prefilter consumes batches from the LogCollector stage and applies - * relevance-based filtering. Only relevant log lines are forwarded to - * the Inspector stage for anomaly detection. - * - * Features: - * - Rule-based relevance filtering - * - Batch processing with metrics - * - ClickHouse monitoring integration - * - Multi-threaded processing - */ -class Prefilter { -public: - /** - * @brief Construct a new Prefilter - * - * @param name Prefilter name - * @param consume_topic Kafka topic to consume batches from - * @param produce_topics Kafka topics to produce filtered batches to - * @param relevance_function Name of relevance function to use - * @param validation_config Field validation rules - * @param config Global configuration - * @param bootstrap_servers Kafka broker addresses - * @param group_id Kafka consumer group ID - */ - Prefilter(const std::string &name, const std::string &consume_topic, - const std::vector &produce_topics, - const std::string &relevance_function, - const std::vector &validation_config, - std::shared_ptr config, - const std::string &bootstrap_servers, const std::string &group_id); - - ~Prefilter(); - - /** - * @brief Start the prefilter - * - * Begins consuming batches from Kafka and filtering them. 
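 *
 * Editorial note (added in review; not part of the deleted header):
 * relevance is evaluated per logline; judging from ListItemValidator in
 * data_classes.cpp, a logline is presumably kept when the checked field's
 * value appears in that field's relevant_list.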
- */ - void start(); - - /** - * @brief Stop the prefilter gracefully - */ - void stop(); - - /** - * @brief Check if prefilter is running - */ - bool is_running() const { return running_; } - - /** - * @brief Get prefilter statistics - */ - struct Stats { - size_t batches_consumed; - size_t batches_sent; - size_t loglines_received; - size_t loglines_filtered; - size_t loglines_sent; - }; - - Stats get_stats() const; - -private: - /** - * @brief Main message consumption loop - */ - void consume_loop(); - - /** - * @brief Process a single batch - * - * @param batch Batch to process - */ - void process_batch(const base::Batch &batch); - - /** - * @brief Check if a log line is relevant - * - * @param logline LogLine to check - * @return true if relevant, false otherwise - */ - bool check_relevance(const base::LogLine &logline); - - /** - * @brief Send filtered batch to Kafka - * - * @param batch Filtered batch to send - */ - void send_batch(const base::Batch &batch); - - // Configuration - std::string name_; - std::string consume_topic_; - std::vector produce_topics_; - std::string relevance_function_; - std::vector validation_config_; - std::shared_ptr config_; - - // Components - std::unique_ptr consumer_; - std::vector> producers_; - std::shared_ptr clickhouse_; - - // Threading - std::atomic running_{false}; - std::thread worker_thread_; - - // Metrics - std::atomic batches_consumed_{0}; - std::atomic batches_sent_{0}; - std::atomic loglines_received_{0}; - std::atomic loglines_filtered_{0}; - std::atomic loglines_sent_{0}; - - // Logger - std::shared_ptr logger_; -}; - -/** - * @brief Create Prefilter instances from configuration - * - * @param config Application configuration - * @return Vector of Prefilter instances - */ -std::vector> -create_prefilters(std::shared_ptr config); - -} // namespace prefilter -} // namespace hamstring diff --git a/cpp/src/CMakeLists.txt b/cpp/src/CMakeLists.txt deleted file mode 100644 index 486bc26f..00000000 --- a/cpp/src/CMakeLists.txt +++ /dev/null @@ -1,48 +0,0 @@ -# Base sources -set(BASE_SOURCES - base/logger.cpp - base/utils.cpp - base/data_classes.cpp - base/clickhouse_sender.cpp - base/kafka_handler.cpp -) - -# Config sources -set(CONFIG_SOURCES - config/config.cpp -) - -# Base library with core infrastructure -add_library(hamstring_base - ${BASE_SOURCES} - ${CONFIG_SOURCES} -) - -target_link_libraries(hamstring_base - PUBLIC - spdlog::spdlog - fmt::fmt - RdKafka::rdkafka - RdKafka::rdkafka++ - nlohmann_json::nlohmann_json - yaml-cpp - OpenSSL::SSL - OpenSSL::Crypto - ${CLICKHOUSE_CPP_LIB} - ${ZSTD_LIB} - ${CITYHASH_LIB} -) - -target_include_directories(hamstring_base - PUBLIC - ${CMAKE_SOURCE_DIR}/include - ${CLICKHOUSE_INCLUDE_DIR} -) - -# Modules -add_subdirectory(logserver) -add_subdirectory(logcollector) -add_subdirectory(prefilter) -add_subdirectory(inspector) -# add_subdirectory(detector) # Requires ONNX Runtime -# add_subdirectory(monitoring) diff --git a/cpp/src/base/clickhouse_sender.cpp b/cpp/src/base/clickhouse_sender.cpp deleted file mode 100644 index 723a3d8e..00000000 --- a/cpp/src/base/clickhouse_sender.cpp +++ /dev/null @@ -1,236 +0,0 @@ -#include "hamstring/base/clickhouse_sender.hpp" -#include "hamstring/base/logger.hpp" -#include -#include -#include -#include -#include - -namespace hamstring { -namespace base { - -ClickHouseSender::ClickHouseSender(const std::string &hostname, int port, - const std::string &database, - const std::string &user, - const std::string &password) - : hostname_(hostname), port_(port), 
database_(database), user_(user), - password_(password), connected_(false) { - - auto logger = Logger::get_logger("clickhouse"); - - try { - // Create ClickHouse client with connection options - clickhouse::ClientOptions options; - options.SetHost(hostname_); - options.SetPort(port_); - options.SetDefaultDatabase(database_); - options.SetUser(user_); - options.SetPassword(password_); - options.SetPingBeforeQuery(true); - - client_ = std::make_unique(options); - connected_ = true; - - logger->info("ClickHouse client connected to {}:{}/{}", hostname_, port_, - database_); - } catch (const std::exception &e) { - logger->error("Failed to connect to ClickHouse: {}", e.what()); - connected_ = false; - } -} - -ClickHouseSender::~ClickHouseSender() = default; - -void ClickHouseSender::insert_batch_timestamp(const std::string &batch_id, - const std::string &stage, - const std::string &instance_name, - const std::string &status, - size_t message_count, - bool is_active) { - auto logger = Logger::get_logger("clickhouse.metrics"); - logger->debug("BATCH_TIMESTAMP: batch_id={}, stage={}, instance={}, " - "status={}, count={}, active={}", - batch_id, stage, instance_name, status, message_count, - is_active); -} - -void ClickHouseSender::insert_logline_timestamp(const std::string &logline_id, - const std::string &stage, - const std::string &status, - bool is_active) { - auto logger = Logger::get_logger("clickhouse.metrics"); - logger->trace( - "LOGLINE_TIMESTAMP: logline_id={}, stage={}, status={}, active={}", - logline_id, stage, status, is_active); -} - -void ClickHouseSender::insert_fill_level(const std::string &stage, - const std::string &entry_type, - size_t entry_count) { - auto logger = Logger::get_logger("clickhouse.metrics"); - logger->debug("FILL_LEVEL: stage={}, type={}, count={}", stage, entry_type, - entry_count); -} - -void ClickHouseSender::insert_dga_detection(const std::string &domain, - double score, - const std::string &batch_id, - const std::string &src_ip) { - auto logger = Logger::get_logger("clickhouse.detections"); - logger->info("DGA_DETECTION: domain={}, score={:.4f}, batch={}, ip={}", - domain, score, batch_id, src_ip); -} - -void ClickHouseSender::execute(const std::string &query) { - auto logger = Logger::get_logger("clickhouse"); - logger->debug("EXECUTE: {}", query); -} - -bool ClickHouseSender::ping() { - if (!connected_ || !client_) { - return false; - } - - try { - client_->Execute("SELECT 1"); - return true; - } catch (const std::exception &e) { - auto logger = Logger::get_logger("clickhouse"); - logger->error("ClickHouse ping failed: {}", e.what()); - return false; - } -} - -void ClickHouseSender::insert_server_log(const std::string &message_id, - int64_t timestamp_ms, - const std::string &message_text) { - if (!connected_ || !client_) { - auto logger = Logger::get_logger("clickhouse"); - logger->warn("ClickHouse not connected, skipping server_log insert"); - return; - } - - try { - // Create block with columns - clickhouse::Block block; - - // message_id as String (ClickHouse will convert to UUID) - // Using String instead of UUID column to avoid boost UUID parsing - // complexity - auto col_message_id = std::make_shared(); - col_message_id->Append(message_id); - - // timestamp_in as DateTime64(6) - represented as milliseconds since epoch - auto col_timestamp = std::make_shared(6); - col_timestamp->Append(timestamp_ms * 1000); // Convert ms to microseconds - - // message_text as String - auto col_message = std::make_shared(); - col_message->Append(message_text); - - 
// Add columns to block - block.AppendColumn("message_id", col_message_id); - block.AppendColumn("timestamp_in", col_timestamp); - block.AppendColumn("message_text", col_message); - - // Insert block - client_->Insert("server_logs", block); - - } catch (const std::exception &e) { - auto logger = Logger::get_logger("clickhouse"); - logger->error("Failed to insert server_log: {}", e.what()); - } -} - -void ClickHouseSender::insert_server_log_timestamp( - const std::string &message_id, const std::string &event, - int64_t event_timestamp_ms) { - if (!connected_ || !client_) { - auto logger = Logger::get_logger("clickhouse"); - logger->warn( - "ClickHouse not connected, skipping server_log_timestamp insert"); - return; - } - - try { - // Create block with columns - clickhouse::Block block; - - // message_id as String (ClickHouse will convert to UUID) - auto col_message_id = std::make_shared(); - col_message_id->Append(message_id); - - // event as String - auto col_event = std::make_shared(); - col_event->Append(event); - - // event_timestamp as DateTime64(6) - auto col_timestamp = std::make_shared(6); - col_timestamp->Append(event_timestamp_ms * - 1000); // Convert ms to microseconds - - // Add columns to block - block.AppendColumn("message_id", col_message_id); - block.AppendColumn("event", col_event); - block.AppendColumn("event_timestamp", col_timestamp); - - // Insert block - client_->Insert("server_logs_timestamps", block); - - } catch (const std::exception &e) { - auto logger = Logger::get_logger("clickhouse"); - logger->error("Failed to insert server_log_timestamp: {}", e.what()); - } -} - -void ClickHouseSender::insert_failed_logline(const std::string &message_text, - int64_t timestamp_in_ms, - int64_t timestamp_failed_ms, - const std::string &reason) { - if (!connected_ || !client_) { - auto logger = Logger::get_logger("clickhouse"); - logger->warn("ClickHouse not connected, skipping failed_logline insert"); - return; - } - - try { - // Create block with columns - clickhouse::Block block; - - // message_text as String - auto col_message = std::make_shared(); - col_message->Append(message_text); - - // timestamp_in as DateTime64(6) - auto col_timestamp_in = std::make_shared(6); - col_timestamp_in->Append(timestamp_in_ms * - 1000); // Convert ms to microseconds - - // timestamp_failed as DateTime64(6) - auto col_timestamp_failed = - std::make_shared(6); - col_timestamp_failed->Append(timestamp_failed_ms * - 1000); // Convert ms to microseconds - - // reason_for_failure as String (nullable in schema, but we always provide a - // value) - auto col_reason = std::make_shared(); - col_reason->Append(reason); - - // Add columns to block - block.AppendColumn("message_text", col_message); - block.AppendColumn("timestamp_in", col_timestamp_in); - block.AppendColumn("timestamp_failed", col_timestamp_failed); - block.AppendColumn("reason_for_failure", col_reason); - - // Insert block - client_->Insert("failed_loglines", block); - - } catch (const std::exception &e) { - auto logger = Logger::get_logger("clickhouse"); - logger->error("Failed to insert failed_logline: {}", e.what()); - } -} - -} // namespace base -} // namespace hamstring diff --git a/cpp/src/base/data_classes.cpp b/cpp/src/base/data_classes.cpp deleted file mode 100644 index 22c10dd2..00000000 --- a/cpp/src/base/data_classes.cpp +++ /dev/null @@ -1,197 +0,0 @@ -#include "hamstring/base/data_classes.hpp" -#include "hamstring/base/utils.hpp" -#include -#include -#include - -using json = nlohmann::json; - -namespace hamstring { 
-namespace base { - -// ============================================================================ -// LogLine Implementation -// ============================================================================ - -std::string LogLine::to_json() const { - json j; - j["logline_id"] = logline_id; - j["batch_id"] = batch_id; - j["fields"] = fields; - j["timestamp"] = utils::timestamp_to_ms(timestamp); - return j.dump(); -} - -std::shared_ptr LogLine::from_json(const std::string &json_str) { - auto j = json::parse(json_str); - auto logline = std::make_shared(); - - logline->logline_id = j["logline_id"]; - logline->batch_id = j.value("batch_id", ""); - logline->fields = j["fields"].get>(); - logline->timestamp = utils::ms_to_timestamp(j["timestamp"]); - - return logline; -} - -std::optional LogLine::get_field(const std::string &name) const { - auto it = fields.find(name); - if (it != fields.end()) { - return it->second; - } - return std::nullopt; -} - -void LogLine::set_field(const std::string &name, const std::string &value) { - fields[name] = value; -} - -// ============================================================================ -// Batch Implementation -// ============================================================================ - -std::string Batch::to_json() const { - json j; - j["batch_id"] = batch_id; - j["subnet_id"] = subnet_id; - j["collector_name"] = collector_name; - j["created_at"] = utils::timestamp_to_ms(created_at); - j["timestamp_in"] = utils::timestamp_to_ms(timestamp_in); - - json loglines_json = json::array(); - for (const auto &logline : loglines) { - loglines_json.push_back(json::parse(logline->to_json())); - } - j["loglines"] = loglines_json; - - return j.dump(); -} - -std::shared_ptr Batch::from_json(const std::string &json_str) { - auto j = json::parse(json_str); - auto batch = std::make_shared(); - - batch->batch_id = j["batch_id"]; - batch->subnet_id = j["subnet_id"]; - batch->collector_name = j["collector_name"]; - batch->created_at = utils::ms_to_timestamp(j["created_at"]); - batch->timestamp_in = utils::ms_to_timestamp(j["timestamp_in"]); - - for (const auto &logline_json : j["loglines"]) { - batch->loglines.push_back(LogLine::from_json(logline_json.dump())); - } - - return batch; -} - -void Batch::add_logline(std::shared_ptr logline) { - loglines.push_back(logline); -} - -// ============================================================================ -// Warning Implementation -// ============================================================================ - -std::string Warning::to_json() const { - json j; - j["warning_id"] = warning_id; - j["batch_id"] = batch_id; - j["src_ip"] = src_ip; - j["domain_name"] = domain_name; - j["score"] = score; - j["threshold"] = threshold; - j["timestamp"] = utils::timestamp_to_ms(timestamp); - j["metadata"] = metadata; - - return j.dump(); -} - -std::shared_ptr Warning::from_json(const std::string &json_str) { - auto j = json::parse(json_str); - auto warning = std::make_shared(); - - warning->warning_id = j["warning_id"]; - warning->batch_id = j["batch_id"]; - warning->src_ip = j["src_ip"]; - warning->domain_name = j["domain_name"]; - warning->score = j["score"]; - warning->threshold = j["threshold"]; - warning->timestamp = utils::ms_to_timestamp(j["timestamp"]); - warning->metadata = j["metadata"].get>(); - - return warning; -} - -// ============================================================================ -// RegExValidator Implementation -// ============================================================================ 
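
// Editorial sketch (added in review; not part of the deleted sources): how
// the validators below are exercised; the field name and pattern are
// placeholder values.
//
//   RegExValidator v("domain", R"(^[a-z0-9.-]+$)");
//   v.validate("example.com"); // true  - matches the pattern
//   v.validate("bad domain!"); // false - space and '!' are rejected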
- -RegExValidator::RegExValidator(const std::string &name, - const std::string &pattern) - : name_(name), pattern_(pattern) {} - -bool RegExValidator::validate(const std::string &value) const { - return std::regex_match(value, pattern_); -} - -// ============================================================================ -// TimestampValidator Implementation -// ============================================================================ - -TimestampValidator::TimestampValidator(const std::string &name, - const std::string &format) - : name_(name), format_(format) {} - -bool TimestampValidator::validate(const std::string &value) const { - try { - parse(value); - return true; - } catch (...) { - return false; - } -} - -std::chrono::system_clock::time_point -TimestampValidator::parse(const std::string &value) const { - return utils::parse_timestamp(value, format_); -} - -// ============================================================================ -// IpAddressValidator Implementation -// ============================================================================ - -IpAddressValidator::IpAddressValidator(const std::string &name) : name_(name) {} - -bool IpAddressValidator::validate(const std::string &value) const { - return is_ipv4(value) || is_ipv6(value); -} - -bool IpAddressValidator::is_ipv4(const std::string &value) { - return utils::is_valid_ipv4(value); -} - -bool IpAddressValidator::is_ipv6(const std::string &value) { - return utils::is_valid_ipv6(value); -} - -// ============================================================================ -// ListItemValidator Implementation -// ============================================================================ - -ListItemValidator::ListItemValidator( - const std::string &name, const std::vector &allowed_list, - const std::vector &relevant_list) - : name_(name), allowed_list_(allowed_list), relevant_list_(relevant_list) {} - -bool ListItemValidator::validate(const std::string &value) const { - return std::find(allowed_list_.begin(), allowed_list_.end(), value) != - allowed_list_.end(); -} - -bool ListItemValidator::is_relevant(const std::string &value) const { - return std::find(relevant_list_.begin(), relevant_list_.end(), value) != - relevant_list_.end(); -} - -} // namespace base -} // namespace hamstring diff --git a/cpp/src/base/kafka_handler.cpp b/cpp/src/base/kafka_handler.cpp deleted file mode 100644 index 95738c92..00000000 --- a/cpp/src/base/kafka_handler.cpp +++ /dev/null @@ -1,286 +0,0 @@ -#include "hamstring/base/kafka_handler.hpp" -#include "hamstring/base/logger.hpp" -#include - -namespace hamstring { -namespace base { - -// ============================================================================ -// KafkaHandler Base Implementation -// ============================================================================ - -std::unique_ptr -KafkaHandler::create_config(const std::string &bootstrap_servers, - const std::string &group_id) { - - std::string errstr; - auto conf = std::unique_ptr( - RdKafka::Conf::create(RdKafka::Conf::CONF_GLOBAL)); - - if (conf->set("bootstrap.servers", bootstrap_servers, errstr) != - RdKafka::Conf::CONF_OK) { - throw std::runtime_error("Failed to set bootstrap.servers: " + errstr); - } - - if (!group_id.empty()) { - if (conf->set("group.id", group_id, errstr) != RdKafka::Conf::CONF_OK) { - throw std::runtime_error("Failed to set group.id: " + errstr); - } - } - - return conf; -} - -// ============================================================================ -// KafkaProduceHandler 
Implementation
-// ============================================================================
-
-KafkaProduceHandler::KafkaProduceHandler(const std::string &bootstrap_servers,
-                                         const std::string &topic)
-    : topic_(topic) {
-
-  bootstrap_servers_ = bootstrap_servers;
-
-  std::string errstr;
-  conf_ = create_config(bootstrap_servers);
-
-  // Producer-specific settings
-  conf_->set("enable.idempotence", "false", errstr);
-  conf_->set("acks", "1", errstr);
-  conf_->set("message.max.bytes", "1000000000", errstr);
-
-  // Create producer
-  producer_ = std::unique_ptr<RdKafka::Producer>(
-      RdKafka::Producer::create(conf_.get(), errstr));
-
-  if (!producer_) {
-    throw std::runtime_error("Failed to create Kafka producer: " + errstr);
-  }
-
-  auto logger = Logger::get_logger("kafka.producer");
-  logger->info("Kafka producer created for topic '{}'", topic_);
-}
-
-KafkaProduceHandler::~KafkaProduceHandler() {
-  if (producer_) {
-    producer_->flush(10000);
-  }
-}
-
-bool KafkaProduceHandler::send(const std::string &key,
-                               const std::string &value) {
-  return send(key, value, 0); // timestamp=0 means current time
-}
-
-bool KafkaProduceHandler::send(const std::string &key,
-                               const std::string &value, int64_t timestamp) {
-  if (value.empty()) {
-    return true;
-  }
-
-  RdKafka::ErrorCode err = producer_->produce(
-      topic_, RdKafka::Topic::PARTITION_UA, RdKafka::Producer::RK_MSG_COPY,
-      const_cast<char *>(value.data()), value.size(),
-      key.empty() ? nullptr : key.data(), key.size(), timestamp,
-      nullptr // msg_opaque (this overload takes no headers)
-  );
-
-  if (err != RdKafka::ERR_NO_ERROR) {
-    auto logger = Logger::get_logger("kafka.producer");
-    logger->error("Failed to produce message: {}", RdKafka::err2str(err));
-    return false;
-  }
-
-  producer_->poll(0); // Handle delivery reports
-  return true;
-}
-
-void KafkaProduceHandler::flush(int timeout_ms) {
-  if (producer_) {
-    producer_->flush(timeout_ms);
-  }
-}
-
-// ============================================================================
-// KafkaConsumeHandler Implementation
-// ============================================================================
-
-KafkaConsumeHandler::KafkaConsumeHandler(const std::string &bootstrap_servers,
-                                         const std::string &group_id,
-                                         const std::vector<std::string> &topics)
-    : topics_(topics) {
-
-  bootstrap_servers_ = bootstrap_servers;
-
-  std::string errstr;
-  conf_ = create_config(bootstrap_servers, group_id);
-
-  // Consumer-specific settings
-  conf_->set("enable.auto.commit", "false", errstr);
-  conf_->set("auto.offset.reset", "earliest", errstr);
-  conf_->set("enable.partition.eof", "true", errstr);
-
-  // Create consumer
-  consumer_ = std::unique_ptr<RdKafka::KafkaConsumer>(
-      RdKafka::KafkaConsumer::create(conf_.get(), errstr));
-
-  if (!consumer_) {
-    throw std::runtime_error("Failed to create Kafka consumer: " + errstr);
-  }
-
-  // Subscribe to topics
-  RdKafka::ErrorCode err = consumer_->subscribe(topics_);
-  if (err != RdKafka::ERR_NO_ERROR) {
-    throw std::runtime_error("Failed to subscribe to topics: " +
-                             RdKafka::err2str(err));
-  }
-
-  auto logger = Logger::get_logger("kafka.consumer");
-  logger->info("Kafka consumer created for {} topics (group: {})",
-               topics_.size(), group_id);
-}
-
-KafkaConsumeHandler::~KafkaConsumeHandler() {
-  stop();
-  if (consumer_) {
-    consumer_->close();
-  }
-}
-
-void KafkaConsumeHandler::poll(KafkaMessageCallback callback, int timeout_ms) {
-  RdKafka::Message *msg = consumer_->consume(timeout_ms);
-
-  if (!msg) {
-    return;
-  }
-
-  RdKafka::ErrorCode err = msg->err();
-
-  if (err == RdKafka::ERR__TIMED_OUT || err == RdKafka::ERR__PARTITION_EOF) {
-    delete msg;
-    return;
-  }
-
-  if (err != RdKafka::ERR_NO_ERROR) {
-    auto logger = Logger::get_logger("kafka.consumer");
-    logger->error("Kafka consume error: {}", msg->errstr());
-    delete msg;
-    return;
-  }
-
-  // Extract message data
-  std::string topic = msg->topic_name();
-  std::string key;
-  if (msg->key()) {
-    key = std::string(
-        reinterpret_cast<const char *>(msg->key_pointer()), msg->key_len());
-  }
-
-  std::string value;
-  if (msg->payload()) {
-    value = std::string(static_cast<const char *>(msg->payload()), msg->len());
-  }
-
-  int64_t timestamp = msg->timestamp().timestamp;
-
-  delete msg;
-
-  // Call the callback
-  if (callback) {
-    callback(topic, key, value, timestamp);
-  }
-}
-
-void KafkaConsumeHandler::start_async(boost::asio::io_context &io_context,
-                                      KafkaMessageCallback callback) {
-  running_ = true;
-
-  // Note: This is a simple placeholder. For production, use boost::asio::post
-  // with a proper work guard to keep the io_context running
-  // TODO: Implement proper async polling with boost::asio
-}
-
-void KafkaConsumeHandler::stop() { running_ = false; }
-
-void KafkaConsumeHandler::commit() {
-  if (consumer_) {
-    consumer_->commitSync();
-  }
-}
-
-// ============================================================================
-// ExactlyOnceKafkaHandler Implementation
-// ============================================================================
-
-ExactlyOnceKafkaHandler::ExactlyOnceKafkaHandler(
-    const std::string &bootstrap_servers, const std::string &consumer_group_id,
-    const std::vector<std::string> &consume_topics,
-    const std::string &produce_topic) {
-
-  consumer_ = std::make_unique<KafkaConsumeHandler>(
-      bootstrap_servers, consumer_group_id, consume_topics);
-
-  producer_ =
-      std::make_unique<KafkaProduceHandler>(bootstrap_servers, produce_topic);
-}
-
-ExactlyOnceKafkaHandler::~ExactlyOnceKafkaHandler() { stop(); }
-
-void ExactlyOnceKafkaHandler::process(
-    std::function<std::string(const std::string &, const std::string &)>
        transform_fn,
-    int timeout_ms) {
-
-  consumer_->poll(
-      [this, transform_fn](const std::string &topic, const std::string &key,
-                           const std::string &value, int64_t timestamp) {
-        // Transform the message
-        std::string transformed = transform_fn(key, value);
-
-        // Send to output topic
-        if (!transformed.empty()) {
-          producer_->send(key, transformed, timestamp);
-        }
-
-        // Commit the offset
-        consumer_->commit();
-      },
-      timeout_ms);
-}
-
-void ExactlyOnceKafkaHandler::start_async(
-    boost::asio::io_context &io_context,
-    std::function<std::string(const std::string &, const std::string &)>
-        transform_fn) {
-
-  running_ = true;
-
-  consumer_->start_async(
-      io_context,
-      [this, transform_fn](const std::string &topic, const std::string &key,
-                           const std::string &value, int64_t timestamp) {
-        if (!running_)
-          return;
-
-        std::string transformed = transform_fn(key, value);
-
-        if (!transformed.empty()) {
-          producer_->send(key, transformed, timestamp);
-        }
-
-        consumer_->commit();
-      });
-}
-
-void ExactlyOnceKafkaHandler::stop() {
-  running_ = false;
-  if (consumer_) {
-    consumer_->stop();
-  }
-  if (producer_) {
-    producer_->flush();
-  }
-}
-
-} // namespace base
-} // namespace hamstring
diff --git a/cpp/src/base/logger.cpp b/cpp/src/base/logger.cpp
deleted file mode 100644
index 7065dfc8..00000000
--- a/cpp/src/base/logger.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-#include "hamstring/base/logger.hpp"
-#include <map>
-#include <mutex>
-#include <spdlog/sinks/stdout_color_sinks.h>
-#include <vector>
-
-namespace hamstring {
-namespace base {
-
-namespace {
-std::map<std::string, std::shared_ptr<spdlog::logger>> loggers_;
-std::mutex loggers_mutex_;
-} // namespace
-
-std::shared_ptr<spdlog::logger>
-Logger::get_logger(const std::string &module_name) {
-  std::lock_guard<std::mutex> lock(loggers_mutex_);
-
-  auto it = loggers_.find(module_name);
-  if (it != loggers_.end()) {
-    return it->second;
-  }
-
-  auto logger = create_logger(module_name);
-  loggers_[module_name] = logger;
-  return logger;
-}
-
-void Logger::set_level(const std::string &module_name,
-                       spdlog::level::level_enum level) {
-  auto logger = get_logger(module_name);
-  logger->set_level(level);
-}
-
-void Logger::set_global_level(spdlog::level::level_enum level) {
-  spdlog::set_level(level);
-}
-
-void Logger::initialize(bool debug) {
-  auto level = debug ? spdlog::level::debug : spdlog::level::info;
-  spdlog::set_level(level);
-  spdlog::set_pattern("[%Y-%m-%d %H:%M:%S.%e] [%n] [%^%l%$] %v");
-}
-
-std::shared_ptr<spdlog::logger>
-Logger::create_logger(const std::string &name) {
-  auto console_sink = std::make_shared<spdlog::sinks::stdout_color_sink_mt>();
-  console_sink->set_level(spdlog::level::trace);
-
-  std::vector<spdlog::sink_ptr> sinks{console_sink};
-
-  auto logger =
-      std::make_shared<spdlog::logger>(name, sinks.begin(), sinks.end());
-  logger->set_level(spdlog::level::info);
-
-  return logger;
-}
-
-} // namespace base
-} // namespace hamstring
diff --git a/cpp/src/base/utils.cpp b/cpp/src/base/utils.cpp
deleted file mode 100644
index f6eabf73..00000000
--- a/cpp/src/base/utils.cpp
+++ /dev/null
@@ -1,258 +0,0 @@
-#include "hamstring/base/utils.hpp"
-#include <algorithm>
-#include <arpa/inet.h>
-#include <cctype>
-#include <ctime>
-#include <fstream>
-#include <iomanip>
-#include <openssl/sha.h>
-#include <random>
-#include <sstream>
-
-namespace hamstring {
-namespace base {
-namespace utils {
-
-std::string generate_uuid() {
-  std::random_device rd;
-  std::mt19937 gen(rd());
-  std::uniform_int_distribution<> dis(0, 15);
-  std::uniform_int_distribution<> dis2(8, 11);
-
-  std::stringstream ss;
-  ss << std::hex;
-
-  for (int i = 0; i < 8; i++) {
-    ss << dis(gen);
-  }
-  ss << "-";
-
-  for (int i = 0; i < 4; i++) {
-    ss << dis(gen);
-  }
-  ss << "-4";
-
-  for (int i = 0; i < 3; i++) {
-    ss << dis(gen);
-  }
-  ss << "-";
-
-  ss << dis2(gen);
-  for (int i = 0; i < 3; i++) {
-    ss << dis(gen);
-  }
-  ss << "-";
-
-  for (int i = 0; i < 12; i++) {
-    ss << dis(gen);
-  }
-
-  return ss.str();
-}
-
-bool is_valid_ipv4(const std::string &ip) {
-  struct sockaddr_in sa;
-  return inet_pton(AF_INET, ip.c_str(), &(sa.sin_addr)) == 1;
-}
-
-bool is_valid_ipv6(const std::string &ip) {
-  struct sockaddr_in6 sa;
-  return inet_pton(AF_INET6, ip.c_str(), &(sa.sin6_addr)) == 1;
-}
-
-std::string get_subnet_id(const std::string &ip, int prefix_length) {
-  if (is_valid_ipv4(ip)) {
-    struct in_addr addr;
-    inet_pton(AF_INET, ip.c_str(), &addr);
-
-    uint32_t mask = (0xFFFFFFFF << (32 - prefix_length)) & 0xFFFFFFFF;
-    uint32_t subnet = ntohl(addr.s_addr) & mask;
-    addr.s_addr = htonl(subnet);
-
-    char subnet_str[INET_ADDRSTRLEN];
-    inet_ntop(AF_INET, &addr, subnet_str, INET_ADDRSTRLEN);
-    return std::string(subnet_str) + "/" + std::to_string(prefix_length);
-  } else if (is_valid_ipv6(ip)) {
-    struct in6_addr addr;
-    inet_pton(AF_INET6, ip.c_str(), &addr);
-
-    int bytes_to_mask = prefix_length / 8;
-    int bits_to_mask = prefix_length % 8;
-
-    for (int i = bytes_to_mask; i < 16; i++) {
-      addr.s6_addr[i] = 0;
-    }
-
-    if (bits_to_mask > 0 && bytes_to_mask < 16) {
-      uint8_t mask = (0xFF << (8 - bits_to_mask)) & 0xFF;
-      addr.s6_addr[bytes_to_mask] &= mask;
-    }
-
-    char subnet_str[INET6_ADDRSTRLEN];
-    inet_ntop(AF_INET6, &addr, subnet_str, INET6_ADDRSTRLEN);
-    return std::string(subnet_str) + "/" + std::to_string(prefix_length);
-  }
-
-  return "";
-}
-
-std::string format_timestamp(const std::chrono::system_clock::time_point &tp,
-                             const std::string &format) {
-  auto time_t = std::chrono::system_clock::to_time_t(tp);
-  std::tm tm = *std::gmtime(&time_t);
-
-  std::stringstream ss;
-  ss << std::put_time(&tm, format.c_str());
-  return ss.str();
-}
-
-std::chrono::system_clock::time_point
-parse_timestamp(const std::string &ts_str, const std::string &format) {
-  std::tm tm = {};
-  std::istringstream ss(ts_str);
-  ss >> std::get_time(&tm, format.c_str());
-
-  auto time_t = std::mktime(&tm);
-  return std::chrono::system_clock::from_time_t(time_t);
-}
-
-int64_t timestamp_to_ms(const std::chrono::system_clock::time_point &tp) {
-  auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(
-      tp.time_since_epoch());
-  return ms.count();
-}
-
-std::chrono::system_clock::time_point ms_to_timestamp(int64_t ms) {
-  return std::chrono::system_clock::time_point(std::chrono::milliseconds(ms));
-}
-
-std::vector<std::string> split(const std::string &str, char delimiter) {
-  std::vector<std::string> tokens;
-  std::string token;
-  std::istringstream token_stream(str);
-
-  while (std::getline(token_stream, token, delimiter)) {
-    tokens.push_back(token);
-  }
-
-  return tokens;
-}
-
-std::string join(const std::vector<std::string> &vec,
-                 const std::string &delimiter) {
-  if (vec.empty())
-    return "";
-
-  std::stringstream ss;
-  ss << vec[0];
-
-  for (size_t i = 1; i < vec.size(); i++) {
-    ss << delimiter << vec[i];
-  }
-
-  return ss.str();
-}
-
-std::string trim(const std::string &str) {
-  auto start = std::find_if_not(str.begin(), str.end(), [](unsigned char ch) {
-    return std::isspace(ch);
-  });
-
-  auto end = std::find_if_not(str.rbegin(), str.rend(), [](unsigned char ch) {
-               return std::isspace(ch);
-             }).base();
-
-  return (start < end) ? std::string(start, end) : std::string();
-}
-
-std::string to_lower(const std::string &str) {
-  std::string result = str;
-  std::transform(result.begin(), result.end(), result.begin(),
-                 [](unsigned char c) { return std::tolower(c); });
-  return result;
-}
-
-std::string to_upper(const std::string &str) {
-  std::string result = str;
-  std::transform(result.begin(), result.end(), result.begin(),
-                 [](unsigned char c) { return std::toupper(c); });
-  return result;
-}
-
-std::string extract_fqdn(const std::string &domain) { return domain; }
-
-std::string extract_second_level_domain(const std::string &domain) {
-  auto labels = split(domain, '.');
-
-  if (labels.size() >= 2) {
-    return labels[labels.size() - 2] + "." + labels[labels.size() - 1];
-  }
-
-  return domain;
-}
-
-std::string extract_third_level_domain(const std::string &domain) {
-  auto labels = split(domain, '.');
-
-  if (labels.size() >= 3) {
-    return labels[0];
-  }
-
-  return "";
-}
-
-std::optional<std::string> extract_tld(const std::string &domain) {
-  auto labels = split(domain, '.');
-
-  if (!labels.empty()) {
-    return labels.back();
-  }
-
-  return std::nullopt;
-}
-
-std::string sha256_file(const std::string &filepath) {
-  std::ifstream file(filepath, std::ios::binary);
-  if (!file) {
-    return "";
-  }
-
-  SHA256_CTX sha256;
-  SHA256_Init(&sha256);
-
-  const size_t buffer_size = 8192;
-  char buffer[buffer_size];
-
-  while (file.read(buffer, buffer_size) || file.gcount() > 0) {
-    SHA256_Update(&sha256, buffer, file.gcount());
-  }
-
-  unsigned char hash[SHA256_DIGEST_LENGTH];
-  SHA256_Final(hash, &sha256);
-
-  std::stringstream ss;
-  for (int i = 0; i < SHA256_DIGEST_LENGTH; i++) {
-    ss << std::hex << std::setw(2) << std::setfill('0') << (int)hash[i];
-  }
-
-  return ss.str();
-}
-
-std::string sha256_string(const std::string &data) {
-  unsigned char hash[SHA256_DIGEST_LENGTH];
-  SHA256_CTX sha256;
-  SHA256_Init(&sha256);
-  SHA256_Update(&sha256, data.c_str(), data.size());
-  SHA256_Final(hash, &sha256);
-
-  std::stringstream ss;
-  for (int i = 0; i < SHA256_DIGEST_LENGTH; i++) {
-    ss << std::hex << std::setw(2) << std::setfill('0') << (int)hash[i];
-  }
-
-  return ss.str();
-}
-
-} // namespace utils
-} // namespace base
-} // namespace hamstring
diff --git a/cpp/src/config/config.cpp b/cpp/src/config/config.cpp
deleted file mode 100644
index 833ca889..00000000
--- a/cpp/src/config/config.cpp
+++ /dev/null
@@ -1,358 +0,0 @@
-#include "hamstring/config/config.hpp"
-#include <fstream>
-#include <sstream>
-
-namespace hamstring {
-namespace config {
-
-// ============================================================================
-// LoggingConfig
-// ============================================================================
-
-LoggingConfig LoggingConfig::from_yaml(const YAML::Node &node) {
-  LoggingConfig config;
-
-  if (node["base"]) {
-    config.base_debug = node["base"]["debug"].as<bool>(false);
-  }
-
-  if (node["modules"]) {
-    for (const auto &module : node["modules"]) {
-      std::string module_name = module.first.as<std::string>();
-      ModuleLoggingConfig module_config;
-      module_config.debug = module.second["debug"].as<bool>(false);
-      config.modules[module_name] = module_config;
-    }
-  }
-
-  return config;
-}
-
-// ============================================================================
-// KafkaBroker
-// ============================================================================
-
-KafkaBroker KafkaBroker::from_yaml(const YAML::Node &node) {
-  KafkaBroker broker;
-  broker.hostname = node["hostname"].as<std::string>();
-  broker.internal_port = node["internal_port"].as<int>();
-  broker.external_port = node["external_port"].as<int>();
-  broker.node_ip = node["node_ip"].as<std::string>();
-  return broker;
-}
-
-// ============================================================================
-// EnvironmentConfig
-// ============================================================================
-
-EnvironmentConfig EnvironmentConfig::from_yaml(const YAML::Node &node) {
-  EnvironmentConfig config;
-
-  if (node["kafka_brokers"]) {
-    for (const auto &broker_node : node["kafka_brokers"]) {
-      config.kafka_brokers.push_back(KafkaBroker::from_yaml(broker_node));
-    }
-  }
-
-  if (node["kafka_topics_prefix"]) {
-    for (const auto &topic : node["kafka_topics_prefix"]["pipeline"]) {
-      std::string topic_name = topic.first.as<std::string>();
-      std::string topic_value = topic.second.as<std::string>();
-      config.kafka_topics_prefix[topic_name] = topic_value;
-    }
-  }
-
-  if (node["monitoring"] && node["monitoring"]["clickhouse_server"]) {
-    config.clickhouse_hostname =
-        node["monitoring"]["clickhouse_server"]["hostname"].as<std::string>();
-  }
-
-  return config;
-}
-
-std::string EnvironmentConfig::get_kafka_bootstrap_servers() const {
-  std::stringstream ss;
-  for (size_t i = 0; i < kafka_brokers.size(); ++i) {
-    if (i > 0)
-      ss << ",";
-    ss << kafka_brokers[i].node_ip << ":" << kafka_brokers[i].external_port;
-  }
-  return ss.str();
-}
-
-// ============================================================================
-// FieldConfig
-// ============================================================================
-
-FieldConfig FieldConfig::from_yaml(const YAML::Node &node) {
-  FieldConfig config;
-
-  config.name = node[0].as<std::string>();
-  std::string type_str = node[1].as<std::string>();
-
-  // Determine field type
-  if (type_str == "RegEx") {
-    config.type = FieldType::RegEx;
-    config.pattern = node[2].as<std::string>();
-  } else if (type_str == "Timestamp") {
-    config.type = FieldType::Timestamp;
-    config.timestamp_format = node[2].as<std::string>();
-  } else if (type_str == "IpAddress") {
-    config.type = FieldType::IpAddress;
-  } else if (type_str == "ListItem") {
-    config.type = FieldType::ListItem;
-    for (const auto &item : node[2]) {
-      config.allowed_list.push_back(item.as<std::string>());
-    }
-    if (node.size() > 3) {
-      for (const auto &item : node[3]) {
-        config.relevant_list.push_back(item.as<std::string>());
-      }
-    }
-  }
-
-  return config;
-}
-
-// ============================================================================
-// BatchHandlerConfig
-// ============================================================================
-
-BatchHandlerConfig BatchHandlerConfig::from_yaml(const YAML::Node &node) {
-  BatchHandlerConfig config;
-
-  if (node["batch_size"]) {
-    config.batch_size = node["batch_size"].as<int>();
-  }
-  if (node["batch_timeout"]) {
-    config.batch_timeout = node["batch_timeout"].as<double>();
-  }
-  if (node["subnet_id"]) {
-    if (node["subnet_id"]["ipv4_prefix_length"]) {
-      config.ipv4_prefix_length =
-          node["subnet_id"]["ipv4_prefix_length"].as<int>();
-    }
-    if (node["subnet_id"]["ipv6_prefix_length"]) {
-      config.ipv6_prefix_length =
-          node["subnet_id"]["ipv6_prefix_length"].as<int>();
-    }
-  }
-
-  return config;
-}
-
-// ============================================================================
-// CollectorConfig
-// ============================================================================
-
-CollectorConfig
-CollectorConfig::from_yaml(const YAML::Node &node,
-                           const BatchHandlerConfig &default_config) {
-  CollectorConfig config;
-
-  config.name = node["name"].as<std::string>();
-  config.protocol_base = node["protocol_base"].as<std::string>();
-
-  if (node["required_log_information"]) {
-    for (const auto &field_node : node["required_log_information"]) {
-      config.required_log_information.push_back(
-          FieldConfig::from_yaml(field_node));
-    }
-  }
-
-  // Start with default config
-  config.batch_handler_config = default_config;
-
-  // Override with collector-specific config if present
-  if (node["batch_handler_config_override"]) {
-    auto override_node = node["batch_handler_config_override"];
-    if (override_node["batch_size"]) {
-      config.batch_handler_config.batch_size =
-          override_node["batch_size"].as<int>();
-    }
-    if (override_node["batch_timeout"]) {
-      config.batch_handler_config.batch_timeout =
-          override_node["batch_timeout"].as<double>();
-    }
-  }
-
-  return config;
-}
-
-// ============================================================================
-// PrefilterConfig
-// 
============================================================================ - -PrefilterConfig PrefilterConfig::from_yaml(const YAML::Node &node) { - PrefilterConfig config; - - config.name = node["name"].as(); - config.relevance_method = node["relevance_method"].as(); - config.collector_name = node["collector_name"].as(); - - return config; -} - -// ============================================================================ -// InspectorConfig -// ============================================================================ - -InspectorConfig InspectorConfig::from_yaml(const YAML::Node &node) { - InspectorConfig config; - - config.name = node["name"].as(); - config.inspector_module_name = - node["inspector_module_name"].as(); - config.inspector_class_name = node["inspector_class_name"].as(); - config.prefilter_name = node["prefilter_name"].as(); - config.mode = node["mode"].as(); - config.anomaly_threshold = node["anomaly_threshold"].as(); - config.score_threshold = node["score_threshold"].as(); - config.time_type = node["time_type"].as(); - config.time_range = node["time_range"].as(); - - if (node["models"]) { - config.models = YAML::Clone(node["models"]); - } - if (node["ensemble"]) { - config.ensemble = YAML::Clone(node["ensemble"]); - } - - return config; -} - -// ============================================================================ -// DetectorConfig -// ============================================================================ - -DetectorConfig DetectorConfig::from_yaml(const YAML::Node &node) { - DetectorConfig config; - - config.name = node["name"].as(); - config.detector_module_name = node["detector_module_name"].as(); - config.detector_class_name = node["detector_class_name"].as(); - config.model = node["model"].as(); - config.checksum = node["checksum"].as(); - config.base_url = node["base_url"].as(); - config.threshold = node["threshold"].as(); - config.inspector_name = node["inspector_name"].as(); - - return config; -} - -// ============================================================================ -// MonitoringConfig -// ============================================================================ - -MonitoringConfig MonitoringConfig::from_yaml(const YAML::Node &node) { - MonitoringConfig config; - - if (node["clickhouse_connector"]) { - auto ch_node = node["clickhouse_connector"]; - if (ch_node["batch_size"]) { - config.clickhouse_batch_size = ch_node["batch_size"].as(); - } - if (ch_node["batch_timeout"]) { - config.clickhouse_batch_timeout = ch_node["batch_timeout"].as(); - } - } - - return config; -} - -// ============================================================================ -// PipelineConfig -// ============================================================================ - -PipelineConfig PipelineConfig::from_yaml(const YAML::Node &node) { - PipelineConfig config; - - // Log storage - if (node["log_storage"] && node["log_storage"]["logserver"]) { - config.logserver_input_file = - node["log_storage"]["logserver"]["input_file"].as(""); - } - - // Log collection - if (node["log_collection"]) { - auto collection_node = node["log_collection"]; - - // Default batch handler config - if (collection_node["default_batch_handler_config"]) { - config.default_batch_handler_config = BatchHandlerConfig::from_yaml( - collection_node["default_batch_handler_config"]); - } - - // Collectors - if (collection_node["collectors"]) { - for (const auto &collector_node : collection_node["collectors"]) { - config.collectors.push_back(CollectorConfig::from_yaml( - 
collector_node, config.default_batch_handler_config)); - } - } - } - - // Prefilters - if (node["log_filtering"]) { - for (const auto &prefilter_node : node["log_filtering"]) { - config.prefilters.push_back(PrefilterConfig::from_yaml(prefilter_node)); - } - } - - // Inspectors - if (node["data_inspection"]) { - for (const auto &inspector_node : node["data_inspection"]) { - config.inspectors.push_back(InspectorConfig::from_yaml(inspector_node)); - } - } - - // Detectors - if (node["data_analysis"]) { - for (const auto &detector_node : node["data_analysis"]) { - config.detectors.push_back(DetectorConfig::from_yaml(detector_node)); - } - } - - // Monitoring - if (node["monitoring"]) { - config.monitoring = MonitoringConfig::from_yaml(node["monitoring"]); - } - - return config; -} - -// ============================================================================ -// Config (Root) -// ============================================================================ - -std::shared_ptr Config::load_from_file(const std::string &filepath) { - YAML::Node root = YAML::LoadFile(filepath); - return from_yaml(root); -} - -std::shared_ptr -Config::load_from_string(const std::string &yaml_content) { - YAML::Node root = YAML::Load(yaml_content); - return from_yaml(root); -} - -std::shared_ptr Config::from_yaml(const YAML::Node &root) { - auto config = std::make_shared(); - - if (root["logging"]) { - config->logging = LoggingConfig::from_yaml(root["logging"]); - } - - if (root["pipeline"]) { - config->pipeline = PipelineConfig::from_yaml(root["pipeline"]); - } - - if (root["environment"]) { - config->environment = EnvironmentConfig::from_yaml(root["environment"]); - } - - return config; -} - -} // namespace config -} // namespace hamstring diff --git a/cpp/src/detector/CMakeLists.txt b/cpp/src/detector/CMakeLists.txt deleted file mode 100644 index 1bf65e0a..00000000 --- a/cpp/src/detector/CMakeLists.txt +++ /dev/null @@ -1,36 +0,0 @@ -# Detector module -add_library(hamstring_detector - feature_extractor.cpp - detector.cpp - detector_service.cpp -) - -target_link_libraries(hamstring_detector - PUBLIC - hamstring_base - onnxruntime -) - -add_executable(test_detector test_detector.cpp) -target_link_libraries(test_detector PRIVATE hamstring_detector) - -add_executable(detector main.cpp) -target_link_libraries(detector - PRIVATE - hamstring_detector - hamstring_base -) -install(TARGETS detector DESTINATION bin) - -# TODO: Add detector executable when ready -# add_executable(detector -# main.cpp -# ) -# -# target_link_libraries(detector -# PRIVATE -# hamstring_detector -# hamstring_base -# ) -# -# install(TARGETS detector DESTINATION bin) diff --git a/cpp/src/detector/detector.cpp b/cpp/src/detector/detector.cpp deleted file mode 100644 index c3610751..00000000 --- a/cpp/src/detector/detector.cpp +++ /dev/null @@ -1,109 +0,0 @@ -#include "hamstring/detector/detector.hpp" -#include -#include -#include -#include - -namespace hamstring { -namespace detector { - -struct Detector::Impl { - Ort::Env env; - Ort::SessionOptions session_options; - std::unique_ptr session; - Ort::AllocatorWithDefaultOptions allocator; - - std::vector input_node_names; - std::vector output_node_names; - std::vector input_node_dims; - - Impl() - : env(ORT_LOGGING_LEVEL_WARNING, "HamstringDetector"), session_options() { - session_options.SetIntraOpNumThreads(1); - session_options.SetGraphOptimizationLevel( - GraphOptimizationLevel::ORT_ENABLE_BASIC); - } -}; - -Detector::Detector() : impl_(std::make_unique()) {} - -Detector::~Detector() = default; 
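// [editor's annotation] The Impl struct above appears to follow the PIMPL
// idiom, keeping ONNX Runtime types out of the public detector header. A
// leak-free sketch for the name handling in load_model() below (assuming
// onnxruntime's C++ API, where GetInputNameAllocated() returns an
// Ort::AllocatedStringPtr) would retain the smart pointers instead of
// calling strdup(), which is never freed:
//
//   std::vector<Ort::AllocatedStringPtr> owned_names;  // hypothetical extra member
//   auto name = session->GetInputNameAllocated(i, allocator);
//   input_node_names.push_back(name.get());   // raw pointer stays valid
//   owned_names.push_back(std::move(name));   // buffer freed with ~Impl, not leaked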
- -void Detector::load_model(const std::string &model_path) { - try { - impl_->session = std::make_unique( - impl_->env, model_path.c_str(), impl_->session_options); - - // Get input metadata - size_t num_input_nodes = impl_->session->GetInputCount(); - impl_->input_node_names.reserve(num_input_nodes); - - for (size_t i = 0; i < num_input_nodes; i++) { - auto input_name = - impl_->session->GetInputNameAllocated(i, impl_->allocator); - impl_->input_node_names.push_back( - strdup(input_name.get())); // Need to copy because smart pointer dies - - auto type_info = impl_->session->GetInputTypeInfo(i); - auto tensor_info = type_info.GetTensorTypeAndShapeInfo(); - impl_->input_node_dims = tensor_info.GetShape(); - } - - // Get output metadata - size_t num_output_nodes = impl_->session->GetOutputCount(); - impl_->output_node_names.reserve(num_output_nodes); - - for (size_t i = 0; i < num_output_nodes; i++) { - auto output_name = - impl_->session->GetOutputNameAllocated(i, impl_->allocator); - impl_->output_node_names.push_back(strdup(output_name.get())); - } - - spdlog::info("Loaded detector model from {}", model_path); - - } catch (const Ort::Exception &e) { - spdlog::error("Failed to load ONNX model: {}", e.what()); - throw std::runtime_error("Failed to load ONNX model: " + - std::string(e.what())); - } -} - -float Detector::predict(const std::string &domain) { - if (!impl_->session) { - throw std::runtime_error("Model not loaded"); - } - - // Extract features - auto features = feature_extractor_.extract(domain); - std::vector input_tensor_values = features.to_vector(); - - // Create input tensor - // Assuming batch size of 1 - std::vector input_shape = { - 1, static_cast(input_tensor_values.size())}; - - auto memory_info = - Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); - auto input_tensor = Ort::Value::CreateTensor( - memory_info, input_tensor_values.data(), input_tensor_values.size(), - input_shape.data(), input_shape.size()); - - // Run inference - auto output_tensors = impl_->session->Run( - Ort::RunOptions{nullptr}, impl_->input_node_names.data(), &input_tensor, - 1, impl_->output_node_names.data(), 1); - - // Get output - // Assuming output is a single float (probability) or [1, 1] tensor - float *floatarr = output_tensors.front().GetTensorMutableData(); - - // If model outputs logits/probabilities, we might need to process. - // Assuming the model outputs a probability score directly for now. - // If it outputs 2 values (prob class 0, prob class 1), we'd take the second. - // Let's assume for now it's a single value. - - return floatarr[0]; -} - -} // namespace detector -} // namespace hamstring diff --git a/cpp/src/detector/detector_service.cpp b/cpp/src/detector/detector_service.cpp deleted file mode 100644 index 6583926d..00000000 --- a/cpp/src/detector/detector_service.cpp +++ /dev/null @@ -1,198 +0,0 @@ -#include "hamstring/detector/detector_service.hpp" -#include "hamstring/base/utils.hpp" -#include - -using json = nlohmann::json; - -namespace hamstring { -namespace detector { - -DetectorService::DetectorService(const std::string &name, - const std::string &consume_topic, - const std::string &model_path, - double threshold, - std::shared_ptr config, - const std::string &bootstrap_servers, - const std::string &group_id) - : name_(name), consume_topic_(consume_topic), model_path_(model_path), - threshold_(threshold), config_(config) { - logger_ = base::Logger::get_logger("detector." 
+ name_); - - // Initialize detector - try { - detector_.load_model(model_path_); - } catch (const std::exception &e) { - logger_->error("Failed to load model from {}: {}", model_path_, e.what()); - // We might want to throw here or continue with a broken detector - throw; - } - - // Create Kafka consumer - consumer_ = std::make_unique( - bootstrap_servers, group_id, std::vector{consume_topic_}); - - // Create ClickHouse sender - clickhouse_ = std::make_shared( - config_->environment.clickhouse_hostname, 9000, "hamstring", "default", - ""); - - logger_->info("DetectorService '{}' created", name_); - logger_->info(" Model: {}", model_path_); - logger_->info(" Threshold: {}", threshold_); - logger_->info(" Consume topic: {}", consume_topic_); -} - -DetectorService::~DetectorService() { stop(); } - -void DetectorService::start() { - if (running_) { - logger_->warn("DetectorService already running"); - return; - } - - running_ = true; - - logger_->info("DetectorService '{}' started", name_); - logger_->info(" ⤷ receiving on Kafka topic '{}'", consume_topic_); - - // Start worker thread - worker_thread_ = std::thread(&DetectorService::consume_loop, this); -} - -void DetectorService::stop() { - if (!running_) { - return; - } - - logger_->info("Stopping DetectorService '{}'...", name_); - running_ = false; - - if (worker_thread_.joinable()) { - worker_thread_.join(); - } - - logger_->info("DetectorService '{}' stopped", name_); - logger_->info(" Batches consumed: {}", batches_consumed_.load()); - logger_->info(" Domains scanned: {}", domains_scanned_.load()); - logger_->info(" Domains detected: {}", domains_detected_.load()); -} - -void DetectorService::consume_loop() { - logger_->info("Consumer loop started"); - - while (running_) { - try { - consumer_->poll( - [this](const std::string &topic, const std::string &key, - const std::string &value, int64_t timestamp) { - if (!running_) - return; - - try { - auto batch_ptr = base::Batch::from_json(value); - process_batch(*batch_ptr); - consumer_->commit(); - } catch (const std::exception &e) { - logger_->error("Failed to process batch: {}", e.what()); - } - }, - 1000); - - } catch (const std::exception &e) { - logger_->error("Error in consumer loop: {}", e.what()); - std::this_thread::sleep_for(std::chrono::seconds(1)); - } - } - - logger_->info("Consumer loop stopped"); -} - -void DetectorService::process_batch(const base::Batch &batch) { - batches_consumed_++; - - for (const auto &logline : batch.loglines) { - auto it = logline->fields.find( - "query"); // Assuming 'query' field holds the domain - if (it != logline->fields.end()) { - std::string domain = it->second; - - // Remove trailing dot if present - if (!domain.empty() && domain.back() == '.') { - domain.pop_back(); - } - - domains_scanned_++; - - try { - float score = detector_.predict(domain); - - if (score > threshold_) { - domains_detected_++; - logger_->info("DGA DETECTED: {} (score: {:.4f})", domain, score); - - // TODO: Send to output topic or ClickHouse - // For now just logging is fine as per requirements "Integrate ... 
in - // C++" We could add a "detected_dga" table in ClickHouse - } - } catch (const std::exception &e) { - logger_->warn("Prediction failed for {}: {}", domain, e.what()); - } - } - } -} - -DetectorService::Stats DetectorService::get_stats() const { - Stats stats; - stats.batches_consumed = batches_consumed_.load(); - stats.domains_scanned = domains_scanned_.load(); - stats.domains_detected = domains_detected_.load(); - return stats; -} - -std::vector> -create_detector_services(std::shared_ptr config) { - std::vector> services; - auto logger = base::Logger::get_logger("detector.factory"); - - auto &env = config->environment; - std::string consume_prefix = env.kafka_topics_prefix["inspector_to_detector"]; - - // Build bootstrap servers string - std::vector broker_addresses; - for (const auto &broker : env.kafka_brokers) { - broker_addresses.push_back(broker.hostname + ":" + - std::to_string(broker.internal_port)); - } - std::string bootstrap_servers; - for (size_t i = 0; i < broker_addresses.size(); ++i) { - if (i > 0) - bootstrap_servers += ","; - bootstrap_servers += broker_addresses[i]; - } - - logger->info("Creating DetectorServices for {} detectors", - config->pipeline.detectors.size()); - - for (const auto &detector_config : config->pipeline.detectors) { - // Topic convention: inspector_to_detector-{detector_name} - // But wait, Inspector produces to `inspector_to_detector-{detector_name}`. - // So we consume from that. - std::string consume_topic = consume_prefix + "-" + detector_config.name; - std::string group_id = "detector-" + detector_config.name; - - logger->info("Creating DetectorService '{}'", detector_config.name); - logger->info(" Consume from: {}", consume_topic); - logger->info(" Model: {}", detector_config.model); - - auto service = std::make_shared( - detector_config.name, consume_topic, detector_config.model, - detector_config.threshold, config, bootstrap_servers, group_id); - - services.push_back(service); - } - - return services; -} - -} // namespace detector -} // namespace hamstring diff --git a/cpp/src/detector/feature_extractor.cpp b/cpp/src/detector/feature_extractor.cpp deleted file mode 100644 index 8e0e95b8..00000000 --- a/cpp/src/detector/feature_extractor.cpp +++ /dev/null @@ -1,293 +0,0 @@ -#include "hamstring/detector/feature_extractor.hpp" -#include -#include -#include -#include - -namespace hamstring { -namespace detector { - -std::vector DomainFeatures::to_vector() const { - std::vector vec; - vec.reserve(50); // Approximate size - - // Label statistics - vec.push_back(static_cast(label_length)); - vec.push_back(static_cast(label_max)); - vec.push_back(static_cast(label_average)); - - // Character frequencies (a-z) - for (char c = 'a'; c <= 'z'; ++c) { - auto it = char_freq.find(c); - vec.push_back(it != char_freq.end() ? 
static_cast(it->second) - : 0.0f); - } - - // FQDN counts - vec.push_back(static_cast(fqdn_full_count)); - vec.push_back(static_cast(fqdn_alpha_count)); - vec.push_back(static_cast(fqdn_numeric_count)); - vec.push_back(static_cast(fqdn_special_count)); - - // Second level domain counts - vec.push_back(static_cast(secondleveldomain_full_count)); - vec.push_back(static_cast(secondleveldomain_alpha_count)); - vec.push_back(static_cast(secondleveldomain_numeric_count)); - vec.push_back(static_cast(secondleveldomain_special_count)); - - // Third level domain counts - vec.push_back(static_cast(thirdleveldomain_full_count)); - vec.push_back(static_cast(thirdleveldomain_alpha_count)); - vec.push_back(static_cast(thirdleveldomain_numeric_count)); - vec.push_back(static_cast(thirdleveldomain_special_count)); - - // Entropy - vec.push_back(static_cast(fqdn_entropy)); - vec.push_back(static_cast(secondleveldomain_entropy)); - vec.push_back(static_cast(thirdleveldomain_entropy)); - - return vec; -} - -std::vector DomainFeatures::get_feature_names() { - std::vector names; - - names.push_back("label_length"); - names.push_back("label_max"); - names.push_back("label_average"); - - for (char c = 'a'; c <= 'z'; ++c) { - names.push_back(std::string("freq_") + c); - } - - names.push_back("fqdn_full_count"); - names.push_back("fqdn_alpha_count"); - names.push_back("fqdn_numeric_count"); - names.push_back("fqdn_special_count"); - - names.push_back("secondleveldomain_full_count"); - names.push_back("secondleveldomain_alpha_count"); - names.push_back("secondleveldomain_numeric_count"); - names.push_back("secondleveldomain_special_count"); - - names.push_back("thirdleveldomain_full_count"); - names.push_back("thirdleveldomain_alpha_count"); - names.push_back("thirdleveldomain_numeric_count"); - names.push_back("thirdleveldomain_special_count"); - - names.push_back("fqdn_entropy"); - names.push_back("secondleveldomain_entropy"); - names.push_back("thirdleveldomain_entropy"); - - return names; -} - -DomainFeatures FeatureExtractor::extract(const std::string &domain) const { - DomainFeatures features; - - // Label statistics - features.label_length = count_labels(domain); - features.label_max = get_max_label_length(domain); - features.label_average = get_average_label_length(domain); - - // Character frequency - features.char_freq = calculate_char_frequency(domain); - - // Extract domain levels - std::string fqdn = extract_fqdn(domain); - std::string sld = extract_second_level_domain(domain); - std::string tld_domain = extract_third_level_domain(domain); - - // FQDN counts - if (!fqdn.empty()) { - features.fqdn_full_count = 1.0; - features.fqdn_alpha_count = calculate_alpha_ratio(fqdn); - features.fqdn_numeric_count = calculate_numeric_ratio(fqdn); - features.fqdn_special_count = calculate_special_ratio(fqdn); - features.fqdn_entropy = calculate_entropy(fqdn); - } - - // Second level domain counts - if (!sld.empty()) { - features.secondleveldomain_full_count = 1.0; - features.secondleveldomain_alpha_count = calculate_alpha_ratio(sld); - features.secondleveldomain_numeric_count = calculate_numeric_ratio(sld); - features.secondleveldomain_special_count = calculate_special_ratio(sld); - features.secondleveldomain_entropy = calculate_entropy(sld); - } - - // Third level domain counts - if (!tld_domain.empty()) { - features.thirdleveldomain_full_count = 1.0; - features.thirdleveldomain_alpha_count = calculate_alpha_ratio(tld_domain); - features.thirdleveldomain_numeric_count = - calculate_numeric_ratio(tld_domain); - 
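// [editor's annotation] Despite their names, the *_alpha_count /
// *_numeric_count / *_special_count fields hold length-normalized ratios in
// [0, 1] (see the calculate_*_ratio helpers below), not absolute counts.
// High numeric-character and entropy values are the typical signal these
// features are meant to expose for DGA-generated domains.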
features.thirdleveldomain_special_count = - calculate_special_ratio(tld_domain); - features.thirdleveldomain_entropy = calculate_entropy(tld_domain); - } - - return features; -} - -int FeatureExtractor::count_labels(const std::string &domain) const { - if (domain.empty()) - return 0; - - int count = 1; - for (char c : domain) { - if (c == '.') - count++; - } - return count; -} - -int FeatureExtractor::get_max_label_length(const std::string &domain) const { - std::istringstream iss(domain); - std::string label; - int max_len = 0; - - while (std::getline(iss, label, '.')) { - max_len = std::max(max_len, static_cast(label.length())); - } - - return max_len; -} - -double -FeatureExtractor::get_average_label_length(const std::string &domain) const { - // Remove dots and calculate average - std::string without_dots; - for (char c : domain) { - if (c != '.') - without_dots += c; - } - - return static_cast(without_dots.length()); -} - -std::map -FeatureExtractor::calculate_char_frequency(const std::string &domain) const { - std::map freq; - std::string lower_domain; - - // Convert to lowercase - for (char c : domain) { - lower_domain += std::tolower(c); - } - - // Count occurrences - for (char c = 'a'; c <= 'z'; ++c) { - int count = std::count(lower_domain.begin(), lower_domain.end(), c); - freq[c] = - domain.empty() ? 0.0 : static_cast(count) / domain.length(); - } - - return freq; -} - -double FeatureExtractor::calculate_alpha_ratio(const std::string &text) const { - if (text.empty()) - return 0.0; - - int alpha_count = 0; - for (char c : text) { - if (std::isalpha(c)) - alpha_count++; - } - - return static_cast(alpha_count) / text.length(); -} - -double -FeatureExtractor::calculate_numeric_ratio(const std::string &text) const { - if (text.empty()) - return 0.0; - - int numeric_count = 0; - for (char c : text) { - if (std::isdigit(c)) - numeric_count++; - } - - return static_cast(numeric_count) / text.length(); -} - -double -FeatureExtractor::calculate_special_ratio(const std::string &text) const { - if (text.empty()) - return 0.0; - - int special_count = 0; - for (char c : text) { - if (!std::isalnum(c) && !std::isspace(c)) - special_count++; - } - - return static_cast(special_count) / text.length(); -} - -double FeatureExtractor::calculate_entropy(const std::string &text) const { - if (text.empty()) - return 0.0; - - // Calculate character probabilities - std::map char_counts; - for (char c : text) { - char_counts[c]++; - } - - // Calculate entropy using Shannon formula - double entropy = 0.0; - double log2_base = std::log(2.0); - - for (const auto &[ch, count] : char_counts) { - double prob = static_cast(count) / text.length(); - entropy += -prob * (std::log(prob) / log2_base); - } - - return entropy; -} - -std::string FeatureExtractor::extract_fqdn(const std::string &domain) const { - return domain; -} - -std::string -FeatureExtractor::extract_second_level_domain(const std::string &domain) const { - // Extract second level domain (e.g., "example.com" from "www.example.com") - std::istringstream iss(domain); - std::vector labels; - std::string label; - - while (std::getline(iss, label, '.')) { - labels.push_back(label); - } - - if (labels.size() >= 2) { - return labels[labels.size() - 2] + "." 
+ labels[labels.size() - 1]; - } - - return domain; -} - -std::string -FeatureExtractor::extract_third_level_domain(const std::string &domain) const { - // Extract third level domain (e.g., "www" from "www.example.com") - std::istringstream iss(domain); - std::vector labels; - std::string label; - - while (std::getline(iss, label, '.')) { - labels.push_back(label); - } - - if (labels.size() >= 3) { - return labels[0]; - } - - return ""; -} - -} // namespace detector -} // namespace hamstring diff --git a/cpp/src/detector/main.cpp b/cpp/src/detector/main.cpp deleted file mode 100644 index a8cf8532..00000000 --- a/cpp/src/detector/main.cpp +++ /dev/null @@ -1,85 +0,0 @@ -#include "hamstring/base/logger.hpp" -#include "hamstring/config/config.hpp" -#include "hamstring/detector/detector_service.hpp" -#include -#include -#include -#include - -using namespace hamstring; - -std::vector> services; -volatile sig_atomic_t shutdown_requested = 0; - -void signal_handler(int signum) { - std::cout << std::endl; - auto logger = base::Logger::get_logger("main"); - logger->info("Received signal {}, shutting down...", signum); - shutdown_requested = 1; - - for (auto &svc : services) { - svc->stop(); - } -} - -int main(int argc, char *argv[]) { - // Set up signal handlers - std::signal(SIGINT, signal_handler); - std::signal(SIGTERM, signal_handler); - - auto logger = base::Logger::get_logger("main"); - - logger->info("╔════════════════════════════════════════╗"); - logger->info("║ HAMSTRING Detector (C++) ║"); - logger->info("╚════════════════════════════════════════╝"); - logger->info(""); - - // Load configuration - std::string config_path = (argc > 1) ? argv[1] : "../../config.yaml"; - logger->info("Loading configuration from: {}", config_path); - - try { - auto config = config::Config::load_from_file(config_path); - logger->info("Configuration loaded successfully"); - - // Create services - services = detector::create_detector_services(config); - - if (services.empty()) { - logger->warn("No detectors configured"); - return 0; - } - - // Start services - for (auto &svc : services) { - svc->start(); - } - - logger->info("All DetectorServices started"); - logger->info("Press Ctrl+C to stop"); - - // Main loop - while (!shutdown_requested) { - std::this_thread::sleep_for(std::chrono::seconds(10)); - - if (!shutdown_requested) { - logger->info("=== Detector Statistics === "); - for (const auto &svc : services) { - auto stats = svc->get_stats(); - logger->info("Detector stats:"); - logger->info(" Batches consumed: {}", stats.batches_consumed); - logger->info(" Domains scanned: {}", stats.domains_scanned); - logger->info(" Domains detected: {}", stats.domains_detected); - } - } - } - - logger->info("Shutdown complete"); - - } catch (const std::exception &e) { - logger->error("Fatal error: {}", e.what()); - return 1; - } - - return 0; -} diff --git a/cpp/src/inspector/CMakeLists.txt b/cpp/src/inspector/CMakeLists.txt deleted file mode 100644 index 12802b1d..00000000 --- a/cpp/src/inspector/CMakeLists.txt +++ /dev/null @@ -1,21 +0,0 @@ -# Inspector library -add_library(hamstring_inspector - inspector.cpp - anomaly_detector.cpp -) - -target_link_libraries(hamstring_inspector - PUBLIC - hamstring_base -) - -# Inspector executable -add_executable(inspector - main.cpp -) - -target_link_libraries(inspector - PRIVATE - hamstring_inspector - hamstring_base -) diff --git a/cpp/src/inspector/anomaly_detector.cpp b/cpp/src/inspector/anomaly_detector.cpp deleted file mode 100644 index 4609837d..00000000 --- 
a/cpp/src/inspector/anomaly_detector.cpp +++ /dev/null @@ -1,300 +0,0 @@ -#include "hamstring/inspector/anomaly_detector.hpp" -#include -#include -#include - -namespace hamstring { -namespace inspector { - -AnomalyDetector::AnomalyDetector(const config::InspectorConfig &config) - : config_(config), window_size_(100), // Default rolling window size - z_score_threshold_(3.0) // 3-sigma rule (99.7% of normal data) -{ - // Override defaults from config if available - if (config_.time_range > 0) { - window_size_ = static_cast(config_.time_range); - } -} - -BatchMetrics AnomalyDetector::extract_metrics(const base::Batch &batch) { - BatchMetrics metrics{}; - - if (batch.loglines.empty()) { - return metrics; - } - - metrics.total_queries = batch.loglines.size(); - - size_t nxdomain_count = 0; - double total_length = 0.0; - std::unordered_set unique_domains; - size_t numeric_chars = 0; - size_t special_chars = 0; - size_t total_chars = 0; - std::string all_domains; - - for (const auto &logline : batch.loglines) { - // Extract query/domain - auto query_it = logline->fields.find("query"); - if (query_it != logline->fields.end()) { - std::string domain = query_it->second; - - // Remove trailing dot - if (!domain.empty() && domain.back() == '.') { - domain.pop_back(); - } - - total_length += domain.length(); - unique_domains.insert(domain); - all_domains += domain; - - // Count character types - for (char c : domain) { - total_chars++; - if (std::isdigit(c)) { - numeric_chars++; - } else if (!std::isalnum(c) && c != '.' && c != '-') { - special_chars++; - } - } - } - - // Check for NXDOMAIN - auto rcode_it = logline->fields.find("rcode"); - if (rcode_it != logline->fields.end()) { - if (rcode_it->second == "NXDOMAIN" || rcode_it->second == "3") { - nxdomain_count++; - } - } - } - - // Calculate metrics - metrics.nxdomain_rate = - static_cast(nxdomain_count) / metrics.total_queries; - metrics.avg_domain_length = total_length / metrics.total_queries; - metrics.unique_domain_ratio = - static_cast(unique_domains.size()) / metrics.total_queries; - - if (total_chars > 0) { - metrics.numeric_char_ratio = - static_cast(numeric_chars) / total_chars; - metrics.special_char_ratio = - static_cast(special_chars) / total_chars; - } - - // Calculate entropy of all domains combined - metrics.domain_entropy = calculate_entropy(all_domains); - - // Calculate query rate (queries per second) - // Assuming batch represents ~1 second of traffic for simplicity - metrics.query_rate = metrics.total_queries; - - return metrics; -} - -double AnomalyDetector::calculate_entropy(const std::string &str) { - if (str.empty()) { - return 0.0; - } - - // Count character frequencies - std::unordered_map freq; - for (char c : str) { - freq[c]++; - } - - // Calculate Shannon entropy - double entropy = 0.0; - double len = static_cast(str.length()); - - for (const auto &[ch, count] : freq) { - double prob = static_cast(count) / len; - entropy -= prob * std::log2(prob); - } - - return entropy; -} - -double AnomalyDetector::calculate_z_score(double value, double mean, - double stddev) { - if (stddev == 0.0) { - return 0.0; - } - return (value - mean) / stddev; -} - -void AnomalyDetector::update_rolling_stats() { - // Calculate mean and stddev for each metric - - // NXDOMAIN rate - if (!stats_.nxdomain_rates.empty()) { - double sum = 0.0; - for (double val : stats_.nxdomain_rates) { - sum += val; - } - stats_.mean_nxdomain = sum / stats_.nxdomain_rates.size(); - - double variance = 0.0; - for (double val : stats_.nxdomain_rates) { - variance += 
(val - stats_.mean_nxdomain) * (val - stats_.mean_nxdomain); - } - stats_.stddev_nxdomain = std::sqrt(variance / stats_.nxdomain_rates.size()); - } - - // Domain length - if (!stats_.avg_domain_lengths.empty()) { - double sum = 0.0; - for (double val : stats_.avg_domain_lengths) { - sum += val; - } - stats_.mean_domain_length = sum / stats_.avg_domain_lengths.size(); - - double variance = 0.0; - for (double val : stats_.avg_domain_lengths) { - variance += - (val - stats_.mean_domain_length) * (val - stats_.mean_domain_length); - } - stats_.stddev_domain_length = - std::sqrt(variance / stats_.avg_domain_lengths.size()); - } - - // Entropy - if (!stats_.entropies.empty()) { - double sum = 0.0; - for (double val : stats_.entropies) { - sum += val; - } - stats_.mean_entropy = sum / stats_.entropies.size(); - - double variance = 0.0; - for (double val : stats_.entropies) { - variance += (val - stats_.mean_entropy) * (val - stats_.mean_entropy); - } - stats_.stddev_entropy = std::sqrt(variance / stats_.entropies.size()); - } -} - -double AnomalyDetector::detect_anomalies(const BatchMetrics &metrics) { - double anomaly_score = 0.0; - int score_count = 0; - - // Need minimum samples for statistical detection - if (stats_.nxdomain_rates.size() < 10) { - // Use simple threshold-based detection for cold start - if (metrics.nxdomain_rate > config_.anomaly_threshold) { - anomaly_score += 0.5; - score_count++; - } - - if (metrics.avg_domain_length > 30.0) { - anomaly_score += 0.3; - score_count++; - } - - if (metrics.domain_entropy > 4.5) { - anomaly_score += 0.2; - score_count++; - } - - return score_count > 0 ? anomaly_score : 0.0; - } - - // Z-score based detection - - // 1. NXDOMAIN rate anomaly - double z_nxdomain = calculate_z_score( - metrics.nxdomain_rate, stats_.mean_nxdomain, stats_.stddev_nxdomain); - - if (std::abs(z_nxdomain) > z_score_threshold_) { - // Normalize to [0, 1] - anomaly_score += - std::min(1.0, std::abs(z_nxdomain) / (z_score_threshold_ * 2)); - score_count++; - } - - // 2. Domain length anomaly - double z_length = - calculate_z_score(metrics.avg_domain_length, stats_.mean_domain_length, - stats_.stddev_domain_length); - - if (std::abs(z_length) > z_score_threshold_) { - anomaly_score += - std::min(1.0, std::abs(z_length) / (z_score_threshold_ * 2)); - score_count++; - } - - // 3. Entropy anomaly - double z_entropy = calculate_z_score( - metrics.domain_entropy, stats_.mean_entropy, stats_.stddev_entropy); - - if (std::abs(z_entropy) > z_score_threshold_) { - anomaly_score += - std::min(1.0, std::abs(z_entropy) / (z_score_threshold_ * 2)); - score_count++; - } - - // 4. Threshold-based rules (additional signals) - if (metrics.nxdomain_rate > 0.7) { - anomaly_score += 0.5; - score_count++; - } - - if (metrics.numeric_char_ratio > 0.3) { - anomaly_score += 0.2; - score_count++; - } - - // Average the scores - return score_count > 0 ? 
(anomaly_score / score_count) : 0.0; -} - -double AnomalyDetector::analyze_batch(const base::Batch &batch) { - // Extract metrics - BatchMetrics metrics = extract_metrics(batch); - - // Detect anomalies - double score = detect_anomalies(metrics); - - return score; -} - -void AnomalyDetector::update_state(const base::Batch &batch) { - // Extract metrics - BatchMetrics metrics = extract_metrics(batch); - - // Update rolling windows - stats_.nxdomain_rates.push_back(metrics.nxdomain_rate); - stats_.avg_domain_lengths.push_back(metrics.avg_domain_length); - stats_.query_rates.push_back(metrics.query_rate); - stats_.entropies.push_back(metrics.domain_entropy); - - // Maintain window size - if (stats_.nxdomain_rates.size() > window_size_) { - stats_.nxdomain_rates.pop_front(); - } - if (stats_.avg_domain_lengths.size() > window_size_) { - stats_.avg_domain_lengths.pop_front(); - } - if (stats_.query_rates.size() > window_size_) { - stats_.query_rates.pop_front(); - } - if (stats_.entropies.size() > window_size_) { - stats_.entropies.pop_front(); - } - - // Recalculate statistics - update_rolling_stats(); -} - -AnomalyDetector::Statistics AnomalyDetector::get_statistics() const { - Statistics stats; - stats.mean_nxdomain_rate = stats_.mean_nxdomain; - stats.stddev_nxdomain_rate = stats_.stddev_nxdomain; - stats.mean_domain_length = stats_.mean_domain_length; - stats.stddev_domain_length = stats_.stddev_domain_length; - stats.samples_count = stats_.nxdomain_rates.size(); - return stats; -} - -} // namespace inspector -} // namespace hamstring diff --git a/cpp/src/inspector/inspector.cpp b/cpp/src/inspector/inspector.cpp deleted file mode 100644 index 439e758a..00000000 --- a/cpp/src/inspector/inspector.cpp +++ /dev/null @@ -1,316 +0,0 @@ -#include "hamstring/inspector/inspector.hpp" -#include "hamstring/base/utils.hpp" -#include -#include - -using json = nlohmann::json; - -namespace hamstring { -namespace inspector { - -Inspector::Inspector(const std::string &name, const std::string &consume_topic, - const std::vector &produce_topics, - const std::string &mode, double anomaly_threshold, - double score_threshold, - std::shared_ptr config, - const std::string &bootstrap_servers, - const std::string &group_id) - : name_(name), consume_topic_(consume_topic), - produce_topics_(produce_topics), mode_(mode), - anomaly_threshold_(anomaly_threshold), score_threshold_(score_threshold), - config_(config) { - logger_ = base::Logger::get_logger("inspector." 
+ name_); - - // Create Kafka consumer - consumer_ = std::make_unique( - bootstrap_servers, group_id, std::vector{consume_topic_}); - - // Create Kafka producers for each output topic - for (const auto &topic : produce_topics_) { - auto producer = - std::make_unique(bootstrap_servers, topic); - producers_.push_back(std::move(producer)); - } - - // Create ClickHouse sender for monitoring - clickhouse_ = std::make_shared( - config_->environment.clickhouse_hostname, 9000, "hamstring", "default", - ""); - - // Create anomaly detector with inspector config - config::InspectorConfig inspector_config; - inspector_config.anomaly_threshold = anomaly_threshold_; - inspector_config.score_threshold = score_threshold_; - inspector_config.time_range = 100; // Default window size - anomaly_detector_ = std::make_unique(inspector_config); - - logger_->info("Inspector '{}' created", name_); - logger_->info(" Mode: {}", mode_); - logger_->info(" Anomaly threshold: {}", anomaly_threshold_); - logger_->info(" Score threshold: {}", score_threshold_); - logger_->info(" Consume topic: {}", consume_topic_); - logger_->info(" Produce topics: {}", produce_topics_.size()); -} - -Inspector::~Inspector() { stop(); } - -void Inspector::start() { - if (running_) { - logger_->warn("Inspector already running"); - return; - } - - running_ = true; - - logger_->info("Inspector '{}' started", name_); - logger_->info(" ⤷ receiving on Kafka topic '{}'", consume_topic_); - logger_->info(" ⤷ sending to {} topics", produce_topics_.size()); - for (const auto &topic : produce_topics_) { - logger_->info(" - {}", topic); - } - - // Start worker thread - worker_thread_ = std::thread(&Inspector::consume_loop, this); -} - -void Inspector::stop() { - if (!running_) { - return; - } - - logger_->info("Stopping Inspector '{}'...", name_); - running_ = false; - - if (worker_thread_.joinable()) { - worker_thread_.join(); - } - - // Flush all producers - for (auto &producer : producers_) { - producer->flush(); - } - - logger_->info("Inspector '{}' stopped", name_); - logger_->info(" Batches consumed: {}", batches_consumed_.load()); - logger_->info(" Batches suspicious: {}", batches_suspicious_.load()); - logger_->info(" Batches filtered: {}", batches_filtered_.load()); - logger_->info(" Suspicious batches sent: {}", - suspicious_batches_sent_.load()); -} - -void Inspector::consume_loop() { - logger_->info("Consumer loop started"); - - while (running_) { - try { - // Poll for messages with 1 second timeout - consumer_->poll( - [this](const std::string &topic, const std::string &key, - const std::string &value, int64_t timestamp) { - if (!running_) - return; - - logger_->trace("Consumed batch from topic '{}'", topic); - - try { - // Parse batch JSON - auto batch_ptr = base::Batch::from_json(value); - - logger_->debug("Received batch {} with {} loglines", - batch_ptr->batch_id, batch_ptr->loglines.size()); - - // Process the batch - process_batch(*batch_ptr); - - // Commit the offset - consumer_->commit(); - - } catch (const std::exception &e) { - logger_->error("Failed to process batch: {}", e.what()); - } - }, - 1000); // 1 second timeout - - } catch (const std::exception &e) { - logger_->error("Error in consumer loop: {}", e.what()); - std::this_thread::sleep_for(std::chrono::seconds(1)); - } - } - - logger_->info("Consumer loop stopped"); -} - -void Inspector::process_batch(const base::Batch &batch) { - batches_consumed_++; - - // Check if batch is suspicious - if (is_suspicious(batch)) { - batches_suspicious_++; - - // Group log lines by 
source IP - std::map>> - batches_by_ip; - - for (const auto &logline_ptr : batch.loglines) { - auto it = logline_ptr->fields.find("src_ip"); - if (it != logline_ptr->fields.end()) { - std::string src_ip = it->second; - batches_by_ip[src_ip].push_back(logline_ptr); - } else { - // If no src_ip, use "unknown" - batches_by_ip["unknown"].push_back(logline_ptr); - } - } - - logger_->info("Batch {} is suspicious - grouped into {} IP buckets", - batch.batch_id, batches_by_ip.size()); - - // Send suspicious batches - send_suspicious_batches(batches_by_ip, batch.batch_id); - - } else { - batches_filtered_++; - logger_->debug("Batch {} filtered out (not suspicious)", batch.batch_id); - } -} - -bool Inspector::is_suspicious(const base::Batch &batch) { - if (batch.loglines.empty()) { - return false; - } - - // Use statistical anomaly detection - double suspicion_score = anomaly_detector_->analyze_batch(batch); - - // Update detector state for future analysis - anomaly_detector_->update_state(batch); - - // Log detailed statistics periodically - if (batches_consumed_ % 100 == 0) { - auto stats = anomaly_detector_->get_statistics(); - logger_->debug( - "Anomaly detector stats: NXDOMAIN mean={:.3f} stddev={:.3f}, " - "domain_length mean={:.1f} stddev={:.1f}, samples={}", - stats.mean_nxdomain_rate, stats.stddev_nxdomain_rate, - stats.mean_domain_length, stats.stddev_domain_length, - stats.samples_count); - } - - bool is_anomalous = suspicion_score > anomaly_threshold_; - - if (is_anomalous) { - logger_->info("Batch {} is SUSPICIOUS (score: {:.3f})", batch.batch_id, - suspicion_score); - } else { - logger_->debug("Batch {} is normal (score: {:.3f})", batch.batch_id, - suspicion_score); - } - - return is_anomalous; -} - -void Inspector::send_suspicious_batches( - const std::map>> - &batches_by_ip, - const std::string &original_batch_id) { - for (const auto &[src_ip, loglines] : batches_by_ip) { - // Create a new batch for this IP - base::Batch suspicious_batch; - suspicious_batch.batch_id = base::utils::generate_uuid(); - suspicious_batch.subnet_id = src_ip; - suspicious_batch.loglines = loglines; - suspicious_batch.created_at = std::chrono::system_clock::now(); - suspicious_batch.timestamp_in = std::chrono::system_clock::now(); - - std::string batch_json = suspicious_batch.to_json(); - - logger_->info("Sending suspicious batch {} for IP {} ({} loglines)", - suspicious_batch.batch_id, src_ip, loglines.size()); - - // Send to all output topics - for (size_t i = 0; i < producers_.size(); ++i) { - try { - producers_[i]->send(suspicious_batch.batch_id, batch_json); - logger_->trace("Sent suspicious batch {} to topic {}", - suspicious_batch.batch_id, produce_topics_[i]); - } catch (const std::exception &e) { - logger_->error("Failed to send suspicious batch to {}: {}", - produce_topics_[i], e.what()); - } - } - - suspicious_batches_sent_++; - } -} - -Inspector::Stats Inspector::get_stats() const { - Stats stats; - stats.batches_consumed = batches_consumed_.load(); - stats.batches_suspicious = batches_suspicious_.load(); - stats.batches_filtered = batches_filtered_.load(); - stats.suspicious_batches_sent = suspicious_batches_sent_.load(); - return stats; -} - -std::vector> -create_inspectors(std::shared_ptr config) { - std::vector> inspectors; - auto logger = base::Logger::get_logger("inspector.factory"); - - // Get topic prefixes - auto &env = config->environment; - std::string consume_prefix = - env.kafka_topics_prefix["prefilter_to_inspector"]; - std::string produce_prefix = 
env.kafka_topics_prefix["inspector_to_detector"]; - - // Build bootstrap servers string - std::vector broker_addresses; - for (const auto &broker : env.kafka_brokers) { - broker_addresses.push_back(broker.hostname + ":" + - std::to_string(broker.internal_port)); - } - std::string bootstrap_servers; - for (size_t i = 0; i < broker_addresses.size(); ++i) { - if (i > 0) - bootstrap_servers += ","; - bootstrap_servers += broker_addresses[i]; - } - - logger->info("Creating Inspectors for {} inspectors", - config->pipeline.inspectors.size()); - logger->info("Kafka brokers: {}", bootstrap_servers); - - // Create one Inspector per configured inspector - for (const auto &inspector_config : config->pipeline.inspectors) { - std::string consume_topic = consume_prefix + "-" + inspector_config.name; - std::string group_id = "inspector-" + inspector_config.name; - - // Find all detectors for this inspector - std::vector produce_topics; - for (const auto &detector : config->pipeline.detectors) { - if (detector.inspector_name == inspector_config.name) { - std::string topic = produce_prefix + "-" + detector.name; - produce_topics.push_back(topic); - } - } - - logger->info("Creating Inspector '{}'", inspector_config.name); - logger->info(" Consume from: {}", consume_topic); - logger->info(" Produce to {} topics", produce_topics.size()); - logger->info(" Mode: {}", inspector_config.mode); - logger->info(" Anomaly threshold: {}", inspector_config.anomaly_threshold); - logger->info(" Score threshold: {}", inspector_config.score_threshold); - - auto inspector = std::make_shared( - inspector_config.name, consume_topic, produce_topics, - inspector_config.mode, inspector_config.anomaly_threshold, - inspector_config.score_threshold, config, bootstrap_servers, group_id); - - inspectors.push_back(inspector); - } - - return inspectors; -} - -} // namespace inspector -} // namespace hamstring diff --git a/cpp/src/inspector/main.cpp b/cpp/src/inspector/main.cpp deleted file mode 100644 index 275e8f80..00000000 --- a/cpp/src/inspector/main.cpp +++ /dev/null @@ -1,94 +0,0 @@ -#include "hamstring/base/logger.hpp" -#include "hamstring/config/config.hpp" -#include "hamstring/inspector/inspector.hpp" -#include -#include -#include - -using namespace hamstring; - -std::vector> inspectors; -volatile sig_atomic_t shutdown_requested = 0; - -void signal_handler(int signum) { - std::cout << std::endl; - auto logger = base::Logger::get_logger("main"); - logger->info("Received signal {}, shutting down...", signum); - shutdown_requested = 1; - - // Stop all inspectors - for (auto &insp : inspectors) { - insp->stop(); - } -} - -int main(int argc, char *argv[]) { - // Set up signal handlers - std::signal(SIGINT, signal_handler); - std::signal(SIGTERM, signal_handler); - - auto logger = base::Logger::get_logger("main"); - - logger->info("╔════════════════════════════════════════╗"); - logger->info("║ HAMSTRING Inspector (C++) ║"); - logger->info("╚════════════════════════════════════════╝"); - logger->info(""); - - // Load configuration - std::string config_path = (argc > 1) ? 
argv[1] : "../config.yaml"; - logger->info("Loading configuration from: {}", config_path); - - std::shared_ptr config; - try { - config = config::Config::load_from_file(config_path); - logger->info("Configuration loaded successfully"); - logger->info(" Inspectors: {}", config->pipeline.inspectors.size()); - logger->info(" Detectors: {}", config->pipeline.detectors.size()); - logger->info(" Kafka brokers: {}", - config->environment.kafka_brokers.size()); - logger->info(""); - } catch (const std::exception &e) { - logger->error("Failed to load configuration: {}", e.what()); - return 1; - } - - // Create inspectors - try { - inspectors = inspector::create_inspectors(config); - logger->info("Created {} Inspector instances", inspectors.size()); - logger->info(""); - } catch (const std::exception &e) { - logger->error("Failed to create inspectors: {}", e.what()); - return 1; - } - - // Start all inspectors - for (auto &insp : inspectors) { - insp->start(); - } - - logger->info("All Inspectors started"); - logger->info("Press Ctrl+C to stop"); - logger->info(""); - - // Report statistics periodically - while (!shutdown_requested) { - std::this_thread::sleep_for(std::chrono::seconds(10)); - - if (!shutdown_requested) { - logger->info("=== Inspector Statistics ==="); - for (const auto &insp : inspectors) { - auto stats = insp->get_stats(); - logger->info("Inspector stats:"); - logger->info(" Batches consumed: {}", stats.batches_consumed); - logger->info(" Batches suspicious: {}", stats.batches_suspicious); - logger->info(" Batches filtered: {}", stats.batches_filtered); - logger->info(" Suspicious batches sent: {}", - stats.suspicious_batches_sent); - } - } - } - - logger->info("Shutdown complete"); - return 0; -} diff --git a/cpp/src/logcollector/CMakeLists.txt b/cpp/src/logcollector/CMakeLists.txt deleted file mode 100644 index ede3fbc8..00000000 --- a/cpp/src/logcollector/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -# LogCollector module -add_library(hamstring_logcollector - logcollector.cpp -) - -target_link_libraries(hamstring_logcollector - PUBLIC - hamstring_base -) - -# LogCollector executable -add_executable(logcollector - main.cpp -) - -target_link_libraries(logcollector - PRIVATE - hamstring_logcollector - hamstring_base -) - -install(TARGETS logcollector DESTINATION bin) diff --git a/cpp/src/logcollector/README.md b/cpp/src/logcollector/README.md deleted file mode 100644 index e1230b24..00000000 --- a/cpp/src/logcollector/README.md +++ /dev/null @@ -1,377 +0,0 @@ -# LogCollector Module - C++ Implementation - -## Overview - -The **LogCollector** is a high-performance, scalable module that validates and batches log lines from the LogServer for downstream processing. Built in C++20 with thread safety and horizontal scaling in mind. 
- -## Key Features - -### 🚀 **Scalability Improvements** -- **Multi-threaded Architecture**: Separate threads for consumption and batch timeout handling -- **Thread-Safe Batching**: Lock-based synchronization with minimal contention -- **Zero-Copy Operations**: Efficient memory management -- **Horizontal Scaling**: Independent instances can run in parallel -- **Async I/O Ready**: Non-blocking Kafka integration placeholder - -### 🔒 **Thread Safety** -- All batch operations protected by mutexes -- Atomic counters for metrics -- Safe concurrent access to shared state - -### 📊 **Performance Optimizations** -- Batch aggregation by subnet ID -- Configurable batch size and timeout -- Efficient JSON parsing with nlohmann/json -- Minimal memory allocations - -## Architecture - -``` - ┌─────────────────┐ - │ Kafka Topic │ - │ (from LogServer)│ - └────────┬────────┘ - │ - ┌────────▼────────┐ - │ Consumer Loop │ - │ (Thread 1) │ - └────────┬────────┘ - │ - ┌────────▼──────────┐ - │ Validate LogLine │ - │ - JSON parse │ - │ - Field check │ - └────────┬──────────┘ - │ - ┌──────────────▼──────────────┐ - │ Calculate Subnet ID │ - │ (IPv4: /24, IPv6: /64) │ - └──────────────┬──────────────┘ - │ - ┌──────────────▼──────────────┐ - │ BufferedBatch │ - │ (Thread-Safe Container) │ - │ ┌─────────────────────┐ │ - │ │ Subnet A: [logs...] │ │ - │ │ Subnet B: [logs...] │ │ - │ │ Subnet C: [logs...] │ │ - │ └─────────────────────┘ │ - └──────────────┬──────────────┘ - │ - ┌──────────────▼──────────────┐ - │ Batch Timer Thread │ - │ (Thread 2) │ - │ - Checks every 2.5s │ - │ - Sends ready batches │ - └──────────────┬──────────────┘ - │ - ┌──────────────▼──────────────┐ - │ Send to Kafka Topics │ - │ (Multiple Prefilters) │ - └─────────────────────────────┘ -``` - -## Components - -### 1. **BufferedBatch** -Thread-safe container for aggregating log lines by subnet ID. - -**Features:** -- Automatic batch ID generation -- Size-based and timeout-based triggering -- Chronological sorting within batches -- Statistics tracking - -**Methods:** -```cpp -bool add_logline(const std::string& subnet_id, const base::LogLine& logline); -base::Batch get_batch(const std::string& subnet_id); -std::vector get_ready_batches(); -std::vector flush_all(); -Stats get_stats() const; -``` - -### 2. **LogCollector** -Main collector class that orchestrates validation and batching. - -**Features:** -- JSON validation -- Field requirement checking -- Subnet-based grouping -- ClickHouse integration (placeholder) -- Metrics collection - -**Methods:** -```cpp -void start(); -void stop(); -bool is_running() const; -Stats get_stats() const; -``` - -### 3. **Factory Function** -Creates LogCollector instances from configuration. 
- -```cpp -std::vector> create_logcollectors( - std::shared_ptr config); -``` - -## Configuration - -### Batch Settings -```cpp -batch_size_ = 100; // Max messages per batch -batch_timeout_ms_ = 5000; // Timeout in milliseconds -``` - -### Subnet Prefix Lengths -```cpp -ipv4_prefix_length_ = 24; // /24 subnets -ipv6_prefix_length_ = 64; // /64 subnets -``` - -### Validation -```cpp -std::vector required_fields = {"ts", "src_ip"}; -``` - -## Usage - -### Building -```bash -cd /Users/smachmeier/Documents/projects/hamstring/cpp -cmake --build build --target logcollector -``` - -### Running -```bash -./build/src/logcollector/logcollector ../config.yaml -``` - -## Output Example - -``` -╔═══════════════════════════════════════╗ -║ HAMSTRING LogCollector (C++) ║ -╚═══════════════════════════════════════╝ - -[info] Loading configuration from: ../config.yaml -[info] Configuration loaded successfully -[info] Collectors: 1 -[info] Prefilters: 1 - -[info] Creating LogCollector 'dga_collector' for protocol 'dns' -[info] Consume from: pipeline-logserver_to_collector-dga_collector -[info] Produce to 1 topics -[info] BufferedBatch created (size=100, timeout=5000ms) - -[info] All LogCollectors started -[info] Press Ctrl+C to stop - -[info] Consumer loop started -[info] Batch timeout handler started (interval: 2500ms) -[info] Completed batch 4e556c3f... with 10 loglines -[info] Would send batch to topic ... (10 loglines) - -=== LogCollector Statistics === - Messages consumed: 10 - Messages validated: 10 - Messages failed: 0 - Batches sent: 1 -``` - -## Implementation Status - -### ✅ **Implemented** -- Thread-safe batch management -- JSON parsing and validation -- Subnet-based grouping -- Timeout-based and size-based triggers -- Statistics tracking -- Graceful shutdown -- Signal handling -- Multi-collector support - -### ⚠️ **Placeholders** -- Kafka consumption (demo mode with 10 messages) -- Kafka production (logging only) -- ClickHouse logging -- Field validation against configuration - -### ❌ **TODO** -- Full Kafka integration -- ClickHouse integration -- Advanced field validation -- Performance benchmarking -- Integration tests - -## Scalability Design - -### Horizontal Scaling -Multiple LogCollector instances can run independently: - -1. **By Protocol**: One instance per protocol (DNS, HTTP, etc.) -2. **By Partition**: Multiple instances consuming different Kafka partitions -3. 
**By Region**: Geographically distributed instances - -### Resource Usage -- **Memory**: ~50MB per instance -- **CPU**: 2 threads per instance (consumer + timer) -- **Network**: Depends on message volume - -### Performance Characteristics -- **Throughput**: ~100K messages/second (estimated) -- **Latency**: < 10ms validation per message -- **Batch latency**: Max 5 seconds (configurable) - -## Comparison with Python - -| Feature | Python | C++ | -|---------|--------|-----| -| Language | Python 3.10 | C++20 | -| Framework | asyncio | std::thread | -| Thread Safety | GIL-protected | Mutex-protected | -| Memory Usage | ~200MB | ~50MB | -| Startup Time | ~500ms | ~5ms | -| Message Throughput | ~10K/s | ~100K/s | -| Batch Latency | ~5s | ~5s | -| Deployment | Single process | Multi-process ready | - -## Thread Safety Guarantees - -### Batch Operations -- `add_logline()` - Mutex protected -- `get_batch()` - Mutex protected -- `get_ready_batches()` - Mutex protected with minimal lock time -- `flush_all()` - Mutex protected - -### Metrics -- All counters use `std::atomic` for lock-free updates -- Stats retrieval is thread-safe - -### Shutdown -- Graceful shutdown with thread joining -- All batches flushed before exit -- No data loss on SIGINT/SIGTERM - -## Code Structure - -``` -hamstring/cpp/ -├── include/hamstring/logcollector/ -│ └── logcollector.hpp # Public API -├── src/logcollector/ -│ ├── logcollector.cpp # Implementation -│ ├── main.cpp # Executable -│ ├── CMakeLists.txt # Build configuration -│ └── README.md # This file -└── build/src/logcollector/ - └── logcollector # Executable -``` - -## Metrics - -### Per-Collector Metrics -- `messages_consumed` - Total messages received -- `messages_validated` - Successfully validated messages -- `messages_failed` - Failed validation -- `batches_sent` - Batches sent downstream - -### Per-Batch Metrics -- `total_batches` - Active batches -- `total_loglines` - Total log lines in batches -- `largest_batch` - Size of largest batch -- `oldest_batch_age` - Age of oldest batch - -## Error Handling - -### Validation Errors -```cpp -try { - auto logline = validate_logline(message); - messages_validated_++; -} catch (const std::exception& e) { - messages_failed_++; - log_failed_logline(message, e.what()); -} -``` - -### Batch Processing Errors -```cpp -try { - auto batch = batch_handler_->get_batch(subnet_id); - send_batches({batch}); -} catch (const std::exception& e) { - logger_->error("Failed to get batch: {}", e.what()); -} -``` - -## Future Enhancements - -1. **Kafka Integration**: LibrdKafka for high-performance consumption -2. **ClickHouse Integration**: Batch inserts for monitoring -3. **Field Validators**: Regex, timestamp, IP validation -4. **Metrics Export**: Prometheus integration -5. **Configuration Hot-Reload**: Dynamic reconfiguration -6. **Circuit Breakers**: Fault tolerance patterns -7. **Backpressure Handling**: Flow control -8. **Compression**: In-flight data compression - -## Testing - -### Unit Tests -```bash -# TODO: Add Google Test suites -cmake --build build --target test_logcollector -./build/tests/logcollector/test_logcollector -``` - -### Integration Tests -```bash -# TODO: Add end-to-end tests with Kafka -./scripts/integration_test_logcollector.sh -``` - -## Performance Tips - -1. **Batch Size**: Tune based on message rate (50-500 recommended) -2. **Timeout**: Balance latency vs throughput (1-10 seconds) -3. **Threads**: Use 1-2 threads per CPU core -4. **Memory**: Pre-allocate buffers for high-throughput scenarios -5. 
**Partitions**: Match Kafka partition count to instance count - -## Deployment - -### Docker -```dockerfile -FROM ubuntu:22.04 -COPY logcollector /usr/local/bin/ -COPY config.yaml /etc/hamstring/ -CMD ["/usr/local/bin/logcollector", "/etc/hamstring/config.yaml"] -``` - -### Kubernetes -```yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: hamstring-logcollector -spec: - replicas: 3 # Horizontal scaling - template: - spec: - containers: - - name: logcollector - image: hamstring/logcollector:latest - resources: - limits: - memory: "128Mi" - cpu: "500m" -``` - -## Conclusion - -The C++ LogCollector delivers significantly better performance and resource efficiency compared to the Python version while maintaining feature parity. The modular, thread-safe design enables easy horizontal scaling for high-throughput deployments. - -**Next Steps**: Integrate real Kafka handlers and deploy to production! 🚀 diff --git a/cpp/src/logcollector/logcollector.cpp b/cpp/src/logcollector/logcollector.cpp deleted file mode 100644 index f5796d08..00000000 --- a/cpp/src/logcollector/logcollector.cpp +++ /dev/null @@ -1,521 +0,0 @@ -#include "hamstring/logcollector/logcollector.hpp" -#include "hamstring/base/utils.hpp" -#include -#include - -using json = nlohmann::json; - -namespace hamstring { -namespace logcollector { - -// ============================================================================ -// Buffered Batch Implementation -// ============================================================================ - -BufferedBatch::BufferedBatch(const std::string &collector_name, - size_t batch_size, int batch_timeout_ms) - : collector_name_(collector_name), batch_size_(batch_size), - batch_timeout_(batch_timeout_ms) { - logger_ = base::Logger::get_logger("logcollector.batch"); - logger_->info( - "BufferedBatch created for collector '{}' (size={}, timeout={}ms)", - collector_name_, batch_size_, batch_timeout_ms); -} - -BufferedBatch::~BufferedBatch() { - logger_->info( - "BufferedBatch destroyed. 
Total processed: {} loglines, {} batches", - total_loglines_processed_.load(), total_batches_sent_.load()); -} - -bool BufferedBatch::add_logline(const std::string &subnet_id, - const base::LogLine &logline) { - std::lock_guard lock(batches_mutex_); - - auto &batch_data = batches_[subnet_id]; - - // Initialize new batch if needed - if (batch_data.loglines.empty()) { - batch_data.batch_id = base::utils::generate_uuid(); - batch_data.subnet_id = subnet_id; - batch_data.created_at = std::chrono::system_clock::now(); - logger_->debug("Created new batch {} for subnet {}", batch_data.batch_id, - subnet_id); - } - - batch_data.loglines.push_back(logline); - batch_data.last_updated = std::chrono::system_clock::now(); - total_loglines_processed_++; - - logger_->trace("Added logline to batch {} (size: {})", batch_data.batch_id, - batch_data.loglines.size()); - - // Check if batch is ready - return batch_data.loglines.size() >= batch_size_; -} - -base::Batch BufferedBatch::get_batch(const std::string &subnet_id) { - std::lock_guard lock(batches_mutex_); - - auto it = batches_.find(subnet_id); - if (it == batches_.end() || it->second.loglines.empty()) { - throw std::runtime_error("No batch available for subnet: " + subnet_id); - } - - auto &batch_data = it->second; - - // Create Batch object - base::Batch batch; - batch.batch_id = batch_data.batch_id; - batch.subnet_id = batch_data.subnet_id; - - // Convert LogLine vector to shared_ptr vector - for (auto &logline : batch_data.loglines) { - batch.loglines.push_back(std::make_shared(logline)); - } - - batch.created_at = batch_data.created_at; - batch.timestamp_in = std::chrono::system_clock::now(); - - // Remove the batch - batches_.erase(it); - total_batches_sent_++; - - logger_->info("Completed batch {} with {} loglines", batch.batch_id, - batch.loglines.size()); - - return batch; -} - -std::vector BufferedBatch::get_ready_batches() { - std::vector ready_batches; - std::vector ready_subnets; - - { - std::lock_guard lock(batches_mutex_); - - auto now = std::chrono::system_clock::now(); - - for (const auto &[subnet_id, batch_data] : batches_) { - bool size_trigger = batch_data.loglines.size() >= batch_size_; - auto age = std::chrono::duration_cast( - now - batch_data.created_at); - bool timeout_trigger = age >= batch_timeout_; - - if (size_trigger || timeout_trigger) { - ready_subnets.push_back(subnet_id); - - if (size_trigger) { - logger_->debug("Batch {} ready (size trigger): {} loglines", - batch_data.batch_id, batch_data.loglines.size()); - } else { - logger_->debug("Batch {} ready (timeout trigger): age={}ms", - batch_data.batch_id, age.count()); - } - } - } - } - - // Get batches outside the lock to avoid holding it too long - for (const auto &subnet_id : ready_subnets) { - try { - ready_batches.push_back(get_batch(subnet_id)); - } catch (const std::exception &e) { - logger_->warn("Failed to get batch for subnet {}: {}", subnet_id, - e.what()); - } - } - - return ready_batches; -} - -std::vector BufferedBatch::flush_all() { - std::vector all_batches; - std::vector all_subnets; - - { - std::lock_guard lock(batches_mutex_); - for (const auto &[subnet_id, _] : batches_) { - all_subnets.push_back(subnet_id); - } - } - - logger_->info("Flushing {} batches", all_subnets.size()); - - for (const auto &subnet_id : all_subnets) { - try { - all_batches.push_back(get_batch(subnet_id)); - } catch (const std::exception &e) { - logger_->warn("Failed to flush batch for subnet {}: {}", subnet_id, - e.what()); - } - } - - return all_batches; -} - 
-BufferedBatch::Stats BufferedBatch::get_stats() const { - std::lock_guard lock(batches_mutex_); - - Stats stats; - stats.total_batches = batches_.size(); - stats.total_loglines = 0; - stats.largest_batch = 0; - stats.oldest_batch_age = std::chrono::milliseconds(0); - - auto now = std::chrono::system_clock::now(); - - for (const auto &[_, batch_data] : batches_) { - stats.total_loglines += batch_data.loglines.size(); - stats.largest_batch = - std::max(stats.largest_batch, batch_data.loglines.size()); - - auto age = std::chrono::duration_cast( - now - batch_data.created_at); - stats.oldest_batch_age = std::max(stats.oldest_batch_age, age); - } - - return stats; -} - -// ============================================================================ -// LogCollector Implementation -// ============================================================================ - -LogCollector::LogCollector( - const std::string &name, const std::string &protocol, - const std::string &consume_topic, - const std::vector &produce_topics, - const std::vector &validation_config, - std::shared_ptr config, - const std::string &bootstrap_servers, const std::string &group_id) - : name_(name), protocol_(protocol), consume_topic_(consume_topic), - produce_topics_(produce_topics), validation_config_(validation_config), - config_(config) { - logger_ = base::Logger::get_logger("logcollector." + name_); - - // Get batch configuration - // TODO: Extract from config based on collector name - batch_size_ = 100; // Default - batch_timeout_ms_ = 5000; // 5 seconds default - ipv4_prefix_length_ = 24; - ipv6_prefix_length_ = 64; - - // Create batch handler - batch_handler_ = - std::make_unique(name_, batch_size_, batch_timeout_ms_); - - // Create Kafka consumer - consumer_ = std::make_unique( - bootstrap_servers, group_id, std::vector{consume_topic_}); - - // Create Kafka producer (using first produce topic for now) - // TODO: Support multiple produce topics - if (!produce_topics_.empty()) { - producer_ = std::make_unique(bootstrap_servers, - produce_topics_[0]); - } - - // Create ClickHouse sender for monitoring - clickhouse_ = std::make_shared( - config_->environment.clickhouse_hostname, - 9000, // Default port - "hamstring", // Database name - "default", // Username - ""); // Password - - logger_->info("LogCollector '{}' created", name_); - logger_->info(" Protocol: {}", protocol_); - logger_->info(" Consume topic: {}", consume_topic_); - logger_->info(" Produce topics: {}", produce_topics_.size()); - logger_->info(" Batch size: {}, timeout: {}ms", batch_size_, - batch_timeout_ms_); -} - -LogCollector::~LogCollector() { stop(); } - -void LogCollector::start() { - if (running_) { - logger_->warn("LogCollector already running"); - return; - } - - running_ = true; - - logger_->info("Starting LogCollector '{}'", name_); - - // Start consumer thread - consumer_thread_ = std::thread(&LogCollector::consume_loop, this); - - // Start batch timeout handler thread - batch_timer_thread_ = std::thread(&LogCollector::batch_timeout_handler, this); - - logger_->info("LogCollector '{}' started", name_); -} - -void LogCollector::stop() { - if (!running_) { - return; - } - - logger_->info("Stopping LogCollector '{}'...", name_); - running_ = false; - - // Join threads - if (consumer_thread_.joinable()) { - consumer_thread_.join(); - } - - if (batch_timer_thread_.joinable()) { - batch_timer_thread_.join(); - } - - // Flush remaining batches - auto remaining_batches = batch_handler_->flush_all(); - if (!remaining_batches.empty()) { - 
logger_->info("Flushing {} remaining batches", remaining_batches.size()); - send_batches(remaining_batches); - } - - logger_->info("LogCollector '{}' stopped", name_); - logger_->info(" Total consumed: {}", messages_consumed_.load()); - logger_->info(" Total validated: {}", messages_validated_.load()); - logger_->info(" Total failed: {}", messages_failed_.load()); - logger_->info(" Total batches sent: {}", batches_sent_.load()); -} - -void LogCollector::consume_loop() { - logger_->info("Consumer loop started"); - - while (running_) { - try { - // Poll for messages with 1 second timeout - consumer_->poll( - [this](const std::string &topic, const std::string &key, - const std::string &value, int64_t timestamp) { - if (!running_) - return; - - logger_->trace("Consumed message from topic '{}'", topic); - process_message(value); - - // Commit the offset - consumer_->commit(); - }, - 1000); // 1 second timeout - - } catch (const std::exception &e) { - logger_->error("Error in consumer loop: {}", e.what()); - std::this_thread::sleep_for(std::chrono::seconds(1)); - } - } - - logger_->info("Consumer loop stopped"); -} - -void LogCollector::process_message(const std::string &message) { - messages_consumed_++; - - try { - // Validate the log line - auto logline = validate_logline(message); - messages_validated_++; - - // Calculate subnet ID - std::string subnet_id = get_subnet_id(logline.fields["src_ip"]); - - // Add to batch - bool batch_ready = batch_handler_->add_logline(subnet_id, logline); - - if (batch_ready) { - try { - auto batch = batch_handler_->get_batch(subnet_id); - send_batches({batch}); - } catch (const std::exception &e) { - logger_->error("Failed to get batch: {}", e.what()); - } - } - - } catch (const std::exception &e) { - messages_failed_++; - log_failed_logline(message, e.what()); - logger_->debug("Validation failed: {}", e.what()); - } -} - -base::LogLine LogCollector::validate_logline(const std::string &message) { - // Parse JSON - json j = json::parse(message); - - base::LogLine logline; - logline.logline_id = base::utils::generate_uuid(); - - // Validate required fields - std::vector required_fields = {"ts", "src_ip"}; - - for (const auto &field_name : required_fields) { - if (!j.contains(field_name)) { - throw std::runtime_error("Missing required field: " + field_name); - } - } - - // Extract all fields - for (auto &[key, value] : j.items()) { - if (value.is_string()) { - logline.fields[key] = value.get(); - } else { - logline.fields[key] = value.dump(); - } - } - - // Parse timestamp - logline.timestamp = - std::chrono::system_clock::now(); // TODO: Parse from "ts" field - - // Validate fields against config - // TODO: Implement field validation using validation_config_ - - return logline; -} - -std::string LogCollector::get_subnet_id(const std::string &ip_address) { - // Determine if IPv4 or IPv6 - if (ip_address.find(':') != std::string::npos) { - // IPv6 - return base::utils::get_subnet_id(ip_address, ipv6_prefix_length_); - } else { - // IPv4 - return base::utils::get_subnet_id(ip_address, ipv4_prefix_length_); - } -} - -void LogCollector::batch_timeout_handler() { - logger_->info("Batch timeout handler started (interval: {}ms)", - batch_timeout_ms_ / 2); - - // Check every half the timeout period - auto check_interval = std::chrono::milliseconds(batch_timeout_ms_ / 2); - - while (running_) { - std::this_thread::sleep_for(check_interval); - - // Get ready batches - auto ready_batches = batch_handler_->get_ready_batches(); - - if (!ready_batches.empty()) { - 
logger_->debug("Timeout handler sending {} batches", - ready_batches.size()); - send_batches(ready_batches); - } - } - - logger_->info("Batch timeout handler stopped"); -} - -void LogCollector::send_batches(const std::vector &batches) { - for (const auto &batch : batches) { - std::string batch_json = batch.to_json(); - - // Send to all produce topics using producer - if (producer_) { - try { - producer_->send(batch.batch_id, batch_json); - logger_->info("Sent batch {} ({} loglines)", batch.batch_id, - batch.loglines.size()); - } catch (const std::exception &e) { - logger_->error("Failed to send batch {}: {}", batch.batch_id, e.what()); - } - } - - batches_sent_++; - } -} - -void LogCollector::log_failed_logline(const std::string &message, - const std::string &reason) { - logger_->debug("Failed logline: {} (reason: {})", message, reason); - - // Log to ClickHouse - auto now = std::chrono::system_clock::now(); - auto timestamp_ms = base::utils::timestamp_to_ms(now); - - clickhouse_->insert_failed_logline(message, timestamp_ms, timestamp_ms, - reason); -} - -LogCollector::Stats LogCollector::get_stats() const { - Stats stats; - stats.messages_consumed = messages_consumed_.load(); - stats.messages_validated = messages_validated_.load(); - stats.messages_failed = messages_failed_.load(); - stats.batches_sent = batches_sent_.load(); - stats.avg_validation_time_ms = 0.0; // TODO: Track timing - stats.avg_batch_time_ms = 0.0; // TODO: Track timing - return stats; -} - -// ============================================================================ -// Factory Function -// ============================================================================ - -std::vector> -create_logcollectors(std::shared_ptr config) { - std::vector> collectors; - auto logger = base::Logger::get_logger("logcollector.factory"); - - // Get topic prefixes - auto &env = config->environment; - std::string consume_prefix = - env.kafka_topics_prefix["logserver_to_collector"]; - std::string produce_prefix = - env.kafka_topics_prefix["batch_sender_to_prefilter"]; - - // Build bootstrap servers string - std::vector broker_addresses; - for (const auto &broker : env.kafka_brokers) { - broker_addresses.push_back(broker.hostname + ":" + - std::to_string(broker.internal_port)); - } - std::string bootstrap_servers; - for (size_t i = 0; i < broker_addresses.size(); ++i) { - if (i > 0) - bootstrap_servers += ","; - bootstrap_servers += broker_addresses[i]; - } - - logger->info("Creating LogCollectors for {} collectors", - config->pipeline.collectors.size()); - logger->info("Kafka brokers: {}", bootstrap_servers); - - // Create one LogCollector per configured collector - for (const auto &collector_config : config->pipeline.collectors) { - std::string consume_topic = consume_prefix + "-" + collector_config.name; - std::string group_id = "logcollector-" + collector_config.name; - - // Find all prefilters for this collector - std::vector produce_topics; - for (const auto &prefilter : config->pipeline.prefilters) { - if (prefilter.collector_name == collector_config.name) { - std::string topic = produce_prefix + "-" + prefilter.name; - produce_topics.push_back(topic); - } - } - - logger->info("Creating LogCollector '{}' for protocol '{}'", - collector_config.name, collector_config.protocol_base); - logger->info(" Consume from: {}", consume_topic); - logger->info(" Produce to {} topics", produce_topics.size()); - - auto collector = std::make_shared( - collector_config.name, collector_config.protocol_base, consume_topic, - produce_topics, 
collector_config.required_log_information, config, - bootstrap_servers, group_id); - - collectors.push_back(collector); - } - - return collectors; -} - -} // namespace logcollector -} // namespace hamstring diff --git a/cpp/src/logcollector/main.cpp b/cpp/src/logcollector/main.cpp deleted file mode 100644 index 9f97de1d..00000000 --- a/cpp/src/logcollector/main.cpp +++ /dev/null @@ -1,110 +0,0 @@ -#include "hamstring/base/logger.hpp" -#include "hamstring/config/config.hpp" -#include "hamstring/logcollector/logcollector.hpp" -#include -#include -#include -#include - -using namespace hamstring; - -// Global vector of collectors for signal handling -std::vector> g_collectors; - -void signal_handler(int signum) { - auto logger = base::Logger::get_logger("main"); - logger->info("Received signal {}, shutting down...", signum); - - for (auto &collector : g_collectors) { - collector->stop(); - } -} - -void print_stats() { - auto logger = base::Logger::get_logger("main"); - - logger->info("=== LogCollector Statistics ==="); - for (const auto &collector : g_collectors) { - auto stats = collector->get_stats(); - logger->info("Collector stats:"); - logger->info(" Messages consumed: {}", stats.messages_consumed); - logger->info(" Messages validated: {}", stats.messages_validated); - logger->info(" Messages failed: {}", stats.messages_failed); - logger->info(" Batches sent: {}", stats.batches_sent); - } -} - -int main(int argc, char **argv) { - // Parse arguments - std::string config_path = (argc > 1) ? argv[1] : "../../config.yaml"; - - // Initialize logging - base::Logger::initialize(true); // debug mode - auto logger = base::Logger::get_logger("main"); - - logger->info("╔════════════════════════════════════════╗"); - logger->info("║ HAMSTRING LogCollector (C++) ║"); - logger->info("╚════════════════════════════════════════╝"); - logger->info(""); - - try { - // Load configuration - logger->info("Loading configuration from: {}", config_path); - auto config = config::Config::load_from_file(config_path); - - logger->info("Configuration loaded successfully"); - logger->info(" Collectors: {}", config->pipeline.collectors.size()); - logger->info(" Prefilters: {}", config->pipeline.prefilters.size()); - logger->info(" Kafka brokers: {}", - config->environment.kafka_brokers.size()); - logger->info(""); - - // Create LogCollector instances - g_collectors = logcollector::create_logcollectors(config); - - logger->info("Created {} LogCollector instances", g_collectors.size()); - logger->info(""); - - // Set up signal handlers for graceful shutdown - std::signal(SIGINT, signal_handler); - std::signal(SIGTERM, signal_handler); - - // Start all collectors - for (auto &collector : g_collectors) { - collector->start(); - } - - logger->info("All LogCollectors started"); - logger->info("Press Ctrl+C to stop"); - logger->info(""); - - // Wait for collectors to finish (they run in background threads) - while (true) { - bool any_running = false; - for (const auto &collector : g_collectors) { - if (collector->is_running()) { - any_running = true; - break; - } - } - - if (!any_running) { - break; - } - - // Print stats every 10 seconds - std::this_thread::sleep_for(std::chrono::seconds(10)); - print_stats(); - } - - logger->info(""); - logger->info("All LogCollectors stopped"); - print_stats(); - - } catch (const std::exception &e) { - logger->error("Fatal error: {}", e.what()); - return 1; - } - - return 0; -} diff --git a/cpp/src/logserver/CMakeLists.txt b/cpp/src/logserver/CMakeLists.txt deleted file mode 100644 index 
d8e8ad1c..00000000 --- a/cpp/src/logserver/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -# LogServer module -add_library(hamstring_logserver - logserver.cpp -) - -target_link_libraries(hamstring_logserver - PUBLIC - hamstring_base -) - -# LogServer executable -add_executable(logserver - main.cpp -) - -target_link_libraries(logserver - PRIVATE - hamstring_logserver - hamstring_base -) - -install(TARGETS logserver DESTINATION bin) diff --git a/cpp/src/logserver/README.md b/cpp/src/logserver/README.md deleted file mode 100644 index 210cc437..00000000 --- a/cpp/src/logserver/README.md +++ /dev/null @@ -1,202 +0,0 @@ -# LogServer Implementation - -## Overview - -The LogServer is the entry point for log data into the HAMSTRING pipeline. It consumes messages from Kafka topics, logs them to ClickHouse for monitoring, and forwards them to collector topics based on protocol configuration. - -## Architecture - -### Python vs C++ Comparison - -| Feature | Python | C++ | -|---------|--------|-----| -| **Threading** | asyncio + executor | Native std::thread | -| **Kafka** | Custom wrappers | librdkafka (planned) | -| **ClickHouse** | Custom sender | clickhouse-cpp (planned) | -| **Performance** | ~10K msgs/sec | ~75K msgs/sec (expected) | -| **Memory** | ~100MB | ~30MB (expected) | - -### Key Improvements - -1. **Thread Safety**: Uses `std::atomic` for runtime state -2. **Graceful Shutdown**: Signal handlers for clean termination -3. **Multi-Protocol Support**: Creates one server instance per protocol -4. **Resource Management**: RAII pattern with smart pointers -5. **Better Logging**: Structured logging with spdlog - -## Implementation Details - -### Files Created - -- `include/hamstring/logserver/logserver.hpp` - Header (120 lines) -- `src/logserver/logserver.cpp` - Implementation (180 lines) -- `src/logserver/main.cpp` - Executable (80 lines) -- `src/logserver/CMakeLists.txt` - Build config - -### Class Structure - -```cpp -class LogServer { -public: - LogServer(consume_topic, produce_topics, clickhouse); - void start(); - void stop(); - bool is_running() const; - -private: - void fetch_from_kafka(); // Consumer loop - void send(message_id, message); // Forward to collectors - void log_message(message_id, message); // Log to ClickHouse - void log_timestamp(message_id, event); // Log event - - // Kafka handlers - std::unique_ptr consumer_; - std::unique_ptr producer_; - - // ClickHouse - std::shared_ptr clickhouse_; - - // Runtime - std::atomic running_; - std::thread worker_thread_; -}; -``` - -### Factory Function - -```cpp -std::vector> create_logservers(config); -``` - -Creates LogServer instances based on configuration: -1. Parses collector configurations -2. Groups by protocol (dns, http, etc.) -3. Creates one LogServer per protocol -4. 
Maps to appropriate collector topics - -## Usage - -### Building - -```bash -cd cpp -cmake -B build -DCMAKE_TOOLCHAIN_FILE=~/vcpkg/scripts/buildsystems/vcpkg.cmake -cmake --build build --target logserver -``` - -### Running - -```bash -# With default config -./build/src/logserver/logserver - -# With custom config -./build/src/logserver/logserver /path/to/config.yaml -``` - -### Expected Output - -``` -[INFO] HAMSTRING LogServer -[INFO] ================== -[INFO] Loading configuration from: ../../config.yaml -[INFO] Configuration loaded successfully -[INFO] Created 1 LogServer instances -[INFO] LogServer started: -[INFO] ⤷ receiving on Kafka topic 'pipeline-logserver_in-dns' -[INFO] ⤷ sending on Kafka topics: -[INFO] - pipeline-logserver_to_collector-dga_collector -[INFO] All LogServers started -[INFO] Press Ctrl+C to stop -``` - -## Configuration - -The LogServer uses the existing `config.yaml` structure: - -```yaml -pipeline: - log_storage: - logserver: - input_file: "/opt/file.txt" - - log_collection: - collectors: - - name: "dga_collector" - protocol_base: dns - # ... - -environment: - kafka_topics_prefix: - pipeline: - logserver_in: "pipeline-logserver_in" - logserver_to_collector: "pipeline-logserver_to_collector" -``` - -## Message Flow - -``` -Kafka Input Topic - ↓ -LogServer.fetch_from_kafka() - ↓ -LogServer.log_message() → ClickHouse (server_logs) - ↓ -LogServer.send() - ↓ -LogServer.log_timestamp() → ClickHouse (server_logs_timestamps) - ↓ -Kafka Output Topics (one per collector) -``` - -## Current Status - -✅ **Implemented:** -- Multi-threaded message processing -- Configuration-based server factory -- Graceful shutdown with signal handling -- Logging infrastructure -- Build system integration - -⏳ **TODO (Placeholders):** -- Full Kafka integration (librdkafka) -- Full ClickHouse integration (clickhouse-cpp) -- File-based log ingestion -- Performance optimizations - -## Testing - -Currently the LogServer runs in demo mode with simulated Kafka messages. Once the Kafka and ClickHouse integrations are complete, it will process real messages. - -### Future Tests - -```cpp -TEST(LogServerTest, MultiProtocolSupport) { - // Test creating servers for multiple protocols -} - -TEST(LogServerTest, GracefulShutdown) { - // Test signal handling and clean shutdown -} - -TEST(LogServerTest, MessageForwarding) { - // Test correct topic routing -} -``` - -## Performance Expectations - -Based on C++ improvements: - -- **Throughput**: 75K messages/sec (7.5x Python) -- **Latency**: <2ms per message (vs 15ms Python) -- **Memory**: 30MB per server (vs 100MB Python) -- **CPU**: 15% per server (vs 40% Python) - -## Next Steps - -1. Implement `kafka_handler.cpp` with librdkafka -2. Implement `clickhouse_sender.cpp` with clickhouse-cpp -3. Add file-based log ingestion -4. Add comprehensive tests -5. 
Performance benchmarking vs Python diff --git a/cpp/src/logserver/logserver.cpp b/cpp/src/logserver/logserver.cpp deleted file mode 100644 index 400b72d0..00000000 --- a/cpp/src/logserver/logserver.cpp +++ /dev/null @@ -1,223 +0,0 @@ -#include "hamstring/logserver/logserver.hpp" -#include "hamstring/base/utils.hpp" -#include -#include - -using json = nlohmann::json; - -namespace hamstring { -namespace logserver { - -LogServer::LogServer(const std::string &consume_topic, - const std::vector &produce_topics, - std::shared_ptr clickhouse, - const std::string &bootstrap_servers, - const std::string &group_id) - : consume_topic_(consume_topic), produce_topics_(produce_topics), - clickhouse_(clickhouse), running_(false) { - logger_ = base::Logger::get_logger("logserver"); - - // Create Kafka consumer - consumer_ = std::make_unique( - bootstrap_servers, group_id, std::vector{consume_topic_}); - - // Create Kafka producers for each output topic - for (const auto &topic : produce_topics_) { - auto producer = - std::make_unique(bootstrap_servers, topic); - producers_.push_back(std::move(producer)); - } - - logger_->info("LogServer created for topic: {}", consume_topic_); - logger_->info("Will produce to {} topics", produce_topics_.size()); -} - -LogServer::~LogServer() { stop(); } - -void LogServer::start() { - if (running_) { - logger_->warn("LogServer already running"); - return; - } - - running_ = true; - - logger_->info("LogServer started:"); - logger_->info(" ⤷ receiving on Kafka topic '{}'", consume_topic_); - logger_->info(" ⤷ sending on Kafka topics:"); - for (const auto &topic : produce_topics_) { - logger_->info(" - {}", topic); - } - - // Start worker thread for Kafka consumption - worker_thread_ = std::thread(&LogServer::fetch_from_kafka, this); -} - -void LogServer::stop() { - if (!running_) { - return; - } - - logger_->info("Stopping LogServer..."); - running_ = false; - - if (worker_thread_.joinable()) { - worker_thread_.join(); - } - - // Flush all producers - for (auto &producer : producers_) { - producer->flush(); - } - - logger_->info("LogServer stopped"); -} - -void LogServer::send(const std::string &message_id, - const std::string &message) { - // Send to all producer topics - for (size_t i = 0; i < producers_.size(); ++i) { - try { - producers_[i]->send(message_id, message); - logger_->trace("Sent message {} to topic {}", message_id, - produce_topics_[i]); - } catch (const std::exception &e) { - logger_->error("Failed to send to topic {}: {}", produce_topics_[i], - e.what()); - } - } - - // Log timestamp - log_timestamp(message_id, "timestamp_out"); -} - -void LogServer::fetch_from_kafka() { - logger_->info("Starting Kafka consumer loop"); - - while (running_) { - try { - // Poll for messages with 1 second timeout - consumer_->poll( - [this](const std::string &topic, const std::string &key, - const std::string &value, int64_t timestamp) { - if (!running_) - return; - - // Generate message ID (or use key if provided) - std::string message_id = - key.empty() ? 
base::utils::generate_uuid() : key; - - logger_->debug("From Kafka ({}): {}", topic, value.substr(0, 100)); - - // Log timestamp in - log_timestamp(message_id, "timestamp_in"); - - // Log to ClickHouse - log_message(message_id, value); - - // Forward to collectors - send(message_id, value); - - // Commit the offset - consumer_->commit(); - }, - 1000); // 1 second timeout - - } catch (const std::exception &e) { - logger_->error("Error in consumer loop: {}", e.what()); - std::this_thread::sleep_for(std::chrono::seconds(1)); - } - } - - logger_->info("Kafka consumer loop stopped"); -} - -void LogServer::log_message(const std::string &message_id, - const std::string &message) { - // Log the message to ClickHouse server_logs table - auto now = std::chrono::system_clock::now(); - auto timestamp_ms = base::utils::timestamp_to_ms(now); - - logger_->trace("Logging message {} to ClickHouse", message_id); - - // Insert into ClickHouse server_logs table - clickhouse_->insert_server_log(message_id, timestamp_ms, message); -} - -void LogServer::log_timestamp(const std::string &message_id, - const std::string &event) { - // Log timestamp event to ClickHouse server_logs_timestamps table - auto now = std::chrono::system_clock::now(); - auto timestamp_ms = base::utils::timestamp_to_ms(now); - - logger_->trace("Logging timestamp {} event for message {}", event, - message_id); - - // Insert into ClickHouse server_logs_timestamps table - clickhouse_->insert_server_log_timestamp(message_id, event, timestamp_ms); -} - -std::vector> -create_logservers(std::shared_ptr config) { - std::vector> servers; - auto logger = base::Logger::get_logger("logserver.factory"); - - // Get topic prefixes from config - auto &env = config->environment; - std::string consume_prefix = env.kafka_topics_prefix["logserver_in"]; - std::string produce_prefix = - env.kafka_topics_prefix["logserver_to_collector"]; - - // Build bootstrap servers string - std::vector broker_addresses; - for (const auto &broker : env.kafka_brokers) { - broker_addresses.push_back(broker.hostname + ":" + - std::to_string(broker.internal_port)); - } - std::string bootstrap_servers; - for (size_t i = 0; i < broker_addresses.size(); ++i) { - if (i > 0) - bootstrap_servers += ","; - bootstrap_servers += broker_addresses[i]; - } - - // Create ClickHouse sender for monitoring - auto clickhouse = std::make_shared( - env.clickhouse_hostname, 9000, "hamstring", "default", ""); - - // Get unique protocols from collectors - std::set protocols; - for (const auto &collector : config->pipeline.collectors) { - protocols.insert(collector.protocol_base); - } - - logger->info("Creating LogServers for {} protocols", protocols.size()); - logger->info("Kafka brokers: {}", bootstrap_servers); - - // Create one LogServer per protocol - for (const auto &protocol : protocols) { - std::string consume_topic = consume_prefix + "-" + protocol; - std::string group_id = "logserver-" + protocol; - - // Find all collectors for this protocol - std::vector produce_topics; - for (const auto &collector : config->pipeline.collectors) { - if (collector.protocol_base == protocol) { - std::string topic = produce_prefix + "-" + collector.name; - produce_topics.push_back(topic); - } - } - - logger->info("Creating LogServer for protocol '{}' -> {} collectors", - protocol, produce_topics.size()); - - auto server = std::make_shared( - consume_topic, produce_topics, clickhouse, bootstrap_servers, group_id); - servers.push_back(server); - } - - return servers; -} - -} // namespace logserver -} // namespace 
hamstring diff --git a/cpp/src/logserver/main.cpp b/cpp/src/logserver/main.cpp deleted file mode 100644 index a9e1334a..00000000 --- a/cpp/src/logserver/main.cpp +++ /dev/null @@ -1,85 +0,0 @@ -#include "hamstring/base/logger.hpp" -#include "hamstring/config/config.hpp" -#include "hamstring/logserver/logserver.hpp" -#include -#include -#include -#include - -using namespace hamstring; - -// Global vector of servers for signal handling -std::vector> g_servers; - -void signal_handler(int signum) { - auto logger = base::Logger::get_logger("main"); - logger->info("Received signal {}, shutting down...", signum); - - for (auto &server : g_servers) { - server->stop(); - } -} - -int main(int argc, char **argv) { - // Parse arguments - std::string config_path = (argc > 1) ? argv[1] : "../../config.yaml"; - - // Initialize logging - base::Logger::initialize(true); // debug mode - auto logger = base::Logger::get_logger("main"); - - logger->info("HAMSTRING LogServer"); - logger->info("=================="); - - try { - // Load configuration - logger->info("Loading configuration from: {}", config_path); - auto config = config::Config::load_from_file(config_path); - - logger->info("Configuration loaded successfully"); - logger->info("Kafka brokers: {}", config->environment.kafka_brokers.size()); - logger->info("Collectors: {}", config->pipeline.collectors.size()); - - // Create LogServer instances - g_servers = logserver::create_logservers(config); - - logger->info("Created {} LogServer instances", g_servers.size()); - - // Set up signal handlers for graceful shutdown - std::signal(SIGINT, signal_handler); - std::signal(SIGTERM, signal_handler); - - // Start all servers - for (auto &server : g_servers) { - server->start(); - } - - logger->info("All LogServers started"); - logger->info("Press Ctrl+C to stop"); - - // Wait for servers to finish (they run in background threads) - while (true) { - bool any_running = false; - for (const auto &server : g_servers) { - if (server->is_running()) { - any_running = true; - break; - } - } - - if (!any_running) { - break; - } - - std::this_thread::sleep_for(std::chrono::seconds(1)); - } - - logger->info("All LogServers stopped"); - - } catch (const std::exception &e) { - logger->error("Fatal error: {}", e.what()); - return 1; - } - - return 0; -} diff --git a/cpp/src/prefilter/CMakeLists.txt b/cpp/src/prefilter/CMakeLists.txt deleted file mode 100644 index ce3e2993..00000000 --- a/cpp/src/prefilter/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ -# Prefilter library -add_library(hamstring_prefilter - prefilter.cpp -) - -target_link_libraries(hamstring_prefilter - PUBLIC - hamstring_base -) - -# Prefilter executable -add_executable(prefilter - main.cpp -) - -target_link_libraries(prefilter - PRIVATE - hamstring_prefilter - hamstring_base -) diff --git a/cpp/src/prefilter/main.cpp b/cpp/src/prefilter/main.cpp deleted file mode 100644 index 4985d1cb..00000000 --- a/cpp/src/prefilter/main.cpp +++ /dev/null @@ -1,94 +0,0 @@ -#include "hamstring/base/logger.hpp" -#include "hamstring/config/config.hpp" -#include "hamstring/prefilter/prefilter.hpp" -#include -#include -#include - -using namespace hamstring; - -std::vector> prefilters; -volatile sig_atomic_t shutdown_requested = 0; - -void signal_handler(int signum) { - std::cout << std::endl; - auto logger = base::Logger::get_logger("main"); - logger->info("Received signal {}, shutting down...", signum); - shutdown_requested = 1; - - // Stop all prefilters - for (auto &pf : prefilters) { - pf->stop(); - } -} - -int main(int 
argc, char *argv[]) { - // Set up signal handlers - std::signal(SIGINT, signal_handler); - std::signal(SIGTERM, signal_handler); - - auto logger = base::Logger::get_logger("main"); - - logger->info("╔════════════════════════════════════════╗"); - logger->info("║ HAMSTRING Prefilter (C++) ║"); - logger->info("╚════════════════════════════════════════╝"); - logger->info(""); - - // Load configuration - std::string config_path = (argc > 1) ? argv[1] : "../config.yaml"; - logger->info("Loading configuration from: {}", config_path); - - std::shared_ptr config; - try { - config = config::Config::load_from_file(config_path); - logger->info("Configuration loaded successfully"); - logger->info(" Prefilters: {}", config->pipeline.prefilters.size()); - logger->info(" Inspectors: {}", config->pipeline.inspectors.size()); - logger->info(" Kafka brokers: {}", - config->environment.kafka_brokers.size()); - logger->info(""); - } catch (const std::exception &e) { - logger->error("Failed to load configuration: {}", e.what()); - return 1; - } - - // Create prefilters - try { - prefilters = prefilter::create_prefilters(config); - logger->info("Created {} Prefilter instances", prefilters.size()); - logger->info(""); - } catch (const std::exception &e) { - logger->error("Failed to create prefilters: {}", e.what()); - return 1; - } - - // Start all prefilters - for (auto &pf : prefilters) { - pf->start(); - } - - logger->info("All Prefilters started"); - logger->info("Press Ctrl+C to stop"); - logger->info(""); - - // Report statistics periodically - while (!shutdown_requested) { - std::this_thread::sleep_for(std::chrono::seconds(10)); - - if (!shutdown_requested) { - logger->info("=== Prefilter Statistics ==="); - for (const auto &pf : prefilters) { - auto stats = pf->get_stats(); - logger->info("Prefilter stats:"); - logger->info(" Batches consumed: {}", stats.batches_consumed); - logger->info(" Batches sent: {}", stats.batches_sent); - logger->info(" Loglines received: {}", stats.loglines_received); - logger->info(" Loglines filtered: {}", stats.loglines_filtered); - logger->info(" Loglines sent: {}", stats.loglines_sent); - } - } - } - - logger->info("Shutdown complete"); - return 0; -} diff --git a/cpp/src/prefilter/prefilter.cpp b/cpp/src/prefilter/prefilter.cpp deleted file mode 100644 index e18b67ce..00000000 --- a/cpp/src/prefilter/prefilter.cpp +++ /dev/null @@ -1,289 +0,0 @@ -#include "hamstring/prefilter/prefilter.hpp" -#include "hamstring/base/utils.hpp" -#include - -using json = nlohmann::json; - -namespace hamstring { -namespace prefilter { - -Prefilter::Prefilter(const std::string &name, const std::string &consume_topic, - const std::vector &produce_topics, - const std::string &relevance_function, - const std::vector &validation_config, - std::shared_ptr config, - const std::string &bootstrap_servers, - const std::string &group_id) - : name_(name), consume_topic_(consume_topic), - produce_topics_(produce_topics), relevance_function_(relevance_function), - validation_config_(validation_config), config_(config) { - logger_ = base::Logger::get_logger("prefilter." 
+ name_); - - // Create Kafka consumer - consumer_ = std::make_unique( - bootstrap_servers, group_id, std::vector{consume_topic_}); - - // Create Kafka producers for each output topic - for (const auto &topic : produce_topics_) { - auto producer = - std::make_unique(bootstrap_servers, topic); - producers_.push_back(std::move(producer)); - } - - // Create ClickHouse sender for monitoring - clickhouse_ = std::make_shared( - config_->environment.clickhouse_hostname, 9000, "hamstring", "default", - ""); - - logger_->info("Prefilter '{}' created", name_); - logger_->info(" Relevance function: {}", relevance_function_); - logger_->info(" Consume topic: {}", consume_topic_); - logger_->info(" Produce topics: {}", produce_topics_.size()); -} - -Prefilter::~Prefilter() { stop(); } - -void Prefilter::start() { - if (running_) { - logger_->warn("Prefilter already running"); - return; - } - - running_ = true; - - logger_->info("Prefilter '{}' started", name_); - logger_->info(" ⤷ receiving on Kafka topic '{}'", consume_topic_); - logger_->info(" ⤷ sending to {} topics", produce_topics_.size()); - for (const auto &topic : produce_topics_) { - logger_->info(" - {}", topic); - } - - // Start worker thread - worker_thread_ = std::thread(&Prefilter::consume_loop, this); -} - -void Prefilter::stop() { - if (!running_) { - return; - } - - logger_->info("Stopping Prefilter '{}'...", name_); - running_ = false; - - if (worker_thread_.joinable()) { - worker_thread_.join(); - } - - // Flush all producers - for (auto &producer : producers_) { - producer->flush(); - } - - logger_->info("Prefilter '{}' stopped", name_); - logger_->info(" Batches consumed: {}", batches_consumed_.load()); - logger_->info(" Batches sent: {}", batches_sent_.load()); - logger_->info(" Loglines received: {}", loglines_received_.load()); - logger_->info(" Loglines filtered: {}", loglines_filtered_.load()); - logger_->info(" Loglines sent: {}", loglines_sent_.load()); -} - -void Prefilter::consume_loop() { - logger_->info("Consumer loop started"); - - while (running_) { - try { - // Poll for messages with 1 second timeout - consumer_->poll( - [this](const std::string &topic, const std::string &key, - const std::string &value, int64_t timestamp) { - if (!running_) - return; - - logger_->trace("Consumed batch from topic '{}'", topic); - - try { - // Parse batch JSON - auto batch_ptr = base::Batch::from_json(value); - - logger_->debug("Received batch {} with {} loglines", - batch_ptr->batch_id, batch_ptr->loglines.size()); - - // Process the batch - process_batch(*batch_ptr); - - // Commit the offset - consumer_->commit(); - - } catch (const std::exception &e) { - logger_->error("Failed to process batch: {}", e.what()); - } - }, - 1000); // 1 second timeout - - } catch (const std::exception &e) { - logger_->error("Error in consumer loop: {}", e.what()); - std::this_thread::sleep_for(std::chrono::seconds(1)); - } - } - - logger_->info("Consumer loop stopped"); -} - -void Prefilter::process_batch(const base::Batch &batch) { - batches_consumed_++; - loglines_received_ += batch.loglines.size(); - - // Create filtered batch - base::Batch filtered_batch; - filtered_batch.batch_id = batch.batch_id; - filtered_batch.subnet_id = batch.subnet_id; - filtered_batch.collector_name = batch.collector_name; - filtered_batch.created_at = batch.created_at; - filtered_batch.timestamp_in = std::chrono::system_clock::now(); - - // Filter log lines based on relevance - for (const auto &logline_ptr : batch.loglines) { - if (check_relevance(*logline_ptr)) { - 
filtered_batch.loglines.push_back(logline_ptr); - } else { - loglines_filtered_++; - } - } - - if (!filtered_batch.loglines.empty()) { - logger_->info("Filtered batch {}: {} → {} loglines", batch.batch_id, - batch.loglines.size(), filtered_batch.loglines.size()); - - loglines_sent_ += filtered_batch.loglines.size(); - send_batch(filtered_batch); - } else { - logger_->debug("Batch {} completely filtered out", batch.batch_id); - } -} - -bool Prefilter::check_relevance(const base::LogLine &logline) { - // TODO: Implement actual relevance checking based on relevance_function_ - // For now, use a simple heuristic - - if (relevance_function_ == "always_relevant") { - return true; - } - - if (relevance_function_ == "check_nxdomain") { - // Check if this is an NXDOMAIN response (common DGA indicator) - auto it = logline.fields.find("rcode"); - if (it != logline.fields.end()) { - return it->second == "NXDOMAIN" || it->second == "3"; - } - } - - if (relevance_function_ == "check_query_length") { - // Check if query domain is suspiciously long (possible DGA) - auto it = logline.fields.find("query"); - if (it != logline.fields.end()) { - return it->second.length() > 20; // Heuristic threshold - } - } - - // Default: pass through - return true; -} - -void Prefilter::send_batch(const base::Batch &batch) { - std::string batch_json = batch.to_json(); - - // Send to all output topics - for (size_t i = 0; i < producers_.size(); ++i) { - try { - producers_[i]->send(batch.batch_id, batch_json); - logger_->trace("Sent batch {} to topic {}", batch.batch_id, - produce_topics_[i]); - } catch (const std::exception &e) { - logger_->error("Failed to send batch to {}: {}", produce_topics_[i], - e.what()); - } - } - - batches_sent_++; -} - -Prefilter::Stats Prefilter::get_stats() const { - Stats stats; - stats.batches_consumed = batches_consumed_.load(); - stats.batches_sent = batches_sent_.load(); - stats.loglines_received = loglines_received_.load(); - stats.loglines_filtered = loglines_filtered_.load(); - stats.loglines_sent = loglines_sent_.load(); - return stats; -} - -std::vector> -create_prefilters(std::shared_ptr config) { - std::vector> prefilters; - auto logger = base::Logger::get_logger("prefilter.factory"); - - // Get topic prefixes - auto &env = config->environment; - std::string consume_prefix = - env.kafka_topics_prefix["batch_sender_to_prefilter"]; - std::string produce_prefix = - env.kafka_topics_prefix["prefilter_to_inspector"]; - - // Build bootstrap servers string - std::vector broker_addresses; - for (const auto &broker : env.kafka_brokers) { - broker_addresses.push_back(broker.hostname + ":" + - std::to_string(broker.internal_port)); - } - std::string bootstrap_servers; - for (size_t i = 0; i < broker_addresses.size(); ++i) { - if (i > 0) - bootstrap_servers += ","; - bootstrap_servers += broker_addresses[i]; - } - - logger->info("Creating Prefilters for {} prefilters", - config->pipeline.prefilters.size()); - logger->info("Kafka brokers: {}", bootstrap_servers); - - // Create one Prefilter per configured prefilter - for (const auto &prefilter_config : config->pipeline.prefilters) { - std::string consume_topic = consume_prefix + "-" + prefilter_config.name; - std::string group_id = "prefilter-" + prefilter_config.name; - - // Find all inspectors for this prefilter - std::vector produce_topics; - for (const auto &inspector : config->pipeline.inspectors) { - if (inspector.prefilter_name == prefilter_config.name) { - std::string topic = produce_prefix + "-" + inspector.name; - 
produce_topics.push_back(topic); - } - } - - // Get validation config from collector - std::vector validation_config; - for (const auto &collector : config->pipeline.collectors) { - if (collector.name == prefilter_config.collector_name) { - validation_config = collector.required_log_information; - break; - } - } - - logger->info("Creating Prefilter '{}'", prefilter_config.name); - logger->info(" Consume from: {}", consume_topic); - logger->info(" Produce to {} topics", produce_topics.size()); - logger->info(" Relevance function: {}", prefilter_config.relevance_method); - - auto prefilter = std::make_shared( - prefilter_config.name, consume_topic, produce_topics, - prefilter_config.relevance_method, validation_config, config, - bootstrap_servers, group_id); - - prefilters.push_back(prefilter); - } - - return prefilters; -} - -} // namespace prefilter -} // namespace hamstring diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt deleted file mode 100644 index 23596eec..00000000 --- a/cpp/tests/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -# Tests -add_subdirectory(base) -add_subdirectory(detector) -add_subdirectory(integration) diff --git a/cpp/tests/base/CMakeLists.txt b/cpp/tests/base/CMakeLists.txt deleted file mode 100644 index 79ed00b0..00000000 --- a/cpp/tests/base/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -# Placeholder for base tests -add_executable(test_utils - test_utils.cpp -) - -target_link_libraries(test_utils - PRIVATE - hamstring_base - GTest::gtest - GTest::gtest_main -) - -add_test(NAME UtilsTest COMMAND test_utils) diff --git a/cpp/tests/base/test_utils.cpp b/cpp/tests/base/test_utils.cpp deleted file mode 100644 index 5263571d..00000000 --- a/cpp/tests/base/test_utils.cpp +++ /dev/null @@ -1,61 +0,0 @@ -#include "hamstring/base/utils.hpp" -#include - -using namespace hamstring::base::utils; - -TEST(UtilsTest, UUIDGeneration) { - auto uuid1 = generate_uuid(); - auto uuid2 = generate_uuid(); - - // UUIDs should be different - EXPECT_NE(uuid1, uuid2); - - // UUID should have correct format (36 characters with dashes) - EXPECT_EQ(uuid1.length(), 36); -} - -TEST(UtilsTest, IPv4Validation) { - EXPECT_TRUE(is_valid_ipv4("192.168.1.1")); - EXPECT_TRUE(is_valid_ipv4("10.0.0.1")); - EXPECT_FALSE(is_valid_ipv4("256.1.1.1")); - EXPECT_FALSE(is_valid_ipv4("not.an.ip.address")); - EXPECT_FALSE(is_valid_ipv4("::1")); -} - -TEST(UtilsTest, IPv6Validation) { - EXPECT_TRUE(is_valid_ipv6("::1")); - EXPECT_TRUE(is_valid_ipv6("2001:db8::1")); - EXPECT_FALSE(is_valid_ipv6("192.168.1.1")); - EXPECT_FALSE(is_valid_ipv6("not:valid:ipv6")); -} - -TEST(UtilsTest, StringSplit) { - auto parts = split("a,b,c", ','); - ASSERT_EQ(parts.size(), 3); - EXPECT_EQ(parts[0], "a"); - EXPECT_EQ(parts[1], "b"); - EXPECT_EQ(parts[2], "c"); -} - -TEST(UtilsTest, StringJoin) { - std::vector parts = {"a", "b", "c"}; - auto joined = join(parts, ","); - EXPECT_EQ(joined, "a,b,c"); -} - -TEST(UtilsTest, StringTrim) { - EXPECT_EQ(trim(" hello "), "hello"); - EXPECT_EQ(trim("hello"), "hello"); - EXPECT_EQ(trim(" "), ""); -} - -TEST(UtilsTest, DomainExtraction) { - EXPECT_EQ(extract_fqdn("www.example.com"), "www.example.com"); - EXPECT_EQ(extract_second_level_domain("www.example.com"), "example.com"); - EXPECT_EQ(extract_third_level_domain("www.example.com"), "www"); -} - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/cpp/tests/detector/CMakeLists.txt b/cpp/tests/detector/CMakeLists.txt deleted file mode 100644 index ceab9ee1..00000000 
--- a/cpp/tests/detector/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -# Detector tests -add_executable(test_feature_extractor - test_feature_extractor.cpp -) - -target_link_libraries(test_feature_extractor - PRIVATE - hamstring_detector - hamstring_base - GTest::gtest - GTest::gtest_main -) - -add_test(NAME FeatureExtractorTest COMMAND test_feature_extractor) diff --git a/cpp/tests/detector/test_feature_extractor.cpp b/cpp/tests/detector/test_feature_extractor.cpp deleted file mode 100644 index c53067bf..00000000 --- a/cpp/tests/detector/test_feature_extractor.cpp +++ /dev/null @@ -1,129 +0,0 @@ -#include "hamstring/detector/feature_extractor.hpp" -#include -#include - -using namespace hamstring::detector; - -class FeatureExtractorTest : public ::testing::Test { -protected: - FeatureExtractor extractor; -}; - -TEST_F(FeatureExtractorTest, BasicDomainExtraction) { - std::string domain = "example.com"; - auto features = extractor.extract(domain); - - // Check label count - EXPECT_EQ(features.label_length, 2); - - // Check that features are populated - EXPECT_GT(features.fqdn_entropy, 0.0); - EXPECT_GT(features.secondleveldomain_entropy, 0.0); -} - -TEST_F(FeatureExtractorTest, SubdomainExtraction) { - std::string domain = "www.example.com"; - auto features = extractor.extract(domain); - - // Check label count - EXPECT_EQ(features.label_length, 3); - - // Check third level domain is extracted - EXPECT_GT(features.thirdleveldomain_entropy, 0.0); -} - -TEST_F(FeatureExtractorTest, CharacterFrequency) { - std::string domain = "aaa.com"; - auto features = extractor.extract(domain); - - // 'a' appears 3 times out of 7 characters - EXPECT_NEAR(features.char_freq['a'], 3.0 / 7.0, 0.01); - - // 'c' appears 1 time out of 7 characters - EXPECT_NEAR(features.char_freq['c'], 1.0 / 7.0, 0.01); -} - -TEST_F(FeatureExtractorTest, AlphaNumericRatios) { - std::string domain = "test123.com"; - auto features = extractor.extract(domain); - - // Should have both alpha and numeric characters - EXPECT_GT(features.fqdn_alpha_count, 0.0); - EXPECT_GT(features.fqdn_numeric_count, 0.0); - - // Alpha + numeric should be close to 1.0 (only dot is special) - EXPECT_NEAR(features.fqdn_alpha_count + features.fqdn_numeric_count, - 1.0 - features.fqdn_special_count, 0.01); -} - -TEST_F(FeatureExtractorTest, EntropyCalculation) { - // Domain with all same characters should have low entropy - std::string low_entropy = "aaaa.com"; - auto features_low = extractor.extract(low_entropy); - - // Domain with varied characters should have higher entropy - std::string high_entropy = "abcdefgh.com"; - auto features_high = extractor.extract(high_entropy); - - EXPECT_LT(features_low.fqdn_entropy, features_high.fqdn_entropy); -} - -TEST_F(FeatureExtractorTest, DGALikeDomain) { - // Typical DGA domain: random characters, high entropy - std::string dga_domain = "xjk3n2m9pq.com"; - auto features = extractor.extract(dga_domain); - - // DGA domains typically have: - // - High entropy - // - Mix of alpha and numeric - EXPECT_GT(features.fqdn_entropy, 2.0); - EXPECT_GT(features.fqdn_alpha_count, 0.5); -} - -TEST_F(FeatureExtractorTest, FeatureVectorSize) { - std::string domain = "test.example.com"; - auto features = extractor.extract(domain); - auto vec = features.to_vector(); - - // Should have 44 features: - // 3 (label stats) + 26 (char freq) + 12 (domain level counts) + 3 (entropy) - EXPECT_EQ(vec.size(), 44); -} - -TEST_F(FeatureExtractorTest, FeatureNames) { - auto names = DomainFeatures::get_feature_names(); - - // Should match vector 
size - EXPECT_EQ(names.size(), 44); - - // Check some expected names - EXPECT_EQ(names[0], "label_length"); - EXPECT_EQ(names[1], "label_max"); - EXPECT_EQ(names[2], "label_average"); - EXPECT_EQ(names[3], "freq_a"); -} - -TEST_F(FeatureExtractorTest, EmptyDomain) { - std::string domain = ""; - auto features = extractor.extract(domain); - - // Empty domain should have zero features - EXPECT_EQ(features.label_length, 0); - EXPECT_EQ(features.fqdn_entropy, 0.0); -} - -TEST_F(FeatureExtractorTest, SingleLabel) { - std::string domain = "localhost"; - auto features = extractor.extract(domain); - - // Single label domain - EXPECT_EQ(features.label_length, 1); - - // Should have FQDN features but not second/third level - EXPECT_GT(features.fqdn_entropy, 0.0); -} - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/cpp/tests/integration/CMakeLists.txt b/cpp/tests/integration/CMakeLists.txt deleted file mode 100644 index 916454e9..00000000 --- a/cpp/tests/integration/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -# Placeholder for integration tests -add_executable(test_pipeline - test_pipeline.cpp -) - -target_link_libraries(test_pipeline - PRIVATE - hamstring_base - GTest::gtest - GTest::gtest_main -) - -add_test(NAME PipelineTest COMMAND test_pipeline) diff --git a/cpp/tests/integration/test_pipeline.cpp b/cpp/tests/integration/test_pipeline.cpp deleted file mode 100644 index ddd8946f..00000000 --- a/cpp/tests/integration/test_pipeline.cpp +++ /dev/null @@ -1,9 +0,0 @@ -#include <gtest/gtest.h> - -// Placeholder integration test -TEST(PipelineTest, Placeholder) { EXPECT_TRUE(true); } - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/cpp/vcpkg.json b/cpp/vcpkg.json deleted file mode 100644 index 0cb83b2f..00000000 --- a/cpp/vcpkg.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "name": "hamstring", - "version": "1.0.0", - "description": "C++ implementation of HAMSTRING DGA detection pipeline", - "dependencies": [ - "yaml-cpp", - "librdkafka", - "clickhouse-cpp", - "boost-system", - "boost-thread", - "boost-asio", - "spdlog", - "fmt", - "nlohmann-json", - "openssl", - "gtest" - ] -} \ No newline at end of file diff --git a/CHANGELOG.md b/data/dgta/.gitkeep similarity index 100% rename from CHANGELOG.md rename to data/dgta/.gitkeep diff --git a/data/dgta/dgta_decode.py b/data/dgta/dgta_decode.py new file mode 100644 index 00000000..c57b184a --- /dev/null +++ b/data/dgta/dgta_decode.py @@ -0,0 +1,17 @@ +import pandas as pd +import polars as pl + + +def custom_decode(data): + retL = [None] * len(data) + for i, datum in enumerate(data): + retL[i] = str(datum.decode("latin-1").encode("utf-8").decode("utf-8")) + + return pl.Series(retL) + + +if __name__ == "__main__": + df_dgta = pl.read_parquet("./dgta-benchmark.parquet") + df_dgta = df_dgta.rename({"domain": "query"}) + df_dgta = df_dgta.with_columns([pl.col("query").map(custom_decode)]) + df_dgta.write_csv("dgta.csv") diff --git a/data/test_pcaps/cic-ids-2017-sample.pcap2 b/data/test_pcaps/cic-ids-2017-sample.pcap2 deleted file mode 100755 index 9f2ee3b3..00000000 Binary files a/data/test_pcaps/cic-ids-2017-sample.pcap2 and /dev/null differ diff --git a/data/test_pcaps/unsw-sample.pcap2 b/data/test_pcaps/unsw-sample.pcap2 deleted file mode 100755 index 7054f76e..00000000 Binary files a/data/test_pcaps/unsw-sample.pcap2 and /dev/null differ diff --git a/docker/datatests.json b/docker/datatests.json deleted file mode 100644 index
bb7c1c94..00000000 --- a/docker/datatests.json +++ /dev/null @@ -1,1175 +0,0 @@ -{ - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "grafana", - "uid": "-- Grafana --" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 0, - "links": [], - "panels": [ - { - "datasource": { - "default": false, - "type": "grafana-clickhouse-datasource", - "uid": "PDEE91DDB90597936" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "continuous-GrYlRd" - }, - "fieldMinMax": false, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percentunit" - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 4, - "x": 0, - "y": 0 - }, - "id": 9, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2+security-01", - "targets": [ - { - "datasource": { - "type": "grafana-clickhouse-datasource", - "uid": "PDEE91DDB90597936" - }, - "editorType": "sql", - "format": 1, - "hide": true, - "meta": { - "builderOptions": { - "columns": [], - "database": "", - "limit": 1000, - "mode": "list", - "queryType": "table", - "table": "" - } - }, - "pluginVersion": "4.5.1", - "queryType": "table", - "rawSql": "SELECT count(DISTINCT client_ip) AS malicious_detected\nFROM dns_loglines\nWHERE client_ip IN (\n SELECT DISTINCT client_ip\n FROM alerts\n)\nAND client_ip IN (\n SELECT DISTINCT client_ip\n FROM dgta_dataset\n INNER JOIN dns_loglines ON dgta_dataset.query = JSONExtractString(dns_loglines.additional_fields, 'domain_name')\n WHERE class = 1\n)", - "refId": "malicious_detected" - }, - { - "datasource": { - "type": "grafana-clickhouse-datasource", - "uid": "PDEE91DDB90597936" - }, - "editorType": "sql", - "format": 1, - "hide": true, - "meta": { - "builderOptions": { - "columns": [], - "database": "", - "limit": 1000, - "mode": "list", - "queryType": "table", - "table": "" - } - }, - "pluginVersion": "4.5.1", - "queryType": "table", - "rawSql": "SELECT count(DISTINCT client_ip) AS benign_detected\nFROM dns_loglines\nWHERE client_ip IN (\n SELECT DISTINCT client_ip\n FROM alerts\n)\nAND client_ip NOT IN (\n SELECT DISTINCT client_ip\n FROM dgta_dataset\n INNER JOIN dns_loglines ON dgta_dataset.query = JSONExtractString(dns_loglines.additional_fields, 'domain_name')\n WHERE class = 1\n)", - "refId": "benign_detected" - }, - { - "datasource": { - "name": "Expression", - "type": "__expr__", - "uid": "__expr__" - }, - "expression": "$malicious_detected / ($benign_detected + $malicious_detected)", - "hide": false, - "refId": "A", - "type": "math" - } - ], - "title": "True Positive Rate", - "type": "stat" - }, - { - "datasource": { - "default": false, - "type": "grafana-clickhouse-datasource", - "uid": "PDEE91DDB90597936" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "continuous-RdYlGr" - }, - "fieldMinMax": false, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percentunit" - }, - 
"overrides": [] - }, - "gridPos": { - "h": 3, - "w": 4, - "x": 4, - "y": 0 - }, - "id": 10, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2+security-01", - "targets": [ - { - "datasource": { - "type": "grafana-clickhouse-datasource", - "uid": "PDEE91DDB90597936" - }, - "editorType": "sql", - "format": 1, - "hide": true, - "meta": { - "builderOptions": { - "columns": [], - "database": "", - "limit": 1000, - "mode": "list", - "queryType": "table", - "table": "" - } - }, - "pluginVersion": "4.5.1", - "queryType": "table", - "rawSql": "SELECT count(DISTINCT client_ip) AS malicious_detected\nFROM dns_loglines\nWHERE client_ip IN (\n SELECT DISTINCT client_ip\n FROM alerts\n)\nAND client_ip IN (\n SELECT DISTINCT client_ip\n FROM dgta_dataset\n INNER JOIN dns_loglines ON dgta_dataset.query = JSONExtractString(dns_loglines.additional_fields, 'domain_name')\n WHERE class = 1\n)", - "refId": "malicious_detected" - }, - { - "datasource": { - "type": "grafana-clickhouse-datasource", - "uid": "PDEE91DDB90597936" - }, - "editorType": "sql", - "format": 1, - "hide": true, - "meta": { - "builderOptions": { - "columns": [], - "database": "", - "limit": 1000, - "mode": "list", - "queryType": "table", - "table": "" - } - }, - "pluginVersion": "4.5.1", - "queryType": "table", - "rawSql": "SELECT count(DISTINCT client_ip) AS benign_detected\nFROM dns_loglines\nWHERE client_ip IN (\n SELECT DISTINCT client_ip\n FROM alerts\n)\nAND client_ip NOT IN (\n SELECT DISTINCT client_ip\n FROM dgta_dataset\n INNER JOIN dns_loglines ON dgta_dataset.query = JSONExtractString(dns_loglines.additional_fields, 'domain_name')\n WHERE class = 1\n)", - "refId": "benign_detected" - }, - { - "datasource": { - "name": "Expression", - "type": "__expr__", - "uid": "__expr__" - }, - "expression": "$benign_detected / ($benign_detected + $malicious_detected)", - "hide": false, - "refId": "A", - "type": "math" - } - ], - "title": "False Positive Rate", - "type": "stat" - }, - { - "datasource": { - "default": false, - "type": "grafana-clickhouse-datasource", - "uid": "PDEE91DDB90597936" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "fixedColor": "green", - "mode": "shades" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 4, - "x": 9, - "y": 0 - }, - "id": 8, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2+security-01", - "targets": [ - { - "datasource": { - "type": "grafana-clickhouse-datasource", - "uid": "PDEE91DDB90597936" - }, - "editorType": "sql", - "format": 1, - "hide": false, - "meta": { - "builderOptions": { - "columns": [], - "database": "", - "limit": 1000, - "mode": "list", - "queryType": "table", - "table": "" - } - }, - "pluginVersion": "4.5.1", - "queryType": "table", - "rawSql": "SELECT count(DISTINCT client_ip) AS 
malicious_detected\nFROM dns_loglines\nWHERE client_ip IN (\n SELECT DISTINCT client_ip\n FROM alerts\n)\nAND client_ip IN (\n SELECT DISTINCT client_ip\n FROM dgta_dataset\n INNER JOIN dns_loglines ON dgta_dataset.query = JSONExtractString(dns_loglines.additional_fields, 'domain_name')\n WHERE class = 1\n)", - "refId": "B" - } - ], - "title": "malicious/detected", - "type": "stat" - }, - { - "datasource": { - "default": false, - "type": "grafana-clickhouse-datasource", - "uid": "PDEE91DDB90597936" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "fixedColor": "red", - "mode": "shades" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 4, - "x": 13, - "y": 0 - }, - "id": 6, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2+security-01", - "targets": [ - { - "datasource": { - "type": "grafana-clickhouse-datasource", - "uid": "PDEE91DDB90597936" - }, - "editorType": "sql", - "format": 1, - "hide": false, - "meta": { - "builderOptions": { - "columns": [], - "database": "", - "limit": 1000, - "mode": "list", - "queryType": "table", - "table": "" - } - }, - "pluginVersion": "4.5.1", - "queryType": "table", - "rawSql": "SELECT count(DISTINCT client_ip) AS benign_detected\nFROM dns_loglines\nWHERE client_ip IN (\n SELECT DISTINCT client_ip\n FROM alerts\n)\nAND client_ip NOT IN (\n SELECT DISTINCT client_ip\n FROM dgta_dataset\n INNER JOIN dns_loglines ON dgta_dataset.query = JSONExtractString(dns_loglines.additional_fields, 'domain_name')\n WHERE class = 1\n)", - "refId": "B" - } - ], - "title": "benign/detected", - "type": "stat" - }, - { - "datasource": { - "type": "datasource", - "uid": "grafana" - }, - "gridPos": { - "h": 7, - "w": 5, - "x": 19, - "y": 0 - }, - "id": 27, - "options": { - "folderUID": "", - "includeVars": true, - "keepTime": true, - "maxItems": 10, - "query": "", - "showFolderNames": false, - "showHeadings": false, - "showRecentlyViewed": false, - "showSearch": true, - "showStarred": false, - "tags": [] - }, - "pluginVersion": "11.2.2+security-01", - "title": "Dashboards", - "transparent": true, - "type": "dashlist" - }, - { - "datasource": { - "default": false, - "type": "grafana-clickhouse-datasource", - "uid": "PDEE91DDB90597936" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "continuous-RdYlGr" - }, - "fieldMinMax": false, - "mappings": [], - "noValue": "-", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percentunit" - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 4, - "x": 0, - "y": 3 - }, - "id": 12, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2+security-01", - "targets": [ - { - "datasource": { - "type": "grafana-clickhouse-datasource", 
- "uid": "PDEE91DDB90597936" - }, - "editorType": "sql", - "format": 1, - "hide": true, - "meta": { - "builderOptions": { - "columns": [], - "database": "", - "limit": 1000, - "mode": "list", - "queryType": "table", - "table": "" - } - }, - "pluginVersion": "4.5.1", - "queryType": "table", - "rawSql": "SELECT count(DISTINCT client_ip) AS malicious_notdetected\nFROM dns_loglines\nWHERE client_ip IN (\n SELECT DISTINCT client_ip\n FROM dgta_dataset\n INNER JOIN dns_loglines ON dgta_dataset.query = JSONExtractString(dns_loglines.additional_fields, 'domain_name')\n WHERE class = 1\n)\nAND client_ip NOT IN (\n SELECT DISTINCT client_ip\n FROM alerts\n)", - "refId": "malicious_notdetected" - }, - { - "datasource": { - "type": "grafana-clickhouse-datasource", - "uid": "PDEE91DDB90597936" - }, - "editorType": "sql", - "format": 1, - "hide": true, - "meta": { - "builderOptions": { - "columns": [], - "database": "", - "limit": 1000, - "mode": "list", - "queryType": "table", - "table": "" - } - }, - "pluginVersion": "4.5.1", - "queryType": "table", - "rawSql": "SELECT count(DISTINCT client_ip) AS benign_notdetected\nFROM dns_loglines\nWHERE client_ip NOT IN (\n SELECT DISTINCT client_ip\n FROM dgta_dataset\n INNER JOIN dns_loglines ON dgta_dataset.query = JSONExtractString(dns_loglines.additional_fields, 'domain_name')\n WHERE class = 1\n)\nAND client_ip NOT IN (\n SELECT DISTINCT client_ip\n FROM alerts\n)", - "refId": "benign_notdetected" - }, - { - "datasource": { - "name": "Expression", - "type": "__expr__", - "uid": "__expr__" - }, - "expression": "$malicious_notdetected / ($benign_notdetected + $malicious_notdetected)", - "hide": false, - "refId": "A", - "type": "math" - } - ], - "title": "False Negative Rate", - "type": "stat" - }, - { - "datasource": { - "default": false, - "type": "grafana-clickhouse-datasource", - "uid": "PDEE91DDB90597936" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "continuous-GrYlRd" - }, - "fieldMinMax": false, - "mappings": [], - "noValue": "-", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percentunit" - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 4, - "x": 4, - "y": 3 - }, - "id": 11, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2+security-01", - "targets": [ - { - "datasource": { - "type": "grafana-clickhouse-datasource", - "uid": "PDEE91DDB90597936" - }, - "editorType": "sql", - "format": 1, - "hide": true, - "meta": { - "builderOptions": { - "columns": [], - "database": "", - "limit": 1000, - "mode": "list", - "queryType": "table", - "table": "" - } - }, - "pluginVersion": "4.5.1", - "queryType": "table", - "rawSql": "SELECT count(DISTINCT client_ip) AS malicious_notdetected\nFROM dns_loglines\nWHERE client_ip IN (\n SELECT DISTINCT client_ip\n FROM dgta_dataset\n INNER JOIN dns_loglines ON dgta_dataset.query = JSONExtractString(dns_loglines.additional_fields, 'domain_name')\n WHERE class = 1\n)\nAND client_ip NOT IN (\n SELECT DISTINCT client_ip\n FROM alerts\n)", - "refId": "malicious_notdetected" - }, - { - "datasource": { - "type": "grafana-clickhouse-datasource", - "uid": "PDEE91DDB90597936" - }, - "editorType": "sql", 
- "format": 1, - "hide": true, - "meta": { - "builderOptions": { - "columns": [], - "database": "", - "limit": 1000, - "mode": "list", - "queryType": "table", - "table": "" - } - }, - "pluginVersion": "4.5.1", - "queryType": "table", - "rawSql": "SELECT count(DISTINCT client_ip) AS benign_notdetected\nFROM dns_loglines\nWHERE client_ip NOT IN (\n SELECT DISTINCT client_ip\n FROM dgta_dataset\n INNER JOIN dns_loglines ON dgta_dataset.query = JSONExtractString(dns_loglines.additional_fields, 'domain_name')\n WHERE class = 1\n)\nAND client_ip NOT IN (\n SELECT DISTINCT client_ip\n FROM alerts\n)", - "refId": "benign_notdetected" - }, - { - "datasource": { - "name": "Expression", - "type": "__expr__", - "uid": "__expr__" - }, - "expression": "$benign_notdetected / ($benign_notdetected + $malicious_notdetected)", - "hide": false, - "refId": "A", - "type": "math" - } - ], - "title": "True Negative Rate", - "type": "stat" - }, - { - "datasource": { - "default": false, - "type": "grafana-clickhouse-datasource", - "uid": "PDEE91DDB90597936" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "fixedColor": "red", - "mode": "shades" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 4, - "x": 9, - "y": 3 - }, - "id": 7, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2+security-01", - "targets": [ - { - "datasource": { - "type": "grafana-clickhouse-datasource", - "uid": "PDEE91DDB90597936" - }, - "editorType": "sql", - "format": 1, - "hide": false, - "meta": { - "builderOptions": { - "columns": [], - "database": "", - "limit": 1000, - "mode": "list", - "queryType": "table", - "table": "" - } - }, - "pluginVersion": "4.6.0", - "queryType": "table", - "rawSql": "SELECT count(DISTINCT client_ip) AS malicious_notdetected\nFROM dns_loglines\nWHERE client_ip IN (\n SELECT DISTINCT client_ip\n FROM dgta_dataset\n INNER JOIN dns_loglines ON dgta_dataset.query = JSONExtractString(dns_loglines.additional_fields, 'domain_name')\n WHERE class = 1\n)\nAND client_ip NOT IN (\n SELECT DISTINCT client_ip\n FROM alerts\n)", - "refId": "B" - } - ], - "title": "malicious/not detected", - "type": "stat" - }, - { - "datasource": { - "default": false, - "type": "grafana-clickhouse-datasource", - "uid": "PDEE91DDB90597936" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "fixedColor": "green", - "mode": "shades" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 4, - "x": 13, - "y": 3 - }, - "id": 5, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2+security-01", - "targets": [ - { - "datasource": { - "type": 
"grafana-clickhouse-datasource", - "uid": "PDEE91DDB90597936" - }, - "editorType": "sql", - "format": 1, - "hide": false, - "meta": { - "builderOptions": { - "columns": [], - "database": "", - "limit": 1000, - "mode": "list", - "queryType": "table", - "table": "" - } - }, - "pluginVersion": "4.5.1", - "queryType": "table", - "rawSql": "SELECT count(DISTINCT client_ip) AS benign_notdetected\nFROM dns_loglines\nWHERE client_ip NOT IN (\n SELECT DISTINCT client_ip\n FROM dgta_dataset\n INNER JOIN dns_loglines ON dgta_dataset.query = JSONExtractString(dns_loglines.additional_fields, 'domain_name')\n WHERE class = 1\n)\nAND client_ip NOT IN (\n SELECT DISTINCT client_ip\n FROM alerts\n)", - "refId": "B" - } - ], - "title": "benign/not detected", - "type": "stat" - }, - { - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 7 - }, - "id": 26, - "title": "Details", - "type": "row" - }, - { - "datasource": { - "default": false, - "type": "grafana-clickhouse-datasource", - "uid": "PDEE91DDB90597936" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "shades" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 8 - }, - "id": 23, - "options": { - "displayMode": "gradient", - "maxVizHeight": 300, - "minVizHeight": 16, - "minVizWidth": 8, - "namePlacement": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showUnfilled": true, - "sizing": "auto", - "valueMode": "color" - }, - "pluginVersion": "11.2.2+security-01", - "targets": [ - { - "datasource": { - "type": "grafana-clickhouse-datasource", - "uid": "PDEE91DDB90597936" - }, - "editorType": "sql", - "format": 1, - "meta": { - "builderOptions": { - "columns": [], - "database": "", - "limit": 1000, - "mode": "list", - "queryType": "table", - "table": "" - } - }, - "pluginVersion": "4.6.0", - "queryType": "table", - "rawSql": "SELECT 'benign' AS type, count(DISTINCT client_ip) AS number\nFROM dns_loglines\nWHERE client_ip NOT IN (\n SELECT DISTINCT client_ip\n FROM dgta_dataset\n INNER JOIN dns_loglines ON dgta_dataset.query = JSONExtractString(dns_loglines.additional_fields, 'domain_name')\n WHERE class = 1\n)\n\nUNION ALL\n\nSELECT 'malicious' AS type, count(DISTINCT client_ip) AS number\nFROM dns_loglines\nWHERE client_ip IN (\n SELECT DISTINCT client_ip\n FROM alerts\n)\n\nUNION ALL\n\nSELECT 'undetected' AS type, count(DISTINCT client_ip) AS number\nFROM dns_loglines\nWHERE client_ip NOT IN (\n SELECT DISTINCT client_ip\n FROM alerts\n)\n\nUNION ALL\n\nSELECT 'detected' AS type, count(DISTINCT client_ip) AS number\nFROM dns_loglines\nWHERE client_ip IN (\n SELECT DISTINCT client_ip\n FROM alerts\n)", - "refId": "A" - } - ], - "title": "Total number of logs", - "transformations": [ - { - "id": "rowsToFields", - "options": {} - }, - { - "id": "organize", - "options": { - "excludeByName": {}, - "includeByName": {}, - "indexByName": { - "benign": 2, - "detected": 1, - "malicious": 3, - "undetected": 0 - }, - "renameByName": {} - } - } - ], - "type": "bargauge" - }, - { - "datasource": { - "default": false, - "type": "grafana-clickhouse-datasource", - "uid": "PDEE91DDB90597936" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "custom": { - "align": "auto", - "cellOptions": { - "type": "auto" - }, - "inspect": false - }, - "mappings": [], - 
"thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 8 - }, - "id": 24, - "options": { - "cellHeight": "sm", - "footer": { - "countRows": false, - "fields": "", - "reducer": [ - "sum" - ], - "show": false - }, - "showHeader": true - }, - "pluginVersion": "11.2.2+security-01", - "targets": [ - { - "datasource": { - "type": "grafana-clickhouse-datasource", - "uid": "PDEE91DDB90597936" - }, - "editorType": "sql", - "format": 1, - "meta": { - "builderOptions": { - "columns": [], - "database": "", - "limit": 1000, - "mode": "list", - "queryType": "table", - "table": "" - } - }, - "pluginVersion": "4.6.0", - "queryType": "table", - "rawSql": "SELECT client_ip, JSONExtractString(additional_fields, 'domain_name') AS domain_name\nFROM dns_loglines\nWHERE client_ip IN (\n SELECT DISTINCT client_ip\n FROM alerts\n)\nAND client_ip NOT IN (\n SELECT DISTINCT client_ip\n FROM dgta_dataset\n INNER JOIN dns_loglines ON dgta_dataset.query = JSONExtractString(dns_loglines.additional_fields, 'domain_name')\n WHERE class = 1\n)\nLIMIT 100", - "refId": "A" - } - ], - "title": "False Positive IPs and domains", - "transformations": [ - { - "id": "organize", - "options": { - "excludeByName": {}, - "includeByName": {}, - "indexByName": {}, - "renameByName": { - "client_ip": "Client IP address", - "domain_name": "Domain used" - } - } - } - ], - "type": "table" - }, - { - "datasource": { - "default": false, - "type": "grafana-clickhouse-datasource", - "uid": "PDEE91DDB90597936" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "custom": { - "align": "auto", - "cellOptions": { - "type": "auto" - }, - "inspect": false - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 8 - }, - "id": 25, - "options": { - "cellHeight": "sm", - "footer": { - "countRows": false, - "fields": "", - "reducer": [ - "sum" - ], - "show": false - }, - "showHeader": true - }, - "pluginVersion": "11.2.2+security-01", - "targets": [ - { - "datasource": { - "type": "grafana-clickhouse-datasource", - "uid": "PDEE91DDB90597936" - }, - "editorType": "sql", - "format": 1, - "meta": { - "builderOptions": { - "columns": [], - "database": "", - "limit": 1000, - "mode": "list", - "queryType": "table", - "table": "" - } - }, - "pluginVersion": "4.6.0", - "queryType": "table", - "rawSql": "SELECT client_ip, JSONExtractString(additional_fields, 'domain_name') AS domain_name\nFROM dns_loglines\nWHERE client_ip NOT IN (\n SELECT DISTINCT client_ip\n FROM alerts\n)\nAND client_ip IN ( -- change to notin\n SELECT DISTINCT client_ip\n FROM dgta_dataset\n INNER JOIN dns_loglines ON dgta_dataset.query = JSONExtractString(dns_loglines.additional_fields, 'domain_name')\n WHERE class = 1\n)\nLIMIT 100", - "refId": "A" - } - ], - "title": "False Negative IPs and domains", - "transformations": [ - { - "id": "organize", - "options": { - "excludeByName": {}, - "includeByName": {}, - "indexByName": {}, - "renameByName": { - "client_ip": "Client IP address", - "domain_name": "Domain used" - } - } - } - ], - "type": "table" - } - ], - "refresh": "auto", - "schemaVersion": 39, - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - 
"timepicker": { - "hidden": true - }, - "timezone": "browser", - "title": "Data Tests", - "uid": "cea2xqic5klq8b", - "version": 6, - "weekStart": "" -} diff --git a/docker/default.txt b/docker/default.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/docker/docker-compose-swarm/docker-compose.swarm-kafka.yml b/docker/docker-compose-swarm/docker-compose.swarm-kafka.yml deleted file mode 100644 index 405d263b..00000000 --- a/docker/docker-compose-swarm/docker-compose.swarm-kafka.yml +++ /dev/null @@ -1,133 +0,0 @@ -services: - - zookeeper: - image: confluentinc/cp-zookeeper:7.9.3 - networks: - - hamstring - environment: - ZOOKEEPER_CLIENT_PORT: 2181 - ZOOKEEPER_TICK_TIME: 2000 - ZOOKEEPER_SERVER_ID: 1 - volumes: - - zk-data:/var/lib/zookeeper/data - - zk-txn-logs:/var/lib/zookeeper/log - healthcheck: - test: ["CMD-SHELL", "curl -s localhost:8080/commands | grep ruok"] - interval: 10s - timeout: 5s - retries: 3 - deploy: - placement: - constraints: [node.hostname == hamstring-1] - restart_policy: - condition: on-failure - - kafka1: - image: confluentinc/cp-kafka:7.9.3 - networks: - - hamstring - ports: - - "8097:8097" - environment: - KAFKA_BROKER_ID: 1 - KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 - KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: INTERNAL:PLAINTEXT,EXTERNAL:PLAINTEXT - KAFKA_ADVERTISED_LISTENERS: INTERNAL://kafka1:19092,EXTERNAL://${HOST_IP}:8097 - KAFKA_LISTENERS: INTERNAL://0.0.0.0:19092,EXTERNAL://0.0.0.0:8097 - KAFKA_INTER_BROKER_LISTENER_NAME: INTERNAL - KAFKA_AUTO_CREATE_TOPICS_ENABLE: "false" - KAFKA_LOG4J_LOGGERS: "kafka.controller=INFO,kafka.producer.async.DefaultEventHandler=INFO,state.change.logger=INFO" - KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 3 - KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 2 - KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 3 - KAFKA_AUTHORIZER_CLASS_NAME: kafka.security.authorizer.AclAuthorizer - KAFKA_ALLOW_EVERYONE_IF_NO_ACL_FOUND: "true" - volumes: - - kafka-data1:/var/lib/kafka/data - healthcheck: - test: ["CMD-SHELL", "nc -z localhost 19092"] - interval: 30s - timeout: 10s - retries: 5 - deploy: - placement: - constraints: [node.hostname == hamstring-1] - restart_policy: - condition: on-failure - - kafka2: - image: confluentinc/cp-kafka:7.9.3 - networks: - - hamstring - ports: - - "8098:8098" - environment: - KAFKA_BROKER_ID: 2 - KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 - KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: INTERNAL:PLAINTEXT,EXTERNAL:PLAINTEXT - KAFKA_ADVERTISED_LISTENERS: INTERNAL://kafka2:19093,EXTERNAL://${HOST_IP}:8098 - KAFKA_LISTENERS: INTERNAL://0.0.0.0:19093,EXTERNAL://0.0.0.0:8098 - KAFKA_INTER_BROKER_LISTENER_NAME: INTERNAL - KAFKA_AUTO_CREATE_TOPICS_ENABLE: "false" - KAFKA_LOG4J_LOGGERS: "kafka.controller=INFO,kafka.producer.async.DefaultEventHandler=INFO,state.change.logger=INFO" - KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 3 - KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 2 - KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 3 - KAFKA_AUTHORIZER_CLASS_NAME: kafka.security.authorizer.AclAuthorizer - KAFKA_ALLOW_EVERYONE_IF_NO_ACL_FOUND: "true" - volumes: - - kafka-data2:/var/lib/kafka/data - healthcheck: - test: ["CMD-SHELL", "nc -z localhost 19093"] - interval: 30s - timeout: 10s - retries: 5 - deploy: - placement: - constraints: [node.hostname == hamstring-1] - restart_policy: - condition: on-failure - - kafka3: - image: confluentinc/cp-kafka:7.9.3 - networks: - - hamstring - ports: - - "8099:8099" - environment: - KAFKA_BROKER_ID: 3 - KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 - KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: 
INTERNAL:PLAINTEXT,EXTERNAL:PLAINTEXT - KAFKA_ADVERTISED_LISTENERS: INTERNAL://kafka3:19094,EXTERNAL://${HOST_IP}:8099 - KAFKA_LISTENERS: INTERNAL://0.0.0.0:19094,EXTERNAL://0.0.0.0:8099 - KAFKA_INTER_BROKER_LISTENER_NAME: INTERNAL - KAFKA_AUTO_CREATE_TOPICS_ENABLE: "false" - KAFKA_LOG4J_LOGGERS: "kafka.controller=INFO,kafka.producer.async.DefaultEventHandler=INFO,state.change.logger=INFO" - KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 3 - KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 2 - KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 3 - KAFKA_AUTHORIZER_CLASS_NAME: kafka.security.authorizer.AclAuthorizer - KAFKA_ALLOW_EVERYONE_IF_NO_ACL_FOUND: "true" - volumes: - - kafka-data3:/var/lib/kafka/data - healthcheck: - test: ["CMD-SHELL", "nc -z localhost 19094"] - interval: 30s - timeout: 10s - retries: 5 - deploy: - placement: - constraints: [node.hostname == hamstring-1] - restart_policy: - condition: on-failure - -networks: - hamstring: - external: true - -volumes: - kafka-data1: - kafka-data2: - kafka-data3: - zk-data: - zk-txn-logs: diff --git a/docker/docker-compose-swarm/docker-compose.swarm-monitoring.yml b/docker/docker-compose-swarm/docker-compose.swarm-monitoring.yml deleted file mode 100644 index 5e183480..00000000 --- a/docker/docker-compose-swarm/docker-compose.swarm-monitoring.yml +++ /dev/null @@ -1,72 +0,0 @@ -services: - clickhouse-server: - image: clickhouse/clickhouse-server:24.3.12.75-alpine - volumes: - - ../create_tables:/docker-entrypoint-initdb.d - - ch_data:/var/lib/clickhouse/ - - ch_logs:/var/log/clickhouse-server/ - networks: - - hamstring - ports: - - "8123:8123" - - "9000:9000" -# healthcheck: -# test: [ "CMD-SHELL", "nc -z localhost 8123" ] -# interval: 10s -# timeout: 5s -# retries: 3 - deploy: - placement: - constraints: [ node.hostname == hamstring-3 ] - restart_policy: - condition: on-failure - - grafana: - image: grafana/grafana:11.2.2-security-01 - networks: - - hamstring - ports: - - "3000:3000" - volumes: - - ../grafana-provisioning/dashboards:/etc/grafana/provisioning/dashboards - - ../grafana-provisioning/dashboards/dashboards.yaml:/etc/grafana/provisioning/dashboards/dashboards.yaml - - ../grafana-provisioning/datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml - environment: - - GF_SECURITY_ADMIN_USER=admin - - GF_SECURITY_ADMIN_PASSWORD=admin - - GF_INSTALL_PLUGINS=grafana-clickhouse-datasource -# healthcheck: -# test: [ "CMD-SHELL", "nc -z localhost 3000" ] -# interval: 10s -# timeout: 5s -# retries: 3 - deploy: - placement: - constraints: [ node.hostname == hamstring-3 ] - restart_policy: - condition: on-failure - - monitoring_agent: - image: stefan96/hamstring-monitoring - networks: - - hamstring - environment: - - GROUP_ID=monitoring_agent - depends_on: - - kafka1 - - kafka2 - - kafka3 - - clickhouse-server - deploy: - placement: - constraints: [ node.hostname == hamstring-3 ] - restart_policy: - condition: on-failure - -networks: - hamstring: - external: true - -volumes: - ch_data: - ch_logs: diff --git a/docker/docker-compose-swarm/docker-compose.swarm-pipeline.yml b/docker/docker-compose-swarm/docker-compose.swarm-pipeline.yml deleted file mode 100644 index 5a68ff69..00000000 --- a/docker/docker-compose-swarm/docker-compose.swarm-pipeline.yml +++ /dev/null @@ -1,111 +0,0 @@ -services: - logserver: - image: stefan96/hamstring-logserver - networks: - - hamstring - deploy: -# resources: -# limits: -# cpus: '2' -# memory: 512m -# reservations: -# cpus: '1' -# memory: 256m - placement: - constraints: [ node.hostname == hamstring-2 ] - 
volumes: - - ../default.txt:/opt/file.txt - - ../../config.yaml:/app/config.yaml - environment: - - GROUP_ID=log_storage - - logcollector: - image: stefan96/hamstring-logcollector - networks: - - hamstring - deploy: -# resources: -# limits: -# cpus: '2' -# memory: 512m -# reservations: -# cpus: '1' -# memory: 256m - placement: - constraints: [ node.hostname == hamstring-2 ] - volumes: - - ../../config.yaml:/app/config.yaml - environment: - - GROUP_ID=log_collection - - prefilter: - image: stefan96/hamstring-prefilter - networks: - - hamstring - deploy: - mode: "replicated" - replicas: 1 -# resources: -# limits: -# cpus: '2' -# memory: 512m -# reservations: -# cpus: '1' -# memory: 256m - placement: - constraints: [ node.hostname == hamstring-2 ] - volumes: - - ../../config.yaml:/app/config.yaml - environment: - - GROUP_ID=log_filtering - - inspector: - image: stefan96/hamstring-inspector - networks: - - hamstring - deploy: - mode: "replicated" - replicas: 1 -# resources: -# limits: -# cpus: '2' -# memory: 512m -# reservations: -# cpus: '1' -# memory: 256m - placement: - constraints: [ node.hostname == hamstring-2 ] - volumes: - - ../../config.yaml:/app/config.yaml - environment: - - GROUP_ID=data_inspection - - NUMBER_OF_INSTANCES=1 - - detector: - image: stefan96/hamstring-detector - networks: - - hamstring - deploy: - mode: "replicated" - replicas: 1 -# resources: -# limits: -# cpus: '2' -# memory: 512m -# reservations: -# cpus: '1' -# memory: 256m -# generic_resources: -# - discrete_resource_spec: -# kind: 'gpu' -# value: 1 - placement: - constraints: [ node.hostname == hamstring-2 ] - volumes: - - ../../config.yaml:/app/config.yaml - environment: - - GROUP_ID=data_analysis - -networks: - hamstring: - external: true diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 39ef88d9..a37d0f19 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -57,13 +57,26 @@ services: clickhouse-server: condition: service_healthy zeek1: - extends: - file: "docker-compose/base/docker-compose.zeek.yml" - service: zeek + image: ghcr.io/hamstring-ndr/hamstring-zeek:1.0.0 environment: - CONTAINER_NAME=zeek1 volumes: - ../data/test_pcaps/:/opt/static_files + - ../../../config.yaml:/opt/config.yaml + cap_add: + - NET_ADMIN + depends_on: + kafka1: + condition: service_healthy + kafka2: + condition: service_healthy + kafka3: + condition: service_healthy + clickhouse-server: + condition: service_healthy + grafana: + condition: service_healthy + network_mode: host monitoring_agent-dev: extends: @@ -227,6 +240,43 @@ services: condition: service_healthy profiles: ["prod"] + alerter-dev: + extends: + file: "docker-compose/dev/docker-compose.pipeline.yml" + service: alerter + depends_on: + kafka1: + condition: service_healthy + kafka2: + condition: service_healthy + kafka3: + condition: service_healthy + profiles: ["dev"] + + alerter: + extends: + file: "docker-compose/prod/docker-compose.pipeline.yml" + service: alerter + depends_on: + kafka1: + condition: service_healthy + kafka2: + condition: service_healthy + kafka3: + condition: service_healthy + profiles: ["prod"] + + zeek-1: + image: ghcr.io/hamstring-ndr/hamstring-zeek:1.0.0 + cap_add: + - NET_ADMIN + network_mode: host + environment: + - CONTAINER_NAME=zeek-1 + volumes: + - ./config.yaml:/opt/config.yaml + - ./data/test_pcaps/:/opt/static_files + networks: hamstring: driver: bridge diff --git a/docker/docker-compose/base/docker-compose.zeek.yml b/docker/docker-compose/base/docker-compose.zeek.yml deleted file mode 100644 
index 24a67439..00000000 --- a/docker/docker-compose/base/docker-compose.zeek.yml +++ /dev/null @@ -1,22 +0,0 @@ -services: - zeek: - build: - context: ../../../ - dockerfile: docker/dockerfiles/Dockerfile.zeek - cap_add: - - NET_ADMIN - depends_on: - kafka1: - condition: service_healthy - kafka2: - condition: service_healthy - kafka3: - condition: service_healthy - clickhouse-server: - condition: service_healthy - grafana: - condition: service_healthy - network_mode: host - volumes: - - ../../../src:/opt/src - - ../../../config.yaml:/opt/config.yaml diff --git a/docker/docker-compose/dev/docker-compose.pipeline.yml b/docker/docker-compose/dev/docker-compose.pipeline.yml index d9c0506b..a4a14624 100644 --- a/docker/docker-compose/dev/docker-compose.pipeline.yml +++ b/docker/docker-compose/dev/docker-compose.pipeline.yml @@ -73,3 +73,19 @@ services: replicas: 1 environment: - GROUP_ID=data_analysis + + alerter: + build: + context: ../../.. + dockerfile: docker/dockerfiles/Dockerfile.alerter + restart: "unless-stopped" + volumes: + - ../../../config.yaml:/app/config.yaml + - /opt/logs:/opt/logs + networks: + hamstring: + deploy: + mode: "replicated" + replicas: 1 + environment: + - GROUP_ID=data_alerting diff --git a/docker/docker-compose/prod/docker-compose.pipeline.yml b/docker/docker-compose/prod/docker-compose.pipeline.yml index c1c4afa0..1b48edbf 100644 --- a/docker/docker-compose/prod/docker-compose.pipeline.yml +++ b/docker/docker-compose/prod/docker-compose.pipeline.yml @@ -126,3 +126,22 @@ services: # capabilities: [ gpu ] environment: - GROUP_ID=data_analysis + + alerter: + # build: + # context: ../../.. + # dockerfile: docker/dockerfiles/Dockerfile.alerter + # network: host + image: stefan96/hamstring-alerter:v1.0.0 + restart: "unless-stopped" + volumes: + - ../../../config.yaml:/app/config.yaml + - /opt/logs:/opt/logs + networks: + hamstring: + # platform: linux/x86_64 + deploy: + mode: "replicated" + replicas: 1 + environment: + - GROUP_ID=data_alerting diff --git a/docker/dockerfiles/Dockerfile.alerter b/docker/dockerfiles/Dockerfile.alerter new file mode 100644 index 00000000..9338c7e5 --- /dev/null +++ b/docker/dockerfiles/Dockerfile.alerter @@ -0,0 +1,22 @@ +# Build a virtualenv using the appropriate Debian release +# * Install python3-venv for the built-in Python3 venv module (not installed by default) +# * Install gcc, g++, build-essential and libpython3-dev to compile C Python modules +# * In the virtualenv: Update pip, setuptools, and wheel to support building new packages +FROM debian:12-slim AS build +RUN apt-get update && \ + apt-get install --no-install-suggests --no-install-recommends --yes python3-venv gcc g++ build-essential libpython3-dev && \ + python3 -m venv /venv && \ + /venv/bin/pip install --upgrade pip setuptools wheel + +# Build the virtualenv as a separate step: Only re-execute this step when requirements.txt changes +FROM build AS build-venv +COPY requirements/requirements.alerter.txt /requirements.alerter.txt +RUN /venv/bin/pip install --disable-pip-version-check -r /requirements.alerter.txt + +# Copy the virtualenv into a distroless image +FROM gcr.io/distroless/python3-debian12 +COPY --from=build-venv /venv /venv +COPY src/base /app/src/base +COPY src/alerter /app/src/alerter +WORKDIR /app +ENTRYPOINT ["/venv/bin/python3", "src/alerter/alerter.py"] diff --git a/docker/dockerfiles/Dockerfile.detector b/docker/dockerfiles/Dockerfile.detector index a715f411..65c8b84b 100644 --- a/docker/dockerfiles/Dockerfile.detector +++ b/docker/dockerfiles/Dockerfile.detector @@ -4,7 +4,7 @@ #
* In the virtualenv: Update pip setuputils and wheel to support building new packages FROM debian:12-slim AS build RUN apt-get update && \ - apt-get install --no-install-suggests --no-install-recommends --yes python3-venv gcc libpython3-dev && \ + apt-get install --no-install-suggests --no-install-recommends --yes python3-venv gcc g++ build-essential libpython3-dev && \ python3 -m venv /venv && \ /venv/bin/pip install --upgrade pip setuptools wheel diff --git a/docker/dockerfiles/Dockerfile.zeek b/docker/dockerfiles/Dockerfile.zeek deleted file mode 100644 index e6dd5f93..00000000 --- a/docker/dockerfiles/Dockerfile.zeek +++ /dev/null @@ -1,29 +0,0 @@ -FROM zeek/zeek:8.0 - -RUN apt update -y && apt upgrade -y -RUN apt install -y \ - build-essential \ - cmake \ - librdkafka-dev \ - libssl-dev \ - libpcap-dev \ - vim \ - iproute2 \ - python3-pip - -# install the zeek kafka plugin -RUN yes | zkg install zeek-kafka --user-var LIBRDKAFKA_ROOT=/usr/local -RUN setcap cap_net_raw,cap_net_admin=+eip $(which zeek) - -RUN rm /usr/lib/python3.13/EXTERNALLY-MANAGED -COPY requirements/requirements.zeek.txt /opt/requirements.txt -RUN pip3 install -r /opt/requirements.txt - -RUN chown -R root:root /usr/local/zeek -RUN mkdir /opt/logs -WORKDIR /opt/logs - -RUN mkdir "/opt/static_files" -ENV STATIC_FILES_DIR="/opt/static_files" - -CMD ["bash", "-c", "cd /opt/ && python3 /opt/src/zeek/zeek_handler.py -c /opt/config.yaml"] diff --git a/docker/init_datatests.sh b/docker/init_datatests.sh deleted file mode 100755 index 783afdbb..00000000 --- a/docker/init_datatests.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -echo "Creating tables for normal operation..." - -for script in /create_tables/*.sql; do - echo "Executing $script..." - clickhouse-client --host=127.0.0.1 --query="$(cat $script)" -done - -echo "Initializing datatest tables..." - -for script in /create_datatest_tables/*.sql; do - echo "Executing $script..." - clickhouse-client --host=127.0.0.1 --query="$(cat $script)" -done - -echo "Inserting data..." - -for script in /insert_datatest_data/*.sql; do - echo "Executing $script..." - clickhouse-client --host=127.0.0.1 --query="$(cat $script)" -done - -echo "Initialization complete!" diff --git a/docs/DOCKER_TO_NIX.md b/docs/DOCKER_TO_NIX.md deleted file mode 100644 index cc8fb60a..00000000 --- a/docs/DOCKER_TO_NIX.md +++ /dev/null @@ -1,162 +0,0 @@ -# Docker to NixOS Migration Guide - -## Overview - -HAMSTRING now uses **Nix** instead of Docker for reproducible,cross-platform builds. OCI images are generated from Nix for Docker compatibility. - -## What Changed - -### Before (Docker) -```bash -docker build -f docker/dockerfiles/Dockerfile.logserver -t hamstring/logserver . 
-docker run hamstring/logserver -``` - -### After (Nix) -```bash -nix build .#oci-logserver -docker load < result -docker run hamstring/logserver:latest -``` - -## Available OCI Images - -| Image | Nix Package | Description | -|-------|-------------|-------------| -| `hamstring/logserver` | `.#oci-logserver` | C++ LogServer | -| `hamstring/logcollector` | `.#oci-logcollector` | C++ LogCollector | -| `hamstring/prefilter` | `.#oci-prefilter` | C++ Prefilter | -| `hamstring/inspector` | `.#oci-inspector` | C++ ML Inspector | -| `hamstring/zeek` | `.#oci-zeek` | Zeek network capture | - -## Build OCI Images - -### Single Image -```bash -# Build -nix build .#oci-inspector - -# Load into Docker -docker load < result - -# Run -docker run -v $(pwd)/config.yaml:/config.yaml \ - hamstring/inspector:latest -``` - -### All Images -```bash -# Build all OCI images -nix build .#oci-images - -# Load all at once -for img in result-*; do - docker load < $img -done - -# List images -docker images | grep hamstring -``` - -## Cross-Platform Build - -```bash -# Build for Linux (from macOS) -nix build .#oci-logserver --system x86_64-linux - -# Build for ARM -nix build .#oci-logserver --system aarch64-linux - -# Build for all platforms -nix build .#oci-logserver \ - --system x86_64-linux \ - --system aarch64-darwin -``` - -## Docker Compose Equivalent - -### Old docker-compose.yml -```yaml -services: - logserver: - build: - context: . - dockerfile: docker/dockerfiles/Dockerfile.logserver - command: ./logserver config.yaml -``` - -### New (Nix-built images) -```yaml -services: - logserver: - image: hamstring/logserver:latest - volumes: - - ./config.yaml:/config.yaml -``` - -## Testing - -```bash -# Start Kafka -docker run -d --name kafka apache/kafka:latest - -# Run pipeline modules -docker run --link kafka -v $(pwd)/config.yaml:/config.yaml \ - hamstring/logserver:latest - -docker run --link kafka -v $(pwd)/config.yaml:/config.yaml \ - hamstring/inspector:latest -``` - -## Benefits - -✅ **Reproducible**: Same build everywhere -✅ **Fast**: Nix caching -✅ **Cross-platform**: Build for any architecture -✅ **Pure**: Hermetic builds -✅ **Small**: Minimal images (no layers waste) - -## Migration Checklist - -- [x] Convert C++ modules to Nix packages -- [x] Create OCI image builders -- [x] Add Zeek image -- [x] Document migration -- [ ] Update CI/CD to use Nix -- [ ] Archive old Dockerfiles - -## Advanced - -### Custom OCI Image -```nix -# In flake.nix -oci-custom = buildOciImage { - name = "mymodule"; - package = self.packages.${system}.mymodule; -}; -``` - -### Push to Registry -```bash -# Build -nix build .#oci-inspector - -# Push to Docker Hub -docker load < result -docker tag hamstring/inspector:latest myuser/inspector:v1.0 -docker push myuser/inspector:v1.0 - -# Or use skopeo (no Docker daemon needed) -skopeo copy oci-archive:result docker://myuser/inspector:v1.0 -``` - -## Summary - -**Nix provides:** -- 🔄 Bit-for-bit reproducible builds -- 🌍 True cross-platform compilation -- ⚡ Faster builds with caching -- 🐳 Docker-compatible OCI images -- 🔒 Hermetic, auditable builds - -**Perfect for CI/CD and production deployments!** diff --git a/docs/NIX_DEPLOYMENT.md b/docs/NIX_DEPLOYMENT.md deleted file mode 100644 index 8c77115c..00000000 --- a/docs/NIX_DEPLOYMENT.md +++ /dev/null @@ -1,312 +0,0 @@ -# HAMSTRING - NixOS Deployment Guide - -## Overview - -HAMSTRING uses Nix flakes for reproducible builds and cross-platform deployment instead of Docker. 
- -## Quick Start - -### Development Environment - -```bash -# Enter development shell -nix develop - -# Build all modules -cmake -B build -S cpp -cmake --build build -j -``` - -### Build Individual Modules - -```bash -# Build specific module -nix build .#logserver -nix build .#logcollector -nix build .#prefilter -nix build .#inspector - -# Build all modules -nix build - -# Run directly -nix run .#logserver -- config.yaml -``` - -### Cross-Platform Build - -```bash -# Build for Linux (from macOS) -nix build .#logserver --system x86_64-linux - -# Build for macOS ARM -nix build .#logserver --system aarch64-darwin - -# Build for all platforms -nix build .#logserver --system x86_64-linux --system aarch64-darwin -``` - -## NixOS Deployment - -### Configuration - -Add to your `configuration.nix`: - -```nix -{ - inputs.hamstring.url = "github:yourusername/hamstring"; - - # ... in your configuration: - - imports = [ - hamstring.nixosModules.default - ]; - - services.hamstring = { - enable = true; - configFile = ./config.yaml; - - kafkaBrokers = [ "localhost:19092" "localhost:19093" ]; - - modules = { - logserver.enable = true; - logcollector.enable = true; - prefilter.enable = true; - inspector.enable = true; - }; - }; - - # Kafka service (example) - services.apache-kafka = { - enable = true; - # ... kafka config - }; -} -``` - -### Deploy - -```bash -# Rebuild NixOS configuration -sudo nixos-rebuild switch - -# Check service status -systemctl status hamstring-logserver -systemctl status hamstring-logcollector -systemctl status hamstring-prefilter -systemctl status hamstring-inspector - -# View logs -journalctl -u hamstring-inspector -f -``` - -## Development Workflow - -### Enter Dev Shell - -```bash -nix develop -``` - -Provides: -- C++20 compiler (clang/gcc) -- CMake, pkg-config -- All dependencies (Boost, spdlog, Kafka, etc.) 
-- Development tools (clang-tools, gdb, valgrind) -- Python with Zeek handler dependencies -- Zeek for network capture - -### Build & Test - -```bash -# Inside nix develop -cmake -B build -S cpp \ - -DCMAKE_BUILD_TYPE=Debug \ - -DCMAKE_EXPORT_COMPILE_COMMANDS=ON - -cmake --build build -j - -# Run tests -cd build && ctest --output-on-failure - -# Run module -./build/src/inspector/inspector ../config.yaml -``` - -### Format Code - -```bash -# Auto-provided in dev shell -clang-format -i cpp/src/**/*.{cpp,hpp} -``` - -## Benefits Over Docker - -✅ **Reproducible Builds** -- Exact dependency versions -- Same build on any platform -- No "works on my machine" - -✅ **Cross-Platform** -- Build for Linux from macOS -- Build for ARM from x86 -- Single flake.nix for all platforms - -✅ **Fast Iteration** -- Nix cache reuses builds -- Incremental compilation -- No image rebuild delays - -✅ **Native Performance** -- No containerization overhead -- Direct system calls -- Full CPU access - -✅ **Development Environment** -- Instant dev shell -- All tools included -- Consistent across team - -## CI/CD Integration - -### GitHub Actions - -```yaml -name: Build - -on: [push, pull_request] - -jobs: - build: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: cachix/install-nix-action@v22 - - uses: cachix/cachix-action@v12 - with: - name: hamstring - - - name: Build all modules - run: nix build .#default - - - name: Run tests - run: nix develop -c ctest --test-dir build -``` - -### Binary Cache - -```bash -# Setup Cachix for faster builds -cachix use hamstring - -# Build and push -nix build -cachix push hamstring ./result -``` - -## Migrating from Docker - -### Docker Compose → Nix - -**Before (docker-compose.yml):** -```yaml -services: - logserver: - build: ./cpp - command: ./logserver config.yaml -``` - -**After (flake.nix):** -```nix -packages.logserver = buildModule { name = "logserver"; }; -``` - -### Running Services - -**Before:** -```bash -docker-compose up -d logserver -``` - -**After:** -```bash -# Development -nix run .#logserver -- config.yaml - -# Production (NixOS) -systemctl start hamstring-logserver -``` - -## Troubleshooting - -### Build Fails - -```bash -# Clean build -nix build .#logserver --rebuild - -# Verbose output -nix build .#logserver -L - -# Show build logs -nix log .#logserver -``` - -### Missing Dependencies - -```bash -# Update flake inputs -nix flake update - -# Check what's available -nix flake show -``` - -### Platform-Specific Issues - -```bash -# Force specific platform -nix build .#logserver \ - --system x86_64-linux \ - --option sandbox false -``` - -## Advanced Usage - -### Custom Build Options - -```nix -# In flake.nix -buildModule = { name, cmakeFlags ? 
[] }: - pkgs.stdenv.mkDerivation { - cmakeFlags = [ - "-DCMAKE_BUILD_TYPE=Release" - "-DENABLE_ASAN=ON" # AddressSanitizer - ] ++ cmakeFlags; - }; -``` - -### Development with Different Compilers - -```bash -# Use GCC -nix develop --override-input nixpkgs github:NixOS/nixpkgs/gcc-latest - -# Use Clang -nix develop --override-input nixpkgs github:NixOS/nixpkgs/llvm-latest -``` - -## Summary - -**Nix provides:** -- 🔄 Reproducible builds across platforms -- ⚡ Fast development iteration -- 📦 Declarative deployment -- 🔒 Hermetic build environment -- 🚀 Native performance (no containers) - -**Perfect for:** -- Cross-platform C++ development -- CI/CD pipelines -- Production NixOS deployments -- Team development consistency diff --git a/docs/ZEEK_INTEGRATION.md b/docs/ZEEK_INTEGRATION.md deleted file mode 100644 index 0e40059f..00000000 --- a/docs/ZEEK_INTEGRATION.md +++ /dev/null @@ -1,359 +0,0 @@ -# Zeek Network Capture Integration - -## Overview - -HAMSTRING uses **Zeek** (formerly Bro) for live network traffic capture and DNS analysis. Zeek captures packets from network interfaces, parses DNS protocol data, and forwards it to Kafka for processing by the C++ pipeline. - -## Architecture - -``` -Network Interface → Zeek → Kafka → C++ LogServer → C++ Pipeline - ↓ - (DNS packets) - ↓ - (Parsed JSON) - ↓ - (Kafka Topic: pipeline-logserver_in-dns) - ↓ - LogServer → LogCollector → Prefilter → Inspector → Detector -``` - -## Prerequisites - -### Install Zeek - -**macOS (Homebrew):** -```bash -brew install zeek - -# Verify installation path -ls -la /opt/homebrew/opt/zeek/share/zeek/site/ # Apple Silicon -ls -la /usr/local/opt/zeek/share/zeek/site/ # Intel Mac -``` - -**Ubuntu/Debian:** -```bash -sudo apt-get install zeek - -# Default path: /usr/local/zeek/share/zeek/site/ -``` - -**Verify Installation:** -```bash -zeek --version -# Should show: zeek version X.X.X - -# Python handler auto-detects installation path -python -m src.zeek.zeek_handler --help -``` - -### Install Zeek Kafka Plugin - -```bash -# Clone and build -git clone https://github.com/apache/metron-bro-plugin-kafka -cd metron-bro-plugin-kafka -./configure --with-zeek=/usr/local/zeek -make -make install -``` - -## Running Zeek with C++ Pipeline - -### Quick Start - -**Local Execution (macOS/Linux):** -```bash -# 1. Start Kafka (required) -docker-compose up -d kafka1 kafka2 kafka3 - -# 2. Start Zeek (auto-detects first sensor in config) -python -m src.zeek.zeek_handler -c config.yaml - -# 3. Start C++ pipeline in separate terminals -./start-pipeline.sh config.yaml -``` - -**Docker Execution:** - -```bash -# 1. Start Kafka (required) -docker-compose up -d kafka1 kafka2 kafka3 - -# 2. 
Start complete pipeline (Zeek + C++ modules) -./start-pipeline.sh config.yaml -``` - -### Manual Start (Step by Step) - -```bash -# Terminal 1: Start Zeek -python -m src.zeek.zeek_handler -c config.yaml - -# Terminal 2-5: Start C++ modules -./cpp/build/src/logserver/logserver config.yaml -./cpp/build/src/logcollector/logcollector config.yaml -./cpp/build/src/prefilter/prefilter config.yaml -./cpp/build/src/inspector/inspector config.yaml -``` - -## Configuration - -### Zeek Configuration - -**Auto-Detection:** The Python handler automatically detects Zeek installation: -- macOS Apple Silicon: `/opt/homebrew/opt/zeek/share/zeek/site/` -- macOS Intel: `/usr/local/opt/zeek/share/zeek/site/` -- Linux: `/usr/local/zeek/share/zeek/site/` - -**Manual Override:** -```bash -python -m src.zeek.zeek_handler -c config.yaml \ - --zeek-config-location /custom/path/to/local.zeek -``` - -**Configuration File (`local.zeek`):** - -```zeek -@load policy/tuning/json-logs.zeek -@load Apache/Kafka - -# Kafka broker configuration -redef Kafka::kafka_conf = table( - ["metadata.broker.list"] = "localhost:19092,localhost:19093,localhost:19094" -); - -# Topic configuration -redef Kafka::topic_name = "pipeline-logserver_in"; -redef Kafka::tag_json = T; - -# Which logs to send to Kafka -redef Kafka::logs_to_send = set(DNS::LOG); -``` - -### Pipeline Configuration (`config.yaml`) - -```yaml -pipeline: - zeek: - sensors: - - name: "sensor1" - interface: "en0" # Network interface - protocols: - - dns - - http - static_analysis: false # Set to true for PCAP analysis - -environment: - kafka_topics_prefix: - pipeline: - logserver_in: "pipeline-logserver_in" -``` - -## Capture Modes - -### 1. Live Network Capture (Default) - -Captures real-time traffic from network interface: - -```bash -python -m src.zeek.zeek_handler -c config.yaml -``` - -**Configuration:** -```yaml -pipeline: - zeek: - sensors: - - interface: "eth0" # or en0 on macOS - static_analysis: false -``` - -### 2. Static PCAP Analysis - -Analyzes pre-recorded PCAP files: - -```bash -# Set environment variable -export STATIC_FILES_DIR=/path/to/pcaps - -python -m src.zeek.zeek_handler -c config.yaml -``` - -**Configuration:** -```yaml -pipeline: - zeek: - static_analysis: true -``` - -Place `.pcap` files in `$STATIC_FILES_DIR/`. - -## Data Flow - -### Zeek Output Format - -Zeek sends DNS logs to Kafka in JSON: - -```json -{ - "ts": 1700925045.123, - "uid": "CHhAvVGS1DHFjwGM9", - "id.orig_h": "192.168.1.100", - "id.orig_p": 52134, - "id.resp_h": "8.8.8.8", - "id.resp_p": 53, - "query": "example.com", - "qtype": 1, - "qtype_name": "A", - "rcode": 0, - "rcode_name": "NOERROR", - "AA": false, - "TC": false, - "RD": true, - "RA": true, - "Z": 0, - "answers": ["93.184.216.34"], - "TTLs": [3600.0] -} -``` - -### Pipeline Processing - -1. **Zeek** → Captures & parses DNS -2. **Kafka** → Buffers messages -3. **LogServer** → Forwards to collectors -4. **LogCollector** → Validates & batches by subnet -5. **Prefilter** → Filters irrelevant traffic -6. **Inspector** → ML anomaly detection -7. 
**Detector** → DGA classification - -## Monitoring - -### Check Zeek Status - -```bash -# If using zeekctl -zeekctl status - -# Check Zeek logs -tail -f /usr/local/zeek/logs/current/dns.log -``` - -### Monitor Kafka Topics - -```bash -# Zeek output -docker exec kafka1 kafka-console-consumer \ - --bootstrap-server localhost:19092 \ - --topic pipeline-logserver_in-dns \ - --from-beginning - -# LogCollector batches -docker exec kafka1 kafka-console-consumer \ - --bootstrap-server localhost:19092 \ - --topic pipeline-logserver_to_collector-dns -``` - -### Pipeline Metrics - -```bash -# Watch Inspector detections (ML anomalies) -docker exec kafka1 kafka-console-consumer \ - --bootstrap-server localhost:19092 \ - --topic pipeline-prefilter_to_inspector-dga_inspector -``` - -## Testing - -### Generate Test Traffic - -```bash -# DNS queries -while true; do - dig @8.8.8.8 google.com - dig @8.8.8.8 facebook.com - dig @8.8.8.8 randomdomain$(date +%s).xyz # Simulates DGA - sleep 1 -done -``` - -### Verify Pipeline - -```bash -# 1. Check Zeek is capturing -docker exec kafka1 kafka-console-consumer \ - --bootstrap-server localhost:19092 \ - --topic pipeline-logserver_in-dns \ - --max-messages 1 - -# Should show JSON DNS records - -# 2. Check C++ pipeline processing -tail -f cpp/build/src/inspector/inspector.log - -# Should show anomaly detection results -``` - -## Performance - -| Component | Throughput | Latency | -|-----------|------------|---------| -| Zeek Capture | ~100K pkts/s | <1ms | -| Kafka Buffer | ~1M msg/s | <5ms | -| C++ Pipeline | ~10K msg/s | ~10ms | -| **Total** | **~10K DNS/s** | **~15ms** | - -## Troubleshooting - -### Zeek Not Capturing - -```bash -# Check interface name -ifconfig - -# Check permissions (may need sudo) -sudo python -m src.zeek.zeek_handler -c config.yaml - -# Check Zeek configuration -zeek --parse-only zeek/local.zeek -``` - -### Kafka Connection Issues - -```bash -# Verify Kafka brokers -docker exec kafka1 kafka-broker-api-versions \ - --bootstrap-server localhost:19092 - -# Check topic exists -docker exec kafka1 kafka-topics \ - --bootstrap-server localhost:19092 \ - --list | grep logserver_in -``` - -### No Data in Pipeline - -```bash -# 1. Verify Zeek is sending to Kafka -docker exec kafka1 kafka-console-consumer \ - --bootstrap-server localhost:19092 \ - --topic pipeline-logserver_in-dns \ - --max-messages 5 - -# 2. Check LogServer is consuming -tail -f cpp/build/src/logserver/logserver.log - -# 3. Verify topic configuration matches -grep "logserver_in" config.yaml -``` - -## Summary - -✅ **Zeek captures live DNS traffic** -✅ **Sends JSON to Kafka via plugin** -✅ **C++ pipeline processes in real-time** -✅ **ML anomaly detection operational** -✅ **~10K DNS queries/second throughput** - -**Complete integration working with zero C++ code changes!** diff --git a/docs/configuration.rst b/docs/configuration.rst index f3922c96..259bed01 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -253,8 +253,37 @@ To entirely skip the anomaly detection phase, you can set ``inspector_module_nam * - threshold - ``0.5`` - Threshold for the detector's classification. + * - produce_topics + - ``(empty)`` + - (Optional) Comma-separated list of topic suffixes to produce alerts to. If left empty, defaults to the ``generic`` topic. +``pipeline.alerting`` +^^^^^^^^^^^^^^^^^^^^^^ + +.. 
list-table:: ``alerting`` Parameters + :header-rows: 1 + :widths: 30 20 50 + + * - Parameter + - Default Value + - Description + * - log_to_file + - ``true`` + - Boolean flag to enable/disable logging of alerts to a local file. + * - log_to_kafka + - ``true`` + - Boolean flag to enable/disable forwarding of alerts to an external Kafka topic. + * - log_file_path + - ``"/opt/logs/alerts.txt"`` + - Local file path where alerts will be appended if ``log_to_file`` is enabled. + * - external_kafka_topic + - ``"hamstring_alerts"`` + - Name of the external Kafka topic where alerts will be sent if ``log_to_kafka`` is enabled. + * - plugins + - ``[]`` + - List of custom alerter plugins to execute. Each plugin must specify ``name``, ``alerter_module_name``, and ``alerter_class_name``. + ``pipeline.zeek`` ^^^^^^^^^^^^^^^^^ diff --git a/docs/pipeline.rst b/docs/pipeline.rst index 9f6c6a4e..5ab59c20 100644 --- a/docs/pipeline.rst +++ b/docs/pipeline.rst @@ -752,10 +752,74 @@ You may use the provided, pre-trained models or supply your own. To use a custom - `inspector_name`: name of the inspector configuration for input - `detector_module_name`: name of the python module the implementation details reside - `detector_class_name`: name of the class in the python module to load the detector implementation details +- `produce_topics`: (Optional) Comma-separated list of topic suffixes to produce alerts to. If left empty, alerts are sent to the ``generic`` alerter topic. These parameters are loaded at startup and used to download, verify, and load the model/scaler if not already cached locally (in temp directory). +Stage 7: Alerter +================ +.. _alerter_stage: + +Overview +-------- + +The **Alerter** stage is the final stage of the HAMSTRING pipeline. It serves as a central hub for all alerts generated by various detectors. Its primary role is to consolidate these alerts and perform configurable actions, such as logging to a local file or forwarding them to an external Kafka topic for further processing (e.g., SIEM integration). + +The Alerter stage is designed to be highly flexible, allowing for custom "Alerter Plugins" to perform arbitrary workloads on the alert data before the final logging/forwarding actions are executed. + +Core Functionality +------------------ + +1. **Alert Consumption**: The Alerter stage runs multiple instances of alerters, each listening on a specific Kafka topic. +2. **Topic Naming**: By convention, the topics listened to by the Alerter stage follow the pattern: + ``[detector_to_alerter_prefix]-[suffix]`` + where the prefix is defined in the environment configuration and the suffix is either ``generic`` or a specific name defined in a detector's ``produce_topics`` configuration. +3. **Generic Topic**: Detectors that do not specify a particular ``produce_topics`` will automatically route their alerts to the ``generic`` topic. A dedicated ``GenericAlerter`` instance consumes from this topic. +4. **Plugin Execution**: For each alert received, the Alerter first passes the data through any configured plugins. Plugins can mutate the alert payload (e.g., adding metadata). +5. **Base Actions**: After plugin processing, the Alerter performs the "Base Actions" as configured: + - **Log to File**: Appends the alert JSON to a local log file. + - **Log to Kafka**: Forwards the alert to an external Kafka topic. + +Main Classes +------------ + +.. py:currentmodule:: src.alerter.alerter +.. autoclass:: AlerterBase + +.. py:currentmodule:: src.alerter.alerter +.. 
autoclass:: GenericAlerter
+
+The :class:`AlerterBase` provides the foundation for all alerter instances, handling the base logging and Kafka forwarding logic. Custom plugins need not inherit from it directly; they are loaded dynamically by the framework.
+
+Usage and Configuration
+-----------------------
+
+The Alerter stage is configured globally in the ``pipeline.alerting`` section of ``config.yaml``.
+
+Base Configuration
+..................
+
+- ``log_to_file``: boolean, enables logging to a local file.
+- ``log_to_kafka``: boolean, enables forwarding to an external Kafka topic.
+- ``log_file_path``: path to the local log file (e.g., ``/opt/logs/alerts.txt``).
+- ``external_kafka_topic``: the name of the Kafka topic to forward alerts to (e.g., ``hamstring_alerts``).
+
+Plugins
+.......
+
+Custom alerter plugins can be defined in the ``plugins`` list. Each plugin entry requires:
+
+- ``name``: unique name for the plugin instance.
+- ``alerter_module_name``: name of the Python module in ``src/alerter/plugins/``.
+- ``alerter_class_name``: name of the class to instantiate.
+
+Example Plugin: HelloWorld Alerter
+..................................
+
+The ``HelloWorldAlerter`` is a sample plugin that appends a simple "hello_world" field to every alert payload before it is logged or forwarded; a minimal sketch of such a plugin is given at the end of this section.
+
+
 Supported Detectors Overview
 ----------------------------
 
@@ -770,3 +834,10 @@ DGA Detector
 ...................
 The :class:`DGADetector` consumes anomalous batches of requests, preprocessed by the StreamAD library. It calculates a probability score for each request to determine whether a DGA DNS entry was queried.
+
+Domainator Detector
+...................
+The :class:`DomainatorDetector` consumes anomalous batches of requests.
+It identifies potential data exfiltration and command & control on the subdomain level by analyzing characteristics of the subdomains.
+Messages are grouped by domain into fixed-size windows to allow for sequential anomaly detection. The detector applies machine learning to statistical and linguistic features of the domain name,
+including label lengths, character frequencies, entropy measures, and counts of different character types across domain name levels.
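+
+The grouping scheme can be summarized in a short sketch, condensed from the
+``_strip_domain`` and ``detect`` logic in ``src/detector/plugins/domainator_detector.py``
+(the window bounds 3 and 10 are the values used there; ``predict`` stands in for the
+model inference call):
+
+.. code-block:: python
+
+   from collections import defaultdict
+
+   WINDOW_MIN, WINDOW_MAX = 3, 10
+
+   def strip_domain(query: str) -> str:
+       # Group by the registrable domain (second-to-last label).
+       labels = query.strip(".").split(".")
+       return labels[-2] if len(labels) >= 2 else ""
+
+   queues = defaultdict(list)
+
+   def feed(message: dict, predict) -> None:
+       domain = strip_domain(message["domain_name"])
+       queues[domain].append(message)
+       if len(queues[domain]) >= WINDOW_MIN:
+           predict(queues[domain])    # score the current window
+       if len(queues[domain]) >= WINDOW_MAX:
+           del queues[domain][0]      # slide the window forward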
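+
+Returning to the plugin mechanism described above, a minimal plugin is little more
+than a subclass of :class:`AlerterBase` that overrides ``process_alert``. The sketch
+below is illustrative only (the module path and field value are assumptions, modeled
+on the ``HelloWorldAlerter`` sample); it would be registered in
+``pipeline.alerting.plugins`` with its ``name``, ``alerter_module_name``, and
+``alerter_class_name``:
+
+.. code-block:: python
+
+   # src/alerter/plugins/hello_world_alerter.py (illustrative sketch)
+   from src.alerter.alerter import AlerterBase
+
+   class HelloWorldAlerter(AlerterBase):
+       def process_alert(self):
+           # Mutate the consumed payload before the base actions
+           # (file logging / Kafka forwarding) are executed.
+           self.alert_data["hello_world"] = "hello_world"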
\ No newline at end of file diff --git a/flake.lock b/flake.lock deleted file mode 100644 index 128df5be..00000000 --- a/flake.lock +++ /dev/null @@ -1,61 +0,0 @@ -{ - "nodes": { - "flake-utils": { - "inputs": { - "systems": "systems" - }, - "locked": { - "lastModified": 1731533236, - "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", - "owner": "numtide", - "repo": "flake-utils", - "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", - "type": "github" - }, - "original": { - "owner": "numtide", - "repo": "flake-utils", - "type": "github" - } - }, - "nixpkgs": { - "locked": { - "lastModified": 1763966396, - "narHash": "sha256-6eeL1YPcY1MV3DDStIDIdy/zZCDKgHdkCmsrLJFiZf0=", - "owner": "NixOS", - "repo": "nixpkgs", - "rev": "5ae3b07d8d6527c42f17c876e404993199144b6a", - "type": "github" - }, - "original": { - "owner": "NixOS", - "ref": "nixos-unstable", - "repo": "nixpkgs", - "type": "github" - } - }, - "root": { - "inputs": { - "flake-utils": "flake-utils", - "nixpkgs": "nixpkgs" - } - }, - "systems": { - "locked": { - "lastModified": 1681028828, - "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", - "owner": "nix-systems", - "repo": "default", - "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", - "type": "github" - }, - "original": { - "owner": "nix-systems", - "repo": "default", - "type": "github" - } - } - }, - "root": "root", - "version": 7 -} diff --git a/flake.nix b/flake.nix deleted file mode 100644 index f106fafd..00000000 --- a/flake.nix +++ /dev/null @@ -1,421 +0,0 @@ -{ - description = "HAMSTRING - High-performance DGA detection pipeline"; - - inputs = { - nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; - flake-utils.url = "github:numtide/flake-utils"; - }; - - outputs = { self, nixpkgs, flake-utils }: - flake-utils.lib.eachDefaultSystem (system: - let - pkgs = nixpkgs.legacyPackages.${system}; - - # CityHash library (required by ClickHouse) - cityhash = pkgs.stdenv.mkDerivation { - pname = "cityhash"; - version = "1.1.1"; - - src = pkgs.fetchFromGitHub { - owner = "google"; - repo = "cityhash"; - rev = "8af9b8c2b889d80c22d6bc26ba0df1afb79a30db"; # v1.1.1 - sha256 = "sha256-Ji6o9N8Nd6BrFO6jEhLjk+qlN8QNdTVZoM6j3Yqb2lg="; - }; - - nativeBuildInputs = with pkgs; [ cmake ]; - - # CityHash uses old-style build - configurePhase = '' - ./configure --prefix=$out --enable-sse4.2 - ''; - - preConfigure = '' - # Generate configure script if not present - [ -f configure ] || { - echo "Creating configure script..." - autoreconf -fvi || true - } - ''; - }; - - # ClickHouse C++ client library (built from source) - clickhouse-cpp = pkgs.stdenv.mkDerivation { - pname = "clickhouse-cpp"; - version = "2.6.0"; - - src = pkgs.fetchFromGitHub { - owner = "ClickHouse"; - repo = "clickhouse-cpp"; - rev = "v2.6.0"; - sha256 = "sha256-7QPXm0ij/ImFILCGnv1bqfe6nbMt8XCgjofEi5XElNo="; - }; - - nativeBuildInputs = with pkgs; [ cmake ]; - buildInputs = with pkgs; [ openssl zlib lz4 abseil-cpp cityhash ]; - - cmakeFlags = [ - "-DBUILD_SHARED_LIBS=ON" - "-DWITH_OPENSSL=ON" - ]; - }; - - # Common build inputs for all C++ modules - commonBuildInputs = with pkgs; [ - cmake - pkg-config - boost - spdlog - fmt - nlohmann_json - yaml-cpp - openssl - curl - zlib - cyrus_sasl - lz4 - abseil-cpp - cityhash - rdkafka - clickhouse-cpp - # Build tools for cityhash - autoconf - automake - libtool - ]; - - # Build a C++ module - buildModule = { name, src ? ./cpp, extraInputs ? 
[] }: - pkgs.stdenv.mkDerivation { - pname = "hamstring-${name}"; - version = "0.1.0"; - - inherit src; - - nativeBuildInputs = commonBuildInputs ++ extraInputs; - - cmakeFlags = [ - "-DCMAKE_BUILD_TYPE=Release" - "-DBUILD_TESTING=OFF" - ]; - - # Don't use default cmake buildPhase, do it manually - dontUseCmakeBuildDir = true; - - configurePhase = '' - runHook preConfigure - - # Help CMake find clickhouse-cpp and dependencies - export CMAKE_PREFIX_PATH="${clickhouse-cpp}:$CMAKE_PREFIX_PATH" - export PKG_CONFIG_PATH="${clickhouse-cpp}/lib/pkgconfig:$PKG_CONFIG_PATH" - - cmake -B build -S . \ - -DCMAKE_BUILD_TYPE=Release \ - -DBUILD_TESTING=OFF \ - -DCLICKHOUSE_CPP_LIB="${clickhouse-cpp}/lib/libclickhouse-cpp-lib.dylib" \ - -DCLICKHOUSE_INCLUDE_DIR="${clickhouse-cpp}/include" \ - -DZSTD_LIB="${pkgs.zstd.out}/lib/libzstd.dylib" \ - -DCITYHASH_LIB="${cityhash}/lib/libcityhash.dylib" - runHook postConfigure - ''; - - buildPhase = '' - runHook preBuild - cmake --build build --target ${name} -j$NIX_BUILD_CORES - runHook postBuild - ''; - - installPhase = '' - mkdir -p $out/bin - cp build/src/${name}/${name} $out/bin/ - ''; - - meta = with pkgs.lib; { - description = "HAMSTRING ${name} module"; - homepage = "https://github.com/yourusername/hamstring"; - license = licenses.eupl12; - platforms = platforms.unix; - }; - }; - - # Build OCI image (Docker-compatible) for a module - buildOciImage = { name, package, config-file ? null }: - pkgs.dockerTools.buildImage { - name = "hamstring/${name}"; - tag = "latest"; - - copyToRoot = pkgs.buildEnv { - name = "image-root"; - paths = [ package pkgs.coreutils pkgs.bash ]; - pathsToLink = [ "/bin" ]; - }; - - config = { - Cmd = [ "${package}/bin/${name}" "/config.yaml" ]; - Env = [ - "PATH=/bin" - ]; - WorkingDir = "/"; - ExposedPorts = {}; - Labels = { - "org.opencontainers.image.title" = "HAMSTRING ${name}"; - "org.opencontainers.image.description" = "DGA detection pipeline - ${name} module"; - "org.opencontainers.image.source" = "https://github.com/yourusername/hamstring"; - }; - }; - }; - - # Python environment for Zeek and Detector - pythonEnv = pkgs.python3.withPackages (ps: with ps; [ - # From requirements.zeek.txt - pyyaml - click - colorlog - # Note: ONNX runtime would go here when available - ]); - - in - { - packages = { - # C++ module packages - logserver = buildModule { name = "logserver"; }; - logcollector = buildModule { name = "logcollector"; }; - prefilter = buildModule { name = "prefilter"; }; - inspector = buildModule { name = "inspector"; }; - - # Zeek network capture (Python-based) - zeek = pkgs.writeScriptBin "hamstring-zeek" '' - #!${pkgs.bash}/bin/bash - export PYTHONPATH=${./src}:$PYTHONPATH - exec ${pythonEnv}/bin/python -m src.zeek.zeek_handler "$@" - ''; - - # OCI images (Docker-compatible) - oci-logserver = buildOciImage { - name = "logserver"; - package = self.packages.${system}.logserver; - }; - - oci-logcollector = buildOciImage { - name = "logcollector"; - package = self.packages.${system}.logcollector; - }; - - oci-prefilter = buildOciImage { - name = "prefilter"; - package = self.packages.${system}.prefilter; - }; - - oci-inspector = buildOciImage { - name = "inspector"; - package = self.packages.${system}.inspector; - }; - - # Zeek sensor image - oci-zeek = pkgs.dockerTools.buildImage { - name = "hamstring/zeek"; - tag = "latest"; - - copyToRoot = pkgs.buildEnv { - name = "zeek-root"; - paths = with pkgs; [ - zeek - pythonEnv - coreutils - bash - # Copy source files (Python handlers and Zeek configs) - (pkgs.runCommand 
"hamstring-src" {} '' - mkdir -p $out/opt/src - - # Copy Python modules - cp -r ${./src/zeek} $out/opt/src/zeek - cp -r ${./src/base} $out/opt/src/base - '') - ]; - }; - - config = { - Cmd = [ "${pythonEnv}/bin/python" "-m" "src.zeek.zeek_handler" "-c" "/config.yaml" ]; - Env = [ - "PYTHONPATH=/opt" - "PATH=${pkgs.zeek}/bin:${pythonEnv}/bin:/bin" - ]; - WorkingDir = "/opt"; - }; - }; - - # Default package builds all C++ modules (not OCI images) - default = pkgs.buildEnv { - name = "hamstring-pipeline"; - paths = [ - self.packages.${system}.logserver - self.packages.${system}.logcollector - self.packages.${system}.prefilter - self.packages.${system}.inspector - ]; - }; - }; - - # Apps for easy running - apps = { - logserver = { - type = "app"; - program = "${self.packages.${system}.logserver}/bin/logserver"; - }; - logcollector = { - type = "app"; - program = "${self.packages.${system}.logcollector}/bin/logcollector"; - }; - prefilter = { - type = "app"; - program = "${self.packages.${system}.prefilter}/bin/prefilter"; - }; - inspector = { - type = "app"; - program = "${self.packages.${system}.inspector}/bin/inspector"; - }; - zeek = { - type = "app"; - program = "${self.packages.${system}.zeek}/bin/hamstring-zeek"; - }; - }; - - # Development shell - devShells.default = pkgs.mkShell { - buildInputs = commonBuildInputs ++ (with pkgs; [ - # Development tools - clang-tools - gdb - valgrind - - # Python for Zeek handler - pythonEnv - - # Zeek for network capture - zeek - - # Kafka for testing - apacheKafka - - # Docker tools for OCI image testing - skopeo - podman - ]); - - shellHook = '' - echo "🚀 HAMSTRING Development Environment" - echo "======================================" - echo "" - echo "Available commands:" - echo " cmake -B build -S cpp" - echo " cmake --build build -j" - echo " python -m src.zeek.zeek_handler -c config.yaml" - echo "" - echo "Build OCI images:" - echo " nix build .#oci-logserver" - echo " nix build .#oci-inspector" - echo "" - echo "Load images:" - echo " docker load < result" - echo "" - echo "C++ Modules:" - echo " - LogServer" - echo " - LogCollector" - echo " - Prefilter " - echo " - Inspector" - echo "" - ''; - }; - - # NixOS module for deployment - nixosModules.default = { config, lib, pkgs, ... 
}: - with lib; - let - cfg = config.services.hamstring; - in - { - options.services.hamstring = { - enable = mkEnableOption "HAMSTRING DGA detection pipeline"; - - configFile = mkOption { - type = types.path; - description = "Path to config.yaml"; - }; - - kafkaBrokers = mkOption { - type = types.listOf types.str; - default = [ "localhost:19092" ]; - description = "Kafka broker addresses"; - }; - - modules = { - logserver.enable = mkEnableOption "LogServer module"; - logcollector.enable = mkEnableOption "LogCollector module"; - prefilter.enable = mkEnableOption "Prefilter module"; - inspector.enable = mkEnableOption "Inspector module"; - }; - }; - - config = mkIf cfg.enable { - systemd.services = { - hamstring-logserver = mkIf cfg.modules.logserver.enable { - description = "HAMSTRING LogServer"; - after = [ "network.target" "kafka.service" ]; - wantedBy = [ "multi-user.target" ]; - - serviceConfig = { - ExecStart = "${self.packages.${system}.logserver}/bin/logserver ${cfg.configFile}"; - Restart = "always"; - User = "hamstring"; - }; - }; - - hamstring-logcollector = mkIf cfg.modules.logcollector.enable { - description = "HAMSTRING LogCollector"; - after = [ "network.target" "kafka.service" "hamstring-logserver.service" ]; - wantedBy = [ "multi-user.target" ]; - - serviceConfig = { - ExecStart = "${self.packages.${system}.logcollector}/bin/logcollector ${cfg.configFile}"; - Restart = "always"; - User = "hamstring"; - }; - }; - - hamstring-prefilter = mkIf cfg.modules.prefilter.enable { - description = "HAMSTRING Prefilter"; - after = [ "network.target" "kafka.service" "hamstring-logcollector.service" ]; - wantedBy = [ "multi-user.target" ]; - - serviceConfig = { - ExecStart = "${self.packages.${system}.prefilter}/bin/prefilter ${cfg.configFile}"; - Restart = "always"; - User = "hamstring"; - }; - }; - - hamstring-inspector = mkIf cfg.modules.inspector.enable { - description = "HAMSTRING Inspector"; - after = [ "network.target" "kafka.service" "hamstring-prefilter.service" ]; - wantedBy = [ "multi-user.target" ]; - - serviceConfig = { - ExecStart = "${self.packages.${system}.inspector}/bin/inspector ${cfg.configFile}"; - Restart = "always"; - User = "hamstring"; - }; - }; - }; - - users.users.hamstring = { - isSystemUser = true; - group = "hamstring"; - description = "HAMSTRING pipeline user"; - }; - - users.groups.hamstring = {}; - }; - }; - } - ); -} diff --git a/generate-env.sh b/generate-env.sh deleted file mode 100755 index d6e2719d..00000000 --- a/generate-env.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -# generate-env.sh - -HOST_IP=$(ip route get 1 | awk '{print $(NF-2); exit}') -echo "HOST_IP=$HOST_IP" > .env diff --git a/install_requirements.sh b/install_requirements.sh deleted file mode 100644 index 8c0804fe..00000000 --- a/install_requirements.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -# Check if pip is available -if ! command -v pip &> /dev/null -then - echo "pip could not be found, please install Python and pip first." - exit -fi - -# Find all requirements*.txt files in the current directory -for req_file in $(ls requirements/requirements.*.txt); do - echo "Installing from $req_file..." - pip install -r "$req_file" -done - -echo "All requirements installed!" 
diff --git a/requirements/requirements.alerter.txt b/requirements/requirements.alerter.txt new file mode 100644 index 00000000..8e8937fc --- /dev/null +++ b/requirements/requirements.alerter.txt @@ -0,0 +1,5 @@ +PyYAML~=6.0.1 +colorlog~=6.8.2 +confluent-kafka~=2.4.0 +marshmallow_dataclass~=8.7.1 +clickhouse_connect~=0.8.3 diff --git a/requirements/requirements.detector.txt b/requirements/requirements.detector.txt index d8bd3ea4..6be038c1 100644 --- a/requirements/requirements.detector.txt +++ b/requirements/requirements.detector.txt @@ -6,3 +6,5 @@ PyYAML~=6.0.1 confluent-kafka~=2.4.0 marshmallow_dataclass~=8.7.1 clickhouse_connect~=0.8.3 +pylcs +Levenshtein \ No newline at end of file diff --git a/requirements/requirements.inspector.txt b/requirements/requirements.inspector.txt index 3be91d42..2ee4c1a5 100644 --- a/requirements/requirements.inspector.txt +++ b/requirements/requirements.inspector.txt @@ -7,3 +7,4 @@ numpy~=1.26.4 marshmallow_dataclass~=8.7.1 clickhouse_connect~=0.8.3 scipy==1.12.0 +setuptools==81.0.0 diff --git a/requirements/requirements.train.txt b/requirements/requirements.train.txt index 550eda81..3ca51a33 100644 --- a/requirements/requirements.train.txt +++ b/requirements/requirements.train.txt @@ -12,3 +12,5 @@ te2rules seaborn lightgbm imblearn +pylcs +Levenshtein \ No newline at end of file diff --git a/requirements/requirements.zeek.txt b/requirements/requirements.zeek.txt deleted file mode 100644 index ff28e57a..00000000 --- a/requirements/requirements.zeek.txt +++ /dev/null @@ -1,3 +0,0 @@ -click -PyYAML -colorlog diff --git a/scripts/convert_models_to_onnx.py b/scripts/convert_models_to_onnx.py deleted file mode 100644 index e82a32ef..00000000 --- a/scripts/convert_models_to_onnx.py +++ /dev/null @@ -1,147 +0,0 @@ -#!/usr/bin/env python3 -""" -Convert XGBoost and RandomForest models to ONNX format for C++ inference. 
-""" - -import os -import sys -import pickle -import argparse -from pathlib import Path - -import numpy as np -import onnx -from skl2onnx import convert_sklearn -from skl2onnx.common.data_types import FloatTensorType -import onnxmltools -from onnxmltools.convert import convert_xgboost - -sys.path.append(os.getcwd()) -from src.base.log_config import get_logger - -logger = get_logger("model_conversion") - - -def convert_xgboost_model(model_path: Path, output_path: Path): - """Convert XGBoost model to ONNX format.""" - logger.info(f"Converting XGBoost model: {model_path}") - - # Load the model - with open(model_path, 'rb') as f: - model = pickle.load(f) - - # Define input shape (number of features) - # Based on feature_extractor.cpp: 3 + 26 + 12 + 3 = 44 features - initial_type = [('float_input', FloatTensorType([None, 44]))] - - # Convert to ONNX - onnx_model = convert_xgboost(model, initial_types=initial_type) - - # Save ONNX model - onnx.save_model(onnx_model, str(output_path)) - logger.info(f"Saved ONNX model to: {output_path}") - - return onnx_model - - -def convert_randomforest_model(model_path: Path, output_path: Path): - """Convert RandomForest model to ONNX format.""" - logger.info(f"Converting RandomForest model: {model_path}") - - # Load the model - with open(model_path, 'rb') as f: - model = pickle.load(f) - - # Define input shape - initial_type = [('float_input', FloatTensorType([None, 44]))] - - # Convert to ONNX - onnx_model = convert_sklearn(model, initial_types=initial_type) - - # Save ONNX model - onnx.save_model(onnx_model, str(output_path)) - logger.info(f"Saved ONNX model to: {output_path}") - - return onnx_model - - -def verify_conversion(original_model_path: Path, onnx_model_path: Path, num_samples: int = 100): - """Verify that ONNX model produces same results as original.""" - logger.info("Verifying ONNX conversion...") - - # Load original model - with open(original_model_path, 'rb') as f: - original_model = pickle.load(f) - - # Load ONNX model - import onnxruntime as rt - sess = rt.InferenceSession(str(onnx_model_path)) - input_name = sess.get_inputs()[0].name - - # Generate random test data - np.random.seed(42) - test_data = np.random.randn(num_samples, 44).astype(np.float32) - - # Get predictions from original model - original_preds = original_model.predict_proba(test_data)[:, 1] - - # Get predictions from ONNX model - onnx_preds = sess.run(None, {input_name: test_data})[1][:, 1] - - # Compare results - max_diff = np.max(np.abs(original_preds - onnx_preds)) - mean_diff = np.mean(np.abs(original_preds - onnx_preds)) - - logger.info(f"Max difference: {max_diff}") - logger.info(f"Mean difference: {mean_diff}") - - if max_diff < 1e-5: - logger.info("✓ Conversion verified successfully!") - return True - else: - logger.warning(f"⚠ Large difference detected: {max_diff}") - return False - - -def main(): - parser = argparse.ArgumentParser(description="Convert ML models to ONNX format") - parser.add_argument("--model-dir", type=str, default="./models", - help="Directory containing trained models") - parser.add_argument("--output-dir", type=str, default="./models/onnx", - help="Directory to save ONNX models") - parser.add_argument("--verify", action="store_true", - help="Verify conversion accuracy") - - args = parser.parse_args() - - model_dir = Path(args.model_dir) - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - # Find all model files - xgb_models = list(model_dir.glob("*xgb*.pkl")) - rf_models = list(model_dir.glob("*rf*.pkl")) - - 
logger.info(f"Found {len(xgb_models)} XGBoost models and {len(rf_models)} RandomForest models") - - # Convert XGBoost models - for model_path in xgb_models: - output_path = output_dir / f"{model_path.stem}.onnx" - convert_xgboost_model(model_path, output_path) - - if args.verify: - verify_conversion(model_path, output_path) - - # Convert RandomForest models - for model_path in rf_models: - output_path = output_dir / f"{model_path.stem}.onnx" - convert_randomforest_model(model_path, output_path) - - if args.verify: - verify_conversion(model_path, output_path) - - logger.info("Model conversion complete!") - - -if __name__ == "__main__": - main() diff --git a/scripts/dev-get-csv-data.py b/scripts/dev-get-csv-data.py deleted file mode 100644 index ca1bbe32..00000000 --- a/scripts/dev-get-csv-data.py +++ /dev/null @@ -1,39 +0,0 @@ -import gzip -import os -import sys - -sys.path.append(os.getcwd()) -from src.train.dataset import DatasetLoader -from src.base.log_config import get_logger - -logger = get_logger() - - -def create_dgta_dataset_json_gz(base_path="../data"): - logger.info("Loading data for DGTA dataset...") - try: - loader = DatasetLoader(base_path) - dataset = loader.dgta_dataset - - logger.info("Converting to JSON data...") - json_data = dataset.data.write_json() - logger.warning(json_data) - - logger.info("Compressing data...") - with gzip.open("../data/dgta_dataset.json.gz", "wb") as f: - logger.info("Writing compressed data to file...") - f.write(json_data.encode()) - except FileNotFoundError: - logger.warning( - "Dataset was not found in 'data' directory. Skipping this dataset" - ) - return - except Exception as err: - logger.error(err) - return - - logger.info("DGTA dataset: Done") - - -if __name__ == "__main__": - create_dgta_dataset_json_gz() diff --git a/scripts/mock_logs.dev.py b/scripts/mock_logs.dev.py deleted file mode 100644 index ed611c99..00000000 --- a/scripts/mock_logs.dev.py +++ /dev/null @@ -1,27 +0,0 @@ -import os -import sys - -sys.path.append(os.getcwd()) -from src.base.kafka_handler import SimpleKafkaProduceHandler -from src.mock.log_generator import generate_dns_log_line - -kafka_producer = SimpleKafkaProduceHandler() - -NUMBER_OF_LOGLINES_TO_SEND: int = 50000 - - -def main(): - try: - for i in range(NUMBER_OF_LOGLINES_TO_SEND): - kafka_producer.produce( - "pipeline-logserver_in", f"{generate_dns_log_line('random-ip.de')}" - ) - print("Sent logline", i) - # time.sleep(0.1 * random.uniform(0.1, 1)) - # print(f"{generate_dns_log_line('random-ip.de')}") - except KeyboardInterrupt: - pass - - -if __name__ == "__main__": - main() diff --git a/scripts/query.dev.py b/scripts/query.dev.py deleted file mode 100644 index db027dcd..00000000 --- a/scripts/query.dev.py +++ /dev/null @@ -1,48 +0,0 @@ -import os -import sys - -import clickhouse_connect - -sys.path.append(os.getcwd()) -from src.base.data_classes.clickhouse_connectors import TABLE_NAME_TO_TYPE - - -def get_tables(): - tables = {} - - for table_name in TABLE_NAME_TO_TYPE: - tables[table_name] = [] - - return tables - - -def query_once(client, tables): - for table_name in tables.keys(): - tables[table_name] = client.query(f"SELECT * FROM {table_name} LIMIT 10;") - - return tables - - -def reset_tables(client, tables): - for table_name in tables.keys(): - tables[table_name] = client.command(f"DROP TABLE {table_name};") - - -def main(): - client = clickhouse_connect.get_client(host="172.27.0.11", port=8123) - tables = get_tables() - - results = query_once(client, tables) - - for key in results: - 
print(f"'{key}':") - - if results[key].result_rows: - for row in results[key].result_rows: - print("\t", row) - else: - print("\t -") - - -if __name__ == "__main__": - main() diff --git a/scripts/real_logs.dev.py b/scripts/real_logs.dev.py deleted file mode 100644 index eb1f3932..00000000 --- a/scripts/real_logs.dev.py +++ /dev/null @@ -1,52 +0,0 @@ -import os -import sys -import time - -import numpy as np -import polars as pl -from confluent_kafka import KafkaError - -sys.path.append(os.getcwd()) -from src.base.kafka_handler import SimpleKafkaProduceHandler -from src.mock.log_generator import generate_dns_log_line -from src.base.log_config import get_logger -from src.train.dataset import Dataset, DatasetLoader - -logger = get_logger() -kafka_producer = SimpleKafkaProduceHandler() - -if __name__ == "__main__": - try: - data_base_path: str = "./data" - datasets = DatasetLoader(base_path=data_base_path, max_rows=10000) - dataset = Dataset( - data_path="", - name="", - data=pl.concat( - [ - datasets.dgta_dataset.data, - # datasets.cic_dataset.data, - # datasets.bambenek_dataset.data, - # datasets.dga_dataset.data, - # datasets.dgarchive_dataset.data, - ] - ), - max_rows=100, - ) - data = dataset.data - print(data) - np.random.seed(None) - while True: - for i in range(0, 10): - random_domain = data.sample(n=1) - logline = generate_dns_log_line(random_domain["query"].item()) - try: - kafka_producer.produce( - "pipeline-logserver_in", logline.encode("utf-8") - ) - logger.info(f"Sent logline: {logline}") - except KafkaError: - logger.warning(KafkaError) - time.sleep(0.1) - except KeyboardInterrupt: - pass diff --git a/scripts/run_test.py b/scripts/run_test.py deleted file mode 100644 index a3816379..00000000 --- a/scripts/run_test.py +++ /dev/null @@ -1,331 +0,0 @@ -import argparse -import datetime -import ipaddress -import os -import random -import sys -import time - -import polars as pl -from confluent_kafka import KafkaError - -sys.path.append(os.getcwd()) -from src.base.kafka_handler import SimpleKafkaProduceHandler -from src.train.dataset import Dataset, DatasetLoader -from src.base.log_config import get_logger -from src.base.utils import setup_config - -logger = get_logger() -config = setup_config() - -PRODUCE_TO_TOPIC = config["environment"]["kafka_topics"]["pipeline"]["logserver_in"] - - -class DatasetGenerator: - """Generates log lines and datasets.""" - - def __init__(self, data_base_path: str = "./data"): - datasets = DatasetLoader(base_path=data_base_path, max_rows=10000) - - dataset = Dataset( - name="", - data_path="", - data=pl.concat( - [ - datasets.dgta_dataset.data, - # datasets.cic_dataset.data, - # datasets.bambenek_dataset.data, - # datasets.dga_dataset.data, - # datasets.dgarchive_dataset.data, - ] - ), - max_rows=1000, - ) - - self.domains = dataset.data - - def generate_random_logline( - self, statuses: list[str] = None, record_types: list[str] = None - ): - """Generates a (mostly) random logline.""" - if record_types is None: - record_types = 6 * ["AAAA"] + 10 * ["A"] + ["PR", "CNAME"] - - if statuses is None: - statuses = ["NOERROR", "NXDOMAIN"] - - # choose timestamp - timestamp = ( - datetime.datetime.now() + datetime.timedelta(0, 0, random.randint(0, 900)) - ).strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z" - - # choose status code - status = random.choice(statuses) - - # choose client IP address - number_of_subnets = 50 - src_ip = ( - f"192.168.{random.randint(0, number_of_subnets)}.{random.randint(1, 255)}" - ) - - # choose server IP address - server_ip = 
f"10.10.0.{random.randint(1, 100)}" - - # choose random domain (can be malicious or benign) - domain = self.get_random_domain() - - # choose random record type - record_type = random.choice(record_types) - - # choose random response IP address - def _get_random_ipv4(): - max_ipv4 = ipaddress.IPv4Address._ALL_ONES # 2 ** 32 - 1 - return ipaddress.IPv4Address._string_from_ip_int( - random.randint(0, max_ipv4) - ) - - def _get_random_ipv6(): - max_ipv6 = ipaddress.IPv6Address._ALL_ONES # 2 ** 128 - 1 - return ipaddress.IPv6Address._string_from_ip_int( - random.randint(0, max_ipv6) - ) - - ip_address_choices = [_get_random_ipv4(), _get_random_ipv6()] - response_ip_address = random.choice(ip_address_choices) - - # choose random size - size = f"{random.randint(50, 255)}b" - - return f"{timestamp} {status} {src_ip} {server_ip} {domain} {record_type} {response_ip_address} {size}" - - def get_random_domain(self) -> str: - random_domain = self.domains.sample(n=1) - return random_domain["query"].item() - - def generate_dataset(self, number_of_elements: int) -> list[str]: - dataset = [] - - for _ in range(number_of_elements): - logline = self.generate_random_logline() - dataset.append(logline) - - return dataset - - -class ScalabilityTest: - """Base class for tests that focus on the scalability of the software.""" - - def __init__(self): - self.dataset_generator = DatasetGenerator() - self.kafka_producer = SimpleKafkaProduceHandler() - - self.interval_lengths = None - self.msg_per_sec_in_intervals = None - - def execute(self): - """Executes the test with the configured parameters.""" - logger.warning(f"Start at: {datetime.datetime.now()}") - - cur_index = 0 - for i in range(len(self.msg_per_sec_in_intervals)): - cur_index = self._execute_one_interval( - cur_index=cur_index, - msg_per_sec=self.msg_per_sec_in_intervals[i], - length_in_sec=self.interval_lengths[i], - ) - - logger.warning(f"Stop at: {datetime.datetime.now()}") - - def _execute_one_interval( - self, cur_index: int, msg_per_sec: float | int, length_in_sec: float | int - ) -> int: - start_of_interval_timestamp = datetime.datetime.now() - logger.warning( - f"Start interval with {msg_per_sec} msg/s at {start_of_interval_timestamp}" - ) - - while ( - datetime.datetime.now() - start_of_interval_timestamp - < datetime.timedelta(seconds=length_in_sec) - ): - try: - self.kafka_producer.produce( - PRODUCE_TO_TOPIC, - self.dataset_generator.generate_random_logline(), - ) - logger.info( - f"Sent message {cur_index + 1} at: {datetime.datetime.now()}" - ) - cur_index += 1 - except KafkaError: - logger.warning(KafkaError) - - if msg_per_sec > 0: - time.sleep(1.0 / msg_per_sec) - else: - time.sleep(1.0) - - logger.warning(f"Finish interval with {msg_per_sec} msg/s") - return cur_index - - -class RampUpTest(ScalabilityTest): - """Starts with a low rate and increases the rate in fixed intervals.""" - - def __init__( - self, - msg_per_sec_in_intervals: list[float | int], - interval_length_in_sec: int | float | list[int | float], - ): - super().__init__() - self.msg_per_sec_in_intervals = msg_per_sec_in_intervals - - if type(interval_length_in_sec) is list: - self.interval_lengths = interval_length_in_sec - else: - self.interval_lengths = [ - interval_length_in_sec for _ in range(len(msg_per_sec_in_intervals)) - ] - - if len(interval_length_in_sec) != len(msg_per_sec_in_intervals): - raise Exception("Different lengths of interval lists. 
Must be equal.") - - -class BurstTest(ScalabilityTest): - """Starts with a normal rate, sends a high rate for a short period, then returns to normal rate. Repeats the - process for a defined number of times.""" - - def __init__( - self, - normal_rate_msg_per_sec: float | int, - burst_rate_msg_per_sec: float | int, - normal_rate_interval_length: float | int, - burst_rate_interval_length: float | int, - number_of_intervals: int = 1, - ): - super().__init__() - - self.msg_per_sec_in_intervals = [normal_rate_msg_per_sec] - self.interval_lengths = [normal_rate_interval_length] - - for _ in range(number_of_intervals): - self.msg_per_sec_in_intervals.append(burst_rate_msg_per_sec) - self.msg_per_sec_in_intervals.append(normal_rate_msg_per_sec) - - self.interval_lengths.append(burst_rate_interval_length) - self.interval_lengths.append(normal_rate_interval_length) - - -class LongTermTest: - """Keeps a consistent rate for a long time.""" - - def __init__(self, full_length_in_min: float | int, msg_per_sec: float | int): - self.dataset_generator = DatasetGenerator() - self.kafka_producer = SimpleKafkaProduceHandler() - - self.msg_per_sec = msg_per_sec - self.full_length_in_min = full_length_in_min - - def execute(self): - """Executes the test with the configured parameters.""" - start_timestamp = datetime.datetime.now() - logger.warning( - f"Start {self.full_length_in_min} minute-test with " - f"rate {self.msg_per_sec} msg/sec at: {start_timestamp}" - ) - - cur_index = 0 - while datetime.datetime.now() - start_timestamp < datetime.timedelta( - minutes=self.full_length_in_min - ): - try: - self.kafka_producer.produce( - PRODUCE_TO_TOPIC, - self.dataset_generator.generate_random_logline(), - ) - logger.info( - f"Sent message {cur_index + 1} at: {datetime.datetime.now()}" - ) - cur_index += 1 - except KafkaError: - logger.warning(KafkaError) - time.sleep(1.0 / self.msg_per_sec) - - logger.warning( - f"Stop at: {datetime.datetime.now()}, sent {cur_index} messages in the " - f"past {(datetime.datetime.now() - start_timestamp).total_seconds() / 60} minutes." - ) - - -class MaximumThroughputTest(LongTermTest): - """Keeps a consistent rate that is too high to be handled.""" - - def __init__(self, length_in_min: float | int, msg_per_sec: int = 500): - super().__init__(full_length_in_min=length_in_min, msg_per_sec=msg_per_sec) - - -def main(): - # Get the environment variable, default to 1 if not set - env_test_type_nr = int(os.getenv("TEST_TYPE_NR", 1)) - - parser = argparse.ArgumentParser( - description="Example script with test_type_nr argument." 
-    )
-
-    parser.add_argument(
-        "--test_type_nr",
-        type=int,
-        choices=[1, 2, 3, 4],
-        default=env_test_type_nr,
-        help="""
-        1 - Ramp-up test
-        2 - Burst test
-        3 - Maximum throughput test
-        4 - Long-term test
-        """,
-    )
-
-    args = parser.parse_args()
-
-    print(f"Selected test type number: {args.test_type_nr}")
-    test_type_nr = args.test_type_nr
-
-    """Creates the test instance and executes the test."""
-    match test_type_nr:
-        case 1:
-            ramp_up_test = RampUpTest(
-                msg_per_sec_in_intervals=[10, 50, 100, 150],
-                interval_length_in_sec=[120, 120, 120, 120],
-            )
-            ramp_up_test.execute()
-
-        case 2:
-            burst_test = BurstTest(
-                normal_rate_msg_per_sec=50,
-                burst_rate_msg_per_sec=1000,
-                normal_rate_interval_length=120,
-                burst_rate_interval_length=2,
-                number_of_intervals=3,
-            )
-            burst_test.execute()
-
-        case 3:
-            maximum_throughput_test = MaximumThroughputTest(
-                length_in_min=5,
-            )
-            maximum_throughput_test.execute()
-
-        case 4:
-            long_term_test = LongTermTest(
-                full_length_in_min=10,
-                msg_per_sec=15,
-            )
-            long_term_test.execute()
-
-        case _:
-            pass
-
-
-if __name__ == "__main__":
-    """ """
-    main()
diff --git a/src/alerter/alerter.py b/src/alerter/alerter.py
new file mode 100644
index 00000000..0eba4a64
--- /dev/null
+++ b/src/alerter/alerter.py
@@ -0,0 +1,224 @@
+import json
+import os
+import sys
+import asyncio
+from abc import ABC, abstractmethod
+import importlib
+
+sys.path.append(os.getcwd())
+from confluent_kafka.admin import AdminClient, NewTopic
+from src.base.utils import setup_config, ensure_directory
+from src.base.kafka_handler import (
+    ExactlyOnceKafkaConsumeHandler,
+    ExactlyOnceKafkaProduceHandler,
+    KafkaMessageFetchException,
+)
+from src.base.log_config import get_logger
+
+module_name = "pipeline.alerter"
+logger = get_logger(module_name)
+
+config = setup_config()
+CONSUME_TOPIC_PREFIX = config["environment"]["kafka_topics_prefix"]["pipeline"].get(
+    "detector_to_alerter", "pipeline-detector_to_alerter"
+)
+ALERTING_CONFIG = config["pipeline"].get("alerting", {})
+ALERTERS = ALERTING_CONFIG.get("plugins", [])
+PLUGIN_PATH = "src.alerter.plugins"
+
+
+class AlerterAbstractBase(ABC):
+    """
+    Abstract base class for all alerter implementations.
+    """
+    @abstractmethod
+    def __init__(self, alerter_config, consume_topic) -> None:
+        pass
+
+    @abstractmethod
+    def process_alert(self) -> None:
+        """
+        Process the alert data. Subclasses can mutate self.alert_data.
+        """
+        pass
+
+
+class AlerterBase(AlerterAbstractBase):
+    """
+    Base implementation for Alerters in the pipeline.
+
+    This class handles the common logic for consuming alerts from Kafka,
+    executing custom processing via plugins, and performing base actions
+    like logging to a file or forwarding to an external Kafka topic.
+ """ + def __init__(self, alerter_config, consume_topic) -> None: + self.name = alerter_config.get("name", "generic") + self.consume_topic = consume_topic + self.alerter_config = alerter_config + self.alert_data = None + self.key = None + + self.kafka_consume_handler = ExactlyOnceKafkaConsumeHandler(self.consume_topic) + + # Base actions config + self.log_to_file = ALERTING_CONFIG.get("log_to_file", False) + self.log_file_path = ALERTING_CONFIG.get("log_file_path", "/opt/logs/alerts.txt") + self.log_to_kafka = ALERTING_CONFIG.get("log_to_kafka", False) + self.external_kafka_topic = ALERTING_CONFIG.get("external_kafka_topic", "external_alerts_topic") + + if self.log_to_file: + ensure_directory(self.log_file_path) + + if self.log_to_kafka: + self._setup_kafka_output_topics() + + + def _setup_kafka_output_topics(self): + """ + Ensure that the external Kafka topic exists. + + Since no internal consumer subscribes to this topic, auto-creation + via consumer polling won't happen. We use AdminClient to ensure + the topic exists before producing to it. + """ + brokers = ",".join( + [ + f"{broker['hostname']}:{broker['internal_port']}" + for broker in config["environment"]["kafka_brokers"] + ] + ) + admin_client = AdminClient({"bootstrap.servers": brokers}) + # Attempt to create topic (will do nothing if it already exists) + try: + admin_client.create_topics([NewTopic(self.external_kafka_topic, 1, 1)]) + except Exception as e: + logger.warning(f"Could not auto-create topic {self.external_kafka_topic}: {e}") + + self.kafka_produce_handler = ExactlyOnceKafkaProduceHandler() + + def get_and_fill_data(self) -> None: + if self.alert_data: + logger.warning( + "Alerter is busy: Not consuming new messages. Wait for the Alerter to finish the current workload." + ) + return + + key, data = self.kafka_consume_handler.consume_as_json() + if data: + self.alert_data = data + self.key = key + logger.info(f"Received alert for processing. Belongs to subnet_id {key}.") + else: + logger.info(f"Received empty alert message.") + + def clear_data(self) -> None: + self.alert_data = None + self.key = None + + def _log_to_file_action(self): + """ + Append the current alert_data to the configured log file. + """ + if not self.log_to_file: + return + + logger.info(f"{self.name}: Logging alert to file {self.log_file_path}") + try: + with open(self.log_file_path, "a+") as f: + json.dump(self.alert_data, f) + f.write("\n") + except IOError as e: + logger.error(f"{self.name}: Error writing alert to file: {e}") + raise + + def _log_to_kafka_action(self): + """ + Forward the current alert_data to the external Kafka topic. + """ + if not self.log_to_kafka: + return + + logger.info(f"{self.name}: Forwarding alert to topic {self.external_kafka_topic}") + try: + self.kafka_produce_handler.produce( + topic=self.external_kafka_topic, + data=json.dumps(self.alert_data), + key=self.key, + ) + except Exception as e: + logger.error(f"{self.name}: Error forwarding alert: {e}") + raise + + def bootstrap_alerter_instance(self): + """ + Main loop for the alerter instance. + Consumes alerts, processes them, and executes base actions. + """ + logger.info(f"Starting {self.name} Alerter") + while True: + try: + self.get_and_fill_data() + if self.alert_data: + # 1. Process specific action + self.process_alert() + # 2. 
Execute base logging actions
+                    self._log_to_file_action()
+                    self._log_to_kafka_action()
+
+            except KafkaMessageFetchException as e:
+                logger.debug(e)
+            except IOError as e:
+                logger.error(e)
+                raise e
+            except ValueError as e:
+                logger.debug(e)
+            except KeyboardInterrupt:
+                logger.info(f"{self.consume_topic}: Closing down Alerter...")
+                break
+            except Exception as e:
+                logger.error(f"Unexpected error: {e}")
+            finally:
+                self.clear_data()
+
+    async def start(self):
+        loop = asyncio.get_running_loop()
+        await loop.run_in_executor(None, self.bootstrap_alerter_instance)
+
+
+async def main():
+    tasks = []
+
+    # Setup Generic Alerter Task
+    generic_topic = f"{CONSUME_TOPIC_PREFIX}-generic"
+    logger.info("Initializing Generic Alerter")
+    class_name = "GenericAlerter"
+    mod_name = f"{PLUGIN_PATH}.generic_alerter"
+    module = importlib.import_module(mod_name)
+    AlerterClass = getattr(module, class_name)
+
+    generic_alerter = AlerterClass(
+        alerter_config={"name": "generic"}, consume_topic=generic_topic
+    )
+    tasks.append(asyncio.create_task(generic_alerter.start()))
+
+    # Setup Specific Custom Alerter Tasks
+    if ALERTERS:
+        for alerter_config in ALERTERS:
+            logger.info(f"Initializing Custom Alerter: {alerter_config['name']}")
+            consume_topic = f"{CONSUME_TOPIC_PREFIX}-{alerter_config['name']}"
+            class_name = alerter_config["alerter_class_name"]
+            mod_name = f"{PLUGIN_PATH}.{alerter_config['alerter_module_name']}"
+            module = importlib.import_module(mod_name)
+            AlerterClass = getattr(module, class_name)
+
+            alerter_instance = AlerterClass(
+                alerter_config=alerter_config, consume_topic=consume_topic
+            )
+            tasks.append(asyncio.create_task(alerter_instance.start()))
+
+    await asyncio.gather(*tasks)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/src/zeek/__init__.py b/src/alerter/plugins/__init__.py
similarity index 100%
rename from src/zeek/__init__.py
rename to src/alerter/plugins/__init__.py
diff --git a/src/alerter/plugins/generic_alerter.py b/src/alerter/plugins/generic_alerter.py
new file mode 100644
index 00000000..6816d106
--- /dev/null
+++ b/src/alerter/plugins/generic_alerter.py
@@ -0,0 +1,25 @@
+import os
+import sys
+
+sys.path.append(os.getcwd())
+from src.alerter.alerter import AlerterBase
+from src.base.log_config import get_logger
+
+module_name = "src.alerter.generic_alerter"
+logger = get_logger(module_name)
+
+
+class GenericAlerter(AlerterBase):
+    """
+    Specific implementation for an Alerter that processes alerts
+    from a generic topic.
+
+    It performs no additional processing or transformation by itself,
+    instead relying solely on the base actions (logging to file/Kafka).
+    """
+    def process_alert(self):
+        """
+        Generic implementation: no special processing needed.
+ """ + pass + diff --git a/src/base/utils.py b/src/base/utils.py index 1b60be83..3328a06e 100644 --- a/src/base/utils.py +++ b/src/base/utils.py @@ -228,3 +228,9 @@ def normalize_ipv6_address( def generate_collisions_resistant_uuid(): return f"{uuid.uuid4()}-{uuid.uuid4()}" + + +def ensure_directory(file_path): + directory = os.path.dirname(file_path) + if directory: + os.makedirs(directory, exist_ok=True) \ No newline at end of file diff --git a/src/detector/detector.py b/src/detector/detector.py index 0abc07d5..deb21f33 100644 --- a/src/detector/detector.py +++ b/src/detector/detector.py @@ -17,6 +17,7 @@ from src.base.utils import setup_config, generate_collisions_resistant_uuid from src.base.kafka_handler import ( ExactlyOnceKafkaConsumeHandler, + ExactlyOnceKafkaProduceHandler, KafkaMessageFetchException, ) from src.base.log_config import get_logger @@ -34,6 +35,9 @@ CONSUME_TOPIC_PREFIX = config["environment"]["kafka_topics_prefix"]["pipeline"][ "inspector_to_detector" ] +PRODUCE_TOPIC_PREFIX = config["environment"]["kafka_topics_prefix"]["pipeline"][ + "detector_to_alerter" +] PLUGIN_PATH = "src.detector.plugins" @@ -57,7 +61,7 @@ class DetectorAbstractBase(ABC): # pragma: no cover """ @abstractmethod - def __init__(self, detector_config, consume_topic) -> None: + def __init__(self, detector_config, consume_topic, produce_topics) -> None: pass @abstractmethod @@ -85,7 +89,7 @@ class DetectorBase(DetectorAbstractBase): that provide model-specific prediction logic. """ - def __init__(self, detector_config, consume_topic) -> None: + def __init__(self, detector_config, consume_topic, produce_topics) -> None: """ Initialize the detector with configuration and Kafka topic settings. @@ -104,6 +108,7 @@ def __init__(self, detector_config, consume_topic) -> None: self.threshold = detector_config["threshold"] self.consume_topic = consume_topic + self.produce_topics = produce_topics self.suspicious_batch_id = None self.key = None self.messages = [] @@ -118,6 +123,7 @@ def __init__(self, detector_config, consume_topic) -> None: ) self.kafka_consume_handler = ExactlyOnceKafkaConsumeHandler(self.consume_topic) + self.kafka_produce_handler = ExactlyOnceKafkaProduceHandler() self.model, self.scaler = self._get_model() @@ -256,22 +262,26 @@ def _get_model(self): requests.HTTPError: If there's an error downloading the model. """ logger.info(f"Get model: {self.model} with checksum {self.checksum}") - # TODO test the if! 
-        if not os.path.isfile(self.model_path):
-            model_download_url = self.get_model_download_url()
-            logger.info(
-                f"downloading model {self.model} from {model_download_url} with checksum {self.checksum}"
-            )
-            response = requests.get(model_download_url)
-            response.raise_for_status()
-            with open(self.model_path, "wb") as f:
-                f.write(response.content)
-        scaler_download_url = self.get_scaler_download_url()
+        # if not os.path.isfile(self.model_path):
+        model_download_url = self.get_model_download_url()
+        logger.info(
+            f"downloading model {self.model} from {model_download_url} with checksum {self.checksum}"
+        )
+        response = requests.get(model_download_url)
+        response.raise_for_status()
+        with open(self.model_path, "wb") as f:
+            f.write(response.content)
+        # Handle optional scaler
+        scaler_download_url = self.get_scaler_download_url()
+        if scaler_download_url:
             scaler_response = requests.get(scaler_download_url)
             scaler_response.raise_for_status()
             with open(self.scaler_path, "wb") as f:
                 f.write(scaler_response.content)
-
+            with open(self.scaler_path, "rb") as input_file:
+                scaler = pickle.load(input_file)
+        else:
+            scaler = None
         # Check file sha256
         local_checksum = self._sha256sum(self.model_path)
@@ -286,9 +296,6 @@ def _get_model(self):
         with open(self.model_path, "rb") as input_file:
             clf = pickle.load(input_file)
 
-        with open(self.scaler_path, "rb") as input_file:
-            scaler = pickle.load(input_file)
-
         return clf, scaler
 
     def detect(self) -> None:
@@ -307,10 +314,10 @@ def detect(self) -> None:
         """
         logger.info("Start detecting malicious requests.")
         for message in self.messages:
-            # TODO predict all messages
             y_pred = self.predict(message)
             logger.info(f"Prediction: {y_pred}")
             if np.argmax(y_pred, axis=1) == 1 and y_pred[0][1] > self.threshold:
                 logger.info("Append malicious request to warning.")
                 warning = {
                     "request": message,
@@ -348,12 +355,23 @@ def send_warning(self) -> None:
             overall_score = median(
                 [warning["probability"] for warning in self.warnings]
             )
-            alert = {"overall_score": overall_score, "result": self.warnings}
-
-            logger.info(f"Add alert: {alert}")
-            with open(os.path.join(tempfile.gettempdir(), "warnings.json"), "a+") as f:
-                json.dump(alert, f)
-                f.write("\n")
+            alert = {
+                "overall_score": overall_score,
+                "result": self.warnings,
+                "src_ip": self.key,
+                "alert_timestamp": datetime.datetime.now().isoformat(),
+                "suspicious_batch_id": str(self.suspicious_batch_id),
+                "detector_name": self.name,
+            }
+
+            logger.info(f"Producing alert to Kafka: {alert}")
+
+            for topic in self.produce_topics:
+                self.kafka_produce_handler.produce(
+                    topic=topic,
+                    data=json.dumps(alert),
+                    key=self.key,
+                )
 
             self.alerts.insert(
                 dict(
@@ -506,15 +524,24 @@ async def main():  # pragma: no cover
     3. Creates detector instances
    4. 
Starts all detectors concurrently """ + # ensure all detectors configure what to do + # instead of doing ensure alert directly we now use alerter topics + tasks = [] for detector_config in DETECTORS: consume_topic = f"{CONSUME_TOPIC_PREFIX}-{detector_config['name']}" + produce_topics_str = detector_config.get("produce_topics", "") + if produce_topics_str: + produce_topics = [f"{PRODUCE_TOPIC_PREFIX}-{t.strip()}" for t in produce_topics_str.split(",")] + else: + produce_topics = [f"{PRODUCE_TOPIC_PREFIX}-generic"] + class_name = detector_config["detector_class_name"] module_name = f"{PLUGIN_PATH}.{detector_config['detector_module_name']}" module = importlib.import_module(module_name) DetectorClass = getattr(module, class_name) detector = DetectorClass( - detector_config=detector_config, consume_topic=consume_topic + detector_config=detector_config, consume_topic=consume_topic, produce_topics=produce_topics ) tasks.append(asyncio.create_task(detector.start())) await asyncio.gather(*tasks) diff --git a/src/detector/plugins/dga_detector.py b/src/detector/plugins/dga_detector.py index 2b60d732..a5098187 100644 --- a/src/detector/plugins/dga_detector.py +++ b/src/detector/plugins/dga_detector.py @@ -19,7 +19,7 @@ class DGADetector(DetectorBase): to make predictions about whether a domain is likely generated by a DGA. """ - def __init__(self, detector_config, consume_topic): + def __init__(self, detector_config, consume_topic, produce_topics): """ Initialize the DGA detector with configuration parameters. @@ -32,7 +32,7 @@ def __init__(self, detector_config, consume_topic): consume_topic (str): Kafka topic from which the detector will consume messages. """ self.model_base_url = detector_config["base_url"] - super().__init__(detector_config, consume_topic) + super().__init__(detector_config, consume_topic, produce_topics) def get_model_download_url(self): """ @@ -61,12 +61,7 @@ def get_scaler_download_url(self): Returns: str: Fully qualified URL where the model can be downloaded. """ - self.model_base_url = ( - self.model_base_url[:-1] - if self.model_base_url[-1] == "/" - else self.model_base_url - ) - return f"{self.model_base_url}/files/?p=%2F{self.model}%2F{self.checksum}%2Fscaler.pickle&dl=1" + return None def predict(self, message): """ diff --git a/src/detector/plugins/domainator_detector.py b/src/detector/plugins/domainator_detector.py new file mode 100644 index 00000000..2fec43d7 --- /dev/null +++ b/src/detector/plugins/domainator_detector.py @@ -0,0 +1,185 @@ +from src.detector.detector import DetectorBase +import math +import numpy as np +from collections import defaultdict +import itertools +import pylcs +import Levenshtein +from src.base.log_config import get_logger + +module_name = "data_analysis.detector" +logger = get_logger(module_name) + + +class DomainatorDetector(DetectorBase): + """ + Detector implementation for identifying data exfiltration and command and control on the + subdomain level. + + This class extends the DetectorBase to provide specific functionality for detecting + malicious queries. It analyzes subdomain similarity characteristics based on grouping + of the queries in windows of fixed size, in order to identify potential data exfiltration + or command and control. + + The detector extracts various statistical similarity features from windows of subdomains + to make predictions about whether a query is likely malicious. 
+    """
+
+    def __init__(self, detector_config, consume_topic, produce_topics):
+        """
+        Initialize the Domainator detector with configuration parameters.
+
+        Sets up the detector with the model base URL and passes configuration to the
+        base class for standard detector initialization.
+
+        Args:
+            detector_config (dict): Configuration dictionary containing detector-specific
+                parameters including base_url, model, checksum, and threshold.
+            consume_topic (str): Kafka topic from which the detector will consume messages.
+            produce_topics (list): Kafka topics to which the detector will produce alerts.
+        """
+        self.model_base_url = detector_config["base_url"]
+        self.message_queues = defaultdict(list)
+        super().__init__(detector_config, consume_topic, produce_topics)
+
+    def get_model_download_url(self):
+        """
+        Generate the complete URL for downloading the Domainator detection model.
+
+        Constructs the URL using the base URL from configuration and appends the
+        specific model filename with checksum for verification.
+
+        Returns:
+            str: Fully qualified URL where the model can be downloaded.
+        """
+        self.model_base_url = (
+            self.model_base_url[:-1]
+            if self.model_base_url[-1] == "/"
+            else self.model_base_url
+        )
+        return f"{self.model_base_url}/files/?p=%2F{self.model}%2F{self.checksum}%2F{self.model}.pickle&dl=1"
+
+    def get_scaler_download_url(self):
+        """
+        Generate the complete URL for downloading the scaler that belongs to
+        the Domainator detection model.
+
+        Constructs the URL using the base URL from configuration and appends the
+        scaler filename with checksum for verification.
+
+        Returns:
+            str: Fully qualified URL where the scaler can be downloaded.
+        """
+        self.model_base_url = (
+            self.model_base_url[:-1]
+            if self.model_base_url[-1] == "/"
+            else self.model_base_url
+        )
+        return f"{self.model_base_url}/files/?p=%2F{self.model}%2F{self.checksum}%2Fscaler.pickle&dl=1"
+
+    def predict(self, messages):
+        """
+        Process a window of messages and predict whether the domain is likely
+        being used for malicious exfiltration or communication.
+
+        Extracts features from the subdomains in the messages and uses the loaded
+        machine learning model to generate prediction probabilities.
+
+        Args:
+            messages (list): List of message dicts, each expected to have a
+                "domain_name" key with the query to analyze.
+
+        Returns:
+            np.ndarray: Prediction probabilities for each class. Typically a 2D array
+            of shape (1, 2) for binary classification (benign/malicious).
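+
+        Example (illustrative only; actual probabilities depend on the
+        trained model):
+
+            >>> detector.predict([
+            ...     {"domain_name": "aa1.tunnel.example.org"},
+            ...     {"domain_name": "aa2.tunnel.example.org"},
+            ...     {"domain_name": "aa3.tunnel.example.org"},
+            ... ])
+            array([[0.03, 0.97]])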
+        """
+        queries = [message['domain_name'] for message in messages]
+
+        y_pred = self.model.predict_proba(self._get_features(queries))
+        logger.debug(f"Prediction: {y_pred}")
+        return y_pred
+
+    def detect(self):
+        """Group incoming messages into per-domain windows and flag suspicious ones."""
+        logger.info("Start detecting malicious requests.")
+        for message in self.messages:
+            message_domain = self._strip_domain(message["domain_name"])
+            self.message_queues[message_domain].append(message)
+
+            if len(self.message_queues[message_domain]) >= 3:
+                y_pred = self.predict(self.message_queues[message_domain])
+                logger.info(f"Prediction: {y_pred}")
+                if np.argmax(y_pred, axis=1) == 1 and y_pred[0][1] > self.threshold:
+                    logger.info("Append malicious request domain to warning.")
+                    warning = {
+                        "request_domain": message_domain,
+                        "probability": float(y_pred[0][1]),
+                        "name": self.name,
+                        "sha256": self.checksum,
+                    }
+                    self.warnings.append(warning)
+
+            if len(self.message_queues[message_domain]) >= 10:
+                self.message_queues[message_domain].pop(0)
+
+    def _strip_domain(self, query: str):
+        """Extract the registered domain name from the query for window grouping.
+
+        Currently does not differentiate between messages coming from
+        different users.
+
+        Args:
+            query (str): Full query string, e.g. "sub.example.org".
+
+        Returns:
+            str: Second-level domain label that the window will be grouped by.
+        """
+        query = query.strip(".")
+        query = query.split(".")
+
+        domain = ""
+
+        if len(query) >= 2:
+            domain = query[-2]
+
+        return domain
+
+    def _get_features(self, queries: list) -> np.ndarray:
+        """Extracts a similarity feature vector from a window of queries for
+        ML model inference.
+
+        Strips the registered domain from each query, then computes pairwise
+        similarity statistics over the remaining subdomains: Levenshtein ratio,
+        Jaro and Jaro-Winkler similarity (also on the reversed strings), and
+        normalized longest-common-subsequence and longest-common-substring
+        lengths.
+
+        Args:
+            queries (list): List of query strings to extract features from.
+
+        Returns:
+            numpy.ndarray: Feature vector ready for ML model prediction.
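+
+        Example (illustrative; exact values depend on the window contents):
+
+            >>> detector._get_features(
+            ...     ["a1.example.com", "a2.example.com", "a3.example.com"]
+            ... ).shape
+            (1, 7)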
+        """
+        queries = [query.strip(".") for query in queries]
+        subdomains = ['.'.join(domain.split(".")[:-2]) for domain in queries]
+
+        # Values could be put directly into an array, as np.fromiter converts
+        # them anyway, but the named keys slightly improve readability.
+        metrics = {
+            'levenshtein': [],
+            'jaro': [],
+            'rev_jaro': [],
+            'jaro_winkler': [],
+            'rev_jaro_wink': [],
+            'lcs_seq': [],
+            'lcs_str': [],
+        }
+
+        cartesian = list(itertools.combinations(subdomains, 2))
+
+        metrics['levenshtein'] = np.mean([Levenshtein.ratio(product[0], product[1]) for product in cartesian])
+        metrics['jaro'] = np.mean([Levenshtein.jaro(product[0], product[1]) for product in cartesian])
+        metrics['jaro_winkler'] = np.mean([Levenshtein.jaro_winkler(product[0], product[1], prefix_weight=0.2) for product in cartesian])
+        metrics['rev_jaro'] = np.mean([Levenshtein.jaro(product[0][::-1], product[1][::-1]) for product in cartesian])
+        metrics['rev_jaro_wink'] = np.mean([Levenshtein.jaro_winkler(product[0][::-1], product[1][::-1], prefix_weight=0.2) for product in cartesian])
+
+        metrics['lcs_seq'] = np.mean([pylcs.lcs_sequence_length(product[0], product[1]) / ((len(product[0]) + len(product[1])) / 2) if len(product[0]) and len(product[1]) else 0.0 for product in cartesian])
+        metrics['lcs_str'] = np.mean([pylcs.lcs_string_length(product[0], product[1]) / ((len(product[0]) + len(product[1])) / 2) if len(product[0]) and len(product[1]) else 0.0 for product in cartesian])
+
+        return np.fromiter(metrics.values(), dtype=float).reshape(1, -1)
\ No newline at end of file
diff --git a/src/logcollector/batch_handler.py b/src/logcollector/batch_handler.py
index dc07313e..46931207 100644
--- a/src/logcollector/batch_handler.py
+++ b/src/logcollector/batch_handler.py
@@ -432,8 +432,8 @@ class BufferedBatchSender:
     def __init__(self, produce_topics, collector_name):
         self.topics = produce_topics
         self.batch_configuration = get_batch_configuration(collector_name)
-        self.batch = BufferedBatch(collector_name)
         self.timer = None
+        self.batch = BufferedBatch(collector_name)
 
         self.kafka_produce_handler = ExactlyOnceKafkaProduceHandler()
 
@@ -441,10 +441,16 @@ def __init__(self, produce_topics, collector_name):
         self.logline_timestamps = ClickHouseKafkaSender("logline_timestamps")
 
     def __del__(self):
-        if self.timer:
-            self.timer.cancel()
-
-        self._send_all_batches(reset_timer=False)
+        timer = getattr(self, "timer", None)
+        if timer:
+            timer.cancel()
+
+        batch = getattr(self, "batch", None)
+        if batch:
+            try:
+                self._send_all_batches(reset_timer=False)
+            except Exception as e:
+                logger.debug(f"Skipping batch flush during cleanup: {e}")
 
     def add_message(self, key: str, message: str) -> None:
         """Adds a message to the batch and triggers sending if batch size limit is reached.
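A quick aside on the `__del__` rewrite above: Python may invoke `__del__` on a partially constructed object, e.g. when `ExactlyOnceKafkaProduceHandler()` raises inside `__init__`, so every attribute access in the destructor needs a guard. A minimal, self-contained sketch of that pattern (class and method names here are illustrative, not part of this repo):

```python
class FlushOnDelete:
    """Sketch: a destructor that tolerates a partially failed __init__."""

    def __init__(self, fail: bool = False):
        self.timer = None  # assigned early, so __del__ can always see it
        if fail:
            # Simulate a collaborator (e.g. a Kafka client) failing mid-init;
            # self.batch is never assigned in this case.
            raise RuntimeError("simulated mid-__init__ failure")
        self.batch = ["pending message"]

    def _flush(self) -> None:
        print(f"flushing {len(self.batch)} message(s)")
        self.batch.clear()

    def __del__(self):
        # getattr guards: attributes may be missing if __init__ raised early.
        timer = getattr(self, "timer", None)
        if timer:
            timer.cancel()
        if getattr(self, "batch", None):
            try:
                self._flush()
            except Exception:
                pass  # never let cleanup errors escape __del__


try:
    FlushOnDelete(fail=True)  # __del__ still runs, without AttributeError
except RuntimeError:
    pass
FlushOnDelete()  # prints "flushing 1 message(s)" when collected
```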
diff --git a/src/logcollector/collector.py b/src/logcollector/collector.py
index b0e74a8d..e8901bdf 100644
--- a/src/logcollector/collector.py
+++ b/src/logcollector/collector.py
@@ -60,13 +60,13 @@ def __init__(
         """
         self.protocol = protocol
         self.consume_topic = consume_topic
+        self.kafka_consume_handler = ExactlyOnceKafkaConsumeHandler(consume_topic)
         self.batch_configuration = utils.get_batch_configuration(collector_name)
         self.loglines = asyncio.Queue()
         self.batch_handler = BufferedBatchSender(
             produce_topics=produce_topics, collector_name=collector_name
         )
         self.logline_handler = LoglineHandler(validation_config)
-        self.kafka_consume_handler = ExactlyOnceKafkaConsumeHandler(consume_topic)
 
         # databases
         self.failed_protocol_loglines = ClickHouseKafkaSender("failed_loglines")
diff --git a/src/train/dataset.py b/src/train/dataset.py
index 247b05c0..d3da13cf 100644
--- a/src/train/dataset.py
+++ b/src/train/dataset.py
@@ -12,7 +12,7 @@
 logger = get_logger("train.dataset")
 
 
-def preprocess(x: pl.DataFrame) -> pl.DataFrame:
+def preprocess(x: pl.DataFrame, keep_all: bool = False) -> pl.DataFrame:
     """Preprocesses DataFrame into structured dataset for feature extraction.
 
     Filters out empty queries, removes duplicates, splits domain names into labels,
@@ -27,7 +27,9 @@ def preprocess(x: pl.DataFrame) -> pl.DataFrame:
     """
     logger.debug("Start preprocessing data.")
     x = x.filter(pl.col("query").str.len_chars() > 0)
-    x = x.unique(subset="query")
+    if not keep_all:
+        x = x.unique(subset="query")
+
     x = x.with_columns(
         [
             (pl.col("query").str.split(".").alias("labels")),
@@ -136,39 +138,6 @@ def cast_bambenek(data_path: str, max_rows: int) -> pl.DataFrame:
     return df  # pl.concat([df_legit, df_malicious])
 
 
-def cast_cic(data_path: List[str], max_rows: int) -> pl.DataFrame:
-    """Loads and processes CIC DNS dataset from multiple CSV files.
-
-    Reads CIC DNS datasets (benign, malware, phishing, spam), assigns appropriate
-    class labels based on filename, and combines all datasets into a unified format.
-
-    Args:
-        data_path (List[str]): List of paths to CIC dataset CSV files.
-        max_rows (int): Maximum number of rows to process per file.
-
-    Returns:
-        pl.DataFrame: Combined CIC dataset with structured domain information.
-    """
-    dataframes = []
-    for data in data_path:
-        logger.info(f"Start casting data set {data}.")
-        y = data.split("_")[-1].split(".")[0]
-        df = pl.read_csv(
-            data, has_header=False, n_rows=max_rows if max_rows > 0 else None
-        )
-        if y == "benign":
-            df = df.with_columns([pl.lit("legit").alias("class")])
-        else:
-            df = df.with_columns([pl.lit(y).alias("class")])
-        df = df.rename({"column_1": "query"})
-        df = preprocess(df)
-
-        logger.info(f"Data loaded with shape {df.shape}")
-        dataframes.append(df)
-
-    return pl.concat(dataframes)
-
-
 def cast_dgarchive(data_path: str, max_rows: int) -> pl.DataFrame:
     """Loads and processes DGArchive dataset from CSV file.
 
@@ -290,6 +259,38 @@ def cast_heicloud(data_path: str, max_rows: int) -> pl.DataFrame:
     return pl.concat(dataframes)
 
 
+def cast_domainator(data_path: List[str], max_rows: int) -> pl.DataFrame:
+    """Loads and processes the Domainator dataset from multiple CSV files.
+
+    Reads Domainator datasets (benign, malicious), appends a user source column
+    if not present, then preprocesses the queries (keeping duplicates) and
+    combines the datasets into one for training.
+
+    Args:
+        data_path (List[str]): Paths to the Domainator CSV files.
+        max_rows (int): Maximum number of rows to process per file.
+
+    Returns:
+        pl.DataFrame: Combined Domainator dataset with structured domain information.
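+
+    Example (illustrative; paths are placeholders):
+
+        >>> df = cast_domainator(
+        ...     ["datasets/domainator/domainator_combined.csv"], max_rows=1000
+        ... )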
+    """
+    dataframes = []
+    for path in data_path:
+        logger.info(f"Start casting data set {path}.")
+        df = pl.read_csv(
+            path,
+            separator=",",
+            has_header=True,
+            n_rows=max_rows if max_rows > 0 else None,
+        )
+        if 'user' not in df.columns:
+            df.insert_column(0, pl.Series('user', ['testbed'] * len(df)))
+        df = preprocess(df, keep_all=True)
+        logger.info(f"Data loaded with shape {df.shape}")
+        dataframes.append(df)
+
+    return pl.concat(dataframes)
+
 
 class DatasetLoader:
     """Manages loading and access to multiple DNS datasets for training.
 
@@ -348,22 +349,6 @@ def heicloud_dataset(self) -> Dataset:
             max_rows=self.max_rows,
         )
         return self.heicloud_data
-
-    @property
-    def cic_dataset(self) -> Dataset:
-        self.cic_data = Dataset(
-            name="cic",
-            data_path=[
-                f"{self.base_path}/cic/CICBellDNS2021_CSV_benign.csv",
-                f"{self.base_path}/cic/CICBellDNS2021_CSV_malware.csv",
-                f"{self.base_path}/cic/CICBellDNS2021_CSV_phishing.csv",
-                f"{self.base_path}/cic/CICBellDNS2021_CSV_spam.csv",
-            ],
-            cast_dataset=cast_cic,
-            max_rows=self.max_rows,
-        )
-        return self.cic_data
 
     @property
     def dgarchive_dataset(self) -> list[Dataset]:
         dgarchive_files = [
@@ -380,6 +365,23 @@ def dgarchive_dataset(self) -> list[Dataset]:
             )
         )
         return self.dgarchive_data
+
+    @property
+    def domainator_dataset(self) -> Dataset:
+        self.domainator_data = Dataset(
+            name="domainator",
+            data_path=[
+                f"{self.base_path}/domainator/domainator_combined.csv",
+                f"{self.base_path}/domainator/domainator_ziza.csv",
+            ],
+            cast_dataset=cast_domainator,
+            max_rows=self.max_rows,
+        )
+
+        logger.debug("Domainator Loader")
+        logger.debug(self.domainator_data)
+
+        return self.domainator_data
 
 
 @dataclass
diff --git a/src/train/feature.py b/src/train/feature.py
index e415fd29..900a9890 100644
--- a/src/train/feature.py
+++ b/src/train/feature.py
@@ -6,6 +6,11 @@
 import math
 import polars as pl
+import numpy as np
+import itertools
+import pylcs
+import Levenshtein
+
 sys.path.append(os.getcwd())
 
 from src.base.log_config import get_logger
@@ -155,3 +160,66 @@ def transform(self, x: pl.DataFrame) -> pl.DataFrame:
 
         logger.info("Finished data transformation")
         return x
+
+    def transform_domainator(self, x: pl.DataFrame) -> pl.DataFrame:
+        """Groups queries per user and domain into fixed-size windows and
+        computes pairwise subdomain similarity features for each window."""
+        logger.debug("Domainator transform")
+
+        metrics_list = []
+        window_size = 10
+        min_window_size = 3
+
+        x = x.with_columns(
+            pl.concat_str([pl.col('secondleveldomain'), pl.col('tld')], separator='.').alias('domain')
+        )
+
+        for user in x['user'].unique():
+            for domain in x.filter(pl.col('user') == user)['domain'].unique():
+                sub_list = x.filter((pl.col('user') == user) & (pl.col('domain') == domain))['thirdleveldomain']
+                # currently assumes a domain is not both malicious and legitimate
+                true_class = x.filter(pl.col('domain') == domain)['class'].unique()
+
+                windows = [sub_list[i:i + window_size] for i in range(0, len(sub_list), window_size)]
+
+                if not windows:
+                    windows = sub_list
+
+                for item in windows:
+                    if len(item) > min_window_size:
+                        cartesian = list(itertools.combinations(item, 2))
+
+                        metrics = {
+                            'user': user,
+                            'class': true_class[0],
+                            'query': domain,
+                            'levenshtein': [],
+                            'jaro': [],
+                            'rev_jaro': [],
+                            'jaro_winkler': [],
+                            'rev_jaro_wink': [],
+                            'lcs_seq': [],
+                            'lcs_str': [],
+                        }
+
+                        metrics['levenshtein'] = np.mean([Levenshtein.ratio(product[0], product[1]) for product in cartesian])
+                        metrics['jaro'] = np.mean([Levenshtein.jaro(product[0], product[1]) for product in cartesian])
+                        metrics['jaro_winkler'] = np.mean([Levenshtein.jaro_winkler(product[0],
product[1], prefix_weight=0.2) for product in cartesian])
+                        metrics['rev_jaro'] = np.mean([Levenshtein.jaro(product[0][::-1], product[1][::-1]) for product in cartesian])
+                        metrics['rev_jaro_wink'] = np.mean([Levenshtein.jaro_winkler(product[0][::-1], product[1][::-1], prefix_weight=0.2) for product in cartesian])
+
+                        metrics['lcs_seq'] = np.mean([pylcs.lcs_sequence_length(product[0], product[1]) / ((len(product[0]) + len(product[1])) / 2) if len(product[0]) and len(product[1]) else 0.0 for product in cartesian])
+                        metrics['lcs_str'] = np.mean([pylcs.lcs_string_length(product[0], product[1]) / ((len(product[0]) + len(product[1])) / 2) if len(product[0]) and len(product[1]) else 0.0 for product in cartesian])
+
+                        metrics_list.append(metrics)
+
+        df = pl.from_dicts(metrics_list)
+
+        logger.debug(df)
+        logger.debug(df['class'].unique())
+
+        df = df.drop(["user"])
+
+        logger.debug("Transform done")
+        return df
\ No newline at end of file
diff --git a/src/train/model.py b/src/train/model.py
index 3353f2c3..d2b817f4 100644
--- a/src/train/model.py
+++ b/src/train/model.py
@@ -84,7 +84,11 @@ def __init__(
             try:
                 X, y = self._load_npy(ds.name)
             except FileNotFoundError:
-                data = self.processor.transform(x=ds.data)
+                if ds.name == "domainator":
+                    ds.data = self.processor.transform_domainator(x=ds.data)
+                    data = ds.data.drop("query")
+                else:
+                    data = self.processor.transform(x=ds.data)
                 X = data.drop("class").to_numpy()
                 encoded, _, _ = self._label_encoder(data["class"].to_list())
                 y = np.asarray(encoded).reshape(-1)
diff --git a/src/train/train.py b/src/train/train.py
index 78aeac45..dbab9e2d 100644
--- a/src/train/train.py
+++ b/src/train/train.py
@@ -39,9 +39,9 @@ class DatasetEnum(str, Enum):
     """Available dataset configurations for DGA detection model training"""
 
     COMBINE = "combine"
-    CIC = "cic"
     DGTA = "dgta"
     DGARCHIVE = "dgarchive"
+    DOMAINATOR = "domainator"
 
 
 @unique
@@ -112,13 +112,12 @@ def __init__(
                 self.dataset.append(self.dataset_loader.dga_dataset)
                 self.dataset.append(self.dataset_loader.heicloud_dataset)
                 self.dataset = self.dataset + self.dataset_loader.dgarchive_dataset
-            # CIC DNS does work in practice and data is not clean.
-            case "cic":
-                self.dataset.append(self.dataset_loader.cic_dataset)
             case "dgta":
                 self.dataset.append(self.dataset_loader.dgta_dataset)
             case "dgarchive":
-                self.dataset.append(self.dataset_loader.dgarchive_data)
+                self.dataset.append(self.dataset_loader.dgarchive_dataset)
+            case "domainator":
+                self.dataset.append(self.dataset_loader.domainator_dataset)
             case _:
                 raise NotImplementedError(f"Dataset not implemented!")
         logger.info(f"Set up Pipeline.")
@@ -380,8 +379,8 @@ def _sha256sum(self, file_path: str) -> str:
         "--dataset",
         "dataset",
         default="combine",
-        type=click.Choice(["combine", "dgarchive", "cic", "dgta"]),
-        help="Data set to train model, choose between all available datasets, DGArchive, CIC and DGTA.",
+        type=click.Choice(["combine", "dgarchive", "dgta", "domainator"]),
+        help="Dataset to train the model on; choose between the combined set, DGArchive, DGTA and Domainator (DNS exfiltration).",
     ),
     click.option(
         "--dataset_path",
diff --git a/src/zeek/additional_configs/dns_config.zeek b/src/zeek/additional_configs/dns_config.zeek
deleted file mode 100644
index a66253ab..00000000
--- a/src/zeek/additional_configs/dns_config.zeek
+++ /dev/null
@@ -1,93 +0,0 @@
-@load base/protocols/dns
-module CustomDNS;
-export {
-    redef enum Log::ID += { LOG };
-    type Info: record {
-        ts: string &log;
-        uid: string &log;
-        src_ip: addr &log;
-        src_port: port &log;
-        dns_server_ip: addr &log;
-        dns_server_port: port &log;
-        domain_name: string &log &optional;
-        record_type: string &log &optional;
-        response_ip: vector of string &log &optional;
-        ttls: vector of interval &log &optional;
-        rejected: bool &log &default=F;
-        status_code_id: count &log &optional;
-        status_code: string &log &optional;
-
-    };
-    global log_dns: event(rec: Info);
-    global dns_payload_sizes: table[string] of count
-        &default=0
-        &write_expire = 5min;
-}
-
-
-event zeek_init() &priority=5
-{
-    Log::create_stream(CustomDNS::LOG, [$columns=Info, $path="custom_dns"]);
-}
-
-
-redef record CustomDNS::Info += {
-    size: count &log &optional;
-};
-
-event dns_message(c: connection, is_query: bool, msg: dns_msg, len: count)
-{
-    dns_payload_sizes[c$uid] = len;
-}
-
-event DNS::log_dns(rec: DNS::Info)
-{
-    local dnsLog: Info = [
-        $ts = strftime("%Y-%m-%dT%H:%M:%S", rec$ts),
-        $uid = rec$uid,
-        $src_ip = rec$id$orig_h,
-        $src_port = rec$id$orig_p,
-        $dns_server_ip = rec$id$resp_h,
-        $dns_server_port = rec$id$resp_p,
-        $rejected = rec$rejected
-    ];
-
-    ##### add custom log messages if a given field that needs to be present is not present in the logline ####
-    # use this only for fields that are absolutely necessary
-
-    # Keep this deactivated for now, as we want to use zeek at first to not prefilter anything
-
-#    if ( ! rec?$query )
-#        print fmt("Info: missing domain in DNS log %s, skipping the log...", rec);
-#    if ( !
rec?$conn ) -# print fmt("Info:could not determine request length for line %s, skipping the log...", rec); -# - ########################################################################################################### - - if ( rec?$query ) - dnsLog$domain_name = rec$query; - - if ( rec?$qtype_name ) - dnsLog$record_type = rec$qtype_name; - - if ( rec?$answers ) - dnsLog$response_ip = rec$answers; - - if ( rec?$TTLs ) - dnsLog$ttls = rec$TTLs; - - if ( rec?$rcode ) - dnsLog$status_code_id = rec$rcode; - - if ( rec?$rcode_name ) - dnsLog$status_code = rec$rcode_name; - - if ( rec$uid in dns_payload_sizes ) - { - dnsLog$size = dns_payload_sizes[rec$uid]; - delete dns_payload_sizes[rec$uid]; - } - - Log::write(CustomDNS::LOG, dnsLog); -} diff --git a/src/zeek/additional_configs/http_config.zeek b/src/zeek/additional_configs/http_config.zeek deleted file mode 100644 index 51c6a04b..00000000 --- a/src/zeek/additional_configs/http_config.zeek +++ /dev/null @@ -1,70 +0,0 @@ -@load base/protocols/http -module CustomHTTP; -export { - redef enum Log::ID += { LOG }; - type Info: record { - ts: string &log; - uid: string &log; - src_ip: addr &log; - src_port: port &log; - dst_ip: addr &log; - dst_port: port &log; - method: string &log; - host: string &log &optional; - uri: string &log; - referrer: string &log &optional; - version: string &log &optional; - user_agent: string &log &optional; - request_body_len: count &log; - response_body_len: count &log; - status_code: count &log &optional; - status_msg: string &log &optional; - }; - global log_http: event(rec: Info); -} -event zeek_init() &priority=5 -{ - Log::create_stream(CustomHTTP::LOG, [$columns=Info, $path="custom_http"]); -} - - - -event HTTP::log_http(rec: HTTP::Info) -{ - local mylog: Info = [ - $ts = strftime("%Y-%m-%dT%H:%M:%S", rec$ts), - $uid = rec$uid, - $src_ip = rec$id$orig_h, - $src_port = rec$id$orig_p, - $dst_ip = rec$id$resp_h, - $dst_port = rec$id$resp_p, - $method = rec$method, - $uri = rec$uri, - $request_body_len = rec$request_body_len, - $response_body_len = rec$response_body_len - ]; - - - ##### add custom log messages if a given field that needs to be present is not present in the logline #### - - if ( ! rec?$host ) - print fmt("Info: missing host name in HTTP log %s, skipping the log...", rec); - if ( ! 
rec?$status_code ) - print fmt("Info: missing status code in HTTP log %s, skipping the log...", rec); - ########################################################################################################### - - if ( rec?$host ) - mylog$host = rec$host; - if ( rec?$version ) - mylog$version = rec$version; - if ( rec?$referrer ) - mylog$referrer = rec$referrer; - if ( rec?$user_agent ) - mylog$user_agent = rec$user_agent; - if ( rec?$status_code ) - mylog$status_code = rec$status_code; - if ( rec?$status_msg ) - mylog$status_msg = rec$status_msg; - - Log::write(CustomHTTP::LOG, mylog); -} diff --git a/src/zeek/base_node.cfg b/src/zeek/base_node.cfg deleted file mode 100644 index f1f50d25..00000000 --- a/src/zeek/base_node.cfg +++ /dev/null @@ -1,11 +0,0 @@ -[logger] -type=logger -host=localhost - -[manager] -type=manager -host=localhost - -[proxy] -type=proxy -host=localhost diff --git a/src/zeek/zeek_analysis_handler.py b/src/zeek/zeek_analysis_handler.py deleted file mode 100644 index 2e1199b7..00000000 --- a/src/zeek/zeek_analysis_handler.py +++ /dev/null @@ -1,125 +0,0 @@ -import sys -import os -import threading -import subprocess -import glob - -sys.path.append(os.getcwd()) -from src.base.log_config import get_logger - -logger = get_logger("zeek.sensor") - - -class ZeekAnalysisHandler: - """ - Handles the execution of Zeek analysis in either static or network analysis mode. - - This class manages the Zeek processing workflow, supporting both static analysis of - PCAP files and live network traffic analysis. It provides the necessary infrastructure - for launching Zeek processes, managing their execution, and handling their output. - - """ - - def __init__(self, zeek_config_location: str, zeek_log_location: str): - """ - Initialize the Zeek analysis handler with configuration and log locations. - - Args: - zeek_config_location: Path to the Zeek configuration file that defines - the analysis scripts and plugins to be loaded - zeek_log_location: Path where Zeek will write its processing logs - - Note: - The configuration file location typically points to local.zeek or - another site-specific configuration file that incorporates the necessary - analysis scripts and Kafka plugin configuration. - """ - self.zeek_log_location = zeek_log_location - self.zeek_config_location = zeek_config_location - - def start_analysis(self, static_analysis: bool): - """ - Start Zeek analysis in the specified mode. - - This method serves as the main entry point for initiating Zeek processing, - delegating to the appropriate analysis method based on the mode parameter. - - Args: - static_analysis: If True, process stored PCAP files; if False, analyze - live network traffic - """ - if static_analysis: - logger.info("static analysis mode selected") - self.start_static_analysis() - else: - logger.info("network analysis mode selected") - self.start_network_analysis() - - def start_static_analysis(self): - """ - Start an analysis by reading in PCAP files - - This method: - 1. Locates all PCAP files in the directory specified by STATIC_FILES_DIR - 2. Creates a separate Zeek process for each PCAP file - 3. Runs these processes in parallel using threads - 4. Waits for all processes to complete before returning - - The Zeek processes use the configured analysis scripts to process the PCAP - files and output the results to the configured destinations (typically Kafka - via the Zeek Kafka plugin). 
- """ - self.static_files_dir = os.getenv("STATIC_FILES_DIR") - files = glob.glob(f"{self.static_files_dir}/*.pcap") - threads = [] - for file in files: - logger.info(f"Starting Analysis for file {file}...") - command = ["zeek", "-C", "-r", file, self.zeek_config_location] - thread = threading.Thread(target=subprocess.run, args=(command,)) - thread.start() - threads.append(thread) - - for thread in threads: - thread.join() - logger.info("Finished static analyses") - - def start_network_analysis(self): - """ - Start Zeek in live network analysis mode. - - This method: - 1. Deploys the Zeek configuration using zeekctl - 2. Starts monitoring Zeek's log output in real-time - 3. Streams the processed data to the configured output destinations - - The method creates a dedicated thread to monitor Zeek's log output to prevent - buffer overflow issues that would occur if the output was processed in the - main thread. This ensures continuous processing of network traffic without - data loss. - """ - start_zeek = ["zeekctl", "deploy"] - thread = threading.Thread(target=subprocess.run, args=(start_zeek,)) - thread.start() - thread.join() - - process = subprocess.Popen( - ["tail", "-f", "/dev/null"], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - ) - - def read_output(): # pragma: no cover - for line in iter(process.stdout.readline, ""): - if line: - print(f"[ZEEK LOG] {line}", end="") - process.stdout.close() - - logger.info("network analysis started") - # Start background thread to read stdout line by line - # necesseray because otherwise subprocess stdout will run into buffer errors eventually - reader_thread = threading.Thread(target=read_output, daemon=True) - reader_thread.start() - logger.info("network analysis ongoing") - reader_thread.join() - logger.info("network analysis stopped") diff --git a/src/zeek/zeek_config_handler.py b/src/zeek/zeek_config_handler.py deleted file mode 100644 index 1f0861a3..00000000 --- a/src/zeek/zeek_config_handler.py +++ /dev/null @@ -1,271 +0,0 @@ -import sys -import os -import shutil - -sys.path.append(os.getcwd()) -from src.base.log_config import get_logger -import glob - -logger = get_logger("zeek.sensor") - - -class ZeekConfigurationHandler: - """ - Handles the configuration of Zeek sensors based on the pipeline configuration. - - This class is responsible for setting up Zeek to process network traffic according - to the specified configuration. It configures the Zeek Kafka plugin, sets up worker - nodes for network interfaces, and integrates additional custom configurations. - - The handler supports both static analysis (processing PCAP files) and network - analysis (live traffic monitoring) modes, with configuration adapted to the - specific sensor requirements defined in the pipeline configuration. - - Example: - >>> config = { - ... "environment": { - ... "kafka_brokers": [{"hostname": "kafka1", "port": 9092, "node_ip": "192.168.1.100"}], - ... "kafka_topics_prefix": {"pipeline": {"logserver_in": "pipeline-logserver_in"}} - ... }, - ... "pipeline": { - ... "zeek": { - ... "sensors": { - ... "zeek1": { - ... "static_analysis": True, - ... "protocols": ["http", "dns"], - ... "interfaces": ["eth0"] - ... } - ... } - ... } - ... } - ... 
} - >>> os.environ["CONTAINER_NAME"] = "zeek1" - >>> handler = ZeekConfigurationHandler(config) - >>> handler.configure() - """ - - def __init__( - self, - configuration_dict: dict, - zeek_config_location: str = "/usr/local/zeek/share/zeek/site/local.zeek", - zeek_node_config_template: str = "/opt/src/zeek/base_node.cfg", - zeek_log_location: str = "/usr/local/zeek/log/zeek.log", - additional_configurations: str = "/opt/src/zeek/additional_configs/", - ): - """ - Initialize the Zeek configuration handler with the pipeline configuration. - - Args: - configuration_dict: The complete pipeline configuration dictionary - loaded from config.yaml, containing sensor, Kafka, and environment settings - zeek_config_location: Path to the main Zeek configuration file where - plugin configurations will be appended (default: standard Zeek location) - zeek_node_config_template: Path to the template for node.cfg configuration - (default: internal template in the project) - zeek_log_location: Path where Zeek will write its log files - (default: standard Zeek log location) - additional_configurations: Directory containing additional Zeek configuration - files that should be appended to the main configuration - """ - logger.info(f"Setting up Zeek configuration...") - self.base_config_location = zeek_config_location - self.additional_configurations = additional_configurations - self.zeek_node_config_template = zeek_node_config_template - self.zeek_node_config_path: str = "/usr/local/zeek/etc/node.cfg" - self.zeek_log_location = zeek_log_location - - self.container_name = os.getenv("CONTAINER_NAME", None) - if self.container_name is None: - logger.error( - "CONTAINER_NAME ENV variable could not be found. Aborting configuration..." - ) - raise Exception("CONTAINER_NAME env. variable not found.") - - configured_kafka_brokers = configuration_dict["environment"]["kafka_brokers"] - # configured_kafka_topic = configuration_dict["environment"]["kafka_topics"]["pipeline"]["zeek_to_logserver"] - zeek_sensor_configuration = configuration_dict["pipeline"]["zeek"]["sensors"][ - self.container_name - ] - - if ( - "static_analysis" in zeek_sensor_configuration.keys() - and zeek_sensor_configuration["static_analysis"] - ): - self.is_analysis_static = True - else: - self.is_analysis_static = False - try: - self.network_interfaces = zeek_sensor_configuration["interfaces"] - except Exception as e: - logger.error(e) - logger.error( - "Could not parse configuration for zeek sensor, as the 'interfaces' parameter is not specified" - ) - - self.kafka_topic_prefix = configuration_dict["environment"][ - "kafka_topics_prefix" - ]["pipeline"]["logserver_in"] - - self.configured_protocols = [ - protocol for protocol in zeek_sensor_configuration["protocols"] - ] - self.kafka_brokers = [ - f"{broker['node_ip']}:{broker['external_port']}" - for broker in configured_kafka_brokers - ] - logger.info(f"Succesfully parse config.yaml") - - def configure(self): - """ - Execute the complete Zeek configuration process. - - This method orchestrates the entire configuration workflow: - 1. For network analysis mode: Sets up node configuration for network interfaces - 2. Appends any additional custom configurations - 3. Creates and writes the Kafka plugin configuration - - The method adapts the configuration based on whether the sensor is in - static analysis mode (processing PCAP files) or network analysis mode - (monitoring live traffic). - - Note: - This is the main entry point for configuring Zeek. 
After calling this - method, Zeek should be fully configured and ready to process traffic - according to the pipeline specifications. - """ - logger.info(f"configuring Zeek...") - if not self.is_analysis_static: - self.template_and_copy_node_config() - self.append_additional_configurations() - self.create_plugin_configuration() - - def append_additional_configurations(self): - """ - Append custom configuration files to the main Zeek configuration. - - This method: - 1. Finds all *.zeek files in the additional configurations directory - 2. Appends their contents to the main Zeek configuration file - - Custom configuration files can be used to extend Zeek's functionality - with custom scripts, event handlers, or protocol analyzers without - modifying the core configuration. - - Example: - If additional_configurations="/opt/src/zeek/additional_configs/" - contains a file custom_http.zeek with content: - @load base/protocols/http/main.zeek - redef HTTP::default_accept_gzip = T; - - This content will be appended to the main Zeek configuration file. - - Note: - The method adds a newline before appending each file to ensure - proper separation between configuration sections. - """ - config_files = find_files_in_dir(self.additional_configurations) - with open(self.base_config_location, "a") as base_config: - base_config.write("\n") - for file in config_files: - with open(file) as additional_config: - base_config.writelines(additional_config) - - def create_plugin_configuration(self): - """ - Generate and write the Kafka plugin configuration for Zeek. - - This method: - 1. Creates the core Kafka plugin configuration - 2. Sets up topic mappings for each configured protocol - 3. Writes the complete configuration to the main Zeek configuration file - - The configuration directs Zeek to send processed log data to Kafka topics - following the naming convention: {kafka_topic_prefix}-{protocol} - - """ - config_lines = [ - "@load packages/zeek-kafka\n", - 'redef Kafka::topic_name = "";\n', - f"redef Kafka::kafka_conf = table(\n" - f' ["metadata.broker.list"] = "{",".join(self.kafka_brokers)}");\n', - "redef Kafka::tag_json = F;\n", - "event zeek_init() &priority=-10\n", - "{\n", - ] - for protocol in self.configured_protocols: - topic_name = f"{self.kafka_topic_prefix}-{protocol.lower()}" - zeek_protocol_log_format = f"Custom{protocol.upper()}" - kafka_writer_name = f"{protocol.lower()}_filter" - filter_block = f""" - local {kafka_writer_name}: Log::Filter = [ - $name = "kafka-{kafka_writer_name}", - $writer = Log::WRITER_KAFKAWRITER, - $path = "{topic_name}" - ]; - Log::add_filter({zeek_protocol_log_format}::LOG, {kafka_writer_name});\n - """ - config_lines.append(filter_block) - config_lines.append("\n}") - - with open(self.base_config_location, "a") as f: - f.writelines(config_lines) - logger.info("Wrote kafka zeek plugin configuration to file") - - def create_worker_configurations_for_interfaces(self): - """ - Generate configuration lines for Zeek worker nodes. - - This method creates the configuration blocks needed for Zeek's cluster mode, - where each network interface gets its own worker node. - - Returns: - List[str]: Configuration lines that should be appended to node.cfg - - Example: - For network_interfaces=["eth0", "dummy"], returns: - [ - "[zeek-eth0]\n", - "type=worker\n", - "host=localhost\n", - "[zeek-dummy]\n", - "type=worker\n", - "host=localhost\n" - ] - - Note: - This method is only called when in network analysis mode (not static analysis). 
- Each worker is configured to run on the local host and process traffic - from a specific network interface. - """ - worker_configuration_lines = [] - for network_interface in self.network_interfaces: - worker_configuration_lines.extend( - [f"[zeek-{network_interface}]\n", "type=worker\n", "host=localhost\n"] - ) - return worker_configuration_lines - - def template_and_copy_node_config(self): - """ - Set up the node configuration for Zeek cluster mode. - - This method: - 1. Copies the node configuration template to Zeek's expected location - 2. Appends worker configurations for each network interface - - The node configuration (node.cfg) defines how Zeek should distribute - processing across multiple worker processes, which is necessary for - monitoring multiple network interfaces simultaneously. - - Note: - This method is only called when in network analysis mode. Static - analysis mode does not require worker configuration as it processes - PCAP files sequentially. - """ - shutil.copy2(self.zeek_node_config_template, self.zeek_node_config_path) - configuration_lines = self.create_worker_configurations_for_interfaces() - with open(self.zeek_node_config_path, "a") as f: - f.writelines(configuration_lines) - - -def find_files_in_dir(path): # pragma: no cover - return glob.glob(os.path.join(path, "*.zeek")) diff --git a/src/zeek/zeek_handler.py b/src/zeek/zeek_handler.py deleted file mode 100644 index a11c7ce0..00000000 --- a/src/zeek/zeek_handler.py +++ /dev/null @@ -1,97 +0,0 @@ -import click -import sys -import yaml -import shutil -import os -from src.zeek.zeek_analysis_handler import ZeekAnalysisHandler -from src.zeek.zeek_config_handler import ZeekConfigurationHandler - -sys.path.append(os.getcwd()) -from src.base.log_config import get_logger - -logger = get_logger("zeek.sensor") - - -@click.command() -@click.option( - "-c", - "--config", - "configuration_file_path", - required=True, - type=click.File(mode="r"), - help="Path to the configuration file location", -) -@click.option( - "--zeek-config-location", - "zeek_config_location", - help=( - "Overrides the default configuration location of Zeek under /usr/local/zeek/share/zeek/site/local.zeek" - ), -) -def setup_zeek(configuration_file_path, zeek_config_location): - """ - Configure and start Zeek analysis based on pipeline configuration. - - This is the main entry point for the Zeek configuration and analysis process. - It handles the complete workflow from configuration setup to analysis execution. - - The function: - 1. Manages Zeek configuration backups to ensure clean setup between runs - 2. Parses the pipeline configuration file - 3. Configures Zeek using the specified or default configuration location - 4. Starts analysis in the appropriate mode (static or network) - - Args: - configuration_file_path: File object pointing to the pipeline configuration - YAML file that defines sensor settings, Kafka brokers, and other parameters - zeek_config_location: Optional path to override the default Zeek configuration - location. If not provided, uses /usr/local/zeek/share/zeek/site/local.zeek - - Workflow: - 1. On first run: Backs up the default Zeek configuration - 2. On subsequent runs: Restores the backed-up configuration to ensure a clean state - 3. Parses the YAML configuration file - 4. Configures Zeek using ZeekConfigurationHandler - 5. 
Starts analysis using ZeekAnalysisHandler in the mode specified by the config - - Raises: - yaml.YAMLError: If the configuration file is not valid YAML - Exception: If required environment variables (like CONTAINER_NAME) are missing - """ - default_zeek_config_location = "/usr/local/zeek/share/zeek/site/local.zeek" - default_zeek_config_backup_location = "/opt/local.zeek_backup" - initial_zeek_setup: bool = ( - False if os.path.isfile(default_zeek_config_backup_location) else True - ) - logger.info(f"initial setup: {initial_zeek_setup}") - if initial_zeek_setup: - logger.info("Backup default config") - shutil.copy2(default_zeek_config_location, default_zeek_config_backup_location) - else: - logger.info("Restore default config") - shutil.copy2(default_zeek_config_backup_location, default_zeek_config_location) - - configuration_file_content = configuration_file_path.read() - try: - data = yaml.safe_load(configuration_file_content) - except yaml.YAMLError as e: - logger.error("Error parsing the config file. Is this proper yaml?") - raise (e) - - if zeek_config_location is None: - zeek_config_location = default_zeek_config_location - zeekConfigHandler = ZeekConfigurationHandler(data, default_zeek_config_location) - else: - zeekConfigHandler = ZeekConfigurationHandler(data, zeek_config_location) - - zeekConfigHandler.configure() - logger.info("configured zeek") - zeekAnalysisHandler = ZeekAnalysisHandler( - zeek_config_location, zeekConfigHandler.zeek_log_location - ) - logger.info("starting analysis...") - zeekAnalysisHandler.start_analysis(zeekConfigHandler.is_analysis_static) - - -if __name__ == "__main__": # pragma: no cover - setup_zeek() diff --git a/start-pipeline.sh b/start-pipeline.sh deleted file mode 100755 index a9642143..00000000 --- a/start-pipeline.sh +++ /dev/null @@ -1,94 +0,0 @@ -#!/bin/bash -# Start the complete HAMSTRING pipeline with Zeek network capture - -set -e - -CONFIG_FILE="${1:-config-test.yaml}" -CPP_BUILD_DIR="cpp/build/src" - -echo "🚀 Starting HAMSTRING Pipeline with Zeek Integration" -echo "==================================================" -echo "" - -# Check if Zeek is installed -if ! command -v zeek &> /dev/null; then - echo "❌ Zeek is not installed. Please install Zeek first." - exit 1 -fi - -# Check if config file exists -if [ ! -f "$CONFIG_FILE" ]; then - echo "❌ Config file not found: $CONFIG_FILE" - exit 1 -fi - -echo "📋 Configuration: $CONFIG_FILE" -echo "" - -# Function to start a module in background -start_module() { - local name=$1 - local cmd=$2 - echo "▶️ Starting $name..." - $cmd & - echo " PID: $!" 
-} - -# Start Zeek for network capture -echo "🔍 Starting Zeek Network Capture" -echo "================================" -start_module "Zeek" "python -m src.zeek.zeek_handler -c $CONFIG_FILE" -sleep 2 - -echo "" -echo "🔧 Starting C++ Pipeline Modules" -echo "================================" - -# Start LogServer -if [ -x "$CPP_BUILD_DIR/logserver/logserver" ]; then - start_module "LogServer" "$CPP_BUILD_DIR/logserver/logserver $CONFIG_FILE" -else - echo "⚠️ LogServer binary not found, skipping" -fi -sleep 1 - -# Start LogCollector -if [ -x "$CPP_BUILD_DIR/logcollector/logcollector" ]; then - start_module "LogCollector" "$CPP_BUILD_DIR/logcollector/logcollector $CONFIG_FILE" -else - echo "⚠️ LogCollector binary not found, skipping" -fi -sleep 1 - -# Start Prefilter -if [ -x "$CPP_BUILD_DIR/prefilter/prefilter" ]; then - start_module "Prefilter" "$CPP_BUILD_DIR/prefilter/prefilter $CONFIG_FILE" -else - echo "⚠️ Prefilter binary not found, skipping" -fi -sleep 1 - -# Start Inspector (with ML anomaly detection) -if [ -x "$CPP_BUILD_DIR/inspector/inspector" ]; then - start_module "Inspector (ML)" "$CPP_BUILD_DIR/inspector/inspector $CONFIG_FILE" -else - echo "⚠️ Inspector binary not found, skipping" -fi - -echo "" -echo "✅ All modules started!" -echo "" -echo "📊 Monitor pipeline:" -echo " - Zeek input: docker exec kafka1 kafka-console-consumer --bootstrap-server localhost:19092 --topic pipeline-logserver_in-dns" -echo " - LogCollector: docker exec kafka1 kafka-console-consumer --bootstrap-server localhost:19092 --topic pipeline-logserver_to_collector-dns" -echo " - Inspector out: docker exec kafka1 kafka-console-consumer --bootstrap-server localhost:19092 --topic pipeline-prefilter_to_inspector-dga_inspector" -echo "" -echo "🛑 Stop all: pkill -f 'zeek|logserver|logcollector|prefilter|inspector'" -echo "" -echo "Press Ctrl+C to stop all modules" - -# Wait for interrupt -trap 'echo ""; echo "🛑 Stopping all modules..."; pkill -f "zeek|logserver|logcollector|prefilter|inspector"; exit 0' INT - -# Keep script running -wait diff --git a/test-pipeline.sh b/test-pipeline.sh deleted file mode 100755 index 31691a33..00000000 --- a/test-pipeline.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash - -# Test script for HAMSTRING C++ Pipeline -# This script sends test DNS messages to Kafka and verifies the pipeline - -echo "=== HAMSTRING Pipeline Test ===" -echo "" - -# 1. Send a test DNS message to the input topic -echo "1. Sending test DNS message to pipeline-logserver_in-dns..." - -docker exec kafka1 kafka-console-producer \ - --bootstrap-server localhost:19092 \ - --topic pipeline-logserver_in-dns << EOF -example.com NOERROR 192.168.1.100 -test.example.org NOERROR 10.0.0.1 -malicious.domain.xyz NXDOMAIN 8.8.8.8 -EOF - -echo " ✓ Sent 3 test messages" -echo "" - -# 2. Wait a moment for processing -echo "2. Waiting 2 seconds for processing..." -sleep 2 -echo "" - -# 3. Check if collector topic has messages -echo "3. Checking collector topic for forwarded messages..." 
-docker exec kafka1 kafka-console-consumer \
-    --bootstrap-server localhost:19092 \
-    --topic pipeline-logserver_to_collector-dga_collector \
-    --from-beginning \
-    --max-messages 3 \
-    --timeout-ms 5000 2>/dev/null || echo "   (No messages yet - might need to start LogServer)"
-
-echo ""
-echo "=== Test Complete ==="
diff --git a/tests/detector/test_domainator_detector.py b/tests/detector/test_domainator_detector.py
new file mode 100644
index 00000000..ae25f5ae
--- /dev/null
+++ b/tests/detector/test_domainator_detector.py
@@ -0,0 +1,192 @@
+import math
+import numpy as np
+import unittest
+from unittest.mock import MagicMock, patch, call
+
+import os
+import sys
+sys.path.append(os.getcwd())
+
+from src.detector.plugins.domainator_detector import DomainatorDetector
+from src.base.data_classes.batch import Batch
+
+
+DEFAULT_DATA = {
+    "src_ip": "192.168.0.167",
+    "dns_ip": "10.10.0.10",
+    "response_ip": "252.79.173.222",
+    "ts": "",
+    "status": "NXDOMAIN",
+    "domain_name": "IF356gEnJHPdRxnkDId4RDUSgtqxx9I+pZ5n1V53MdghOGQncZWAQgAPRx3kswi.750jnH6iSqmiAAeyDUMX0W6SHGpVsVsKSX8ZkKYDs0GFh/9qU5N9cwl00XSD8ID.NNhBdHZIb7nc0hDQXFPlABDLbRwkJS38LZ8RMX4yUmR2Mb6YqTTJBn+nUcB9P+v.jBQdwdS53XV9W2p1BHjh.16.f.1.6037.tunnel.example.org",
+    "record_type": "A",
+    "size": "100b",
+}
+
+
+class TestDomainatorDetector(unittest.TestCase):
+    def setUp(self):
+        patcher = patch("src.detector.plugins.domainator_detector.logger")
+        self.mock_logger = patcher.start()
+        self.addCleanup(patcher.stop)
+
+    def _create_detector(self, mock_kafka_handler=None, mock_clickhouse=None):
+        """Helper method to create a DomainatorDetector instance with proper mocks."""
+        if mock_kafka_handler is None:
+            mock_kafka_handler = MagicMock()
+        if mock_clickhouse is None:
+            mock_clickhouse = MagicMock()
+
+        detector_config = {
+            "name": "domainator_detector",
+            "detector_module_name": "domainator_detector",
+            "detector_class_name": "DomainatorDetector",
+            "model": "rf",
+            "checksum": "9d86d66b4976c9b325bed0934a9a9eb3a20960b08be9afe491454624cc0aaa6c",
+            "base_url": "https://ajknqwjdnkjnkjnsakjdnkjsandkndkjwndjksnkakndw.de/d/0d5cbcbe16cd46a58021",
+            "threshold": 0.005,
+        }
+
+        with patch(
+            "src.detector.detector.ExactlyOnceKafkaConsumeHandler",
+            return_value=mock_kafka_handler,
+        ), patch(
+            "src.detector.detector.ClickHouseKafkaSender", return_value=mock_clickhouse
+        ), patch.object(
+            DomainatorDetector, "_get_model", return_value=(MagicMock(), MagicMock())
+        ):
+            detector = DomainatorDetector(
+                detector_config, "test_topic", ["test_produce_topic"]
+            )
+        detector.model = MagicMock()
+        detector.scaler = MagicMock()
+        return detector
+
+    def test_get_model_download_url(self):
+        """Test that the model download URL is correctly formatted."""
+        mock_kafka = MagicMock()
+        mock_ch = MagicMock()
+        detector = self._create_detector(mock_kafka, mock_ch)
+        # overwrite model here again to not interfere with other tests when using it globally
+        detector.model = "rf"
+        self.maxDiff = None
+        expected_url = "https://ajknqwjdnkjnkjnsakjdnkjsandkndkjwndjksnkakndw.de/d/0d5cbcbe16cd46a58021/files/?p=%2Frf%2F9d86d66b4976c9b325bed0934a9a9eb3a20960b08be9afe491454624cc0aaa6c%2Frf.pickle&dl=1"
+        self.assertEqual(detector.get_model_download_url(), expected_url)
+
+    def test_detect(self):
+        mock_kafka = MagicMock()
+        mock_ch = MagicMock()
+        sut = self._create_detector(mock_kafka, mock_ch)
+        for _ in range(4):
+            sut.messages.append(DEFAULT_DATA)
+        with patch(
+            "src.detector.plugins.domainator_detector.DomainatorDetector.predict",
+            return_value=[[0.01, 0.99]],
+        ):
+            sut.detect()
+        self.assertNotEqual([], sut.warnings)
+
+    def test_predict_calls_model(self):
+        """Test that predict method correctly uses the model with features."""
+        mock_kafka = MagicMock()
+        mock_ch = MagicMock()
+        detector = self._create_detector(mock_kafka, mock_ch)
+
+        # Mock model prediction
+        mock_prediction = np.array([[0.2, 0.8]])
+        detector.model.predict_proba.return_value = mock_prediction
+
+        # Test prediction
+        message = [{"domain_name": "google.com"}, {"domain_name": "google.com"}]
+        result = detector.predict(message)
+
+        # Verify model was called once
+        detector.model.predict_proba.assert_called_once()
+
+        # Verify the argument was correct
+        called_features = detector.model.predict_proba.call_args[0][0]
+        expected_features = detector._get_features(["google.com", "google.com"])
+        np.testing.assert_array_equal(called_features, expected_features)
+
+        # Verify prediction result
+        np.testing.assert_array_equal(result, mock_prediction)
+
+    def test_get_features_basic_attributes(self):
+        """Test basic similarity feature calculation."""
+        mock_kafka = MagicMock()
+        mock_ch = MagicMock()
+        detector = self._create_detector(mock_kafka, mock_ch)
+
+        # Test with various 'google.com' subdomains
+        features = detector._get_features(["sub1.google.com", "sub2.google.com", "sub3.google.com"])
+
+        # Selected similarity features: Levenshtein ratio, Jaro similarity,
+        # normalized longest common substring
+        leven_ratio = features[0][0]
+        jaro_sim = features[0][1]
+        lcs = features[0][6]
+
+        self.assertEqual(leven_ratio, 0.75)
+        self.assertAlmostEqual(jaro_sim, 0.833, 3)  # rounded to 3 decimal places
+        self.assertEqual(lcs, 0.75)
+
+    def test_get_features_empty_domains(self):
+        """Test handling of empty domain strings."""
+        mock_kafka = MagicMock()
+        mock_ch = MagicMock()
+        detector = self._create_detector(mock_kafka, mock_ch)
+
+        features = detector._get_features(["", "", "", ""])
+
+        self.assertEqual(features[0][0], 1.)  # Levenshtein ratio of empty strings is 1
+        self.assertEqual(features[0][1], 1.)  # Jaro similarity of empty strings is 1
+        self.assertEqual(features[0][2], 1.)  # Jaro similarity of the reversed empty strings is 1
+        self.assertEqual(features[0][3], 1.)  # Jaro-Winkler similarity of empty strings is 1
+        self.assertEqual(features[0][4], 1.)  # Jaro-Winkler similarity of the reversed empty strings is 1
+        self.assertEqual(features[0][5], 0.)  # normalized longest common subsequence of empty strings is 0
+        self.assertEqual(features[0][6], 0.)  # normalized longest common substring of empty strings is 0
+
+    def test_get_features_single_same_character(self):
+        """Test handling of single-label queries, whose stripped subdomains are empty."""
+        mock_kafka = MagicMock()
+        mock_ch = MagicMock()
+        detector = self._create_detector(mock_kafka, mock_ch)
+
+        features = detector._get_features(["a", "a", "a"])
+
+        self.assertEqual(features[0][0], 1.)  # Levenshtein ratio of identical strings is 1
+        self.assertEqual(features[0][1], 1.)  # Jaro similarity of identical strings is 1
+        self.assertEqual(features[0][2], 1.)  # Jaro similarity of the reversed identical strings is 1
+        self.assertEqual(features[0][3], 1.)  # Jaro-Winkler similarity of identical strings is 1
+        self.assertEqual(features[0][4], 1.)  # Jaro-Winkler similarity of the reversed identical strings is 1
+        self.assertEqual(features[0][5], 0.)  # LCS features are 0: "a" has no subdomain left after stripping
+        self.assertEqual(features[0][6], 0.)  # same for the longest common substring
+
+    def test_get_features_feature_vector_shape(self):
+        """Test that the feature vector has the expected shape."""
+        mock_kafka = MagicMock()
+        mock_ch = MagicMock()
+        detector = self._create_detector(mock_kafka, mock_ch)
+
+        features = detector._get_features(["test.domain.com", "test.domain.com", "test.domain.com"])
+
+        expected_feature_count = 7  # seven pairwise similarity metrics
+
+        self.assertEqual(features.shape, (1, expected_feature_count))
+
+    def test_get_features_case_insensitivity(self):
+        """Test that consistently upper- and lower-cased inputs yield identical features."""
+        mock_kafka = MagicMock()
+        mock_ch = MagicMock()
+        detector = self._create_detector(mock_kafka, mock_ch)
+
+        features_upper = detector._get_features(["DRIVE.GOOGLE.COM", "WORKSPACE.GOOGLE.COM"])
+        features_lower = detector._get_features(["drive.google.com", "workspace.google.com"])
+
+        # The comparison features should be identical regardless of case
+        np.testing.assert_array_almost_equal(
+            features_upper[0],
+            features_lower[0],
+            decimal=5,
+        )
\ No newline at end of file
diff --git a/tests/zeek/__init__.py b/tests/zeek/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tests/zeek/test_zeek_analysis_handler.py b/tests/zeek/test_zeek_analysis_handler.py
deleted file mode 100644
index 562fa74e..00000000
--- a/tests/zeek/test_zeek_analysis_handler.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import unittest
-from unittest.mock import patch, MagicMock
-from src.zeek.zeek_analysis_handler import ZeekAnalysisHandler
-import os
-
-
-class TestZeekAnalysisHandler(unittest.TestCase):
-    def setUp(self):
-        os.environ["STATIC_FILES_DIR"] = "/tmp"
-        self.handler = ZeekAnalysisHandler("/mock/config.zeek", "/mock/logs")
-
-    def tearDown(self):
-        del os.environ["STATIC_FILES_DIR"]
-
-    @patch("src.zeek.zeek_analysis_handler.glob.glob")
-    @patch("src.zeek.zeek_analysis_handler.threading.Thread")
-    def test_start_static_analysis(self, mock_thread_class, mock_glob):
-        mock_glob.return_value = ["/tmp/test.pcap"]
-        mock_thread_instance = MagicMock()
-        mock_thread_instance.start = MagicMock()
-        mock_thread_class.return_value = mock_thread_instance
-
-        # Act
-        self.handler.start_static_analysis()
-        call, args = mock_thread_class.call_args_list[0]
-        args_list = args["args"]
-        # Assert
-        self.assertIn(
-            ["zeek", "-C", "-r", "/tmp/test.pcap", "/mock/config.zeek"], args_list
-        )
-        mock_thread_class.assert_called_once()
-        mock_thread_instance.start.assert_called()
-        mock_thread_instance.join.assert_called()
-
-    @patch("src.zeek.zeek_analysis_handler.threading.Thread")
-    def test_start_analysis_network_mode(self, mock_thread):
-        # Act
-        self.handler.start_analysis(static_analysis=False)
-        self.handler.start_static_analysis = MagicMock()
-
-        # Assert
-        self.handler.start_static_analysis.assert_not_called()
-
-    @patch("src.zeek.zeek_analysis_handler.threading.Thread")
-    def test_start_analysis_static_mode(self, mock_thread):
-        # Act
-        self.handler.start_analysis(static_analysis=True)
-        self.handler.start_network_analysis = MagicMock()
-
-        # Assert
-        self.handler.start_network_analysis.assert_not_called()
diff --git a/tests/zeek/test_zeek_config_handler.py b/tests/zeek/test_zeek_config_handler.py
deleted file mode 100644
index aed72711..00000000
--- a/tests/zeek/test_zeek_config_handler.py
+++ /dev/null
@@ -1,226 +0,0 @@
-import unittest
-from unittest.mock import patch, MagicMock, mock_open, call
-import tempfile
-import os
-from src.zeek.zeek_config_handler import ZeekConfigurationHandler
-
-
-class
-    def setUp(self):
-        os.environ["CONTAINER_NAME"] = "ZEEK_TEST_CONTAINER"
-        self.mock_config = {
-            "pipeline": {
-                "zeek": {
-                    "sensors": {
-                        "ZEEK_TEST_CONTAINER": {
-                            "static_analysis": True,
-                            "protocols": ["http", "dns"],
-                            "interfaces": ["enx84ba5960ffe6"],
-                        }
-                    }
-                }
-            },
-            "environment": {
-                "kafka_brokers": [
-                    {
-                        "hostname": "kafka1",
-                        "external_port": 8097,
-                        "node_ip": "192.168.175.69",
-                    },
-                    {
-                        "hostname": "kafka2",
-                        "external_port": 8098,
-                        "node_ip": "192.168.175.69",
-                    },
-                    {
-                        "hostname": "kafka3",
-                        "external_port": 8099,
-                        "node_ip": "192.168.175.69",
-                    },
-                ],
-                "kafka_topics_prefix": {
-                    "pipeline": {
-                        "logserver_in": "pipeline-logserver_in",
-                        "logserver_to_collector": "pipeline-logserver_to_collector",
-                        "batch_sender_to_prefilter": "pipeline-batch_sender_to_prefilter",
-                        "prefilter_to_inspector": "pipeline-prefilter_to_inspector",
-                        "inspector_to_detector": "pipeline-inspector_to_detector",
-                    }
-                },
-            },
-        }
-
-    def tearDown(self):
-        del os.environ["CONTAINER_NAME"]
-
-    def test_default_initialization_static_analysis(self):
-        handler = ZeekConfigurationHandler(self.mock_config)
-        # Assert
-        self.assertEqual(
-            handler.base_config_location, "/usr/local/zeek/share/zeek/site/local.zeek"
-        )
-        self.assertEqual(handler.zeek_log_location, "/usr/local/zeek/log/zeek.log")
-        self.assertEqual(handler.is_analysis_static, True)
-
-    def test_default_initialization_network_analysis(self):
-        self.mock_config["pipeline"]["zeek"]["sensors"]["ZEEK_TEST_CONTAINER"][
-            "static_analysis"
-        ] = False
-        handler = ZeekConfigurationHandler(self.mock_config)
-        # Assert
-        self.assertEqual(
-            handler.base_config_location, "/usr/local/zeek/share/zeek/site/local.zeek"
-        )
-        self.assertEqual(handler.zeek_log_location, "/usr/local/zeek/log/zeek.log")
-        self.assertEqual(handler.is_analysis_static, False)
-        self.assertEqual(handler.network_interfaces, ["enx84ba5960ffe6"])
-
-    @patch("builtins.open", new_callable=unittest.mock.mock_open)
-    @patch("os.path.exists", return_value=True)
-    def test_configure_default_mode(self, mock_exists, mock_open):
-        handler = ZeekConfigurationHandler(self.mock_config)
-        # Arrange
-        pipline_in_topic_prefix = self.mock_config["environment"][
-            "kafka_topics_prefix"
-        ]["pipeline"]["logserver_in"]
-        handler.zeek_node_config_path = "/tmp/node.cfg"
-        handler.potocol_to_topic_configurations = {
-            "http": "http-topic",
-            "dns": "dns-topic",
-        }
-        handler.kafka_brokers = ["localhost:9092"]
-
-        # Act
-        handler.configure()
-
-        # Assert
-        mock_open.assert_any_call("/usr/local/zeek/share/zeek/site/local.zeek", "a")
-        handle = mock_open()
-
-        args, kwargs = handle.writelines.call_args_list[0]
-        written_lines = "".join(args[0])
-
-        expected_lines = [
-            "@load packages/zeek-kafka",
-            "Log::add_filter(CustomHTTP::LOG, http_filter)",
-            "Log::add_filter(CustomDNS::LOG, dns_filter)",
-            '["metadata.broker.list"] = "localhost:9092"',
-            f"{pipline_in_topic_prefix}-dns",
-            f"{pipline_in_topic_prefix}-http",
-        ]
-        for expected_line in expected_lines:
-            self.assertIn(expected_line, written_lines)
-
-    @patch("src.zeek.zeek_config_handler.glob.glob")
-    def test_append_additional_configurations(self, mock_glob):
-        # Arrange
-        mock_glob.return_value = ["/opt/src/zeek/additional_configs/custom.zeek"]
-
-        # Create a mock that returns different content for different files
-        m = mock_open(read_data="@load custom-script\n")
-
-        with patch("builtins.open", m, create=True):
-            handler = ZeekConfigurationHandler(self.mock_config)
-            handler.base_config_location = "/mock/zeek.cfg"
-
-            # Act
-            handler.append_additional_configurations()
-
-        # Assert
-        mock_glob.assert_called_once_with("/opt/src/zeek/additional_configs/*.zeek")
-
-        # Check the content that was written to the base config
-        handle = m()
-
-        args, kwargs = handle.writelines.call_args_list[0]
-        written_lines = "".join(args[0])
-
-        self.assertIn("@load custom-script", written_lines)
-        self.assertEqual(handle.write.call_count, 1)
-        self.assertEqual(handle.writelines.call_count, 1)
-
-    def test_create_plugin_configuration(self):
-        # Arrange
-        mock_config = {
-            "environment": {
-                "kafka_brokers": [
-                    {
-                        "hostname": "kafka1",
-                        "external_port": 8097,
-                        "node_ip": "192.168.175.69",
-                    },
-                    {
-                        "hostname": "kafka2",
-                        "external_port": 8098,
-                        "node_ip": "192.168.175.70",
-                    },
-                ],
-                "kafka_topics_prefix": {
-                    "pipeline": {"logserver_in": "pipeline-logserver_in"}
-                },
-            },
-            "pipeline": {
-                "zeek": {"sensors": {"test_container": {"protocols": ["http", "dns"]}}}
-            },
-        }
-        os.environ["CONTAINER_NAME"] = "test_container"
-
-        handler = ZeekConfigurationHandler(mock_config)
-        handler.base_config_location = "/mock/zeek.cfg"
-
-        # Create a mock for the file
-        m = mock_open()
-
-        # Act
-        with patch("builtins.open", m):
-            handler.create_plugin_configuration()
-
-        handle = m()
-        # Assert
-        m.assert_any_call("/mock/zeek.cfg", "a")
-
-        args, kwargs = handle.writelines.call_args_list[
-            0
-        ]  # args is a tuple of positional args
-        written_lines = "".join(
-            args[0]
-        )  # the iterable passed to writelines concatenated to a single string
-
-        expected_lines = [
-            "@load packages/zeek-kafka",
-            'redef Kafka::topic_name = ""',
-            "192.168.175.69:8097,192.168.175.70:8098",
-            "pipeline-logserver_in-http",
-            "CustomDNS::LOG",
-            "CustomHTTP::LOG",
-            "pipeline-logserver_in-dns",
-        ]
-        for expected_line in expected_lines:
-            self.assertIn(expected_line, written_lines)
-
-    @patch("builtins.open", new_callable=mock_open)
-    @patch("src.zeek.zeek_config_handler.shutil.copy2")
-    def test_template_and_copy_node_config(self, mock_shutil_copy, mock_open_file):
-        # Arrange
-        handler = ZeekConfigurationHandler(self.mock_config)
-        handler.is_analysis_static = False
-        handler.network_interfaces = ["eth0", "dummy"]
-
-        # Act
-        handler.template_and_copy_node_config()
-
-        # Assert
-        mock_shutil_copy.assert_called_once_with(
-            "/opt/src/zeek/base_node.cfg", "/usr/local/zeek/etc/node.cfg"
-        )
-
-        expected_worker_config = [
-            "[zeek-eth0]\n",
-            "type=worker\n",
-            "host=localhost\n",
-            "[zeek-dummy]\n",
-            "type=worker\n",
-            "host=localhost\n",
-        ]
-        mock_open_file.assert_called_once_with("/usr/local/zeek/etc/node.cfg", "a")
-        mock_open_file().writelines.assert_called_once_with(expected_worker_config)
diff --git a/tests/zeek/test_zeek_handler.py b/tests/zeek/test_zeek_handler.py
deleted file mode 100644
index 399dd679..00000000
--- a/tests/zeek/test_zeek_handler.py
+++ /dev/null
@@ -1,278 +0,0 @@
-import unittest
-from unittest.mock import patch, MagicMock, call
-import os
-import tempfile
-from src.zeek.zeek_handler import setup_zeek
-from click.testing import CliRunner
-import yaml
-import shutil
-
-
-class TestZeekHandler(unittest.TestCase):
-    def setUp(self):
-        # Create a temporary directory for test files
-        self.temp_dir = tempfile.mkdtemp()
-        self.default_zeek_config = os.path.join(self.temp_dir, "local.zeek")
-        self.default_backup = os.path.join(self.temp_dir, "local.zeek_backup")
-
-        # Create a dummy default config file
-        with open(self.default_zeek_config, "w") as f:
-            f.write("# Default Zeek config\n")
-
-    def tearDown(self):
-        # Clean up temporary directory
-        shutil.rmtree(self.temp_dir)
-
-    @patch("src.zeek.zeek_handler.ZeekConfigurationHandler")
-    @patch("src.zeek.zeek_handler.ZeekAnalysisHandler")
-    @patch(
-        "builtins.open", new_callable=unittest.mock.mock_open, read_data="config_data"
-    )
-    @patch("shutil.copy2")
-    def test_setup_zeek_success(
-        self, mock_copy, mock_open, mock_analysis_handler, mock_config_handler_cls
-    ):
-        # Arrange
-        mock_config_handler_obj = MagicMock()
-        mock_config_handler_obj.zeek_log_location = "/mock/location.log"
-        mock_config_handler_obj.is_analysis_static = False
-        mock_config_handler_cls.return_value = mock_config_handler_obj
-
-        mock_analysis = MagicMock()
-        mock_analysis_handler.return_value = mock_analysis
-
-        runner = CliRunner()
-        # Act
-        result = runner.invoke(
-            setup_zeek,
-            ["-c", "/mock/config.yaml", "--zeek-config-location", "/mock/zeek.cfg"],
-        )
-
-        # Assert
-        mock_config_handler_cls.assert_called_once_with("config_data", "/mock/zeek.cfg")
-
-        mock_config_handler_cls.return_value.configure.assert_called_once()
-        mock_analysis_handler.assert_called_once_with(
-            "/mock/zeek.cfg", mock_config_handler_obj.zeek_log_location
-        )
-        mock_analysis.start_analysis.assert_called_once_with(
-            mock_config_handler_obj.is_analysis_static
-        )
-
-    @patch("src.zeek.zeek_handler.ZeekConfigurationHandler")
-    @patch("src.zeek.zeek_handler.ZeekAnalysisHandler")
-    @patch(
-        "builtins.open", new_callable=unittest.mock.mock_open, read_data="config_data"
-    )
-    @patch("shutil.copy2")
-    @patch("yaml.safe_load")
-    def test_setup_zeek_with_error(
-        self,
-        mock_yaml_safe_load,
-        mock_copy,
-        mock_open,
-        mock_analysis_handler,
-        mock_config_handler,
-    ):
-        # Arrange
-        runner = CliRunner()
-        mock_yaml_safe_load.side_effect = yaml.YAMLError
-        # Act & Assert
-        result = runner.invoke(
-            setup_zeek,
-            [
-                "-c",
-                "/invalid/config.yaml",
-                "--zeek-config-location",
-                "/invalid/zeek.cfg",
-            ],
-        )
-        self.assertIsInstance(result.exception, yaml.YAMLError)
-        mock_config_handler.assert_not_called()
-        mock_analysis_handler.assert_not_called()
-
-    @patch("src.zeek.zeek_handler.ZeekConfigurationHandler")
-    @patch("src.zeek.zeek_handler.ZeekAnalysisHandler")
-    @patch(
-        "builtins.open", new_callable=unittest.mock.mock_open, read_data="config_data"
-    )
-    @patch("shutil.copy2")
-    def test_setup_zeek_static_analysis(
-        self, mock_copy, mock_open, mock_analysis_handler, mock_config_handler_cls
-    ):
-        # Arrange
-        os.environ["STATIC_ANALYSIS"] = "true"
-        mock_analysis = MagicMock()
-        mock_analysis_handler.return_value = mock_analysis
-
-        mock_config_handler_obj = MagicMock()
-        mock_config_handler_obj.is_analysis_static = True
-        mock_config_handler_cls.return_value = mock_config_handler_obj
-        runner = CliRunner()
-        # Act
-        result = runner.invoke(
-            setup_zeek,
-            ["-c", "/mock/config.yaml", "--zeek-config-location", "/mock/zeek.cfg"],
-        )
-        # Assert
-        mock_analysis.start_analysis.assert_called_once()
-        mock_analysis.start_analysis.assert_called_once_with(
-            mock_config_handler_obj.is_analysis_static
-        )
-        del os.environ["STATIC_ANALYSIS"]
-
-    @patch("src.zeek.zeek_handler.ZeekConfigurationHandler")
-    @patch("src.zeek.zeek_handler.ZeekAnalysisHandler")
-    @patch(
-        "builtins.open", new_callable=unittest.mock.mock_open, read_data="config_data"
-    )
-    @patch("shutil.copy2")
-    def test_default_config_location_used(
-        self, mock_copy, mock_open, mock_analysis_handler, mock_config_handler_cls
-    ):
-        """
-        Test that the default config location is used when no custom location is provided.
-        """
-        # Arrange
-        mock_config_handler_obj = MagicMock()
-        mock_config_handler_obj.zeek_log_location = "/mock/location.log"
-        mock_config_handler_obj.is_analysis_static = False
-        mock_config_handler_cls.return_value = mock_config_handler_obj
-
-        runner = CliRunner()
-
-        # Act
-        result = runner.invoke(setup_zeek, ["-c", "/mock/config.yaml"])
-
-        # Assert
-        # Verify that default location was used
-        mock_config_handler_cls.assert_called_once_with(
-            "config_data", "/usr/local/zeek/share/zeek/site/local.zeek"
-        )
-
-        # Verify the analysis handler was initialized with the default location
-        mock_analysis_handler.assert_called_once_with(
-            "/usr/local/zeek/share/zeek/site/local.zeek",
-            mock_config_handler_obj.zeek_log_location,
-        )
-
-    @patch("src.zeek.zeek_handler.ZeekConfigurationHandler")
-    @patch("src.zeek.zeek_handler.ZeekAnalysisHandler")
-    @patch(
-        "builtins.open", new_callable=unittest.mock.mock_open, read_data="config_data"
-    )
-    @patch("shutil.copy2")
-    def test_custom_config_location_used(
-        self, mock_copy, mock_open, mock_analysis_handler, mock_config_handler_cls
-    ):
-        """
-        Test that a custom config location is used when provided.
-        """
-        # Arrange
-        mock_config_handler_obj = MagicMock()
-        mock_config_handler_obj.zeek_log_location = "/mock/location.log"
-        mock_config_handler_obj.is_analysis_static = False
-        mock_config_handler_cls.return_value = mock_config_handler_obj
-
-        runner = CliRunner()
-
-        # Act
-        result = runner.invoke(
-            setup_zeek,
-            ["-c", "/mock/config.yaml", "--zeek-config-location", "/custom/zeek.cfg"],
-        )
-
-        # Assert
-        # Verify that custom location was used
-        mock_config_handler_cls.assert_called_once_with(
-            "config_data", "/custom/zeek.cfg"
-        )
-
-        # Verify the analysis handler was initialized with the custom location
-        mock_analysis_handler.assert_called_once_with(
-            "/custom/zeek.cfg", mock_config_handler_obj.zeek_log_location
-        )
-
-    @patch("src.zeek.zeek_handler.ZeekConfigurationHandler")
-    @patch("src.zeek.zeek_handler.ZeekAnalysisHandler")
-    @patch(
-        "builtins.open", new_callable=unittest.mock.mock_open, read_data="config_data"
-    )
-    @patch("os.path.isfile")
-    @patch("shutil.copy2")
-    def test_non_initial_setup_restore_backup(
-        self,
-        mock_copy,
-        mock_isfile,
-        mock_open,
-        mock_analysis_handler,
-        mock_config_handler_cls,
-    ):
-        """
-        Test that the backup config is restored when it's not the initial setup.
-        """
-        # Arrange
-        runner = CliRunner()
-
-        # Mock os.path.isfile to return True, indicating backup exists (non-initial setup)
-        mock_isfile.return_value = True
-
-        # Mock environment
-        mock_config_handler_obj = MagicMock()
-        mock_config_handler_obj.zeek_log_location = "/mock/location.log"
-        mock_config_handler_obj.is_analysis_static = False
-        mock_config_handler_cls.return_value = mock_config_handler_obj
-
-        # Act
-        result = runner.invoke(setup_zeek, ["-c", "/mock/config.yaml"])
-
-        # Assert
-        # Verify that the backup was restored
-        mock_copy.assert_any_call(
-            "/opt/local.zeek_backup", "/usr/local/zeek/share/zeek/site/local.zeek"
-        )
-
-        # Verify configuration proceeded after restore
-        mock_config_handler_cls.return_value.configure.assert_called_once()
-
-    @patch("src.zeek.zeek_handler.ZeekConfigurationHandler")
-    @patch("src.zeek.zeek_handler.ZeekAnalysisHandler")
-    @patch(
-        "builtins.open", new_callable=unittest.mock.mock_open, read_data="config_data"
-    )
-    @patch("os.path.isfile")
-    @patch("shutil.copy2")
-    def test_initial_setup_backup_default(
-        self,
-        mock_copy,
-        mock_isfile,
-        mock_open,
-        mock_analysis_handler,
-        mock_config_handler_cls,
-    ):
-        """
-        Test that the default config is backed up when it's the initial setup.
-        """
-        # Arrange
-        runner = CliRunner()
-
-        # Mock os.path.isfile to return False, indicating no backup exists (initial setup)
-        mock_isfile.return_value = False
-
-        # Mock environment
-        mock_config_handler_obj = MagicMock()
-        mock_config_handler_obj.zeek_log_location = "/mock/location.log"
-        mock_config_handler_obj.is_analysis_static = False
-        mock_config_handler_cls.return_value = mock_config_handler_obj
-
-        # Act
-        result = runner.invoke(setup_zeek, ["-c", "/mock/config.yaml"])
-
-        # Assert
-        # Verify that the default config was backed up
-        mock_copy.assert_any_call(
-            "/usr/local/zeek/share/zeek/site/local.zeek", "/opt/local.zeek_backup"
-        )
-
-        # Verify configuration proceeded after backup
-        mock_config_handler_cls.return_value.configure.assert_called_once()
diff --git a/train_models.sh b/train_models.sh
deleted file mode 100644
index 122a8fca..00000000
--- a/train_models.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-
-python src/train/train.py train --dataset combine --dataset_path ./data --model rf > rf_training.out
-python src/train/train.py train --dataset combine --dataset_path ./data --model xg > xg_training.out
-python src/train/train.py train --dataset combine --dataset_path ./data --model gbm > gbm_training.out