From 763708f76c8193dd173b83066dc0deec02255b7b Mon Sep 17 00:00:00 2001 From: Katie Strader Date: Thu, 25 Jun 2026 11:41:00 -0700 Subject: [PATCH] fix: set up log formatting to human readable text and set that as the dfault --- .../.dlt-example/config.toml | 4 +- .../.dlt-example/config.toml | 4 +- src/openhound/core/logging.py | 90 ++++++++++++++++--- tests/test_log_handlers.py | 66 +++++++++++++- 4 files changed, 150 insertions(+), 14 deletions(-) diff --git a/example-configurations/bloodhound-community/.dlt-example/config.toml b/example-configurations/bloodhound-community/.dlt-example/config.toml index 8122887..e1b7caa 100644 --- a/example-configurations/bloodhound-community/.dlt-example/config.toml +++ b/example-configurations/bloodhound-community/.dlt-example/config.toml @@ -2,8 +2,10 @@ [runtime] http_show_error_body = true log_cli_level = "WARNING" -log_format = "JSON" log_rotate_when = "midnight" +# Default: logs are set to human readable text +# To switch to structured JSON instead, uncomment the line below (must be uppercase "JSON") +# log_format = "JSON" [extract] workers = 8 diff --git a/example-configurations/bloodhound-enterprise/.dlt-example/config.toml b/example-configurations/bloodhound-enterprise/.dlt-example/config.toml index 119f7f9..584e4b4 100644 --- a/example-configurations/bloodhound-enterprise/.dlt-example/config.toml +++ b/example-configurations/bloodhound-enterprise/.dlt-example/config.toml @@ -2,8 +2,10 @@ [runtime] http_show_error_body = true log_cli_level = "WARNING" -log_format = "JSON" log_rotate_when = "midnight" +# Default: logs are set to human readable text +# To switch to structured JSON instead, uncomment the line below (must be uppercase "JSON") +# log_format = "JSON" [extract] workers = 8 diff --git a/src/openhound/core/logging.py b/src/openhound/core/logging.py index a11e253..a53233b 100644 --- a/src/openhound/core/logging.py +++ b/src/openhound/core/logging.py @@ -17,6 +17,8 @@ VALID_LEVELS = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] +VALID_FORMATS = ["json", "text"] + # This should be a complete list of default fields as part of a LogRecord # this is used to prevent custom fields overwriting the default LogRecord entries DEFAULT_LOG_FIELDS = [ @@ -155,6 +157,46 @@ def format(self, record: logging.LogRecord) -> str: return json.dumps(log_data) +class OpenHoundTextFormatter(logging.Formatter): + """Human-readable plain-text formatter for OpenHound file logs""" + + def format(self, record: logging.LogRecord) -> str: + """Format a log record as a single readable line with structured context. + + Args: + record (logging.LogRecord): The original log record + + Returns: + str: A human-readable plain-text representation of the log record + """ + timestamp = self.formatTime(record, "%Y-%m-%d %H:%M:%S") + location = f"{record.name}:{record.funcName}:{record.lineno}" + log_line = ( + f"{timestamp} [{record.levelname}] {location} - {record.getMessage()}" + ) + + # Append any custom/extra fields (e.g. resource, phase, extension) so the + # structured context is preserved without the noise of full JSON + extras = { + key: value + for key, value in record.__dict__.items() + if ( + not key.startswith("_") + and key not in DEFAULT_LOG_FIELDS + and key != "taskName" + and value is not None + ) + } + if extras: + extra_str = " ".join(f"{key}={value}" for key, value in extras.items()) + log_line = f"{log_line} | {extra_str}" + + if record.exc_info: + log_line = f"{log_line}\n{self.formatException(record.exc_info)}" + + return log_line + + class OpenHoundRichFormatter(logging.Formatter): """Custom formatter for Rich when logging in CLI mode""" @@ -182,6 +224,7 @@ def __init__( interval: int = 1, cli_level: str = "ERROR", base_path: str | None = None, + log_format: str = "text", ): self.level = level self.name = name @@ -192,6 +235,7 @@ def __init__( self.backup_count = backup_count self.interval = interval self.cli_level = cli_level + self.log_format = log_format self.base_path = Path(base_path) if base_path else self.default_platform_path() self.log_file_path: Path | None = None @@ -210,6 +254,14 @@ def _validate_level(level: str, default: str = "INFO") -> str: return default + @staticmethod + def _validate_format(log_format: str, default: str = "text") -> str: + normalized_format = log_format.lower() + if normalized_format in VALID_FORMATS: + return normalized_format + + return default + @staticmethod def _is_valid_path(base_path: Path, spec_path: Path) -> Path: """Validate the user provided path to prevent directory traversal and ensure it's within the base path @@ -275,12 +327,18 @@ def setup(self) -> None: dlt_interval = dlt.config.get("runtime.log_interval", int) dlt_cli_level = dlt.config.get("runtime.log_cli_level", str) dlt_log_path = dlt.config.get("runtime.log_path", str) + # dlt only treats the exact value "JSON" specially. We read it to select json logs and + # fall back to text for anything else (including dlt's default format string). + dlt_log_format = dlt.config.get("runtime.log_format", str) # Check if the DLT config values are valid and if so override the defaults self.level = self._validate_level(dlt_level or self.level, default="INFO") self.cli_level = self._validate_level( dlt_cli_level or self.cli_level, default="ERROR" ) + self.log_format = self._validate_format( + dlt_log_format or self.log_format, default="text" + ) # Override the base path if log_path is set in DLT config, otherwise use the default platform path self.base_path = Path(dlt_log_path) if dlt_log_path else self.base_path @@ -303,17 +361,29 @@ def setup(self) -> None: self.root_logger.handlers.clear() self.handlers[self.runtime_mode](self.root_logger, self.log_file_path) + def _file_formatter(self) -> logging.Formatter: + """Return the formatter for file/stdout handlers based on the configured log format. + + Returns: + logging.Formatter: A JSON formatter when log_format is 'json', otherwise a + human-readable plain-text formatter. + """ + if self.log_format == "json": + return OpenHoundJSONFormatter() + return OpenHoundTextFormatter() + def container_handlers(self, logger: logging.Logger, file_path: Path) -> None: """Set the logging handler/format when running in a container""" - json_formatter = OpenHoundJSONFormatter() + formatter = self._file_formatter() - # Log to stdout in JSON for better compatibility with container-based logging systems + # Log to stdout for better compatibility with container-based logging systems. + # Output is human-readable text by default; set runtime.log_format = "JSON" for structured JSON. stdout_handler = logging.StreamHandler(sys.stdout) - stdout_handler.setFormatter(json_formatter) + stdout_handler.setFormatter(formatter) logger.addHandler(stdout_handler) - # But also log the same json format to a file for persistence and debugging when needed + # But also log the same format to a file for persistence and debugging when needed rotating_file_handler = RotatingFileHandler( file_path, when=self.rotate_when, @@ -321,7 +391,7 @@ def container_handlers(self, logger: logging.Logger, file_path: Path) -> None: backupCount=self.backup_count, max_bytes=self.max_bytes, ) - rotating_file_handler.setFormatter(json_formatter) + rotating_file_handler.setFormatter(formatter) # This regular expression overrides the default extMatch to recognize both # default time based rotation filenames and size based rotation filenames (which gets a seconds added as well) rotating_file_handler.extMatch = re.compile( @@ -347,8 +417,8 @@ def cli_handlers(self, logger: logging.Logger, file_path: Path) -> None: console_handler.setFormatter(rich_formatter) logger.addHandler(console_handler) - # But also save the logs to a file in JSON format :) - json_formatter = OpenHoundJSONFormatter() + # But also save the logs to a file using the configured format (text by default) + file_formatter = self._file_formatter() rotating_file_handler = RotatingFileHandler( file_path, when=self.rotate_when, @@ -356,7 +426,7 @@ def cli_handlers(self, logger: logging.Logger, file_path: Path) -> None: backupCount=self.backup_count, max_bytes=self.max_bytes, ) - rotating_file_handler.setFormatter(json_formatter) + rotating_file_handler.setFormatter(file_formatter) # This regular expression overrides the default extMatch to recognize both # default time based rotation filenames and size based rotation filenames (which gets a seconds added as well) rotating_file_handler.extMatch = re.compile( @@ -367,7 +437,7 @@ def cli_handlers(self, logger: logging.Logger, file_path: Path) -> None: def service_handlers(self, logger: logging.Logger, file_path: Path) -> None: """Set the logging handler/format when running the OpenHound service""" - json_formatter = OpenHoundJSONFormatter() + file_formatter = self._file_formatter() rotating_file_handler = RotatingFileHandler( file_path, when=self.rotate_when, @@ -375,7 +445,7 @@ def service_handlers(self, logger: logging.Logger, file_path: Path) -> None: backupCount=self.backup_count, max_bytes=self.max_bytes, ) - rotating_file_handler.setFormatter(json_formatter) + rotating_file_handler.setFormatter(file_formatter) # This regular expression overrides the default extMatch to recognize both # default time based rotation filenames and size based rotation filenames (which gets a seconds added as well) rotating_file_handler.extMatch = re.compile( diff --git a/tests/test_log_handlers.py b/tests/test_log_handlers.py index 01f59a8..1a6ca1f 100644 --- a/tests/test_log_handlers.py +++ b/tests/test_log_handlers.py @@ -1,7 +1,15 @@ import json import logging -from openhound.core.logging import RotatingFileHandler, logger_override +import pytest + +from openhound.core.logging import ( + CustomLogger, + OpenHoundJSONFormatter, + OpenHoundTextFormatter, + RotatingFileHandler, + logger_override, +) def test_root_handler_setup(): @@ -59,8 +67,10 @@ def test_dlt_extension_handlers(): ) -def test_log_routing_content(tmp_path, caplog): +def test_log_routing_content(tmp_path, caplog, monkeypatch): """Test that logs are correctly routed and that the files are created for the expected paths""" + # Pin JSON (dlt's only crash-safe override value) so the JSON-parsing assertions are self-contained. + monkeypatch.setenv("RUNTIME__LOG_FORMAT", "JSON") logger_override.base_path = tmp_path logger_override.setup() logger_override.set_handler("test_extension") @@ -94,3 +104,55 @@ def test_log_routing_content(tmp_path, caplog): assert ext_logs_json[0]["message"] == "Extension DLT log", ( "The extension log message should be present in 'ext_test_extension.log'" ) + + +def test_validate_format_defaults_to_text(): + """Test that the log format validation accepts text/json and falls back to text""" + assert CustomLogger._validate_format("JSON") == "json" + assert CustomLogger._validate_format("text") == "text" + assert CustomLogger._validate_format("invalid") == "text" + + +def test_file_formatter_selection(): + """Test that the configured log_format selects the matching file formatter""" + json_logger = CustomLogger("openhound.log", log_format="json") + assert isinstance(json_logger._file_formatter(), OpenHoundJSONFormatter), ( + "log_format 'json' should produce a JSON formatter" + ) + + text_logger = CustomLogger("openhound.log", log_format="text") + assert isinstance(text_logger._file_formatter(), OpenHoundTextFormatter), ( + "log_format 'text' should produce a plain-text formatter" + ) + + +def test_text_formatter_produces_plain_text(): + """Test that the text formatter produces a readable, non-JSON line with extras""" + formatter = OpenHoundTextFormatter() + record = logging.LogRecord( + name="openhound.core.collect", + level=logging.ERROR, + pathname="collect.py", + lineno=61, + msg="Starting collector %s", + args=("github",), + exc_info=None, + func="run", + ) + record.resource = "scim_users" + record.taskName = None + + output = formatter.format(record) + + assert "Starting collector github" in output, ( + "The formatted message should be rendered with its args" + ) + assert "[ERROR" in output, "The log level should be present in the output" + assert "openhound.core.collect:run:61" in output, ( + "The logger/function/line location should be present in the output" + ) + assert "resource=scim_users" in output, "Extra fields should be preserved" + assert "taskName" not in output, "Null extras like taskName should be dropped" + + with pytest.raises(json.JSONDecodeError): + json.loads(output)