diff --git a/Cargo.lock b/Cargo.lock
index b4a9b6d86ef49..338fd114ae855 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2193,9 +2193,9 @@ dependencies = [
 
 [[package]]
 name = "bon"
-version = "3.8.1"
+version = "3.9.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ebeb9aaf9329dff6ceb65c689ca3db33dbf15f324909c60e4e5eef5701ce31b1"
+checksum = "f47dbe92550676ee653353c310dfb9cf6ba17ee70396e1f7cf0a2020ad49b2fe"
 dependencies = [
  "bon-macros",
  "rustversion",
@@ -2203,11 +2203,11 @@ dependencies = [
 
 [[package]]
 name = "bon-macros"
-version = "3.8.1"
+version = "3.9.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "77e9d642a7e3a318e37c2c9427b5a6a48aa1ad55dcd986f3034ab2239045a645"
+checksum = "519bd3116aeeb42d5372c29d982d16d0170d3d4a5ed85fc7dd91642ffff3c67c"
 dependencies = [
- "darling 0.21.3",
+ "darling 0.23.0",
  "ident_case",
  "prettyplease",
  "proc-macro2 1.0.106",
@@ -3334,16 +3334,6 @@ dependencies = [
  "darling_macro 0.20.11",
 ]
 
-[[package]]
-name = "darling"
-version = "0.21.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0"
-dependencies = [
- "darling_core 0.21.3",
- "darling_macro 0.21.3",
-]
-
 [[package]]
 name = "darling"
 version = "0.23.0"
@@ -3368,20 +3358,6 @@ dependencies = [
  "syn 2.0.117",
 ]
 
-[[package]]
-name = "darling_core"
-version = "0.21.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1247195ecd7e3c85f83c8d2a366e4210d588e802133e1e355180a9870b517ea4"
-dependencies = [
- "fnv",
- "ident_case",
- "proc-macro2 1.0.106",
- "quote 1.0.45",
- "strsim",
- "syn 2.0.117",
-]
-
 [[package]]
 name = "darling_core"
 version = "0.23.0"
@@ -3406,17 +3382,6 @@ dependencies = [
  "syn 2.0.117",
 ]
 
-[[package]]
-name = "darling_macro"
-version = "0.21.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81"
-dependencies = [
- "darling_core 0.21.3",
- "quote 1.0.45",
- "syn 2.0.117",
-]
-
 [[package]]
 name = "darling_macro"
 version = "0.23.0"
@@ -3851,6 +3816,17 @@ version = "0.15.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b"
 
+[[package]]
+name = "drain-log"
+version = "0.1.0"
+dependencies = [
+ "bon",
+ "fastrand",
+ "smallvec",
+ "snafu 0.8.9",
+ "string-interner",
+]
+
 [[package]]
 name = "duct"
 version = "0.13.6"
@@ -4224,9 +4200,9 @@ dependencies = [
 
 [[package]]
 name = "fastrand"
-version = "2.3.0"
+version = "2.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
+checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
 
 [[package]]
 name = "ff"
@@ -11103,6 +11079,17 @@ dependencies = [
  "tokio",
 ]
 
+[[package]]
+name = "string-interner"
+version = "0.15.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "07f9fdfdd31a0ff38b59deb401be81b73913d76c9cc5b1aed4e1330a223420b9"
+dependencies = [
+ "cfg-if",
+ "hashbrown 0.14.5",
+ "serde",
+]
+
 [[package]]
 name = "string_cache"
 version = "0.8.7"
@@ -12890,6 +12877,7 @@ dependencies = [
  "dirs-next",
  "dnsmsg-parser",
  "dnstap-parser",
+ "drain-log",
  "dyn-clone",
  "encoding_rs",
  "enum_dispatch",
diff --git a/Cargo.toml b/Cargo.toml
index a0baa2a76a980..fe3cb8a85a060 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -101,6 +101,7 @@ members = [
   "lib/codecs",
   "lib/dnsmsg-parser",
   "lib/docs-renderer",
+  "lib/drain-log",
   "lib/fakedata",
   "lib/file-source",
   "lib/file-source-common",
@@ -379,6 +380,7 @@ csv = { version = "1.3", default-features = false }
 databend-client = { version = "0.28.0", default-features = false, features = ["rustls"], optional = true }
 derivative.workspace = true
 dirs-next = { version = "2.0.0", default-features = false, optional = true }
+drain-log = { path = "lib/drain-log", optional = true }
 dyn-clone = { version = "1.0.20", default-features = false }
 encoding_rs = { version = "0.8.35", default-features = false, features = ["serde"] }
 enum_dispatch = { version = "0.3.13", default-features = false }
@@ -791,6 +793,7 @@ transforms-logs = [
   "transforms-aws_ec2_metadata",
   "transforms-dedupe",
   "transforms-delay",
+  "transforms-drain",
   "transforms-filter",
   "transforms-window",
   "transforms-log_to_metric",
@@ -821,6 +824,7 @@ transforms-aggregate = []
 transforms-aws_ec2_metadata = ["dep:arc-swap"]
 transforms-dedupe = ["transforms-impl-dedupe"]
 transforms-delay = []
+transforms-drain = ["dep:drain-log"]
 transforms-filter = []
 transforms-incremental_to_absolute = []
 transforms-window = []
diff --git a/changelog.d/drain_transform.feature.md b/changelog.d/drain_transform.feature.md
new file mode 100644
index 0000000000000..384ea4294a015
--- /dev/null
+++ b/changelog.d/drain_transform.feature.md
@@ -0,0 +1,9 @@
+Added a new `drain` transform that clusters log lines using the Drain log
+parsing algorithm and annotates each event with a derived template string
+(e.g. `user <*> logged in from <*>`). Mirrors the OpenTelemetry Collector
+`drain` processor, including `seed_templates`, `seed_logs`, and
+`warmup_min_clusters` for stable templates across deployments. Use the
+emitted template field as input to a downstream `filter`/`route` to act on
+classes of log patterns.
+
+authors: srstrickland
diff --git a/lib/drain-log/Cargo.toml b/lib/drain-log/Cargo.toml
new file mode 100644
index 0000000000000..b9375b8a9e39c
--- /dev/null
+++ b/lib/drain-log/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name = "drain-log"
+version = "0.1.0"
+edition = "2021"
+authors = ["Vector Contributors <vector@datadoghq.com>"]
+description = "Log template extraction via the Drain algorithm with LRU cluster eviction. Adapted from drain3 (akshatagarwl)."
+license = "Apache-2.0"
+publish = false
+
+[dependencies]
+bon = "3.9.1"
+fastrand = "2.4.1"
+snafu = "0.8"
+string-interner = { version = "0.15", features = ["backends"] }
+smallvec = "1.13"
diff --git a/lib/drain-log/LICENSE b/lib/drain-log/LICENSE
new file mode 100644
index 0000000000000..c63d8ac95f8d6
--- /dev/null
+++ b/lib/drain-log/LICENSE
@@ -0,0 +1,17 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   Copyright 2026 Akshat Agarwal
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/lib/drain-log/NOTICE b/lib/drain-log/NOTICE
new file mode 100644
index 0000000000000..4b3bfff2bb954
--- /dev/null
+++ b/lib/drain-log/NOTICE
@@ -0,0 +1,24 @@
+drain-log
+Copyright 2026 Vector Contributors
+
+This product includes software derived from drain3 (Apache License 2.0):
+
+    drain3 — Fast log template extraction via fixed-depth prefix trees
+    Copyright 2026 Akshat Agarwal
+    https://github.com/akshatagarwl/drain3
+
+drain3 is itself a Rust port of logpai/Drain3:
+
+    Drain3 — Streaming log template miner with persistence and masking
+    https://github.com/logpai/Drain3
+    Released under the MIT License.
+
+Local additions on top of the upstream drain3 sources:
+  * True LRU eviction of clusters once `max_clusters` is reached, so the
+    matcher can adapt to drifting log vocabularies on long-running streams
+    without unbounded memory growth. The LRU is implemented as an intrusive
+    doubly-linked list threaded through `Cluster`, giving O(1) touch on
+    match and O(1) eviction on cap; freed cluster ids are recycled so the
+    `clusters` slot vector stays bounded.
+  * A `cluster_count` accessor on `Matcher` exposing the live tracked
+    cluster count after eviction.
diff --git a/lib/drain-log/src/lib.rs b/lib/drain-log/src/lib.rs
new file mode 100644
index 0000000000000..bccfafdb74896
--- /dev/null
+++ b/lib/drain-log/src/lib.rs
@@ -0,0 +1,1149 @@
+#![forbid(unsafe_code)]
+
+//! drain3 — fast log template extraction via fixed-depth prefix trees.
+//!
+//! Rust port of [logpai/Drain3](https://github.com/logpai/Drain3). Splits log lines into tokens,
+//! clusters them by a prefix tree keyed on token count, and replaces
+//! variable tokens with a param placeholder (`<*>` by default).
+//!
+//! # Example
+//! ```
+//! use drain3::Config;
+//!
+//! # fn main() -> Result<(), drain3::Error> {
+//! let samples: Vec<String> = vec![
+//!     "connection from 10.0.0.1 timeout after 5000ms".into(),
+//!     "connection from 10.0.0.2 timeout after 3000ms".into(),
+//!     "connection from 10.0.0.3 timeout after 1000ms".into(),
+//! ];
+//! let matcher = drain3::train(&samples, Config::default())?;
+//! let (id, args, ok) = matcher.match_line("connection from 192.168.1.1 timeout after 42ms");
+//! assert!(ok);
+//! assert_eq!(args, vec!["192.168.1.1", "42ms"]);
+//! # Ok(())
+//! # }
+//! ```
+use smallvec::SmallVec;
+use snafu::Snafu;
+use std::sync::{Arc, Mutex};
+use string_interner::backend::BucketBackend;
+use string_interner::StringInterner;
+
+mod prefilter;
+mod render;
+mod tokenizer;
+mod tree;
+
+pub use render::RenderPlan;
+pub(crate) use tree::{Cluster, Node};
+
+/// Errors that can occur during training or template reconstruction.
+#[derive(Debug, Snafu)]
+pub enum Error {
+    /// Tree depth is below the minimum of 3.
+    #[snafu(display("depth must be >= 3, got {got}"))]
+    InvalidDepth { got: usize },
+    /// Similarity threshold is outside [0, 1].
+    #[snafu(display("similarity threshold must be in [0, 1], got {got}"))]
+    InvalidSimilarityThreshold { got: f64 },
+    /// Match threshold is outside [0, 1].
+    #[snafu(display("match threshold must be in [0, 1], got {got}"))]
+    InvalidMatchThreshold { got: f64 },
+    /// Max children is below the minimum of 2.
+    #[snafu(display("max children must be >= 2, got {got}"))]
+    InvalidMaxChildren { got: usize },
+    /// Max tokens must be >= 1.
+    #[snafu(display("max tokens must be >= 1, got {got}"))]
+    InvalidMaxTokens { got: usize },
+    /// Max bytes must be >= 1.
+    #[snafu(display("max bytes must be >= 1, got {got}"))]
+    InvalidMaxBytes { got: usize },
+    /// Param string was empty.
+    #[snafu(display("param string must not be empty"))]
+    EmptyParamString,
+    /// Template id must be > 0.
+    #[snafu(display("template id must be > 0, got {id}"))]
+    InvalidTemplateId { id: usize },
+    /// Duplicate template id encountered.
+    #[snafu(display("duplicate template id {id}"))]
+    DuplicateTemplateId { id: usize },
+    /// Template count must be > 0.
+    #[snafu(display("template {id} count must be > 0"))]
+    ZeroCountTemplate { id: usize },
+    /// Internal error: cluster not found (programming bug).
+    #[snafu(display("internal error: cluster {id} not found"))]
+    ClusterNotFound { id: usize },
+    /// Internal error: root node not initialized for token count.
+    #[snafu(display("internal error: root not initialized for token count {token_count}"))]
+    InternalRootNotInitialized { token_count: usize },
+    /// Max clusters reached during training.
+    #[snafu(display("max clusters {limit} reached"))]
+    MaxClustersReached { limit: usize },
+    /// Line exceeds max_bytes configuration.
+    #[snafu(display("line too long: {length} bytes (max: {max_bytes})"))]
+    LineTooLong { length: usize, max_bytes: usize },
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub(crate) struct TokenId(pub(crate) u64);
+
+impl From<usize> for TokenId {
+    fn from(s: usize) -> Self {
+        TokenId(s as u64)
+    }
+}
+
+#[allow(dead_code)]
+impl From<TokenId> for usize {
+    fn from(id: TokenId) -> Self {
+        id.0 as usize
+    }
+}
+
+const DEFAULT_DEPTH: usize = 4;
+
+/// Default similarity threshold for training (0.0–1.0).
+/// Fraction of tokens that must match for a line to join a cluster.
+const DEFAULT_SIMILARITY_THRESHOLD: f64 = 0.5;
+
+/// Default match threshold for matching (0.0–1.0).
+/// Fraction of tokens that must match for a line to be considered a match.
+const DEFAULT_MATCH_THRESHOLD: f64 = 1.0;
+
+/// Default max children per inner node.
+/// One slot is reserved for the param catch-all child.
+const DEFAULT_MAX_CHILDREN: usize = 100;
+
+/// Default max tokens per line.
+/// Lines exceeding this are skipped during training and matching.
+const DEFAULT_MAX_TOKENS: usize = 64;
+
+/// Default max bytes per line.
+/// Lines exceeding this are skipped during training and matching.
+const DEFAULT_MAX_BYTES: usize = 1024;
+
+/// Default max clusters. 0 = unlimited.
+const DEFAULT_MAX_CLUSTERS: usize = 0;
+
+/// Minimum allowed tree depth.
+const MIN_DEPTH: usize = 3;
+
+/// Minimum allowed max_children value.
+const MIN_MAX_CHILDREN: usize = 2;
+
+/// Minimum allowed max_tokens and max_bytes value.
+const MIN_LINE_LIMIT: usize = 1;
+
+/// Stack capacity for prefilter candidate buffer.
+/// Determines how many cluster candidates can be collected without heap allocation.
+const PREFILTER_CAPACITY: usize = 16;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub(crate) struct ClusterId(pub(crate) usize);
+
+impl From<ClusterId> for usize {
+    fn from(id: ClusterId) -> Self {
+        id.0
+    }
+}
+
+impl From<usize> for ClusterId {
+    fn from(s: usize) -> Self {
+        ClusterId(s)
+    }
+}
+
+/// Controls training and matching behavior.
+#[derive(Debug, Clone, PartialEq, bon::Builder)]
+pub struct Config {
+    #[builder(default = DEFAULT_DEPTH)]
+    pub depth: usize,
+    #[builder(default = DEFAULT_SIMILARITY_THRESHOLD)]
+    pub similarity_threshold: f64,
+    #[builder(default = DEFAULT_MATCH_THRESHOLD)]
+    pub match_threshold: f64,
+    #[builder(default = DEFAULT_MAX_CHILDREN)]
+    pub max_children: usize,
+    #[builder(default = DEFAULT_MAX_TOKENS)]
+    pub max_tokens: usize,
+    #[builder(default = DEFAULT_MAX_BYTES)]
+    pub max_bytes: usize,
+    #[builder(default = DEFAULT_MAX_CLUSTERS)]
+    pub max_clusters: usize,
+    #[builder(default = Arc::from("<*>"))]
+    pub param_string: Arc<str>,
+    #[builder(default = true)]
+    pub parametrize_numeric_tokens: bool,
+    #[builder(default)]
+    pub extra_delimiters: Vec<String>,
+    #[builder(default = true)]
+    pub enable_match_prefilter: bool,
+}
+
+impl Config {
+    fn validate(&self) -> Result<(), Error> {
+        if self.depth < MIN_DEPTH {
+            return Err(Error::InvalidDepth { got: self.depth });
+        }
+        if !(0.0..=1.0).contains(&self.similarity_threshold) {
+            return Err(Error::InvalidSimilarityThreshold {
+                got: self.similarity_threshold,
+            });
+        }
+        if !(0.0..=1.0).contains(&self.match_threshold) {
+            return Err(Error::InvalidMatchThreshold {
+                got: self.match_threshold,
+            });
+        }
+        if self.max_children < MIN_MAX_CHILDREN {
+            return Err(Error::InvalidMaxChildren {
+                got: self.max_children,
+            });
+        }
+        if self.max_tokens < MIN_LINE_LIMIT {
+            return Err(Error::InvalidMaxTokens {
+                got: self.max_tokens,
+            });
+        }
+        if self.max_bytes < MIN_LINE_LIMIT {
+            return Err(Error::InvalidMaxBytes {
+                got: self.max_bytes,
+            });
+        }
+        if self.param_string.is_empty() {
+            return Err(Error::EmptyParamString);
+        }
+        Ok(())
+    }
+}
+
+impl Default for Config {
+    fn default() -> Self {
+        Self {
+            depth: DEFAULT_DEPTH,
+            similarity_threshold: DEFAULT_SIMILARITY_THRESHOLD,
+            match_threshold: DEFAULT_MATCH_THRESHOLD,
+            max_children: DEFAULT_MAX_CHILDREN,
+            max_tokens: DEFAULT_MAX_TOKENS,
+            max_bytes: DEFAULT_MAX_BYTES,
+            max_clusters: DEFAULT_MAX_CLUSTERS,
+            param_string: "<*>".into(),
+            parametrize_numeric_tokens: true,
+            extra_delimiters: Vec::new(),
+            enable_match_prefilter: true,
+        }
+    }
+}
+
+/// A trained log template.
+#[derive(Debug, Clone, PartialEq)]
+pub struct Template {
+    id: usize,
+    tokens: Vec<Arc<str>>,
+    params: Vec<bool>,
+    token_count: usize,
+    count: usize,
+}
+
+impl Template {
+    /// Cluster id.
+    pub fn id(&self) -> usize {
+        self.id
+    }
+    /// Dense token list: only non-param tokens, in order.
+    pub fn tokens(&self) -> &[Arc<str>] {
+        &self.tokens
+    }
+    /// Whether position `idx` is a param placeholder.
+    ///
+    /// # Panics
+    /// Panics if `idx` is out of bounds (>= `token_count`).
+    pub fn is_param(&self, idx: usize) -> bool {
+        self.params[idx]
+    }
+    /// Total number of positions (len(tokens) + param_count).
+    pub fn token_count(&self) -> usize {
+        self.token_count
+    }
+    /// Number of matching log lines.
+    pub fn count(&self) -> usize {
+        self.count
+    }
+}
+/// A trained DRAIN matcher. Holds the prefix tree, token dictionary, and
+/// precomputed indices for fast line matching.
+///
+/// Create via [`train`] or [`matcher_from_templates`].
+pub struct Matcher {
+    cfg: Config,
+    templates: Vec<Template>,
+    nodes: Vec<Node>,
+    root_by_len: Vec<Option<usize>>,
+    clusters: Vec<Option<Cluster>>,
+    param_id: TokenId,
+    next_cluster_id: ClusterId,
+    min_match_scores: Vec<usize>,
+    prefilter_buckets: Vec<prefilter::PrefilterBucket>,
+    has_param_first: bool,
+    interner: StringInterner<BucketBackend<usize>>,
+    token_buf: Mutex<Vec<Arc<str>>>,
+    /// Cluster ids freed by LRU eviction, ready to be reused for new
+    /// clusters so the `clusters` Vec stays bounded at roughly
+    /// `max_clusters` slots on long-running streams.
+    free_ids: Vec<usize>,
+    /// Head of the LRU doubly-linked list (the least-recently-used cluster
+    /// — first to be evicted when the cap is hit).
+    lru_head: Option<usize>,
+    /// Tail of the LRU doubly-linked list (the most-recently-used cluster).
+    /// New and freshly-touched clusters are spliced in here.
+    lru_tail: Option<usize>,
+}
+impl Matcher {
+    pub fn new(cfg: Config) -> Self {
+        let mut interner = StringInterner::new();
+        let param_id = TokenId::from(interner.get_or_intern(&cfg.param_string));
+        Self {
+            cfg: cfg.clone(),
+            templates: Vec::new(),
+            nodes: Vec::new(),
+            root_by_len: Vec::new(),
+            clusters: vec![None],
+            param_id,
+            next_cluster_id: ClusterId(1),
+            min_match_scores: Vec::new(),
+            prefilter_buckets: Vec::new(),
+            has_param_first: false,
+            interner,
+            token_buf: Mutex::new(Vec::with_capacity(16)),
+            free_ids: Vec::new(),
+            lru_head: None,
+            lru_tail: None,
+        }
+    }
+
+    /// Number of clusters currently tracked. Accounts for slots freed by
+    /// LRU eviction.
+    pub fn cluster_count(&self) -> usize {
+        (self.next_cluster_id.0 - 1).saturating_sub(self.free_ids.len())
+    }
+
+    /// Unlink the given cluster id from the LRU doubly-linked list. The
+    /// cluster's slot is left intact — only the prev/next pointers and the
+    /// matcher's head/tail are touched. O(1).
+    fn lru_unlink(&mut self, id: usize) {
+        let (prev, next) = {
+            let Some(c) = self.clusters[id].as_ref() else {
+                return;
+            };
+            (c.lru_prev, c.lru_next)
+        };
+        match prev {
+            Some(p) => {
+                if let Some(pc) = self.clusters[p].as_mut() {
+                    pc.lru_next = next;
+                }
+            }
+            None => self.lru_head = next,
+        }
+        match next {
+            Some(n) => {
+                if let Some(nc) = self.clusters[n].as_mut() {
+                    nc.lru_prev = prev;
+                }
+            }
+            None => self.lru_tail = prev,
+        }
+        if let Some(c) = self.clusters[id].as_mut() {
+            c.lru_prev = None;
+            c.lru_next = None;
+        }
+    }
+
+    /// Splice the given cluster id onto the MRU end of the LRU list. The
+    /// caller must have already cleared any stale prev/next pointers (e.g.
+    /// by calling [`lru_unlink`] or constructing a fresh cluster). O(1).
+    fn lru_push_tail(&mut self, id: usize) {
+        let old_tail = self.lru_tail;
+        if let Some(c) = self.clusters[id].as_mut() {
+            c.lru_prev = old_tail;
+            c.lru_next = None;
+        }
+        match old_tail {
+            Some(t) => {
+                if let Some(tc) = self.clusters[t].as_mut() {
+                    tc.lru_next = Some(id);
+                }
+            }
+            None => self.lru_head = Some(id),
+        }
+        self.lru_tail = Some(id);
+    }
+
+    /// Promote the given cluster id to the MRU end. Cheap no-op if it is
+    /// already the tail. O(1).
+    fn lru_touch(&mut self, id: usize) {
+        if self.lru_tail == Some(id) {
+            return;
+        }
+        self.lru_unlink(id);
+        self.lru_push_tail(id);
+    }
+
+    /// Pop the LRU end of the list, returning its cluster id. O(1).
+    fn lru_pop_head(&mut self) -> Option<usize> {
+        let id = self.lru_head?;
+        self.lru_unlink(id);
+        Some(id)
+    }
+
+    /// Evict the least-recently-used cluster: pop it from the LRU list,
+    /// scrub it from the tree node that owns it, free its id slot for
+    /// reuse, and return the freed id (or `None` if there are no clusters
+    /// to evict). O(1) amortised — only the per-node `cluster_ids` retain
+    /// is linear in that one node's list length, which is bounded by
+    /// `max_node_children`.
+    fn evict_lru(&mut self) -> Option<usize> {
+        let id = self.lru_pop_head()?;
+        let node_idx = self.clusters[id].as_ref().and_then(|c| c.node_idx);
+        if let Some(idx) = node_idx {
+            let cid = ClusterId(id);
+            self.nodes[idx].cluster_ids.retain(|&x| x != cid);
+        }
+        self.clusters[id] = None;
+        self.free_ids.push(id);
+        Some(id)
+    }
+    fn resolve_token_id<T: AsRef<str>>(&self, token: T) -> TokenId {
+        self.interner
+            .get(token.as_ref())
+            .map(TokenId::from)
+            .unwrap_or(self.param_id)
+    }
+    fn intern_token(&mut self, token: &str) -> TokenId {
+        TokenId::from(self.interner.get_or_intern(token))
+    }
+    fn intern_token_ids(&mut self, tokens: &[Arc<str>], dst: &mut Vec<TokenId>) {
+        dst.clear();
+        dst.reserve(tokens.len());
+        for t in tokens {
+            dst.push(self.intern_token(t));
+        }
+    }
+    fn required_score(&self, token_count: usize, sim_th: f64) -> usize {
+        if sim_th == self.cfg.match_threshold && token_count < self.min_match_scores.len() {
+            return self.min_match_scores[token_count];
+        }
+        (sim_th * token_count as f64).ceil() as usize
+    }
+    fn rebuild_min_match_scores(&mut self) {
+        self.min_match_scores.resize(self.root_by_len.len(), 0);
+        for tc in 0..self.min_match_scores.len() {
+            self.min_match_scores[tc] = (self.cfg.match_threshold * tc as f64).ceil() as usize;
+        }
+    }
+
+    pub fn match_line(&self, line: &str) -> (usize, Vec<String>, bool) {
+        let (cluster_id, tc) = self.find_match_with_tc(line);
+        if let Some(cid) = cluster_id {
+            let mut args = Vec::new();
+            let id = self.fill_match_args(cid, tc, &mut args);
+            return (id, args, true);
+        }
+        (0, Vec::new(), false)
+    }
+
+    fn fill_match_args(&self, cluster_id: ClusterId, tc: usize, dst: &mut Vec<String>) -> usize {
+        let cluster = self.clusters[cluster_id.0].as_ref().unwrap();
+        let token_buf = self.token_buf.lock().unwrap();
+        dst.clear();
+        dst.reserve(cluster.param_positions.len());
+        for &pos in &cluster.param_positions {
+            if pos < tc {
+                dst.push(token_buf[pos].to_string());
+            }
+        }
+        cluster.id.0
+    }
+
+    fn find_match_with_tc(&self, line: &str) -> (Option<ClusterId>, usize) {
+        let mut token_buf = self.token_buf.lock().unwrap();
+        token_buf.clear();
+        if !self.has_param_first && self.cfg.extra_delimiters.is_empty() {
+            let first_tok = &line[..line.find(' ').unwrap_or(line.len())];
+            if self.interner.get(first_tok).is_none() {
+                return (None, 0);
+            }
+        }
+        let Some(tc) = self.tokenize_input_internal(&mut token_buf, line) else {
+            return (None, 0);
+        };
+        if tc >= self.root_by_len.len() || self.root_by_len[tc].is_none() {
+            return (None, tc);
+        }
+        if self.cfg.enable_match_prefilter && tc < self.prefilter_buckets.len() {
+            let mut candidates: SmallVec<[ClusterId; PREFILTER_CAPACITY]> = SmallVec::new();
+            if prefilter::prefilter_candidates_compact(
+                &self.prefilter_buckets,
+                &self.interner,
+                self.param_id,
+                &token_buf,
+                &mut candidates,
+            )
+            .is_some()
+            {
+                return (
+                    self.fast_match_strings(
+                        &candidates,
+                        &token_buf,
+                        self.cfg.match_threshold,
+                        true,
+                    )
+                    .map(|c| c.id),
+                    tc,
+                );
+            }
+            return (None, tc);
+        }
+        let cluster = self.tree_search_with_threshold(&token_buf, self.cfg.match_threshold, true);
+        (cluster.map(|c| c.id), tc)
+    }
+    fn tokenize_input_internal(
+        &self,
+        token_buf: &mut Vec<Arc<str>>,
+        content: &str,
+    ) -> Option<usize> {
+        if content.len() > self.cfg.max_bytes {
+            return None;
+        }
+        token_buf.clear();
+        if self.cfg.extra_delimiters.is_empty() {
+            let count =
+                tokenizer::tokenize_whitespace_count(content, token_buf, self.cfg.max_tokens);
+            if count == 0 || count > self.cfg.max_tokens {
+                return None;
+            }
+        } else {
+            tokenizer::tokenize(
+                content,
+                &self.cfg.extra_delimiters,
+                self.cfg.max_tokens,
+                token_buf,
+            );
+            if token_buf.is_empty() || token_buf.len() > self.cfg.max_tokens {
+                return None;
+            }
+        }
+        Some(token_buf.len())
+    }
+    pub fn match_into(&self, line: &str, dst: &mut Vec<String>) -> (usize, bool) {
+        let (cluster_id, tc) = self.find_match_with_tc(line);
+        if let Some(cid) = cluster_id {
+            let id = self.fill_match_args(cid, tc, dst);
+            return (id, true);
+        }
+        (0, false)
+    }
+    pub fn match_id(&self, line: &str) -> Option<usize> {
+        self.find_match_with_tc(line).0.map(|c| c.0)
+    }
+    pub fn find(&self, line: &str) -> (usize, Vec<String>, bool) {
+        self.match_line(line)
+    }
+    /// Return a reference to the matcher's config.
+    pub fn config(&self) -> &Config {
+        &self.cfg
+    }
+    /// Return trained templates sorted by descending count.
+    ///
+    /// This is a reference — mutations do not affect the matcher.
+    pub fn templates(&self) -> &[Template] {
+        &self.templates
+    }
+    /// Template by cluster id.
+    pub fn template_for_id(&self, id: usize) -> Option<Template> {
+        self.clusters
+            .get(id)?
+            .as_ref()?
+            .to_template(&self.interner, self.param_id)
+            .into()
+    }
+    fn tokenize_input(&self, content: &str) -> Option<usize> {
+        let mut token_buf = self.token_buf.lock().unwrap();
+        self.tokenize_input_internal(&mut token_buf, content)
+    }
+    fn tree_search_with_threshold(
+        &self,
+        tokens: &[Arc<str>],
+        threshold: f64,
+        include_params: bool,
+    ) -> Option<&Cluster> {
+        let tc = tokens.len();
+        if tc >= self.root_by_len.len() {
+            return None;
+        }
+        let root_idx = self.root_by_len[tc]?;
+        if tc == 0 {
+            return self.nodes[root_idx]
+                .cluster_ids
+                .first()
+                .and_then(|&id| self.clusters.get(id.0).and_then(|c| c.as_ref()));
+        }
+        let max_depth = self.cfg.depth.saturating_sub(2);
+        let cur_idx = self.descend_prefix(root_idx, tokens, max_depth, tc)?;
+        self.fast_match_strings(
+            &self.nodes[cur_idx].cluster_ids,
+            tokens,
+            threshold,
+            include_params,
+        )
+    }
+    fn descend_prefix(
+        &self,
+        cur_idx: usize,
+        tokens: &[Arc<str>],
+        max_depth: usize,
+        tc: usize,
+    ) -> Option<usize> {
+        let mut cur_idx = cur_idx;
+        let limit = (max_depth - 1).min(tc - 1);
+        for tok in tokens.iter().take(limit) {
+            let tid = self.resolve_token_id(tok);
+            let next = self.nodes[cur_idx]
+                .children
+                .get(&tid)
+                .copied()
+                .or_else(|| self.nodes[cur_idx].children.get(&self.param_id).copied());
+            cur_idx = next?;
+        }
+        Some(cur_idx)
+    }
+
+    fn fast_match_strings(
+        &self,
+        cluster_ids: &[ClusterId],
+        tokens: &[Arc<str>],
+        threshold: f64,
+        include_params: bool,
+    ) -> Option<&Cluster> {
+        let n_tokens = tokens.len();
+        let needed = self.required_score(n_tokens, threshold);
+        let exact_mode = include_params && threshold >= 1.0;
+
+        let mut best_score: isize = -1;
+        let mut best_param_count: isize = -1;
+        let mut best_cluster: Option<&Cluster> = None;
+
+        for &cid in cluster_ids {
+            let Some(c) = self.clusters.get(cid.0).and_then(|c| c.as_ref()) else {
+                continue;
+            };
+            if c.token_str.len() != n_tokens {
+                continue;
+            }
+
+            let param_count = c.param_count;
+            let mut sim_tokens: isize = if include_params {
+                param_count as isize
+            } else {
+                0
+            };
+            let mut remaining = c.non_param_idx.len();
+
+            let anchor0_pos = c.anchor0;
+            let anchor1_pos = c.anchor1;
+
+            if let Some(a) = anchor0_pos {
+                if c.token_str[a] == tokens[a] {
+                    sim_tokens += 1;
+                }
+                remaining -= 1;
+                if sim_tokens + (remaining as isize) < (needed as isize) {
+                    continue;
+                }
+            }
+            if let Some(a) = anchor1_pos {
+                if c.token_str[a] == tokens[a] {
+                    sim_tokens += 1;
+                }
+                remaining -= 1;
+                if sim_tokens + (remaining as isize) < (needed as isize) {
+                    continue;
+                }
+            }
+
+            for &idx in &c.non_param_idx {
+                if anchor0_pos == Some(idx) || anchor1_pos == Some(idx) {
+                    continue;
+                }
+                if c.token_str[idx] == tokens[idx] {
+                    sim_tokens += 1;
+                }
+                remaining -= 1;
+                if sim_tokens + (remaining as isize) < (needed as isize) {
+                    break;
+                }
+            }
+
+            if sim_tokens > best_score
+                || (sim_tokens == best_score && param_count as isize > best_param_count)
+            {
+                best_score = sim_tokens;
+                best_param_count = param_count as isize;
+                best_cluster = Some(c);
+                if exact_mode && sim_tokens >= (needed as isize) {
+                    return best_cluster;
+                }
+            }
+        }
+
+        if exact_mode {
+            None
+        } else if best_score >= (needed as isize) {
+            best_cluster
+        } else {
+            None
+        }
+    }
+    fn create_cluster(&mut self, tokens: Vec<Arc<str>>) -> Result<ClusterId, Error> {
+        let mut token_ids = Vec::new();
+        self.intern_token_ids(&tokens, &mut token_ids);
+        // Reuse a slot freed by LRU eviction before extending the clusters Vec.
+        let id = if let Some(reused) = self.free_ids.pop() {
+            ClusterId(reused)
+        } else {
+            let id = self.next_cluster_id;
+            self.next_cluster_id = ClusterId(self.next_cluster_id.0 + 1);
+            id
+        };
+        let cl = Cluster::new(id, tokens, token_ids, self.param_id);
+        if id.0 >= self.clusters.len() {
+            self.clusters.resize_with(id.0 + 1, || None);
+        }
+        self.clusters[id.0] = Some(cl);
+        self.add_seq_to_prefix_tree(id)?;
+        // Newly-created clusters are by definition the most recently used.
+        self.lru_push_tail(id.0);
+        Ok(id)
+    }
+    pub fn add_log_message(&mut self, content: &str) -> Result<Template, Error> {
+        let tc = self.tokenize_input(content).ok_or(Error::LineTooLong {
+            length: content.len(),
+            max_bytes: self.cfg.max_bytes,
+        })?;
+        if tc >= self.root_by_len.len() {
+            // Token count not seen before — a new cluster will be created.
+            // Evict the LRU cluster first if we're already at the cap so the
+            // matcher continues to learn new patterns instead of stalling.
+            if self.cfg.max_clusters > 0 && self.cluster_count() >= self.cfg.max_clusters {
+                self.evict_lru();
+            }
+            let tokens = self.token_buf.lock().unwrap().clone();
+            let cid = self.create_cluster(tokens)?;
+            let cluster = self.clusters[cid.0]
+                .as_ref()
+                .ok_or(Error::ClusterNotFound { id: cid.0 })?;
+            return Ok(cluster.to_template(&self.interner, self.param_id));
+        }
+        let token_buf = self.token_buf.lock().unwrap();
+        if let Some(c) =
+            self.tree_search_with_threshold(&token_buf, self.cfg.similarity_threshold, false)
+        {
+            let cid = c.id;
+            let mut changed = false;
+            let cluster = self.clusters[cid.0]
+                .as_mut()
+                .ok_or(Error::ClusterNotFound { id: cid.0 })?;
+            for (i, tok) in token_buf.iter().enumerate().take(cluster.token_str.len()) {
+                if cluster.token_ids[i] == self.param_id {
+                    continue;
+                }
+                if cluster.token_str[i] != *tok {
+                    cluster.token_ids[i] = self.param_id;
+                    cluster.token_str[i] = self.cfg.param_string.clone();
+                    cluster.param_count += 1;
+                    changed = true;
+                }
+            }
+            if changed {
+                cluster.rebuild_indices(self.param_id);
+            }
+            cluster.count += 1;
+            let template = cluster.to_template(&self.interner, self.param_id);
+            // Drop the token_buf guard and the cluster borrow before mutating
+            // the LRU list (which re-borrows the clusters vec mutably).
+            drop(token_buf);
+            self.lru_touch(cid.0);
+            return Ok(template);
+        }
+        drop(token_buf);
+        if self.cfg.max_clusters > 0 && self.cluster_count() >= self.cfg.max_clusters {
+            self.evict_lru();
+        }
+        let tokens = self.token_buf.lock().unwrap().clone();
+        let cid = self.create_cluster(tokens)?;
+        let cluster = self.clusters[cid.0]
+            .as_ref()
+            .ok_or(Error::ClusterNotFound { id: cid.0 })?;
+        Ok(cluster.to_template(&self.interner, self.param_id))
+    }
+    fn add_seq_to_prefix_tree(&mut self, cluster_id: ClusterId) -> Result<(), Error> {
+        let cluster = self.clusters[cluster_id.0]
+            .as_ref()
+            .ok_or(Error::ClusterNotFound { id: cluster_id.0 })?;
+        let tc = cluster.token_ids.len();
+        if tc >= self.root_by_len.len() {
+            self.root_by_len.resize_with(tc + 1, || None);
+        }
+        if self.root_by_len[tc].is_none() {
+            let idx = self.nodes.len();
+            self.nodes.push(Node::new());
+            self.root_by_len[tc] = Some(idx);
+        }
+        let mut cur_idx =
+            self.root_by_len[tc].ok_or(Error::InternalRootNotInitialized { token_count: tc })?;
+        // Index of the leaf node the cluster ultimately lands in. Stored on
+        // the cluster so LRU eviction can scrub it from the tree later.
+        let leaf_idx;
+        if tc == 0 {
+            self.nodes[cur_idx].cluster_ids.push(cluster_id);
+            leaf_idx = cur_idx;
+        } else {
+            let mut leaf = cur_idx;
+            for (i, &token_id) in cluster.token_ids.iter().enumerate() {
+                let cur_depth = i + 1;
+                if cur_depth >= self.cfg.depth - 2 || cur_depth >= tc {
+                    self.nodes[cur_idx].cluster_ids.push(cluster_id);
+                    leaf = cur_idx;
+                    break;
+                }
+                let key = {
+                    let node = &self.nodes[cur_idx];
+                    if node.children.contains_key(&token_id) {
+                        token_id
+                    } else if self.cfg.parametrize_numeric_tokens
+                        && tokenizer::has_numbers(&cluster.token_str[i])
+                    {
+                        self.param_id
+                    } else {
+                        let specific_count = node.children.len();
+                        let has_wild = node.children.contains_key(&self.param_id);
+                        let available = self.cfg.max_children - 1;
+                        if specific_count < available
+                            || (!has_wild && specific_count < self.cfg.max_children - 1)
+                        {
+                            token_id
+                        } else {
+                            self.param_id
+                        }
+                    }
+                };
+                if !self.nodes[cur_idx].children.contains_key(&key) {
+                    let new_idx = self.nodes.len();
+                    self.nodes.push(Node::new());
+                    self.nodes[cur_idx].children.insert(key, new_idx);
+                }
+                cur_idx = self.nodes[cur_idx].children[&key];
+            }
+            leaf_idx = leaf;
+        }
+        // Drop the immutable borrow before re-borrowing as mutable.
+        let _ = cluster;
+        if let Some(c) = self.clusters[cluster_id.0].as_mut() {
+            c.node_idx = Some(leaf_idx);
+        }
+        Ok(())
+    }
+    fn sync_templates_from_clusters(&mut self) {
+        let mut out: Vec<Template> = Vec::with_capacity(self.clusters.len().saturating_sub(1));
+        for id in 1..self.clusters.len() {
+            let Some(c) = self.clusters[id].as_ref() else {
+                continue;
+            };
+            out.push(c.to_template(&self.interner, self.param_id));
+        }
+        out.sort_by_key(|b| std::cmp::Reverse(b.count()));
+        self.templates = out;
+    }
+    fn finalize_training(&mut self) {
+        self.sync_templates_from_clusters();
+        self.rebuild_min_match_scores();
+        self.prefilter_buckets = prefilter::rebuild_match_prefilter(&self.clusters, self.param_id);
+        self.has_param_first = self.clusters.iter().skip(1).any(|c| {
+            c.as_ref()
+                .is_some_and(|cl| cl.token_ids.first() == Some(&self.param_id))
+        });
+    }
+}
+/// Train a matcher with the provided config.
+pub fn train(samples: &[String], cfg: Config) -> Result<Matcher, Error> {
+    cfg.validate()?;
+    let mut m = Matcher::new(cfg);
+    for sample in samples {
+        m.add_log_message(sample)?;
+    }
+    m.finalize_training();
+    Ok(m)
+}
+/// Rebuild a matcher from pre-existing templates.
+pub fn matcher_from_templates(cfg: Config, templates: &[Template]) -> Result<Matcher, Error> {
+    cfg.validate()?;
+    let mut m = Matcher::new(cfg);
+    if templates.is_empty() {
+        m.finalize_training();
+        return Ok(m);
+    }
+    let mut sorted: Vec<_> = templates.to_vec();
+    sorted.sort_by_key(|t| t.id());
+    let mut seen = std::collections::HashSet::new();
+    let mut max_id = 0;
+    for t in &sorted {
+        if t.id() == 0 {
+            return Err(Error::InvalidTemplateId { id: t.id() });
+        }
+        if !seen.insert(t.id()) {
+            return Err(Error::DuplicateTemplateId { id: t.id() });
+        }
+        if t.count() == 0 {
+            return Err(Error::ZeroCountTemplate { id: t.id() });
+        }
+        if t.id() > max_id {
+            max_id = t.id();
+        }
+    }
+    m.clusters.resize_with(max_id + 1, || None);
+    m.next_cluster_id = ClusterId(max_id + 1);
+    for t in &sorted {
+        let mut full: Vec<Arc<str>> = vec![Arc::from(""); t.token_count()];
+        let mut dense_idx = 0;
+        for (i, slot) in full.iter_mut().enumerate().take(t.token_count()) {
+            if t.is_param(i) {
+                *slot = m.cfg.param_string.clone();
+            } else {
+                *slot = t.tokens()[dense_idx].clone();
+                dense_idx += 1;
+            }
+        }
+        let mut token_ids = Vec::new();
+        m.intern_token_ids(&full, &mut token_ids);
+        let cl = Cluster::new(ClusterId(t.id()), full, token_ids, m.param_id);
+        m.clusters[t.id()] = Some(cl);
+    }
+    for id in 1..m.clusters.len() {
+        if m.clusters[id].is_some() {
+            m.add_seq_to_prefix_tree(ClusterId(id))?;
+        }
+    }
+    m.finalize_training();
+    Ok(m)
+}
+/// Deterministically sample lines as fixed-size blocks at regular strides
+/// with random jitter inside each stride window.
+///
+/// The target sample count is `frac * len(lines)`, but the actual returned
+/// count is rounded up to the nearest multiple of `block_size` (capped by the
+/// input length) because entire blocks are appended per stride.
+///
+/// Uses a seeded rng derived from the input length so the result is
+/// deterministic — same input produces the same sample across runs.
+pub fn stride_sample(lines: &[String], frac: f64, block_size: usize) -> Vec<String> {
+    let total = lines.len();
+    if total == 0 {
+        return Vec::new();
+    }
+    let sample_n = (total as f64 * frac) as usize;
+    if sample_n == 0 {
+        return Vec::new();
+    }
+    let num_blocks = (sample_n / block_size).max(1);
+    let stride = (total / num_blocks).max(block_size);
+    let mut rng = fastrand::Rng::with_seed(total as u64);
+    let mut out: Vec<String> = Vec::with_capacity(sample_n);
+    let mut start = 0usize;
+    while start < total && out.len() < sample_n {
+        let max_offset = stride.saturating_sub(block_size).max(1);
+        let offset = start + (rng.u32(..max_offset as u32) as usize);
+        if offset >= total {
+            break;
+        }
+        let end = (offset + block_size).min(total);
+        out.extend(lines[offset..end].iter().cloned());
+        start += stride;
+    }
+    out
+}
+
+#[cfg(test)]
+mod lru_tests {
+    use super::*;
+
+    fn small_matcher(max_clusters: usize) -> Matcher {
+        Matcher::new(
+            Config::builder()
+                .depth(3)
+                .similarity_threshold(0.4)
+                .max_clusters(max_clusters)
+                .parametrize_numeric_tokens(false)
+                .build(),
+        )
+    }
+
+    /// Build a line with `len` unique tokens. Each distinct length lands in
+    /// a separate tree branch (drain only merges clusters with matching
+    /// token counts), giving us a reliable way to manufacture distinct
+    /// clusters without fighting the similarity heuristic.
+    fn distinct_line(len: usize) -> String {
+        (0..len)
+            .map(|i| format!("tok{i}"))
+            .collect::<Vec<_>>()
+            .join(" ")
+    }
+
+    #[test]
+    fn cluster_count_reflects_active_clusters() {
+        let mut m = small_matcher(0);
+        m.add_log_message(&distinct_line(1)).unwrap();
+        m.add_log_message(&distinct_line(2)).unwrap();
+        m.add_log_message(&distinct_line(3)).unwrap();
+        assert_eq!(m.cluster_count(), 3);
+    }
+
+    #[test]
+    fn lru_evicts_least_recently_used_cluster() {
+        // Cap of two clusters. After the third distinct line, the matcher
+        // must evict whichever of the first two was least recently touched.
+        let mut m = small_matcher(2);
+        let a = distinct_line(1);
+        let b = distinct_line(2);
+        let c = distinct_line(3);
+        m.add_log_message(&a).unwrap(); // cluster A
+        m.add_log_message(&b).unwrap(); // cluster B
+        // Touch A again so B becomes the LRU candidate.
+        m.add_log_message(&a).unwrap();
+        // Third distinct pattern forces eviction; total stays at the cap.
+        m.add_log_message(&c).unwrap();
+        assert_eq!(m.cluster_count(), 2);
+        // Re-issuing the recently-touched A must still hit an existing
+        // cluster — total stays at the cap.
+        m.add_log_message(&a).unwrap();
+        assert_eq!(m.cluster_count(), 2);
+        // Re-issuing the LRU B must create a NEW cluster (confirms B was
+        // the one evicted). That pushes us over the cap, triggering another
+        // eviction — net effect: still capped at 2.
+        m.add_log_message(&b).unwrap();
+        assert_eq!(m.cluster_count(), 2);
+    }
+
+    #[test]
+    fn lru_reuses_evicted_cluster_ids() {
+        // With a cap of two and a parade of distinct never-seen patterns,
+        // freed ids must be recycled — the clusters vec must stay bounded.
+        let mut m = small_matcher(2);
+        for n in 1..=50 {
+            // Distinct token count per iteration → guaranteed new cluster.
+            // We deliberately start at 1 so the matcher always sees a
+            // never-before-seen length on every call.
+            m.add_log_message(&distinct_line(n)).unwrap();
+        }
+        assert_eq!(m.cluster_count(), 2);
+        assert!(
+            m.clusters.len() <= 4,
+            "clusters vec grew unexpectedly: len={}",
+            m.clusters.len()
+        );
+    }
+
+    #[test]
+    fn no_eviction_when_max_clusters_is_zero() {
+        let mut m = small_matcher(0);
+        for n in 1..=20 {
+            m.add_log_message(&distinct_line(n)).unwrap();
+        }
+        assert_eq!(m.cluster_count(), 20);
+    }
+
+    /// Walk the LRU list from head to tail (and back) and verify it
+    /// contains exactly the active clusters with consistent prev/next
+    /// links. Returns the ordered ids head→tail.
+    fn lru_order(m: &Matcher) -> Vec<usize> {
+        let mut forward = Vec::new();
+        let mut cur = m.lru_head;
+        while let Some(id) = cur {
+            forward.push(id);
+            cur = m.clusters[id].as_ref().unwrap().lru_next;
+        }
+        let mut backward = Vec::new();
+        let mut cur = m.lru_tail;
+        while let Some(id) = cur {
+            backward.push(id);
+            cur = m.clusters[id].as_ref().unwrap().lru_prev;
+        }
+        backward.reverse();
+        assert_eq!(
+            forward, backward,
+            "LRU forward/backward traversals disagree"
+        );
+        forward
+    }
+
+    #[test]
+    fn lru_order_matches_recency_under_interleaving() {
+        // Hold a cap of 3 and exercise the DLL with interleaved touches
+        // and evictions. Confirms head==LRU and tail==MRU after each step.
+        let mut m = small_matcher(3);
+        let a = distinct_line(1);
+        let b = distinct_line(2);
+        let c = distinct_line(3);
+        let d = distinct_line(4);
+
+        let ida = m.add_log_message(&a).unwrap().id();
+        let idb = m.add_log_message(&b).unwrap().id();
+        let idc = m.add_log_message(&c).unwrap().id();
+        // Order after the first 3 inserts: head=A, ..., tail=C
+        assert_eq!(lru_order(&m), vec![ida, idb, idc]);
+
+        // Touch A — should move to tail. Order: B, C, A
+        m.add_log_message(&a).unwrap();
+        assert_eq!(lru_order(&m), vec![idb, idc, ida]);
+
+        // New pattern D forces eviction of head (B). Drain re-uses freed
+        // cluster ids, so the new D cluster lands in B's old slot — i.e.
+        // `idd == idb`. Verify by structure, not by id: head should be C,
+        // followed by A, with the brand-new D cluster on the tail.
+        let idd = m.add_log_message(&d).unwrap().id();
+        assert_eq!(idd, idb, "D should recycle the evicted cluster id");
+        let order = lru_order(&m);
+        assert_eq!(order, vec![idc, ida, idd]);
+    }
+
+    #[test]
+    fn hot_clusters_survive_long_churn() {
+        // Three "hot" patterns plus a long tail of one-shot patterns. The
+        // hot ones get touched on every iteration; the cold ones never
+        // come back. After many rounds the hot patterns must still match
+        // an existing cluster (not be evicted).
+        let mut m = small_matcher(5);
+        let hot = [distinct_line(1), distinct_line(2), distinct_line(3)];
+        let hot_ids: Vec<usize> = hot
+            .iter()
+            .map(|line| m.add_log_message(line).unwrap().id())
+            .collect();
+
+        // Fire 50 distinct cold patterns interleaved with re-hits on the
+        // hot set. The hot ones must keep getting promoted to MRU.
+        for n in 0..50 {
+            for line in &hot {
+                m.add_log_message(line).unwrap();
+            }
+            // Cold pattern (token count from 4..) — never repeated.
+            m.add_log_message(&distinct_line(4 + n)).unwrap();
+        }
+
+        // Hot ids should still resolve to existing clusters with no new
+        // allocations — re-feeding them returns the same template ids.
+        for (line, &expected_id) in hot.iter().zip(hot_ids.iter()) {
+            let id = m.add_log_message(line).unwrap().id();
+            assert_eq!(id, expected_id, "hot cluster for {line:?} was evicted");
+        }
+        assert_eq!(m.cluster_count(), 5);
+    }
+}
diff --git a/lib/drain-log/src/prefilter.rs b/lib/drain-log/src/prefilter.rs
new file mode 100644
index 0000000000000..9980e09952c76
--- /dev/null
+++ b/lib/drain-log/src/prefilter.rs
@@ -0,0 +1,219 @@
+use smallvec::SmallVec;
+use std::collections::HashMap;
+
+use crate::tree::Cluster;
+use crate::{BucketBackend, ClusterId, StringInterner, TokenId};
+
+/// Packs two token IDs into a single 64-bit key for the first-last prefilter index.
+/// Layout: lower 32 bits = first token ID, upper 32 bits = last token ID.
+/// This replaces the raw bit-manipulation `(first << 32) | (last & 0xFFFFFFFF)`.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub struct FirstLastKey(u64);
+
+impl FirstLastKey {
+    /// Create a FirstLastKey from two token ID values (u64).
+    pub fn from_token_ids(first: u64, last: u64) -> Self {
+        FirstLastKey((first & 0xFFFFFFFF) | (last << 32))
+    }
+
+    /// Return the packed u64 value for use as a lookup key.
+    pub fn pack(&self) -> u64 {
+        self.0
+    }
+}
+
+/** Bucket of cluster ids indexed by first / last token for a single token-count
+ *  length. Built once after training, read-only during matching. */
+#[derive(Debug, Default, Clone)]
+pub struct PrefilterBucket {
+    pub any: Vec<ClusterId>,
+    pub first_keys: Vec<TokenId>,
+    pub first_vals: Vec<Vec<ClusterId>>,
+    pub last_keys: Vec<TokenId>,
+    pub last_vals: Vec<Vec<ClusterId>>,
+    pub fl_keys: Vec<TokenId>,
+    pub fl_vals: Vec<Vec<ClusterId>>,
+}
+
+/** Rebuild prefilter buckets from the current set of clusters.
+ *
+ *  Called automatically by [`Matcher::finalize_training`][crate::Matcher]. */
+pub fn rebuild_match_prefilter(
+    clusters: &[Option<Cluster>],
+    param_id: TokenId,
+) -> Vec<PrefilterBucket> {
+    let mut any_by_tc: HashMap<usize, Vec<ClusterId>> = HashMap::new();
+    let mut first_by_tc: HashMap<usize, HashMap<TokenId, Vec<ClusterId>>> = HashMap::new();
+    let mut last_by_tc: HashMap<usize, HashMap<TokenId, Vec<ClusterId>>> = HashMap::new();
+    let mut fl_by_tc: HashMap<usize, HashMap<TokenId, Vec<ClusterId>>> = HashMap::new();
+    let mut max_len = 0usize;
+
+    for (id, cluster) in clusters.iter().enumerate().skip(1) {
+        let Some(c) = cluster.as_ref() else {
+            continue;
+        };
+
+        let token_count = c.token_ids.len();
+        if token_count > max_len {
+            max_len = token_count;
+        }
+        if token_count == 0 {
+            any_by_tc.entry(0).or_default().push(ClusterId(id));
+            continue;
+        }
+
+        let first_id = c.token_ids[0];
+        let last_id = c.token_ids[token_count - 1];
+        let first_is_param = first_id == param_id;
+        let last_is_param = last_id == param_id;
+
+        match (first_is_param, last_is_param) {
+            (true, true) => {
+                any_by_tc
+                    .entry(token_count)
+                    .or_default()
+                    .push(ClusterId(id));
+            }
+            (false, true) => {
+                first_by_tc
+                    .entry(token_count)
+                    .or_default()
+                    .entry(first_id)
+                    .or_default()
+                    .push(ClusterId(id));
+            }
+            (true, false) => {
+                last_by_tc
+                    .entry(token_count)
+                    .or_default()
+                    .entry(last_id)
+                    .or_default()
+                    .push(ClusterId(id));
+            }
+            (false, false) => {
+                let combined = TokenId(FirstLastKey::from_token_ids(first_id.0, last_id.0).pack());
+                fl_by_tc
+                    .entry(token_count)
+                    .or_default()
+                    .entry(combined)
+                    .or_default()
+                    .push(ClusterId(id));
+            }
+        }
+    }
+
+    let mut buckets = vec![PrefilterBucket::default(); max_len + 1];
+    for (tc, ids) in any_by_tc {
+        buckets[tc].any = ids;
+    }
+    for (tc, mm) in first_by_tc {
+        let (keys, vals) = sorted_token_id_keys(mm);
+        buckets[tc].first_keys = keys;
+        buckets[tc].first_vals = vals;
+    }
+    for (tc, mm) in last_by_tc {
+        let (keys, vals) = sorted_token_id_keys(mm);
+        buckets[tc].last_keys = keys;
+        buckets[tc].last_vals = vals;
+    }
+    for (tc, mm) in fl_by_tc {
+        let (keys, vals) = sorted_token_id_keys(mm);
+        buckets[tc].fl_keys = keys;
+        buckets[tc].fl_vals = vals;
+    }
+
+    buckets
+}
+
+/** Look up candidate cluster ids for a tokenized line using first/last token
+ *  indexes. Returns `None` when no candidates exist. */
+pub fn prefilter_candidates_compact<'a>(
+    buckets: &'a [PrefilterBucket],
+    interner: &'a StringInterner<BucketBackend<usize>>,
+    param_id: TokenId,
+    tokens: &[std::sync::Arc<str>],
+    dst: &mut SmallVec<[ClusterId; 16]>,
+) -> Option<()> {
+    let tc = tokens.len();
+    let b = buckets.get(tc)?;
+
+    // Fast path: no tokens → only "any" bucket applies
+    if tc == 0 {
+        return merge_prefilter_groups(&b.any[..], &[], &[], &[], dst);
+    }
+
+    let first_id = interner
+        .get(tokens[0].as_ref())
+        .map(TokenId::from)
+        .unwrap_or(param_id);
+    let last_id = interner
+        .get(tokens[tc - 1].as_ref())
+        .map(TokenId::from)
+        .unwrap_or(param_id);
+    let first_known = first_id != param_id;
+    let last_known = last_id != param_id;
+
+    let first = if first_known {
+        search_sorted_token_id(&b.first_keys, &b.first_vals, first_id)
+    } else {
+        &[]
+    };
+    let last = if last_known {
+        search_sorted_token_id(&b.last_keys, &b.last_vals, last_id)
+    } else {
+        &[]
+    };
+    let first_last = if first_known && last_known {
+        let combined = TokenId(FirstLastKey::from_token_ids(first_id.0, last_id.0).pack());
+        search_sorted_token_id(&b.fl_keys, &b.fl_vals, combined)
+    } else {
+        &[]
+    };
+
+    merge_prefilter_groups(&b.any[..], first, last, first_last, dst)
+}
+
+fn merge_prefilter_groups(
+    any: &[ClusterId],
+    first: &[ClusterId],
+    last: &[ClusterId],
+    first_last: &[ClusterId],
+    dst: &mut SmallVec<[ClusterId; 16]>,
+) -> Option<()> {
+    let groups: [&[ClusterId]; 4] = [any, first, last, first_last];
+    let non_empty = groups.iter().filter(|g| !g.is_empty()).count();
+    if non_empty == 0 {
+        return None;
+    }
+    if non_empty == 1 {
+        dst.clear();
+        let group = groups.into_iter().find(|g| !g.is_empty()).unwrap();
+        dst.extend_from_slice(group);
+        return Some(());
+    }
+    dst.clear();
+    dst.reserve(any.len() + first.len() + last.len() + first_last.len());
+    dst.extend_from_slice(any);
+    dst.extend_from_slice(first);
+    dst.extend_from_slice(last);
+    dst.extend_from_slice(first_last);
+    Some(())
+}
+
+fn search_sorted_token_id<'a>(
+    keys: &'a [TokenId],
+    vals: &'a [Vec<ClusterId>],
+    target: TokenId,
+) -> &'a [ClusterId] {
+    keys.binary_search(&target)
+        .map(|i| &vals[i][..])
+        .unwrap_or(&[])
+}
+
+fn sorted_token_id_keys(
+    m: HashMap<TokenId, Vec<ClusterId>>,
+) -> (Vec<TokenId>, Vec<Vec<ClusterId>>) {
+    let mut items: Vec<(TokenId, Vec<ClusterId>)> = m.into_iter().collect();
+    items.sort_unstable_by_key(|(k, _)| *k);
+    items.into_iter().unzip()
+}
diff --git a/lib/drain-log/src/render.rs b/lib/drain-log/src/render.rs
new file mode 100644
index 0000000000000..f42b2d4018a27
--- /dev/null
+++ b/lib/drain-log/src/render.rs
@@ -0,0 +1,78 @@
+use crate::Template;
+
+#[derive(Debug, Clone)]
+pub struct RenderPlan {
+    head: Vec<u8>,
+    segments: Vec<RenderSegment>,
+    max_size: usize,
+}
+
+#[derive(Debug, Clone)]
+struct RenderSegment {
+    arg_idx: usize,
+    tail: Vec<u8>,
+}
+
+impl RenderPlan {
+    pub fn new(t: &Template, max_arg_len: Option<&dyn Fn(usize) -> usize>) -> Self {
+        let mut head: Vec<u8> = Vec::new();
+        let mut segments: Vec<RenderSegment> = Vec::new();
+        let mut arg_idx = 0usize;
+        let mut tok_idx = 0usize;
+        let mut cur: Vec<u8> = Vec::new();
+
+        for i in 0..t.token_count() {
+            if i > 0 {
+                cur.push(b' ');
+            }
+            if t.is_param(i) {
+                if let Some(last) = segments.last_mut() {
+                    last.tail = cur;
+                } else {
+                    head = cur;
+                }
+                segments.push(RenderSegment {
+                    arg_idx,
+                    tail: Vec::new(),
+                });
+                cur = Vec::new();
+                arg_idx += 1;
+            } else {
+                cur.extend_from_slice(t.tokens()[tok_idx].as_bytes());
+                tok_idx += 1;
+            }
+        }
+        if let Some(last) = segments.last_mut() {
+            last.tail = cur;
+        } else {
+            head = cur;
+        }
+
+        let mut max_size = head.len();
+        for seg in &segments {
+            max_size += seg.tail.len();
+            if let Some(f) = max_arg_len {
+                max_size += f(seg.arg_idx);
+            }
+        }
+        Self {
+            head,
+            segments,
+            max_size,
+        }
+    }
+
+    pub fn max_size(&self) -> usize {
+        self.max_size
+    }
+
+    pub fn append(&self, dst: &mut Vec<u8>, args: Option<&[&str]>) {
+        dst.extend_from_slice(&self.head);
+        for seg in &self.segments {
+            if let Some(s) = args.and_then(|a| a.get(seg.arg_idx)) {
+                dst.extend_from_slice(s.as_bytes());
+            }
+            dst.extend_from_slice(&seg.tail);
+        }
+    }
+}
diff --git a/lib/drain-log/src/tokenizer.rs b/lib/drain-log/src/tokenizer.rs
new file mode 100644
index 0000000000000..a6fff3dee4339
--- /dev/null
+++ b/lib/drain-log/src/tokenizer.rs
@@ -0,0 +1,65 @@
+use std::sync::Arc;
+
+pub fn tokenize_whitespace_count(
+    content: &str,
+    dst: &mut Vec<Arc<str>>,
+    max_tokens: usize,
+) -> usize {
+    if content.is_empty() || max_tokens == 0 {
+        return 0;
+    }
+    dst.clear();
+    let bytes = content.as_bytes();
+    let mut start = 0;
+    let mut count = 1;
+    for i in 0..bytes.len() {
+        if bytes[i] != b' ' {
+            continue;
+        }
+        dst.push(Arc::from(std::str::from_utf8(&bytes[start..i]).unwrap()));
+        start = i + 1;
+        if count >= max_tokens {
+            return count + 1;
+        }
+        count += 1;
+    }
+    dst.push(Arc::from(std::str::from_utf8(&bytes[start..]).unwrap()));
+    count
+}
+
+pub fn tokenize(
+    content: &str,
+    extra_delimiters: &[String],
+    max_tokens: usize,
+    dst: &mut Vec<Arc<str>>,
+) {
+    dst.clear();
+    let trimmed = content.trim();
+    if trimmed.is_empty() {
+        return;
+    }
+
+    // Fast path: no extra delimiters - use split_whitespace (zero allocation)
+    if extra_delimiters.is_empty() {
+        dst.reserve(16.min(max_tokens));
+        for t in trimmed.split_whitespace().take(max_tokens) {
+            dst.push(Arc::from(t));
+        }
+        return;
+    }
+
+    // Slow path: need delimiter replacement
+    let mut s = trimmed.to_string();
+    for d in extra_delimiters {
+        if !d.is_empty() {
+            s = s.replace(d, " ");
+        }
+    }
+    for t in s.split(' ').filter(|t| !t.is_empty()).take(max_tokens) {
+        dst.push(Arc::from(t));
+    }
+}
+
+pub fn has_numbers(s: &str) -> bool {
+    s.bytes().any(|b| b.is_ascii_digit())
+}
diff --git a/lib/drain-log/src/tree.rs b/lib/drain-log/src/tree.rs
new file mode 100644
index 0000000000000..4cd6379a84330
--- /dev/null
+++ b/lib/drain-log/src/tree.rs
@@ -0,0 +1,113 @@
+use std::sync::Arc;
+
+use string_interner::backend::BucketBackend;
+use string_interner::StringInterner;
+
+use crate::{ClusterId, Template, TokenId};
+
+pub(crate) struct Cluster {
+    pub id: ClusterId,
+    pub count: usize,
+    pub param_count: usize,
+    pub token_str: Vec<Arc<str>>,
+    pub token_ids: Vec<TokenId>,
+    pub non_param_idx: Vec<usize>,
+    pub param_positions: Vec<usize>,
+    pub anchor0: Option<usize>,
+    pub anchor1: Option<usize>,
+    /// Index of the tree node holding this cluster id in its `cluster_ids`
+    /// list. Stored so eviction can scrub the cluster from the tree in
+    /// O(node list length) without walking the tree from the root. `None`
+    /// until the cluster has been threaded into the tree.
+    pub node_idx: Option<usize>,
+    /// Previous cluster id in the LRU doubly-linked list (older / closer to
+    /// the head). `None` means this cluster is currently the LRU head.
+    pub lru_prev: Option<usize>,
+    /// Next cluster id in the LRU doubly-linked list (newer / closer to the
+    /// tail). `None` means this cluster is currently the MRU tail.
+    pub lru_next: Option<usize>,
+}
+
+impl Cluster {
+    pub fn new(
+        id: ClusterId,
+        token_str: Vec<Arc<str>>,
+        token_ids: Vec<TokenId>,
+        param_id: TokenId,
+    ) -> Self {
+        let mut s = Self {
+            id,
+            count: 1,
+            param_count: 0,
+            token_str,
+            token_ids,
+            non_param_idx: Vec::new(),
+            param_positions: Vec::new(),
+            anchor0: None,
+            anchor1: None,
+            node_idx: None,
+            lru_prev: None,
+            lru_next: None,
+        };
+        s.rebuild_indices(param_id);
+        s
+    }
+
+    pub fn rebuild_indices(&mut self, param_id: TokenId) {
+        self.non_param_idx.clear();
+        self.param_positions.clear();
+        self.param_count = 0;
+        for (i, &tid) in self.token_ids.iter().enumerate() {
+            if tid == param_id {
+                self.param_count += 1;
+                self.param_positions.push(i);
+            } else {
+                self.non_param_idx.push(i);
+            }
+        }
+        self.anchor0 = self.non_param_idx.first().copied();
+        self.anchor1 = if self.non_param_idx.len() >= 2 {
+            self.non_param_idx.last().copied()
+        } else {
+            None
+        };
+    }
+
+    pub fn to_template(
+        &self,
+        interner: &StringInterner<BucketBackend<usize>>,
+        param_id: TokenId,
+    ) -> Template {
+        let token_count = self.token_ids.len();
+        let mut params = vec![false; token_count];
+        let mut dense = Vec::with_capacity(token_count - self.param_count);
+        for (i, &tid) in self.token_ids.iter().enumerate() {
+            if tid == param_id {
+                params[i] = true;
+            } else {
+                dense.push(Arc::from(interner.resolve(usize::from(tid)).unwrap()));
+            }
+        }
+        Template {
+            id: self.id.0,
+            tokens: dense,
+            params,
+            token_count,
+            count: self.count,
+        }
+    }
+}
+
+pub(crate) struct Node {
+    pub children: std::collections::HashMap<TokenId, usize>,
+    pub cluster_ids: Vec<ClusterId>,
+}
+
+impl Node {
+    pub fn new() -> Self {
+        Self {
+            children: std::collections::HashMap::with_capacity_and_hasher(8, Default::default()),
+            cluster_ids: Vec::new(),
+        }
+    }
+}
diff --git a/src/transforms/drain.rs b/src/transforms/drain.rs
new file mode 100644
index 0000000000000..64fbbea13e288
--- /dev/null
+++ b/src/transforms/drain.rs
@@ -0,0 +1,511 @@
+use std::pin::Pin;
+
+use async_stream::stream;
+use drain_log::{Config as DrainLogConfig, Matcher, Template};
+use futures::{Stream, StreamExt};
+use snafu::Snafu;
+use vector_lib::{
+    config::clone_input_definitions,
+    configurable::configurable_component,
+    lookup::{OwnedTargetPath, lookup_v2::OptionalValuePath, owned_value_path},
+};
+
+use crate::{
+    config::{
+        DataType, Input, OutputId, TransformConfig, TransformContext, TransformOutput,
+    },
+    event::{Event, Value},
+    schema,
+    transforms::{TaskTransform, Transform},
+};
+
+const PARAM_STR: &str = "<*>";
+
+const fn default_tree_depth() -> usize {
+    4
+}
+
+const fn default_max_node_children() -> usize {
+    100
+}
+
+const fn default_merge_threshold() -> f64 {
+    0.4
+}
+
+const fn default_max_bytes() -> usize {
+    8192
+}
+
+const fn default_max_tokens() -> usize {
+    256
+}
+
+fn default_field() -> OptionalValuePath {
+    OptionalValuePath::from(owned_value_path!("message"))
+}
+
+fn default_template_field() -> OptionalValuePath {
+    OptionalValuePath::from(owned_value_path!("drain_template"))
+}
+
+/// Configuration for the `drain` transform.
+#[configurable_component(transform(
+    "drain",
+    "Cluster log events with the Drain algorithm and annotate each event with the derived template."
+))]
+#[derive(Clone, Debug)]
+#[serde(deny_unknown_fields)]
+pub struct DrainConfig {
+    /// The log field to read text from before passing it through the Drain
+    /// algorithm.
+    ///
+    /// If the field is missing or not a string, the event is forwarded without
+    /// annotation.
+    #[serde(default = "default_field")]
+    #[configurable(metadata(docs::examples = "message", docs::examples = "body"))]
+    pub field: OptionalValuePath,
+
+    /// The log field to write the derived template string to.
+    ///
+    /// Set to an empty string to disable writing the template (the tree still
+    /// trains on every event). To target an OpenTelemetry-style attribute name
+    /// containing dots — e.g. `log.record.template` — set this to the quoted
+    /// path `"log.record.template"` so the dots are treated as part of a
+    /// single field name rather than a nested path.
+    #[serde(default = "default_template_field")]
+    #[configurable(metadata(docs::examples = "drain_template"))]
+    pub template_field: OptionalValuePath,
+
+    /// Maximum depth of the Drain parse tree (called `depth` in the Drain
+    /// paper). Higher values produce more specific templates. Minimum: 3.
+    #[serde(default = "default_tree_depth")]
+    pub tree_depth: usize,
+
+    /// Minimum fraction of tokens that must match an existing cluster template
+    /// for a log line to be merged into it rather than forming a new cluster
+    /// (called `st` in the Drain paper). Range: 0.0–1.0.
+    #[serde(default = "default_merge_threshold")]
+    pub merge_threshold: f64,
+
+    /// Maximum children per internal parse tree node (called `maxChild` in the
+    /// Drain paper). Bounds memory on high-cardinality token positions.
+    #[serde(default = "default_max_node_children")]
+    pub max_node_children: usize,
+
+    /// Maximum number of clusters to track. Once the limit is reached, the
+    /// least-recently-used cluster is evicted to make room for a new one,
+    /// so the matcher continues to adapt to drifting log vocabularies on
+    /// long-running pipelines without unbounded memory growth. `0` means
+    /// unlimited (no eviction).
+    ///
+    /// Pick a value comfortably above the steady-state pattern count for
+    /// your workload so genuinely useful templates are not churned out by
+    /// transient noise.
+    #[serde(default)]
+    pub max_clusters: usize,
+
+    /// Maximum byte length of a single log line to consider. Lines longer than
+    /// this are skipped (no annotation).
+    #[serde(default = "default_max_bytes")]
+    pub max_bytes: usize,
+
+    /// Maximum number of tokens per line. Lines exceeding this are skipped.
+    #[serde(default = "default_max_tokens")]
+    pub max_tokens: usize,
+
+    /// Additional token delimiters beyond whitespace (for example `[",", ":"]`).
+    #[serde(default)]
+    pub extra_delimiters: Vec<String>,
+
+    /// Pre-known template strings to train on at startup. Improves template
+    /// stability across restarts for known log patterns.
+    #[serde(default)]
+    pub seed_templates: Vec<String>,
+
+    /// Raw example log lines to train on at startup. Drain derives templates
+    /// from these lines itself.
+    #[serde(default)]
+    pub seed_logs: Vec<String>,
+
+    /// Number of distinct clusters that must be observed before annotation is
+    /// enabled. During warmup, events pass through unannotated while the tree
+    /// keeps training. `0` (default) disables warmup suppression.
+    #[serde(default)]
+    pub warmup_min_clusters: usize,
+}
+
+impl Default for DrainConfig {
+    fn default() -> Self {
+        Self {
+            field: default_field(),
+            template_field: default_template_field(),
+            tree_depth: default_tree_depth(),
+            merge_threshold: default_merge_threshold(),
+            max_node_children: default_max_node_children(),
+            max_clusters: 0,
+            max_bytes: default_max_bytes(),
+            max_tokens: default_max_tokens(),
+            extra_delimiters: Vec::new(),
+            seed_templates: Vec::new(),
+            seed_logs: Vec::new(),
+            warmup_min_clusters: 0,
+        }
+    }
+}
+
+impl_generate_config_from_default!(DrainConfig);
+
+#[async_trait::async_trait]
+#[typetag::serde(name = "drain")]
+impl TransformConfig for DrainConfig {
+    async fn build(&self, _context: &TransformContext) -> crate::Result<Transform> {
+        Ok(Transform::event_task(Drain::new(self)?))
+    }
+
+    fn input(&self) -> Input {
+        Input::log()
+    }
+
+    fn outputs(
+        &self,
+        _: &TransformContext,
+        input_definitions: &[(OutputId, schema::Definition)],
+    ) -> Vec<TransformOutput> {
+        // The transform only adds a string field; pass the input schema through.
+        vec![TransformOutput::new(
+            DataType::Log,
+            clone_input_definitions(input_definitions),
+        )]
+    }
+}
+
+#[derive(Debug, Snafu)]
+pub enum BuildError {
+    #[snafu(display("failed to build drain matcher: {source}"))]
+    Build { source: drain_log::Error },
+}
+
+pub struct Drain {
+    /// Pre-resolved event-root path for the source field, or `None` when the
+    /// user supplied an empty path (effectively disabling annotation).
+    field: Option<OwnedTargetPath>,
+    /// Pre-resolved event-root path for the destination field. `None` means
+    /// "train but don't write" (the matcher still updates its tree).
+    template_field: Option<OwnedTargetPath>,
+    warmup_min_clusters: usize,
+    warmed_up: bool,
+    matcher: Matcher,
+}
+
+impl Drain {
+    pub fn new(cfg: &DrainConfig) -> crate::Result<Self> {
+        let drain_cfg = DrainLogConfig::builder()
+            .depth(cfg.tree_depth)
+            .similarity_threshold(cfg.merge_threshold)
+            .max_children(cfg.max_node_children)
+            .max_clusters(cfg.max_clusters)
+            .max_bytes(cfg.max_bytes)
+            .max_tokens(cfg.max_tokens)
+            .extra_delimiters(cfg.extra_delimiters.clone())
+            .build();
+
+        let mut matcher = drain_log::train(&[], drain_cfg)
+            .map_err(|source| Box::new(BuildError::Build { source }))?;
+
+        for tmpl in &cfg.seed_templates {
+            if tmpl.trim().is_empty() {
+                continue;
+            }
+            if let Err(error) = matcher.add_log_message(tmpl) {
+                warn!(
+                    message = "Failed to seed drain template, skipping.",
+                    template = %tmpl,
+                    %error,
+                );
+            }
+        }
+        for line in &cfg.seed_logs {
+            if line.trim().is_empty() {
+                continue;
+            }
+            if let Err(error) = matcher.add_log_message(line) {
+                warn!(
+                    message = "Failed to seed drain log line, skipping.",
+                    line = %line,
+                    %error,
+                );
+            }
+        }
+
+        let warmed_up = cfg.warmup_min_clusters == 0
+            || matcher.cluster_count() >= cfg.warmup_min_clusters;
+
+        let field = cfg.field.path.clone().map(OwnedTargetPath::event);
+        let template_field = cfg.template_field.path.clone().map(OwnedTargetPath::event);
+
+        Ok(Self {
+            field,
+            template_field,
+            warmup_min_clusters: cfg.warmup_min_clusters,
+            warmed_up,
+            matcher,
+        })
+    }
+
+    fn transform_one(&mut self, mut event: Event) -> Event {
+        let Some(field_path) = self.field.as_ref() else {
+            return event;
+        };
+
+        let log = match &mut event {
+            Event::Log(log) => log,
+            _ => return event,
+        };
+
+        let text = match log.get(field_path) {
+            Some(Value::Bytes(b)) => String::from_utf8_lossy(b).into_owned(),
+            _ => return event,
+        };
+
+        if text.is_empty() {
+            return event;
+        }
+
+        let template = match self.matcher.add_log_message(&text) {
+            Ok(t) => t,
+            Err(drain_log::Error::LineTooLong { .. }) => {
+                // The line exceeds the configured max_bytes. Forward without
+                // annotation; logging per-event would be too noisy.
+                return event;
+            }
+            Err(error) => {
+                debug!(message = "Drain training failed; skipping annotation.", %error);
+                return event;
+            }
+        };
+
+        if !self.warmed_up {
+            if self.matcher.cluster_count() >= self.warmup_min_clusters {
+                self.warmed_up = true;
+            } else {
+                return event;
+            }
+        }
+
+        if let Some(template_path) = self.template_field.as_ref() {
+            log.insert(template_path, Value::from(template_to_string(&template)));
+        }
+        event
+    }
+}
+
+fn template_to_string(t: &Template) -> String {
+    let mut out = String::new();
+    let mut tok_idx = 0;
+    for i in 0..t.token_count() {
+        if i > 0 {
+            out.push(' ');
+        }
+        if t.is_param(i) {
+            out.push_str(PARAM_STR);
+        } else {
+            out.push_str(&t.tokens()[tok_idx]);
+            tok_idx += 1;
+        }
+    }
+    out
+}
+
+impl TaskTransform<Event> for Drain {
+    fn transform(
+        self: Box<Self>,
+        mut input_rx: Pin<Box<dyn Stream<Item = Event> + Send>>,
+    ) -> Pin<Box<dyn Stream<Item = Event> + Send>>
+    where
+        Self: 'static,
+    {
+        let mut inner = *self;
+        Box::pin(stream! {
+            while let Some(event) = input_rx.next().await {
+                yield inner.transform_one(event);
+            }
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use indoc::indoc;
+    use tokio::sync::mpsc;
+    use tokio_stream::wrappers::ReceiverStream;
+
+    use super::*;
+    use crate::{
+        event::LogEvent,
+        test_util::components::assert_transform_compliance,
+        transforms::test::create_topology,
+    };
+
+    #[test]
+    fn generate_config() {
+        crate::test_util::test_generate_config::<DrainConfig>();
+    }
+
+    fn log(message: &str) -> Event {
+        Event::Log(LogEvent::from(message))
+    }
+
+    #[tokio::test]
+    async fn annotates_with_template() {
+        let config: DrainConfig = toml::from_str(indoc! {r#"
+            seed_templates = [
+              "user <*> logged in from <*>",
+            ]
+        "#})
+        .unwrap();
+
+        assert_transform_compliance(async move {
+            let (tx, rx) = mpsc::channel(2);
+            let (topology, mut out) = create_topology(ReceiverStream::new(rx), config).await;
+
+            tx.send(log("user alice logged in from 10.0.0.1"))
+                .await
+                .unwrap();
+            tx.send(log("user bob logged in from 192.168.1.1"))
+                .await
+                .unwrap();
+
+            let first = out.recv().await.unwrap();
+            let second = out.recv().await.unwrap();
+
+            let first = first.as_log();
+            let second = second.as_log();
+
+            let t1 = first
+                .get("drain_template")
+                .and_then(|v| v.as_str())
+                .map(|s| s.to_string())
+                .expect("first event should have drain_template");
+            let t2 = second
+                .get("drain_template")
+                .and_then(|v| v.as_str())
+                .map(|s| s.to_string())
+                .expect("second event should have drain_template");
+
+            assert!(
+                t1.contains("user") && t1.contains("logged in from"),
+                "template should retain anchor tokens, got {t1}"
+            );
+            assert_eq!(
+                t1, t2,
+                "both events should resolve to the same template"
+            );
+            assert!(t1.contains("<*>"));
+
+            drop(tx);
+            topology.stop().await;
+            assert_eq!(out.recv().await, None);
+        })
+        .await;
+    }
+
+    #[tokio::test]
+    async fn warmup_suppresses_annotation() {
+        // warmup requires 3 clusters before annotation kicks in. The first
+        // two distinct lines train but don't write the template. The third
+        // (a third distinct cluster) triggers warmup completion and gets
+        // annotated.
+        let config: DrainConfig = toml::from_str(indoc! {r#"
+            warmup_min_clusters = 3
+        "#})
+        .unwrap();
+
+        assert_transform_compliance(async move {
+            let (tx, rx) = mpsc::channel(3);
+            let (topology, mut out) = create_topology(ReceiverStream::new(rx), config).await;
+
+            tx.send(log("alpha event happened")).await.unwrap();
+            tx.send(log("connection refused 5")).await.unwrap();
+            tx.send(log("disk usage 99 percent")).await.unwrap();
+
+            let first = out.recv().await.unwrap();
+            let second = out.recv().await.unwrap();
+            let third = out.recv().await.unwrap();
+
+            assert!(
+                first.as_log().get("drain_template").is_none(),
+                "first event should be unannotated during warmup"
+            );
+            assert!(
+                second.as_log().get("drain_template").is_none(),
+                "second event should be unannotated during warmup"
+            );
+            assert!(
+                third.as_log().get("drain_template").is_some(),
+                "third event should be annotated once warmup completes"
+            );
+
+            drop(tx);
+            topology.stop().await;
+            assert_eq!(out.recv().await, None);
+        })
+        .await;
+    }
+
+    #[tokio::test]
+    async fn missing_field_passes_through() {
+        let config: DrainConfig = toml::from_str(indoc! {r#"
+            field = "body"
+        "#})
+        .unwrap();
+
+        assert_transform_compliance(async move {
+            let (tx, rx) = mpsc::channel(1);
+            let (topology, mut out) = create_topology(ReceiverStream::new(rx), config).await;
+
+            // The event has a `message` field but no `body`, so the transform
+            // should leave it unannotated.
+            tx.send(log("nothing here")).await.unwrap();
+
+            let event = out.recv().await.unwrap();
+            assert!(event.as_log().get("drain_template").is_none());
+
+            drop(tx);
+            topology.stop().await;
+            assert_eq!(out.recv().await, None);
+        })
+        .await;
+    }
+
+    #[tokio::test]
+    async fn custom_template_field() {
+        let config: DrainConfig = toml::from_str(indoc! {r#"
+            template_field = "tpl"
+            seed_templates = ["request <*> handled"]
+        "#})
+        .unwrap();
+
+        assert_transform_compliance(async move {
+            let (tx, rx) = mpsc::channel(1);
+            let (topology, mut out) = create_topology(ReceiverStream::new(rx), config).await;
+
+            tx.send(log("request 42 handled")).await.unwrap();
+            let event = out.recv().await.unwrap();
+
+            assert!(event.as_log().get("drain_template").is_none());
+            let tpl = event
+                .as_log()
+                .get("tpl")
+                .and_then(|v| v.as_str())
+                .expect("custom template field should be set");
+            assert!(tpl.contains("request") && tpl.contains("handled"));
+
+            drop(tx);
+            topology.stop().await;
+            assert_eq!(out.recv().await, None);
+        })
+        .await;
+    }
+
+}
diff --git a/src/transforms/mod.rs b/src/transforms/mod.rs
index 2a9f606df3c6a..e65c772a05e75 100644
--- a/src/transforms/mod.rs
+++ b/src/transforms/mod.rs
@@ -13,6 +13,8 @@ pub mod aggregate;
 pub mod aws_ec2_metadata;
 #[cfg(feature = "transforms-delay")]
 pub mod delay;
+#[cfg(feature = "transforms-drain")]
+pub mod drain;
 #[cfg(feature = "transforms-exclusive-route")]
 mod exclusive_route;
 #[cfg(feature = "transforms-filter")]