From 1a769dded3b27a8aff1f8619187c2971610510f0 Mon Sep 17 00:00:00 2001
From: masklinn <github.com@masklinn.net>
Date: Sat, 21 Mar 2026 17:52:42 +0100
Subject: [PATCH 1/2] Allow devices script to generate regex files for any
 section

as well as add stdin and URL support. Generating regex files out of
`regexes.yaml` is a convenient first step to make subsequent scripts
simpler (e.g. not requiring every one of them to read YAML).

This script could be a yq command (plus an optional curl first step),
but e.g. nix's `yq` is a Python wrapper around `jq` which depends on
pyyaml, so the gain is limited.
---
 scripts/devices | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/scripts/devices b/scripts/devices
index e1fa488..3fd7ee3 100755
--- a/scripts/devices
+++ b/scripts/devices
@@ -31,18 +31,41 @@ r"""Compiles regexes.yaml to just the device regexps, with rewriting:
   Note that this is only done for a lower bound of `0` or `1`, but
   that's the case of all large bounded repetitions in regexes.yaml.
 """
+import argparse
 import string
 import sys
+import urllib.request
 
 from yaml import SafeLoader, load
 
 def main() -> None:
-    with open(sys.argv[1]) as f:
-        regexes = load(f, Loader=SafeLoader)
-    for dev in regexes['device_parsers']:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        'regexes_yaml',
+        help="path or url to a regexes.yaml file, `-` for stdin",
+    )
+    parser.add_argument(
+        '--section',
+        default='device',
+        choices=['user_agent', 'os', 'device'],
+        help="regex section to convert, defaults to `device`"
+    )
+    args = parser.parse_args()
+
+    regexes = args.regexes_yaml
+    if regexes == '-':
+        regexes = load(sys.stdin, Loader=SafeLoader)
+    elif regexes.startswith('http'):
+        with urllib.request.urlopen(regexes) as f:
+            regexes = load(f, Loader=SafeLoader)
+    else:
+        with open(regexes) as f:
+            regexes = load(f, Loader=SafeLoader)
+
+    for item in regexes[f'{args.section}_parsers']:
         print(
-            f'(?{f})' if (f := dev.get('regex_flag')) else '',
-            rewrite(dev['regex']),
+            f'(?{f})' if (f := item.get('regex_flag')) else '',
+            rewrite(item['regex']),
             sep='',
         )
 

From d67517ab467db620a243bfea02850b1e6e9f7343 Mon Sep 17 00:00:00 2001
From: masklinn <github.com@masklinn.net>
Date: Sat, 21 Mar 2026 18:05:00 +0100
Subject: [PATCH 2/2] Add regex index extractors

These three files / scripts are 3 different implementations (python,
regex, regex-filtered) of the same thing: taking a regex set and a
bunch of needles, finding the first matching regex for each needle, and
outputting its index (0-indexed).

This is the core loop of ua-parser, and allows validating that
regex-filtered matches a more naive version of the same process.
Happily I couldn't find any divergence although that means I did a
fair amount of useless work. Also the python version is really slow
compared to even the regex one, so probably don't use that...

`paste` allows using it to combine index extraction of multiple
domains as well as the original needle as TSV documents if that's of
use. This could also be expanded to multi-index extraction if that's a
need for anyone and should be checked more extensively.

Note that only the Python version supports stdin input at this point;
I couldn't be arsed to do that with the Rust ones, but process
substitution ought to work fine anyway? The needles are read on the go
so they should not need to be an actual file.

This may not be in a state fit for performance checking as the output
loop of the rust version is the worst (no buffering, no
stdout-locking).
---
 regex-filtered/examples/matchindex.rs  | 38 ++++++++++++++++++++++++
 regex-filtered/examples/matchindex2.rs | 40 ++++++++++++++++++++++++++
 scripts/matchindex                     | 32 +++++++++++++++++++++
 3 files changed, 110 insertions(+)
 create mode 100644 regex-filtered/examples/matchindex.rs
 create mode 100644 regex-filtered/examples/matchindex2.rs
 create mode 100755 scripts/matchindex

diff --git a/regex-filtered/examples/matchindex.rs b/regex-filtered/examples/matchindex.rs
new file mode 100644
index 0000000..e281d86
--- /dev/null
+++ b/regex-filtered/examples/matchindex.rs
@@ -0,0 +1,38 @@
+use clap::Parser;
+use std::io::BufRead;
+
+#[derive(Parser)]
+struct Args { // command-line arguments, obtained through `Args::parse()` in `main`
+    regexes: String, // path of a file read line by line, each line compiled as a regex
+    useragents: String, // path of a file read line by line, each line matched against the regexes
+}
+
+/// Prints, for each user agent line, the 0-based index of the first regex
+/// matching it, or `-` when none does.
+///
+/// # Panics
+/// Panics if either file cannot be read or a regex fails to compile.
+fn main() {
+    let Args {
+        regexes,
+        useragents,
+    } = Args::parse();
+    // One regex per line, kept in file order: a regex's index is its line index.
+    let regexes: Vec<_> = std::io::BufReader::new(std::fs::File::open(regexes).unwrap())
+        .lines()
+        .map(|l| regex::Regex::new(&l.unwrap()).unwrap())
+        .collect();
+
+    let mut uas = std::io::BufReader::new(std::fs::File::open(useragents).unwrap());
+    let mut line = String::with_capacity(150);
+    // A read error (I/O failure, invalid UTF-8) must abort loudly instead of
+    // silently truncating the output, like every other fallible step here.
+    while uas.read_line(&mut line).expect("failed to read user agents") != 0 {
+        let line_ = line.strip_suffix("\n").unwrap_or(&line);
+        match regexes.iter().position(|regex| regex.is_match(line_)) {
+            Some(i) => println!("{i}"),
+            None => println!("-"),
+        }
+        line.clear();
+    }
+}
diff --git a/regex-filtered/examples/matchindex2.rs b/regex-filtered/examples/matchindex2.rs
new file mode 100644
index 0000000..fac58be
--- /dev/null
+++ b/regex-filtered/examples/matchindex2.rs
@@ -0,0 +1,40 @@
+use clap::Parser;
+use std::io::BufRead;
+
+#[derive(Parser)]
+struct Args { // command-line arguments, obtained through `Args::parse()` in `main`
+    regexes: String, // path of a file whose lines are pushed into the `regex_filtered` builder
+    useragents: String, // path of a file read line by line, each line matched against the set
+}
+
+/// Prints, per user agent line, the index of the first match yielded by the
+/// `regex_filtered` set, or `-` when nothing matches.
+fn main() {
+    let Args {
+        regexes,
+        useragents,
+    } = Args::parse();
+    // Every line of the regex file is pushed, in order, into the filtered set.
+    let regexes = regex_filtered::Builder::new()
+        .push_all(
+            std::io::BufReader::new(std::fs::File::open(regexes).unwrap())
+                .lines()
+                .map(Result::unwrap),
+        )
+        .unwrap()
+        .build()
+        .unwrap();
+
+    let mut uas = std::io::BufReader::new(std::fs::File::open(useragents).unwrap());
+    let mut line = String::with_capacity(150);
+    // A read error (I/O failure, invalid UTF-8) must abort loudly instead of
+    // silently truncating the output, like every other fallible step here.
+    while uas.read_line(&mut line).expect("failed to read user agents") != 0 {
+        let line_ = line.strip_suffix("\n").unwrap_or(&line);
+        match regexes.matching(line_).next() {
+            Some((i, _)) => println!("{i}"),
+            None => println!("-"),
+        }
+        line.clear();
+    }
+}
diff --git a/scripts/matchindex b/scripts/matchindex
new file mode 100755
index 0000000..e880404
--- /dev/null
+++ b/scripts/matchindex
@@ -0,0 +1,32 @@
+#!/usr/bin/env python
+
+import argparse
+import re
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    'regexes',
+    help="regexes to try on the user agents",
+)
+parser.add_argument(
+    'useragents',
+    type=argparse.FileType(),
+    help="user agents to parse, `-` for stdin",
+)
+args = parser.parse_args()
+
+with open(args.regexes) as r:
+    regexes = [
+        re.compile(pattern.rstrip('\n'))
+        for pattern in r
+    ]
+
+with args.useragents as r:
+    for u in r:
+        u = u.rstrip('\n')
+        for i, p in enumerate(regexes):
+            if p.search(u):
+                print(i)
+                break
+        else:
+            print('-')