From 1a769dded3b27a8aff1f8619187c2971610510f0 Mon Sep 17 00:00:00 2001 From: masklinn Date: Sat, 21 Mar 2026 17:52:42 +0100 Subject: [PATCH 1/2] Allow devices script to generate regex files for any section as well as add stdin or URLs support. Generating regex files out of `regexes.yaml` is a convenient first step to make subsequent scripts simpler (e.g. not require every one of them to read yaml). This script could be a yq command (plus an optional curl first step), but e.g. nix's `yq` is a python wrapper around `jq` which depends on pyyaml so the gain is limited. --- scripts/devices | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/scripts/devices b/scripts/devices index e1fa488..3fd7ee3 100755 --- a/scripts/devices +++ b/scripts/devices @@ -31,18 +31,41 @@ r"""Compiles regexes.yaml to just the device regexps, with rewriting: Note that this is only done for a lower bound of `0` or `1`, but that's the case of all large bounded repetitions in regexes.yaml. 
""" +import argparse import string import sys +import urllib.request from yaml import SafeLoader, load def main() -> None: - with open(sys.argv[1]) as f: - regexes = load(f, Loader=SafeLoader) - for dev in regexes['device_parsers']: + parser = argparse.ArgumentParser() + parser.add_argument( + 'regexes_yaml', + help="path or url to a regexes.yaml file, `-` for stdin", + ) + parser.add_argument( + '--section', + default='device', + choices=['user_agent', 'os', 'device'], + help="regex section to convert, defaults to `device`" + ) + args = parser.parse_args() + + regexes = args.regexes_yaml + if regexes == '-': + regexes = load(sys.stdin, Loader=SafeLoader) + elif regexes.startswith('http'): + with urllib.request.urlopen(regexes) as f: + regexes = load(f, Loader=SafeLoader) + else: + with open(regexes) as f: + regexes = load(f, Loader=SafeLoader) + + for item in regexes[f'{args.section}_parsers']: print( - f'(?{f})' if (f := dev.get('regex_flag')) else '', - rewrite(dev['regex']), + f'(?{f})' if (f := item.get('regex_flag')) else '', + rewrite(item['regex']), sep='', ) From d67517ab467db620a243bfea02850b1e6e9f7343 Mon Sep 17 00:00:00 2001 From: masklinn Date: Sat, 21 Mar 2026 18:05:00 +0100 Subject: [PATCH 2/2] Add regex index extractors These three files / scripts are 3 different implementations (python, regex, regex-filtered) of the same thing: taking a regex set and a bunch of needles, for each needle find the first matching regex, and output its index (0-indexed). This is the core loop of ua-parser, and allows validating that regex-filtered matches a more naive version of the same process. Happily I couldn't find any divergence although that means I did a fair amount of useless work. Also the python version is really slow compared to even the regex one, so probably don't use that... `paste` allows using it to combine index extraction of multiple domains as well as the original needle as TSV documents if that's of use. 
This could also be expanded to multi-index extraction if that's a need for anyone and should be checked more extensively. Note that only the python version supports stdin input at this point, I couldn't be arsed to do that with the Rust ones, but process substitution ought to work fine anyway? The needles are read on the go so they should not need to be an actual file. This may not be in a state fit for performance checking as the output loop of the rust version is the worst (no buffering, no stdout-locking). --- regex-filtered/examples/matchindex.rs | 38 ++++++++++++++++++++++++ regex-filtered/examples/matchindex2.rs | 40 ++++++++++++++++++++++++++ scripts/matchindex | 32 +++++++++++++++++++++ 3 files changed, 110 insertions(+) create mode 100644 regex-filtered/examples/matchindex.rs create mode 100644 regex-filtered/examples/matchindex2.rs create mode 100755 scripts/matchindex diff --git a/regex-filtered/examples/matchindex.rs b/regex-filtered/examples/matchindex.rs new file mode 100644 index 0000000..e281d86 --- /dev/null +++ b/regex-filtered/examples/matchindex.rs @@ -0,0 +1,38 @@ +use clap::Parser; +use std::io::BufRead; + +#[derive(Parser)] +struct Args { + regexes: String, + useragents: String, +} + +fn main() { + let Args { + regexes, + useragents, + } = Args::parse(); + let regexes: Vec<_> = std::io::BufReader::new(std::fs::File::open(regexes).unwrap()) + .lines() + .map(|l| regex::Regex::new(&l.unwrap()).unwrap()) + .collect(); + + let mut uas = std::io::BufReader::new(std::fs::File::open(useragents).unwrap()); + let mut line = String::with_capacity(150); + while let Ok(n) = uas.read_line(&mut line) { + if n == 0 { + break; + } + let line_ = line.strip_suffix("\n").unwrap_or(&line); + let m = regexes + .iter() + .enumerate() + .find(|(_, regex)| regex.is_match(line_)); + if let Some((i, _)) = m { + println!("{i}"); + } else { + println!("-"); + } + line.clear(); + } +} diff --git a/regex-filtered/examples/matchindex2.rs 
b/regex-filtered/examples/matchindex2.rs new file mode 100644 index 0000000..fac58be --- /dev/null +++ b/regex-filtered/examples/matchindex2.rs @@ -0,0 +1,40 @@ +use clap::Parser; +use std::io::BufRead; + +#[derive(Parser)] +struct Args { + regexes: String, + useragents: String, +} + +fn main() { + let Args { + regexes, + useragents, + } = Args::parse(); + let regexes = regex_filtered::Builder::new() + .push_all( + std::io::BufReader::new(std::fs::File::open(regexes).unwrap()) + .lines() + .map(Result::unwrap), + ) + .unwrap() + .build() + .unwrap(); + + let mut uas = std::io::BufReader::new(std::fs::File::open(useragents).unwrap()); + let mut line = String::with_capacity(150); + while let Ok(n) = uas.read_line(&mut line) { + if n == 0 { + break; + } + let line_ = line.strip_suffix("\n").unwrap_or(&line); + let m = regexes.matching(line_).next(); + if let Some((i, _)) = m { + println!("{i}"); + } else { + println!("-"); + } + line.clear(); + } +} diff --git a/scripts/matchindex b/scripts/matchindex new file mode 100755 index 0000000..e880404 --- /dev/null +++ b/scripts/matchindex @@ -0,0 +1,32 @@ +#!/usr/bin/env python + +import argparse +import re + +parser = argparse.ArgumentParser() +parser.add_argument( + 'regexes', + help="regexes to try on the user agents", +) +parser.add_argument( + 'useragents', + type=argparse.FileType(), + help="user agents to parse, `-` for stdin", +) +args = parser.parse_args() + +with open(args.regexes) as r: + regexes = [ + re.compile(pattern.rstrip('\n')) + for pattern in r + ] + +with args.useragents as r: + for u in r: + u = u.rstrip('\n') + for i, p in enumerate(regexes): + if p.search(u): + print(i) + break + else: + print('-')