diff --git a/regex-filtered/examples/matchindex.rs b/regex-filtered/examples/matchindex.rs new file mode 100644 index 0000000..e281d86 --- /dev/null +++ b/regex-filtered/examples/matchindex.rs @@ -0,0 +1,38 @@ +use clap::Parser; +use std::io::BufRead; + +#[derive(Parser)] +struct Args { + regexes: String, + useragents: String, +} + +fn main() { + let Args { + regexes, + useragents, + } = Args::parse(); + let regexes: Vec<_> = std::io::BufReader::new(std::fs::File::open(regexes).unwrap()) + .lines() + .map(|l| regex::Regex::new(&l.unwrap()).unwrap()) + .collect(); + + let mut uas = std::io::BufReader::new(std::fs::File::open(useragents).unwrap()); + let mut line = String::with_capacity(150); + while let Ok(n) = uas.read_line(&mut line) { + if n == 0 { + break; + } + let line_ = line.strip_suffix("\n").unwrap_or(&line); + let m = regexes + .iter() + .enumerate() + .find(|(_, regex)| regex.is_match(line_)); + if let Some((i, _)) = m { + println!("{i}"); + } else { + println!("-"); + } + line.clear(); + } +} diff --git a/regex-filtered/examples/matchindex2.rs b/regex-filtered/examples/matchindex2.rs new file mode 100644 index 0000000..fac58be --- /dev/null +++ b/regex-filtered/examples/matchindex2.rs @@ -0,0 +1,40 @@ +use clap::Parser; +use std::io::BufRead; + +#[derive(Parser)] +struct Args { + regexes: String, + useragents: String, +} + +fn main() { + let Args { + regexes, + useragents, + } = Args::parse(); + let regexes = regex_filtered::Builder::new() + .push_all( + std::io::BufReader::new(std::fs::File::open(regexes).unwrap()) + .lines() + .map(Result::unwrap), + ) + .unwrap() + .build() + .unwrap(); + + let mut uas = std::io::BufReader::new(std::fs::File::open(useragents).unwrap()); + let mut line = String::with_capacity(150); + while let Ok(n) = uas.read_line(&mut line) { + if n == 0 { + break; + } + let line_ = line.strip_suffix("\n").unwrap_or(&line); + let m = regexes.matching(line_).next(); + if let Some((i, _)) = m { + println!("{i}"); + } else { + println!("-"); + } + line.clear(); + } +} diff --git a/scripts/devices b/scripts/devices index e1fa488..3fd7ee3 100755 --- a/scripts/devices +++ b/scripts/devices @@ -31,18 +31,41 @@ r"""Compiles regexes.yaml to just the device regexps, with rewriting: Note that this is only done for a lower bound of `0` or `1`, but that's the case of all large bounded repetitions in regexes.yaml. """ +import argparse import string import sys +import urllib.request from yaml import SafeLoader, load def main() -> None: - with open(sys.argv[1]) as f: - regexes = load(f, Loader=SafeLoader) - for dev in regexes['device_parsers']: + parser = argparse.ArgumentParser() + parser.add_argument( + 'regexes_yaml', + help="path or url to a regexes.yaml file, `-` for stdin", + ) + parser.add_argument( + '--section', + default='device', + choices=['user_agent', 'os', 'device'], + help="regex section to convert, defaults to `device`" + ) + args = parser.parse_args() + + regexes = args.regexes_yaml + if regexes == '-': + regexes = load(sys.stdin, Loader=SafeLoader) + elif regexes.startswith('http'): + with urllib.request.urlopen(regexes) as f: + regexes = load(f, Loader=SafeLoader) + else: + with open(regexes) as f: + regexes = load(f, Loader=SafeLoader) + + for item in regexes[f'{args.section}_parsers']: print( - f'(?{f})' if (f := dev.get('regex_flag')) else '', - rewrite(dev['regex']), + f'(?{f})' if (f := item.get('regex_flag')) else '', + rewrite(item['regex']), sep='', ) diff --git a/scripts/matchindex b/scripts/matchindex new file mode 100755 index 0000000..e880404 --- /dev/null +++ b/scripts/matchindex @@ -0,0 +1,32 @@ +#!/usr/bin/env python + +import argparse +import re + +parser = argparse.ArgumentParser() +parser.add_argument( + 'regexes', + help="regexes to try on the user agents", +) +parser.add_argument( + 'useragents', + type=argparse.FileType(), + help="user agents to parse, `-` for stdin", +) +args = parser.parse_args() + +with open(args.regexes) as r: + regexes = [ + re.compile(pattern.rstrip('\n')) + for pattern in r + ] + +with args.useragents as r: + for u in r: + u = u.rstrip('\n') + for i, p in enumerate(regexes): + if p.search(u): + print(i) + break + else: + print('-')