diff --git a/build/rust/Cargo.lock b/build/rust/Cargo.lock index edfc9c3..65d895c 100644 --- a/build/rust/Cargo.lock +++ b/build/rust/Cargo.lock @@ -1,14 +1,5 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -[[package]] -name = "aho-corasick" -version = "0.7.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8716408b8bc624ed7f65d223ddb9ac2d044c0547b6fa4b0d554f3a9540496ada" -dependencies = [ - "memchr", -] - [[package]] name = "atty" version = "0.2.14" @@ -64,19 +55,12 @@ dependencies = [ "syn", ] -[[package]] -name = "fnv" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3" - [[package]] name = "freq" version = "0.1.0" dependencies = [ "clap", - "fnv", - "regex", + "memmap", ] [[package]] @@ -90,9 +74,9 @@ dependencies = [ [[package]] name = "hermit-abi" -version = "0.1.12" +version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61565ff7aaace3525556587bd2dc31d4a07071957be715e63ce7b1eccf51a8f4" +checksum = "91780f809e750b0a89f5544be56617ff6b1227ee485bcb06ebe10cdf89bd3b71" dependencies = [ "libc", ] @@ -119,10 +103,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3baa92041a6fec78c687fa0cc2b3fae8884f743d672cf551bed1d6dac6988d0f" [[package]] -name = "memchr" -version = "2.3.3" +name = "memmap" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3728d817d99e5ac407411fa471ff9800a778d88a24685968b36824eaf4bee400" +checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" +dependencies = [ + "libc", + "winapi", +] [[package]] name = "os_str_bytes" @@ -158,40 +146,22 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.12" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8872cf6f48eee44265156c111456a700ab3483686b3f96df4cf5481c89157319" +checksum = "53f5ffe53a6b28e37c9c1ce74893477864d64f74778a93a4beb43c8fa167f639" dependencies = [ "unicode-xid", ] [[package]] name = "quote" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42934bc9c8ab0d3b273a16d8551c8f0fcff46be73276ca083ec2414c15c4ba5e" +checksum = "54a21852a652ad6f610c9510194f398ff6f8692e334fd1145fed931f7fbe44ea" dependencies = [ "proc-macro2", ] -[[package]] -name = "regex" -version = "1.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6020f034922e3194c711b82a627453881bc4682166cabb07134a10c26ba7692" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", - "thread_local", -] - -[[package]] -name = "regex-syntax" -version = "0.6.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fe5bd57d1d7414c6b5ed48563a2c855d995ff777729dcd91c369ec7fea395ae" - [[package]] name = "strsim" version = "0.10.0" @@ -200,9 +170,9 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" [[package]] name = "syn" -version = "1.0.21" +version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4696caa4048ac7ce2bcd2e484b3cef88c1004e41b8e945a277e2c25dc0b72060" +checksum = "1425de3c33b0941002740a420b1a906a350b88d08b82b2c8a01035a3f9447bac" dependencies = [ "proc-macro2", "quote", @@ -238,15 +208,6 @@ dependencies = [ "unicode-width", ] -[[package]] -name = "thread_local" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14" -dependencies = [ - "lazy_static", -] - [[package]] name = "unicode-segmentation" version = "1.6.0" diff --git a/build/rust/Cargo.toml b/build/rust/Cargo.toml index 63f419b..2dceea5 100644 --- a/build/rust/Cargo.toml +++ b/build/rust/Cargo.toml @@ -8,8 +8,7 @@ edition = "2018" [dependencies] clap = "3.0.0-beta.1" -fnv = "1" -regex = "1" +memmap = "0.7" [[bin]] name = "freq01" diff --git a/src/freq01.rs b/src/freq01.rs index 2000737..330e53d 100644 --- a/src/freq01.rs +++ b/src/freq01.rs @@ -1,10 +1,17 @@ -use clap::Clap; -use fnv::FnvHashMap; use std::{ + cmp::Ordering, fs::File, - io::{BufRead, BufReader, BufWriter, Write}, + io::{BufWriter, Read, Write}, }; +use clap::Clap; +use memmap::*; + +/// FNV1 hash basis +const H: usize = 0x811c_9dc5; +/// FNV1 hash prime +const P: usize = 0x0100_0193; + /// Counts number of unique `[a-zA-Z]+` words in input. #[derive(Clap, Debug)] #[clap(version = "0.1.0")] @@ -15,89 +22,219 @@ struct Opts { output: Option, } -struct FreqDict { - dict: FnvHashMap, u32>, -} - fn main() { let opts: Opts = Opts::parse(); - let mut input = open_input(&opts); + let input = open_mmap(&opts); + + let mut hash = H; + let mut dict = FrequencyHashMap::new(); + + let mut word_start = 0; + let mut word_end = 0; + for (idx, &byte) in input.iter().enumerate() { + if (b'a' <= byte && byte <= b'z') || (b'A' <= byte && byte <= b'Z') { + hash ^= (byte & 0x1F) as usize; + hash *= P; + word_end = idx + 1; + } else { + if word_start < word_end { + dict.register(hash, &input[word_start..word_end]); + hash = H; + } + word_start = idx + 1; + } + } + if word_start < word_end { + dict.register(hash, &input[word_start..word_end]); + } + let mut output = create_output(&opts); + for (count, word) in dict.into_iter() { + writeln!(&mut output, "{} {}", count, word).unwrap_or_else(|e| { + let output = opts.output.as_ref().map_or("-", |s| s.as_str()); + panic!("Unable to write results in '{}': {}", output, e) + }) + } +} + +struct FrequencyHashMap { + buckets: Vec>, + capacity: usize, + length: usize, + mask: usize, + max: usize, +} - let mut word = Vec::with_capacity(16); - let mut dict = FreqDict::new(); +#[derive(Clone, Debug)] +struct FrequencyHashEntry { + key: Box<[u8]>, + value: usize, + hash: usize, +} + +struct FrequencyHashIntoIter { + iter: std::vec::IntoIter>, +} - let mut buffer = [0u8; 16 * 1024]; - loop { - let read_count = input - .read(&mut buffer) - .unwrap_or_else(|e| panic!("Unable to read bytes from '{}': {}", opts.input, e)); +impl FrequencyHashMap { + const INITIAL: usize = 128; + const LOAD_FACTOR: f32 = 0.9; - if read_count == 0 { - break; + fn new() -> FrequencyHashMap { + FrequencyHashMap { + buckets: vec![None; Self::INITIAL], + capacity: Self::INITIAL, + length: 0, + mask: Self::INITIAL - 1, + max: (Self::LOAD_FACTOR * Self::INITIAL as f32) as usize, } + } - for &byte in buffer.iter().take(read_count) { - if b'a' <= byte && byte <= b'z' { - word.push(byte); - continue; - } else if b'A' <= byte && byte <= b'Z' { - word.push(byte ^ 0x20); - continue; + fn register(&mut self, hash: usize, word: &[u8]) { + let mut index = hash & self.mask; + loop { + match unsafe { self.buckets.get_unchecked_mut(index) } { + Some(entry) => { + if entry.same_as(hash, word) { + entry.value += 1; + return; + } else { + index = (index + 1) & self.mask + } + } + none => { + none.replace(FrequencyHashEntry::new(hash, word)); + + self.length += 1; + if self.length > self.max { + self.ensure_capacity(); + } + break; + } } + } + } - dict.add_word(&word); - word.clear(); + fn ensure_capacity(&mut self) { + while self.length > self.max { + self.capacity *= 2; + self.mask = self.capacity - 1; + self.max = (Self::LOAD_FACTOR * self.capacity as f32) as usize; + } + + let new_buckets = vec![None; self.capacity]; + for bucket in std::mem::replace(&mut self.buckets, new_buckets) { + if let Some(entry) = bucket { + let mut index = entry.hash & self.mask; + loop { + match unsafe { self.buckets.get_unchecked_mut(index) } { + Some(_) => index = (index + 1) & self.mask, + none => { + none.replace(entry); + break; + } + } + } + } } } - dict.add_word(&word); +} - for (count, word) in dict.get_freq() { - writeln!(&mut output, "{} {}", count, word).unwrap_or_else(|e| { - let output = opts.output.as_ref().map_or("-", |s| s.as_str()); - panic!("Unable to write results in '{}': {}", output, e) - }) +impl IntoIterator for FrequencyHashMap { + type Item = (usize, String); + type IntoIter = FrequencyHashIntoIter; + + fn into_iter(self) -> Self::IntoIter { + let mut buckets = self.buckets; + buckets.sort_unstable(); + + FrequencyHashIntoIter { + iter: buckets.into_iter(), + } } } -impl FreqDict { - fn new() -> Self { - FreqDict { - dict: FnvHashMap::default(), +impl FrequencyHashEntry { + #[inline] + fn new(hash: usize, word: &[u8]) -> FrequencyHashEntry { + FrequencyHashEntry { + key: word.iter().map(|b| b | 0x20).collect(), + hash, + value: 1, } } - fn add_word(&mut self, word: &[u8]) { - if !word.is_empty() { - if let Some(counter) = self.dict.get_mut(word) { - *counter += 1; - } else { - self.dict.insert(word.into(), 1); + #[inline] + fn same_as(&self, hash: usize, word: &[u8]) -> bool { + hash == self.hash + && self.key.len() == word.len() + && Iterator::zip(self.key.iter(), word.iter()).all(|(&l, &r)| l == (r | 0x20)) + } +} + +impl Ord for FrequencyHashEntry { + fn cmp(&self, other: &Self) -> Ordering { + Ord::cmp(&other.value, &self.value) + .then_with(|| Ord::cmp(self.key.as_ref(), other.key.as_ref())) + } +} + +impl PartialOrd for FrequencyHashEntry { + fn partial_cmp(&self, other: &Self) -> Option { + Some(Ord::cmp(&self, &other)) + } +} + +impl Eq for FrequencyHashEntry {} + +impl PartialEq for FrequencyHashEntry { + fn eq(&self, other: &Self) -> bool { + Ord::cmp(&self, &other) == Ordering::Equal + } +} + +impl Iterator for FrequencyHashIntoIter { + type Item = (usize, String); + + fn next(&mut self) -> Option { + while let Some(opt) = self.iter.next() { + if let Some(entry) = opt { + let key = std::str::from_utf8(&entry.key).unwrap().to_owned(); + return Some((entry.value, key)); } } + None } - fn get_freq(&self) -> Vec<(u32, &str)> { - let mut freq = self.dict.iter() - .map(|(w, c)| { - let key = std::str::from_utf8(w).unwrap(); - (*c, key) - }) - .collect::>(); - freq.sort_unstable_by(|(c1, w1), (c2, w2)| { - Ord::cmp(c1, c2).reverse().then_with(|| Ord::cmp(w1, w2)) - }); - freq + fn size_hint(&self) -> (usize, Option) { + let (_, upper) = self.iter.size_hint(); + (0, upper) } } -fn open_input(opts: &Opts) -> Box { +fn open_mmap(opts: &Opts) -> Mmap { match opts.input.as_str() { - "-" => Box::new(BufReader::new(std::io::stdin())), + "-" => { + let mut buffer = vec![]; + std::io::stdin() + .read_to_end(&mut buffer) + .unwrap_or_else(|e| panic!("Unable to read STDIN: {}", e)); + let mut mmap = MmapOptions::new() + .len(buffer.len()) + .map_anon() + .unwrap_or_else(|e| panic!("Unable to read STDIN: {}", e)); + mmap.copy_from_slice(&buffer); + mmap.make_read_only() + .unwrap_or_else(|e| panic!("Unable to read STDIN: {}", e)) + } fnm => { let file = File::open(fnm) .unwrap_or_else(|e| panic!("Unable to open '{}' for reading: {}", fnm, e)); - Box::new(BufReader::new(file)) + unsafe { + MmapOptions::new() + .map(&file) + .unwrap_or_else(|e| panic!("Unable to read '{}' in memory: {}", fnm, e)) + } } } }