diff --git a/regex-filtered/Cargo.toml b/regex-filtered/Cargo.toml
index 3cb021f..2d39acf 100644
--- a/regex-filtered/Cargo.toml
+++ b/regex-filtered/Cargo.toml
@@ -14,9 +14,7 @@ repository = "https://github.com/ua-parser/uap-rust/"
 
 [dependencies]
 aho-corasick = "1.1.3"
-indexmap = "2.2.6"
 itertools = "0.13.0"
-nohash = "0.2.0"
 regex = "1.10.4"
 regex-syntax = "0.8.3"
 
diff --git a/regex-filtered/src/int_set.rs b/regex-filtered/src/int_set.rs
new file mode 100644
index 0000000..7765b07
--- /dev/null
+++ b/regex-filtered/src/int_set.rs
@@ -0,0 +1,56 @@
+pub struct IntSet {
+    sparse: Vec<usize>,
+    dense: Vec<usize>,
+}
+
+impl IntSet {
+    pub fn new(capacity: usize) -> Self {
+        Self {
+            sparse: vec![usize::MAX;capacity],
+            dense: Vec::with_capacity(capacity),
+        }
+    }
+
+    pub fn insert(&mut self, value: usize) -> bool {
+        let idx = self.sparse[value];
+        if self.dense.get(idx) != Some(&value) {
+            self.sparse[value] = self.dense.len();
+            self.dense.push(value);
+            true
+        } else {
+            false
+        }
+    }
+
+    pub fn len(&self) -> usize {
+        self.dense.len()
+    }
+
+    pub fn into_vec(self) -> Vec<usize> {
+        self.dense
+    }
+}
+
+impl std::ops::Index<usize> for IntSet {
+    type Output = usize;
+
+    fn index(&self, index: usize) -> &Self::Output {
+        self.dense.index(index)
+    }
+}
+
+impl std::iter::Extend<usize> for IntSet {
+    fn extend<T: IntoIterator<Item = usize>>(&mut self, iter: T) {
+        for val in iter {
+            self.insert(val);
+        }
+    }
+}
+
+impl <'a> std::iter::Extend<&'a usize> for IntSet {
+    fn extend<T: IntoIterator<Item = &'a usize>>(&mut self, iter: T) {
+        for val in iter {
+            self.insert(*val);
+        }
+    }
+}
diff --git a/regex-filtered/src/lib.rs b/regex-filtered/src/lib.rs
index d6c5036..e79b16f 100644
--- a/regex-filtered/src/lib.rs
+++ b/regex-filtered/src/lib.rs
@@ -6,6 +6,7 @@ use aho_corasick::AhoCorasick;
 
 mod mapper;
 mod model;
+mod int_set;
 pub use model::Error as ModelError;
 
 /// Builder for the regexes set
diff --git a/regex-filtered/src/mapper.rs b/regex-filtered/src/mapper.rs
index 3b5513b..b8a506f 100644
--- a/regex-filtered/src/mapper.rs
+++ b/regex-filtered/src/mapper.rs
@@ -1,8 +1,7 @@
 use std::fmt::Display;
 use std::fmt::Formatter;
 
-use indexmap::IndexSet;
-
+use crate::int_set::IntSet;
 use super::model::Model;
 
 pub struct Builder {
@@ -184,7 +183,9 @@ impl Display for Mapper {
         writeln!(f, "#Unique Atoms: {}", self.atom_to_entry.len())?;
         for (i, e) in self.atom_to_entry.iter().copied().enumerate() {
             writeln!(f, "\tatom {i} -> entry {e}")?;
-            for r in self.propagate_match(&mut FromIterator::from_iter([e])) {
+            let mut s = IntSet::new(self.entries.len());
+            s.insert(e);
+            for r in self.propagate_match(&mut s).into_vec() {
                 writeln!(f, "\t\tregex {r}")?;
             }
         }
@@ -229,7 +230,6 @@ struct Entry {
     regexps: Vec<usize>,
 }
 
-type Set = IndexSet<usize, nohash::BuildNoHashHasher<usize>>;
 pub struct Mapper {
     /// Number of regexes covered by the mapper
     regexp_count: usize,
@@ -244,14 +244,11 @@ pub struct Mapper {
 }
 impl Mapper {
     // name is shit and also needs to see if we can generate stuff on the fly
-    pub fn atom_to_re(&self, atoms: impl IntoIterator<Item = usize>) -> Set {
-        let mut matched_atom_ids = IndexSet::with_capacity_and_hasher(
-            self.entries.len(),
-            nohash::BuildNoHashHasher::default(),
-        );
+    pub fn atom_to_re(&self, atoms: impl IntoIterator<Item = usize>) -> Vec<usize> {
+        let mut matched_atom_ids = IntSet::new(self.entries.len());
         matched_atom_ids.extend(atoms.into_iter().map(|idx| self.atom_to_entry[idx]));
 
-        let mut regexps = self.propagate_match(&mut matched_atom_ids);
+        let mut regexps = self.propagate_match(&mut matched_atom_ids).into_vec();
 
         regexps.extend(&self.unfiltered);
 
@@ -259,13 +256,10 @@ impl Mapper {
         regexps
     }
 
-    fn propagate_match(&self, work: &mut Set) -> Set {
+    fn propagate_match(&self, work: &mut IntSet) -> IntSet {
         let mut count = vec![0;self.entries.len()];
 
-        let mut regexps = IndexSet::with_capacity_and_hasher(
-            self.regexp_count,
-            nohash::BuildNoHashHasher::default(),
-        );
+        let mut regexps = IntSet::new(self.regexp_count);
 
         let mut i = 0;
         while i < work.len() {
@@ -334,8 +328,12 @@ mod test {
         assert_eq!(m.entries.len(), 3);
         assert_eq!(&m.atom_to_entry, &[0, 1]);
 
-        assert_eq!(m.propagate_match(&mut FromIterator::from_iter([0])), [0].into(),);
-        assert_eq!(m.propagate_match(&mut FromIterator::from_iter([1])), [0].into(),);
+        let mut s = IntSet::new(3);
+        s.insert(0);
+        assert_eq!(m.propagate_match(&mut s).into_vec(), vec![0]);
+        let mut s = IntSet::new(3);
+        s.insert(1);
+        assert_eq!(m.propagate_match(&mut s).into_vec(), vec![0]);
     }
 
     fn check_patterns(patterns: &'static [&'static str], expected: &'static [&'static str]) {