Skip to content

Commit

Permalink
Implement bespoke cut down IntMap
Browse files Browse the repository at this point in the history
As noted in 4f1c7df, turns out it's
trivial enough I could well have done that in the first place after
validating that was the issue, rather than add nohash and keep
indexmap.

This is a highly cut-down implementation, reduced to just the operations
necessary for `atom_to_re` and `propagate_match`. The gain in runtime
is near 10%, which is nothing to sneeze at. Though with almost 20%
fewer instructions (!!!), that means the IPC has taken a dive and is now
a hair below 3. I still have essentially no idea what that means, but
I think it bears noting anyway.

```
       42.63 real        42.49 user         0.02 sys
           145833984  maximum resident set size
                   0  average shared memory size
                   0  average unshared data size
                   0  average unshared stack size
                9015  page reclaims
                   6  page faults
                   0  swaps
                   0  block input operations
                   0  block output operations
                   0  messages sent
                   0  messages received
                   0  signals received
                   0  voluntary context switches
                 118  involuntary context switches
        408805706973  instructions retired
        137020670968  cycles elapsed
           140592576  peak memory footprint
```
  • Loading branch information
masklinn committed Jun 23, 2024
1 parent 42b45e7 commit f500c57
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 19 deletions.
2 changes: 0 additions & 2 deletions regex-filtered/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,7 @@ repository = "https://github.com/ua-parser/uap-rust/"

[dependencies]
aho-corasick = "1.1.3"
indexmap = "2.2.6"
itertools = "0.13.0"
nohash = "0.2.0"
regex = "1.10.4"
regex-syntax = "0.8.3"

Expand Down
56 changes: 56 additions & 0 deletions regex-filtered/src/int_set.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/// A sparse set of `usize` values restricted to the range `0..capacity`.
///
/// This is a highly cut-down implementation providing just the operations
/// needed by `atom_to_re` and `propagate_match`: insertion, length,
/// indexed access (via `Index`), bulk insertion (via `Extend`), and
/// conversion into the insertion-ordered backing vector.
pub struct IntSet {
    /// `sparse[v]` holds the index of `v` inside `dense` when `v` is a
    /// member; otherwise it holds a stale or sentinel value
    /// (initialised to `usize::MAX`). Membership is only established by
    /// cross-checking against `dense` (see [`IntSet::insert`]).
    sparse: Vec<usize>,
    /// The members, in insertion order.
    dense: Vec<usize>,
}

impl IntSet {
    /// Creates an empty set able to hold values in `0..capacity`.
    pub fn new(capacity: usize) -> Self {
        Self {
            sparse: vec![usize::MAX; capacity],
            dense: Vec::with_capacity(capacity),
        }
    }

    /// Inserts `value` into the set.
    ///
    /// Returns `true` if the value was not already present.
    ///
    /// # Panics
    ///
    /// Panics if `value` is not below the `capacity` the set was
    /// created with.
    pub fn insert(&mut self, value: usize) -> bool {
        let idx = self.sparse[value];
        // Classic sparse-set membership test: `idx` may be a sentinel
        // (usize::MAX) or stale, so `value` is a member only if it
        // round-trips through `dense[idx]` back to itself.
        if self.dense.get(idx) != Some(&value) {
            self.sparse[value] = self.dense.len();
            self.dense.push(value);
            true
        } else {
            false
        }
    }

    /// Returns the number of values currently in the set.
    pub fn len(&self) -> usize {
        self.dense.len()
    }

    /// Returns `true` if the set contains no values.
    pub fn is_empty(&self) -> bool {
        self.dense.is_empty()
    }

    /// Consumes the set, returning its members in insertion order.
    pub fn into_vec(self) -> Vec<usize> {
        self.dense
    }
}

impl std::ops::Index<usize> for IntSet {
    type Output = usize;

    /// Returns the `index`-th member in insertion order.
    ///
    /// Panics if `index >= self.len()`.
    fn index(&self, index: usize) -> &Self::Output {
        &self.dense[index]
    }
}

impl std::iter::Extend<usize> for IntSet {
    /// Inserts every yielded value into the set, ignoring duplicates.
    fn extend<I: IntoIterator<Item = usize>>(&mut self, values: I) {
        values.into_iter().for_each(|value| {
            self.insert(value);
        });
    }
}

impl<'a> std::iter::Extend<&'a usize> for IntSet {
    /// Inserts every yielded value into the set (copied out of the
    /// reference), ignoring duplicates.
    fn extend<I: IntoIterator<Item = &'a usize>>(&mut self, values: I) {
        for &value in values {
            self.insert(value);
        }
    }
}
1 change: 1 addition & 0 deletions regex-filtered/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use aho_corasick::AhoCorasick;

mod mapper;
mod model;
mod int_set;
pub use model::Error as ModelError;

/// Builder for the regexes set
Expand Down
32 changes: 15 additions & 17 deletions regex-filtered/src/mapper.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
use std::fmt::Display;
use std::fmt::Formatter;

use indexmap::IndexSet;

use crate::int_set::IntSet;
use super::model::Model;

pub struct Builder {
Expand Down Expand Up @@ -184,7 +183,9 @@ impl Display for Mapper {
writeln!(f, "#Unique Atoms: {}", self.atom_to_entry.len())?;
for (i, e) in self.atom_to_entry.iter().copied().enumerate() {
writeln!(f, "\tatom {i} -> entry {e}")?;
for r in self.propagate_match(&mut FromIterator::from_iter([e])) {
let mut s = IntSet::new(self.entries.len());
s.insert(e);
for r in self.propagate_match(&mut s).into_vec() {
writeln!(f, "\t\tregex {r}")?;
}
}
Expand Down Expand Up @@ -229,7 +230,6 @@ struct Entry {
regexps: Vec<usize>,
}

type Set = IndexSet<usize, nohash::BuildNoHashHasher<usize>>;
pub struct Mapper {
/// Number of regexes covered by the mapper
regexp_count: usize,
Expand All @@ -244,28 +244,22 @@ pub struct Mapper {
}
impl Mapper {
// name is shit and also needs to see if we can generate stuff on the fly
pub fn atom_to_re(&self, atoms: impl IntoIterator<Item = usize>) -> Set {
let mut matched_atom_ids = IndexSet::with_capacity_and_hasher(
self.entries.len(),
nohash::BuildNoHashHasher::default(),
);
pub fn atom_to_re(&self, atoms: impl IntoIterator<Item = usize>) -> Vec<usize> {
let mut matched_atom_ids = IntSet::new(self.entries.len());
matched_atom_ids.extend(atoms.into_iter().map(|idx| self.atom_to_entry[idx]));

let mut regexps = self.propagate_match(&mut matched_atom_ids);
let mut regexps = self.propagate_match(&mut matched_atom_ids).into_vec();

regexps.extend(&self.unfiltered);

regexps.sort_unstable();
regexps
}

fn propagate_match(&self, work: &mut Set) -> Set {
fn propagate_match(&self, work: &mut IntSet) -> IntSet {
let mut count = vec![0;self.entries.len()];

let mut regexps = IndexSet::with_capacity_and_hasher(
self.regexp_count,
nohash::BuildNoHashHasher::default(),
);
let mut regexps = IntSet::new(self.regexp_count);

let mut i = 0;
while i < work.len() {
Expand Down Expand Up @@ -334,8 +328,12 @@ mod test {

assert_eq!(m.entries.len(), 3);
assert_eq!(&m.atom_to_entry, &[0, 1]);
assert_eq!(m.propagate_match(&mut FromIterator::from_iter([0])), [0].into(),);
assert_eq!(m.propagate_match(&mut FromIterator::from_iter([1])), [0].into(),);
let mut s = IntSet::new(3);
s.insert(0);
assert_eq!(m.propagate_match(&mut s).into_vec(), vec![0]);
let mut s = IntSet::new(3);
s.insert(1);
assert_eq!(m.propagate_match(&mut s).into_vec(), vec![0]);
}

fn check_patterns(patterns: &'static [&'static str], expected: &'static [&'static str]) {
Expand Down

0 comments on commit f500c57

Please sign in to comment.