From b71c75ff5c3bebdf9e576b61240862452307904f Mon Sep 17 00:00:00 2001
From: Mihir Nanavati
Date: Mon, 9 Oct 2023 15:15:31 -0400
Subject: [PATCH] dataspec: add percentile operation to histogram

---
 Cargo.lock                    |   1 +
 lib/dataspec/Cargo.toml       |   1 +
 lib/dataspec/src/histogram.rs | 125 ++++++++++++++++++++++++++++++++++
 3 files changed, 127 insertions(+)

diff --git a/Cargo.lock b/Cargo.lock
index 28ec9520..19846099 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1916,6 +1916,7 @@ version = "0.1.1"
 dependencies = [
  "histogram",
  "serde",
+ "thiserror",
 ]
 
 [[package]]
diff --git a/lib/dataspec/Cargo.toml b/lib/dataspec/Cargo.toml
index bec00299..f5bcbac2 100644
--- a/lib/dataspec/Cargo.toml
+++ b/lib/dataspec/Cargo.toml
@@ -10,3 +10,4 @@ license = { workspace = true }
 [dependencies]
 histogram = { workspace = true }
 serde = { workspace = true }
+thiserror = "1.0.47"
diff --git a/lib/dataspec/src/histogram.rs b/lib/dataspec/src/histogram.rs
index 87046ca9..00692add 100644
--- a/lib/dataspec/src/histogram.rs
+++ b/lib/dataspec/src/histogram.rs
@@ -1,4 +1,5 @@
 use serde::{Deserialize, Serialize};
+use thiserror::Error;
 
 use histogram::Histogram as _Histogram;
 
@@ -16,16 +17,74 @@ pub struct Histogram {
     pub m: u32,
     pub r: u32,
     pub n: u32,
+    /// total number of data points in the histogram
+    pub total: u64,
     /// indices for the non-zero buckets in the histogram
     pub index: Vec<usize>,
     /// histogram bucket counts corresponding to the indices
     pub count: Vec<u32>,
 }
 
+/// Errors returned for operations on histograms.
+#[non_exhaustive]
+#[derive(Error, Debug, PartialEq)]
+pub enum Error {
+    #[error("histogram contains no observations")]
+    Empty,
+    #[error("invalid percentile, must be in range 0.0..=100.0")]
+    InvalidPercentile,
+    #[error("unknown error, should be unreachable")]
+    Unknown,
+}
+
+impl Histogram {
+    /// Returns the index of the bucket that contains the requested percentile.
+    ///
+    /// `percentile` is given in percent and must be within `0.0..=100.0`. The
+    /// result is one of the values stored in `index`, not a position in it.
+    ///
+    /// # Errors
+    ///
+    /// * [`Error::Empty`] if `total` is zero.
+    /// * [`Error::InvalidPercentile`] if `percentile` is NaN or outside of
+    ///   `0.0..=100.0`.
+    /// * [`Error::Unknown`] if the bucket counts add up to less than `total`,
+    ///   i.e. the public fields are inconsistent with each other.
+    pub fn percentile(&self, percentile: f64) -> Result<usize, Error> {
+        if self.total == 0 {
+            return Err(Error::Empty);
+        }
+
+        // `contains` is false for NaN, so NaN is rejected here as well.
+        if !(0.0..=100.0).contains(&percentile) {
+            return Err(Error::InvalidPercentile);
+        }
+
+        // Rank of the observation to locate. It is kept as a u64, like
+        // `total`, so it is not truncated on 32-bit targets, and clamped to
+        // `total` so floating-point rounding can never ask for more
+        // observations than the histogram holds.
+        let rank = ((self.total as f64) * percentile / 100.0).ceil() as u64;
+        let search = rank.min(self.total);
+
+        let mut seen: u64 = 0;
+        for (i, c) in self.index.iter().zip(self.count.iter()) {
+            seen += u64::from(*c);
+            if seen >= search {
+                return Ok(*i);
+            }
+        }
+
+        // Only reachable when `total` exceeds the sum of `count`.
+        Err(Error::Unknown)
+    }
+}
+
 impl From<&_Histogram> for Histogram {
     fn from(histogram: &_Histogram) -> Self {
         let mut index = Vec::new();
         let mut count = Vec::new();
+        let mut total: u64 = 0;
 
         for (i, bucket) in histogram
             .into_iter()
@@ -34,6 +93,7 @@ impl From<&_Histogram> for Histogram {
         {
             index.push(i);
             count.push(bucket.count());
+            total += bucket.count() as u64;
         }
 
         let p = histogram.parameters();
@@ -41,8 +101,73 @@ impl From<&_Histogram> for Histogram {
             m: p.0,
             r: p.1,
             n: p.2,
+            total,
             index,
             count,
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Builds a compact histogram by hand; `total` is passed separately so
+    /// tests can make it disagree with the bucket counts.
+    fn compact(total: u64, index: Vec<usize>, count: Vec<u32>) -> Histogram {
+        Histogram {
+            m: 0,
+            r: 5,
+            n: 10,
+            total,
+            index,
+            count,
+        }
+    }
+
+    #[test]
+    fn percentile() {
+        let h = _Histogram::new(0, 5, 10).unwrap();
+        for v in 1..1024 {
+            let _ = h.increment(v, 1);
+        }
+
+        let hcompact = Histogram::from(&h);
+
+        for percentile in [1.0, 10.0, 25.0, 50.0, 75.0, 90.0, 99.0, 99.9] {
+            let bucket = h.percentile(percentile).unwrap();
+            let idx = hcompact.percentile(percentile).unwrap();
+
+            println!("{}-{}: {}", bucket.low(), bucket.high(), idx);
+        }
+    }
+
+    #[test]
+    fn percentile_selects_bucket_by_rank() {
+        // Cumulative counts are 1, 5 and 10 out of 10 observations.
+        let h = compact(10, vec![2, 5, 9], vec![1, 4, 5]);
+
+        assert_eq!(h.percentile(0.0), Ok(2));
+        assert_eq!(h.percentile(10.0), Ok(2));
+        assert_eq!(h.percentile(11.0), Ok(5));
+        assert_eq!(h.percentile(50.0), Ok(5));
+        assert_eq!(h.percentile(51.0), Ok(9));
+        assert_eq!(h.percentile(100.0), Ok(9));
+    }
+
+    #[test]
+    fn percentile_rejects_invalid_input() {
+        let h = compact(1, vec![0], vec![1]);
+
+        assert_eq!(h.percentile(-0.1), Err(Error::InvalidPercentile));
+        assert_eq!(h.percentile(100.1), Err(Error::InvalidPercentile));
+        assert_eq!(h.percentile(f64::NAN), Err(Error::InvalidPercentile));
+    }
+
+    #[test]
+    fn percentile_reports_empty_and_inconsistent_histograms() {
+        assert_eq!(compact(0, vec![], vec![]).percentile(50.0), Err(Error::Empty));
+        // `total` claims more observations than the buckets hold.
+        assert_eq!(compact(10, vec![3], vec![2]).percentile(100.0), Err(Error::Unknown));
+    }
+}