From fde57a22abbdb681f63f1ba536086f1ade237d51 Mon Sep 17 00:00:00 2001 From: Luke Sandberg Date: Mon, 21 Apr 2025 20:39:48 -0700 Subject: [PATCH 01/17] Add a simple benchmark to compare with fast-glob From a quick review of the implementation I see a few opportunities, and this will help justify them. --- Cargo.lock | 32 ++++++++++++------- turbopack/crates/turbo-tasks-fs/Cargo.toml | 1 + .../crates/turbo-tasks-fs/benches/mod.rs | 31 +++++++++++++++++- turbopack/crates/turbo-tasks-fs/src/glob.rs | 2 ++ 4 files changed, 54 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 124c4f433acf4..e335e5288fb35 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -276,9 +276,9 @@ checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b" [[package]] name = "arrayvec" -version = "0.7.4" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "ascii" @@ -604,7 +604,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f6ca6f0c18c02c2fbfc119df551b8aeb8a385f6d5980f1475ba0255f1e97f1e" dependencies = [ "anyhow", - "arrayvec 0.7.4", + "arrayvec 0.7.6", "itertools 0.10.5", "log", "nom", @@ -618,7 +618,7 @@ version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "876c75a42f6364451a033496a14c44bffe41f5f4a8236f697391f11024e596d2" dependencies = [ - "arrayvec 0.7.4", + "arrayvec 0.7.6", ] [[package]] @@ -835,7 +835,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42ae2468a89544a466886840aa467a25b766499f4f04bf7d9fcd10ecee9fccef" dependencies = [ "arrayref", - "arrayvec 0.7.4", + "arrayvec 0.7.6", "cc", "cfg-if", "constant_time_eq", @@ -2321,6 +2321,15 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649" +[[package]] +name = "fast-glob" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39ea3f6bbcf4dbe2076b372186fc7aeecd5f6f84754582e56ee7db262b15a6f0" +dependencies = [ + "arrayvec 0.7.6", +] + [[package]] name = "fastrand" version = "1.9.0" @@ -4825,7 +4834,7 @@ version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a652d9771a63711fd3c3deb670acfbe5c30a4072e664d7a3bf5a9e1056ac72c3" dependencies = [ - "arrayvec 0.7.4", + "arrayvec 0.7.6", "itoa", ] @@ -5841,7 +5850,7 @@ checksum = "cd87ce80a7665b1cce111f8a16c1f3929f6547ce91ade6addf4ec86a8dda5ce9" dependencies = [ "arbitrary", "arg_enum_proc_macro", - "arrayvec 0.7.4", + "arrayvec 0.7.6", "av1-grain", "bitstream-io", "built", @@ -7009,7 +7018,7 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f352d5d14be5a1f956d76ae0c8060c3487aaa2a080f10a4b4ff023c7c05a9047" dependencies = [ - "arrayvec 0.7.4", + "arrayvec 0.7.6", "static-map-macro", ] @@ -7658,7 +7667,7 @@ version = "13.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2a2cf0263f34234cfcebde0545e4ed017e1b2b5667792c6902319d75df03110" dependencies = [ - "arrayvec 0.7.4", + "arrayvec 0.7.6", "indexmap 2.7.1", "is-macro", "rustc-hash 2.1.1", @@ -7945,7 +7954,7 @@ version = "12.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "250786944fbc05f6484eda9213df129ccfe17226ae9ad51b62fce2f72135dbee" dependencies = [ - "arrayvec 0.7.4", + "arrayvec 0.7.6", "bitflags 2.9.0", "either", "new_debug_unreachable", @@ -8087,7 +8096,7 @@ version = "14.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "012cd84fcc6c6fab718a177a3ffc360332d6bad29dbe19699be2ccbaba91e712" dependencies = [ - "arrayvec 0.7.4", + "arrayvec 0.7.6", "indexmap 2.7.1", "is-macro", "num-bigint", @@ -9557,6 +9566,7 @@ dependencies = [ "criterion", "dashmap 6.1.0", "dunce", + "fast-glob", "futures", "futures-retry", "include_dir", diff --git a/turbopack/crates/turbo-tasks-fs/Cargo.toml b/turbopack/crates/turbo-tasks-fs/Cargo.toml index 172a5805df11d..cdfc83924448e 100644 --- a/turbopack/crates/turbo-tasks-fs/Cargo.toml +++ b/turbopack/crates/turbo-tasks-fs/Cargo.toml @@ -64,6 +64,7 @@ tempfile = { workspace = true } turbo-tasks-memory = { workspace = true } turbo-tasks-testing = { workspace = true } turbo-tasks-backend = { workspace = true } +fast-glob = "0.4.5" [build-dependencies] turbo-tasks-build = { workspace = true } diff --git a/turbopack/crates/turbo-tasks-fs/benches/mod.rs b/turbopack/crates/turbo-tasks-fs/benches/mod.rs index ec5a14eba482e..3c1e2fa68f9af 100644 --- a/turbopack/crates/turbo-tasks-fs/benches/mod.rs +++ b/turbopack/crates/turbo-tasks-fs/benches/mod.rs @@ -101,9 +101,38 @@ fn bench_rope_iteration(c: &mut Criterion) { ); } +fn bench_glob_match_simple(c: &mut Criterion) { + const GLOB: &str = "some/**/n*d[k-m]e?txt"; + const PATH: &str = "some/a/bigger/path/to/the/crazy/needle.txt"; + glob_bench(c, "simple", GLOB, PATH); +} + +fn bench_glob_match_alternations(c: &mut Criterion) { + const GLOB: &str = "some/**/{tob,crazy}/?*.{png,txt}"; + const PATH: &str = "some/a/bigger/path/to/the/crazy/needle.txt"; + + glob_bench(c, "alternations", GLOB, PATH); +} + +fn glob_bench(c: &mut Criterion, name: &'static str, glob: &str, path: &str) { + let mut group = c.benchmark_group(format!("turbo-tasks-fs:{name}")); + group.bench_function("fast-glob", |b| { + b.iter(|| fast_glob::glob_match(glob, path)) + }); + group.bench_function("turbo-glob", |b| { + b.iter(|| { + let g = turbo_tasks_fs::glob::Glob::parse(glob).unwrap(); + g.execute(path) + }) + }); + let g = turbo_tasks_fs::glob::Glob::parse(glob).unwrap(); + group.bench_function("turbo-glob-cached", |b| b.iter(|| g.execute(path))); + group.finish(); +} + criterion_group!( name = benches; config = Criterion::default(); - targets = bench_file_watching, bench_rope_iteration + targets = bench_file_watching, bench_rope_iteration, bench_glob_match_simple, bench_glob_match_alternations ); criterion_main!(benches); diff --git a/turbopack/crates/turbo-tasks-fs/src/glob.rs b/turbopack/crates/turbo-tasks-fs/src/glob.rs index b6448d110709c..10967554a906f 100644 --- a/turbopack/crates/turbo-tasks-fs/src/glob.rs +++ b/turbopack/crates/turbo-tasks-fs/src/glob.rs @@ -231,6 +231,8 @@ impl GlobPart { let mut is_escaped = false; let mut literal = String::new(); + // TODO(lukes): there shouldn't be a need to traverse graphemes here, just literal + // matches since all of our delimiters are ascii characters let mut cursor = GraphemeCursor::new(0, input.len(), true); let mut start = cursor.cur_cursor(); From ab0afbaa26c397e5c259c9120133f680213dc0ce Mon Sep 17 00:00:00 2001 From: Luke Sandberg Date: Thu, 1 May 2025 09:48:42 -0700 Subject: [PATCH 02/17] Use an NFA approach for globs. On our simple benchmarks this is 3-4x as fast as the existing glob implementation, but still 3 times slower than `fast-glob`. My expectation is that `fast-glob` would be faster for single matches but that this should be faster when the glob is executed multiple times. But we still need to close a gap there. --- .../crates/turbo-tasks-fs/benches/mod.rs | 15 +- turbopack/crates/turbo-tasks-fs/src/glob.rs | 681 +++++++++++++++++- 2 files changed, 685 insertions(+), 11 deletions(-) diff --git a/turbopack/crates/turbo-tasks-fs/benches/mod.rs b/turbopack/crates/turbo-tasks-fs/benches/mod.rs index 3c1e2fa68f9af..3cb5e1b85fd53 100644 --- a/turbopack/crates/turbo-tasks-fs/benches/mod.rs +++ b/turbopack/crates/turbo-tasks-fs/benches/mod.rs @@ -102,20 +102,20 @@ fn bench_rope_iteration(c: &mut Criterion) { } fn bench_glob_match_simple(c: &mut Criterion) { - const GLOB: &str = "some/**/n*d[k-m]e?txt"; + const GLOB: &str = "some/**/n*de?txt"; const PATH: &str = "some/a/bigger/path/to/the/crazy/needle.txt"; glob_bench(c, "simple", GLOB, PATH); } fn bench_glob_match_alternations(c: &mut Criterion) { - const GLOB: &str = "some/**/{tob,crazy}/?*.{png,txt}"; + const GLOB: &str = "some/**/{tob,crazy}/*.{png,txt}"; const PATH: &str = "some/a/bigger/path/to/the/crazy/needle.txt"; glob_bench(c, "alternations", GLOB, PATH); } fn glob_bench(c: &mut Criterion, name: &'static str, glob: &str, path: &str) { - let mut group = c.benchmark_group(format!("turbo-tasks-fs:{name}")); + let mut group = c.benchmark_group(format!("turbo-tasks-fs/glob/{name}")); group.bench_function("fast-glob", |b| { b.iter(|| fast_glob::glob_match(glob, path)) }); @@ -127,6 +127,15 @@ fn glob_bench(c: &mut Criterion, name: &'static str, glob: &str, path: &str) { }); let g = turbo_tasks_fs::glob::Glob::parse(glob).unwrap(); group.bench_function("turbo-glob-cached", |b| b.iter(|| g.execute(path))); + + group.bench_function("turbo-glob-2", |b| { + b.iter(|| { + let g = turbo_tasks_fs::glob::GlobProgram::compile(glob).unwrap(); + g.matches(path, false) + }) + }); + let g = turbo_tasks_fs::glob::GlobProgram::compile(glob).unwrap(); + group.bench_function("turbo-glob-2-cached", |b| b.iter(|| g.matches(path, false))); group.finish(); } diff --git a/turbopack/crates/turbo-tasks-fs/src/glob.rs b/turbopack/crates/turbo-tasks-fs/src/glob.rs index 10967554a906f..b903920a7bc2d 100644 --- a/turbopack/crates/turbo-tasks-fs/src/glob.rs +++ b/turbopack/crates/turbo-tasks-fs/src/glob.rs @@ -1,4 +1,4 @@ -use std::mem::take; +use std::{fmt::Display, iter::Peekable, mem::take, str::Chars}; use anyhow::{anyhow, bail, Context, Result}; use serde::{Deserialize, Serialize}; @@ -156,6 +156,639 @@ impl<'a> Iterator for GlobMatchesIterator<'a> { } } +#[derive(Debug, Eq, PartialEq, Copy, Clone)] +struct CharRange { + low: char, + high: char, +} +impl PartialOrd for CharRange { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} +// Lower `lows` come first, if two `lows` match then the large range comes first +// This makes the merge logic simpler. +impl Ord for CharRange { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + match self.low.cmp(&other.low) { + std::cmp::Ordering::Equal => other.high.cmp(&self.high), + c => c, + } + } +} + +#[derive(Debug, Eq, Clone, PartialEq)] +#[repr(transparent)] +struct RangeSet { + ranges: Vec, +} + +impl RangeSet { + fn new() -> RangeSet { + Self { ranges: Vec::new() } + } + + fn contains(&self, c: char) -> bool { + self.ranges + .binary_search(&CharRange { low: c, high: c }) + .is_ok() + } + + fn add_singleton(&mut self, c: char) { + self.ranges.push(CharRange { low: c, high: c }); + } + fn add_range(&mut self, low: char, high: char) { + self.ranges.push(CharRange { low, high }); + } + + // Sorts and merges redundant entries. + fn finalize(&mut self) { + self.ranges.sort(); + self.ranges.dedup_by(|a, b| { + if b.high >= a.low { + b.high = a.high; + true + } else { + false + } + }); + self.ranges.shrink_to_fit(); + } +} + +// A sparse set of integers that supports O(1) insertion, testing, clearing and O(n) iteration. +// https://research.swtch.com/sparse +struct SparseSet { + mem: Vec, + n: usize, + max: usize, +} + +impl SparseSet { + fn new(size: usize) -> Self { + Self { + mem: unsafe { + std::mem::transmute(vec![std::mem::MaybeUninit::::uninit(); size * 2]) + }, + n: 0, + max: size, + } + } + + #[inline] + fn sparse(&self) -> &[usize] { + return &self.mem[0..self.max]; + } + #[inline] + fn dense(&self) -> &[usize] { + return &self.mem[self.max..]; + } + #[inline] + fn sparse_mut(&mut self) -> &mut [usize] { + return &mut self.mem[0..self.max]; + } + #[inline] + fn dense_mut(&mut self) -> &mut [usize] { + return &mut self.mem[self.max..]; + } + fn contains(&self, v: usize) -> bool { + debug_assert!(v < self.max); + let sparse = self.sparse()[v]; + sparse < self.n && self.dense()[sparse] == v + } + fn clear(&mut self) { + self.n = 0 + } + fn add(&mut self, v: usize) { + debug_assert!(v < self.max); + if !self.contains(v) { + let n = self.n; + self.dense_mut()[n] = v; + self.sparse_mut()[v] = n; + self.n += 1; + } + } + fn get(&self, i: usize) -> usize { + debug_assert!(i < self.n); + return self.dense()[i]; + } +} + +#[derive(Debug)] +pub struct GlobProgram { + instructions: Vec, + range_sets: Vec, +} + +impl GlobProgram { + pub fn compile(pattern: &str) -> Result { + let mut tok = Tokenizer::new(pattern); + let mut instructions = Vec::new(); + let mut range_sets = Vec::new(); + // A stack of alternates + struct PendingAlternates { + prefix: Vec, + branches: Vec>, + } + let mut left_bracket_starts: Vec = Vec::new(); + // We avoid synthesizing an AST and compile directly from the lexer, this adds a touch of + // trickiness for alternations but that can be managed by a single stack and avoids the + // complexity of building and visiting an AST. This simple program also allows + // trivial optimizations to be performed during compilation inline. + loop { + let t = tok.next_token(); + match t { + GlobToken::Literal(lit) => { + instructions.push(GlobInstruction::MatchLiteral(lit)); + } + GlobToken::Star => { + instructions.push(GlobInstruction::MatchAnyNonDelim); + // After a star match we either loop back to try again, or proceed. + instructions.push(GlobInstruction::Fork(-1)); + } + GlobToken::GlobStar => { + if instructions.len() >= 2 + && matches!( + instructions.last().unwrap(), + GlobInstruction::MatchGlobStar { .. } + ) + { + // This is a redundant globstar + // e.g. /**/**/ + // just skip this one. + } else { + // allow globstars to match nothing by skipping it + instructions.push(GlobInstruction::Fork(2)); + // We don't actually know if it is terminal or not yet. We will determing + // this after code generation. + instructions.push(GlobInstruction::MatchGlobStar { terminal: false }); + } + } + GlobToken::QuestionMark => { + // A question match, optionally matches a non- delimiter character, so we either + // skip forward or not. + instructions.push(GlobInstruction::Fork(2)); + instructions.push(GlobInstruction::MatchAnyNonDelim); + } + GlobToken::LSquareBracket => { + let mut set = RangeSet::new(); + // with in a square brachet we expect a mix of literals and hyphens followed by + // a RSuareBracket + let mut prev_literal: Option = None; + let mut partial_range: Option = None; + loop { + let t = tok.next_token(); + match t { + GlobToken::Literal(lit) => { + debug_assert!( + prev_literal.is_none(), + "impossible, tokenizer failed to merge literals or we failed \ + to handle a hyphen correctly" + ); + if let Some(start) = partial_range { + set.add_range(start, lit); + partial_range = None; + } else { + if let Some(prev) = prev_literal { + set.add_singleton(prev); + } + prev_literal = Some(lit); + } + } + GlobToken::Hyphen => { + if let Some(lit) = prev_literal { + prev_literal = None; + partial_range = Some(lit); + } else { + bail!("Unexpected hyphen at the beginning of a character class") + } + } + + GlobToken::RSquareBracket => { + if let Some(lit) = prev_literal { + set.add_singleton(lit); + } + if partial_range.is_some() { + bail!("unexpectedm hyphen at the end of a character class") + } + set.finalize(); + let index = range_sets.len(); + range_sets.push(set); + instructions + .push(GlobInstruction::MatchClass(index.try_into().unwrap())); + break; + } + _ => { + bail!("Unexpected token {t} inside of character class"); + } + } + } + } + GlobToken::RSquareBracket | GlobToken::Hyphen => panic!( + "should never happen, tokenizer should have already rejected or been consumed \ + within another branch" + ), + GlobToken::LBracket => { + let mut tmp = Vec::new(); + std::mem::swap(&mut instructions, &mut tmp); + left_bracket_starts.push(PendingAlternates { + prefix: tmp, + branches: Vec::new(), + }); + } + GlobToken::Comma | GlobToken::RBracket => { + // We don't need to check this, the tokenizer enforces that these tokens can + // only occur inside of an alternation so there must be something on our stack. + let pending = left_bracket_starts.last_mut().unwrap(); + // for an alternation we either 'jump' past it to the next one, or we process it + // directly + + let mut branch = Vec::new(); + std::mem::swap(&mut instructions, &mut branch); + pending.branches.push(branch); + + // If we hit a `}` then we are done so compute the jumps and pop the prefix. + if t == GlobToken::RBracket { + let num_branches = pending.branches.len(); + if num_branches == 1 { + bail!("Cannot have an alternation with only one member"); + } + // TODO(lukesandberg): consider looking for common suffixes or prefixes, + // shouldn't be common + + // The length of each branch plus one for the jump to the end + // For each alternative we have a `Fork` instruction so account for those + let mut next_branch_offset = num_branches - 1; + for branch in &pending.branches[0..num_branches - 1] { + // to jump past the branch we need to jump past all its instructions +1 + // to account for the JUMP instruction at the end + next_branch_offset += branch.len() + 1; + pending.prefix.push(GlobInstruction::Fork( + next_branch_offset.try_into().unwrap(), + )); + next_branch_offset -= 1; // subtract one since we added a fork + // instruction. + } + // NOTE: The last branch doesn't get a JUMP instruction, so no `+1` + // this is the relative offet to the end, from the very beginning + // however we aren't starting at the beginning, we already added + // `num_branches-1` `FORK` instructions` + let mut end_of_alternation = + next_branch_offset + pending.branches.last().unwrap().len(); + for branch in &mut pending.branches[0..num_branches - 1] { + end_of_alternation -= branch.len(); // from the end of this branch, this is how far it is to the end of the + // alternation + pending.prefix.extend(branch.drain(..)); + pending.prefix.push(GlobInstruction::Jump( + end_of_alternation.try_into().unwrap(), + )); + end_of_alternation -= 1; // account for the jump instruction + } + end_of_alternation -= pending.branches.last().unwrap().len(); + pending + .prefix + .extend(pending.branches.last_mut().unwrap().drain(..)); + debug_assert!(end_of_alternation == 0); + std::mem::swap(&mut instructions, &mut pending.prefix); + left_bracket_starts.pop(); + } + } + GlobToken::End => { + instructions.push(GlobInstruction::Match); + break; + } + } + } + if let Some(err) = tok.err { + return Err(err); + } + + // Now we need to annotate 'terminal' globstars in order to speed up validating program + // matches The issue is that if we are executing a globstar when the program + // completes then pass or fail is dependant upon whether there are unconditioal require + // subsequent matches e.g. `foo/**` matches `foo/bar/baz.js` but foo/**/a` does not, + // because we have to match that 'a' similarly `foo/{**,bar}` matches + // `foo/bar/baz.js` Do determin this we need to chase pointers from globstars to the + // end of the program, and if there is a path to match then it is a terminal globstar. + + let mut visited = SparseSet::new(instructions.len()); + for start in 0..instructions.len() { + if matches!(instructions[start], GlobInstruction::MatchGlobStar { .. }) { + visited.add(start + 1); + let mut thread_index = 0; + while thread_index < visited.n { + let ip = visited.get(thread_index); + match &instructions[ip] { + GlobInstruction::Fork(offset) => { + visited.add(ip + 1); + visited.add(((*offset as isize) + (ip as isize)) as usize); + } + GlobInstruction::Jump(offset) => { + visited.add(((*offset as isize) + (ip as isize)) as usize); + } + GlobInstruction::Match => { + // There is a path to the end, update to `terminal` + instructions[start] = GlobInstruction::MatchGlobStar { terminal: true }; + break; + } + _ => { + break; + } + } + thread_index += 1; + } + + // Check if this has an unconditional path to Match + + visited.clear(); + } + } + + Ok(GlobProgram { + instructions, + range_sets, + }) + } + + pub fn matches(&self, v: &str, prefix: bool) -> bool { + let len = self.instructions.len(); + let mut cur = SparseSet::new(len); + let mut next = SparseSet::new(len); + cur.add(0); + // Process all characters in order + // It would be faster to process `bytes` but this adds complexity in a number of places + // TODO(lukesandberg): optimize for ascii bytes since those are legion in filepaths, put + // multibyte characters in separate instructions + for c in v.chars() { + let mut thread_index = 0; + // We need to use this looping construct since we may add elements to `cur` as we go. + // `cur.n` will never be > `len` so this loop is bounded to N iterations. + while thread_index < cur.n { + let ip = cur.get(thread_index); + let insn = &self.instructions[ip]; + match insn { + GlobInstruction::MatchLiteral(m) => { + if c == *m { + // We matched, proceed to the next character + next.add(ip + 1); + } + } + GlobInstruction::MatchAnyNonDelim => { + if c != '/' { + next.add(ip + 1); + } + } + GlobInstruction::MatchGlobStar { terminal } => { + if *terminal { + // If we find a terminal globstar, we are done! this must match + return true; + } + // If we see a `/` then we need to consider ending the globstar. + if c == '/' { + next.add(ip + 1); + } + // but even so we should keep trying to match, just like a fork. + next.add(ip); + } + GlobInstruction::MatchClass(set_index) => { + if self.range_sets[*set_index as usize].contains(c) { + next.add(ip + 1); + } + } + GlobInstruction::Jump(offset) => { + // Push another thread onto the current list + cur.add(((*offset as isize) + (ip as isize)) as usize); + } + GlobInstruction::Fork(offset) => { + cur.add(ip + 1); + cur.add(((*offset as isize) + (ip as isize)) as usize); + } + GlobInstruction::Match => { + // We ran out of instructions while we still have characters + // so this thread dies + } + } + thread_index += 1; + } + if next.n == 0 { + // This means that all threads exited early. This isn't needed for correctness, + // but there is no point iterating the rest of the characters. + return false; + } + // We have some progress! clear current and swap the two lists to advance to the next + // character. + cur.clear(); + std::mem::swap(&mut cur, &mut next); + } + // If we get here we have matched a prefix of the instructions and run out of text. + // So in prefix mode we are done, otherwise we need to check if we hit the end of the + // isntructions + if prefix { + return true; + } + + // We matched if there is some path from any current thread to the end, so we need to + // process all jumps and forks + let mut i = 0; + while i < cur.n { + let ip = cur.get(i); + match &self.instructions[ip] { + GlobInstruction::Jump(offset) => { + // Push another thread onto the current list + cur.add(((*offset as isize) + (ip as isize)) as usize); + } + GlobInstruction::Fork(offset) => { + cur.add(ip + 1); + cur.add(((*offset as isize) + (ip as isize)) as usize); + } + GlobInstruction::Match => { + return true; + } + _ => { + //ignore + } + } + i += 1; + } + // We are checking that we hit the end, which is always a Match instruction, so if any + // of the threads found it, then it is a match. + + debug_assert!(self.instructions[len - 1] == GlobInstruction::Match); + return cur.contains(len - 1); + } +} + +// Consider a more compact encoding. +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +enum GlobInstruction { + // Matches a single literal char + MatchLiteral(char), /* N.B. this bloats the enum size to 8 even though chars only need 21 + * bits. */ + // Matches any non-`/` character + MatchAnyNonDelim, + // Matches **, which is any character but can only 'end' on a `/` or end of string + MatchGlobStar { terminal: bool }, + // Matches any character in the set + MatchClass(u32), + // Unconditional jump. This would occur at the end of an alternate to jump past the other + // alternates. + Jump(i32), + // Splits control flow into two branches. + Fork(i32), + // End of program + Match, +} + +#[derive(Debug, Eq, PartialEq)] +enum GlobToken { + // A sequence of bytes, possibly including `/` characters + // all bytes are unescaped. + Literal(char), + // a `*` token + Star, + // a `**` token + GlobStar, + // A `?` token` + QuestionMark, + // a `[` token + LSquareBracket, + // a `]` token + RSquareBracket, + // a `{` token + LBracket, + // a `}` token + RBracket, + // a `,` token + Comma, + // a `-` token + Hyphen, + End, +} +impl Display for GlobToken { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if let GlobToken::Literal(c) = self { + let s = c.to_string(); + f.write_str(&s) + } else { + f.write_str(match self { + GlobToken::Star => "*", + GlobToken::GlobStar => "**", + GlobToken::QuestionMark => "?", + GlobToken::LSquareBracket => "[", + GlobToken::RSquareBracket => "]", + GlobToken::LBracket => "{", + GlobToken::RBracket => "}", + GlobToken::Comma => ",", + GlobToken::Hyphen => "-", + GlobToken::End => "", + GlobToken::Literal(_) => panic!("impossible"), + }) + } + } +} + +struct Tokenizer<'a> { + input: Peekable>, + pos: usize, + err: Option, + bracket_count: u32, + square_bracket_count: u32, +} + +impl<'a> Tokenizer<'a> { + fn new(input: &'a str) -> Tokenizer<'a> { + Self { + input: input.chars().peekable(), + pos: 0, + err: None, + bracket_count: 0, + square_bracket_count: 0, + } + } + fn next_token(&mut self) -> GlobToken { + self.pos += 1; + match self.input.next() { + None => GlobToken::End, + Some(c) => { + match c { + '*' if self.square_bracket_count == 0 => match self.input.next_if_eq(&'*') { + Some(_) => { + // TODO: we should validate that the previous character was a `/` as + // well, but this isn't a big deal/ + self.pos += 1; + match self.input.next() { + None => {} + Some('/') => { + self.pos += 1; + } // ok if we are at the end or followed by a `/` + _ => { + self.err = Some(anyhow!( + "a glob star must be a full path segment, e.g. `/**/`" + )); + return GlobToken::End; + } + } + + GlobToken::GlobStar + } + _ => GlobToken::Star, + }, + '?' if self.square_bracket_count == 0 => GlobToken::QuestionMark, + '[' => { + self.square_bracket_count += 1; + GlobToken::LSquareBracket + } + ']' => { + if self.square_bracket_count == 0 { + self.err = Some(anyhow!( + "mismatched square brackets at {} in glob", + self.pos + )); + return GlobToken::End; + } + self.square_bracket_count -= 1; + GlobToken::RSquareBracket + } + // These are not tokens inside of a character class, they are just literals + '{' if self.square_bracket_count == 0 => { + self.bracket_count += 1; + GlobToken::LBracket + } + '}' if self.square_bracket_count == 0 => { + if self.bracket_count == 0 { + self.err = Some(anyhow!("mismatched brackets at {} in glob", self.pos)); + return GlobToken::End; + } + + self.bracket_count -= 1; + GlobToken::RBracket + } + // This is only a meaninful token inside of a character class + '-' if self.square_bracket_count > 0 => GlobToken::Hyphen, + // This is only meaningful inside of an alternation (aka brackets) + ',' if self.bracket_count > 0 => GlobToken::Comma, + cur => { + if cur == '\\' { + match self.input.next() { + Some(c) => { + self.pos += 1; + GlobToken::Literal(c) + } + None => { + self.err = Some(anyhow!("found `\\` character at end of glob")); + GlobToken::End + } + } + } else { + GlobToken::Literal(cur) + } + } + } + } + } + } +} + impl GlobPart { /// Iterates over all possible matches of this part with the provided path. /// The least greedy match is returned first. This is usually used for @@ -433,7 +1066,8 @@ impl Glob { mod tests { use rstest::*; - use super::Glob; + use super::{Glob, GlobToken, Tokenizer}; + use crate::glob::GlobProgram; #[rstest] #[case::file("file.js", "file.js")] @@ -507,11 +1141,17 @@ mod tests { #[case::alternatives_nested4("{a,b/c,d/e/{f,g/h}}", "d/e/g/h")] // #[case::alternatives_chars("[abc]", "b")] fn glob_match(#[case] glob: &str, #[case] path: &str) { - let glob = Glob::parse(glob).unwrap(); + let parsed = Glob::parse(glob).unwrap(); + + println!("{glob:?} compiled to {parsed:?} matching {path}"); - println!("{glob:?} {path}"); + assert!(parsed.execute(path)); - assert!(glob.execute(path)); + let parsed2 = GlobProgram::compile(glob).unwrap(); + println!("{glob:?} compiled to\n{parsed2:#?}\nmatching {path}"); + let prefix = path.ends_with('/'); + + assert!(parsed2.matches(path, prefix)); } #[rstest] @@ -521,10 +1161,35 @@ mod tests { "next/dist/shared/lib/app-router-context.shared-runtime.js" )] fn glob_not_matching(#[case] glob: &str, #[case] path: &str) { - let glob = Glob::parse(glob).unwrap(); + let parsed = Glob::parse(glob).unwrap(); + + println!("{glob:?} compiled to {parsed:?} matching {path}"); - println!("{glob:?} {path}"); + assert!(!parsed.execute(path)); - assert!(!glob.execute(path)); + let parsed2 = GlobProgram::compile(glob).unwrap(); + println!("{glob:?} compiled to {parsed2:?} matching {path}"); + let prefix = path.ends_with('/'); + + assert!(!parsed2.matches(path, prefix)); + } + + #[test] + fn test_tokenizer() { + let mut tok = Tokenizer::new("foo/bar[a-z]/?/**"); + let prefix: Vec = "foo/bar".chars().map(|c| GlobToken::Literal(c)).collect(); + for t in prefix { + assert_eq!(t, tok.next_token()); + } + assert_eq!(GlobToken::LSquareBracket, tok.next_token()); + assert_eq!(GlobToken::Literal('a'), tok.next_token()); + assert_eq!(GlobToken::Hyphen, tok.next_token()); + assert_eq!(GlobToken::Literal('z'), tok.next_token()); + assert_eq!(GlobToken::RSquareBracket, tok.next_token()); + assert_eq!(GlobToken::Literal('/'), tok.next_token()); + assert_eq!(GlobToken::QuestionMark, tok.next_token()); + assert_eq!(GlobToken::Literal('/'), tok.next_token()); + assert_eq!(GlobToken::GlobStar, tok.next_token()); + assert_eq!(GlobToken::End, tok.next_token()); } } From 08e8830e701ad2d9e6c0d5eb0b37b1f21602f44d Mon Sep 17 00:00:00 2001 From: Luke Sandberg Date: Thu, 1 May 2025 12:12:05 -0700 Subject: [PATCH 03/17] Take an ascii oriented approach Reduce the size of our instructions Reduce the maximum program size Optimize sparse set to reduce field reads by interlacing the sparse and dense ranges With this `fast-glob` is still 4x as a fast for a single match, but only 2x as fast when 'precompiled' which is our usecase. --- Cargo.toml | 6 + turbopack/crates/turbo-tasks-fs/src/glob.rs | 272 +++++++++++--------- 2 files changed, 159 insertions(+), 119 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index a43f848f957e0..42084ac7e689e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -240,6 +240,12 @@ opt-level = 3 [profile.release.package.serde] opt-level = 3 + +[profile.profiling] +inherits = "release" +debug = true + + [workspace.dependencies] # Workspace crates next-api = { path = "crates/next-api" } diff --git a/turbopack/crates/turbo-tasks-fs/src/glob.rs b/turbopack/crates/turbo-tasks-fs/src/glob.rs index b903920a7bc2d..7bccde94b7eb7 100644 --- a/turbopack/crates/turbo-tasks-fs/src/glob.rs +++ b/turbopack/crates/turbo-tasks-fs/src/glob.rs @@ -1,4 +1,4 @@ -use std::{fmt::Display, iter::Peekable, mem::take, str::Chars}; +use std::{fmt::Display, mem::take}; use anyhow::{anyhow, bail, Context, Result}; use serde::{Deserialize, Serialize}; @@ -157,18 +157,27 @@ impl<'a> Iterator for GlobMatchesIterator<'a> { } #[derive(Debug, Eq, PartialEq, Copy, Clone)] -struct CharRange { - low: char, - high: char, +struct ByteRange { + low: u8, + high: u8, } -impl PartialOrd for CharRange { +impl ByteRange { + fn singleton(b: u8) -> Self { + Self { low: b, high: b } + } + fn range(low: u8, high: u8) -> Self { + Self { low, high } + } +} + +impl PartialOrd for ByteRange { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } // Lower `lows` come first, if two `lows` match then the large range comes first // This makes the merge logic simpler. -impl Ord for CharRange { +impl Ord for ByteRange { fn cmp(&self, other: &Self) -> std::cmp::Ordering { match self.low.cmp(&other.low) { std::cmp::Ordering::Equal => other.high.cmp(&self.high), @@ -180,7 +189,7 @@ impl Ord for CharRange { #[derive(Debug, Eq, Clone, PartialEq)] #[repr(transparent)] struct RangeSet { - ranges: Vec, + ranges: Vec, } impl RangeSet { @@ -188,17 +197,15 @@ impl RangeSet { Self { ranges: Vec::new() } } - fn contains(&self, c: char) -> bool { - self.ranges - .binary_search(&CharRange { low: c, high: c }) - .is_ok() + fn contains(&self, b: u8) -> bool { + self.ranges.binary_search(&ByteRange::singleton(b)).is_ok() } - fn add_singleton(&mut self, c: char) { - self.ranges.push(CharRange { low: c, high: c }); + fn add_singleton(&mut self, b: u8) { + self.ranges.push(ByteRange::singleton(b)); } - fn add_range(&mut self, low: char, high: char) { - self.ranges.push(CharRange { low, high }); + fn add_range(&mut self, low: u8, high: u8) { + self.ranges.push(ByteRange::range(low, high)); } // Sorts and merges redundant entries. @@ -218,59 +225,57 @@ impl RangeSet { // A sparse set of integers that supports O(1) insertion, testing, clearing and O(n) iteration. // https://research.swtch.com/sparse -struct SparseSet { - mem: Vec, - n: usize, - max: usize, +struct SparseSet<'a> { + mem: &'a mut [u16], + n: u16, } -impl SparseSet { - fn new(size: usize) -> Self { - Self { - mem: unsafe { - std::mem::transmute(vec![std::mem::MaybeUninit::::uninit(); size * 2]) - }, - n: 0, - max: size, - } +impl<'a> SparseSet<'a> { + fn from_storage(mem: &'a mut [u16]) -> Self { + debug_assert!(mem.len() & 1 == 0, "mem must have an even length"); + Self { mem, n: 0 } } + // Sparse entries are stored at the even indices and dense entries are stored at the odd ones + // this allows us to compute addresses withough referencing the length of the slice. #[inline] - fn sparse(&self) -> &[usize] { - return &self.mem[0..self.max]; + fn get_sparse(&self, i: u16) -> u16 { + return self.mem[i as usize * 2]; } #[inline] - fn dense(&self) -> &[usize] { - return &self.mem[self.max..]; + fn get_sparse_mut(&mut self, i: u16) -> &mut u16 { + return &mut self.mem[i as usize * 2]; } #[inline] - fn sparse_mut(&mut self) -> &mut [usize] { - return &mut self.mem[0..self.max]; + fn get_dense(&self, i: u16) -> u16 { + return self.mem[i as usize * 2 + 1]; } #[inline] - fn dense_mut(&mut self) -> &mut [usize] { - return &mut self.mem[self.max..]; - } - fn contains(&self, v: usize) -> bool { - debug_assert!(v < self.max); - let sparse = self.sparse()[v]; - sparse < self.n && self.dense()[sparse] == v + fn get_dense_mut(&mut self, i: u16) -> &mut u16 { + return &mut self.mem[i as usize * 2 + 1]; } + fn clear(&mut self) { self.n = 0 } - fn add(&mut self, v: usize) { - debug_assert!(v < self.max); - if !self.contains(v) { - let n = self.n; - self.dense_mut()[n] = v; - self.sparse_mut()[v] = n; - self.n += 1; + fn add(&mut self, v: u16) { + debug_assert!((v as usize) < self.mem.len() / 2); + let n = self.n; + let s = self.get_sparse(v); + // The value is already in the set if + // the sparse pointer is in range and the value at that + // index is `v` + if s < n && self.get_dense(s) == v { + // this value is already in the set. + return; } + *self.get_sparse_mut(v) = n; + *self.get_dense_mut(n) = v; + self.n += 1; } - fn get(&self, i: usize) -> usize { + fn get(&self, i: u16) -> u16 { debug_assert!(i < self.n); - return self.dense()[i]; + return self.get_dense(i); } } @@ -334,12 +339,21 @@ impl GlobProgram { let mut set = RangeSet::new(); // with in a square brachet we expect a mix of literals and hyphens followed by // a RSuareBracket - let mut prev_literal: Option = None; - let mut partial_range: Option = None; + let mut prev_literal: Option = None; + let mut partial_range: Option = None; loop { let t = tok.next_token(); match t { GlobToken::Literal(lit) => { + if lit > 127 { + // TODO(lukesandberg): These are supportable by expanding into + // several RanngeSets for + // each byte in the multibyte characters + // However, this is very unlikely to be required by a user so + // for now the feature is + // omitted. + bail!("Unsupported non-ascii character in set"); + } debug_assert!( prev_literal.is_none(), "impossible, tokenizer failed to merge literals or we failed \ @@ -374,8 +388,11 @@ impl GlobProgram { set.finalize(); let index = range_sets.len(); range_sets.push(set); - instructions - .push(GlobInstruction::MatchClass(index.try_into().unwrap())); + instructions.push(GlobInstruction::MatchClass( + index + .try_into() + .context("Cannot have more than 255 range sets")?, + )); break; } _ => { @@ -424,7 +441,9 @@ impl GlobProgram { // to account for the JUMP instruction at the end next_branch_offset += branch.len() + 1; pending.prefix.push(GlobInstruction::Fork( - next_branch_offset.try_into().unwrap(), + next_branch_offset.try_into().context( + "program too large, cannot have more than 64K instructions", + )?, )); next_branch_offset -= 1; // subtract one since we added a fork // instruction. @@ -440,7 +459,9 @@ impl GlobProgram { // alternation pending.prefix.extend(branch.drain(..)); pending.prefix.push(GlobInstruction::Jump( - end_of_alternation.try_into().unwrap(), + end_of_alternation.try_into().context( + "program too large, cannot have more than 64K instructions", + )?, )); end_of_alternation -= 1; // account for the jump instruction } @@ -471,24 +492,32 @@ impl GlobProgram { // `foo/bar/baz.js` Do determin this we need to chase pointers from globstars to the // end of the program, and if there is a path to match then it is a terminal globstar. - let mut visited = SparseSet::new(instructions.len()); - for start in 0..instructions.len() { - if matches!(instructions[start], GlobInstruction::MatchGlobStar { .. }) { + if instructions.len() > u16::MAX as usize { + bail!("program too large"); + } + let mut storage = vec![0; instructions.len() * 2]; + let mut visited = SparseSet::from_storage(storage.as_mut_slice()); + for start in 0..(instructions.len() as u16) { + if matches!( + instructions[start as usize], + GlobInstruction::MatchGlobStar { .. } + ) { visited.add(start + 1); let mut thread_index = 0; while thread_index < visited.n { let ip = visited.get(thread_index); - match &instructions[ip] { + match &instructions[ip as usize] { GlobInstruction::Fork(offset) => { visited.add(ip + 1); - visited.add(((*offset as isize) + (ip as isize)) as usize); + visited.add(((*offset as i16) + (ip as i16)) as u16); } GlobInstruction::Jump(offset) => { - visited.add(((*offset as isize) + (ip as isize)) as usize); + visited.add(ip + *offset); } GlobInstruction::Match => { // There is a path to the end, update to `terminal` - instructions[start] = GlobInstruction::MatchGlobStar { terminal: true }; + instructions[start as usize] = + GlobInstruction::MatchGlobStar { terminal: true }; break; } _ => { @@ -512,56 +541,63 @@ impl GlobProgram { pub fn matches(&self, v: &str, prefix: bool) -> bool { let len = self.instructions.len(); - let mut cur = SparseSet::new(len); - let mut next = SparseSet::new(len); + // Use a single uninitialize allocation for our storage. + let mut storage: Vec = + unsafe { std::mem::transmute(vec![std::mem::MaybeUninit::::uninit(); len * 4]) }; + let (set1, set2) = storage.split_at_mut(len * 2); + let mut set1 = SparseSet::from_storage(set1); + let mut set2 = SparseSet::from_storage(set2); + // Access via references to make the swap operations cheaper. + let mut cur = &mut set1; + let mut next = &mut set2; cur.add(0); // Process all characters in order // It would be faster to process `bytes` but this adds complexity in a number of places - // TODO(lukesandberg): optimize for ascii bytes since those are legion in filepaths, put - // multibyte characters in separate instructions - for c in v.chars() { + + let bytes = v.as_bytes(); + for i in 0..bytes.len() { + let byte = bytes[i]; let mut thread_index = 0; // We need to use this looping construct since we may add elements to `cur` as we go. // `cur.n` will never be > `len` so this loop is bounded to N iterations. while thread_index < cur.n { let ip = cur.get(thread_index); - let insn = &self.instructions[ip]; - match insn { + match self.instructions[ip as usize] { GlobInstruction::MatchLiteral(m) => { - if c == *m { + if byte == m { // We matched, proceed to the next character next.add(ip + 1); } } GlobInstruction::MatchAnyNonDelim => { - if c != '/' { + if byte != b'/' { next.add(ip + 1); } } GlobInstruction::MatchGlobStar { terminal } => { - if *terminal { + if terminal { // If we find a terminal globstar, we are done! this must match return true; } // If we see a `/` then we need to consider ending the globstar. - if c == '/' { + if byte == b'/' { next.add(ip + 1); } // but even so we should keep trying to match, just like a fork. next.add(ip); } GlobInstruction::MatchClass(set_index) => { - if self.range_sets[*set_index as usize].contains(c) { + if self.range_sets[set_index as usize].contains(byte) { next.add(ip + 1); } } GlobInstruction::Jump(offset) => { // Push another thread onto the current list - cur.add(((*offset as isize) + (ip as isize)) as usize); + cur.add(offset + ip); } GlobInstruction::Fork(offset) => { cur.add(ip + 1); - cur.add(((*offset as isize) + (ip as isize)) as usize); + cur.add((offset + (ip as i16)) as u16); } GlobInstruction::Match => { // We ran out of instructions while we still have characters @@ -592,14 +628,14 @@ impl GlobProgram { let mut i = 0; while i < cur.n { let ip = cur.get(i); - match &self.instructions[ip] { + match &self.instructions[ip as usize] { GlobInstruction::Jump(offset) => { // Push another thread onto the current list - cur.add(((*offset as isize) + (ip as isize)) as usize); + cur.add(*offset + ip); } GlobInstruction::Fork(offset) => { cur.add(ip + 1); - cur.add(((*offset as isize) + (ip as isize)) as usize); + cur.add((*offset + (ip as i16)) as u16); } GlobInstruction::Match => { return true; @@ -610,31 +646,29 @@ impl GlobProgram { } i += 1; } - // We are checking that we hit the end, which is always a Match instruction, so if any - // of the threads found it, then it is a match. - - debug_assert!(self.instructions[len - 1] == GlobInstruction::Match); - return cur.contains(len - 1); + false } } // Consider a more compact encoding. #[derive(Debug, PartialEq, Eq, Clone, Copy)] enum GlobInstruction { - // Matches a single literal char - MatchLiteral(char), /* N.B. this bloats the enum size to 8 even though chars only need 21 - * bits. */ + // Matches a single literal byte + MatchLiteral(u8), /* N.B. this bloats the enum size to 8 even though chars only need 21 + * bits. */ // Matches any non-`/` character MatchAnyNonDelim, // Matches **, which is any character but can only 'end' on a `/` or end of string MatchGlobStar { terminal: bool }, // Matches any character in the set - MatchClass(u32), - // Unconditional jump. This would occur at the end of an alternate to jump past the other - // alternates. - Jump(i32), + // The value is an index into the ranges + MatchClass(u8), + // Unconditional jump forward. This would occur at the end of an alternate to jump past the + // other alternates. + Jump(u16), // Splits control flow into two branches. - Fork(i32), + // Represented as a signed integer since we allow backwards forks + Fork(i16), // End of program Match, } @@ -643,7 +677,7 @@ enum GlobInstruction { enum GlobToken { // A sequence of bytes, possibly including `/` characters // all bytes are unescaped. - Literal(char), + Literal(u8), // a `*` token Star, // a `**` token @@ -688,7 +722,7 @@ impl Display for GlobToken { } struct Tokenizer<'a> { - input: Peekable>, + input: &'a [u8], pos: usize, err: Option, bracket_count: u32, @@ -698,7 +732,7 @@ struct Tokenizer<'a> { impl<'a> Tokenizer<'a> { fn new(input: &'a str) -> Tokenizer<'a> { Self { - input: input.chars().peekable(), + input: input.as_bytes(), pos: 0, err: None, bracket_count: 0, @@ -706,19 +740,19 @@ impl<'a> Tokenizer<'a> { } } fn next_token(&mut self) -> GlobToken { - self.pos += 1; - match self.input.next() { + match self.input.get(self.pos) { None => GlobToken::End, Some(c) => { + self.pos += 1; match c { - '*' if self.square_bracket_count == 0 => match self.input.next_if_eq(&'*') { - Some(_) => { + b'*' if self.square_bracket_count == 0 => match self.input.get(self.pos) { + Some(b) if *b == b'*' => { // TODO: we should validate that the previous character was a `/` as - // well, but this isn't a big deal/ + // well self.pos += 1; - match self.input.next() { + match self.input.get(self.pos) { None => {} - Some('/') => { + Some(b'/') => { self.pos += 1; } // ok if we are at the end or followed by a `/` _ => { @@ -733,12 +767,12 @@ impl<'a> Tokenizer<'a> { } _ => GlobToken::Star, }, - '?' if self.square_bracket_count == 0 => GlobToken::QuestionMark, - '[' => { + b'?' if self.square_bracket_count == 0 => GlobToken::QuestionMark, + b'[' => { self.square_bracket_count += 1; GlobToken::LSquareBracket } - ']' => { + b']' => { if self.square_bracket_count == 0 { self.err = Some(anyhow!( "mismatched square brackets at {} in glob", @@ -750,11 +784,11 @@ impl<'a> Tokenizer<'a> { GlobToken::RSquareBracket } // These are not tokens inside of a character class, they are just literals - '{' if self.square_bracket_count == 0 => { + b'{' if self.square_bracket_count == 0 => { self.bracket_count += 1; GlobToken::LBracket } - '}' if self.square_bracket_count == 0 => { + b'}' if self.square_bracket_count == 0 => { if self.bracket_count == 0 { self.err = Some(anyhow!("mismatched brackets at {} in glob", self.pos)); return GlobToken::End; @@ -764,15 +798,15 @@ impl<'a> Tokenizer<'a> { GlobToken::RBracket } // This is only a meaninful token inside of a character class - '-' if self.square_bracket_count > 0 => GlobToken::Hyphen, + b'-' if self.square_bracket_count > 0 => GlobToken::Hyphen, // This is only meaningful inside of an alternation (aka brackets) - ',' if self.bracket_count > 0 => GlobToken::Comma, + b',' if self.bracket_count > 0 => GlobToken::Comma, cur => { - if cur == '\\' { - match self.input.next() { + if *cur == b'\\' { + match self.input.get(self.pos) { Some(c) => { self.pos += 1; - GlobToken::Literal(c) + GlobToken::Literal(*c) } None => { self.err = Some(anyhow!("found `\\` character at end of glob")); @@ -780,7 +814,7 @@ impl<'a> Tokenizer<'a> { } } } else { - GlobToken::Literal(cur) + GlobToken::Literal(*cur) } } } @@ -1177,18 +1211,18 @@ mod tests { #[test] fn test_tokenizer() { let mut tok = Tokenizer::new("foo/bar[a-z]/?/**"); - let prefix: Vec = "foo/bar".chars().map(|c| GlobToken::Literal(c)).collect(); + let prefix: Vec = "foo/bar".bytes().map(|c| GlobToken::Literal(c)).collect(); for t in prefix { assert_eq!(t, tok.next_token()); } assert_eq!(GlobToken::LSquareBracket, tok.next_token()); - assert_eq!(GlobToken::Literal('a'), tok.next_token()); + assert_eq!(GlobToken::Literal(b'a'), tok.next_token()); assert_eq!(GlobToken::Hyphen, tok.next_token()); - assert_eq!(GlobToken::Literal('z'), tok.next_token()); + assert_eq!(GlobToken::Literal(b'z'), tok.next_token()); assert_eq!(GlobToken::RSquareBracket, tok.next_token()); - assert_eq!(GlobToken::Literal('/'), tok.next_token()); + assert_eq!(GlobToken::Literal(b'/'), tok.next_token()); assert_eq!(GlobToken::QuestionMark, tok.next_token()); - assert_eq!(GlobToken::Literal('/'), tok.next_token()); + assert_eq!(GlobToken::Literal(b'/'), tok.next_token()); assert_eq!(GlobToken::GlobStar, tok.next_token()); assert_eq!(GlobToken::End, tok.next_token()); } From f99f481eff09f7fb2b5fc095c95243b046b50b4c Mon Sep 17 00:00:00 2001 From: Luke Sandberg Date: Thu, 1 May 2025 13:22:00 -0700 Subject: [PATCH 04/17] tiny improvement --- turbopack/crates/turbo-tasks-fs/src/glob.rs | 41 ++++++++++++++------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/turbopack/crates/turbo-tasks-fs/src/glob.rs b/turbopack/crates/turbo-tasks-fs/src/glob.rs index 7bccde94b7eb7..5f6097508c0f0 100644 --- a/turbopack/crates/turbo-tasks-fs/src/glob.rs +++ b/turbopack/crates/turbo-tasks-fs/src/glob.rs @@ -258,7 +258,7 @@ impl<'a> SparseSet<'a> { fn clear(&mut self) { self.n = 0 } - fn add(&mut self, v: u16) { + fn add(&mut self, v: u16) -> bool { debug_assert!((v as usize) < self.mem.len() / 2); let n = self.n; let s = self.get_sparse(v); @@ -267,11 +267,12 @@ impl<'a> SparseSet<'a> { // index is `v` if s < n && self.get_dense(s) == v { // this value is already in the set. - return; + return false; } *self.get_sparse_mut(v) = n; *self.get_dense_mut(n) = v; self.n += 1; + true } fn get(&self, i: u16) -> u16 { debug_assert!(i < self.n); @@ -304,7 +305,13 @@ impl GlobProgram { let t = tok.next_token(); match t { GlobToken::Literal(lit) => { - instructions.push(GlobInstruction::MatchLiteral(lit)); + // If the previous token is a literal mark it as having a successor. + if let Some(GlobInstruction::MatchLiteral((_, has_next))) = + instructions.last_mut() + { + *has_next = true; + } + instructions.push(GlobInstruction::MatchLiteral((lit, false))); } GlobToken::Star => { instructions.push(GlobInstruction::MatchAnyNonDelim); @@ -560,10 +567,11 @@ impl GlobProgram { let mut thread_index = 0; // We need to use this looping construct since we may add elements to `cur` as we go. // `cur.n` will never be > `len` so this loop is bounded to N iterations. - while thread_index < cur.n { + let mut n_threads = cur.n; + while thread_index < n_threads { let ip = cur.get(thread_index); match self.instructions[ip as usize] { - GlobInstruction::MatchLiteral(m) => { + GlobInstruction::MatchLiteral((m, has_next)) => { if byte == m { // We matched, proceed to the next character next.add(ip + 1); @@ -582,6 +590,7 @@ impl GlobProgram { // If we see a `/` then we need to consider ending the globstar. if byte == b'/' { next.add(ip + 1); + } else if n_threads == 1 { } // but even so we should keep trying to match, just like a fork. next.add(ip); @@ -593,11 +602,16 @@ impl GlobProgram { } GlobInstruction::Jump(offset) => { // Push another thread onto the current list - cur.add(offset + ip); + if cur.add(offset + ip) { + n_threads += 1; + } } GlobInstruction::Fork(offset) => { - cur.add(ip + 1); - cur.add((offset + (ip as i16)) as u16); + let added1 = cur.add(ip + 1); + let added2 = cur.add((offset + (ip as i16)) as u16); + if added1 || added2 { + n_threads = cur.n; + } } GlobInstruction::Match => { // We ran out of instructions while we still have characters @@ -628,14 +642,14 @@ impl GlobProgram { let mut i = 0; while i < cur.n { let ip = cur.get(i); - match &self.instructions[ip as usize] { + match self.instructions[ip as usize] { GlobInstruction::Jump(offset) => { // Push another thread onto the current list - cur.add(*offset + ip); + cur.add(offset + ip); } GlobInstruction::Fork(offset) => { cur.add(ip + 1); - cur.add((*offset + (ip as i16)) as u16); + cur.add((offset + (ip as i16)) as u16); } GlobInstruction::Match => { return true; @@ -653,9 +667,8 @@ impl GlobProgram { // Consider a more compact encoding. #[derive(Debug, PartialEq, Eq, Clone, Copy)] enum GlobInstruction { - // Matches a single literal byte - MatchLiteral(u8), /* N.B. this bloats the enum size to 8 even though chars only need 21 - * bits. */ + // Matches a single literal byte, stores if there is a following literal + MatchLiteral((u8, bool)), // Matches any non-`/` character MatchAnyNonDelim, // Matches **, which is any character but can only 'end' on a `/` or end of string From e2b9eb452688b51202612d2afad727121edbbaf4 Mon Sep 17 00:00:00 2001 From: Luke Sandberg Date: Thu, 8 May 2025 15:43:14 -0700 Subject: [PATCH 05/17] Replace the previous glob implementation --- .../crates/turbo-tasks-fs/benches/mod.rs | 8 - turbopack/crates/turbo-tasks-fs/src/glob.rs | 723 ++++++------------ .../crates/turbo-tasks-fs/src/read_glob.rs | 2 +- 3 files changed, 225 insertions(+), 508 deletions(-) diff --git a/turbopack/crates/turbo-tasks-fs/benches/mod.rs b/turbopack/crates/turbo-tasks-fs/benches/mod.rs index 3cb5e1b85fd53..94c61f4e214ef 100644 --- a/turbopack/crates/turbo-tasks-fs/benches/mod.rs +++ b/turbopack/crates/turbo-tasks-fs/benches/mod.rs @@ -128,14 +128,6 @@ fn glob_bench(c: &mut Criterion, name: &'static str, glob: &str, path: &str) { let g = turbo_tasks_fs::glob::Glob::parse(glob).unwrap(); group.bench_function("turbo-glob-cached", |b| b.iter(|| g.execute(path))); - group.bench_function("turbo-glob-2", |b| { - b.iter(|| { - let g = turbo_tasks_fs::glob::GlobProgram::compile(glob).unwrap(); - g.matches(path, false) - }) - }); - let g = turbo_tasks_fs::glob::GlobProgram::compile(glob).unwrap(); - group.bench_function("turbo-glob-2-cached", |b| b.iter(|| g.matches(path, false))); group.finish(); } diff --git a/turbopack/crates/turbo-tasks-fs/src/glob.rs b/turbopack/crates/turbo-tasks-fs/src/glob.rs index 5f6097508c0f0..807757eb41d27 100644 --- a/turbopack/crates/turbo-tasks-fs/src/glob.rs +++ b/turbopack/crates/turbo-tasks-fs/src/glob.rs @@ -1,10 +1,9 @@ -use std::{fmt::Display, mem::take}; +use std::fmt::Display; use anyhow::{anyhow, bail, Context, Result}; use serde::{Deserialize, Serialize}; use turbo_rcstr::RcStr; use turbo_tasks::{trace::TraceRawVcs, NonLocalValue, TryJoinIterExt, Vc}; -use unicode_segmentation::GraphemeCursor; #[derive(PartialEq, Eq, Debug, Clone, TraceRawVcs, Serialize, Deserialize, NonLocalValue)] enum GlobPart { @@ -45,118 +44,36 @@ enum GlobPart { #[turbo_tasks::value] #[derive(Debug, Clone)] pub struct Glob { - expression: Vec, + #[turbo_tasks(trace_ignore)] + program: GlobProgram, } impl Glob { pub fn execute(&self, path: &str) -> bool { // TODO(lukesandberg): deprecate this implicit behavior let match_partial = path.ends_with('/'); - self.iter_matches(path, true, match_partial) - .any(|result| matches!(result, ("", _))) + let path = if match_partial { + &path[0..path.len() - 1] + } else { + path + }; + self.program.matches(path, match_partial) } // Returns true if the glob could match a filename underneath this `path` where the path // represents a directory. pub fn match_in_directory(&self, path: &str) -> bool { - debug_assert!(!path.ends_with('/')); - // TODO(lukesandberg): see if we can avoid this allocation by changing the matching - // algorithm - let path = format!("{path}/"); - self.iter_matches(&path, true, true) - .any(|result| matches!(result, ("", _))) - } - - fn iter_matches<'a>( - &'a self, - path: &'a str, - previous_part_is_path_separator_equivalent: bool, - match_in_directory: bool, - ) -> GlobMatchesIterator<'a> { - GlobMatchesIterator { - current: path, - glob: self, - match_in_directory, - is_path_separator_equivalent: previous_part_is_path_separator_equivalent, - stack: Vec::new(), - index: 0, - } - } - - pub fn parse(input: &str) -> Result { - let mut current = input; - let mut expression = Vec::new(); - - while !current.is_empty() { - let (part, remainder) = GlobPart::parse(current, false) - .with_context(|| anyhow!("Failed to parse glob {input}"))?; - expression.push(part); - current = remainder; - } - - Ok(Glob { expression }) + self.program.matches(path, true) } -} - -struct GlobMatchesIterator<'a> { - current: &'a str, - glob: &'a Glob, - // In this mode we are checking if the glob might match something in the directory represented - // by this path. - match_in_directory: bool, - is_path_separator_equivalent: bool, - stack: Vec>, - index: usize, -} - -impl<'a> Iterator for GlobMatchesIterator<'a> { - type Item = (&'a str, bool); - - fn next(&mut self) -> Option { - loop { - if let Some(part) = self.glob.expression.get(self.index) { - let iter = if let Some(iter) = self.stack.get_mut(self.index) { - iter - } else { - let iter = part.iter_matches( - self.current, - self.is_path_separator_equivalent, - self.match_in_directory, - ); - self.stack.push(iter); - self.stack.last_mut().unwrap() - }; - if let Some((new_path, new_is_path_separator_equivalent)) = iter.next() { - self.current = new_path; - self.is_path_separator_equivalent = new_is_path_separator_equivalent; - - self.index += 1; - if self.match_in_directory && self.current.is_empty() { - return Some(("", self.is_path_separator_equivalent)); - } - } else { - if self.index == 0 { - // failed to match - return None; - } - // backtrack - self.stack.pop(); - self.index -= 1; - } - } else { - // end of expression, matched successfully - - // backtrack for the next iteration - self.index -= 1; - - return Some((self.current, self.is_path_separator_equivalent)); - } - } + pub fn parse(input: &str) -> Result { + Ok(Self { + program: GlobProgram::compile(input)?, + }) } } -#[derive(Debug, Eq, PartialEq, Copy, Clone)] +#[derive(Debug, Eq, PartialEq, Copy, Clone, Serialize, Deserialize, Hash)] struct ByteRange { low: u8, high: u8, @@ -186,7 +103,7 @@ impl Ord for ByteRange { } } -#[derive(Debug, Eq, Clone, PartialEq)] +#[derive(Debug, Eq, Clone, PartialEq, Hash, Serialize, Deserialize)] #[repr(transparent)] struct RangeSet { ranges: Vec, @@ -280,10 +197,39 @@ impl<'a> SparseSet<'a> { } } -#[derive(Debug)] +#[derive(Debug, PartialEq, Eq, Hash, Clone, Serialize, Deserialize)] +struct BitSet { + bits: Box<[u64]>, // use a boxed slice instead of a vec since this is not extendable +} +impl BitSet { + const BITS: usize = { u64::BITS as usize }; + fn new(len: usize) -> Self { + let words = len.div_ceil(Self::BITS); + Self { + bits: vec![0u64; words].into_boxed_slice(), + } + } + fn set(&mut self, bit: usize) { + let word_index = bit / Self::BITS; + let bit_index = bit % Self::BITS; + + self.bits[word_index] |= 1 << bit_index; + } + fn has(&self, bit: usize) -> bool { + let word_index = bit / Self::BITS; + let bit_index = bit % Self::BITS; + self.bits[word_index] & 1 << bit_index != 0 + } +} + +#[derive(Debug, PartialEq, Eq, Hash, Clone, Serialize, Deserialize)] pub struct GlobProgram { - instructions: Vec, - range_sets: Vec, + instructions: Box<[GlobInstruction]>, + // Instructions we can end on that are a valid match + match_instructions: BitSet, + // Instructions we can end on that are a valid prefix match + prefix_match_instructions: BitSet, + range_sets: Box<[RangeSet]>, } impl GlobProgram { @@ -305,39 +251,23 @@ impl GlobProgram { let t = tok.next_token(); match t { GlobToken::Literal(lit) => { - // If the previous token is a literal mark it as having a successor. - if let Some(GlobInstruction::MatchLiteral((_, has_next))) = - instructions.last_mut() - { - *has_next = true; - } - instructions.push(GlobInstruction::MatchLiteral((lit, false))); + instructions.push(GlobInstruction::MatchLiteral(lit)); } GlobToken::Star => { + instructions.push(GlobInstruction::Fork(3)); // allowed to match nothing instructions.push(GlobInstruction::MatchAnyNonDelim); // After a star match we either loop back to try again, or proceed. instructions.push(GlobInstruction::Fork(-1)); } GlobToken::GlobStar => { - if instructions.len() >= 2 - && matches!( - instructions.last().unwrap(), - GlobInstruction::MatchGlobStar { .. } - ) - { - // This is a redundant globstar - // e.g. /**/**/ - // just skip this one. - } else { - // allow globstars to match nothing by skipping it - instructions.push(GlobInstruction::Fork(2)); - // We don't actually know if it is terminal or not yet. We will determing - // this after code generation. - instructions.push(GlobInstruction::MatchGlobStar { terminal: false }); - } + // allow globstars to match nothing by skipping it + instructions.push(GlobInstruction::Fork(2)); + // We don't actually know if it is terminal or not yet. We will determine + // this after code generation. + instructions.push(GlobInstruction::MatchGlobStar { terminal: false }); } GlobToken::QuestionMark => { - // A question match, optionally matches a non- delimiter character, so we either + // A question match, optionally matches a non-delimiter character, so we either // skip forward or not. instructions.push(GlobInstruction::Fork(2)); instructions.push(GlobInstruction::MatchAnyNonDelim); @@ -354,11 +284,9 @@ impl GlobProgram { GlobToken::Literal(lit) => { if lit > 127 { // TODO(lukesandberg): These are supportable by expanding into - // several RanngeSets for - // each byte in the multibyte characters + // several RanngeSets for each byte in the multibyte characters // However, this is very unlikely to be required by a user so - // for now the feature is - // omitted. + // for now the feature is omitted. bail!("Unsupported non-ascii character in set"); } debug_assert!( @@ -390,7 +318,7 @@ impl GlobProgram { set.add_singleton(lit); } if partial_range.is_some() { - bail!("unexpectedm hyphen at the end of a character class") + bail!("unexpectedmhyphen at the end of a character class") } set.finalize(); let index = range_sets.len(); @@ -423,33 +351,40 @@ impl GlobProgram { GlobToken::Comma | GlobToken::RBracket => { // We don't need to check this, the tokenizer enforces that these tokens can // only occur inside of an alternation so there must be something on our stack. - let pending = left_bracket_starts.last_mut().unwrap(); - // for an alternation we either 'jump' past it to the next one, or we process it - // directly + { + let pending = left_bracket_starts.last_mut().unwrap(); + // for an alternation we either 'jump' past it to the next one, or we + // process it directly - let mut branch = Vec::new(); - std::mem::swap(&mut instructions, &mut branch); - pending.branches.push(branch); + let mut branch = Vec::new(); + std::mem::swap(&mut instructions, &mut branch); + pending.branches.push(branch); + } // If we hit a `}` then we are done so compute the jumps and pop the prefix. if t == GlobToken::RBracket { - let num_branches = pending.branches.len(); - if num_branches == 1 { - bail!("Cannot have an alternation with only one member"); + let mut branches = left_bracket_starts.pop().unwrap(); + let mut prefix = &mut branches.prefix; + let branches = &mut branches.branches; + let num_branches = branches.len(); + if num_branches <= 1 { + bail!( + "Cannot have an alternation with less than 2 members, remove the \ + brackets?" + ); } - // TODO(lukesandberg): consider looking for common suffixes or prefixes, - // shouldn't be common + // TODO(lukesandberg): we could eliminate common suffixes and prefixes - // The length of each branch plus one for the jump to the end + // The length of each branch plus one for the jump to the end. // For each alternative we have a `Fork` instruction so account for those let mut next_branch_offset = num_branches - 1; - for branch in &pending.branches[0..num_branches - 1] { + for branch in &branches[0..num_branches - 1] { // to jump past the branch we need to jump past all its instructions +1 // to account for the JUMP instruction at the end next_branch_offset += branch.len() + 1; - pending.prefix.push(GlobInstruction::Fork( + prefix.push(GlobInstruction::Fork( next_branch_offset.try_into().context( - "program too large, cannot have more than 64K instructions", + "glob too large, cannot have more than 32K instructions", )?, )); next_branch_offset -= 1; // subtract one since we added a fork @@ -460,25 +395,23 @@ impl GlobProgram { // however we aren't starting at the beginning, we already added // `num_branches-1` `FORK` instructions` let mut end_of_alternation = - next_branch_offset + pending.branches.last().unwrap().len(); - for branch in &mut pending.branches[0..num_branches - 1] { + next_branch_offset + branches.last().unwrap().len(); + for branch in &mut branches[0..num_branches - 1] { end_of_alternation -= branch.len(); // from the end of this branch, this is how far it is to the end of the // alternation - pending.prefix.extend(branch.drain(..)); - pending.prefix.push(GlobInstruction::Jump( + prefix.extend(branch.drain(..)); + prefix.push(GlobInstruction::Jump( end_of_alternation.try_into().context( - "program too large, cannot have more than 64K instructions", + "glob too large, cannot have more than 64K instructions", )?, )); end_of_alternation -= 1; // account for the jump instruction } - end_of_alternation -= pending.branches.last().unwrap().len(); - pending - .prefix - .extend(pending.branches.last_mut().unwrap().drain(..)); + end_of_alternation -= branches.last().unwrap().len(); + + prefix.extend(branches.last_mut().unwrap().drain(..)); debug_assert!(end_of_alternation == 0); - std::mem::swap(&mut instructions, &mut pending.prefix); - left_bracket_starts.pop(); + std::mem::swap(&mut instructions, &mut prefix); } } GlobToken::End => { @@ -492,63 +425,110 @@ impl GlobProgram { } // Now we need to annotate 'terminal' globstars in order to speed up validating program - // matches The issue is that if we are executing a globstar when the program - // completes then pass or fail is dependant upon whether there are unconditioal require - // subsequent matches e.g. `foo/**` matches `foo/bar/baz.js` but foo/**/a` does not, - // because we have to match that 'a' similarly `foo/{**,bar}` matches - // `foo/bar/baz.js` Do determin this we need to chase pointers from globstars to the + // matches. The issue is that if we are executing a globstar when the program + // completes then pass or fail is dependant upon whether there are any unconditional + // required subsequent matches e.g. `foo/**` matches `foo/bar/baz.js` but foo/**/a` + // does not, because we have to match that 'a' similarly `foo/{**,bar}` matches + // `foo/bar/baz.js`. To determine this we need to chase pointers from globstars to the // end of the program, and if there is a path to match then it is a terminal globstar. + // Similarly for other matches (branches, stars) we might end a match on some control flow + // rather than following it to the end we can just precompute properties of each + // instructions if instructions.len() > u16::MAX as usize { bail!("program too large"); } - let mut storage = vec![0; instructions.len() * 2]; - let mut visited = SparseSet::from_storage(storage.as_mut_slice()); - for start in 0..(instructions.len() as u16) { - if matches!( - instructions[start as usize], - GlobInstruction::MatchGlobStar { .. } - ) { - visited.add(start + 1); - let mut thread_index = 0; - while thread_index < visited.n { - let ip = visited.get(thread_index); - match &instructions[ip as usize] { - GlobInstruction::Fork(offset) => { - visited.add(ip + 1); - visited.add(((*offset as i16) + (ip as i16)) as u16); - } - GlobInstruction::Jump(offset) => { - visited.add(ip + *offset); - } - GlobInstruction::Match => { - // There is a path to the end, update to `terminal` - instructions[start as usize] = - GlobInstruction::MatchGlobStar { terminal: true }; - break; - } - _ => { - break; - } + + let mut visited = BitSet::new(instructions.len()); + let mut has_path_to_match = BitSet::new(instructions.len()); + let mut has_path_to_prefix_match = BitSet::new(instructions.len()); + + for start in (0..instructions.len()).rev() { + visited.set(start as usize); + let (valid_prefix_end, valid_match) = match instructions[start as usize] { + GlobInstruction::MatchLiteral(byte) => (byte == b'/', false), + GlobInstruction::MatchAnyNonDelim => (false, false), + GlobInstruction::MatchGlobStar { terminal } => { + debug_assert!(!terminal); // shouldn't have been set + // a globstar is always a valid prefix end but is only a valid match if the + // subsequent instruction is. + let next = start + 1; + debug_assert!(visited.has(next), "should have already visited the target"); + let has_path_to_end = has_path_to_match.has(next); + if has_path_to_end { + instructions[start] = GlobInstruction::MatchGlobStar { terminal: true }; } - thread_index += 1; + (true, has_path_to_match.has(next)) } + GlobInstruction::MatchClass(index) => { + (range_sets[index as usize].contains(b'/'), false) + } + GlobInstruction::Jump(offset) => { + let target = start + offset as usize; + debug_assert!( + visited.has(target), + "should have already visited the target" + ); + // copy from the target + ( + has_path_to_prefix_match.has(target), + has_path_to_match.has(target), + ) + } + GlobInstruction::Fork(offset) => { + let next_instruction = (start + 1) as usize; + debug_assert!( + visited.has(next_instruction), + "should have already visited the target" + ); + let next = ( + has_path_to_prefix_match.has(next_instruction), + has_path_to_match.has(next_instruction), + ); - // Check if this has an unconditional path to Match - - visited.clear(); + if offset < 0 { + // Forks that point backwards are implementing loops. + // So we need to follow them now. + next + } else { + let fork_target = start as usize + offset as usize; + debug_assert!( + visited.has(fork_target), + "should have already visited the target" + ); + let fork = ( + has_path_to_prefix_match.has(fork_target), + has_path_to_match.has(fork_target), + ); + (next.0 || fork.0, next.1 || fork.1) + } + } + GlobInstruction::Match => { + // For prefix matches, we only want to say it matches if a a file within the + // directory could match, so if we see a Maatch instruction then we cannot match + // it with this glob + (false, true) + } + }; + if valid_prefix_end { + has_path_to_prefix_match.set(start as usize) + } + if valid_match { + has_path_to_match.set(start as usize) } } Ok(GlobProgram { - instructions, - range_sets, + instructions: instructions.into_boxed_slice(), + match_instructions: has_path_to_match, + prefix_match_instructions: has_path_to_prefix_match, + range_sets: range_sets.into_boxed_slice(), }) } pub fn matches(&self, v: &str, prefix: bool) -> bool { let len = self.instructions.len(); - // Use a single uninitialize allocation for our storage. + // Use a single uninitialized allocation for our storage. let mut storage: Vec = unsafe { std::mem::transmute(vec![std::mem::MaybeUninit::::uninit(); len * 4]) }; let (set1, set2) = storage.split_at_mut(len * 2); @@ -557,13 +537,15 @@ impl GlobProgram { // Access via references to make the swap operations cheaper. let mut cur = &mut set1; let mut next = &mut set2; + // start at the first instruction! cur.add(0); - // Process all characters in order - // It would be faster to process `bytes` but this adds complexity in a number of places - - let bytes = v.as_bytes(); - for i in 0..bytes.len() { - let byte = bytes[i]; + // Process all bytes in order + // Each iteration of the outer loop advances one byte through the input + // Each iteration of the inner loop iterates at most once for every instruction in the + // program but typically far less + // This bounds execution at O(N*M) where N is the size of the path and M is the size of the + // program + for &byte in v.as_bytes() { let mut thread_index = 0; // We need to use this looping construct since we may add elements to `cur` as we go. // `cur.n` will never be > `len` so this loop is bounded to N iterations. @@ -571,7 +553,7 @@ impl GlobProgram { while thread_index < n_threads { let ip = cur.get(thread_index); match self.instructions[ip as usize] { - GlobInstruction::MatchLiteral((m, has_next)) => { + GlobInstruction::MatchLiteral(m) => { if byte == m { // We matched, proceed to the next character next.add(ip + 1); @@ -584,19 +566,19 @@ impl GlobProgram { } GlobInstruction::MatchGlobStar { terminal } => { if terminal { - // If we find a terminal globstar, we are done! this must match + // If we find a terminal globstar, we are done! this must match whatever + // remains return true; } // If we see a `/` then we need to consider ending the globstar. if byte == b'/' { next.add(ip + 1); - } else if n_threads == 1 { } // but even so we should keep trying to match, just like a fork. next.add(ip); } - GlobInstruction::MatchClass(set_index) => { - if self.range_sets[set_index as usize].contains(byte) { + GlobInstruction::MatchClass(index) => { + if self.range_sets[index as usize].contains(byte) { next.add(ip + 1); } } @@ -631,44 +613,38 @@ impl GlobProgram { std::mem::swap(&mut cur, &mut next); } // If we get here we have matched a prefix of the instructions and run out of text. - // So in prefix mode we are done, otherwise we need to check if we hit the end of the - // isntructions + // We need to ensure that whatever instructions we landed on are valid for a prefix match if prefix { - return true; + for i in 0..cur.n { + let insn = cur.get(i); + if self.prefix_match_instructions.has(insn as usize) { + return true; + } + } } // We matched if there is some path from any current thread to the end, so we need to // process all jumps and forks - let mut i = 0; - while i < cur.n { - let ip = cur.get(i); - match self.instructions[ip as usize] { - GlobInstruction::Jump(offset) => { - // Push another thread onto the current list - cur.add(offset + ip); - } - GlobInstruction::Fork(offset) => { - cur.add(ip + 1); - cur.add((offset + (ip as i16)) as u16); - } - GlobInstruction::Match => { - return true; - } - _ => { - //ignore - } + // Consider a pattern like `a{b,c}` matching `ab` + // The program would be `Lit(a)Fork(2)Lit(b)Jump(2)Lit(c)Match` + // So when matching 'b' we need to execute a jump to find the match.` + for i in 0..cur.n { + let insn = cur.get(i); + if self.match_instructions.has(insn as usize) { + return true; } - i += 1; } false } } // Consider a more compact encoding. -#[derive(Debug, PartialEq, Eq, Clone, Copy)] +// The jump offsets force this to 4 bytes +// A variable length instruction encoding would help a lot +#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)] enum GlobInstruction { - // Matches a single literal byte, stores if there is a following literal - MatchLiteral((u8, bool)), + // Matches a single literal byte + MatchLiteral(u8), // Matches any non-`/` character MatchAnyNonDelim, // Matches **, which is any character but can only 'end' on a `/` or end of string @@ -836,251 +812,6 @@ impl<'a> Tokenizer<'a> { } } -impl GlobPart { - /// Iterates over all possible matches of this part with the provided path. - /// The least greedy match is returned first. This is usually used for - /// backtracking. The string slice returned is the remaining part or the - /// path. The boolean flag returned specifies if the matched part should - /// be considered as path-separator equivalent. - fn iter_matches<'a>( - &'a self, - path: &'a str, - previous_part_is_path_separator_equivalent: bool, - match_in_directory: bool, - ) -> GlobPartMatchesIterator<'a> { - GlobPartMatchesIterator { - path, - part: self, - match_in_directory, - previous_part_is_path_separator_equivalent, - cursor: GraphemeCursor::new(0, path.len(), true), - index: 0, - glob_iterator: None, - } - } - - fn parse(input: &str, inside_of_braces: bool) -> Result<(GlobPart, &str)> { - debug_assert!(!input.is_empty()); - let two_chars = { - let mut chars = input.chars(); - (chars.next().unwrap(), chars.next()) - }; - match two_chars { - ('/', _) => Ok((GlobPart::PathSeparator, &input[1..])), - ('*', Some('*')) => Ok((GlobPart::AnyDirectories, &input[2..])), - ('*', _) => Ok((GlobPart::AnyFile, &input[1..])), - ('?', _) => Ok((GlobPart::AnyFileChar, &input[1..])), - ('[', Some('[')) => todo!("glob char classes are not implemented yet"), - ('[', _) => todo!("glob char sequences are not implemented yet"), - ('{', Some(_)) => { - let mut current = &input[1..]; - let mut alternatives = Vec::new(); - let mut expression = Vec::new(); - - loop { - let (part, remainder) = GlobPart::parse(current, true)?; - expression.push(part); - current = remainder; - match current.chars().next() { - Some(',') => { - alternatives.push(Glob { - expression: take(&mut expression), - }); - current = ¤t[1..]; - } - Some('}') => { - alternatives.push(Glob { - expression: take(&mut expression), - }); - current = ¤t[1..]; - break; - } - None => bail!("Unterminated glob braces"), - _ => { - // next part of the glob - } - } - } - - Ok((GlobPart::Alternatives(alternatives), current)) - } - ('{', None) => { - bail!("Unterminated glob braces") - } - _ => { - let mut is_escaped = false; - let mut literal = String::new(); - - // TODO(lukes): there shouldn't be a need to traverse graphemes here, just literal - // matches since all of our delimiters are ascii characters - let mut cursor = GraphemeCursor::new(0, input.len(), true); - - let mut start = cursor.cur_cursor(); - let mut end_cursor = cursor - .next_boundary(input, 0) - .map_err(|e| anyhow!("{:?}", e))?; - - while let Some(end) = end_cursor { - let c = &input[start..end]; - if is_escaped { - is_escaped = false; - } else if c == "\\" { - is_escaped = true; - } else if c == "/" - || c == "*" - || c == "?" - || c == "[" - || c == "{" - || (inside_of_braces && (c == "," || c == "}")) - { - break; - } - literal.push_str(c); - - start = cursor.cur_cursor(); - end_cursor = cursor - .next_boundary(input, end) - .map_err(|e| anyhow!("{:?}", e))?; - } - - Ok((GlobPart::File(literal), &input[start..])) - } - } - } -} - -struct GlobPartMatchesIterator<'a> { - path: &'a str, - part: &'a GlobPart, - match_in_directory: bool, - previous_part_is_path_separator_equivalent: bool, - cursor: GraphemeCursor, - index: usize, - glob_iterator: Option>>, -} - -impl<'a> Iterator for GlobPartMatchesIterator<'a> { - type Item = (&'a str, bool); - - fn next(&mut self) -> Option { - match self.part { - GlobPart::AnyDirectories => { - if self.cursor.cur_cursor() == 0 { - let Ok(Some(_)) = self.cursor.next_boundary(self.path, 0) else { - return None; - }; - return Some((self.path, true)); - } - - if self.cursor.cur_cursor() == self.path.len() { - return None; - } - - loop { - let start = self.cursor.cur_cursor(); - // next_boundary does not set cursor offset to the end of the string - // if there is no next boundary - manually set cursor to the end - let end = match self.cursor.next_boundary(self.path, 0) { - Ok(end) => { - if let Some(end) = end { - end - } else { - self.cursor.set_cursor(self.path.len()); - self.cursor.cur_cursor() - } - } - _ => return None, - }; - - if &self.path[start..end] == "/" { - return Some((&self.path[end..], true)); - } else if start == end { - return Some((&self.path[start..], false)); - } - } - } - GlobPart::AnyFile => { - let Ok(Some(c)) = self.cursor.next_boundary(self.path, 0) else { - return None; - }; - - let idx = self.path[0..c].len(); - - // TODO verify if `*` does match zero chars? - if let Some(slice) = self.path.get(0..c) { - if slice.ends_with('/') { - None - } else { - Some(( - &self.path[c..], - self.previous_part_is_path_separator_equivalent && idx == 1, - )) - } - } else { - None - } - } - GlobPart::AnyFileChar => todo!(), - GlobPart::PathSeparator => { - if self.cursor.cur_cursor() == 0 { - let Ok(Some(b)) = self.cursor.next_boundary(self.path, 0) else { - return None; - }; - if self.path.starts_with('/') { - Some((&self.path[b..], true)) - } else if self.previous_part_is_path_separator_equivalent { - Some((self.path, true)) - } else { - None - } - } else { - None - } - } - GlobPart::FileChar(chars) => { - let start = self.cursor.cur_cursor(); - let Ok(Some(end)) = self.cursor.next_boundary(self.path, 0) else { - return None; - }; - let mut chars_in_path = self.path[start..end].chars(); - let c = chars_in_path.next()?; - if chars_in_path.next().is_some() { - return None; - } - chars.contains(&c).then(|| (&self.path[end..], false)) - } - GlobPart::File(name) => { - if self.cursor.cur_cursor() == 0 && self.path.starts_with(name) { - let Ok(Some(_)) = self.cursor.next_boundary(self.path, 0) else { - return None; - }; - Some((&self.path[name.len()..], false)) - } else { - None - } - } - GlobPart::Alternatives(alternatives) => loop { - if let Some(glob_iterator) = &mut self.glob_iterator { - if let Some((path, is_path_separator_equivalent)) = glob_iterator.next() { - return Some((path, is_path_separator_equivalent)); - } else { - self.index += 1; - self.glob_iterator = None; - } - } else if let Some(alternative) = alternatives.get(self.index) { - self.glob_iterator = Some(Box::new(alternative.iter_matches( - self.path, - self.previous_part_is_path_separator_equivalent, - self.match_in_directory, - ))); - } else { - return None; - } - }, - } - } -} - impl TryFrom<&str> for Glob { type Error = anyhow::Error; @@ -1098,14 +829,20 @@ impl Glob { #[turbo_tasks::function] pub async fn alternatives(globs: Vec>) -> Result> { - if globs.len() == 1 { - return Ok(globs.into_iter().next().unwrap()); + match globs.len() { + 0 => Ok(Self::cell(Glob::parse("")?)), + 1 => Ok(globs.into_iter().next().unwrap()), + _ => { + todo!() + } } - Ok(Self::cell(Glob { - expression: vec![GlobPart::Alternatives( - globs.into_iter().map(|g| g.owned()).try_join().await?, - )], - })) + // if globs.len() == 1 { + // } + // Ok(Self::cell(Glob { + // expression: vec![GlobPart::Alternatives( + // globs.into_iter().map(|g| g.owned()).try_join().await?, + // )], + // })) } } @@ -1113,8 +850,8 @@ impl Glob { mod tests { use rstest::*; - use super::{Glob, GlobToken, Tokenizer}; - use crate::glob::GlobProgram; + use super::{Glob, GlobToken}; + use crate::glob::Tokenizer; #[rstest] #[case::file("file.js", "file.js")] @@ -1193,12 +930,6 @@ mod tests { println!("{glob:?} compiled to {parsed:?} matching {path}"); assert!(parsed.execute(path)); - - let parsed2 = GlobProgram::compile(glob).unwrap(); - println!("{glob:?} compiled to\n{parsed2:#?}\nmatching {path}"); - let prefix = path.ends_with('/'); - - assert!(parsed2.matches(path, prefix)); } #[rstest] @@ -1213,12 +944,6 @@ mod tests { println!("{glob:?} compiled to {parsed:?} matching {path}"); assert!(!parsed.execute(path)); - - let parsed2 = GlobProgram::compile(glob).unwrap(); - println!("{glob:?} compiled to {parsed2:?} matching {path}"); - let prefix = path.ends_with('/'); - - assert!(!parsed2.matches(path, prefix)); } #[test] diff --git a/turbopack/crates/turbo-tasks-fs/src/read_glob.rs b/turbopack/crates/turbo-tasks-fs/src/read_glob.rs index 4168c64bee11e..9ae6a676ba40e 100644 --- a/turbopack/crates/turbo-tasks-fs/src/read_glob.rs +++ b/turbopack/crates/turbo-tasks-fs/src/read_glob.rs @@ -321,7 +321,7 @@ pub mod tests { path, Vec::new(), )); - let read_dir = fs + let read_dir = &*fs .root() .read_glob(Glob::new("*.js".into()), false) .await From cf436e2534cd908c27088463fb1253cf633094df Mon Sep 17 00:00:00 2001 From: Luke Sandberg Date: Thu, 8 May 2025 17:15:49 -0700 Subject: [PATCH 06/17] drop support for alternatives We could support this but it is complex and all the callers can easily just compose a large glob themselves --- turbopack/crates/turbo-tasks-fs/src/glob.rs | 333 +++++++++--------- .../src/chunk/placeable.rs | 69 ++-- turbopack/crates/turbopack/src/lib.rs | 11 +- 3 files changed, 216 insertions(+), 197 deletions(-) diff --git a/turbopack/crates/turbo-tasks-fs/src/glob.rs b/turbopack/crates/turbo-tasks-fs/src/glob.rs index 807757eb41d27..429fe1023092e 100644 --- a/turbopack/crates/turbo-tasks-fs/src/glob.rs +++ b/turbopack/crates/turbo-tasks-fs/src/glob.rs @@ -1,33 +1,9 @@ -use std::fmt::Display; +use std::{cmp::Ordering, fmt::Display}; use anyhow::{anyhow, bail, Context, Result}; use serde::{Deserialize, Serialize}; use turbo_rcstr::RcStr; -use turbo_tasks::{trace::TraceRawVcs, NonLocalValue, TryJoinIterExt, Vc}; - -#[derive(PartialEq, Eq, Debug, Clone, TraceRawVcs, Serialize, Deserialize, NonLocalValue)] -enum GlobPart { - /// `/**/`: Matches any path of directories - AnyDirectories, - - /// `*`: Matches any filename (no path separator) - AnyFile, - - /// `?`: Matches a single filename character (no path separator) - AnyFileChar, - - /// `/`: Matches the path separator - PathSeparator, - - /// `[abc]`: Matches any char of the list - FileChar(Vec), - - /// `abc`: Matches literal filename - File(String), - - /// `{a,b,c}`: Matches any of the globs in the list - Alternatives(Vec), -} +use turbo_tasks::Vc; // Examples: // - file.js = File(file.js) @@ -73,6 +49,29 @@ impl Glob { } } +impl TryFrom<&str> for Glob { + type Error = anyhow::Error; + + fn try_from(value: &str) -> Result { + Glob::parse(value) + } +} +impl TryFrom for Glob { + type Error = anyhow::Error; + + fn try_from(value: RcStr) -> Result { + Glob::parse(value.as_str()) + } +} + +#[turbo_tasks::value_impl] +impl Glob { + #[turbo_tasks::function] + pub fn new(glob: RcStr) -> Result> { + Ok(Self::cell(Glob::try_from(glob)?)) + } +} + #[derive(Debug, Eq, PartialEq, Copy, Clone, Serialize, Deserialize, Hash)] struct ByteRange { low: u8, @@ -104,31 +103,14 @@ impl Ord for ByteRange { } #[derive(Debug, Eq, Clone, PartialEq, Hash, Serialize, Deserialize)] -#[repr(transparent)] struct RangeSet { - ranges: Vec, + ranges: Box<[ByteRange]>, } impl RangeSet { - fn new() -> RangeSet { - Self { ranges: Vec::new() } - } - - fn contains(&self, b: u8) -> bool { - self.ranges.binary_search(&ByteRange::singleton(b)).is_ok() - } - - fn add_singleton(&mut self, b: u8) { - self.ranges.push(ByteRange::singleton(b)); - } - fn add_range(&mut self, low: u8, high: u8) { - self.ranges.push(ByteRange::range(low, high)); - } - - // Sorts and merges redundant entries. - fn finalize(&mut self) { - self.ranges.sort(); - self.ranges.dedup_by(|a, b| { + fn new(mut ranges: Vec) -> RangeSet { + ranges.sort(); + ranges.dedup_by(|a, b| { if b.high >= a.low { b.high = a.high; true @@ -136,7 +118,25 @@ impl RangeSet { false } }); - self.ranges.shrink_to_fit(); + Self { + ranges: ranges.into_boxed_slice(), + } + } + + fn contains(&self, b: u8) -> bool { + self.ranges + .binary_search_by(move |r| { + if r.low <= b { + if r.high >= b { + Ordering::Equal + } else { + Ordering::Less + } + } else { + Ordering::Greater + } + }) + .is_ok() } } @@ -233,7 +233,7 @@ pub struct GlobProgram { } impl GlobProgram { - pub fn compile(pattern: &str) -> Result { + fn compile(pattern: &str) -> Result { let mut tok = Tokenizer::new(pattern); let mut instructions = Vec::new(); let mut range_sets = Vec::new(); @@ -250,6 +250,9 @@ impl GlobProgram { loop { let t = tok.next_token(); match t { + GlobToken::Caret | GlobToken::ExclamationPoint => { + panic!("these cannot appear at the beginning of a pattern") + } GlobToken::Literal(lit) => { instructions.push(GlobInstruction::MatchLiteral(lit)); } @@ -273,14 +276,27 @@ impl GlobProgram { instructions.push(GlobInstruction::MatchAnyNonDelim); } GlobToken::LSquareBracket => { - let mut set = RangeSet::new(); - // with in a square brachet we expect a mix of literals and hyphens followed by + let mut set = Vec::new(); + // with in a square bracket we expect a mix of literals and hyphens followed by // a RSuareBracket let mut prev_literal: Option = None; let mut partial_range: Option = None; + let mut negated = false; loop { let t = tok.next_token(); match t { + GlobToken::Caret | GlobToken::ExclamationPoint => { + if !set.is_empty() + || prev_literal.is_some() + || partial_range.is_some() + { + bail!( + "negation tokens can only appear at the beginning of \ + character classes" + ); + } + negated = !negated; + } GlobToken::Literal(lit) => { if lit > 127 { // TODO(lukesandberg): These are supportable by expanding into @@ -289,17 +305,12 @@ impl GlobProgram { // for now the feature is omitted. bail!("Unsupported non-ascii character in set"); } - debug_assert!( - prev_literal.is_none(), - "impossible, tokenizer failed to merge literals or we failed \ - to handle a hyphen correctly" - ); if let Some(start) = partial_range { - set.add_range(start, lit); + set.push(ByteRange::range(start, lit)); partial_range = None; } else { if let Some(prev) = prev_literal { - set.add_singleton(prev); + set.push(ByteRange::singleton(prev)); } prev_literal = Some(lit); } @@ -315,19 +326,24 @@ impl GlobProgram { GlobToken::RSquareBracket => { if let Some(lit) = prev_literal { - set.add_singleton(lit); + set.push(ByteRange::singleton(lit)); } if partial_range.is_some() { - bail!("unexpectedmhyphen at the end of a character class") + bail!("unexpected hyphen at the end of a character class") } - set.finalize(); - let index = range_sets.len(); + let set = RangeSet::new(set); + let index: u8 = range_sets + .len() + .try_into() + .context("Cannot have more than 255 range sets")?; range_sets.push(set); - instructions.push(GlobInstruction::MatchClass( - index - .try_into() - .context("Cannot have more than 255 range sets")?, - )); + // It is more efficient to use a separate instruction than pad out + // the range class + if negated { + instructions.push(GlobInstruction::NegativeMatchClass(index)) + } else { + instructions.push(GlobInstruction::MatchClass(index)); + } break; } _ => { @@ -366,51 +382,7 @@ impl GlobProgram { let mut branches = left_bracket_starts.pop().unwrap(); let mut prefix = &mut branches.prefix; let branches = &mut branches.branches; - let num_branches = branches.len(); - if num_branches <= 1 { - bail!( - "Cannot have an alternation with less than 2 members, remove the \ - brackets?" - ); - } - // TODO(lukesandberg): we could eliminate common suffixes and prefixes - - // The length of each branch plus one for the jump to the end. - // For each alternative we have a `Fork` instruction so account for those - let mut next_branch_offset = num_branches - 1; - for branch in &branches[0..num_branches - 1] { - // to jump past the branch we need to jump past all its instructions +1 - // to account for the JUMP instruction at the end - next_branch_offset += branch.len() + 1; - prefix.push(GlobInstruction::Fork( - next_branch_offset.try_into().context( - "glob too large, cannot have more than 32K instructions", - )?, - )); - next_branch_offset -= 1; // subtract one since we added a fork - // instruction. - } - // NOTE: The last branch doesn't get a JUMP instruction, so no `+1` - // this is the relative offet to the end, from the very beginning - // however we aren't starting at the beginning, we already added - // `num_branches-1` `FORK` instructions` - let mut end_of_alternation = - next_branch_offset + branches.last().unwrap().len(); - for branch in &mut branches[0..num_branches - 1] { - end_of_alternation -= branch.len(); // from the end of this branch, this is how far it is to the end of the - // alternation - prefix.extend(branch.drain(..)); - prefix.push(GlobInstruction::Jump( - end_of_alternation.try_into().context( - "glob too large, cannot have more than 64K instructions", - )?, - )); - end_of_alternation -= 1; // account for the jump instruction - } - end_of_alternation -= branches.last().unwrap().len(); - - prefix.extend(branches.last_mut().unwrap().drain(..)); - debug_assert!(end_of_alternation == 0); + build_alternatives(prefix, branches)?; std::mem::swap(&mut instructions, &mut prefix); } } @@ -463,6 +435,9 @@ impl GlobProgram { GlobInstruction::MatchClass(index) => { (range_sets[index as usize].contains(b'/'), false) } + GlobInstruction::NegativeMatchClass(index) => { + (!range_sets[index as usize].contains(b'/'), false) + } GlobInstruction::Jump(offset) => { let target = start + offset as usize; debug_assert!( @@ -488,7 +463,7 @@ impl GlobProgram { if offset < 0 { // Forks that point backwards are implementing loops. - // So we need to follow them now. + // So we don't need to follow them now next } else { let fork_target = start as usize + offset as usize; @@ -526,7 +501,7 @@ impl GlobProgram { }) } - pub fn matches(&self, v: &str, prefix: bool) -> bool { + fn matches(&self, v: &str, prefix: bool) -> bool { let len = self.instructions.len(); // Use a single uninitialized allocation for our storage. let mut storage: Vec = @@ -582,6 +557,11 @@ impl GlobProgram { next.add(ip + 1); } } + GlobInstruction::NegativeMatchClass(index) => { + if !self.range_sets[index as usize].contains(byte) { + next.add(ip + 1); + } + } GlobInstruction::Jump(offset) => { // Push another thread onto the current list if cur.add(offset + ip) { @@ -638,6 +618,45 @@ impl GlobProgram { } } +fn build_alternatives( + prefix: &mut Vec, + branches: &mut Vec>, +) -> Result<(), anyhow::Error> { + let num_branches = branches.len(); + if num_branches <= 1 { + bail!("Cannot have an alternation with less than 2 members, remove the brackets?"); + } + let mut next_branch_offset = num_branches - 1; + for branch in &branches[0..num_branches - 1] { + // to jump past the branch we need to jump past all its instructions +1 + // to account for the JUMP instruction at the end + next_branch_offset += branch.len() + 1; + prefix.push(GlobInstruction::Fork( + next_branch_offset + .try_into() + .context("glob too large, cannot have more than 32K instructions")?, + )); + next_branch_offset -= 1; // subtract one since we added a fork + // instruction. + } + let mut end_of_alternation = next_branch_offset + branches.last().unwrap().len(); + for branch in &mut branches[0..num_branches - 1] { + end_of_alternation -= branch.len(); // from the end of this branch, this is how far it is to the end of the + // alternation + prefix.extend(branch.drain(..)); + prefix.push(GlobInstruction::Jump( + end_of_alternation + .try_into() + .context("glob too large, cannot have more than 64K instructions")?, + )); + end_of_alternation -= 1; // account for the jump instruction + } + end_of_alternation -= branches.last().unwrap().len(); + prefix.extend(branches.last_mut().unwrap().drain(..)); + debug_assert!(end_of_alternation == 0); + Ok(()) +} + // Consider a more compact encoding. // The jump offsets force this to 4 bytes // A variable length instruction encoding would help a lot @@ -652,6 +671,9 @@ enum GlobInstruction { // Matches any character in the set // The value is an index into the ranges MatchClass(u8), + // Matches any character not in the set + // The value is an index into the ranges + NegativeMatchClass(u8), // Unconditional jump forward. This would occur at the end of an alternate to jump past the // other alternates. Jump(u16), @@ -681,10 +703,15 @@ enum GlobToken { LBracket, // a `}` token RBracket, - // a `,` token + // a `,` token, this is contextual, only present within a `{...}` section Comma, - // a `-` token + // a `-` token, this is contextual, only present within a `[...]` section Hyphen, + // a ! token, this is contextual, only allowed at the beginning of a `[` or at the very + // beginning of the pattern + ExclamationPoint, + // a '^' token. Same rules as ! + Caret, End, } impl Display for GlobToken { @@ -705,6 +732,8 @@ impl Display for GlobToken { GlobToken::Hyphen => "-", GlobToken::End => "", GlobToken::Literal(_) => panic!("impossible"), + GlobToken::ExclamationPoint => "!", + GlobToken::Caret => "^", }) } } @@ -736,8 +765,15 @@ impl<'a> Tokenizer<'a> { match c { b'*' if self.square_bracket_count == 0 => match self.input.get(self.pos) { Some(b) if *b == b'*' => { - // TODO: we should validate that the previous character was a `/` as - // well + // This is a globstar + + if self.pos > 2 && self.input[self.pos - 2] != b'/' { + self.err = Some(anyhow!( + "a glob star must be a full path segment, e.g. `/**/` @{}", + self.pos + )); + return GlobToken::End; + } self.pos += 1; match self.input.get(self.pos) { None => {} @@ -746,7 +782,8 @@ impl<'a> Tokenizer<'a> { } // ok if we are at the end or followed by a `/` _ => { self.err = Some(anyhow!( - "a glob star must be a full path segment, e.g. `/**/`" + "a glob star must be a full path segment, e.g. `/**/`@{}", + self.pos )); return GlobToken::End; } @@ -763,10 +800,8 @@ impl<'a> Tokenizer<'a> { } b']' => { if self.square_bracket_count == 0 { - self.err = Some(anyhow!( - "mismatched square brackets at {} in glob", - self.pos - )); + self.err = + Some(anyhow!("mismatched square brackets in glob @{}", self.pos)); return GlobToken::End; } self.square_bracket_count -= 1; @@ -779,7 +814,7 @@ impl<'a> Tokenizer<'a> { } b'}' if self.square_bracket_count == 0 => { if self.bracket_count == 0 { - self.err = Some(anyhow!("mismatched brackets at {} in glob", self.pos)); + self.err = Some(anyhow!("mismatched brackets @{}", self.pos)); return GlobToken::End; } @@ -790,6 +825,9 @@ impl<'a> Tokenizer<'a> { b'-' if self.square_bracket_count > 0 => GlobToken::Hyphen, // This is only meaningful inside of an alternation (aka brackets) b',' if self.bracket_count > 0 => GlobToken::Comma, + // only valid inside of a character class + b'!' if self.square_bracket_count > 0 => GlobToken::ExclamationPoint, + b'^' if self.square_bracket_count > 0 => GlobToken::Caret, cur => { if *cur == b'\\' { match self.input.get(self.pos) { @@ -812,40 +850,6 @@ impl<'a> Tokenizer<'a> { } } -impl TryFrom<&str> for Glob { - type Error = anyhow::Error; - - fn try_from(value: &str) -> Result { - Glob::parse(value) - } -} - -#[turbo_tasks::value_impl] -impl Glob { - #[turbo_tasks::function] - pub fn new(glob: RcStr) -> Result> { - Ok(Self::cell(Glob::try_from(glob.as_str())?)) - } - - #[turbo_tasks::function] - pub async fn alternatives(globs: Vec>) -> Result> { - match globs.len() { - 0 => Ok(Self::cell(Glob::parse("")?)), - 1 => Ok(globs.into_iter().next().unwrap()), - _ => { - todo!() - } - } - // if globs.len() == 1 { - // } - // Ok(Self::cell(Glob { - // expression: vec![GlobPart::Alternatives( - // globs.into_iter().map(|g| g.owned()).try_join().await?, - // )], - // })) - } -} - #[cfg(test)] mod tests { use rstest::*; @@ -862,6 +866,7 @@ mod tests { #[case::dir_and_file_braces("dir/file.{ts,js}", "dir/file.js")] #[case::dir_and_file_dir_braces("{dir,other}/file.{ts,js}", "dir/file.js")] #[case::star("*.js", "file.js")] + #[case::star_empty("*.js", ".js")] // can match nothing #[case::dir_star("dir/*.js", "dir/file.js")] #[case::dir_star_partial("dir/*.js", "dir/")] #[case::globstar("**/*.js", "file.js")] @@ -946,6 +951,18 @@ mod tests { assert!(!parsed.execute(path)); } + #[test] + fn glob_character_classes() { + let parsed = Glob::parse("[a-zA-Z0-9_\\-]").unwrap(); + + assert!(parsed.execute("a")); + assert!(!parsed.execute("$")); + let parsed = Glob::parse("[!a-zA-Z0-9_\\-]").unwrap(); + + assert!(!parsed.execute("a")); + assert!(parsed.execute("$")); + } + #[test] fn test_tokenizer() { let mut tok = Tokenizer::new("foo/bar[a-z]/?/**"); diff --git a/turbopack/crates/turbopack-ecmascript/src/chunk/placeable.rs b/turbopack/crates/turbopack-ecmascript/src/chunk/placeable.rs index 65173eb921fc6..b9cd16c374769 100644 --- a/turbopack/crates/turbopack-ecmascript/src/chunk/placeable.rs +++ b/turbopack/crates/turbopack-ecmascript/src/chunk/placeable.rs @@ -49,13 +49,36 @@ async fn side_effects_from_package_json( .iter() .filter_map(|side_effect| { if let Some(side_effect) = side_effect.as_str() { - if side_effect.contains('/') { - Some(Glob::new( - side_effect.strip_prefix("./").unwrap_or(side_effect).into(), - )) + let glob = if side_effect.contains('/') { + side_effect + .strip_prefix("./") + .unwrap_or(side_effect) + .to_string() } else { - Some(Glob::new(format!("**/{side_effect}").into())) + format!("**/{side_effect}") + }; + match Glob::parse(glob.as_str()) { + Ok(_) => {} + Err(err) => { + SideEffectsInPackageJsonIssue { + path: package_json, + description: Some( + StyledString::Text( + format!( + "Invalid glob '{}' in sideEffects: {}", + glob, + PrettyPrintError(&err) + ) + .into(), + ) + .resolved_cell(), + ), + } + .resolved_cell() + .emit(); + } } + Some(glob) } else { SideEffectsInPackageJsonIssue { path: package_json, @@ -76,34 +99,14 @@ async fn side_effects_from_package_json( None } }) - .map(|glob| async move { - match glob.resolve().await { - Ok(glob) => Ok(Some(glob)), - Err(err) => { - SideEffectsInPackageJsonIssue { - path: package_json, - description: Some( - StyledString::Text( - format!( - "Invalid glob in sideEffects: {}", - PrettyPrintError(&err) - ) - .into(), - ) - .resolved_cell(), - ), - } - .resolved_cell() - .emit(); - Ok(None) - } - } - }) - .try_flat_join() - .await?; - return Ok( - SideEffectsValue::Glob(Glob::alternatives(globs).to_resolved().await?).cell(), - ); + .collect::>() + .join(","); + return Ok(SideEffectsValue::Glob( + Glob::new(format!("{{{}}}", globs).into()) + .to_resolved() + .await?, + ) + .cell()); } else { SideEffectsInPackageJsonIssue { path: package_json, diff --git a/turbopack/crates/turbopack/src/lib.rs b/turbopack/crates/turbopack/src/lib.rs index 22d3095a1820e..10ff9d16bcab2 100644 --- a/turbopack/crates/turbopack/src/lib.rs +++ b/turbopack/crates/turbopack/src/lib.rs @@ -927,13 +927,12 @@ impl AssetContext for ModuleAssetContext { async fn side_effect_free_packages(&self) -> Result> { let pkgs = &*self.module_options_context.await?.side_effect_free_packages; - let mut globs = Vec::with_capacity(pkgs.len()); + let mut glob = String::new(); + glob.push_str("**/node_modules/{"); + glob.push_str(pkgs.join(",").as_str()); + glob.push_str("}/**"); - for pkg in pkgs { - globs.push(Glob::new(format!("**/node_modules/{{{}}}/**", pkg).into())); - } - - Ok(Glob::alternatives(globs)) + Ok(Glob::new(glob.into())) } } From df9053c4967b695afdf46b90e7f6a96d7e4210fb Mon Sep 17 00:00:00 2001 From: Luke Sandberg Date: Thu, 8 May 2025 17:19:23 -0700 Subject: [PATCH 07/17] update lock --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e335e5288fb35..4027a813fe9b4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7847,7 +7847,7 @@ version = "12.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0d11c8e71901401b9aae2ece4946eeb7674b14b8301a53768afbbeeb0e48b599" dependencies = [ - "arrayvec 0.7.4", + "arrayvec 0.7.6", "bitflags 2.9.0", "either", "new_debug_unreachable", @@ -7916,7 +7916,7 @@ version = "17.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bca0ad5b72d8b440e701d47f544a728543414f6f165c6c61a899a76d3c7fdf9d" dependencies = [ - "arrayvec 0.7.4", + "arrayvec 0.7.6", "bitflags 2.9.0", "indexmap 2.7.1", "num-bigint", From 0b0cc53ccd5e992e2394dae8d05771d0e8c11a7b Mon Sep 17 00:00:00 2001 From: Luke Sandberg Date: Thu, 8 May 2025 17:41:31 -0700 Subject: [PATCH 08/17] clippy! --- turbopack/crates/turbo-tasks-fs/src/glob.rs | 162 ++++++++++++-------- 1 file changed, 100 insertions(+), 62 deletions(-) diff --git a/turbopack/crates/turbo-tasks-fs/src/glob.rs b/turbopack/crates/turbo-tasks-fs/src/glob.rs index 429fe1023092e..c5f3654e21fb0 100644 --- a/turbopack/crates/turbo-tasks-fs/src/glob.rs +++ b/turbopack/crates/turbo-tasks-fs/src/glob.rs @@ -380,10 +380,49 @@ impl GlobProgram { // If we hit a `}` then we are done so compute the jumps and pop the prefix. if t == GlobToken::RBracket { let mut branches = left_bracket_starts.pop().unwrap(); - let mut prefix = &mut branches.prefix; + let prefix = &mut branches.prefix; let branches = &mut branches.branches; - build_alternatives(prefix, branches)?; - std::mem::swap(&mut instructions, &mut prefix); + + let num_branches = branches.len(); + if num_branches <= 1 { + bail!( + "Cannot have an alternation with less than 2 members, remove the \ + brackets?" + ); + } + let mut next_branch_offset = num_branches - 1; + for branch in &branches[0..num_branches - 1] { + // to jump past the branch we need to jump past all its instructions + // +1 to account for the JUMP + // instruction at the end + next_branch_offset += branch.len() + 1; + prefix.push(GlobInstruction::Fork( + next_branch_offset.try_into().context( + "glob too large, cannot have more than 32K instructions", + )?, + )); + next_branch_offset -= 1; // subtract one since we added a fork + // instruction. + } + let mut end_of_alternation = + next_branch_offset + branches.last().unwrap().len(); + for branch in &mut branches[0..num_branches - 1] { + end_of_alternation -= branch.len(); // from the end of this branch, this is how far it is to the end of + // the + // alternation + prefix.append(branch); + prefix.push(GlobInstruction::Jump( + end_of_alternation.try_into().context( + "glob too large, cannot have more than 64K instructions", + )?, + )); + end_of_alternation -= 1; // account for the jump instruction + } + end_of_alternation -= branches.last().unwrap().len(); + prefix.append(branches.last_mut().unwrap()); + debug_assert!(end_of_alternation == 0); + + std::mem::swap(&mut instructions, prefix); } } GlobToken::End => { @@ -416,8 +455,8 @@ impl GlobProgram { let mut has_path_to_prefix_match = BitSet::new(instructions.len()); for start in (0..instructions.len()).rev() { - visited.set(start as usize); - let (valid_prefix_end, valid_match) = match instructions[start as usize] { + visited.set(start); + let (valid_prefix_end, valid_match) = match instructions[start] { GlobInstruction::MatchLiteral(byte) => (byte == b'/', false), GlobInstruction::MatchAnyNonDelim => (false, false), GlobInstruction::MatchGlobStar { terminal } => { @@ -451,7 +490,7 @@ impl GlobProgram { ) } GlobInstruction::Fork(offset) => { - let next_instruction = (start + 1) as usize; + let next_instruction = start + 1; debug_assert!( visited.has(next_instruction), "should have already visited the target" @@ -466,7 +505,7 @@ impl GlobProgram { // So we don't need to follow them now next } else { - let fork_target = start as usize + offset as usize; + let fork_target = start + offset as usize; debug_assert!( visited.has(fork_target), "should have already visited the target" @@ -486,10 +525,10 @@ impl GlobProgram { } }; if valid_prefix_end { - has_path_to_prefix_match.set(start as usize) + has_path_to_prefix_match.set(start) } if valid_match { - has_path_to_match.set(start as usize) + has_path_to_match.set(start) } } @@ -520,22 +559,34 @@ impl GlobProgram { // program but typically far less // This bounds execution at O(N*M) where N is the size of the path and M is the size of the // program - for &byte in v.as_bytes() { + let mut n_threads = 1; + let mut ip = 0; + 'outer: for &byte in v.as_bytes() { let mut thread_index = 0; - // We need to use this looping construct since we may add elements to `cur` as we go. - // `cur.n` will never be > `len` so this loop is bounded to N iterations. - let mut n_threads = cur.n; - while thread_index < n_threads { - let ip = cur.get(thread_index); + // We manage the loop manually at the bottom to make it easier to skip it when hitting + // some fast paths + loop { match self.instructions[ip as usize] { GlobInstruction::MatchLiteral(m) => { if byte == m { + if n_threads == 1 { + cur.clear(); + ip += 1; + cur.add(ip); + continue 'outer; + } // We matched, proceed to the next character next.add(ip + 1); } } GlobInstruction::MatchAnyNonDelim => { if byte != b'/' { + if n_threads == 1 { + cur.clear(); + ip += 1; + cur.add(ip); + continue 'outer; + } next.add(ip + 1); } } @@ -548,17 +599,34 @@ impl GlobProgram { // If we see a `/` then we need to consider ending the globstar. if byte == b'/' { next.add(ip + 1); + // but even so we should keep trying to match, just like a fork. + next.add(ip); + } else { + if n_threads == 1 { + continue 'outer; + } + next.add(ip); } - // but even so we should keep trying to match, just like a fork. - next.add(ip); } GlobInstruction::MatchClass(index) => { if self.range_sets[index as usize].contains(byte) { + if n_threads == 1 { + cur.clear(); + ip += 1; + cur.add(ip); + continue 'outer; + } next.add(ip + 1); } } GlobInstruction::NegativeMatchClass(index) => { if !self.range_sets[index as usize].contains(byte) { + if n_threads == 1 { + cur.clear(); + ip += 1; + cur.add(ip); + continue 'outer; + } next.add(ip + 1); } } @@ -569,10 +637,11 @@ impl GlobProgram { } } GlobInstruction::Fork(offset) => { - let added1 = cur.add(ip + 1); - let added2 = cur.add((offset + (ip as i16)) as u16); - if added1 || added2 { - n_threads = cur.n; + if cur.add(ip + 1) { + n_threads += 1; + } + if cur.add((offset + (ip as i16)) as u16) { + n_threads += 1; } } GlobInstruction::Match => { @@ -580,13 +649,21 @@ impl GlobProgram { // so this thread dies } } + // Do this at the bottom of the loop thread_index += 1; + if thread_index < n_threads { + ip = cur.get(thread_index); + } else { + break; + } } - if next.n == 0 { + n_threads = next.n; + if n_threads == 0 { // This means that all threads exited early. This isn't needed for correctness, // but there is no point iterating the rest of the characters. return false; } + ip = next.get(0); // We have some progress! clear current and swap the two lists to advance to the next // character. cur.clear(); @@ -618,45 +695,6 @@ impl GlobProgram { } } -fn build_alternatives( - prefix: &mut Vec, - branches: &mut Vec>, -) -> Result<(), anyhow::Error> { - let num_branches = branches.len(); - if num_branches <= 1 { - bail!("Cannot have an alternation with less than 2 members, remove the brackets?"); - } - let mut next_branch_offset = num_branches - 1; - for branch in &branches[0..num_branches - 1] { - // to jump past the branch we need to jump past all its instructions +1 - // to account for the JUMP instruction at the end - next_branch_offset += branch.len() + 1; - prefix.push(GlobInstruction::Fork( - next_branch_offset - .try_into() - .context("glob too large, cannot have more than 32K instructions")?, - )); - next_branch_offset -= 1; // subtract one since we added a fork - // instruction. - } - let mut end_of_alternation = next_branch_offset + branches.last().unwrap().len(); - for branch in &mut branches[0..num_branches - 1] { - end_of_alternation -= branch.len(); // from the end of this branch, this is how far it is to the end of the - // alternation - prefix.extend(branch.drain(..)); - prefix.push(GlobInstruction::Jump( - end_of_alternation - .try_into() - .context("glob too large, cannot have more than 64K instructions")?, - )); - end_of_alternation -= 1; // account for the jump instruction - } - end_of_alternation -= branches.last().unwrap().len(); - prefix.extend(branches.last_mut().unwrap().drain(..)); - debug_assert!(end_of_alternation == 0); - Ok(()) -} - // Consider a more compact encoding. // The jump offsets force this to 4 bytes // A variable length instruction encoding would help a lot @@ -966,7 +1004,7 @@ mod tests { #[test] fn test_tokenizer() { let mut tok = Tokenizer::new("foo/bar[a-z]/?/**"); - let prefix: Vec = "foo/bar".bytes().map(|c| GlobToken::Literal(c)).collect(); + let prefix: Vec = "foo/bar".bytes().map(GlobToken::Literal).collect(); for t in prefix { assert_eq!(t, tok.next_token()); } From 4c11f32a455c3d63af69430bf17ab9a194c79547 Mon Sep 17 00:00:00 2001 From: Luke Sandberg Date: Fri, 9 May 2025 08:23:58 -0700 Subject: [PATCH 09/17] improve error messages --- turbopack/crates/turbo-tasks-fs/src/glob.rs | 76 +++++++++++-------- .../src/chunk/placeable.rs | 2 +- 2 files changed, 47 insertions(+), 31 deletions(-) diff --git a/turbopack/crates/turbo-tasks-fs/src/glob.rs b/turbopack/crates/turbo-tasks-fs/src/glob.rs index c5f3654e21fb0..08d394fe6f9ee 100644 --- a/turbopack/crates/turbo-tasks-fs/src/glob.rs +++ b/turbopack/crates/turbo-tasks-fs/src/glob.rs @@ -234,6 +234,9 @@ pub struct GlobProgram { impl GlobProgram { fn compile(pattern: &str) -> Result { + GlobProgram::do_compile(pattern).with_context(|| format!("Failed to parse glob: {pattern}")) + } + fn do_compile(pattern: &str) -> Result { let mut tok = Tokenizer::new(pattern); let mut instructions = Vec::new(); let mut range_sets = Vec::new(); @@ -248,7 +251,7 @@ impl GlobProgram { // complexity of building and visiting an AST. This simple program also allows // trivial optimizations to be performed during compilation inline. loop { - let t = tok.next_token(); + let (pos, t) = tok.next_token(); match t { GlobToken::Caret | GlobToken::ExclamationPoint => { panic!("these cannot appear at the beginning of a pattern") @@ -283,7 +286,7 @@ impl GlobProgram { let mut partial_range: Option = None; let mut negated = false; loop { - let t = tok.next_token(); + let (pos, t) = tok.next_token(); match t { GlobToken::Caret | GlobToken::ExclamationPoint => { if !set.is_empty() @@ -292,7 +295,7 @@ impl GlobProgram { { bail!( "negation tokens can only appear at the beginning of \ - character classes" + character classes @{pos}" ); } negated = !negated; @@ -303,7 +306,7 @@ impl GlobProgram { // several RanngeSets for each byte in the multibyte characters // However, this is very unlikely to be required by a user so // for now the feature is omitted. - bail!("Unsupported non-ascii character in set"); + bail!("Unsupported non-ascii character in set @{pos}"); } if let Some(start) = partial_range { set.push(ByteRange::range(start, lit)); @@ -320,7 +323,10 @@ impl GlobProgram { prev_literal = None; partial_range = Some(lit); } else { - bail!("Unexpected hyphen at the beginning of a character class") + bail!( + "Unexpected hyphen at the beginning of a character class \ + @{pos}" + ) } } @@ -329,13 +335,14 @@ impl GlobProgram { set.push(ByteRange::singleton(lit)); } if partial_range.is_some() { - bail!("unexpected hyphen at the end of a character class") + bail!( + "unexpected hyphen at the end of a character class @{pos}" + ) } let set = RangeSet::new(set); - let index: u8 = range_sets - .len() - .try_into() - .context("Cannot have more than 255 range sets")?; + let index: u8 = range_sets.len().try_into().with_context(|| { + format!("Cannot have more than 255 range sets @{pos}") + })?; range_sets.push(set); // It is more efficient to use a separate instruction than pad out // the range class @@ -347,7 +354,7 @@ impl GlobProgram { break; } _ => { - bail!("Unexpected token {t} inside of character class"); + bail!("Unexpected token {t} inside of character class @{pos}"); } } } @@ -387,7 +394,7 @@ impl GlobProgram { if num_branches <= 1 { bail!( "Cannot have an alternation with less than 2 members, remove the \ - brackets?" + brackets? @{pos}" ); } let mut next_branch_offset = num_branches - 1; @@ -397,9 +404,12 @@ impl GlobProgram { // instruction at the end next_branch_offset += branch.len() + 1; prefix.push(GlobInstruction::Fork( - next_branch_offset.try_into().context( - "glob too large, cannot have more than 32K instructions", - )?, + next_branch_offset.try_into().with_context(|| { + format!( + "glob too large, cannot have more than 32K instructions \ + @{pos}" + ) + })?, )); next_branch_offset -= 1; // subtract one since we added a fork // instruction. @@ -412,9 +422,12 @@ impl GlobProgram { // alternation prefix.append(branch); prefix.push(GlobInstruction::Jump( - end_of_alternation.try_into().context( - "glob too large, cannot have more than 64K instructions", - )?, + end_of_alternation.try_into().with_context(|| { + format!( + "glob too large, cannot have more than 32K instructions \ + @{pos}" + ) + })?, )); end_of_alternation -= 1; // account for the jump instruction } @@ -795,7 +808,10 @@ impl<'a> Tokenizer<'a> { square_bracket_count: 0, } } - fn next_token(&mut self) -> GlobToken { + fn next_token(&mut self) -> (usize, GlobToken) { + (self.pos, self.do_next_token()) + } + fn do_next_token(&mut self) -> GlobToken { match self.input.get(self.pos) { None => GlobToken::End, Some(c) => { @@ -1006,17 +1022,17 @@ mod tests { let mut tok = Tokenizer::new("foo/bar[a-z]/?/**"); let prefix: Vec = "foo/bar".bytes().map(GlobToken::Literal).collect(); for t in prefix { - assert_eq!(t, tok.next_token()); + assert_eq!(t, tok.next_token().1); } - assert_eq!(GlobToken::LSquareBracket, tok.next_token()); - assert_eq!(GlobToken::Literal(b'a'), tok.next_token()); - assert_eq!(GlobToken::Hyphen, tok.next_token()); - assert_eq!(GlobToken::Literal(b'z'), tok.next_token()); - assert_eq!(GlobToken::RSquareBracket, tok.next_token()); - assert_eq!(GlobToken::Literal(b'/'), tok.next_token()); - assert_eq!(GlobToken::QuestionMark, tok.next_token()); - assert_eq!(GlobToken::Literal(b'/'), tok.next_token()); - assert_eq!(GlobToken::GlobStar, tok.next_token()); - assert_eq!(GlobToken::End, tok.next_token()); + assert_eq!(GlobToken::LSquareBracket, tok.next_token().1); + assert_eq!(GlobToken::Literal(b'a'), tok.next_token().1); + assert_eq!(GlobToken::Hyphen, tok.next_token().1); + assert_eq!(GlobToken::Literal(b'z'), tok.next_token().1); + assert_eq!(GlobToken::RSquareBracket, tok.next_token().1); + assert_eq!(GlobToken::Literal(b'/'), tok.next_token().1); + assert_eq!(GlobToken::QuestionMark, tok.next_token().1); + assert_eq!(GlobToken::Literal(b'/'), tok.next_token().1); + assert_eq!(GlobToken::GlobStar, tok.next_token().1); + assert_eq!(GlobToken::End, tok.next_token().1); } } diff --git a/turbopack/crates/turbopack-ecmascript/src/chunk/placeable.rs b/turbopack/crates/turbopack-ecmascript/src/chunk/placeable.rs index b9cd16c374769..e3ee16c48a34a 100644 --- a/turbopack/crates/turbopack-ecmascript/src/chunk/placeable.rs +++ b/turbopack/crates/turbopack-ecmascript/src/chunk/placeable.rs @@ -1,5 +1,5 @@ use anyhow::Result; -use turbo_tasks::{ResolvedVc, TryFlatJoinIterExt, Vc}; +use turbo_tasks::{ResolvedVc, Vc}; use turbo_tasks_fs::{glob::Glob, FileJsonContent, FileSystemPath}; use turbopack_core::{ asset::Asset, From 054b7ead11329a180741cf230246bb4330e24703 Mon Sep 17 00:00:00 2001 From: Luke Sandberg Date: Fri, 9 May 2025 12:02:05 -0700 Subject: [PATCH 10/17] eliminate backwards jumps create a more optimal * matcher for when it is suffixed by a literal as is common --- .../crates/turbo-tasks-fs/benches/mod.rs | 4 +- turbopack/crates/turbo-tasks-fs/src/glob.rs | 637 +++++++++++------- 2 files changed, 383 insertions(+), 258 deletions(-) diff --git a/turbopack/crates/turbo-tasks-fs/benches/mod.rs b/turbopack/crates/turbo-tasks-fs/benches/mod.rs index 94c61f4e214ef..4d5fdffa0a3a2 100644 --- a/turbopack/crates/turbo-tasks-fs/benches/mod.rs +++ b/turbopack/crates/turbo-tasks-fs/benches/mod.rs @@ -102,13 +102,13 @@ fn bench_rope_iteration(c: &mut Criterion) { } fn bench_glob_match_simple(c: &mut Criterion) { - const GLOB: &str = "some/**/n*de?txt"; + const GLOB: &str = "some/**/n*d[k-m]e?txt"; const PATH: &str = "some/a/bigger/path/to/the/crazy/needle.txt"; glob_bench(c, "simple", GLOB, PATH); } fn bench_glob_match_alternations(c: &mut Criterion) { - const GLOB: &str = "some/**/{tob,crazy}/*.{png,txt}"; + const GLOB: &str = "some/**/{tob,crazy}/?*.{png,txt}"; const PATH: &str = "some/a/bigger/path/to/the/crazy/needle.txt"; glob_bench(c, "alternations", GLOB, PATH); diff --git a/turbopack/crates/turbo-tasks-fs/src/glob.rs b/turbopack/crates/turbo-tasks-fs/src/glob.rs index 08d394fe6f9ee..2a81b966d894f 100644 --- a/turbopack/crates/turbo-tasks-fs/src/glob.rs +++ b/turbopack/crates/turbo-tasks-fs/src/glob.rs @@ -1,4 +1,4 @@ -use std::{cmp::Ordering, fmt::Display}; +use std::{cmp::Ordering, collections::VecDeque, fmt::Display}; use anyhow::{anyhow, bail, Context, Result}; use serde::{Deserialize, Serialize}; @@ -237,216 +237,13 @@ impl GlobProgram { GlobProgram::do_compile(pattern).with_context(|| format!("Failed to parse glob: {pattern}")) } fn do_compile(pattern: &str) -> Result { - let mut tok = Tokenizer::new(pattern); + let root = Ast::parse(pattern)?; + let mut instructions = Vec::new(); let mut range_sets = Vec::new(); - // A stack of alternates - struct PendingAlternates { - prefix: Vec, - branches: Vec>, - } - let mut left_bracket_starts: Vec = Vec::new(); - // We avoid synthesizing an AST and compile directly from the lexer, this adds a touch of - // trickiness for alternations but that can be managed by a single stack and avoids the - // complexity of building and visiting an AST. This simple program also allows - // trivial optimizations to be performed during compilation inline. - loop { - let (pos, t) = tok.next_token(); - match t { - GlobToken::Caret | GlobToken::ExclamationPoint => { - panic!("these cannot appear at the beginning of a pattern") - } - GlobToken::Literal(lit) => { - instructions.push(GlobInstruction::MatchLiteral(lit)); - } - GlobToken::Star => { - instructions.push(GlobInstruction::Fork(3)); // allowed to match nothing - instructions.push(GlobInstruction::MatchAnyNonDelim); - // After a star match we either loop back to try again, or proceed. - instructions.push(GlobInstruction::Fork(-1)); - } - GlobToken::GlobStar => { - // allow globstars to match nothing by skipping it - instructions.push(GlobInstruction::Fork(2)); - // We don't actually know if it is terminal or not yet. We will determine - // this after code generation. - instructions.push(GlobInstruction::MatchGlobStar { terminal: false }); - } - GlobToken::QuestionMark => { - // A question match, optionally matches a non-delimiter character, so we either - // skip forward or not. - instructions.push(GlobInstruction::Fork(2)); - instructions.push(GlobInstruction::MatchAnyNonDelim); - } - GlobToken::LSquareBracket => { - let mut set = Vec::new(); - // with in a square bracket we expect a mix of literals and hyphens followed by - // a RSuareBracket - let mut prev_literal: Option = None; - let mut partial_range: Option = None; - let mut negated = false; - loop { - let (pos, t) = tok.next_token(); - match t { - GlobToken::Caret | GlobToken::ExclamationPoint => { - if !set.is_empty() - || prev_literal.is_some() - || partial_range.is_some() - { - bail!( - "negation tokens can only appear at the beginning of \ - character classes @{pos}" - ); - } - negated = !negated; - } - GlobToken::Literal(lit) => { - if lit > 127 { - // TODO(lukesandberg): These are supportable by expanding into - // several RanngeSets for each byte in the multibyte characters - // However, this is very unlikely to be required by a user so - // for now the feature is omitted. - bail!("Unsupported non-ascii character in set @{pos}"); - } - if let Some(start) = partial_range { - set.push(ByteRange::range(start, lit)); - partial_range = None; - } else { - if let Some(prev) = prev_literal { - set.push(ByteRange::singleton(prev)); - } - prev_literal = Some(lit); - } - } - GlobToken::Hyphen => { - if let Some(lit) = prev_literal { - prev_literal = None; - partial_range = Some(lit); - } else { - bail!( - "Unexpected hyphen at the beginning of a character class \ - @{pos}" - ) - } - } - GlobToken::RSquareBracket => { - if let Some(lit) = prev_literal { - set.push(ByteRange::singleton(lit)); - } - if partial_range.is_some() { - bail!( - "unexpected hyphen at the end of a character class @{pos}" - ) - } - let set = RangeSet::new(set); - let index: u8 = range_sets.len().try_into().with_context(|| { - format!("Cannot have more than 255 range sets @{pos}") - })?; - range_sets.push(set); - // It is more efficient to use a separate instruction than pad out - // the range class - if negated { - instructions.push(GlobInstruction::NegativeMatchClass(index)) - } else { - instructions.push(GlobInstruction::MatchClass(index)); - } - break; - } - _ => { - bail!("Unexpected token {t} inside of character class @{pos}"); - } - } - } - } - GlobToken::RSquareBracket | GlobToken::Hyphen => panic!( - "should never happen, tokenizer should have already rejected or been consumed \ - within another branch" - ), - GlobToken::LBracket => { - let mut tmp = Vec::new(); - std::mem::swap(&mut instructions, &mut tmp); - left_bracket_starts.push(PendingAlternates { - prefix: tmp, - branches: Vec::new(), - }); - } - GlobToken::Comma | GlobToken::RBracket => { - // We don't need to check this, the tokenizer enforces that these tokens can - // only occur inside of an alternation so there must be something on our stack. - { - let pending = left_bracket_starts.last_mut().unwrap(); - // for an alternation we either 'jump' past it to the next one, or we - // process it directly - - let mut branch = Vec::new(); - std::mem::swap(&mut instructions, &mut branch); - pending.branches.push(branch); - } - - // If we hit a `}` then we are done so compute the jumps and pop the prefix. - if t == GlobToken::RBracket { - let mut branches = left_bracket_starts.pop().unwrap(); - let prefix = &mut branches.prefix; - let branches = &mut branches.branches; - - let num_branches = branches.len(); - if num_branches <= 1 { - bail!( - "Cannot have an alternation with less than 2 members, remove the \ - brackets? @{pos}" - ); - } - let mut next_branch_offset = num_branches - 1; - for branch in &branches[0..num_branches - 1] { - // to jump past the branch we need to jump past all its instructions - // +1 to account for the JUMP - // instruction at the end - next_branch_offset += branch.len() + 1; - prefix.push(GlobInstruction::Fork( - next_branch_offset.try_into().with_context(|| { - format!( - "glob too large, cannot have more than 32K instructions \ - @{pos}" - ) - })?, - )); - next_branch_offset -= 1; // subtract one since we added a fork - // instruction. - } - let mut end_of_alternation = - next_branch_offset + branches.last().unwrap().len(); - for branch in &mut branches[0..num_branches - 1] { - end_of_alternation -= branch.len(); // from the end of this branch, this is how far it is to the end of - // the - // alternation - prefix.append(branch); - prefix.push(GlobInstruction::Jump( - end_of_alternation.try_into().with_context(|| { - format!( - "glob too large, cannot have more than 32K instructions \ - @{pos}" - ) - })?, - )); - end_of_alternation -= 1; // account for the jump instruction - } - end_of_alternation -= branches.last().unwrap().len(); - prefix.append(branches.last_mut().unwrap()); - debug_assert!(end_of_alternation == 0); - - std::mem::swap(&mut instructions, prefix); - } - } - GlobToken::End => { - instructions.push(GlobInstruction::Match); - break; - } - } - } - if let Some(err) = tok.err { - return Err(err); - } + generate_code(&mut instructions, &mut range_sets, root)?; + instructions.push(GlobInstruction::Match); // Now we need to annotate 'terminal' globstars in order to speed up validating program // matches. The issue is that if we are executing a globstar when the program @@ -463,15 +260,20 @@ impl GlobProgram { bail!("program too large"); } + // This is just used for validation let mut visited = BitSet::new(instructions.len()); + // For each instruction, tracks if there is a a path from it to `Match` let mut has_path_to_match = BitSet::new(instructions.len()); let mut has_path_to_prefix_match = BitSet::new(instructions.len()); + // Compute paths by iterating backwards through the instructions. for start in (0..instructions.len()).rev() { visited.set(start); let (valid_prefix_end, valid_match) = match instructions[start] { GlobInstruction::MatchLiteral(byte) => (byte == b'/', false), - GlobInstruction::MatchAnyNonDelim => (false, false), + GlobInstruction::MatchManyNonDelimWithLit(..) + | GlobInstruction::MatchManyNonDelim + | GlobInstruction::MatchAnyNonDelim => (false, false), GlobInstruction::MatchGlobStar { terminal } => { debug_assert!(!terminal); // shouldn't have been set // a globstar is always a valid prefix end but is only a valid match if the @@ -513,22 +315,16 @@ impl GlobProgram { has_path_to_match.has(next_instruction), ); - if offset < 0 { - // Forks that point backwards are implementing loops. - // So we don't need to follow them now - next - } else { - let fork_target = start + offset as usize; - debug_assert!( - visited.has(fork_target), - "should have already visited the target" - ); - let fork = ( - has_path_to_prefix_match.has(fork_target), - has_path_to_match.has(fork_target), - ); - (next.0 || fork.0, next.1 || fork.1) - } + let fork_target = start + offset as usize; + debug_assert!( + visited.has(fork_target), + "should have already visited the target" + ); + let fork = ( + has_path_to_prefix_match.has(fork_target), + has_path_to_match.has(fork_target), + ); + (next.0 || fork.0, next.1 || fork.1) } GlobInstruction::Match => { // For prefix matches, we only want to say it matches if a a file within the @@ -592,6 +388,7 @@ impl GlobProgram { next.add(ip + 1); } } + GlobInstruction::MatchAnyNonDelim => { if byte != b'/' { if n_threads == 1 { @@ -603,6 +400,29 @@ impl GlobProgram { next.add(ip + 1); } } + GlobInstruction::MatchManyNonDelim => { + if byte != b'/' { + // keep evaluating this instruction and possibly exit + next.add(ip); + next.add(ip + 1); + } + } + GlobInstruction::MatchManyNonDelimWithLit(exit) => { + if byte != b'/' { + // if we match the exit consider this the same as a literal match and + // jump to the subsequent instruction + if byte == exit { + next.add(ip + 2); + next.add(ip); + } else { + // otherwise we can just loop directly like a globstar + if n_threads == 1 { + continue 'outer; + } + next.add(ip); + } + } + } GlobInstruction::MatchGlobStar { terminal } => { if terminal { // If we find a terminal globstar, we are done! this must match whatever @@ -615,6 +435,8 @@ impl GlobProgram { // but even so we should keep trying to match, just like a fork. next.add(ip); } else { + // Otherwise we keep globbing, if we are the only thread jump to the + // next byte if n_threads == 1 { continue 'outer; } @@ -653,7 +475,7 @@ impl GlobProgram { if cur.add(ip + 1) { n_threads += 1; } - if cur.add((offset + (ip as i16)) as u16) { + if cur.add(offset + ip) { n_threads += 1; } } @@ -708,6 +530,320 @@ impl GlobProgram { } } +enum Ast { + Child(usize, GlobToken), + Alternation(usize, Vec<(usize, VecDeque)>), + Class(RangeSet, bool), +} +impl Ast { + fn parse(glob: &str) -> Result> { + let mut tok = Tokenizer::new(glob); + let mut stack: Vec> = Vec::new(); + stack.push(VecDeque::new()); + loop { + let (pos, token) = tok.next_token(); + match token { + GlobToken::Star + | GlobToken::Delimiter + | GlobToken::QuestionMark + | GlobToken::GlobStar + | GlobToken::Literal(_) => { + stack.last_mut().unwrap().push_back(Ast::Child(pos, token)) + } + GlobToken::Caret | GlobToken::ExclamationPoint => { + panic!("BUG: these cannot appear at the beginning of a pattern") + } + GlobToken::LSquareBracket => { + let mut set = Vec::new(); + // with in a square bracket we expect a mix of literals and hyphens followed by + // a RSuareBracket + let mut prev_literal: Option = None; + let mut partial_range: Option = None; + let mut negated = false; + loop { + let (pos, t) = tok.next_token(); + match t { + GlobToken::Caret | GlobToken::ExclamationPoint => { + if !set.is_empty() + || prev_literal.is_some() + || partial_range.is_some() + { + bail!( + "negation tokens can only appear at the beginning of \ + character classes @{pos}" + ); + } + negated = !negated; + } + GlobToken::Literal(lit) => { + if lit > 127 { + // TODO(lukesandberg): These are supportable by expanding into + // several RanngeSets for each byte in the multibyte characters + // However, this is very unlikely to be required by a user so + // for now the feature is omitted. + bail!("Unsupported non-ascii character in set @{pos}"); + } + if let Some(start) = partial_range { + set.push(ByteRange::range(start, lit)); + partial_range = None; + } else { + if let Some(prev) = prev_literal { + set.push(ByteRange::singleton(prev)); + } + prev_literal = Some(lit); + } + } + GlobToken::Hyphen => { + if let Some(lit) = prev_literal { + prev_literal = None; + partial_range = Some(lit); + } else { + bail!( + "Unexpected hyphen at the beginning of a character class \ + @{pos}" + ) + } + } + + GlobToken::RSquareBracket => { + if let Some(lit) = prev_literal { + set.push(ByteRange::singleton(lit)); + } + if partial_range.is_some() { + bail!( + "Unexpected hyphen at the end of a character class @{pos}" + ) + } + stack + .last_mut() + .unwrap() + .push_back(Ast::Class(RangeSet::new(set), negated)); + break; + } + _ => { + bail!("Unexpected token {t} inside of character class @{pos}"); + } + } + } + } + GlobToken::RSquareBracket | GlobToken::Hyphen => panic!( + "should never happen, tokenizer should have already rejected or been consumed \ + within another branch" + ), + GlobToken::LBracket => { + stack + .last_mut() + .unwrap() + .push_back(Ast::Alternation(pos, vec![(pos, VecDeque::new())])); + stack.push(VecDeque::new()) + } + GlobToken::Comma | GlobToken::RBracket => { + let mut last_branch = stack.pop().unwrap(); + if let Ast::Alternation(_, branches) = + stack.last_mut().unwrap().back_mut().unwrap() + { + branches.last_mut().unwrap().1.append(&mut last_branch); + if token == GlobToken::Comma { + branches.push((pos, VecDeque::new())); + stack.push(VecDeque::new()); + } + } else { + // The lexer ensures that these tokens only occur in the context of an + // alternation. + panic!("impossible!, token in unexpected place"); + } + } + GlobToken::End => break, + } + } + if stack.len() != 1 { + bail!("Expected '}}' before end of pattern"); + } + if let Some(err) = tok.err { + return Err(err); + } + Ok(stack.pop().unwrap()) + } +} + +fn generate_code( + instructions: &mut Vec, + range_sets: &mut Vec, + mut root: VecDeque, +) -> Result<(bool, bool), anyhow::Error> { + // check and validate globstar structure + let mut i = 0; + let mut starts_with_globstar = false; + let mut ends_with_globstar = false; + while i < root.len() { + if let Ast::Child(pos, GlobToken::GlobStar) = root[i] { + // a ** should be followed by a `/` or the end of the branch + if i < root.len() - 1 { + let next = &root[i + 1]; + if !matches!(next, Ast::Child(_, GlobToken::Delimiter)) { + bail!("Globstar must be a complete path segment, e.g. /**/ @{pos}"); + } + root.remove(i + 1); + } else { + ends_with_globstar = true; + } + + // a ** should be prefixed by a `/` or the beginning of the branch + // duplicated **/**/ are just dropped. + if i > 0 { + let prev = &root[i - 1]; + if matches!(prev, Ast::Child(_, GlobToken::GlobStar)) { + root.remove(i); + i -= 1; + } else if !matches!(prev, Ast::Child(_, GlobToken::Delimiter)) { + bail!("Globstar must be a complete path segment, e.g. /**/ @{pos}"); + } + } else { + starts_with_globstar = true; + } + } + i += 1; + } + + let mut prev_token_was_delimiter = false; + let mut is_first = true; + while let Some(node) = root.pop_front() { + match node { + Ast::Child(_, glob_token) => { + prev_token_was_delimiter = false; + match glob_token { + GlobToken::Literal(byte) => { + instructions.push(GlobInstruction::MatchLiteral(byte)) + } + GlobToken::Delimiter => { + prev_token_was_delimiter = true; + instructions.push(GlobInstruction::MatchLiteral(b'/')) + } + GlobToken::Star => { + instructions.push(GlobInstruction::Fork(2)); // allowed to match nothing + instructions.push(match root.front() { + Some(Ast::Child(_, GlobToken::Literal(byte))) => { + GlobInstruction::MatchManyNonDelimWithLit(*byte) + } + _ => GlobInstruction::MatchManyNonDelim, + }); + } + GlobToken::QuestionMark => { + // A question match, optionally matches a non-delimiter character, so we + // either skip forward or not. + instructions.push(GlobInstruction::Fork(2)); + instructions.push(GlobInstruction::MatchAnyNonDelim); + } + GlobToken::GlobStar => { + // allow globstars to match nothing by skipping it + instructions.push(GlobInstruction::Fork(2)); + // We don't actually know if it is terminal or not yet. We will determine + // this after code generation. + instructions.push(GlobInstruction::MatchGlobStar { terminal: false }); + } + + GlobToken::LSquareBracket + | GlobToken::RSquareBracket + | GlobToken::LBracket + | GlobToken::RBracket + | GlobToken::Comma + | GlobToken::Hyphen + | GlobToken::ExclamationPoint + | GlobToken::Caret + | GlobToken::End => unreachable!(), + }; + } + Ast::Alternation(pos, branches) => { + let num_branches = branches.len(); + if num_branches <= 1 { + bail!("alternations should have >1 branch, remove the {{'s? @{pos}") + } + let mut branch_instructions = Vec::with_capacity(branches.len()); + for (branch_pos, branch) in branches { + let mut instructions = Vec::new(); + let (branch_starts_with_globstar, branch_ends_with_globstar) = + generate_code(&mut instructions, range_sets, branch)?; + if branch_starts_with_globstar { + if !is_first { + if !prev_token_was_delimiter { + bail!( + "Alternation begins with a glob star that is not prefixed by \ + a '/' @{branch_pos}" + ); + } + } else { + starts_with_globstar = true; + } + } + if branch_ends_with_globstar { + if root.is_empty() { + ends_with_globstar = true; + } else { + bail!( + "An alternation can only end with a glob star if it is at the end \ + of the pattern @{branch_pos}" + ); + } + } + branch_instructions.push(instructions); + } + + let mut next_branch_offset = num_branches - 1; + for branch in &branch_instructions[0..num_branches - 1] { + // to jump past the branch we need to jump past all its instructions + // +1 to account for the JUMP + // instruction at the end + next_branch_offset += branch.len() + 1; + instructions.push(GlobInstruction::Fork( + next_branch_offset.try_into().with_context(|| { + format!( + "glob too large, cannot have more than 32K instructions @{pos}" + ) + })?, + )); + next_branch_offset -= 1; // subtract one since we added a fork + // instruction. + } + let mut end_of_alternation = + next_branch_offset + branch_instructions.last().unwrap().len(); + for branch in &mut branch_instructions[0..num_branches - 1] { + end_of_alternation -= branch.len(); // from the end of this branch, this is how far it is to the end of + // the + // alternation + instructions.append(branch); + instructions.push(GlobInstruction::Jump( + end_of_alternation.try_into().with_context(|| { + format!( + "glob too large, cannot have more than 32K instructions @{pos}" + ) + })?, + )); + end_of_alternation -= 1; // account for the jump instruction + } + let last_branch = branch_instructions.last_mut().unwrap(); + end_of_alternation -= last_branch.len(); + instructions.append(last_branch); + debug_assert!(end_of_alternation == 0); + } + Ast::Class(range_set, negated) => { + prev_token_was_delimiter = false; + let index: u8 = range_sets + .len() + .try_into() + .context("Cannot have >255 character classes in a glob")?; + range_sets.push(range_set); + instructions.push(if negated { + GlobInstruction::NegativeMatchClass(index) + } else { + GlobInstruction::MatchClass(index) + }); + } + } + is_first = false; + } + Ok((starts_with_globstar, ends_with_globstar)) +} + // Consider a more compact encoding. // The jump offsets force this to 4 bytes // A variable length instruction encoding would help a lot @@ -717,6 +853,10 @@ enum GlobInstruction { MatchLiteral(u8), // Matches any non-`/` character MatchAnyNonDelim, + // Matches any number of non-`/` character + MatchManyNonDelim, + // Matches any number of non-`/` character followed by a literal as a hint + MatchManyNonDelimWithLit(u8), // Matches **, which is any character but can only 'end' on a `/` or end of string MatchGlobStar { terminal: bool }, // Matches any character in the set @@ -729,8 +869,7 @@ enum GlobInstruction { // other alternates. Jump(u16), // Splits control flow into two branches. - // Represented as a signed integer since we allow backwards forks - Fork(i16), + Fork(u16), // End of program Match, } @@ -763,6 +902,8 @@ enum GlobToken { ExclamationPoint, // a '^' token. Same rules as ! Caret, + // a `/` token, + Delimiter, End, } impl Display for GlobToken { @@ -785,6 +926,7 @@ impl Display for GlobToken { GlobToken::Literal(_) => panic!("impossible"), GlobToken::ExclamationPoint => "!", GlobToken::Caret => "^", + GlobToken::Delimiter => "/", }) } } @@ -820,29 +962,7 @@ impl<'a> Tokenizer<'a> { b'*' if self.square_bracket_count == 0 => match self.input.get(self.pos) { Some(b) if *b == b'*' => { // This is a globstar - - if self.pos > 2 && self.input[self.pos - 2] != b'/' { - self.err = Some(anyhow!( - "a glob star must be a full path segment, e.g. `/**/` @{}", - self.pos - )); - return GlobToken::End; - } self.pos += 1; - match self.input.get(self.pos) { - None => {} - Some(b'/') => { - self.pos += 1; - } // ok if we are at the end or followed by a `/` - _ => { - self.err = Some(anyhow!( - "a glob star must be a full path segment, e.g. `/**/`@{}", - self.pos - )); - return GlobToken::End; - } - } - GlobToken::GlobStar } _ => GlobToken::Star, @@ -882,6 +1002,7 @@ impl<'a> Tokenizer<'a> { // only valid inside of a character class b'!' if self.square_bracket_count > 0 => GlobToken::ExclamationPoint, b'^' if self.square_bracket_count > 0 => GlobToken::Caret, + b'/' if self.square_bracket_count == 0 => GlobToken::Delimiter, cur => { if *cur == b'\\' { match self.input.get(self.pos) { @@ -982,6 +1103,7 @@ mod tests { #[case::alternatives_nested2("{a,b/c,d/e/{f,g/h}}", "b/c")] #[case::alternatives_nested3("{a,b/c,d/e/{f,g/h}}", "d/e/f")] #[case::alternatives_nested4("{a,b/c,d/e/{f,g/h}}", "d/e/g/h")] + #[case::alternatives_nested6("{a/**,b/**,{c/**,d/**}}", "d/")] // #[case::alternatives_chars("[abc]", "b")] fn glob_match(#[case] glob: &str, #[case] path: &str) { let parsed = Glob::parse(glob).unwrap(); @@ -1020,18 +1142,21 @@ mod tests { #[test] fn test_tokenizer() { let mut tok = Tokenizer::new("foo/bar[a-z]/?/**"); - let prefix: Vec = "foo/bar".bytes().map(GlobToken::Literal).collect(); - for t in prefix { - assert_eq!(t, tok.next_token().1); - } + assert_eq!(GlobToken::Literal(b'f'), tok.next_token().1); + assert_eq!(GlobToken::Literal(b'o'), tok.next_token().1); + assert_eq!(GlobToken::Literal(b'o'), tok.next_token().1); + assert_eq!(GlobToken::Delimiter, tok.next_token().1); + assert_eq!(GlobToken::Literal(b'b'), tok.next_token().1); + assert_eq!(GlobToken::Literal(b'a'), tok.next_token().1); + assert_eq!(GlobToken::Literal(b'r'), tok.next_token().1); assert_eq!(GlobToken::LSquareBracket, tok.next_token().1); assert_eq!(GlobToken::Literal(b'a'), tok.next_token().1); assert_eq!(GlobToken::Hyphen, tok.next_token().1); assert_eq!(GlobToken::Literal(b'z'), tok.next_token().1); assert_eq!(GlobToken::RSquareBracket, tok.next_token().1); - assert_eq!(GlobToken::Literal(b'/'), tok.next_token().1); + assert_eq!(GlobToken::Delimiter, tok.next_token().1); assert_eq!(GlobToken::QuestionMark, tok.next_token().1); - assert_eq!(GlobToken::Literal(b'/'), tok.next_token().1); + assert_eq!(GlobToken::Delimiter, tok.next_token().1); assert_eq!(GlobToken::GlobStar, tok.next_token().1); assert_eq!(GlobToken::End, tok.next_token().1); } From 0af70696cdd7a5d4959237a76d4cdc75a294c23c Mon Sep 17 00:00:00 2001 From: Luke Sandberg Date: Fri, 9 May 2025 12:07:18 -0700 Subject: [PATCH 11/17] comments! --- turbopack/crates/turbo-tasks-fs/src/glob.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/turbopack/crates/turbo-tasks-fs/src/glob.rs b/turbopack/crates/turbo-tasks-fs/src/glob.rs index 2a81b966d894f..b5f8487c34085 100644 --- a/turbopack/crates/turbo-tasks-fs/src/glob.rs +++ b/turbopack/crates/turbo-tasks-fs/src/glob.rs @@ -844,9 +844,8 @@ fn generate_code( Ok((starts_with_globstar, ends_with_globstar)) } -// Consider a more compact encoding. -// The jump offsets force this to 4 bytes -// A variable length instruction encoding would help a lot +// A more compact encoding would be nice but experimentally it does not save much +// Instead we should explore ways to encode more hints into the instructions to speed up matchers. #[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)] enum GlobInstruction { // Matches a single literal byte From 960677cc28ae865738dc2f85e2e88b370e2644c1 Mon Sep 17 00:00:00 2001 From: Luke Sandberg Date: Fri, 9 May 2025 12:31:54 -0700 Subject: [PATCH 12/17] most of the matching time is spent in loops so move memory accesses out of the loop header and into our jump locations this requires some code duplication but because we eliminate reading `self.instructions` in the hot loops we can eliminate most of the memory operations and thus accelerate matching by a lot. With this we consistently beat 'fast-glob' and are neck and neck with globset. --- Cargo.lock | 7 ++++--- turbopack/crates/turbo-tasks-fs/Cargo.toml | 1 + turbopack/crates/turbo-tasks-fs/benches/mod.rs | 9 +++++++++ turbopack/crates/turbo-tasks-fs/src/glob.rs | 8 +++++++- 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4027a813fe9b4..1a014f4d4568a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2673,9 +2673,9 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "globset" -version = "0.4.14" +version = "0.4.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57da3b9b5b85bd66f31093f8c408b90a74431672542466497dcbdfdc02034be1" +checksum = "54a1028dfc5f5df5da8a56a73e6c153c9a9708ec57232470703592a3f18e49f5" dependencies = [ "aho-corasick", "bstr", @@ -3014,7 +3014,7 @@ dependencies = [ "httpdate", "itoa", "pin-project-lite", - "socket2 0.4.9", + "socket2 0.5.8", "tokio", "tower-service", "tracing", @@ -9569,6 +9569,7 @@ dependencies = [ "fast-glob", "futures", "futures-retry", + "globset", "include_dir", "indexmap 2.7.1", "jsonc-parser 0.21.0", diff --git a/turbopack/crates/turbo-tasks-fs/Cargo.toml b/turbopack/crates/turbo-tasks-fs/Cargo.toml index cdfc83924448e..0ed6aa37d8fbc 100644 --- a/turbopack/crates/turbo-tasks-fs/Cargo.toml +++ b/turbopack/crates/turbo-tasks-fs/Cargo.toml @@ -65,6 +65,7 @@ turbo-tasks-memory = { workspace = true } turbo-tasks-testing = { workspace = true } turbo-tasks-backend = { workspace = true } fast-glob = "0.4.5" +globset = "0.4.16" [build-dependencies] turbo-tasks-build = { workspace = true } diff --git a/turbopack/crates/turbo-tasks-fs/benches/mod.rs b/turbopack/crates/turbo-tasks-fs/benches/mod.rs index 4d5fdffa0a3a2..f5f1ceb37703b 100644 --- a/turbopack/crates/turbo-tasks-fs/benches/mod.rs +++ b/turbopack/crates/turbo-tasks-fs/benches/mod.rs @@ -128,6 +128,15 @@ fn glob_bench(c: &mut Criterion, name: &'static str, glob: &str, path: &str) { let g = turbo_tasks_fs::glob::Glob::parse(glob).unwrap(); group.bench_function("turbo-glob-cached", |b| b.iter(|| g.execute(path))); + group.bench_function("globset", |b| { + b.iter(|| { + let g = globset::Glob::new(glob).unwrap().compile_matcher(); + g.is_match(path) + }) + }); + let g = globset::Glob::new(glob).unwrap().compile_matcher(); + group.bench_function("globset-cached", |b| b.iter(|| g.is_match(path))); + group.finish(); } diff --git a/turbopack/crates/turbo-tasks-fs/src/glob.rs b/turbopack/crates/turbo-tasks-fs/src/glob.rs index b5f8487c34085..3bbe2a3724e25 100644 --- a/turbopack/crates/turbo-tasks-fs/src/glob.rs +++ b/turbopack/crates/turbo-tasks-fs/src/glob.rs @@ -370,18 +370,20 @@ impl GlobProgram { // program let mut n_threads = 1; let mut ip = 0; + let mut instruction = self.instructions[0]; 'outer: for &byte in v.as_bytes() { let mut thread_index = 0; // We manage the loop manually at the bottom to make it easier to skip it when hitting // some fast paths loop { - match self.instructions[ip as usize] { + match instruction { GlobInstruction::MatchLiteral(m) => { if byte == m { if n_threads == 1 { cur.clear(); ip += 1; cur.add(ip); + instruction = self.instructions[ip as usize + 1]; continue 'outer; } // We matched, proceed to the next character @@ -395,6 +397,7 @@ impl GlobProgram { cur.clear(); ip += 1; cur.add(ip); + instruction = self.instructions[ip as usize + 1]; continue 'outer; } next.add(ip + 1); @@ -449,6 +452,7 @@ impl GlobProgram { cur.clear(); ip += 1; cur.add(ip); + instruction = self.instructions[ip as usize + 1]; continue 'outer; } next.add(ip + 1); @@ -460,6 +464,7 @@ impl GlobProgram { cur.clear(); ip += 1; cur.add(ip); + instruction = self.instructions[ip as usize + 1]; continue 'outer; } next.add(ip + 1); @@ -488,6 +493,7 @@ impl GlobProgram { thread_index += 1; if thread_index < n_threads { ip = cur.get(thread_index); + instruction = self.instructions[ip as usize]; } else { break; } From 907c9cea6a2237ada4eb2f0e223f8749db5894e1 Mon Sep 17 00:00:00 2001 From: Luke Sandberg Date: Fri, 9 May 2025 12:49:04 -0700 Subject: [PATCH 13/17] fix a bug, add comments, cry --- turbopack/crates/turbo-tasks-fs/src/glob.rs | 39 +++++++++++++-------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/turbopack/crates/turbo-tasks-fs/src/glob.rs b/turbopack/crates/turbo-tasks-fs/src/glob.rs index 3bbe2a3724e25..8a9a1e0677556 100644 --- a/turbopack/crates/turbo-tasks-fs/src/glob.rs +++ b/turbopack/crates/turbo-tasks-fs/src/glob.rs @@ -104,6 +104,9 @@ impl Ord for ByteRange { #[derive(Debug, Eq, Clone, PartialEq, Hash, Serialize, Deserialize)] struct RangeSet { + // A sorted list of non-overlapping ranges. + // The ranges are also non-abutting, but that isn't + // This makes it ranges: Box<[ByteRange]>, } @@ -141,7 +144,9 @@ impl RangeSet { } // A sparse set of integers that supports O(1) insertion, testing, clearing and O(n) iteration. -// https://research.swtch.com/sparse +// https://research.swtch.com/sparse has a nice writeup +// This is useful for our purposes becasue we often need to clear the set and the O(1) clear time is +// critical. struct SparseSet<'a> { mem: &'a mut [u16], n: u16, @@ -175,6 +180,13 @@ impl<'a> SparseSet<'a> { fn clear(&mut self) { self.n = 0 } + fn clear_and_add(&mut self, v: u16) { + debug_assert!((v as usize) < self.mem.len() / 2); + self.n = 1; + *self.get_sparse_mut(v) = 0; + *self.get_dense_mut(0) = v; + } + fn add(&mut self, v: u16) -> bool { debug_assert!((v as usize) < self.mem.len() / 2); let n = self.n; @@ -368,6 +380,7 @@ impl GlobProgram { // program but typically far less // This bounds execution at O(N*M) where N is the size of the path and M is the size of the // program + // This is the same as `cur.n` but we track it here to avoid the indirect memory read let mut n_threads = 1; let mut ip = 0; let mut instruction = self.instructions[0]; @@ -380,10 +393,9 @@ impl GlobProgram { GlobInstruction::MatchLiteral(m) => { if byte == m { if n_threads == 1 { - cur.clear(); ip += 1; - cur.add(ip); - instruction = self.instructions[ip as usize + 1]; + cur.clear_and_add(ip); + instruction = self.instructions[ip as usize]; continue 'outer; } // We matched, proceed to the next character @@ -394,10 +406,9 @@ impl GlobProgram { GlobInstruction::MatchAnyNonDelim => { if byte != b'/' { if n_threads == 1 { - cur.clear(); ip += 1; - cur.add(ip); - instruction = self.instructions[ip as usize + 1]; + cur.clear_and_add(ip); + instruction = self.instructions[ip as usize]; continue 'outer; } next.add(ip + 1); @@ -449,10 +460,9 @@ impl GlobProgram { GlobInstruction::MatchClass(index) => { if self.range_sets[index as usize].contains(byte) { if n_threads == 1 { - cur.clear(); ip += 1; - cur.add(ip); - instruction = self.instructions[ip as usize + 1]; + cur.clear_and_add(ip); + instruction = self.instructions[ip as usize]; continue 'outer; } next.add(ip + 1); @@ -461,10 +471,9 @@ impl GlobProgram { GlobInstruction::NegativeMatchClass(index) => { if !self.range_sets[index as usize].contains(byte) { if n_threads == 1 { - cur.clear(); ip += 1; - cur.add(ip); - instruction = self.instructions[ip as usize + 1]; + cur.clear_and_add(ip); + instruction = self.instructions[ip as usize]; continue 'outer; } next.add(ip + 1); @@ -489,7 +498,8 @@ impl GlobProgram { // so this thread dies } } - // Do this at the bottom of the loop + // Do this at the bottom of the loop, this allows our early returns above to skip + // the dependent memory reads. thread_index += 1; if thread_index < n_threads { ip = cur.get(thread_index); @@ -505,6 +515,7 @@ impl GlobProgram { return false; } ip = next.get(0); + instruction = self.instructions[ip as usize]; // We have some progress! clear current and swap the two lists to advance to the next // character. cur.clear(); From b4c4bb0ac907b8b5323232dd3c4fb43f1163e37f Mon Sep 17 00:00:00 2001 From: Luke Sandberg Date: Fri, 9 May 2025 14:44:34 -0700 Subject: [PATCH 14/17] optimize loops even more eliminate dispatching overhead --- turbopack/crates/turbo-tasks-fs/src/glob.rs | 152 +++++++++++++------- 1 file changed, 103 insertions(+), 49 deletions(-) diff --git a/turbopack/crates/turbo-tasks-fs/src/glob.rs b/turbopack/crates/turbo-tasks-fs/src/glob.rs index 8a9a1e0677556..67aeb4add62e7 100644 --- a/turbopack/crates/turbo-tasks-fs/src/glob.rs +++ b/turbopack/crates/turbo-tasks-fs/src/glob.rs @@ -242,6 +242,8 @@ pub struct GlobProgram { // Instructions we can end on that are a valid prefix match prefix_match_instructions: BitSet, range_sets: Box<[RangeSet]>, + // Constant prefixes if any, used as a prefilter. + prefix: Option>, } impl GlobProgram { @@ -250,11 +252,30 @@ impl GlobProgram { } fn do_compile(pattern: &str) -> Result { let root = Ast::parse(pattern)?; - let mut instructions = Vec::new(); let mut range_sets = Vec::new(); generate_code(&mut instructions, &mut range_sets, root)?; + + let mut prefix = None; + for instruction in &instructions { + match instruction { + GlobInstruction::MatchLiteral(b) => { + if prefix.is_none() { + prefix = Some(Vec::new()); + } + prefix.as_mut().unwrap().push(*b); + } + _ => { + break; + } + } + } + if let Some(ref prefix) = prefix { + // drop the prefix + instructions.drain(0..prefix.len()); + } + instructions.push(GlobInstruction::Match); // Now we need to annotate 'terminal' globstars in order to speed up validating program @@ -358,10 +379,24 @@ impl GlobProgram { match_instructions: has_path_to_match, prefix_match_instructions: has_path_to_prefix_match, range_sets: range_sets.into_boxed_slice(), + prefix: prefix.map(|v| v.into_boxed_slice()), }) } fn matches(&self, v: &str, prefix: bool) -> bool { + let mut v = v.as_bytes(); + // trim a prefix if we have one + if let Some(literal_prefix) = &self.prefix { + if prefix && v.len() <= literal_prefix.len() { + return v == &literal_prefix[0..v.len()]; + } else { + if literal_prefix.len() > v.len() || v[..literal_prefix.len()] != **literal_prefix { + return false; + } + v = &v[literal_prefix.len()..]; + } + } + let len = self.instructions.len(); // Use a single uninitialized allocation for our storage. let mut storage: Vec = @@ -384,20 +419,26 @@ impl GlobProgram { let mut n_threads = 1; let mut ip = 0; let mut instruction = self.instructions[0]; - 'outer: for &byte in v.as_bytes() { + let mut vi = 0; + let vlen = v.len(); + 'outer: while vi < vlen { + let mut byte = v[vi]; + vi += 1; let mut thread_index = 0; // We manage the loop manually at the bottom to make it easier to skip it when hitting // some fast paths loop { + // The dispatching is the slowest part of the loop. To mitigate we should allow + // some of our fastpaths to advance to the next byte locally. match instruction { GlobInstruction::MatchLiteral(m) => { if byte == m { - if n_threads == 1 { - ip += 1; - cur.clear_and_add(ip); - instruction = self.instructions[ip as usize]; - continue 'outer; - } + // if n_threads == 1 { + // ip += 1; + // cur.clear_and_add(ip); + // instruction = self.instructions[ip as usize]; + // continue 'outer; + // } // We matched, proceed to the next character next.add(ip + 1); } @@ -405,12 +446,12 @@ impl GlobProgram { GlobInstruction::MatchAnyNonDelim => { if byte != b'/' { - if n_threads == 1 { - ip += 1; - cur.clear_and_add(ip); - instruction = self.instructions[ip as usize]; - continue 'outer; - } + // if n_threads == 1 { + // ip += 1; + // cur.clear_and_add(ip); + // instruction = self.instructions[ip as usize]; + // continue 'outer; + // } next.add(ip + 1); } } @@ -422,19 +463,25 @@ impl GlobProgram { } } GlobInstruction::MatchManyNonDelimWithLit(exit) => { - if byte != b'/' { - // if we match the exit consider this the same as a literal match and - // jump to the subsequent instruction - if byte == exit { - next.add(ip + 2); - next.add(ip); - } else { - // otherwise we can just loop directly like a globstar - if n_threads == 1 { - continue 'outer; + loop { + if byte != b'/' { + // if we match the exit consider this the same as a literal match + // and jump to the subsequent + // instruction + if byte == exit { + next.add(ip); + next.add(ip + 2); + } else { + // otherwise we can just loop directly like a globstar + if n_threads == 1 && vi < vlen { + byte = v[vi]; + vi += 1; + continue; + } + next.add(ip); } - next.add(ip); } + break; } } GlobInstruction::MatchGlobStar { terminal } => { @@ -443,44 +490,51 @@ impl GlobProgram { // remains return true; } - // If we see a `/` then we need to consider ending the globstar. - if byte == b'/' { - next.add(ip + 1); - // but even so we should keep trying to match, just like a fork. - next.add(ip); - } else { - // Otherwise we keep globbing, if we are the only thread jump to the - // next byte - if n_threads == 1 { - continue 'outer; + loop { + // If we see a `/` then we need to consider ending the globstar. + if byte == b'/' { + // but even so we keep trying to match, just like a fork. + next.add(ip); + next.add(ip + 1); + } else { + // Otherwise we keep globbing, if we are the only thread jump to the + // next byte + if n_threads == 1 && vi < vlen { + byte = v[vi]; + vi += 1; + continue; + } + // otherwise wait for the other threads to complete + next.add(ip); } - next.add(ip); + break; } } GlobInstruction::MatchClass(index) => { if self.range_sets[index as usize].contains(byte) { - if n_threads == 1 { - ip += 1; - cur.clear_and_add(ip); - instruction = self.instructions[ip as usize]; - continue 'outer; - } + // if n_threads == 1 { + // ip += 1; + // cur.clear_and_add(ip); + // instruction = self.instructions[ip as usize]; + // continue 'outer; + // } next.add(ip + 1); } } GlobInstruction::NegativeMatchClass(index) => { if !self.range_sets[index as usize].contains(byte) { - if n_threads == 1 { - ip += 1; - cur.clear_and_add(ip); - instruction = self.instructions[ip as usize]; - continue 'outer; - } + // if n_threads == 1 { + // ip += 1; + // cur.clear_and_add(ip); + // instruction = self.instructions[ip as usize]; + // continue 'outer; + // } next.add(ip + 1); } } GlobInstruction::Jump(offset) => { // Push another thread onto the current list + // This is just for when we exiting alternations to skip over alternates if cur.add(offset + ip) { n_threads += 1; } @@ -495,7 +549,7 @@ impl GlobProgram { } GlobInstruction::Match => { // We ran out of instructions while we still have characters - // so this thread dies + // so this thread dies. } } // Do this at the bottom of the loop, this allows our early returns above to skip From be675bbaa33637623848855b759ac4d3485270c5 Mon Sep 17 00:00:00 2001 From: Luke Sandberg Date: Fri, 9 May 2025 14:44:56 -0700 Subject: [PATCH 15/17] cklippy --- turbopack/crates/turbo-tasks-fs/src/glob.rs | 33 +-------------------- 1 file changed, 1 insertion(+), 32 deletions(-) diff --git a/turbopack/crates/turbo-tasks-fs/src/glob.rs b/turbopack/crates/turbo-tasks-fs/src/glob.rs index 67aeb4add62e7..1cd5800af2d8d 100644 --- a/turbopack/crates/turbo-tasks-fs/src/glob.rs +++ b/turbopack/crates/turbo-tasks-fs/src/glob.rs @@ -180,12 +180,6 @@ impl<'a> SparseSet<'a> { fn clear(&mut self) { self.n = 0 } - fn clear_and_add(&mut self, v: u16) { - debug_assert!((v as usize) < self.mem.len() / 2); - self.n = 1; - *self.get_sparse_mut(v) = 0; - *self.get_dense_mut(0) = v; - } fn add(&mut self, v: u16) -> bool { debug_assert!((v as usize) < self.mem.len() / 2); @@ -421,7 +415,7 @@ impl GlobProgram { let mut instruction = self.instructions[0]; let mut vi = 0; let vlen = v.len(); - 'outer: while vi < vlen { + while vi < vlen { let mut byte = v[vi]; vi += 1; let mut thread_index = 0; @@ -433,25 +427,12 @@ impl GlobProgram { match instruction { GlobInstruction::MatchLiteral(m) => { if byte == m { - // if n_threads == 1 { - // ip += 1; - // cur.clear_and_add(ip); - // instruction = self.instructions[ip as usize]; - // continue 'outer; - // } // We matched, proceed to the next character next.add(ip + 1); } } - GlobInstruction::MatchAnyNonDelim => { if byte != b'/' { - // if n_threads == 1 { - // ip += 1; - // cur.clear_and_add(ip); - // instruction = self.instructions[ip as usize]; - // continue 'outer; - // } next.add(ip + 1); } } @@ -512,23 +493,11 @@ impl GlobProgram { } GlobInstruction::MatchClass(index) => { if self.range_sets[index as usize].contains(byte) { - // if n_threads == 1 { - // ip += 1; - // cur.clear_and_add(ip); - // instruction = self.instructions[ip as usize]; - // continue 'outer; - // } next.add(ip + 1); } } GlobInstruction::NegativeMatchClass(index) => { if !self.range_sets[index as usize].contains(byte) { - // if n_threads == 1 { - // ip += 1; - // cur.clear_and_add(ip); - // instruction = self.instructions[ip as usize]; - // continue 'outer; - // } next.add(ip + 1); } } From 3003ea2fa6879a79307633a45b64c2b80e0f13dd Mon Sep 17 00:00:00 2001 From: Luke Sandberg Date: Fri, 9 May 2025 16:14:11 -0700 Subject: [PATCH 16/17] fix a logical error --- turbopack/crates/turbo-tasks-fs/src/glob.rs | 15 ++-- .../src/chunk/placeable.rs | 84 +++++++++---------- turbopack/crates/turbopack/src/lib.rs | 8 +- 3 files changed, 57 insertions(+), 50 deletions(-) diff --git a/turbopack/crates/turbo-tasks-fs/src/glob.rs b/turbopack/crates/turbo-tasks-fs/src/glob.rs index 1cd5800af2d8d..161de253f9ef2 100644 --- a/turbopack/crates/turbo-tasks-fs/src/glob.rs +++ b/turbopack/crates/turbo-tasks-fs/src/glob.rs @@ -22,6 +22,7 @@ use turbo_tasks::Vc; pub struct Glob { #[turbo_tasks(trace_ignore)] program: GlobProgram, + src: String, } impl Glob { @@ -33,7 +34,12 @@ impl Glob { } else { path }; - self.program.matches(path, match_partial) + let result = self.program.matches(path, match_partial); + let src = &self.src; + + let dbg = format!("matching: '{src}' against '{path}' = {result}"); + dbg!(dbg); + result } // Returns true if the glob could match a filename underneath this `path` where the path @@ -44,6 +50,7 @@ impl Glob { pub fn parse(input: &str) -> Result { Ok(Self { + src: input.to_string(), program: GlobProgram::compile(input)?, }) } @@ -242,6 +249,7 @@ pub struct GlobProgram { impl GlobProgram { fn compile(pattern: &str) -> Result { + dbg!(format!("Compiling: {pattern}")); GlobProgram::do_compile(pattern).with_context(|| format!("Failed to parse glob: {pattern}")) } fn do_compile(pattern: &str) -> Result { @@ -795,10 +803,7 @@ fn generate_code( } Ast::Alternation(pos, branches) => { let num_branches = branches.len(); - if num_branches <= 1 { - bail!("alternations should have >1 branch, remove the {{'s? @{pos}") - } - let mut branch_instructions = Vec::with_capacity(branches.len()); + let mut branch_instructions = Vec::with_capacity(num_branches); for (branch_pos, branch) in branches { let mut instructions = Vec::new(); let (branch_starts_with_globstar, branch_ends_with_globstar) = diff --git a/turbopack/crates/turbopack-ecmascript/src/chunk/placeable.rs b/turbopack/crates/turbopack-ecmascript/src/chunk/placeable.rs index e3ee16c48a34a..892841fc648c0 100644 --- a/turbopack/crates/turbopack-ecmascript/src/chunk/placeable.rs +++ b/turbopack/crates/turbopack-ecmascript/src/chunk/placeable.rs @@ -1,5 +1,5 @@ use anyhow::Result; -use turbo_tasks::{ResolvedVc, Vc}; +use turbo_tasks::{ResolvedVc, TryFlatJoinIterExt, Vc}; use turbo_tasks_fs::{glob::Glob, FileJsonContent, FileSystemPath}; use turbopack_core::{ asset::Asset, @@ -33,7 +33,7 @@ pub trait EcmascriptChunkPlaceable: ChunkableModule + Module + Asset { enum SideEffectsValue { None, Constant(bool), - Glob(ResolvedVc), + Glob(Vec>), } #[turbo_tasks::function] @@ -49,36 +49,13 @@ async fn side_effects_from_package_json( .iter() .filter_map(|side_effect| { if let Some(side_effect) = side_effect.as_str() { - let glob = if side_effect.contains('/') { - side_effect - .strip_prefix("./") - .unwrap_or(side_effect) - .to_string() + if side_effect.contains('/') { + Some(Glob::new( + side_effect.strip_prefix("./").unwrap_or(side_effect).into(), + )) } else { - format!("**/{side_effect}") - }; - match Glob::parse(glob.as_str()) { - Ok(_) => {} - Err(err) => { - SideEffectsInPackageJsonIssue { - path: package_json, - description: Some( - StyledString::Text( - format!( - "Invalid glob '{}' in sideEffects: {}", - glob, - PrettyPrintError(&err) - ) - .into(), - ) - .resolved_cell(), - ), - } - .resolved_cell() - .emit(); - } + Some(Glob::new(format!("**/{side_effect}").into())) } - Some(glob) } else { SideEffectsInPackageJsonIssue { path: package_json, @@ -99,14 +76,32 @@ async fn side_effects_from_package_json( None } }) - .collect::>() - .join(","); - return Ok(SideEffectsValue::Glob( - Glob::new(format!("{{{}}}", globs).into()) - .to_resolved() - .await?, - ) - .cell()); + .map(|glob| async move { + match glob.to_resolved().await { + Ok(glob) => Ok(Some(glob)), + Err(err) => { + SideEffectsInPackageJsonIssue { + path: package_json, + description: Some( + StyledString::Text( + format!( + "Invalid glob in sideEffects: {}", + PrettyPrintError(&err) + ) + .into(), + ) + .resolved_cell(), + ), + } + .resolved_cell() + .emit(); + Ok(None) + } + } + }) + .try_flat_join() + .await?; + return Ok(SideEffectsValue::Glob(globs).cell()); } else { SideEffectsInPackageJsonIssue { path: package_json, @@ -175,17 +170,22 @@ pub async fn is_marked_as_side_effect_free( let find_package_json = find_context_file(path.parent(), package_json()).await?; if let FindContextFileResult::Found(package_json, _) = *find_package_json { - match *side_effects_from_package_json(*package_json).await? { + match &*side_effects_from_package_json(*package_json).await? { SideEffectsValue::None => {} - SideEffectsValue::Constant(side_effects) => return Ok(Vc::cell(!side_effects)), - SideEffectsValue::Glob(glob) => { + SideEffectsValue::Constant(side_effects) => return Ok(Vc::cell(!*side_effects)), + SideEffectsValue::Glob(globs) => { if let Some(rel_path) = package_json .parent() .await? .get_relative_path_to(&*path.await?) { let rel_path = rel_path.strip_prefix("./").unwrap_or(&rel_path); - return Ok(Vc::cell(!glob.await?.execute(rel_path))); + for glob in globs { + if glob.await?.execute(rel_path) { + return Ok(Vc::cell(false)); + } + } + return Ok(Vc::cell(true)); } } } diff --git a/turbopack/crates/turbopack/src/lib.rs b/turbopack/crates/turbopack/src/lib.rs index 10ff9d16bcab2..11a4ea5871fbb 100644 --- a/turbopack/crates/turbopack/src/lib.rs +++ b/turbopack/crates/turbopack/src/lib.rs @@ -928,9 +928,11 @@ impl AssetContext for ModuleAssetContext { let pkgs = &*self.module_options_context.await?.side_effect_free_packages; let mut glob = String::new(); - glob.push_str("**/node_modules/{"); - glob.push_str(pkgs.join(",").as_str()); - glob.push_str("}/**"); + if !pkgs.is_empty() { + glob.push_str("**/node_modules/{"); + glob.push_str(pkgs.join(",").as_str()); + glob.push_str("}/**"); + } Ok(Glob::new(glob.into())) } From b3c0753d0a9d894b1bdce7ae4de7316beca37879 Mon Sep 17 00:00:00 2001 From: Luke Sandberg Date: Fri, 9 May 2025 16:37:46 -0700 Subject: [PATCH 17/17] fix siliness --- turbopack/crates/turbo-tasks-fs/src/glob.rs | 10 +--------- turbopack/crates/turbopack/src/lib.rs | 1 - 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/turbopack/crates/turbo-tasks-fs/src/glob.rs b/turbopack/crates/turbo-tasks-fs/src/glob.rs index 161de253f9ef2..d634e0c2ae694 100644 --- a/turbopack/crates/turbo-tasks-fs/src/glob.rs +++ b/turbopack/crates/turbo-tasks-fs/src/glob.rs @@ -22,7 +22,6 @@ use turbo_tasks::Vc; pub struct Glob { #[turbo_tasks(trace_ignore)] program: GlobProgram, - src: String, } impl Glob { @@ -34,12 +33,7 @@ impl Glob { } else { path }; - let result = self.program.matches(path, match_partial); - let src = &self.src; - - let dbg = format!("matching: '{src}' against '{path}' = {result}"); - dbg!(dbg); - result + self.program.matches(path, match_partial) } // Returns true if the glob could match a filename underneath this `path` where the path @@ -50,7 +44,6 @@ impl Glob { pub fn parse(input: &str) -> Result { Ok(Self { - src: input.to_string(), program: GlobProgram::compile(input)?, }) } @@ -249,7 +242,6 @@ pub struct GlobProgram { impl GlobProgram { fn compile(pattern: &str) -> Result { - dbg!(format!("Compiling: {pattern}")); GlobProgram::do_compile(pattern).with_context(|| format!("Failed to parse glob: {pattern}")) } fn do_compile(pattern: &str) -> Result { diff --git a/turbopack/crates/turbopack/src/lib.rs b/turbopack/crates/turbopack/src/lib.rs index 11a4ea5871fbb..80860974d3e1a 100644 --- a/turbopack/crates/turbopack/src/lib.rs +++ b/turbopack/crates/turbopack/src/lib.rs @@ -933,7 +933,6 @@ impl AssetContext for ModuleAssetContext { glob.push_str(pkgs.join(",").as_str()); glob.push_str("}/**"); } - Ok(Glob::new(glob.into())) } }