diff --git a/Cargo.lock b/Cargo.lock index 124c4f433acf4..1a014f4d4568a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -276,9 +276,9 @@ checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b" [[package]] name = "arrayvec" -version = "0.7.4" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "ascii" @@ -604,7 +604,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f6ca6f0c18c02c2fbfc119df551b8aeb8a385f6d5980f1475ba0255f1e97f1e" dependencies = [ "anyhow", - "arrayvec 0.7.4", + "arrayvec 0.7.6", "itertools 0.10.5", "log", "nom", @@ -618,7 +618,7 @@ version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "876c75a42f6364451a033496a14c44bffe41f5f4a8236f697391f11024e596d2" dependencies = [ - "arrayvec 0.7.4", + "arrayvec 0.7.6", ] [[package]] @@ -835,7 +835,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42ae2468a89544a466886840aa467a25b766499f4f04bf7d9fcd10ecee9fccef" dependencies = [ "arrayref", - "arrayvec 0.7.4", + "arrayvec 0.7.6", "cc", "cfg-if", "constant_time_eq", @@ -2321,6 +2321,15 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649" +[[package]] +name = "fast-glob" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39ea3f6bbcf4dbe2076b372186fc7aeecd5f6f84754582e56ee7db262b15a6f0" +dependencies = [ + "arrayvec 0.7.6", +] + [[package]] name = "fastrand" version = "1.9.0" @@ -2664,9 +2673,9 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "globset" -version = "0.4.14" +version = "0.4.16" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "57da3b9b5b85bd66f31093f8c408b90a74431672542466497dcbdfdc02034be1" +checksum = "54a1028dfc5f5df5da8a56a73e6c153c9a9708ec57232470703592a3f18e49f5" dependencies = [ "aho-corasick", "bstr", @@ -3005,7 +3014,7 @@ dependencies = [ "httpdate", "itoa", "pin-project-lite", - "socket2 0.4.9", + "socket2 0.5.8", "tokio", "tower-service", "tracing", @@ -4825,7 +4834,7 @@ version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a652d9771a63711fd3c3deb670acfbe5c30a4072e664d7a3bf5a9e1056ac72c3" dependencies = [ - "arrayvec 0.7.4", + "arrayvec 0.7.6", "itoa", ] @@ -5841,7 +5850,7 @@ checksum = "cd87ce80a7665b1cce111f8a16c1f3929f6547ce91ade6addf4ec86a8dda5ce9" dependencies = [ "arbitrary", "arg_enum_proc_macro", - "arrayvec 0.7.4", + "arrayvec 0.7.6", "av1-grain", "bitstream-io", "built", @@ -7009,7 +7018,7 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f352d5d14be5a1f956d76ae0c8060c3487aaa2a080f10a4b4ff023c7c05a9047" dependencies = [ - "arrayvec 0.7.4", + "arrayvec 0.7.6", "static-map-macro", ] @@ -7658,7 +7667,7 @@ version = "13.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2a2cf0263f34234cfcebde0545e4ed017e1b2b5667792c6902319d75df03110" dependencies = [ - "arrayvec 0.7.4", + "arrayvec 0.7.6", "indexmap 2.7.1", "is-macro", "rustc-hash 2.1.1", @@ -7838,7 +7847,7 @@ version = "12.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0d11c8e71901401b9aae2ece4946eeb7674b14b8301a53768afbbeeb0e48b599" dependencies = [ - "arrayvec 0.7.4", + "arrayvec 0.7.6", "bitflags 2.9.0", "either", "new_debug_unreachable", @@ -7907,7 +7916,7 @@ version = "17.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bca0ad5b72d8b440e701d47f544a728543414f6f165c6c61a899a76d3c7fdf9d" dependencies = [ - "arrayvec 0.7.4", + "arrayvec 0.7.6", "bitflags 2.9.0", 
"indexmap 2.7.1", "num-bigint", @@ -7945,7 +7954,7 @@ version = "12.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "250786944fbc05f6484eda9213df129ccfe17226ae9ad51b62fce2f72135dbee" dependencies = [ - "arrayvec 0.7.4", + "arrayvec 0.7.6", "bitflags 2.9.0", "either", "new_debug_unreachable", @@ -8087,7 +8096,7 @@ version = "14.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "012cd84fcc6c6fab718a177a3ffc360332d6bad29dbe19699be2ccbaba91e712" dependencies = [ - "arrayvec 0.7.4", + "arrayvec 0.7.6", "indexmap 2.7.1", "is-macro", "num-bigint", @@ -9557,8 +9566,10 @@ dependencies = [ "criterion", "dashmap 6.1.0", "dunce", + "fast-glob", "futures", "futures-retry", + "globset", "include_dir", "indexmap 2.7.1", "jsonc-parser 0.21.0", diff --git a/Cargo.toml b/Cargo.toml index a43f848f957e0..42084ac7e689e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -240,6 +240,12 @@ opt-level = 3 [profile.release.package.serde] opt-level = 3 + +[profile.profiling] +inherits = "release" +debug = true + + [workspace.dependencies] # Workspace crates next-api = { path = "crates/next-api" } diff --git a/turbopack/crates/turbo-tasks-fs/Cargo.toml b/turbopack/crates/turbo-tasks-fs/Cargo.toml index 172a5805df11d..0ed6aa37d8fbc 100644 --- a/turbopack/crates/turbo-tasks-fs/Cargo.toml +++ b/turbopack/crates/turbo-tasks-fs/Cargo.toml @@ -64,6 +64,8 @@ tempfile = { workspace = true } turbo-tasks-memory = { workspace = true } turbo-tasks-testing = { workspace = true } turbo-tasks-backend = { workspace = true } +fast-glob = "0.4.5" +globset = "0.4.16" [build-dependencies] turbo-tasks-build = { workspace = true } diff --git a/turbopack/crates/turbo-tasks-fs/benches/mod.rs b/turbopack/crates/turbo-tasks-fs/benches/mod.rs index ec5a14eba482e..f5f1ceb37703b 100644 --- a/turbopack/crates/turbo-tasks-fs/benches/mod.rs +++ b/turbopack/crates/turbo-tasks-fs/benches/mod.rs @@ -101,9 +101,48 @@ fn bench_rope_iteration(c: &mut Criterion) { ); } +fn 
bench_glob_match_simple(c: &mut Criterion) { + const GLOB: &str = "some/**/n*d[k-m]e?txt"; + const PATH: &str = "some/a/bigger/path/to/the/crazy/needle.txt"; + glob_bench(c, "simple", GLOB, PATH); +} + +fn bench_glob_match_alternations(c: &mut Criterion) { + const GLOB: &str = "some/**/{tob,crazy}/?*.{png,txt}"; + const PATH: &str = "some/a/bigger/path/to/the/crazy/needle.txt"; + + glob_bench(c, "alternations", GLOB, PATH); +} + +fn glob_bench(c: &mut Criterion, name: &'static str, glob: &str, path: &str) { + let mut group = c.benchmark_group(format!("turbo-tasks-fs/glob/{name}")); + group.bench_function("fast-glob", |b| { + b.iter(|| fast_glob::glob_match(glob, path)) + }); + group.bench_function("turbo-glob", |b| { + b.iter(|| { + let g = turbo_tasks_fs::glob::Glob::parse(glob).unwrap(); + g.execute(path) + }) + }); + let g = turbo_tasks_fs::glob::Glob::parse(glob).unwrap(); + group.bench_function("turbo-glob-cached", |b| b.iter(|| g.execute(path))); + + group.bench_function("globset", |b| { + b.iter(|| { + let g = globset::Glob::new(glob).unwrap().compile_matcher(); + g.is_match(path) + }) + }); + let g = globset::Glob::new(glob).unwrap().compile_matcher(); + group.bench_function("globset-cached", |b| b.iter(|| g.is_match(path))); + + group.finish(); +} + criterion_group!( name = benches; config = Criterion::default(); - targets = bench_file_watching, bench_rope_iteration + targets = bench_file_watching, bench_rope_iteration, bench_glob_match_simple, bench_glob_match_alternations ); criterion_main!(benches); diff --git a/turbopack/crates/turbo-tasks-fs/src/glob.rs b/turbopack/crates/turbo-tasks-fs/src/glob.rs index b6448d110709c..d634e0c2ae694 100644 --- a/turbopack/crates/turbo-tasks-fs/src/glob.rs +++ b/turbopack/crates/turbo-tasks-fs/src/glob.rs @@ -1,34 +1,9 @@ -use std::mem::take; +use std::{cmp::Ordering, collections::VecDeque, fmt::Display}; use anyhow::{anyhow, bail, Context, Result}; use serde::{Deserialize, Serialize}; use turbo_rcstr::RcStr; -use 
turbo_tasks::{trace::TraceRawVcs, NonLocalValue, TryJoinIterExt, Vc}; -use unicode_segmentation::GraphemeCursor; - -#[derive(PartialEq, Eq, Debug, Clone, TraceRawVcs, Serialize, Deserialize, NonLocalValue)] -enum GlobPart { - /// `/**/`: Matches any path of directories - AnyDirectories, - - /// `*`: Matches any filename (no path separator) - AnyFile, - - /// `?`: Matches a single filename character (no path separator) - AnyFileChar, - - /// `/`: Matches the path separator - PathSeparator, - - /// `[abc]`: Matches any char of the list - FileChar(Vec), - - /// `abc`: Matches literal filename - File(String), - - /// `{a,b,c}`: Matches any of the globs in the list - Alternatives(Vec), -} +use turbo_tasks::Vc; // Examples: // - file.js = File(file.js) @@ -45,385 +20,1044 @@ enum GlobPart { #[turbo_tasks::value] #[derive(Debug, Clone)] pub struct Glob { - expression: Vec, + #[turbo_tasks(trace_ignore)] + program: GlobProgram, } impl Glob { pub fn execute(&self, path: &str) -> bool { // TODO(lukesandberg): deprecate this implicit behavior let match_partial = path.ends_with('/'); - self.iter_matches(path, true, match_partial) - .any(|result| matches!(result, ("", _))) + let path = if match_partial { + &path[0..path.len() - 1] + } else { + path + }; + self.program.matches(path, match_partial) } // Returns true if the glob could match a filename underneath this `path` where the path // represents a directory. 
pub fn match_in_directory(&self, path: &str) -> bool { - debug_assert!(!path.ends_with('/')); - // TODO(lukesandberg): see if we can avoid this allocation by changing the matching - // algorithm - let path = format!("{path}/"); - self.iter_matches(&path, true, true) - .any(|result| matches!(result, ("", _))) - } - - fn iter_matches<'a>( - &'a self, - path: &'a str, - previous_part_is_path_separator_equivalent: bool, - match_in_directory: bool, - ) -> GlobMatchesIterator<'a> { - GlobMatchesIterator { - current: path, - glob: self, - match_in_directory, - is_path_separator_equivalent: previous_part_is_path_separator_equivalent, - stack: Vec::new(), - index: 0, - } + self.program.matches(path, true) } - pub fn parse(input: &str) -> Result { - let mut current = input; - let mut expression = Vec::new(); + pub fn parse(input: &str) -> Result { + Ok(Self { + program: GlobProgram::compile(input)?, + }) + } +} - while !current.is_empty() { - let (part, remainder) = GlobPart::parse(current, false) - .with_context(|| anyhow!("Failed to parse glob {input}"))?; - expression.push(part); - current = remainder; - } +impl TryFrom<&str> for Glob { + type Error = anyhow::Error; - Ok(Glob { expression }) + fn try_from(value: &str) -> Result { + Glob::parse(value) } } +impl TryFrom for Glob { + type Error = anyhow::Error; -struct GlobMatchesIterator<'a> { - current: &'a str, - glob: &'a Glob, - // In this mode we are checking if the glob might match something in the directory represented - // by this path. 
- match_in_directory: bool, - is_path_separator_equivalent: bool, - stack: Vec>, - index: usize, + fn try_from(value: RcStr) -> Result { + Glob::parse(value.as_str()) + } } -impl<'a> Iterator for GlobMatchesIterator<'a> { - type Item = (&'a str, bool); +#[turbo_tasks::value_impl] +impl Glob { + #[turbo_tasks::function] + pub fn new(glob: RcStr) -> Result> { + Ok(Self::cell(Glob::try_from(glob)?)) + } +} - fn next(&mut self) -> Option { - loop { - if let Some(part) = self.glob.expression.get(self.index) { - let iter = if let Some(iter) = self.stack.get_mut(self.index) { - iter - } else { - let iter = part.iter_matches( - self.current, - self.is_path_separator_equivalent, - self.match_in_directory, - ); - self.stack.push(iter); - self.stack.last_mut().unwrap() - }; - if let Some((new_path, new_is_path_separator_equivalent)) = iter.next() { - self.current = new_path; - self.is_path_separator_equivalent = new_is_path_separator_equivalent; +#[derive(Debug, Eq, PartialEq, Copy, Clone, Serialize, Deserialize, Hash)] +struct ByteRange { + low: u8, + high: u8, +} +impl ByteRange { + fn singleton(b: u8) -> Self { + Self { low: b, high: b } + } + fn range(low: u8, high: u8) -> Self { + Self { low, high } + } +} + +impl PartialOrd for ByteRange { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} +// Lower `lows` come first, if two `lows` match then the large range comes first +// This makes the merge logic simpler. +impl Ord for ByteRange { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + match self.low.cmp(&other.low) { + std::cmp::Ordering::Equal => other.high.cmp(&self.high), + c => c, + } + } +} - self.index += 1; +#[derive(Debug, Eq, Clone, PartialEq, Hash, Serialize, Deserialize)] +struct RangeSet { + // A sorted list of non-overlapping ranges. 
+ // Overlapping inputs are merged by `new` (keeping the larger upper bound); abutting ranges + // such as `a-c` and `d-f` stay separate, which `contains` does not depend on. + ranges: Box<[ByteRange]>, +} + +impl RangeSet { + fn new(mut ranges: Vec) -> RangeSet { + ranges.sort(); + ranges.dedup_by(|a, b| { + if b.high >= a.low { + b.high = b.high.max(a.high); + true + } else { + false + } + }); + Self { + ranges: ranges.into_boxed_slice(), + } + } - if self.match_in_directory && self.current.is_empty() { - return Some(("", self.is_path_separator_equivalent)); + fn contains(&self, b: u8) -> bool { + self.ranges + .binary_search_by(move |r| { + if r.low <= b { + if r.high >= b { + Ordering::Equal + } else { + Ordering::Less } } else { - if self.index == 0 { - // failed to match - return None; - } - // backtrack - self.stack.pop(); - self.index -= 1; + Ordering::Greater } - } else { - // end of expression, matched successfully + }) + .is_ok() + } +} - // backtrack for the next iteration - self.index -= 1; +// A sparse set of integers that supports O(1) insertion, testing, clearing and O(n) iteration. +// https://research.swtch.com/sparse has a nice writeup +// This is useful for our purposes because we often need to clear the set and the O(1) clear time is +// critical. +struct SparseSet<'a> { + mem: &'a mut [u16], + n: u16, +} - return Some((self.current, self.is_path_separator_equivalent)); - } +impl<'a> SparseSet<'a> { + fn from_storage(mem: &'a mut [u16]) -> Self { + debug_assert!(mem.len() & 1 == 0, "mem must have an even length"); + Self { mem, n: 0 } + } + // Sparse entries are stored at the even indices and dense entries are stored at the odd ones + // this allows us to compute addresses without referencing the length of the slice.
+ + #[inline] + fn get_sparse(&self, i: u16) -> u16 { + return self.mem[i as usize * 2]; + } + #[inline] + fn get_sparse_mut(&mut self, i: u16) -> &mut u16 { + return &mut self.mem[i as usize * 2]; + } + #[inline] + fn get_dense(&self, i: u16) -> u16 { + return self.mem[i as usize * 2 + 1]; + } + #[inline] + fn get_dense_mut(&mut self, i: u16) -> &mut u16 { + return &mut self.mem[i as usize * 2 + 1]; + } + + fn clear(&mut self) { + self.n = 0 + } + + fn add(&mut self, v: u16) -> bool { + debug_assert!((v as usize) < self.mem.len() / 2); + let n = self.n; + let s = self.get_sparse(v); + // The value is already in the set if + // the sparse pointer is in range and the value at that + // index is `v` + if s < n && self.get_dense(s) == v { + // this value is already in the set. + return false; } + *self.get_sparse_mut(v) = n; + *self.get_dense_mut(n) = v; + self.n += 1; + true + } + fn get(&self, i: u16) -> u16 { + debug_assert!(i < self.n); + return self.get_dense(i); } } -impl GlobPart { - /// Iterates over all possible matches of this part with the provided path. - /// The least greedy match is returned first. This is usually used for - /// backtracking. The string slice returned is the remaining part or the - /// path. The boolean flag returned specifies if the matched part should - /// be considered as path-separator equivalent. 
- fn iter_matches<'a>( - &'a self, - path: &'a str, - previous_part_is_path_separator_equivalent: bool, - match_in_directory: bool, - ) -> GlobPartMatchesIterator<'a> { - GlobPartMatchesIterator { - path, - part: self, - match_in_directory, - previous_part_is_path_separator_equivalent, - cursor: GraphemeCursor::new(0, path.len(), true), - index: 0, - glob_iterator: None, +#[derive(Debug, PartialEq, Eq, Hash, Clone, Serialize, Deserialize)] +struct BitSet { + bits: Box<[u64]>, // use a boxed slice instead of a vec since this is not extendable +} +impl BitSet { + const BITS: usize = { u64::BITS as usize }; + fn new(len: usize) -> Self { + let words = len.div_ceil(Self::BITS); + Self { + bits: vec![0u64; words].into_boxed_slice(), } } + fn set(&mut self, bit: usize) { + let word_index = bit / Self::BITS; + let bit_index = bit % Self::BITS; - fn parse(input: &str, inside_of_braces: bool) -> Result<(GlobPart, &str)> { - debug_assert!(!input.is_empty()); - let two_chars = { - let mut chars = input.chars(); - (chars.next().unwrap(), chars.next()) - }; - match two_chars { - ('/', _) => Ok((GlobPart::PathSeparator, &input[1..])), - ('*', Some('*')) => Ok((GlobPart::AnyDirectories, &input[2..])), - ('*', _) => Ok((GlobPart::AnyFile, &input[1..])), - ('?', _) => Ok((GlobPart::AnyFileChar, &input[1..])), - ('[', Some('[')) => todo!("glob char classes are not implemented yet"), - ('[', _) => todo!("glob char sequences are not implemented yet"), - ('{', Some(_)) => { - let mut current = &input[1..]; - let mut alternatives = Vec::new(); - let mut expression = Vec::new(); - - loop { - let (part, remainder) = GlobPart::parse(current, true)?; - expression.push(part); - current = remainder; - match current.chars().next() { - Some(',') => { - alternatives.push(Glob { - expression: take(&mut expression), - }); - current = ¤t[1..]; + self.bits[word_index] |= 1 << bit_index; + } + fn has(&self, bit: usize) -> bool { + let word_index = bit / Self::BITS; + let bit_index = bit % Self::BITS; 
+ self.bits[word_index] & 1 << bit_index != 0 + } +} + +#[derive(Debug, PartialEq, Eq, Hash, Clone, Serialize, Deserialize)] +pub struct GlobProgram { + instructions: Box<[GlobInstruction]>, + // Instructions we can end on that are a valid match + match_instructions: BitSet, + // Instructions we can end on that are a valid prefix match + prefix_match_instructions: BitSet, + range_sets: Box<[RangeSet]>, + // Constant prefixes if any, used as a prefilter. + prefix: Option>, +} + +impl GlobProgram { + fn compile(pattern: &str) -> Result { + GlobProgram::do_compile(pattern).with_context(|| format!("Failed to parse glob: {pattern}")) + } + fn do_compile(pattern: &str) -> Result { + let root = Ast::parse(pattern)?; + let mut instructions = Vec::new(); + let mut range_sets = Vec::new(); + + generate_code(&mut instructions, &mut range_sets, root)?; + + let mut prefix = None; + for instruction in &instructions { + match instruction { + GlobInstruction::MatchLiteral(b) => { + if prefix.is_none() { + prefix = Some(Vec::new()); + } + prefix.as_mut().unwrap().push(*b); + } + _ => { + break; + } + } + } + if let Some(ref prefix) = prefix { + // drop the prefix + instructions.drain(0..prefix.len()); + } + + instructions.push(GlobInstruction::Match); + + // Now we need to annotate 'terminal' globstars in order to speed up validating program + // matches. The issue is that if we are executing a globstar when the program + // completes then pass or fail is dependant upon whether there are any unconditional + // required subsequent matches e.g. `foo/**` matches `foo/bar/baz.js` but foo/**/a` + // does not, because we have to match that 'a' similarly `foo/{**,bar}` matches + // `foo/bar/baz.js`. To determine this we need to chase pointers from globstars to the + // end of the program, and if there is a path to match then it is a terminal globstar. 
+ // Similarly for other matches (branches, stars) we might end a match on some control flow + // rather than following it to the end we can just precompute properties of each + // instructions + + if instructions.len() > u16::MAX as usize { + bail!("program too large"); + } + + // This is just used for validation + let mut visited = BitSet::new(instructions.len()); + // For each instruction, tracks if there is a a path from it to `Match` + let mut has_path_to_match = BitSet::new(instructions.len()); + let mut has_path_to_prefix_match = BitSet::new(instructions.len()); + + // Compute paths by iterating backwards through the instructions. + for start in (0..instructions.len()).rev() { + visited.set(start); + let (valid_prefix_end, valid_match) = match instructions[start] { + GlobInstruction::MatchLiteral(byte) => (byte == b'/', false), + GlobInstruction::MatchManyNonDelimWithLit(..) + | GlobInstruction::MatchManyNonDelim + | GlobInstruction::MatchAnyNonDelim => (false, false), + GlobInstruction::MatchGlobStar { terminal } => { + debug_assert!(!terminal); // shouldn't have been set + // a globstar is always a valid prefix end but is only a valid match if the + // subsequent instruction is. 
+ let next = start + 1; + debug_assert!(visited.has(next), "should have already visited the target"); + let has_path_to_end = has_path_to_match.has(next); + if has_path_to_end { + instructions[start] = GlobInstruction::MatchGlobStar { terminal: true }; + } + (true, has_path_to_match.has(next)) + } + GlobInstruction::MatchClass(index) => { + (range_sets[index as usize].contains(b'/'), false) + } + GlobInstruction::NegativeMatchClass(index) => { + (!range_sets[index as usize].contains(b'/'), false) + } + GlobInstruction::Jump(offset) => { + let target = start + offset as usize; + debug_assert!( + visited.has(target), + "should have already visited the target" + ); + // copy from the target + ( + has_path_to_prefix_match.has(target), + has_path_to_match.has(target), + ) + } + GlobInstruction::Fork(offset) => { + let next_instruction = start + 1; + debug_assert!( + visited.has(next_instruction), + "should have already visited the target" + ); + let next = ( + has_path_to_prefix_match.has(next_instruction), + has_path_to_match.has(next_instruction), + ); + + let fork_target = start + offset as usize; + debug_assert!( + visited.has(fork_target), + "should have already visited the target" + ); + let fork = ( + has_path_to_prefix_match.has(fork_target), + has_path_to_match.has(fork_target), + ); + (next.0 || fork.0, next.1 || fork.1) + } + GlobInstruction::Match => { + // For prefix matches, we only want to say it matches if a a file within the + // directory could match, so if we see a Maatch instruction then we cannot match + // it with this glob + (false, true) + } + }; + if valid_prefix_end { + has_path_to_prefix_match.set(start) + } + if valid_match { + has_path_to_match.set(start) + } + } + + Ok(GlobProgram { + instructions: instructions.into_boxed_slice(), + match_instructions: has_path_to_match, + prefix_match_instructions: has_path_to_prefix_match, + range_sets: range_sets.into_boxed_slice(), + prefix: prefix.map(|v| v.into_boxed_slice()), + }) + } + + fn 
matches(&self, v: &str, prefix: bool) -> bool { + let mut v = v.as_bytes(); + // trim a prefix if we have one + if let Some(literal_prefix) = &self.prefix { + if prefix && v.len() <= literal_prefix.len() { + return v == &literal_prefix[0..v.len()]; + } else { + if literal_prefix.len() > v.len() || v[..literal_prefix.len()] != **literal_prefix { + return false; + } + v = &v[literal_prefix.len()..]; + } + } + + let len = self.instructions.len(); + // Use a single zero-initialized allocation for our storage. The sparse sets read slots + // before ever writing them, so the memory must not be left uninitialized. + let mut storage = vec![0u16; len * 4]; + let (set1, set2) = storage.split_at_mut(len * 2); + let mut set1 = SparseSet::from_storage(set1); + let mut set2 = SparseSet::from_storage(set2); + // Access via references to make the swap operations cheaper. + let mut cur = &mut set1; + let mut next = &mut set2; + // start at the first instruction! + cur.add(0); + // Process all bytes in order + // Each iteration of the outer loop advances one byte through the input + // Each iteration of the inner loop iterates at most once for every instruction in the + // program but typically far less + // This bounds execution at O(N*M) where N is the size of the path and M is the size of the + // program + // This is the same as `cur.n` but we track it here to avoid the indirect memory read + let mut n_threads = 1; + let mut ip = 0; + let mut instruction = self.instructions[0]; + let mut vi = 0; + let vlen = v.len(); + while vi < vlen { + let mut byte = v[vi]; + vi += 1; + let mut thread_index = 0; + // We manage the loop manually at the bottom to make it easier to skip it when hitting + // some fast paths + loop { + // The dispatching is the slowest part of the loop. To mitigate we should allow + // some of our fastpaths to advance to the next byte locally.
+ match instruction { + GlobInstruction::MatchLiteral(m) => { + if byte == m { + // We matched, proceed to the next character + next.add(ip + 1); + } + } + GlobInstruction::MatchAnyNonDelim => { + if byte != b'/' { + next.add(ip + 1); + } + } + GlobInstruction::MatchManyNonDelim => { + if byte != b'/' { + // keep evaluating this instruction and possibly exit + next.add(ip); + next.add(ip + 1); + } + } + GlobInstruction::MatchManyNonDelimWithLit(exit) => { + loop { + if byte != b'/' { + // if we match the exit consider this the same as a literal match + // and jump to the subsequent + // instruction + if byte == exit { + next.add(ip); + next.add(ip + 2); + } else { + // otherwise we can just loop directly like a globstar + if n_threads == 1 && vi < vlen { + byte = v[vi]; + vi += 1; + continue; + } + next.add(ip); + } + } + break; + } + } + GlobInstruction::MatchGlobStar { terminal } => { + if terminal { + // If we find a terminal globstar, we are done! this must match whatever + // remains + return true; } - Some('}') => { - alternatives.push(Glob { - expression: take(&mut expression), - }); - current = ¤t[1..]; + loop { + // If we see a `/` then we need to consider ending the globstar. + if byte == b'/' { + // but even so we keep trying to match, just like a fork. 
+ next.add(ip); + next.add(ip + 1); + } else { + // Otherwise we keep globbing, if we are the only thread jump to the + // next byte + if n_threads == 1 && vi < vlen { + byte = v[vi]; + vi += 1; + continue; + } + // otherwise wait for the other threads to complete + next.add(ip); + } break; } - None => bail!("Unterminated glob braces"), - _ => { - // next part of the glob + } + GlobInstruction::MatchClass(index) => { + if self.range_sets[index as usize].contains(byte) { + next.add(ip + 1); + } + } + GlobInstruction::NegativeMatchClass(index) => { + if !self.range_sets[index as usize].contains(byte) { + next.add(ip + 1); + } + } + GlobInstruction::Jump(offset) => { + // Push another thread onto the current list + // This is just for when we exiting alternations to skip over alternates + if cur.add(offset + ip) { + n_threads += 1; + } + } + GlobInstruction::Fork(offset) => { + if cur.add(ip + 1) { + n_threads += 1; + } + if cur.add(offset + ip) { + n_threads += 1; } } + GlobInstruction::Match => { + // We ran out of instructions while we still have characters + // so this thread dies. + } + } + // Do this at the bottom of the loop, this allows our early returns above to skip + // the dependent memory reads. + thread_index += 1; + if thread_index < n_threads { + ip = cur.get(thread_index); + instruction = self.instructions[ip as usize]; + } else { + break; } - - Ok((GlobPart::Alternatives(alternatives), current)) } - ('{', None) => { - bail!("Unterminated glob braces") + n_threads = next.n; + if n_threads == 0 { + // This means that all threads exited early. This isn't needed for correctness, + // but there is no point iterating the rest of the characters. 
+ return false; } - _ => { - let mut is_escaped = false; - let mut literal = String::new(); - - let mut cursor = GraphemeCursor::new(0, input.len(), true); - - let mut start = cursor.cur_cursor(); - let mut end_cursor = cursor - .next_boundary(input, 0) - .map_err(|e| anyhow!("{:?}", e))?; - - while let Some(end) = end_cursor { - let c = &input[start..end]; - if is_escaped { - is_escaped = false; - } else if c == "\\" { - is_escaped = true; - } else if c == "/" - || c == "*" - || c == "?" - || c == "[" - || c == "{" - || (inside_of_braces && (c == "," || c == "}")) - { - break; - } - literal.push_str(c); - - start = cursor.cur_cursor(); - end_cursor = cursor - .next_boundary(input, end) - .map_err(|e| anyhow!("{:?}", e))?; + ip = next.get(0); + instruction = self.instructions[ip as usize]; + // We have some progress! clear current and swap the two lists to advance to the next + // character. + cur.clear(); + std::mem::swap(&mut cur, &mut next); + } + // If we get here we have matched a prefix of the instructions and run out of text. 
+ // We need to ensure that whatever instructions we landed on are valid for a prefix match + if prefix { + for i in 0..cur.n { + let insn = cur.get(i); + if self.prefix_match_instructions.has(insn as usize) { + return true; } + } + } - Ok((GlobPart::File(literal), &input[start..])) + // We matched if there is some path from any current thread to the end, so we need to + // process all jumps and forks + // Consider a pattern like `a{b,c}` matching `ab` + // The program would be `Lit(a)Fork(2)Lit(b)Jump(2)Lit(c)Match` + // So when matching 'b' we need to execute a jump to find the match.` + for i in 0..cur.n { + let insn = cur.get(i); + if self.match_instructions.has(insn as usize) { + return true; } } + false } } -struct GlobPartMatchesIterator<'a> { - path: &'a str, - part: &'a GlobPart, - match_in_directory: bool, - previous_part_is_path_separator_equivalent: bool, - cursor: GraphemeCursor, - index: usize, - glob_iterator: Option>>, +enum Ast { + Child(usize, GlobToken), + Alternation(usize, Vec<(usize, VecDeque)>), + Class(RangeSet, bool), } - -impl<'a> Iterator for GlobPartMatchesIterator<'a> { - type Item = (&'a str, bool); - - fn next(&mut self) -> Option { - match self.part { - GlobPart::AnyDirectories => { - if self.cursor.cur_cursor() == 0 { - let Ok(Some(_)) = self.cursor.next_boundary(self.path, 0) else { - return None; - }; - return Some((self.path, true)); +impl Ast { + fn parse(glob: &str) -> Result> { + let mut tok = Tokenizer::new(glob); + let mut stack: Vec> = Vec::new(); + stack.push(VecDeque::new()); + loop { + let (pos, token) = tok.next_token(); + match token { + GlobToken::Star + | GlobToken::Delimiter + | GlobToken::QuestionMark + | GlobToken::GlobStar + | GlobToken::Literal(_) => { + stack.last_mut().unwrap().push_back(Ast::Child(pos, token)) } - - if self.cursor.cur_cursor() == self.path.len() { - return None; + GlobToken::Caret | GlobToken::ExclamationPoint => { + panic!("BUG: these cannot appear at the beginning of a pattern") } + 
GlobToken::LSquareBracket => { + let mut set = Vec::new(); + // with in a square bracket we expect a mix of literals and hyphens followed by + // a RSuareBracket + let mut prev_literal: Option = None; + let mut partial_range: Option = None; + let mut negated = false; + loop { + let (pos, t) = tok.next_token(); + match t { + GlobToken::Caret | GlobToken::ExclamationPoint => { + if !set.is_empty() + || prev_literal.is_some() + || partial_range.is_some() + { + bail!( + "negation tokens can only appear at the beginning of \ + character classes @{pos}" + ); + } + negated = !negated; + } + GlobToken::Literal(lit) => { + if lit > 127 { + // TODO(lukesandberg): These are supportable by expanding into + // several RanngeSets for each byte in the multibyte characters + // However, this is very unlikely to be required by a user so + // for now the feature is omitted. + bail!("Unsupported non-ascii character in set @{pos}"); + } + if let Some(start) = partial_range { + set.push(ByteRange::range(start, lit)); + partial_range = None; + } else { + if let Some(prev) = prev_literal { + set.push(ByteRange::singleton(prev)); + } + prev_literal = Some(lit); + } + } + GlobToken::Hyphen => { + if let Some(lit) = prev_literal { + prev_literal = None; + partial_range = Some(lit); + } else { + bail!( + "Unexpected hyphen at the beginning of a character class \ + @{pos}" + ) + } + } - loop { - let start = self.cursor.cur_cursor(); - // next_boundary does not set cursor offset to the end of the string - // if there is no next boundary - manually set cursor to the end - let end = match self.cursor.next_boundary(self.path, 0) { - Ok(end) => { - if let Some(end) = end { - end - } else { - self.cursor.set_cursor(self.path.len()); - self.cursor.cur_cursor() + GlobToken::RSquareBracket => { + if let Some(lit) = prev_literal { + set.push(ByteRange::singleton(lit)); + } + if partial_range.is_some() { + bail!( + "Unexpected hyphen at the end of a character class @{pos}" + ) + } + stack + .last_mut() 
+ .unwrap() + .push_back(Ast::Class(RangeSet::new(set), negated)); + break; + } + _ => { + bail!("Unexpected token {t} inside of character class @{pos}"); } } - _ => return None, - }; - - if &self.path[start..end] == "/" { - return Some((&self.path[end..], true)); - } else if start == end { - return Some((&self.path[start..], false)); } } - } - GlobPart::AnyFile => { - let Ok(Some(c)) = self.cursor.next_boundary(self.path, 0) else { - return None; - }; - - let idx = self.path[0..c].len(); - - // TODO verify if `*` does match zero chars? - if let Some(slice) = self.path.get(0..c) { - if slice.ends_with('/') { - None - } else { - Some(( - &self.path[c..], - self.previous_part_is_path_separator_equivalent && idx == 1, - )) - } - } else { - None + GlobToken::RSquareBracket | GlobToken::Hyphen => panic!( + "should never happen, tokenizer should have already rejected or been consumed \ + within another branch" + ), + GlobToken::LBracket => { + stack + .last_mut() + .unwrap() + .push_back(Ast::Alternation(pos, vec![(pos, VecDeque::new())])); + stack.push(VecDeque::new()) } - } - GlobPart::AnyFileChar => todo!(), - GlobPart::PathSeparator => { - if self.cursor.cur_cursor() == 0 { - let Ok(Some(b)) = self.cursor.next_boundary(self.path, 0) else { - return None; - }; - if self.path.starts_with('/') { - Some((&self.path[b..], true)) - } else if self.previous_part_is_path_separator_equivalent { - Some((self.path, true)) + GlobToken::Comma | GlobToken::RBracket => { + let mut last_branch = stack.pop().unwrap(); + if let Ast::Alternation(_, branches) = + stack.last_mut().unwrap().back_mut().unwrap() + { + branches.last_mut().unwrap().1.append(&mut last_branch); + if token == GlobToken::Comma { + branches.push((pos, VecDeque::new())); + stack.push(VecDeque::new()); + } } else { - None + // The lexer ensures that these tokens only occur in the context of an + // alternation. 
+ panic!("impossible!, token in unexpected place"); } - } else { - None } + GlobToken::End => break, } - GlobPart::FileChar(chars) => { - let start = self.cursor.cur_cursor(); - let Ok(Some(end)) = self.cursor.next_boundary(self.path, 0) else { - return None; - }; - let mut chars_in_path = self.path[start..end].chars(); - let c = chars_in_path.next()?; - if chars_in_path.next().is_some() { - return None; + } + if stack.len() != 1 { + bail!("Expected '}}' before end of pattern"); + } + if let Some(err) = tok.err { + return Err(err); + } + Ok(stack.pop().unwrap()) + } +} + +fn generate_code( + instructions: &mut Vec, + range_sets: &mut Vec, + mut root: VecDeque, +) -> Result<(bool, bool), anyhow::Error> { + // check and validate globstar structure + let mut i = 0; + let mut starts_with_globstar = false; + let mut ends_with_globstar = false; + while i < root.len() { + if let Ast::Child(pos, GlobToken::GlobStar) = root[i] { + // a ** should be followed by a `/` or the end of the branch + if i < root.len() - 1 { + let next = &root[i + 1]; + if !matches!(next, Ast::Child(_, GlobToken::Delimiter)) { + bail!("Globstar must be a complete path segment, e.g. /**/ @{pos}"); } - chars.contains(&c).then(|| (&self.path[end..], false)) + root.remove(i + 1); + } else { + ends_with_globstar = true; } - GlobPart::File(name) => { - if self.cursor.cur_cursor() == 0 && self.path.starts_with(name) { - let Ok(Some(_)) = self.cursor.next_boundary(self.path, 0) else { - return None; - }; - Some((&self.path[name.len()..], false)) - } else { - None + + // a ** should be prefixed by a `/` or the beginning of the branch + // duplicated **/**/ are just dropped. + if i > 0 { + let prev = &root[i - 1]; + if matches!(prev, Ast::Child(_, GlobToken::GlobStar)) { + root.remove(i); + i -= 1; + } else if !matches!(prev, Ast::Child(_, GlobToken::Delimiter)) { + bail!("Globstar must be a complete path segment, e.g. 
/**/ @{pos}"); } + } else { + starts_with_globstar = true; } - GlobPart::Alternatives(alternatives) => loop { - if let Some(glob_iterator) = &mut self.glob_iterator { - if let Some((path, is_path_separator_equivalent)) = glob_iterator.next() { - return Some((path, is_path_separator_equivalent)); - } else { - self.index += 1; - self.glob_iterator = None; + } + i += 1; + } + + let mut prev_token_was_delimiter = false; + let mut is_first = true; + while let Some(node) = root.pop_front() { + match node { + Ast::Child(_, glob_token) => { + prev_token_was_delimiter = false; + match glob_token { + GlobToken::Literal(byte) => { + instructions.push(GlobInstruction::MatchLiteral(byte)) } - } else if let Some(alternative) = alternatives.get(self.index) { - self.glob_iterator = Some(Box::new(alternative.iter_matches( - self.path, - self.previous_part_is_path_separator_equivalent, - self.match_in_directory, - ))); - } else { - return None; + GlobToken::Delimiter => { + prev_token_was_delimiter = true; + instructions.push(GlobInstruction::MatchLiteral(b'/')) + } + GlobToken::Star => { + instructions.push(GlobInstruction::Fork(2)); // allowed to match nothing + instructions.push(match root.front() { + Some(Ast::Child(_, GlobToken::Literal(byte))) => { + GlobInstruction::MatchManyNonDelimWithLit(*byte) + } + _ => GlobInstruction::MatchManyNonDelim, + }); + } + GlobToken::QuestionMark => { + // A question match, optionally matches a non-delimiter character, so we + // either skip forward or not. + instructions.push(GlobInstruction::Fork(2)); + instructions.push(GlobInstruction::MatchAnyNonDelim); + } + GlobToken::GlobStar => { + // allow globstars to match nothing by skipping it + instructions.push(GlobInstruction::Fork(2)); + // We don't actually know if it is terminal or not yet. We will determine + // this after code generation. 
+ instructions.push(GlobInstruction::MatchGlobStar { terminal: false }); + } + + GlobToken::LSquareBracket + | GlobToken::RSquareBracket + | GlobToken::LBracket + | GlobToken::RBracket + | GlobToken::Comma + | GlobToken::Hyphen + | GlobToken::ExclamationPoint + | GlobToken::Caret + | GlobToken::End => unreachable!(), + }; + } + Ast::Alternation(pos, branches) => { + let num_branches = branches.len(); + let mut branch_instructions = Vec::with_capacity(num_branches); + for (branch_pos, branch) in branches { + let mut instructions = Vec::new(); + let (branch_starts_with_globstar, branch_ends_with_globstar) = + generate_code(&mut instructions, range_sets, branch)?; + if branch_starts_with_globstar { + if !is_first { + if !prev_token_was_delimiter { + bail!( + "Alternation begins with a glob star that is not prefixed by \ + a '/' @{branch_pos}" + ); + } + } else { + starts_with_globstar = true; + } + } + if branch_ends_with_globstar { + if root.is_empty() { + ends_with_globstar = true; + } else { + bail!( + "An alternation can only end with a glob star if it is at the end \ + of the pattern @{branch_pos}" + ); + } + } + branch_instructions.push(instructions); + } + + let mut next_branch_offset = num_branches - 1; + for branch in &branch_instructions[0..num_branches - 1] { + // to jump past the branch we need to jump past all its instructions + // +1 to account for the JUMP + // instruction at the end + next_branch_offset += branch.len() + 1; + instructions.push(GlobInstruction::Fork( + next_branch_offset.try_into().with_context(|| { + format!( + "glob too large, cannot have more than 32K instructions @{pos}" + ) + })?, + )); + next_branch_offset -= 1; // subtract one since we added a fork + // instruction. 
} - }, + let mut end_of_alternation = + next_branch_offset + branch_instructions.last().unwrap().len(); + for branch in &mut branch_instructions[0..num_branches - 1] { + end_of_alternation -= branch.len(); // from the end of this branch, this is how far it is to the end of + // the + // alternation + instructions.append(branch); + instructions.push(GlobInstruction::Jump( + end_of_alternation.try_into().with_context(|| { + format!( + "glob too large, cannot have more than 32K instructions @{pos}" + ) + })?, + )); + end_of_alternation -= 1; // account for the jump instruction + } + let last_branch = branch_instructions.last_mut().unwrap(); + end_of_alternation -= last_branch.len(); + instructions.append(last_branch); + debug_assert!(end_of_alternation == 0); + } + Ast::Class(range_set, negated) => { + prev_token_was_delimiter = false; + let index: u8 = range_sets + .len() + .try_into() + .context("Cannot have >255 character classes in a glob")?; + range_sets.push(range_set); + instructions.push(if negated { + GlobInstruction::NegativeMatchClass(index) + } else { + GlobInstruction::MatchClass(index) + }); + } } + is_first = false; } + Ok((starts_with_globstar, ends_with_globstar)) } -impl TryFrom<&str> for Glob { - type Error = anyhow::Error; +// A more compact encoding would be nice but experimentally it does not save much +// Instead we should explore ways to encode more hints into the instructions to speed up matchers. 
+#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)] +enum GlobInstruction { + // Matches a single literal byte + MatchLiteral(u8), + // Matches any non-`/` character + MatchAnyNonDelim, + // Matches any number of non-`/` character + MatchManyNonDelim, + // Matches any number of non-`/` character followed by a literal as a hint + MatchManyNonDelimWithLit(u8), + // Matches **, which is any character but can only 'end' on a `/` or end of string + MatchGlobStar { terminal: bool }, + // Matches any character in the set + // The value is an index into the ranges + MatchClass(u8), + // Matches any character not in the set + // The value is an index into the ranges + NegativeMatchClass(u8), + // Unconditional jump forward. This would occur at the end of an alternate to jump past the + // other alternates. + Jump(u16), + // Splits control flow into two branches. + Fork(u16), + // End of program + Match, +} - fn try_from(value: &str) -> Result { - Glob::parse(value) +#[derive(Debug, Eq, PartialEq)] +enum GlobToken { + // A sequence of bytes, possibly including `/` characters + // all bytes are unescaped. + Literal(u8), + // a `*` token + Star, + // a `**` token + GlobStar, + // A `?` token` + QuestionMark, + // a `[` token + LSquareBracket, + // a `]` token + RSquareBracket, + // a `{` token + LBracket, + // a `}` token + RBracket, + // a `,` token, this is contextual, only present within a `{...}` section + Comma, + // a `-` token, this is contextual, only present within a `[...]` section + Hyphen, + // a ! token, this is contextual, only allowed at the beginning of a `[` or at the very + // beginning of the pattern + ExclamationPoint, + // a '^' token. Same rules as ! 
+ Caret, + // a `/` token, + Delimiter, + End, +} +impl Display for GlobToken { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if let GlobToken::Literal(c) = self { + let s = c.to_string(); + f.write_str(&s) + } else { + f.write_str(match self { + GlobToken::Star => "*", + GlobToken::GlobStar => "**", + GlobToken::QuestionMark => "?", + GlobToken::LSquareBracket => "[", + GlobToken::RSquareBracket => "]", + GlobToken::LBracket => "{", + GlobToken::RBracket => "}", + GlobToken::Comma => ",", + GlobToken::Hyphen => "-", + GlobToken::End => "", + GlobToken::Literal(_) => panic!("impossible"), + GlobToken::ExclamationPoint => "!", + GlobToken::Caret => "^", + GlobToken::Delimiter => "/", + }) + } } } -#[turbo_tasks::value_impl] -impl Glob { - #[turbo_tasks::function] - pub fn new(glob: RcStr) -> Result> { - Ok(Self::cell(Glob::try_from(glob.as_str())?)) +struct Tokenizer<'a> { + input: &'a [u8], + pos: usize, + err: Option, + bracket_count: u32, + square_bracket_count: u32, +} + +impl<'a> Tokenizer<'a> { + fn new(input: &'a str) -> Tokenizer<'a> { + Self { + input: input.as_bytes(), + pos: 0, + err: None, + bracket_count: 0, + square_bracket_count: 0, + } + } + fn next_token(&mut self) -> (usize, GlobToken) { + (self.pos, self.do_next_token()) } + fn do_next_token(&mut self) -> GlobToken { + match self.input.get(self.pos) { + None => GlobToken::End, + Some(c) => { + self.pos += 1; + match c { + b'*' if self.square_bracket_count == 0 => match self.input.get(self.pos) { + Some(b) if *b == b'*' => { + // This is a globstar + self.pos += 1; + GlobToken::GlobStar + } + _ => GlobToken::Star, + }, + b'?' 
if self.square_bracket_count == 0 => GlobToken::QuestionMark, + b'[' => { + self.square_bracket_count += 1; + GlobToken::LSquareBracket + } + b']' => { + if self.square_bracket_count == 0 { + self.err = + Some(anyhow!("mismatched square brackets in glob @{}", self.pos)); + return GlobToken::End; + } + self.square_bracket_count -= 1; + GlobToken::RSquareBracket + } + // These are not tokens inside of a character class, they are just literals + b'{' if self.square_bracket_count == 0 => { + self.bracket_count += 1; + GlobToken::LBracket + } + b'}' if self.square_bracket_count == 0 => { + if self.bracket_count == 0 { + self.err = Some(anyhow!("mismatched brackets @{}", self.pos)); + return GlobToken::End; + } - #[turbo_tasks::function] - pub async fn alternatives(globs: Vec>) -> Result> { - if globs.len() == 1 { - return Ok(globs.into_iter().next().unwrap()); + self.bracket_count -= 1; + GlobToken::RBracket + } + // This is only a meaninful token inside of a character class + b'-' if self.square_bracket_count > 0 => GlobToken::Hyphen, + // This is only meaningful inside of an alternation (aka brackets) + b',' if self.bracket_count > 0 => GlobToken::Comma, + // only valid inside of a character class + b'!' 
if self.square_bracket_count > 0 => GlobToken::ExclamationPoint, + b'^' if self.square_bracket_count > 0 => GlobToken::Caret, + b'/' if self.square_bracket_count == 0 => GlobToken::Delimiter, + cur => { + if *cur == b'\\' { + match self.input.get(self.pos) { + Some(c) => { + self.pos += 1; + GlobToken::Literal(*c) + } + None => { + self.err = Some(anyhow!("found `\\` character at end of glob")); + GlobToken::End + } + } + } else { + GlobToken::Literal(*cur) + } + } + } + } } - Ok(Self::cell(Glob { - expression: vec![GlobPart::Alternatives( - globs.into_iter().map(|g| g.owned()).try_join().await?, - )], - })) } } @@ -431,7 +1065,8 @@ impl Glob { mod tests { use rstest::*; - use super::Glob; + use super::{Glob, GlobToken}; + use crate::glob::Tokenizer; #[rstest] #[case::file("file.js", "file.js")] @@ -442,6 +1077,7 @@ mod tests { #[case::dir_and_file_braces("dir/file.{ts,js}", "dir/file.js")] #[case::dir_and_file_dir_braces("{dir,other}/file.{ts,js}", "dir/file.js")] #[case::star("*.js", "file.js")] + #[case::star_empty("*.js", ".js")] // can match nothing #[case::dir_star("dir/*.js", "dir/file.js")] #[case::dir_star_partial("dir/*.js", "dir/")] #[case::globstar("**/*.js", "file.js")] @@ -503,13 +1139,14 @@ mod tests { #[case::alternatives_nested2("{a,b/c,d/e/{f,g/h}}", "b/c")] #[case::alternatives_nested3("{a,b/c,d/e/{f,g/h}}", "d/e/f")] #[case::alternatives_nested4("{a,b/c,d/e/{f,g/h}}", "d/e/g/h")] + #[case::alternatives_nested6("{a/**,b/**,{c/**,d/**}}", "d/")] // #[case::alternatives_chars("[abc]", "b")] fn glob_match(#[case] glob: &str, #[case] path: &str) { - let glob = Glob::parse(glob).unwrap(); + let parsed = Glob::parse(glob).unwrap(); - println!("{glob:?} {path}"); + println!("{glob:?} compiled to {parsed:?} matching {path}"); - assert!(glob.execute(path)); + assert!(parsed.execute(path)); } #[rstest] @@ -519,10 +1156,44 @@ mod tests { "next/dist/shared/lib/app-router-context.shared-runtime.js" )] fn glob_not_matching(#[case] glob: &str, #[case] path: 
&str) { - let glob = Glob::parse(glob).unwrap(); + let parsed = Glob::parse(glob).unwrap(); + + println!("{glob:?} compiled to {parsed:?} matching {path}"); + + assert!(!parsed.execute(path)); + } - println!("{glob:?} {path}"); + #[test] + fn glob_character_classes() { + let parsed = Glob::parse("[a-zA-Z0-9_\\-]").unwrap(); + + assert!(parsed.execute("a")); + assert!(!parsed.execute("$")); + let parsed = Glob::parse("[!a-zA-Z0-9_\\-]").unwrap(); + + assert!(!parsed.execute("a")); + assert!(parsed.execute("$")); + } - assert!(!glob.execute(path)); + #[test] + fn test_tokenizer() { + let mut tok = Tokenizer::new("foo/bar[a-z]/?/**"); + assert_eq!(GlobToken::Literal(b'f'), tok.next_token().1); + assert_eq!(GlobToken::Literal(b'o'), tok.next_token().1); + assert_eq!(GlobToken::Literal(b'o'), tok.next_token().1); + assert_eq!(GlobToken::Delimiter, tok.next_token().1); + assert_eq!(GlobToken::Literal(b'b'), tok.next_token().1); + assert_eq!(GlobToken::Literal(b'a'), tok.next_token().1); + assert_eq!(GlobToken::Literal(b'r'), tok.next_token().1); + assert_eq!(GlobToken::LSquareBracket, tok.next_token().1); + assert_eq!(GlobToken::Literal(b'a'), tok.next_token().1); + assert_eq!(GlobToken::Hyphen, tok.next_token().1); + assert_eq!(GlobToken::Literal(b'z'), tok.next_token().1); + assert_eq!(GlobToken::RSquareBracket, tok.next_token().1); + assert_eq!(GlobToken::Delimiter, tok.next_token().1); + assert_eq!(GlobToken::QuestionMark, tok.next_token().1); + assert_eq!(GlobToken::Delimiter, tok.next_token().1); + assert_eq!(GlobToken::GlobStar, tok.next_token().1); + assert_eq!(GlobToken::End, tok.next_token().1); } } diff --git a/turbopack/crates/turbo-tasks-fs/src/read_glob.rs b/turbopack/crates/turbo-tasks-fs/src/read_glob.rs index 4168c64bee11e..9ae6a676ba40e 100644 --- a/turbopack/crates/turbo-tasks-fs/src/read_glob.rs +++ b/turbopack/crates/turbo-tasks-fs/src/read_glob.rs @@ -321,7 +321,7 @@ pub mod tests { path, Vec::new(), )); - let read_dir = fs + let read_dir = &*fs 
.root() .read_glob(Glob::new("*.js".into()), false) .await diff --git a/turbopack/crates/turbopack-ecmascript/src/chunk/placeable.rs b/turbopack/crates/turbopack-ecmascript/src/chunk/placeable.rs index 65173eb921fc6..892841fc648c0 100644 --- a/turbopack/crates/turbopack-ecmascript/src/chunk/placeable.rs +++ b/turbopack/crates/turbopack-ecmascript/src/chunk/placeable.rs @@ -33,7 +33,7 @@ pub trait EcmascriptChunkPlaceable: ChunkableModule + Module + Asset { enum SideEffectsValue { None, Constant(bool), - Glob(ResolvedVc), + Glob(Vec>), } #[turbo_tasks::function] @@ -77,7 +77,7 @@ async fn side_effects_from_package_json( } }) .map(|glob| async move { - match glob.resolve().await { + match glob.to_resolved().await { Ok(glob) => Ok(Some(glob)), Err(err) => { SideEffectsInPackageJsonIssue { @@ -101,9 +101,7 @@ async fn side_effects_from_package_json( }) .try_flat_join() .await?; - return Ok( - SideEffectsValue::Glob(Glob::alternatives(globs).to_resolved().await?).cell(), - ); + return Ok(SideEffectsValue::Glob(globs).cell()); } else { SideEffectsInPackageJsonIssue { path: package_json, @@ -172,17 +170,22 @@ pub async fn is_marked_as_side_effect_free( let find_package_json = find_context_file(path.parent(), package_json()).await?; if let FindContextFileResult::Found(package_json, _) = *find_package_json { - match *side_effects_from_package_json(*package_json).await? { + match &*side_effects_from_package_json(*package_json).await? { SideEffectsValue::None => {} - SideEffectsValue::Constant(side_effects) => return Ok(Vc::cell(!side_effects)), - SideEffectsValue::Glob(glob) => { + SideEffectsValue::Constant(side_effects) => return Ok(Vc::cell(!*side_effects)), + SideEffectsValue::Glob(globs) => { if let Some(rel_path) = package_json .parent() .await? .get_relative_path_to(&*path.await?) 
{ let rel_path = rel_path.strip_prefix("./").unwrap_or(&rel_path); - return Ok(Vc::cell(!glob.await?.execute(rel_path))); + for glob in globs { + if glob.await?.execute(rel_path) { + return Ok(Vc::cell(false)); + } + } + return Ok(Vc::cell(true)); } } } diff --git a/turbopack/crates/turbopack/src/lib.rs b/turbopack/crates/turbopack/src/lib.rs index 22d3095a1820e..80860974d3e1a 100644 --- a/turbopack/crates/turbopack/src/lib.rs +++ b/turbopack/crates/turbopack/src/lib.rs @@ -927,13 +927,13 @@ impl AssetContext for ModuleAssetContext { async fn side_effect_free_packages(&self) -> Result> { let pkgs = &*self.module_options_context.await?.side_effect_free_packages; - let mut globs = Vec::with_capacity(pkgs.len()); - - for pkg in pkgs { - globs.push(Glob::new(format!("**/node_modules/{{{}}}/**", pkg).into())); + let mut glob = String::new(); + if !pkgs.is_empty() { + glob.push_str("**/node_modules/{"); + glob.push_str(pkgs.join(",").as_str()); + glob.push_str("}/**"); } - - Ok(Glob::alternatives(globs)) + Ok(Glob::new(glob.into())) } }