@@ -380,10 +380,49 @@ impl GlobProgram {
380
380
// If we hit a `}` then we are done so compute the jumps and pop the prefix.
381
381
if t == GlobToken :: RBracket {
382
382
let mut branches = left_bracket_starts. pop ( ) . unwrap ( ) ;
383
- let mut prefix = & mut branches. prefix ;
383
+ let prefix = & mut branches. prefix ;
384
384
let branches = & mut branches. branches ;
385
- build_alternatives ( prefix, branches) ?;
386
- std:: mem:: swap ( & mut instructions, & mut prefix) ;
385
+
386
+ let num_branches = branches. len ( ) ;
387
+ if num_branches <= 1 {
388
+ bail ! (
389
+ "Cannot have an alternation with less than 2 members, remove the \
390
+ brackets?"
391
+ ) ;
392
+ }
393
+ let mut next_branch_offset = num_branches - 1 ;
394
+ for branch in & branches[ 0 ..num_branches - 1 ] {
395
+ // to jump past the branch we need to jump past all its instructions
396
+ // +1 to account for the JUMP
397
+ // instruction at the end
398
+ next_branch_offset += branch. len ( ) + 1 ;
399
+ prefix. push ( GlobInstruction :: Fork (
400
+ next_branch_offset. try_into ( ) . context (
401
+ "glob too large, cannot have more than 32K instructions" ,
402
+ ) ?,
403
+ ) ) ;
404
+ next_branch_offset -= 1 ; // subtract one since we added a fork
405
+ // instruction.
406
+ }
407
+ let mut end_of_alternation =
408
+ next_branch_offset + branches. last ( ) . unwrap ( ) . len ( ) ;
409
+ for branch in & mut branches[ 0 ..num_branches - 1 ] {
410
+ end_of_alternation -= branch. len ( ) ; // from the end of this branch, this is how far it is to the end of
411
+ // the
412
+ // alternation
413
+ prefix. append ( branch) ;
414
+ prefix. push ( GlobInstruction :: Jump (
415
+ end_of_alternation. try_into ( ) . context (
416
+ "glob too large, cannot have more than 64K instructions" ,
417
+ ) ?,
418
+ ) ) ;
419
+ end_of_alternation -= 1 ; // account for the jump instruction
420
+ }
421
+ end_of_alternation -= branches. last ( ) . unwrap ( ) . len ( ) ;
422
+ prefix. append ( branches. last_mut ( ) . unwrap ( ) ) ;
423
+ debug_assert ! ( end_of_alternation == 0 ) ;
424
+
425
+ std:: mem:: swap ( & mut instructions, prefix) ;
387
426
}
388
427
}
389
428
GlobToken :: End => {
@@ -416,8 +455,8 @@ impl GlobProgram {
416
455
let mut has_path_to_prefix_match = BitSet :: new ( instructions. len ( ) ) ;
417
456
418
457
for start in ( 0 ..instructions. len ( ) ) . rev ( ) {
419
- visited. set ( start as usize ) ;
420
- let ( valid_prefix_end, valid_match) = match instructions[ start as usize ] {
458
+ visited. set ( start) ;
459
+ let ( valid_prefix_end, valid_match) = match instructions[ start] {
421
460
GlobInstruction :: MatchLiteral ( byte) => ( byte == b'/' , false ) ,
422
461
GlobInstruction :: MatchAnyNonDelim => ( false , false ) ,
423
462
GlobInstruction :: MatchGlobStar { terminal } => {
@@ -451,7 +490,7 @@ impl GlobProgram {
451
490
)
452
491
}
453
492
GlobInstruction :: Fork ( offset) => {
454
- let next_instruction = ( start + 1 ) as usize ;
493
+ let next_instruction = start + 1 ;
455
494
debug_assert ! (
456
495
visited. has( next_instruction) ,
457
496
"should have already visited the target"
@@ -466,7 +505,7 @@ impl GlobProgram {
466
505
// So we don't need to follow them now
467
506
next
468
507
} else {
469
- let fork_target = start as usize + offset as usize ;
508
+ let fork_target = start + offset as usize ;
470
509
debug_assert ! (
471
510
visited. has( fork_target) ,
472
511
"should have already visited the target"
@@ -486,10 +525,10 @@ impl GlobProgram {
486
525
}
487
526
} ;
488
527
if valid_prefix_end {
489
- has_path_to_prefix_match. set ( start as usize )
528
+ has_path_to_prefix_match. set ( start)
490
529
}
491
530
if valid_match {
492
- has_path_to_match. set ( start as usize )
531
+ has_path_to_match. set ( start)
493
532
}
494
533
}
495
534
@@ -520,22 +559,34 @@ impl GlobProgram {
520
559
// program but typically far less
521
560
// This bounds execution at O(N*M) where N is the size of the path and M is the size of the
522
561
// program
523
- for & byte in v. as_bytes ( ) {
562
+ let mut n_threads = 1 ;
563
+ let mut ip = 0 ;
564
+ ' outer: for & byte in v. as_bytes ( ) {
524
565
let mut thread_index = 0 ;
525
- // We need to use this looping construct since we may add elements to `cur` as we go.
526
- // `cur.n` will never be > `len` so this loop is bounded to N iterations.
527
- let mut n_threads = cur. n ;
528
- while thread_index < n_threads {
529
- let ip = cur. get ( thread_index) ;
566
+ // We manage the loop manually at the bottom to make it easier to skip it when hitting
567
+ // some fast paths
568
+ loop {
530
569
match self . instructions [ ip as usize ] {
531
570
GlobInstruction :: MatchLiteral ( m) => {
532
571
if byte == m {
572
+ if n_threads == 1 {
573
+ cur. clear ( ) ;
574
+ ip += 1 ;
575
+ cur. add ( ip) ;
576
+ continue ' outer;
577
+ }
533
578
// We matched, proceed to the next character
534
579
next. add ( ip + 1 ) ;
535
580
}
536
581
}
537
582
GlobInstruction :: MatchAnyNonDelim => {
538
583
if byte != b'/' {
584
+ if n_threads == 1 {
585
+ cur. clear ( ) ;
586
+ ip += 1 ;
587
+ cur. add ( ip) ;
588
+ continue ' outer;
589
+ }
539
590
next. add ( ip + 1 ) ;
540
591
}
541
592
}
@@ -548,17 +599,34 @@ impl GlobProgram {
548
599
// If we see a `/` then we need to consider ending the globstar.
549
600
if byte == b'/' {
550
601
next. add ( ip + 1 ) ;
602
+ // but even so we should keep trying to match, just like a fork.
603
+ next. add ( ip) ;
604
+ } else {
605
+ if n_threads == 1 {
606
+ continue ' outer;
607
+ }
608
+ next. add ( ip) ;
551
609
}
552
- // but even so we should keep trying to match, just like a fork.
553
- next. add ( ip) ;
554
610
}
555
611
GlobInstruction :: MatchClass ( index) => {
556
612
if self . range_sets [ index as usize ] . contains ( byte) {
613
+ if n_threads == 1 {
614
+ cur. clear ( ) ;
615
+ ip += 1 ;
616
+ cur. add ( ip) ;
617
+ continue ' outer;
618
+ }
557
619
next. add ( ip + 1 ) ;
558
620
}
559
621
}
560
622
GlobInstruction :: NegativeMatchClass ( index) => {
561
623
if !self . range_sets [ index as usize ] . contains ( byte) {
624
+ if n_threads == 1 {
625
+ cur. clear ( ) ;
626
+ ip += 1 ;
627
+ cur. add ( ip) ;
628
+ continue ' outer;
629
+ }
562
630
next. add ( ip + 1 ) ;
563
631
}
564
632
}
@@ -569,24 +637,33 @@ impl GlobProgram {
569
637
}
570
638
}
571
639
GlobInstruction :: Fork ( offset) => {
572
- let added1 = cur. add ( ip + 1 ) ;
573
- let added2 = cur. add ( ( offset + ( ip as i16 ) ) as u16 ) ;
574
- if added1 || added2 {
575
- n_threads = cur. n ;
640
+ if cur. add ( ip + 1 ) {
641
+ n_threads += 1 ;
642
+ }
643
+ if cur. add ( ( offset + ( ip as i16 ) ) as u16 ) {
644
+ n_threads += 1 ;
576
645
}
577
646
}
578
647
GlobInstruction :: Match => {
579
648
// We ran out of instructions while we still have characters
580
649
// so this thread dies
581
650
}
582
651
}
652
+ // Advance to the next thread here, at the bottom of the loop, so the fast paths above can skip this step via `continue 'outer`.
583
653
thread_index += 1 ;
654
+ if thread_index < n_threads {
655
+ ip = cur. get ( thread_index) ;
656
+ } else {
657
+ break ;
658
+ }
584
659
}
585
- if next. n == 0 {
660
+ n_threads = next. n ;
661
+ if n_threads == 0 {
586
662
// This means that all threads exited early. This isn't needed for correctness,
587
663
// but there is no point iterating the rest of the characters.
588
664
return false ;
589
665
}
666
+ ip = next. get ( 0 ) ;
590
667
// We have some progress! clear current and swap the two lists to advance to the next
591
668
// character.
592
669
cur. clear ( ) ;
@@ -618,45 +695,6 @@ impl GlobProgram {
618
695
}
619
696
}
620
697
621
- fn build_alternatives (
622
- prefix : & mut Vec < GlobInstruction > ,
623
- branches : & mut Vec < Vec < GlobInstruction > > ,
624
- ) -> Result < ( ) , anyhow:: Error > {
625
- let num_branches = branches. len ( ) ;
626
- if num_branches <= 1 {
627
- bail ! ( "Cannot have an alternation with less than 2 members, remove the brackets?" ) ;
628
- }
629
- let mut next_branch_offset = num_branches - 1 ;
630
- for branch in & branches[ 0 ..num_branches - 1 ] {
631
- // to jump past the branch we need to jump past all its instructions +1
632
- // to account for the JUMP instruction at the end
633
- next_branch_offset += branch. len ( ) + 1 ;
634
- prefix. push ( GlobInstruction :: Fork (
635
- next_branch_offset
636
- . try_into ( )
637
- . context ( "glob too large, cannot have more than 32K instructions" ) ?,
638
- ) ) ;
639
- next_branch_offset -= 1 ; // subtract one since we added a fork
640
- // instruction.
641
- }
642
- let mut end_of_alternation = next_branch_offset + branches. last ( ) . unwrap ( ) . len ( ) ;
643
- for branch in & mut branches[ 0 ..num_branches - 1 ] {
644
- end_of_alternation -= branch. len ( ) ; // from the end of this branch, this is how far it is to the end of the
645
- // alternation
646
- prefix. extend ( branch. drain ( ..) ) ;
647
- prefix. push ( GlobInstruction :: Jump (
648
- end_of_alternation
649
- . try_into ( )
650
- . context ( "glob too large, cannot have more than 64K instructions" ) ?,
651
- ) ) ;
652
- end_of_alternation -= 1 ; // account for the jump instruction
653
- }
654
- end_of_alternation -= branches. last ( ) . unwrap ( ) . len ( ) ;
655
- prefix. extend ( branches. last_mut ( ) . unwrap ( ) . drain ( ..) ) ;
656
- debug_assert ! ( end_of_alternation == 0 ) ;
657
- Ok ( ( ) )
658
- }
659
-
660
698
// Consider a more compact encoding.
661
699
// The jump offsets force this to 4 bytes
662
700
// A variable length instruction encoding would help a lot
@@ -966,7 +1004,7 @@ mod tests {
966
1004
#[ test]
967
1005
fn test_tokenizer ( ) {
968
1006
let mut tok = Tokenizer :: new ( "foo/bar[a-z]/?/**" ) ;
969
- let prefix: Vec < GlobToken > = "foo/bar" . bytes ( ) . map ( |c| GlobToken :: Literal ( c ) ) . collect ( ) ;
1007
+ let prefix: Vec < GlobToken > = "foo/bar" . bytes ( ) . map ( GlobToken :: Literal ) . collect ( ) ;
970
1008
for t in prefix {
971
1009
assert_eq ! ( t, tok. next_token( ) ) ;
972
1010
}
0 commit comments