Skip to content

Commit a6ecf24

Browse files
committed
Port CJ's simple_keys optimization to v3 (#555)
Message from original commit (53403b5): This change introduces an index to lookup token numbers referenced by simple_keys in O(1), thus significantly reducing the performance impact of certain abusively constructed snippets. When we build up the simple_keys stack, we count on the (formerly named) staleness check to catch errors where a simple key is required but would be > 1024 chars or span lines. The previous simplification that searches the stack from the top can go 1024 keys deep before finding a "stale" key and stopping. I added a test that shows that this consumes ~3s per 1MB of document size.
1 parent 4206685 commit a6ecf24

File tree

3 files changed

+36
-26
lines changed

3 files changed

+36
-26
lines changed

limit_test.go

+15
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ var limitTests = []struct {
3737
{name: "10kb of maps", data: []byte(`a: &a [{a}` + strings.Repeat(`,{a}`, 10*1024/4-1) + `]`)},
3838
{name: "100kb of maps", data: []byte(`a: &a [{a}` + strings.Repeat(`,{a}`, 100*1024/4-1) + `]`)},
3939
{name: "1000kb of maps", data: []byte(`a: &a [{a}` + strings.Repeat(`,{a}`, 1000*1024/4-1) + `]`)},
40+
{name: "1000kb slice nested at max-depth", data: []byte(strings.Repeat(`[`, 10000) + `1` + strings.Repeat(`,1`, 1000*1024/2-20000-1) + strings.Repeat(`]`, 10000))},
41+
{name: "1000kb slice nested in maps at max-depth", data: []byte("{a,b:\n" + strings.Repeat(" {a,b:", 10000-2) + ` [1` + strings.Repeat(",1", 1000*1024/2-6*10000-1) + `]` + strings.Repeat(`}`, 10000-1))},
42+
{name: "1000kb of 10000-nested lines", data: []byte(strings.Repeat(`- `+strings.Repeat(`[`, 10000)+strings.Repeat(`]`, 10000)+"\n", 1000*1024/20000))},
4043
}
4144

4245
func (s *S) TestLimits(c *C) {
@@ -82,6 +85,18 @@ func Benchmark1000KBMaps(b *testing.B) {
8285
benchmark(b, "1000kb of maps")
8386
}
8487

88+
func BenchmarkDeepSlice(b *testing.B) {
89+
benchmark(b, "1000kb slice nested at max-depth")
90+
}
91+
92+
func BenchmarkDeepFlow(b *testing.B) {
93+
benchmark(b, "1000kb slice nested in maps at max-depth")
94+
}
95+
96+
func Benchmark1000KBMaxDepthNested(b *testing.B) {
97+
benchmark(b, "1000kb of 10000-nested lines")
98+
}
99+
85100
func benchmark(b *testing.B, name string) {
86101
for _, t := range limitTests {
87102
if t.name != name {

scannerc.go

+20-26
Original file line numberDiff line numberDiff line change
@@ -657,35 +657,22 @@ func trace(args ...interface{}) func() {
657657
func yaml_parser_fetch_more_tokens(parser *yaml_parser_t) bool {
658658
// While we need more tokens to fetch, do it.
659659
for {
660-
// Check if we really need to fetch more tokens.
661-
need_more_tokens := false
662-
663660
// [Go] The comment parsing logic requires a lookahead of two tokens
664661
// so that foot comments may be parsed in time of associating them
665662
// with the tokens that are parsed before them, and also for line
666663
// comments to be transformed into head comments in some edge cases.
667-
if parser.tokens_head >= len(parser.tokens)-2 {
668-
need_more_tokens = true
669-
} else {
670-
// Check if any potential simple key may occupy the head position.
671-
for i := len(parser.simple_keys) - 1; i >= 0; i-- {
672-
simple_key := &parser.simple_keys[i]
673-
if simple_key.token_number < parser.tokens_parsed {
674-
break
675-
}
676-
if valid, ok := yaml_simple_key_is_valid(parser, simple_key); !ok {
677-
return false
678-
} else if valid && simple_key.token_number == parser.tokens_parsed {
679-
need_more_tokens = true
680-
break
681-
}
664+
if parser.tokens_head < len(parser.tokens)-2 {
665+
// If a potential simple key is at the head position, we need to fetch
666+
// the next token to disambiguate it.
667+
head_tok_idx, ok := parser.simple_keys_by_tok[parser.tokens_parsed]
668+
if !ok {
669+
break
670+
} else if valid, ok := yaml_simple_key_is_valid(parser, &parser.simple_keys[head_tok_idx]); !ok {
671+
return false
672+
} else if !valid {
673+
break
682674
}
683675
}
684-
685-
// We are finished.
686-
if !need_more_tokens {
687-
break
688-
}
689676
// Fetch the next token.
690677
if !yaml_parser_fetch_next_token(parser) {
691678
return false
@@ -938,6 +925,7 @@ func yaml_parser_save_simple_key(parser *yaml_parser_t) bool {
938925
return false
939926
}
940927
parser.simple_keys[len(parser.simple_keys)-1] = simple_key
928+
parser.simple_keys_by_tok[simple_key.token_number] = len(parser.simple_keys) - 1
941929
}
942930
return true
943931
}
@@ -952,9 +940,10 @@ func yaml_parser_remove_simple_key(parser *yaml_parser_t) bool {
952940
"while scanning a simple key", parser.simple_keys[i].mark,
953941
"could not find expected ':'")
954942
}
943+
// Remove the key from the stack.
944+
parser.simple_keys[i].possible = false
945+
delete(parser.simple_keys_by_tok, parser.simple_keys[i].token_number)
955946
}
956-
// Remove the key from the stack.
957-
parser.simple_keys[i].possible = false
958947
return true
959948
}
960949

@@ -985,7 +974,9 @@ func yaml_parser_increase_flow_level(parser *yaml_parser_t) bool {
985974
func yaml_parser_decrease_flow_level(parser *yaml_parser_t) bool {
986975
if parser.flow_level > 0 {
987976
parser.flow_level--
988-
parser.simple_keys = parser.simple_keys[:len(parser.simple_keys)-1]
977+
last := len(parser.simple_keys) - 1
978+
delete(parser.simple_keys_by_tok, parser.simple_keys[last].token_number)
979+
parser.simple_keys = parser.simple_keys[:last]
989980
}
990981
return true
991982
}
@@ -1092,6 +1083,8 @@ func yaml_parser_fetch_stream_start(parser *yaml_parser_t) bool {
10921083
// Initialize the simple key stack.
10931084
parser.simple_keys = append(parser.simple_keys, yaml_simple_key_t{})
10941085

1086+
parser.simple_keys_by_tok = make(map[int]int)
1087+
10951088
// A simple key is allowed at the beginning of the stream.
10961089
parser.simple_key_allowed = true
10971090

@@ -1396,6 +1389,7 @@ func yaml_parser_fetch_value(parser *yaml_parser_t) bool {
13961389

13971390
// Remove the simple key.
13981391
simple_key.possible = false
1392+
delete(parser.simple_keys_by_tok, simple_key.token_number)
13991393

14001394
// A simple key cannot follow another simple key.
14011395
parser.simple_key_allowed = false

yamlh.go

+1
Original file line numberDiff line numberDiff line change
@@ -622,6 +622,7 @@ type yaml_parser_t struct {
622622

623623
simple_key_allowed bool // May a simple key occur at the current position?
624624
simple_keys []yaml_simple_key_t // The stack of simple keys.
625+
simple_keys_by_tok map[int]int // possible simple_key indexes indexed by token_number
625626

626627
// Parser stuff
627628

0 commit comments

Comments
 (0)