Skip to content

Commit

Permalink
Check for terminals that also match a string production but match lon…
Browse files Browse the repository at this point in the history
…ger than that string; they should not match.
  • Loading branch information
gkellogg committed Oct 15, 2024
1 parent 16f9caa commit c8f4095
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 17 deletions.
9 changes: 4 additions & 5 deletions etc/iso-ebnf.isoebnf
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
(* W3C EBNF for ISO/IEC 14977 : 1996 EBNF *)
(* Scoured from https://www.cl.cam.ac.uk/~mgk25/iso-14977.pdf *)

syntax = syntax_rule, {syntax_rule} ;
Expand Down Expand Up @@ -44,10 +43,10 @@ repeated_sequence = start_repeat_symbol, definitions_list, end_repeat_symbol
grouped_sequence = '(', definitions_list, ')'
(* The brackets ( and ) allow any <definitions list> to be a <primary> *);

terminal_string = ("'", first_terminal_character, {first_terminal_character}, "'")
| ('"', second_terminal_character, {second_terminal_character}, '"')
(* A <terminal string> represents the
<characters> between the quote symbols '_' or "_" *);
terminal_string = ("'", first_terminal_character, {first_terminal_character}, "'")
| ('"', second_terminal_character, {second_terminal_character}, '"')
(* A <terminal string> represents the
<characters> between the quote symbols '_' or "_" *);

meta_identifier = letter, {meta_identifier_character}
(* A <meta identifier> is the name of a syntactic element of the language being defined *);
Expand Down
7 changes: 4 additions & 3 deletions lib/ebnf/peg/parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,8 @@ def terminal(term, regexp = nil, **options, &block)

##
# Defines a production called at the beginning of a particular production
# with data from previous production. Block is called in an evaluation block from
# with data from previous production along with data defined for the
# current production. Block is called in an evaluation block from
# the enclosing parser.
#
# @param [Symbol] term
Expand Down Expand Up @@ -578,5 +579,5 @@ def initialize(message, **options)
super(message.to_s)
end
end # class Error
end # class Parser
end # module EBNF::LL1
end # module Parser
end # module EBNF::PEG
23 changes: 15 additions & 8 deletions lib/ebnf/peg/rule.rb
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def parse(input, **options)

result = case expr.first
when :alt
# Return the first expression to match.
# Return the first expression to match. Look at strings before terminals before non-terminals, with strings ordered by longest first
# Result is either :unmatched, or the value of the matching rule
alt = :unmatched
expr[1..-1].each do |prod|
Expand All @@ -87,9 +87,8 @@ def parse(input, **options)
raise "No rule found for #{prod}" unless rule
rule.parse(input, **options)
when String
# If the input matches any terminal, then it can't be treated as a string
if matched = parser.class.terminal_regexps.detect {|sym, re| input.scan(re)}
input.unscan # Reset scan position
# If the input matches a terminal for which the string is a prefix, don't match the string
if terminal_also_matches(input, prod, string_regexp_opts)
:unmatched
else
s = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts))
Expand Down Expand Up @@ -136,8 +135,7 @@ def parse(input, **options)
raise "No rule found for #{prod}" unless rule
rule.parse(input, **options)
when String
if matched = parser.class.terminal_regexps.detect {|sym, re| input.scan(re)}
input.unscan # Reset scan position
if terminal_also_matches(input, prod, string_regexp_opts)
:unmatched
else
s = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts))
Expand Down Expand Up @@ -195,8 +193,7 @@ def parse(input, **options)
raise "No rule found for #{prod}" unless rule
rule.parse(input, **options.merge(_rept_data: accumulator))
when String
if matched = parser.class.terminal_regexps.detect {|sym, re| input.scan(re)}
input.unscan # Reset scan position
if terminal_also_matches(input, prod, string_regexp_opts)
:unmatched
else
s = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts))
Expand Down Expand Up @@ -236,6 +233,7 @@ def parse(input, **options)
end

if result == :unmatched
# Rewind input to entry point if unmatched.
input.pos, input.lineno = pos, lineno
end

Expand Down Expand Up @@ -287,6 +285,15 @@ def rept(input, min, max, prod, string_regexp_opts, **options)
result.length < min ? :unmatched : result.compact
end

##
# See if a terminal could have a longer match than a string.
#
# A string production must not be matched when the string itself matches at
# the current scan position and some terminal also matches there, consuming
# more input than the string does.
#
# Both lengths are taken from the scanner (`match?`), so they are measured in
# the same unit. Comparing the scanner's reported size against
# `String#length` would mix bytes with characters and wrongly reject
# multi-byte strings whenever a terminal matches exactly the same text.
#
# @param [#match?] input
#   scanner positioned at the point to test; `match?` does not advance it
# @param [String] prod
#   the literal string of the production
# @param [Object] string_regexp_opts
#   passed unchanged as the options argument of `Regexp.new` when building
#   the regular expression for `prod`
# @return [Boolean, nil]
#   true if a terminal matches more input than the string does; false if no
#   terminal does; nil if the string itself does not match
def terminal_also_matches(input, prod, string_regexp_opts)
  str_regex = Regexp.new(Regexp.quote(prod), string_regexp_opts)

  # Size of the string match as reported by the scanner, or nil when the
  # string does not match at the current position.
  str_len = input.match?(str_regex)
  return unless str_len

  parser.class.terminal_regexps.any? do |_sym, re|
    (match_len = input.match?(re)) && match_len > str_len
  end
end

##
# Eat whitespace between non-terminal rules
def eat_whitespace(input)
Expand Down
5 changes: 4 additions & 1 deletion spec/peg/rule_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@

describe EBNF::PEG::Rule do
describe "#parse" do
let(:parser) {double("PEG Parser", whitespace: /\s+/, packrat: {}, update_furthest_failure: true)}
let(:parser_class) {double("PEG Parser Class", terminal_regexps: {})}
let(:parser) {
double("PEG Parser", whitespace: /\s+/, packrat: {}, update_furthest_failure: true, class: parser_class)
}

context "non-terminal rules" do
{
Expand Down

0 comments on commit c8f4095

Please sign in to comment.