Skip to content

Commit

Permalink
feat: character set intersections
Browse files Browse the repository at this point in the history
  • Loading branch information
Aloso committed Nov 30, 2024
1 parent 1ca33b2 commit a8e4710
Show file tree
Hide file tree
Showing 31 changed files with 473 additions and 129 deletions.
1 change: 1 addition & 0 deletions pomsky-bin/src/args/features.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ pub(super) fn parse_features(value: OsString) -> Result<PomskyFeatures, ParseArg
"regexes" => features.regexes(true),
"dot" => features.dot(true),
"recursion" => features.recursion(true),
"intersection" => features.intersection(true),
s => {
efprintln!(Y!"warning" ": unknown feature `" {s} "`");
features
Expand Down
1 change: 1 addition & 0 deletions pomsky-bin/src/args/help.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ Currently, the following warnings can be disabled:"]
"boundaries" => { ["Allows matching word boundaries and anchors " g:"%" ", " g:"!%" ", " g:"^" ", " g:"$"] }
"dot" => { ["Allows matching the dot " g:"."] }
"grapheme" => { ["Allows matching a grapheme cluster with " g:"Grapheme" " or " g:"G"] }
"intersection" => { ["Allows intersecting character sets with " g:"&"] }
"lazy-mode" => { ["Allows enabling lazy mode globally with " g:"enable lazy;"] }
"lookahead" => { ["Allows (negative) lookahead, e.g. " g:"(>> 'test')"] }
"lookbehind" => { ["Allows (negative) lookbehind, e.g. " g:"(<< 'test')"] }
Expand Down
13 changes: 11 additions & 2 deletions pomsky-lib/src/diagnose/compile_error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ pub(crate) enum CompileErrorKind {
flavor: RegexFlavor,
},
NestedTest,
BadIntersection,
}

impl CompileErrorKind {
Expand All @@ -113,6 +114,10 @@ impl core::fmt::Display for CompileErrorKind {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match self {
CompileErrorKind::ParseError(kind) => write!(f, "Parse error: {kind}"),
CompileErrorKind::BadIntersection => write!(f,
"Intersecting these expressions is not supported. Only character sets \
can be intersected."
),
CompileErrorKind::Unsupported(feature, flavor) => match feature {
Feature::SpecificUnicodeProp => write!(
f,
Expand All @@ -133,11 +138,13 @@ impl core::fmt::Display for CompileErrorKind {
),
Feature::NegativeShorthandW => write!(
f,
"In the `{flavor:?}` flavor, `word` can only be negated in a character class when Unicode is disabled"
"In the `{flavor:?}` flavor, `word` can only be negated in a character class \
when Unicode is disabled"
),
Feature::NegativeShorthandS => write!(
f,
"In the `{flavor:?}` flavor, `space` can only be negated in a character class when Unicode is disabled"
"In the `{flavor:?}` flavor, `space` can only be negated in a character class \
when Unicode is disabled"
),
_ => write!(
f,
Expand Down Expand Up @@ -239,6 +246,7 @@ pub(crate) enum UnsupportedError {
Regexes,
Dot,
Recursion,
Intersection,
}

impl std::error::Error for UnsupportedError {}
Expand All @@ -261,6 +269,7 @@ impl core::fmt::Display for UnsupportedError {
UnsupportedError::Regexes => "Unescaped regexes aren't supported",
UnsupportedError::Dot => "The dot isn't supported",
UnsupportedError::Recursion => "Recursion isn't supported",
UnsupportedError::Intersection => "Intersection isn't supported",
};

f.write_str(error)
Expand Down
2 changes: 2 additions & 0 deletions pomsky-lib/src/diagnose/diagnostic_code.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ diagnostic_code! {
RubyLookaheadInLookbehind = 319,
UnsupportedInLookbehind = 320,
LookbehindNotConstantLength = 321,
BadIntersection = 322,

// Warning indicating something might not be supported
PossiblyUnsupported = 400,
Expand Down Expand Up @@ -232,6 +233,7 @@ impl<'a> From<&'a CompileErrorKind> for DiagnosticCode {
C::UnsupportedInLookbehind { .. } => Self::UnsupportedInLookbehind,
C::LookbehindNotConstantLength { .. } => Self::LookbehindNotConstantLength,
C::NestedTest => Self::NestedTest,
C::BadIntersection => Self::BadIntersection,
}
}
}
Expand Down
3 changes: 2 additions & 1 deletion pomsky-lib/src/diagnose/diagnostic_kind.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ impl From<&CompileErrorKind> for DiagnosticKind {
| K::DotNetNumberedRefWithMixedGroups
| K::RubyLookaheadInLookbehind { .. }
| K::UnsupportedInLookbehind { .. }
| K::LookbehindNotConstantLength { .. } => DiagnosticKind::Unsupported,
| K::LookbehindNotConstantLength { .. }
| K::BadIntersection => DiagnosticKind::Unsupported,
K::RangeIsTooBig(_) => DiagnosticKind::Limits,
}
}
Expand Down
3 changes: 3 additions & 0 deletions pomsky-lib/src/diagnose/feature.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ pub enum Feature {
WordStartEnd,
/// Unicode script extensions, e.g. `[scx:Greek]`
ScriptExtensions,
/// Character set intersections
CharSetIntersection,
}

impl Feature {
Expand All @@ -70,6 +72,7 @@ impl Feature {
Feature::UnicodeWordBoundaries => "word boundaries in Unicode mode",
Feature::WordStartEnd => "word start and word end",
Feature::ScriptExtensions => "Unicode script extensions",
Feature::CharSetIntersection => "Character set intersections",
}
}
}
167 changes: 167 additions & 0 deletions pomsky-lib/src/exprs/char_class/char_set_item.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
use std::fmt;

use crate::options::RegexFlavor;

use super::{literal, Regex, RegexProperty, RegexShorthand, UnicodeSet};

/// An intersection of several character sets, optionally negated as a whole.
///
/// When emitted, the member sets are written inside one pair of brackets and
/// separated by `&&`; a negated compound set starts with `[^` instead of `[`.
#[cfg_attr(feature = "dbg", derive(Debug))]
#[derive(Default)]
pub(crate) struct RegexCompoundCharSet {
/// Whether the whole intersection is negated.
pub(crate) negative: bool,
/// The character sets that are intersected with each other, in emission order.
pub(crate) intersections: Vec<RegexCharSet>,
}

impl RegexCompoundCharSet {
    /// Creates a non-negated compound set whose only member is `set`.
    pub(crate) fn new(set: RegexCharSet) -> Self {
        Self { negative: false, intersections: vec![set] }
    }

    /// Flips the negation flag of the whole compound set and returns it.
    pub(crate) fn negate(mut self) -> RegexCompoundCharSet {
        self.negative ^= true;
        self
    }

    /// Intersects this compound set with `other`.
    ///
    /// If `other` and every member collected so far are negative, all of them
    /// are folded into the first member (via `extend` on its set) and a plain
    /// `Regex::CharSet` is returned; that set is negated once more when the
    /// compound set itself is negative. Otherwise `other` is appended and a
    /// `Regex::CompoundCharSet` is returned.
    ///
    /// # Panics
    ///
    /// Panics with "Intersection is empty" if the merge path is taken while
    /// `intersections` is empty.
    pub(crate) fn add(mut self, other: RegexCharSet) -> Regex {
        let all_negative = other.negative && self.intersections.iter().all(|set| set.negative);
        if !all_negative {
            self.intersections.push(other);
            return Regex::CompoundCharSet(self);
        }

        let flip_result = self.negative;
        let mut members = self.intersections.into_iter();
        let mut merged = members.next().expect("Intersection is empty");

        // Fold the remaining members first, then `other`, into the first set.
        for extra in members.map(|member| member.set).chain(Some(other.set)) {
            merged.set.extend(extra);
        }

        let merged = if flip_result { merged.negate() } else { merged };
        Regex::CharSet(merged)
    }

    /// Appends the compound set to `buf`: an opening `[` (or `[^` when
    /// negated), each member separated by `&&`, and a closing `]`.
    pub(crate) fn codegen(&self, buf: &mut String, flavor: RegexFlavor) {
        buf.push_str(if self.negative { "[^" } else { "[" });

        for (index, member) in self.intersections.iter().enumerate() {
            if index > 0 {
                buf.push_str("&&");
            }
            member.codegen(buf, flavor, true);
        }

        buf.push(']');
    }
}

/// A single character set, optionally negated.
///
/// The contents live in a `UnicodeSet`, which code generation reads through
/// its `ranges()` and `props()` iterators.
#[cfg_attr(feature = "dbg", derive(Debug))]
#[derive(Default)]
pub(crate) struct RegexCharSet {
/// Whether the set is negated (emitted as `[^...]`).
pub(crate) negative: bool,
/// The items (character ranges and shorthands/properties) of this set.
pub(crate) set: UnicodeSet,
}

impl RegexCharSet {
pub(crate) fn new(items: UnicodeSet) -> Self {
Self { negative: false, set: items }
}

pub(crate) fn negate(mut self) -> Self {
self.negative = !self.negative;
self
}

pub(crate) fn codegen(&self, buf: &mut String, flavor: RegexFlavor, inside_compound: bool) {
if self.set.len() == 1 {
if let Some(range) = self.set.ranges().next() {
let (first, last) = range.as_chars();
if first == last && !self.negative {
return literal::codegen_char_esc(first, buf, flavor);
}
} else if let Some(prop) = self.set.props().next() {
match prop {
RegexCharSetItem::Shorthand(s) => {
let shorthand = if self.negative { s.negate() } else { Some(s) };
if let Some(shorthand) = shorthand {
return shorthand.codegen(buf);
}
}
RegexCharSetItem::Property { negative, value } => {
return value.codegen(buf, negative ^ self.negative, flavor);
}
}
}
}

if self.negative {
buf.push_str("[^");
} else if !inside_compound {
buf.push('[');
}

let mut is_first = true;
for prop in self.set.props() {
match prop {
RegexCharSetItem::Shorthand(s) => s.codegen(buf),
RegexCharSetItem::Property { negative, value } => {
value.codegen(buf, negative, flavor);
}
}
is_first = false;
}
for range in self.set.ranges() {
let (first, last) = range.as_chars();
if first == last {
literal::compile_char_esc_in_class(first, buf, is_first, flavor);
} else {
literal::compile_char_esc_in_class(first, buf, is_first, flavor);
if range.first + 1 < range.last {
buf.push('-');
}
literal::compile_char_esc_in_class(last, buf, false, flavor);
}
is_first = false;
}

if self.negative || !inside_compound {
buf.push(']');
}
}
}

/// A character set item that is not a plain character range: either a
/// shorthand or a property.
#[derive(Clone, Copy, PartialEq, Eq)]
pub(crate) enum RegexCharSetItem {
/// A shorthand character class.
Shorthand(RegexShorthand),
/// A property `value`; `negative` tells whether the property is negated.
Property { negative: bool, value: RegexProperty },
}

impl RegexCharSetItem {
pub(crate) fn negate(self) -> Option<Self> {
match self {
RegexCharSetItem::Shorthand(s) => s.negate().map(RegexCharSetItem::Shorthand),
RegexCharSetItem::Property { negative, value } => {
Some(RegexCharSetItem::Property { negative: !negative, value })
}
}
}
}

impl fmt::Debug for RegexCharSetItem {
    /// Writes a shorthand as its string form, and a property as an optional
    /// leading `!` (when negated) followed by its prefix and its name.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match *self {
            Self::Shorthand(shorthand) => f.write_str(shorthand.as_str()),
            Self::Property { negative, value } => {
                if negative {
                    f.write_str("!")?;
                }
                f.write_str(value.prefix_as_str()).and_then(|()| f.write_str(value.as_str()))
            }
        }
    }
}
Loading

0 comments on commit a8e4710

Please sign in to comment.