feat: support missing variants for regexp string functions (#750)

substrait-io · Dec 10, 2024 · 3410a3e · 3410a3e
1 parent 35eb867
commit 3410a3e
Show file tree

Hide file tree

Showing 6 changed files with 161 additions and 5 deletions.
diff --git a/extensions/functions_string.yaml b/extensions/functions_string.yaml
@@ -197,6 +197,36 @@ scalar_functions:
           dotall:
             values: [ DOTALL_DISABLED, DOTALL_ENABLED ]
         return: "string"
+  -
+    name: regexp_match_substring
+    description: >-
+      Extract a substring that matches the given regular expression pattern. The regular expression
+      pattern should follow the International Components for Unicode implementation
+      (https://unicode-org.github.io/icu/userguide/strings/regexp.html). The first occurrence of the
+      pattern from the beginning of the string is extracted. It returns the substring matching the
+      full regular expression.
+
+      The `case_sensitivity` option specifies case-sensitive or case-insensitive matching.
+      Enabling the `multiline` option will treat the input string as multiple lines. This makes
+      the `^` and `$` characters match at the beginning and end of any line, instead of just the
+      beginning and end of the input string. Enabling the `dotall` option makes the `.` character
+      match line terminator characters in a string.
+
+      Behavior is undefined if the regex fails to compile.
+    impls:
+      - args:
+          - value: "string"
+            name: "input"
+          - value: "string"
+            name: "pattern"
+        options:
+          case_sensitivity:
+            values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ]
+          multiline:
+            values: [ MULTILINE_DISABLED, MULTILINE_ENABLED ]
+          dotall:
+            values: [ DOTALL_DISABLED, DOTALL_ENABLED ]
+        return: "string"
   -
     name: regexp_match_substring_all
     description: >-
@@ -778,6 +808,35 @@ scalar_functions:
           dotall:
             values: [ DOTALL_DISABLED, DOTALL_ENABLED ]
         return: i64
+  -
+    name: regexp_count_substring
+    description: >-
+      Return the number of non-overlapping occurrences of a regular expression pattern in an input
+      string. The regular expression pattern should follow the International Components for
+      Unicode implementation (https://unicode-org.github.io/icu/userguide/strings/regexp.html).
+      The match starts at the first character of the input string.
+
+      The `case_sensitivity` option specifies case-sensitive or case-insensitive matching.
+      Enabling the `multiline` option will treat the input string as multiple lines. This makes
+      the `^` and `$` characters match at the beginning and end of any line, instead of just the
+      beginning and end of the input string. Enabling the `dotall` option makes the `.` character
+      match line terminator characters in a string.
+
+      Behavior is undefined if the regex fails to compile.
+    impls:
+      - args:
+          - value: "string"
+            name: "input"
+          - value: "string"
+            name: "pattern"
+        options:
+          case_sensitivity:
+            values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ]
+          multiline:
+            values: [ MULTILINE_DISABLED, MULTILINE_ENABLED ]
+          dotall:
+            values: [ DOTALL_DISABLED, DOTALL_ENABLED ]
+        return: i64
   -
     name: replace
     description: >-
@@ -1198,6 +1257,43 @@ scalar_functions:
           dotall:
             values: [ DOTALL_DISABLED, DOTALL_ENABLED ]
         return: "varchar<L1>"
+  -
+    name: regexp_replace
+    description: >-
+      Search a string for a substring that matches a given regular expression pattern and replace
+      it with a replacement string. The regular expression pattern should follow the International
+      Components for Unicode implementation
+      (https://unicode-org.github.io/icu/userguide/strings/regexp.html). The replacement string can
+      refer to capture groups using numbered backreferences. All occurrences of the pattern will be
+      replaced. The search for matches starts at the first character of the input.
+
+      The `case_sensitivity` option specifies case-sensitive or case-insensitive matching.
+      Enabling the `multiline` option will treat the input string as multiple lines. This makes
+      the `^` and `$` characters match at the beginning and end of any line, instead of just the
+      beginning and end of the input string. Enabling the `dotall` option makes the `.` character
+      match line terminator characters in a string.
+
+      Behavior is undefined if the regex fails to compile or the replacement contains an illegal
+      backreference.
+    impls:
+      - args:
+          - value: "string"
+            name: "input"
+            description: The input string.
+          - value: "string"
+            name: "pattern"
+            description: The regular expression to search for within the input string.
+          - value: "string"
+            name: "replacement"
+            description: The replacement string.
+        options:
+          case_sensitivity:
+            values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ]
+          multiline:
+            values: [ MULTILINE_DISABLED, MULTILINE_ENABLED ]
+          dotall:
+            values: [ DOTALL_DISABLED, DOTALL_ENABLED ]
+        return: "string"
   -
     name: ltrim
     description: >-

diff --git a/tests/cases/aggregate_approx/approx_count_distinct.test b/tests/cases/aggregate_approx/approx_count_distinct.test
@@ -7,8 +7,6 @@ approx_count_distinct((-32767, -20000, 30000, 5, 32767)::i16) = 5::i64
 approx_count_distinct((-2147483648, -10000000, 30000000, 2147483647)::i32) = 4::i64
 approx_count_distinct((-214748364800000, -1000000000, 0, 922337203685477580)::i64) = 4::i64
 approx_count_distinct((1)::i8) = 1::i64
-approx_count_distinct(('abc', 'def', 'ghi')::str) = 3::i64
-approx_count_distinct(('abc', Null, 'ghi')::str) = 2::i64
 approx_count_distinct(()::i8) = 0::i64
 approx_count_distinct((Null, Null, Null)::i8) = 0::i64
 approx_count_distinct((Null, Null, 4, 3, Null, 922337203685477580, 12833888)::i64) = 4::i64
diff --git a/tests/cases/string/regexp_count_substring.test b/tests/cases/string/regexp_count_substring.test
@@ -7,10 +7,17 @@ regexp_count_substring('foobarboopzoo'::str, 'o{1}'::str, 1::i64) = 6::i64
 regexp_count_substring('abcabcacb'::str, '[bc]'::str, 1::i64) = 6::i64
 regexp_count_substring('abcdefc'::str, '(.*)c'::str, 1::i64) = 1::i64
 regexp_count_substring('abcdefc'::str, '(.*)c?'::str, 1::i64) = 2::i64
+regexp_count_substring('foobarboopzoo'::str, 'o{1,}'::str) = 3::i64
+regexp_count_substring('foobarboopzoo'::str, 'o{1}'::str) = 6::i64
+regexp_count_substring('abcabcacb'::str, '[bc]'::str) = 6::i64
+regexp_count_substring('abcdefc'::str, '(.*)c'::str) = 1::i64
+regexp_count_substring('abcdefc'::str, '(.*)c?'::str) = 2::i64
 
 # null_input: Examples with null as input
 regexp_count_substring('Hello'::str, null::str, 1::i64) = null::i64
 regexp_count_substring(null::str, ' '::str, 1::i64) = null::i64
+regexp_count_substring('Hello'::str, null::str) = null::i64
+regexp_count_substring(null::str, ' '::str) = null::i64
 
 # metacharacters: Examples with metacharacters
 regexp_count_substring('abc1abc'::str, '\d'::str, 1::i64) = 1::i64
@@ -19,15 +26,25 @@ regexp_count_substring('abc def ghi'::str, '\s'::str, 1::i64) = 2::i64
 regexp_count_substring('abc def ghi'::str, '\S'::str, 1::i64) = 9::i64
 regexp_count_substring('abc def ghi'::str, '\w'::str, 1::i64) = 9::i64
 regexp_count_substring('abc def ghi,'::str, '\W'::str, 1::i64) = 3::i64
+regexp_count_substring('abc1abc'::str, '\d'::str) = 1::i64
+regexp_count_substring('abc1abc'::str, '\D'::str) = 6::i64
+regexp_count_substring('abc def ghi'::str, '\s'::str) = 2::i64
+regexp_count_substring('abc def ghi'::str, '\S'::str) = 9::i64
+regexp_count_substring('abc def ghi'::str, '\w'::str) = 9::i64
+regexp_count_substring('abc def ghi,'::str, '\W'::str) = 3::i64
 
 # lookahead: Examples with lookahead
 regexp_count_substring('100 dollars 100 dollars'::str, '\d+(?= dollars)'::str, 1::i64) [lookaround:TRUE] = 2::i64
+regexp_count_substring('100 dollars 100 dollars'::str, '\d+(?= dollars)'::str) [lookaround:TRUE] = 2::i64
 
 # negative_lookahead: Examples with negative lookahead
 regexp_count_substring('100 pesos, 99 pesos, 98 pesos'::str, '\d+(?!\d| dollars)'::str, 1::i64) [lookaround:TRUE] = 3::i64
+regexp_count_substring('100 pesos, 99 pesos, 98 pesos'::str, '\d+(?!\d| dollars)'::str) [lookaround:TRUE] = 3::i64
 
 # lookbehind: Examples with lookbehind
 regexp_count_substring('USD100'::str, '(?<=USD)\d{3}'::str, 1::i64) [lookaround:TRUE] = 1::i64
+regexp_count_substring('USD100'::str, '(?<=USD)\d{3}'::str) [lookaround:TRUE] = 1::i64
 
 # negative_lookbehind: Examples with negative lookbehind
 regexp_count_substring('JPY100JPY100'::str, '\d{3}(?<!USD\d{3})'::str, 1::i64) [lookaround:TRUE] = 2::i64
+regexp_count_substring('JPY100JPY100'::str, '\d{3}(?<!USD\d{3})'::str) [lookaround:TRUE] = 2::i64
diff --git a/tests/cases/string/regexp_match_substring.test b/tests/cases/string/regexp_match_substring.test
@@ -7,10 +7,17 @@ regexp_match_substring('foobarboopzoo'::str, 'o{1}'::str, 1::i64, 1::i64, 0::i64
 regexp_match_substring('abcabcacb'::str, '[bc]'::str, 1::i64, 1::i64, 0::i64) = 'b'::str
 regexp_match_substring('abcdefghi'::str, '(.*)c'::str, 1::i64, 1::i64, 0::i64) = 'abc'::str
 regexp_match_substring('abcdefghi'::str, '(.*)c?'::str, 1::i64, 1::i64, 0::i64) = 'abcdefghi'::str
+regexp_match_substring('foobarboopzoo'::str, 'o{1,}'::str) = 'oo'::str
+regexp_match_substring('foobarboopzoo'::str, 'o{1}'::str) = 'o'::str
+regexp_match_substring('abcabcacb'::str, '[bc]'::str) = 'b'::str
+regexp_match_substring('abcdefghi'::str, '(.*)c'::str) = 'abc'::str
+regexp_match_substring('abcdefghi'::str, '(.*)c?'::str) = 'abcdefghi'::str
 
 # null_input: Examples with null as input
 regexp_match_substring('Hello'::str, null::str, 1::i64, 1::i64, 0::i64) = null::str
 regexp_match_substring(null::str, ' '::str, 1::i64, 1::i64, 0::i64) = null::str
+regexp_match_substring('Hello'::str, null::str) = null::str
+regexp_match_substring(null::str, ' '::str) = null::str
 
 # metacharacters: Examples with metacharacters
 regexp_match_substring('abc1abc'::str, '\d'::str, 1::i64, 1::i64, 0::i64) = '1'::str
@@ -21,15 +28,27 @@ regexp_match_substring('abc def ghi'::str, '\S+'::str, 1::i64, 1::i64, 0::i64) =
 regexp_match_substring('abc def ghi'::str, '\w'::str, 1::i64, 1::i64, 0::i64) = 'a'::str
 regexp_match_substring('abc def ghi'::str, '\w+'::str, 1::i64, 1::i64, 0::i64) = 'abc'::str
 regexp_match_substring('abc def ghi,'::str, '\W'::str, 1::i64, 1::i64, 0::i64) = ' '::str
+regexp_match_substring('abc1abc'::str, '\d'::str) = '1'::str
+regexp_match_substring('abc1abc'::str, '\D'::str) = 'a'::str
+regexp_match_substring('abc def ghi'::str, '\s'::str) = ' '::str
+regexp_match_substring('abc def ghi'::str, '\S'::str) = 'a'::str
+regexp_match_substring('abc def ghi'::str, '\S+'::str) = 'abc'::str
+regexp_match_substring('abc def ghi'::str, '\w'::str) = 'a'::str
+regexp_match_substring('abc def ghi'::str, '\w+'::str) = 'abc'::str
+regexp_match_substring('abc def ghi,'::str, '\W'::str) = ' '::str
 
 # lookahead: Examples with lookahead
 regexp_match_substring('100 dollars'::str, '\d+(?= dollars)'::str, 1::i64, 1::i64, 0::i64) [lookaround:TRUE] = '100'::str
+regexp_match_substring('100 dollars'::str, '\d+(?= dollars)'::str) [lookaround:TRUE] = '100'::str
 
 # negative_lookahead: Examples with negative lookahead
 regexp_match_substring('100 pesos'::str, '\d+(?!\d| dollars)'::str, 1::i64, 1::i64, 0::i64) [lookaround:TRUE] = '100'::str
+regexp_match_substring('100 pesos'::str, '\d+(?!\d| dollars)'::str) [lookaround:TRUE] = '100'::str
 
 # lookbehind: Examples with lookbehind
 regexp_match_substring('USD100'::str, '(?<=USD)\d{3}'::str, 1::i64, 1::i64, 0::i64) [lookaround:TRUE] = '100'::str
+regexp_match_substring('USD100'::str, '(?<=USD)\d{3}'::str) [lookaround:TRUE] = '100'::str
 
 # negative_lookbehind: Examples with negative lookbehind
 regexp_match_substring('JPY100'::str, '\d{3}(?<!USD\d{3})'::str, 1::i64, 1::i64, 0::i64) [lookaround:TRUE] = '100'::str
+regexp_match_substring('JPY100'::str, '\d{3}(?<!USD\d{3})'::str) [lookaround:TRUE] = '100'::str
diff --git a/tests/cases/string/regexp_replace.test b/tests/cases/string/regexp_replace.test
@@ -4,24 +4,36 @@
 # basic: Basic examples without any special cases
 regexp_replace('[email protected]'::str, '^\S+@\S+$'::str, 'email_found'::str, 1::i64, 0::i64) = 'email_found'::str
 regexp_replace('17:50'::str, '[0-9]?[0-9]:[0-9][0-9]'::str, 'TIME'::str, 1::i64, 0::i64) = 'TIME'::str
+regexp_replace('[email protected]'::str, '^\S+@\S+$'::str, 'email_found'::str) = 'email_found'::str
+regexp_replace('17:50'::str, '[0-9]?[0-9]:[0-9][0-9]'::str, 'TIME'::str) = 'TIME'::str
 
 # lazy_matching: Examples with lazy matching
 regexp_replace('Hello'::str, 'Hel+?'::str, '1'::str, 1::i64, 0::i64) = '1lo'::str
 regexp_replace('Hello'::str, 'Hel+'::str, '1'::str, 1::i64, 0::i64) = '1o'::str
+regexp_replace('Hello'::str, 'Hel+?'::str, '1'::str) = '1lo'::str
+regexp_replace('Hello'::str, 'Hel+'::str, '1'::str) = '1o'::str
 
 # greedy_matching: Examples with greedy matching
 regexp_replace('Hello'::str, 'Hel+'::str, '1'::str, 1::i64, 0::i64) = '1o'::str
 regexp_replace('Helo'::str, 'Hel+'::str, '1'::str, 1::i64, 0::i64) = '1o'::str
+regexp_replace('Hello'::str, 'Hel+'::str, '1'::str) = '1o'::str
+regexp_replace('Helo'::str, 'Hel+'::str, '1'::str) = '1o'::str
 
 # null_input: Examples with null as input
 regexp_replace('Hello'::str, null::str, '1'::str, 1::i64, 0::i64) = null::str
 regexp_replace(null::str, ' '::str, '1'::str, 1::i64, 0::i64) = null::str
+regexp_replace('Hello'::str, null::str, '1'::str) = null::str
+regexp_replace(null::str, ' '::str, '1'::str) = null::str
 
 # position_anchors: Examples with position anchors
 regexp_replace('abcdefg'::str, '\Aabc'::str, '111'::str, 1::i64, 0::i64) = '111defg'::str
 regexp_replace('abcdefg'::str, 'efg$'::str, '111'::str, 1::i64, 0::i64) = 'abcd111'::str
 regexp_replace('catdogdog'::str, '^cat'::str, 'dog'::str, 1::i64, 0::i64) = 'dogdogdog'::str
 regexp_replace('dogcatdogdog'::str, '^cat'::str, 'dog'::str, 1::i64, 0::i64) = 'dogcatdogdog'::str
+regexp_replace('abcdefg'::str, '\Aabc'::str, '111'::str) = '111defg'::str
+regexp_replace('abcdefg'::str, 'efg$'::str, '111'::str) = 'abcd111'::str
+regexp_replace('catdogdog'::str, '^cat'::str, 'dog'::str) = 'dogdogdog'::str
+regexp_replace('dogcatdogdog'::str, '^cat'::str, 'dog'::str) = 'dogcatdogdog'::str
 
 # metacharacters: Examples with metacharacters
 regexp_replace('abc1abc'::str, '\d'::str, ''::str, 1::i64, 0::i64) = 'abcabc'::str
@@ -30,21 +42,35 @@ regexp_replace('abc def'::str, '\s'::str, ''::str, 1::i64, 0::i64) = 'abcdef'::s
 regexp_replace('a bcdef'::str, '\S'::str, ','::str, 1::i64, 0::i64) = ', bcdef'::str
 regexp_replace(' abcdef'::str, '\w'::str, '1'::str, 1::i64, 0::i64) = ' 1bcdef'::str
 regexp_replace('a bcdef'::str, '\W'::str, 'a'::str, 1::i64, 0::i64) = 'aabcdef'::str
+regexp_replace('abc1abc'::str, '\d'::str, ''::str) = 'abcabc'::str
+regexp_replace('111a111'::str, '\D'::str, ''::str) = '111111'::str
+regexp_replace('abc def'::str, '\s'::str, ''::str) = 'abcdef'::str
+regexp_replace('a bcdef'::str, '\S'::str, ','::str) = ', bcdef'::str
+regexp_replace(' abcdef'::str, '\w'::str, '1'::str) = ' 1bcdef'::str
+regexp_replace('a bcdef'::str, '\W'::str, 'a'::str) = 'aabcdef'::str
 
 # occurrence_indicator: Examples with occurrence indicators
 regexp_replace('abc123abc'::str, '[0-9]+'::str, 'abc'::str, 1::i64, 0::i64) = 'abcabcabc'::str
 regexp_replace('abcabcabc'::str, '[bc]'::str, 'dd'::str, 1::i64, 0::i64) = 'addcabcabc'::str
 regexp_replace('abc'::str, '(.*)c'::str, '\1e'::str, 1::i64, 0::i64) = 'abe'::str
 regexp_replace('abbbbc'::str, '[b]{2,3}'::str, 'd'::str, 1::i64, 0::i64) = 'adbc'::str
+regexp_replace('abc123abc'::str, '[0-9]+'::str, 'abc'::str) = 'abcabcabc'::str
+regexp_replace('abcabcabc'::str, '[bc]'::str, 'dd'::str) = 'addcabcabc'::str
+regexp_replace('abc'::str, '(.*)c'::str, '\1e'::str) = 'abe'::str
+regexp_replace('abbbbc'::str, '[b]{2,3}'::str, 'd'::str) = 'adbc'::str
 
 # lookahead: Examples with lookahead
 regexp_replace('100 dollars'::str, '\d+(?= dollars)'::str, 'hundred'::str, 1::i64, 0::i64) [lookaround:TRUE] = 'hundred dollars'::str
+regexp_replace('100 dollars'::str, '\d+(?= dollars)'::str, 'hundred'::str) [lookaround:TRUE] = 'hundred dollars'::str
 
 # negative_lookahead: Examples with negative lookahead
 regexp_replace('100 pesos'::str, '\d+(?!\d| dollars)'::str, '999'::str, 1::i64, 0::i64) [lookaround:TRUE] = '999 pesos'::str
+regexp_replace('100 pesos'::str, '\d+(?!\d| dollars)'::str, '999'::str) [lookaround:TRUE] = '999 pesos'::str
 
 # lookbehind: Examples with lookbehind
 regexp_replace('USD100'::str, '(?<=USD)\d{3}'::str, '999'::str, 1::i64, 0::i64) [lookaround:TRUE] = 'USD999'::str
+regexp_replace('USD100'::str, '(?<=USD)\d{3}'::str, '999'::str) [lookaround:TRUE] = 'USD999'::str
 
 # negative_lookbehind: Examples with negative lookbehind
 regexp_replace('JPY100'::str, '\d{3}(?<!USD\d{3})'::str, '999'::str, 1::i64, 0::i64) [lookaround:TRUE] = 'JPY999'::str
+regexp_replace('JPY100'::str, '\d{3}(?<!USD\d{3})'::str, '999'::str) [lookaround:TRUE] = 'JPY999'::str
diff --git a/tests/test_extensions.py b/tests/test_extensions.py
@@ -24,12 +24,12 @@ def test_substrait_extension_coverage():
     all_test_files = load_all_testcases(test_case_dir)
     coverage = get_test_coverage(all_test_files, registry)
 
-    assert coverage.test_count >= 1018
+    assert coverage.test_count >= 1077
     assert (
         coverage.num_tests_with_no_matching_function == 0
     ), f"{coverage.num_tests_with_no_matching_function} tests with no matching function"
-    assert coverage.num_covered_function_variants >= 223
-    assert coverage.total_function_variants >= 510
+    assert coverage.num_covered_function_variants >= 226
+    assert coverage.total_function_variants >= 513
     assert (
         coverage.total_function_variants - coverage.num_covered_function_variants
     ) <= 287, (