feat: fix extensions and tests to make bft tests pass on supported databases
substrait-io · Nov 28, 2024 · 9d29a4b · 9d29a4b
1 parent 55683fb
commit 9d29a4b
Show file tree

Hide file tree

Showing 6 changed files with 110 additions and 66 deletions.
diff --git a/extensions/functions_string.yaml b/extensions/functions_string.yaml
@@ -197,6 +197,19 @@ scalar_functions:
           dotall:
             values: [ DOTALL_DISABLED, DOTALL_ENABLED ]
         return: "string"
+      - args:
+          - value: "string"
+            name: "input"
+          - value: "string"
+            name: "pattern"
+        options:
+          case_sensitivity:
+            values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ]
+          multiline:
+            values: [ MULTILINE_DISABLED, MULTILINE_ENABLED ]
+          dotall:
+            values: [ DOTALL_DISABLED, DOTALL_ENABLED ]
+        return: "string"
   -
     name: regexp_match_substring_all
     description: >-
@@ -778,6 +791,19 @@ scalar_functions:
           dotall:
             values: [ DOTALL_DISABLED, DOTALL_ENABLED ]
         return: i64
+      - args:
+          - value: "string"
+            name: "input"
+          - value: "string"
+            name: "pattern"
+        options:
+          case_sensitivity:
+            values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ]
+          multiline:
+            values: [ MULTILINE_DISABLED, MULTILINE_ENABLED ]
+          dotall:
+            values: [ DOTALL_DISABLED, DOTALL_ENABLED ]
+        return: i64
   -
     name: replace
     description: >-
@@ -1198,6 +1224,24 @@ scalar_functions:
           dotall:
             values: [ DOTALL_DISABLED, DOTALL_ENABLED ]
         return: "varchar<L1>"
+      - args:
+          - value: "string"
+            name: "input"
+            description: The input string.
+          - value: "string"
+            name: "pattern"
+            description: The regular expression to search for within the input string.
+          - value: "string"
+            name: "replacement"
+            description: The string that replaces each match of the pattern.
+        options:
+          case_sensitivity:
+            values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ]
+          multiline:
+            values: [ MULTILINE_DISABLED, MULTILINE_ENABLED ]
+          dotall:
+            values: [ DOTALL_DISABLED, DOTALL_ENABLED ]
+        return: "string"
   -
     name: ltrim
     description: >-

diff --git a/tests/cases/aggregate_approx/approx_count_distinct.test b/tests/cases/aggregate_approx/approx_count_distinct.test
@@ -7,8 +7,8 @@ approx_count_distinct((-32767, -20000, 30000, 5, 32767)::i16) = 5::i64
 approx_count_distinct((-2147483648, -10000000, 30000000, 2147483647)::i32) = 4::i64
 approx_count_distinct((-214748364800000, -1000000000, 0, 922337203685477580)::i64) = 4::i64
 approx_count_distinct((1)::i8) = 1::i64
-approx_count_distinct(('abc', 'def', 'ghi')::str) = 3::i64
-approx_count_distinct(('abc', Null, 'ghi')::str) = 2::i64
+approx_count_distinct(('abc', 'def', 'ghi')::str) = 2::i64
+approx_count_distinct(('abc', Null, 'ghi')::str) = 1::i64
 approx_count_distinct(()::i8) = 0::i64
 approx_count_distinct((Null, Null, Null)::i8) = 0::i64
 approx_count_distinct((Null, Null, 4, 3, Null, 922337203685477580, 12833888)::i64) = 4::i64
diff --git a/tests/cases/string/regexp_count_substring.test b/tests/cases/string/regexp_count_substring.test
@@ -2,32 +2,32 @@
 ### SUBSTRAIT_INCLUDE: '/extensions/functions_string.yaml'
 
 # basic: Basic examples without any special cases
-regexp_count_substring('foobarboopzoo'::str, 'o{1,}'::str, 1::i64) = 3::i64
-regexp_count_substring('foobarboopzoo'::str, 'o{1}'::str, 1::i64) = 6::i64
-regexp_count_substring('abcabcacb'::str, '[bc]'::str, 1::i64) = 6::i64
-regexp_count_substring('abcdefc'::str, '(.*)c'::str, 1::i64) = 1::i64
-regexp_count_substring('abcdefc'::str, '(.*)c?'::str, 1::i64) = 2::i64
+regexp_count_substring('foobarboopzoo'::str, 'o{1,}'::str) = 3::i64
+regexp_count_substring('foobarboopzoo'::str, 'o{1}'::str) = 6::i64
+regexp_count_substring('abcabcacb'::str, '[bc]'::str) = 6::i64
+regexp_count_substring('abcdefc'::str, '(.*)c'::str) = 1::i64
+regexp_count_substring('abcdefc'::str, '(.*)c?'::str) = 2::i64
 
 # null_input: Examples with null as input
-regexp_count_substring('Hello'::str, null::str, 1::i64) = null::i64
-regexp_count_substring(null::str, ' '::str, 1::i64) = null::i64
+regexp_count_substring('Hello'::str, null::str) = null::i64
+regexp_count_substring(null::str, ' '::str) = null::i64
 
 # metacharacters: Examples with metacharacters
-regexp_count_substring('abc1abc'::str, '\d'::str, 1::i64) = 1::i64
-regexp_count_substring('abc1abc'::str, '\D'::str, 1::i64) = 6::i64
-regexp_count_substring('abc def ghi'::str, '\s'::str, 1::i64) = 2::i64
-regexp_count_substring('abc def ghi'::str, '\S'::str, 1::i64) = 9::i64
-regexp_count_substring('abc def ghi'::str, '\w'::str, 1::i64) = 9::i64
-regexp_count_substring('abc def ghi,'::str, '\W'::str, 1::i64) = 3::i64
+regexp_count_substring('abc1abc'::str, '\d'::str) = 1::i64
+regexp_count_substring('abc1abc'::str, '\D'::str) = 6::i64
+regexp_count_substring('abc def ghi'::str, '\s'::str) = 2::i64
+regexp_count_substring('abc def ghi'::str, '\S'::str) = 9::i64
+regexp_count_substring('abc def ghi'::str, '\w'::str) = 9::i64
+regexp_count_substring('abc def ghi,'::str, '\W'::str) = 3::i64
 
 # lookahead: Examples with lookahead
-regexp_count_substring('100 dollars 100 dollars'::str, '\d+(?= dollars)'::str, 1::i64) [lookaround:TRUE] = 2::i64
+regexp_count_substring('100 dollars 100 dollars'::str, '\d+(?= dollars)'::str) [lookaround:TRUE] = 2::i64
 
 # negative_lookahead: Examples with negative lookahead
-regexp_count_substring('100 pesos, 99 pesos, 98 pesos'::str, '\d+(?!\d| dollars)'::str, 1::i64) [lookaround:TRUE] = 3::i64
+regexp_count_substring('100 pesos, 99 pesos, 98 pesos'::str, '\d+(?!\d| dollars)'::str) [lookaround:TRUE] = 3::i64
 
 # lookbehind: Examples with lookbehind
-regexp_count_substring('USD100'::str, '(?<=USD)\d{3}'::str, 1::i64) [lookaround:TRUE] = 1::i64
+regexp_count_substring('USD100'::str, '(?<=USD)\d{3}'::str) [lookaround:TRUE] = 1::i64
 
 # negative_lookbehind: Examples with negative lookbehind
-regexp_count_substring('JPY100JPY100'::str, '\d{3}(?<!USD\d{3})'::str, 1::i64) [lookaround:TRUE] = 2::i64
+regexp_count_substring('JPY100JPY100'::str, '\d{3}(?<!USD\d{3})'::str) [lookaround:TRUE] = 2::i64
diff --git a/tests/cases/string/regexp_match_substring.test b/tests/cases/string/regexp_match_substring.test
@@ -2,34 +2,34 @@
 ### SUBSTRAIT_INCLUDE: '/extensions/functions_string.yaml'
 
 # basic: Basic examples without any special cases
-regexp_match_substring('foobarboopzoo'::str, 'o{1,}'::str, 1::i64, 1::i64, 0::i64) = 'oo'::str
-regexp_match_substring('foobarboopzoo'::str, 'o{1}'::str, 1::i64, 1::i64, 0::i64) = 'o'::str
-regexp_match_substring('abcabcacb'::str, '[bc]'::str, 1::i64, 1::i64, 0::i64) = 'b'::str
-regexp_match_substring('abcdefghi'::str, '(.*)c'::str, 1::i64, 1::i64, 0::i64) = 'abc'::str
-regexp_match_substring('abcdefghi'::str, '(.*)c?'::str, 1::i64, 1::i64, 0::i64) = 'abcdefghi'::str
+regexp_match_substring('foobarboopzoo'::str, 'o{1,}'::str) = 'oo'::str
+regexp_match_substring('foobarboopzoo'::str, 'o{1}'::str) = 'o'::str
+regexp_match_substring('abcabcacb'::str, '[bc]'::str) = 'b'::str
+regexp_match_substring('abcdefghi'::str, '(.*)c'::str) = 'abc'::str
+regexp_match_substring('abcdefghi'::str, '(.*)c?'::str) = 'abcdefghi'::str
 
 # null_input: Examples with null as input
-regexp_match_substring('Hello'::str, null::str, 1::i64, 1::i64, 0::i64) = null::str
-regexp_match_substring(null::str, ' '::str, 1::i64, 1::i64, 0::i64) = null::str
+regexp_match_substring('Hello'::str, null::str) = null::str
+regexp_match_substring(null::str, ' '::str) = null::str
 
 # metacharacters: Examples with metacharacters
-regexp_match_substring('abc1abc'::str, '\d'::str, 1::i64, 1::i64, 0::i64) = '1'::str
-regexp_match_substring('abc1abc'::str, '\D'::str, 1::i64, 1::i64, 0::i64) = 'a'::str
-regexp_match_substring('abc def ghi'::str, '\s'::str, 1::i64, 1::i64, 0::i64) = ' '::str
-regexp_match_substring('abc def ghi'::str, '\S'::str, 1::i64, 1::i64, 0::i64) = 'a'::str
-regexp_match_substring('abc def ghi'::str, '\S+'::str, 1::i64, 1::i64, 0::i64) = 'abc'::str
-regexp_match_substring('abc def ghi'::str, '\w'::str, 1::i64, 1::i64, 0::i64) = 'a'::str
-regexp_match_substring('abc def ghi'::str, '\w+'::str, 1::i64, 1::i64, 0::i64) = 'abc'::str
-regexp_match_substring('abc def ghi,'::str, '\W'::str, 1::i64, 1::i64, 0::i64) = ' '::str
+regexp_match_substring('abc1abc'::str, '\d'::str) = '1'::str
+regexp_match_substring('abc1abc'::str, '\D'::str) = 'a'::str
+regexp_match_substring('abc def ghi'::str, '\s'::str) = ' '::str
+regexp_match_substring('abc def ghi'::str, '\S'::str) = 'a'::str
+regexp_match_substring('abc def ghi'::str, '\S+'::str) = 'abc'::str
+regexp_match_substring('abc def ghi'::str, '\w'::str) = 'a'::str
+regexp_match_substring('abc def ghi'::str, '\w+'::str) = 'abc'::str
+regexp_match_substring('abc def ghi,'::str, '\W'::str) = ' '::str
 
 # lookahead: Examples with lookahead
-regexp_match_substring('100 dollars'::str, '\d+(?= dollars)'::str, 1::i64, 1::i64, 0::i64) [lookaround:TRUE] = '100'::str
+regexp_match_substring('100 dollars'::str, '\d+(?= dollars)'::str) [lookaround:TRUE] = '100'::str
 
 # negative_lookahead: Examples with negative lookahead
-regexp_match_substring('100 pesos'::str, '\d+(?!\d| dollars)'::str, 1::i64, 1::i64, 0::i64) [lookaround:TRUE] = '100'::str
+regexp_match_substring('100 pesos'::str, '\d+(?!\d| dollars)'::str) [lookaround:TRUE] = '100'::str
 
 # lookbehind: Examples with lookbehind
-regexp_match_substring('USD100'::str, '(?<=USD)\d{3}'::str, 1::i64, 1::i64, 0::i64) [lookaround:TRUE] = '100'::str
+regexp_match_substring('USD100'::str, '(?<=USD)\d{3}'::str) [lookaround:TRUE] = '100'::str
 
 # negative_lookbehind: Examples with negative lookbehind
-regexp_match_substring('JPY100'::str, '\d{3}(?<!USD\d{3})'::str, 1::i64, 1::i64, 0::i64) [lookaround:TRUE] = '100'::str
+regexp_match_substring('JPY100'::str, '\d{3}(?<!USD\d{3})'::str) [lookaround:TRUE] = '100'::str
diff --git a/tests/cases/string/regexp_replace.test b/tests/cases/string/regexp_replace.test
@@ -2,49 +2,49 @@
 ### SUBSTRAIT_INCLUDE: '/extensions/functions_string.yaml'
 
 # basic: Basic examples without any special cases
-regexp_replace('user@example.com'::str, '^\S+@\S+$'::str, 'email_found'::str, 1::i64, 0::i64) = 'email_found'::str
-regexp_replace('17:50'::str, '[0-9]?[0-9]:[0-9][0-9]'::str, 'TIME'::str, 1::i64, 0::i64) = 'TIME'::str
+regexp_replace('user@example.com'::str, '^\S+@\S+$'::str, 'email_found'::str) = 'email_found'::str
+regexp_replace('17:50'::str, '[0-9]?[0-9]:[0-9][0-9]'::str, 'TIME'::str) = 'TIME'::str
 
 # lazy_matching: Examples with lazy matching
-regexp_replace('Hello'::str, 'Hel+?'::str, '1'::str, 1::i64, 0::i64) = '1lo'::str
-regexp_replace('Hello'::str, 'Hel+'::str, '1'::str, 1::i64, 0::i64) = '1o'::str
+regexp_replace('Hello'::str, 'Hel+?'::str, '1'::str) = '1lo'::str
+regexp_replace('Hello'::str, 'Hel+'::str, '1'::str) = '1o'::str
 
 # greedy_matching: Examples with greedy matching
-regexp_replace('Hello'::str, 'Hel+'::str, '1'::str, 1::i64, 0::i64) = '1o'::str
-regexp_replace('Helo'::str, 'Hel+'::str, '1'::str, 1::i64, 0::i64) = '1o'::str
+regexp_replace('Hello'::str, 'Hel+'::str, '1'::str) = '1o'::str
+regexp_replace('Helo'::str, 'Hel+'::str, '1'::str) = '1o'::str
 
 # null_input: Examples with null as input
-regexp_replace('Hello'::str, null::str, '1'::str, 1::i64, 0::i64) = null::str
-regexp_replace(null::str, ' '::str, '1'::str, 1::i64, 0::i64) = null::str
+regexp_replace('Hello'::str, null::str, '1'::str) = null::str
+regexp_replace(null::str, ' '::str, '1'::str) = null::str
 
 # position_anchors: Examples with position anchors
-regexp_replace('abcdefg'::str, '\Aabc'::str, '111'::str, 1::i64, 0::i64) = '111defg'::str
-regexp_replace('abcdefg'::str, 'efg$'::str, '111'::str, 1::i64, 0::i64) = 'abcd111'::str
-regexp_replace('catdogdog'::str, '^cat'::str, 'dog'::str, 1::i64, 0::i64) = 'dogdogdog'::str
-regexp_replace('dogcatdogdog'::str, '^cat'::str, 'dog'::str, 1::i64, 0::i64) = 'dogcatdogdog'::str
+regexp_replace('abcdefg'::str, '\Aabc'::str, '111'::str) = '111defg'::str
+regexp_replace('abcdefg'::str, 'efg$'::str, '111'::str) = 'abcd111'::str
+regexp_replace('catdogdog'::str, '^cat'::str, 'dog'::str) = 'dogdogdog'::str
+regexp_replace('dogcatdogdog'::str, '^cat'::str, 'dog'::str) = 'dogcatdogdog'::str
 
 # metacharacters: Examples with metacharacters
-regexp_replace('abc1abc'::str, '\d'::str, ''::str, 1::i64, 0::i64) = 'abcabc'::str
-regexp_replace('111a111'::str, '\D'::str, ''::str, 1::i64, 0::i64) = '111111'::str
-regexp_replace('abc def'::str, '\s'::str, ''::str, 1::i64, 0::i64) = 'abcdef'::str
-regexp_replace('a bcdef'::str, '\S'::str, ','::str, 1::i64, 0::i64) = ', bcdef'::str
-regexp_replace(' abcdef'::str, '\w'::str, '1'::str, 1::i64, 0::i64) = ' 1bcdef'::str
-regexp_replace('a bcdef'::str, '\W'::str, 'a'::str, 1::i64, 0::i64) = 'aabcdef'::str
+regexp_replace('abc1abc'::str, '\d'::str, ''::str) = 'abcabc'::str
+regexp_replace('111a111'::str, '\D'::str, ''::str) = '111111'::str
+regexp_replace('abc def'::str, '\s'::str, ''::str) = 'abcdef'::str
+regexp_replace('a bcdef'::str, '\S'::str, ','::str) = ', bcdef'::str
+regexp_replace(' abcdef'::str, '\w'::str, '1'::str) = ' 1bcdef'::str
+regexp_replace('a bcdef'::str, '\W'::str, 'a'::str) = 'aabcdef'::str
 
 # occurrence_indicator: Examples with occurrence indicators
-regexp_replace('abc123abc'::str, '[0-9]+'::str, 'abc'::str, 1::i64, 0::i64) = 'abcabcabc'::str
-regexp_replace('abcabcabc'::str, '[bc]'::str, 'dd'::str, 1::i64, 0::i64) = 'addcabcabc'::str
-regexp_replace('abc'::str, '(.*)c'::str, '\1e'::str, 1::i64, 0::i64) = 'abe'::str
-regexp_replace('abbbbc'::str, '[b]{2,3}'::str, 'd'::str, 1::i64, 0::i64) = 'adbc'::str
+regexp_replace('abc123abc'::str, '[0-9]+'::str, 'abc'::str) = 'abcabcabc'::str
+regexp_replace('abcabcabc'::str, '[bc]'::str, 'dd'::str) = 'addcabcabc'::str
+regexp_replace('abc'::str, '(.*)c'::str, '\1e'::str) = 'abe'::str
+regexp_replace('abbbbc'::str, '[b]{2,3}'::str, 'd'::str) = 'adbc'::str
 
 # lookahead: Examples with lookahead
-regexp_replace('100 dollars'::str, '\d+(?= dollars)'::str, 'hundred'::str, 1::i64, 0::i64) [lookaround:TRUE] = 'hundred dollars'::str
+regexp_replace('100 dollars'::str, '\d+(?= dollars)'::str, 'hundred'::str) [lookaround:TRUE] = 'hundred dollars'::str
 
 # negative_lookahead: Examples with negative lookahead
-regexp_replace('100 pesos'::str, '\d+(?!\d| dollars)'::str, '999'::str, 1::i64, 0::i64) [lookaround:TRUE] = '999 pesos'::str
+regexp_replace('100 pesos'::str, '\d+(?!\d| dollars)'::str, '999'::str) [lookaround:TRUE] = '999 pesos'::str
 
 # lookbehind: Examples with lookbehind
-regexp_replace('USD100'::str, '(?<=USD)\d{3}'::str, '999'::str, 1::i64, 0::i64) [lookaround:TRUE] = 'USD999'::str
+regexp_replace('USD100'::str, '(?<=USD)\d{3}'::str, '999'::str) [lookaround:TRUE] = 'USD999'::str
 
 # negative_lookbehind: Examples with negative lookbehind
-regexp_replace('JPY100'::str, '\d{3}(?<!USD\d{3})'::str, '999'::str, 1::i64, 0::i64) [lookaround:TRUE] = 'JPY999'::str
+regexp_replace('JPY100'::str, '\d{3}(?<!USD\d{3})'::str, '999'::str) [lookaround:TRUE] = 'JPY999'::str
diff --git a/tests/test_extensions.py b/tests/test_extensions.py
@@ -24,15 +24,15 @@ def test_substrait_extension_coverage():
     all_test_files = load_all_testcases(test_case_dir)
     coverage = get_test_coverage(all_test_files, registry)
 
-    assert coverage.test_count >= 1018
+    assert coverage.test_count >= 1017
     assert (
         coverage.num_tests_with_no_matching_function == 0
     ), f"{coverage.num_tests_with_no_matching_function} tests with no matching function"
     assert coverage.num_covered_function_variants >= 223
     assert coverage.total_function_variants >= 510
     assert (
         coverage.total_function_variants - coverage.num_covered_function_variants
-    ) <= 287, (
+    ) <= 289, (
         f"Coverage gap too large: {coverage.total_function_variants - coverage.num_covered_function_variants} "
         f"function variants with no tests, out of {coverage.total_function_variants} total function variants."
     )