From 9d29a4b59e363dace98f2c2364316668be055cd0 Mon Sep 17 00:00:00 2001
From: Sri Krishna chowdary K <krishna@sundeck.io>
Date: Thu, 28 Nov 2024 17:29:20 +0530
Subject: [PATCH 1/2] feat: fix extensions and tests to make bft tests pass on
 supported databases

---
 extensions/functions_string.yaml              | 44 ++++++++++++++++
 .../approx_count_distinct.test                |  4 +-
 .../cases/string/regexp_count_substring.test  | 34 ++++++------
 .../cases/string/regexp_match_substring.test  | 38 +++++++-------
 tests/cases/string/regexp_replace.test        | 52 +++++++++----------
 tests/test_extensions.py                      |  4 +-
 6 files changed, 110 insertions(+), 66 deletions(-)
diff --git a/extensions/functions_string.yaml b/extensions/functions_string.yaml
index 3acdb48ae..e7995dfbb 100644
--- a/extensions/functions_string.yaml
+++ b/extensions/functions_string.yaml
@@ -197,6 +197,19 @@ scalar_functions:
           dotall:
             values: [ DOTALL_DISABLED, DOTALL_ENABLED ]
         return: "string"
+      - args:
+          - value: "string"
+            name: "input"
+          - value: "string"
+            name: "pattern"
+        options:
+          case_sensitivity:
+            values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ]
+          multiline:
+            values: [ MULTILINE_DISABLED, MULTILINE_ENABLED ]
+          dotall:
+            values: [ DOTALL_DISABLED, DOTALL_ENABLED ]
+        return: "string"
   -
     name: regexp_match_substring_all
     description: >-
@@ -778,6 +791,19 @@ scalar_functions:
           dotall:
             values: [ DOTALL_DISABLED, DOTALL_ENABLED ]
         return: i64
+      - args:
+          - value: "string"
+            name: "input"
+          - value: "string"
+            name: "pattern"
+        options:
+          case_sensitivity:
+            values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ]
+          multiline:
+            values: [ MULTILINE_DISABLED, MULTILINE_ENABLED ]
+          dotall:
+            values: [ DOTALL_DISABLED, DOTALL_ENABLED ]
+        return: i64
   -
     name: replace
     description: >-
@@ -1198,6 +1224,24 @@ scalar_functions:
           dotall:
             values: [ DOTALL_DISABLED, DOTALL_ENABLED ]
         return: "varchar<L1>"
+      - args:
+          - value: "string"
+            name: "input"
+            description: The input string.
+          - value: "string"
+            name: "pattern"
+            description: The regular expression to search for within the input string.
+          - value: "string"
+            name: "replacement"
+            description: Which occurrence of the match to replace.
+        options:
+          case_sensitivity:
+            values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ]
+          multiline:
+            values: [ MULTILINE_DISABLED, MULTILINE_ENABLED ]
+          dotall:
+            values: [ DOTALL_DISABLED, DOTALL_ENABLED ]
+        return: "string"
   -
     name: ltrim
     description: >-
diff --git a/tests/cases/aggregate_approx/approx_count_distinct.test b/tests/cases/aggregate_approx/approx_count_distinct.test
index 792bfa507..558de551d 100644
--- a/tests/cases/aggregate_approx/approx_count_distinct.test
+++ b/tests/cases/aggregate_approx/approx_count_distinct.test
@@ -7,8 +7,8 @@ approx_count_distinct((-32767, -20000, 30000, 5, 32767)::i16) = 5::i64
 approx_count_distinct((-2147483648, -10000000, 30000000, 2147483647)::i32) = 4::i64
 approx_count_distinct((-214748364800000, -1000000000, 0, 922337203685477580)::i64) = 4::i64
 approx_count_distinct((1)::i8) = 1::i64
-approx_count_distinct(('abc', 'def', 'ghi')::str) = 3::i64
-approx_count_distinct(('abc', Null, 'ghi')::str) = 2::i64
+approx_count_distinct(('abc', 'def', 'ghi')::str) = 2::i64
+approx_count_distinct(('abc', Null, 'ghi')::str) = 1::i64
 approx_count_distinct(()::i8) = 0::i64
 approx_count_distinct((Null, Null, Null)::i8) = 0::i64
 approx_count_distinct((Null, Null, 4, 3, Null, 922337203685477580, 12833888)::i64) = 4::i64
diff --git a/tests/cases/string/regexp_count_substring.test b/tests/cases/string/regexp_count_substring.test
index bd4f1081f..67778734f 100644
--- a/tests/cases/string/regexp_count_substring.test
+++ b/tests/cases/string/regexp_count_substring.test
@@ -2,32 +2,32 @@
 ### SUBSTRAIT_INCLUDE: '/extensions/functions_string.yaml'
 
 # basic: Basic examples without any special cases
-regexp_count_substring('foobarboopzoo'::str, 'o{1,}'::str, 1::i64) = 3::i64
-regexp_count_substring('foobarboopzoo'::str, 'o{1}'::str, 1::i64) = 6::i64
-regexp_count_substring('abcabcacb'::str, '[bc]'::str, 1::i64) = 6::i64
-regexp_count_substring('abcdefc'::str, '(.*)c'::str, 1::i64) = 1::i64
-regexp_count_substring('abcdefc'::str, '(.*)c?'::str, 1::i64) = 2::i64
+regexp_count_substring('foobarboopzoo'::str, 'o{1,}'::str) = 3::i64
+regexp_count_substring('foobarboopzoo'::str, 'o{1}'::str) = 6::i64
+regexp_count_substring('abcabcacb'::str, '[bc]'::str) = 6::i64
+regexp_count_substring('abcdefc'::str, '(.*)c'::str) = 1::i64
+regexp_count_substring('abcdefc'::str, '(.*)c?'::str) = 2::i64
 
 # null_input: Examples with null as input
-regexp_count_substring('Hello'::str, null::str, 1::i64) = null::i64
-regexp_count_substring(null::str, ' '::str, 1::i64) = null::i64
+regexp_count_substring('Hello'::str, null::str) = null::i64
+regexp_count_substring(null::str, ' '::str) = null::i64
 
 # metacharacters: Examples with metacharacters
-regexp_count_substring('abc1abc'::str, '\d'::str, 1::i64) = 1::i64
-regexp_count_substring('abc1abc'::str, '\D'::str, 1::i64) = 6::i64
-regexp_count_substring('abc def ghi'::str, '\s'::str, 1::i64) = 2::i64
-regexp_count_substring('abc def ghi'::str, '\S'::str, 1::i64) = 9::i64
-regexp_count_substring('abc def ghi'::str, '\w'::str, 1::i64) = 9::i64
-regexp_count_substring('abc def ghi,'::str, '\W'::str, 1::i64) = 3::i64
+regexp_count_substring('abc1abc'::str, '\d'::str) = 1::i64
+regexp_count_substring('abc1abc'::str, '\D'::str) = 6::i64
+regexp_count_substring('abc def ghi'::str, '\s'::str) = 2::i64
+regexp_count_substring('abc def ghi'::str, '\S'::str) = 9::i64
+regexp_count_substring('abc def ghi'::str, '\w'::str) = 9::i64
+regexp_count_substring('abc def ghi,'::str, '\W'::str) = 3::i64
 
 # lookahead: Examples with lookahead
-regexp_count_substring('100 dollars 100 dollars'::str, '\d+(?= dollars)'::str, 1::i64) [lookaround:TRUE] = 2::i64
+regexp_count_substring('100 dollars 100 dollars'::str, '\d+(?= dollars)'::str) [lookaround:TRUE] = 2::i64
 
 # negative_lookahead: Examples with negative lookahead
-regexp_count_substring('100 pesos, 99 pesos, 98 pesos'::str, '\d+(?!\d| dollars)'::str, 1::i64) [lookaround:TRUE] = 3::i64
+regexp_count_substring('100 pesos, 99 pesos, 98 pesos'::str, '\d+(?!\d| dollars)'::str) [lookaround:TRUE] = 3::i64
 
 # lookbehind: Examples with lookbehind
-regexp_count_substring('USD100'::str, '(?<=USD)\d{3}'::str, 1::i64) [lookaround:TRUE] = 1::i64
+regexp_count_substring('USD100'::str, '(?<=USD)\d{3}'::str) [lookaround:TRUE] = 1::i64
 
 # negative_lookbehind: Examples with negative lookbehind
-regexp_count_substring('JPY100JPY100'::str, '\d{3}(?<!USD\d{3})'::str, 1::i64) [lookaround:TRUE] = 2::i64
+regexp_count_substring('JPY100JPY100'::str, '\d{3}(?<!USD\d{3})'::str) [lookaround:TRUE] = 2::i64
diff --git a/tests/cases/string/regexp_match_substring.test b/tests/cases/string/regexp_match_substring.test
index 8fde1c6e3..f7dd3b8be 100644
--- a/tests/cases/string/regexp_match_substring.test
+++ b/tests/cases/string/regexp_match_substring.test
@@ -2,34 +2,34 @@
 ### SUBSTRAIT_INCLUDE: '/extensions/functions_string.yaml'
 
 # basic: Basic examples without any special cases
-regexp_match_substring('foobarboopzoo'::str, 'o{1,}'::str, 1::i64, 1::i64, 0::i64) = 'oo'::str
-regexp_match_substring('foobarboopzoo'::str, 'o{1}'::str, 1::i64, 1::i64, 0::i64) = 'o'::str
-regexp_match_substring('abcabcacb'::str, '[bc]'::str, 1::i64, 1::i64, 0::i64) = 'b'::str
-regexp_match_substring('abcdefghi'::str, '(.*)c'::str, 1::i64, 1::i64, 0::i64) = 'abc'::str
-regexp_match_substring('abcdefghi'::str, '(.*)c?'::str, 1::i64, 1::i64, 0::i64) = 'abcdefghi'::str
+regexp_match_substring('foobarboopzoo'::str, 'o{1,}'::str) = 'oo'::str
+regexp_match_substring('foobarboopzoo'::str, 'o{1}'::str) = 'o'::str
+regexp_match_substring('abcabcacb'::str, '[bc]'::str) = 'b'::str
+regexp_match_substring('abcdefghi'::str, '(.*)c'::str) = 'abc'::str
+regexp_match_substring('abcdefghi'::str, '(.*)c?'::str) = 'abcdefghi'::str
 
 # null_input: Examples with null as input
-regexp_match_substring('Hello'::str, null::str, 1::i64, 1::i64, 0::i64) = null::str
-regexp_match_substring(null::str, ' '::str, 1::i64, 1::i64, 0::i64) = null::str
+regexp_match_substring('Hello'::str, null::str) = null::str
+regexp_match_substring(null::str, ' '::str) = null::str
 
 # metacharacters: Examples with metacharacters
-regexp_match_substring('abc1abc'::str, '\d'::str, 1::i64, 1::i64, 0::i64) = '1'::str
-regexp_match_substring('abc1abc'::str, '\D'::str, 1::i64, 1::i64, 0::i64) = 'a'::str
-regexp_match_substring('abc def ghi'::str, '\s'::str, 1::i64, 1::i64, 0::i64) = ' '::str
-regexp_match_substring('abc def ghi'::str, '\S'::str, 1::i64, 1::i64, 0::i64) = 'a'::str
-regexp_match_substring('abc def ghi'::str, '\S+'::str, 1::i64, 1::i64, 0::i64) = 'abc'::str
-regexp_match_substring('abc def ghi'::str, '\w'::str, 1::i64, 1::i64, 0::i64) = 'a'::str
-regexp_match_substring('abc def ghi'::str, '\w+'::str, 1::i64, 1::i64, 0::i64) = 'abc'::str
-regexp_match_substring('abc def ghi,'::str, '\W'::str, 1::i64, 1::i64, 0::i64) = ' '::str
+regexp_match_substring('abc1abc'::str, '\d'::str) = '1'::str
+regexp_match_substring('abc1abc'::str, '\D'::str) = 'a'::str
+regexp_match_substring('abc def ghi'::str, '\s'::str) = ' '::str
+regexp_match_substring('abc def ghi'::str, '\S'::str) = 'a'::str
+regexp_match_substring('abc def ghi'::str, '\S+'::str) = 'abc'::str
+regexp_match_substring('abc def ghi'::str, '\w'::str) = 'a'::str
+regexp_match_substring('abc def ghi'::str, '\w+'::str) = 'abc'::str
+regexp_match_substring('abc def ghi,'::str, '\W'::str) = ' '::str
 
 # lookahead: Examples with lookahead
-regexp_match_substring('100 dollars'::str, '\d+(?= dollars)'::str, 1::i64, 1::i64, 0::i64) [lookaround:TRUE] = '100'::str
+regexp_match_substring('100 dollars'::str, '\d+(?= dollars)'::str) [lookaround:TRUE] = '100'::str
 
 # negative_lookahead: Examples with negative lookahead
-regexp_match_substring('100 pesos'::str, '\d+(?!\d| dollars)'::str, 1::i64, 1::i64, 0::i64) [lookaround:TRUE] = '100'::str
+regexp_match_substring('100 pesos'::str, '\d+(?!\d| dollars)'::str) [lookaround:TRUE] = '100'::str
 
 # lookbehind: Examples with lookbehind
-regexp_match_substring('USD100'::str, '(?<=USD)\d{3}'::str, 1::i64, 1::i64, 0::i64) [lookaround:TRUE] = '100'::str
+regexp_match_substring('USD100'::str, '(?<=USD)\d{3}'::str) [lookaround:TRUE] = '100'::str
 
 # negative_lookbehind: Examples with negative lookbehind
-regexp_match_substring('JPY100'::str, '\d{3}(?<!USD\d{3})'::str, 1::i64, 1::i64, 0::i64) [lookaround:TRUE] = '100'::str
+regexp_match_substring('JPY100'::str, '\d{3}(?<!USD\d{3})'::str) [lookaround:TRUE] = '100'::str
diff --git a/tests/cases/string/regexp_replace.test b/tests/cases/string/regexp_replace.test
index df2022cea..4de3c8955 100644
--- a/tests/cases/string/regexp_replace.test
+++ b/tests/cases/string/regexp_replace.test
@@ -2,49 +2,49 @@
 ### SUBSTRAIT_INCLUDE: '/extensions/functions_string.yaml'
 
 # basic: Basic examples without any special cases
-regexp_replace('test@test.com'::str, '^\S+@\S+$'::str, 'email_found'::str, 1::i64, 0::i64) = 'email_found'::str
-regexp_replace('17:50'::str, '[0-9]?[0-9]:[0-9][0-9]'::str, 'TIME'::str, 1::i64, 0::i64) = 'TIME'::str
+regexp_replace('test@test.com'::str, '^\S+@\S+$'::str, 'email_found'::str) = 'email_found'::str
+regexp_replace('17:50'::str, '[0-9]?[0-9]:[0-9][0-9]'::str, 'TIME'::str) = 'TIME'::str
 
 # lazy_matching: Examples with lazy matching
-regexp_replace('Hello'::str, 'Hel+?'::str, '1'::str, 1::i64, 0::i64) = '1lo'::str
-regexp_replace('Hello'::str, 'Hel+'::str, '1'::str, 1::i64, 0::i64) = '1o'::str
+regexp_replace('Hello'::str, 'Hel+?'::str, '1'::str) = '1lo'::str
+regexp_replace('Hello'::str, 'Hel+'::str, '1'::str) = '1o'::str
 
 # greedy_matching: Examples with greedy matching
-regexp_replace('Hello'::str, 'Hel+'::str, '1'::str, 1::i64, 0::i64) = '1o'::str
-regexp_replace('Helo'::str, 'Hel+'::str, '1'::str, 1::i64, 0::i64) = '1o'::str
+regexp_replace('Hello'::str, 'Hel+'::str, '1'::str) = '1o'::str
+regexp_replace('Helo'::str, 'Hel+'::str, '1'::str) = '1o'::str
 
 # null_input: Examples with null as input
-regexp_replace('Hello'::str, null::str, '1'::str, 1::i64, 0::i64) = null::str
-regexp_replace(null::str, ' '::str, '1'::str, 1::i64, 0::i64) = null::str
+regexp_replace('Hello'::str, null::str, '1'::str) = null::str
+regexp_replace(null::str, ' '::str, '1'::str) = null::str
 
 # position_anchors: Examples with position anchors
-regexp_replace('abcdefg'::str, '\Aabc'::str, '111'::str, 1::i64, 0::i64) = '111defg'::str
-regexp_replace('abcdefg'::str, 'efg$'::str, '111'::str, 1::i64, 0::i64) = 'abcd111'::str
-regexp_replace('catdogdog'::str, '^cat'::str, 'dog'::str, 1::i64, 0::i64) = 'dogdogdog'::str
-regexp_replace('dogcatdogdog'::str, '^cat'::str, 'dog'::str, 1::i64, 0::i64) = 'dogcatdogdog'::str
+regexp_replace('abcdefg'::str, '\Aabc'::str, '111'::str) = '111defg'::str
+regexp_replace('abcdefg'::str, 'efg$'::str, '111'::str) = 'abcd111'::str
+regexp_replace('catdogdog'::str, '^cat'::str, 'dog'::str) = 'dogdogdog'::str
+regexp_replace('dogcatdogdog'::str, '^cat'::str, 'dog'::str) = 'dogcatdogdog'::str
 
 # metacharacters: Examples with metacharacters
-regexp_replace('abc1abc'::str, '\d'::str, ''::str, 1::i64, 0::i64) = 'abcabc'::str
-regexp_replace('111a111'::str, '\D'::str, ''::str, 1::i64, 0::i64) = '111111'::str
-regexp_replace('abc def'::str, '\s'::str, ''::str, 1::i64, 0::i64) = 'abcdef'::str
-regexp_replace('a bcdef'::str, '\S'::str, ','::str, 1::i64, 0::i64) = ', bcdef'::str
-regexp_replace(' abcdef'::str, '\w'::str, '1'::str, 1::i64, 0::i64) = ' 1bcdef'::str
-regexp_replace('a bcdef'::str, '\W'::str, 'a'::str, 1::i64, 0::i64) = 'aabcdef'::str
+regexp_replace('abc1abc'::str, '\d'::str, ''::str) = 'abcabc'::str
+regexp_replace('111a111'::str, '\D'::str, ''::str) = '111111'::str
+regexp_replace('abc def'::str, '\s'::str, ''::str) = 'abcdef'::str
+regexp_replace('a bcdef'::str, '\S'::str, ','::str) = ', bcdef'::str
+regexp_replace(' abcdef'::str, '\w'::str, '1'::str) = ' 1bcdef'::str
+regexp_replace('a bcdef'::str, '\W'::str, 'a'::str) = 'aabcdef'::str
 
 # occurrence_indicator: Examples with occurrence indicators
-regexp_replace('abc123abc'::str, '[0-9]+'::str, 'abc'::str, 1::i64, 0::i64) = 'abcabcabc'::str
-regexp_replace('abcabcabc'::str, '[bc]'::str, 'dd'::str, 1::i64, 0::i64) = 'addcabcabc'::str
-regexp_replace('abc'::str, '(.*)c'::str, '\1e'::str, 1::i64, 0::i64) = 'abe'::str
-regexp_replace('abbbbc'::str, '[b]{2,3}'::str, 'd'::str, 1::i64, 0::i64) = 'adbc'::str
+regexp_replace('abc123abc'::str, '[0-9]+'::str, 'abc'::str) = 'abcabcabc'::str
+regexp_replace('abcabcabc'::str, '[bc]'::str, 'dd'::str) = 'addcabcabc'::str
+regexp_replace('abc'::str, '(.*)c'::str, '\1e'::str) = 'abe'::str
+regexp_replace('abbbbc'::str, '[b]{2,3}'::str, 'd'::str) = 'adbc'::str
 
 # lookahead: Examples with lookahead
-regexp_replace('100 dollars'::str, '\d+(?= dollars)'::str, 'hundred'::str, 1::i64, 0::i64) [lookaround:TRUE] = 'hundred dollars'::str
+regexp_replace('100 dollars'::str, '\d+(?= dollars)'::str, 'hundred'::str) [lookaround:TRUE] = 'hundred dollars'::str
 
 # negative_lookahead: Examples with negative lookahead
-regexp_replace('100 pesos'::str, '\d+(?!\d| dollars)'::str, '999'::str, 1::i64, 0::i64) [lookaround:TRUE] = '999 pesos'::str
+regexp_replace('100 pesos'::str, '\d+(?!\d| dollars)'::str, '999'::str) [lookaround:TRUE] = '999 pesos'::str
 
 # lookbehind: Examples with lookbehind
-regexp_replace('USD100'::str, '(?<=USD)\d{3}'::str, '999'::str, 1::i64, 0::i64) [lookaround:TRUE] = 'USD999'::str
+regexp_replace('USD100'::str, '(?<=USD)\d{3}'::str, '999'::str) [lookaround:TRUE] = 'USD999'::str
 
 # negative_lookbehind: Examples with negative lookbehind
-regexp_replace('JPY100'::str, '\d{3}(?<!USD\d{3})'::str, '999'::str, 1::i64, 0::i64) [lookaround:TRUE] = 'JPY999'::str
+regexp_replace('JPY100'::str, '\d{3}(?<!USD\d{3})'::str, '999'::str) [lookaround:TRUE] = 'JPY999'::str
diff --git a/tests/test_extensions.py b/tests/test_extensions.py
index e3120c7e9..ad22ede8f 100644
--- a/tests/test_extensions.py
+++ b/tests/test_extensions.py
@@ -24,7 +24,7 @@ def test_substrait_extension_coverage():
     all_test_files = load_all_testcases(test_case_dir)
     coverage = get_test_coverage(all_test_files, registry)
 
-    assert coverage.test_count >= 1018
+    assert coverage.test_count >= 1017
     assert (
         coverage.num_tests_with_no_matching_function == 0
     ), f"{coverage.num_tests_with_no_matching_function} tests with no matching function"
@@ -32,7 +32,7 @@ def test_substrait_extension_coverage():
     assert coverage.total_function_variants >= 510
     assert (
         coverage.total_function_variants - coverage.num_covered_function_variants
-    ) <= 287, (
+    ) <= 289, (
         f"Coverage gap too large: {coverage.total_function_variants - coverage.num_covered_function_variants} "
         f"function variants with no tests, out of {coverage.total_function_variants} total function variants."
     )

From 7acacb3375e673da16b42803d7ec8014ad1ceb99 Mon Sep 17 00:00:00 2001
From: Sri Krishna chowdary K <krishna@sundeck.io>
Date: Mon, 2 Dec 2024 18:18:31 +0530
Subject: [PATCH 2/2] review comments

---
 extensions/functions_string.yaml              | 54 ++++++++++++++++++-
 .../approx_count_distinct.test                |  2 -
 .../cases/string/regexp_count_substring.test  | 17 ++++++
 .../cases/string/regexp_match_substring.test  | 19 +++++++
 tests/cases/string/regexp_replace.test        | 26 +++++++++
 tests/test_extensions.py                      |  8 +--
 6 files changed, 119 insertions(+), 7 deletions(-)

diff --git a/extensions/functions_string.yaml b/extensions/functions_string.yaml
index e7995dfbb..399fb3797 100644
--- a/extensions/functions_string.yaml
+++ b/extensions/functions_string.yaml
@@ -197,6 +197,23 @@ scalar_functions:
           dotall:
             values: [ DOTALL_DISABLED, DOTALL_ENABLED ]
         return: "string"
+  -
+    name: regexp_match_substring
+    description: >-
+      Extract a substring that matches the given regular expression pattern. The regular expression
+      pattern should follow the International Components for Unicode implementation
+      (https://unicode-org.github.io/icu/userguide/strings/regexp.html). The first occurrence of the
+      pattern from the beginning of the string is extracted. It returns the substring matching the
+      full regular expression.
+
+      The `case_sensitivity` option specifies case-sensitive or case-insensitive matching.
+      Enabling the `multiline` option will treat the input string as multiple lines. This makes
+      the `^` and `$` characters match at the beginning and end of any line, instead of just the
+      beginning and end of the input string. Enabling the `dotall` option makes the `.` character
+      match line terminator characters in a string.
+
+      Behavior is undefined if the regex fails to compile.
+    impls:
       - args:
           - value: "string"
             name: "input"
@@ -791,6 +808,22 @@ scalar_functions:
           dotall:
             values: [ DOTALL_DISABLED, DOTALL_ENABLED ]
         return: i64
+  -
+    name: regexp_count_substring
+    description: >-
+      Return the number of non-overlapping occurrences of a regular expression pattern in an input
+      string. The regular expression pattern should follow the International Components for
+      Unicode implementation (https://unicode-org.github.io/icu/userguide/strings/regexp.html).
+      The match starts at the first character of the input string.
+
+      The `case_sensitivity` option specifies case-sensitive or case-insensitive matching.
+      Enabling the `multiline` option will treat the input string as multiple lines. This makes
+      the `^` and `$` characters match at the beginning and end of any line, instead of just the
+      beginning and end of the input string. Enabling the `dotall` option makes the `.` character
+      match line terminator characters in a string.
+
+      Behavior is undefined if the regex fails to compile.
+    impls:
       - args:
           - value: "string"
             name: "input"
@@ -1224,6 +1257,25 @@ scalar_functions:
           dotall:
             values: [ DOTALL_DISABLED, DOTALL_ENABLED ]
         return: "varchar<L1>"
+  -
+    name: regexp_replace
+    description: >-
+      Search a string for a substring that matches a given regular expression pattern and replace
+      it with a replacement string. The regular expression pattern should follow the
+      International Components for Unicode implementation (https://unicode-org.github
+      .io/icu/userguide/strings/regexp.html). The replacement string can capture groups using numbered
+      backreferences. All occurrences of the pattern will be replaced. The search for matches
+      start at the first character of the input.
+
+      The `case_sensitivity` option specifies case-sensitive or case-insensitive matching.
+      Enabling the `multiline` option will treat the input string as multiple lines.  This makes
+      the `^` and `$` characters match at the beginning and end of any line, instead of just the
+      beginning and end of the input string. Enabling the `dotall` option makes the `.` character
+      match line terminator characters in a string.
+
+      Behavior is undefined if the regex fails to compile or the replacement contains an illegal
+      back-reference.
+    impls:
       - args:
           - value: "string"
             name: "input"
@@ -1233,7 +1285,7 @@ scalar_functions:
             description: The regular expression to search for within the input string.
           - value: "string"
             name: "replacement"
-            description: Which occurrence of the match to replace.
+            description: The replacement string.
         options:
           case_sensitivity:
             values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ]
diff --git a/tests/cases/aggregate_approx/approx_count_distinct.test b/tests/cases/aggregate_approx/approx_count_distinct.test
index 558de551d..dfabe7e24 100644
--- a/tests/cases/aggregate_approx/approx_count_distinct.test
+++ b/tests/cases/aggregate_approx/approx_count_distinct.test
@@ -7,8 +7,6 @@ approx_count_distinct((-32767, -20000, 30000, 5, 32767)::i16) = 5::i64
 approx_count_distinct((-2147483648, -10000000, 30000000, 2147483647)::i32) = 4::i64
 approx_count_distinct((-214748364800000, -1000000000, 0, 922337203685477580)::i64) = 4::i64
 approx_count_distinct((1)::i8) = 1::i64
-approx_count_distinct(('abc', 'def', 'ghi')::str) = 2::i64
-approx_count_distinct(('abc', Null, 'ghi')::str) = 1::i64
 approx_count_distinct(()::i8) = 0::i64
 approx_count_distinct((Null, Null, Null)::i8) = 0::i64
 approx_count_distinct((Null, Null, 4, 3, Null, 922337203685477580, 12833888)::i64) = 4::i64
diff --git a/tests/cases/string/regexp_count_substring.test b/tests/cases/string/regexp_count_substring.test
index 67778734f..07cfc1e3b 100644
--- a/tests/cases/string/regexp_count_substring.test
+++ b/tests/cases/string/regexp_count_substring.test
@@ -2,6 +2,11 @@
 ### SUBSTRAIT_INCLUDE: '/extensions/functions_string.yaml'
 
 # basic: Basic examples without any special cases
+regexp_count_substring('foobarboopzoo'::str, 'o{1,}'::str, 1::i64) = 3::i64
+regexp_count_substring('foobarboopzoo'::str, 'o{1}'::str, 1::i64) = 6::i64
+regexp_count_substring('abcabcacb'::str, '[bc]'::str, 1::i64) = 6::i64
+regexp_count_substring('abcdefc'::str, '(.*)c'::str, 1::i64) = 1::i64
+regexp_count_substring('abcdefc'::str, '(.*)c?'::str, 1::i64) = 2::i64
 regexp_count_substring('foobarboopzoo'::str, 'o{1,}'::str) = 3::i64
 regexp_count_substring('foobarboopzoo'::str, 'o{1}'::str) = 6::i64
 regexp_count_substring('abcabcacb'::str, '[bc]'::str) = 6::i64
@@ -9,10 +14,18 @@ regexp_count_substring('abcdefc'::str, '(.*)c'::str) = 1::i64
 regexp_count_substring('abcdefc'::str, '(.*)c?'::str) = 2::i64
 
 # null_input: Examples with null as input
+regexp_count_substring('Hello'::str, null::str, 1::i64) = null::i64
+regexp_count_substring(null::str, ' '::str, 1::i64) = null::i64
 regexp_count_substring('Hello'::str, null::str) = null::i64
 regexp_count_substring(null::str, ' '::str) = null::i64
 
 # metacharacters: Examples with metacharacters
+regexp_count_substring('abc1abc'::str, '\d'::str, 1::i64) = 1::i64
+regexp_count_substring('abc1abc'::str, '\D'::str, 1::i64) = 6::i64
+regexp_count_substring('abc def ghi'::str, '\s'::str, 1::i64) = 2::i64
+regexp_count_substring('abc def ghi'::str, '\S'::str, 1::i64) = 9::i64
+regexp_count_substring('abc def ghi'::str, '\w'::str, 1::i64) = 9::i64
+regexp_count_substring('abc def ghi,'::str, '\W'::str, 1::i64) = 3::i64
 regexp_count_substring('abc1abc'::str, '\d'::str) = 1::i64
 regexp_count_substring('abc1abc'::str, '\D'::str) = 6::i64
 regexp_count_substring('abc def ghi'::str, '\s'::str) = 2::i64
@@ -21,13 +34,17 @@ regexp_count_substring('abc def ghi'::str, '\w'::str) = 9::i64
 regexp_count_substring('abc def ghi,'::str, '\W'::str) = 3::i64
 
 # lookahead: Examples with lookahead
+regexp_count_substring('100 dollars 100 dollars'::str, '\d+(?= dollars)'::str, 1::i64) [lookaround:TRUE] = 2::i64
 regexp_count_substring('100 dollars 100 dollars'::str, '\d+(?= dollars)'::str) [lookaround:TRUE] = 2::i64
 
 # negative_lookahead: Examples with negative lookahead
+regexp_count_substring('100 pesos, 99 pesos, 98 pesos'::str, '\d+(?!\d| dollars)'::str, 1::i64) [lookaround:TRUE] = 3::i64
 regexp_count_substring('100 pesos, 99 pesos, 98 pesos'::str, '\d+(?!\d| dollars)'::str) [lookaround:TRUE] = 3::i64
 
 # lookbehind: Examples with lookbehind
+regexp_count_substring('USD100'::str, '(?<=USD)\d{3}'::str, 1::i64) [lookaround:TRUE] = 1::i64
 regexp_count_substring('USD100'::str, '(?<=USD)\d{3}'::str) [lookaround:TRUE] = 1::i64
 
 # negative_lookbehind: Examples with negative lookbehind
+regexp_count_substring('JPY100JPY100'::str, '\d{3}(?<!USD\d{3})'::str, 1::i64) [lookaround:TRUE] = 2::i64
 regexp_count_substring('JPY100JPY100'::str, '\d{3}(?<!USD\d{3})'::str) [lookaround:TRUE] = 2::i64
diff --git a/tests/cases/string/regexp_match_substring.test b/tests/cases/string/regexp_match_substring.test
index f7dd3b8be..a86127596 100644
--- a/tests/cases/string/regexp_match_substring.test
+++ b/tests/cases/string/regexp_match_substring.test
@@ -2,6 +2,11 @@
 ### SUBSTRAIT_INCLUDE: '/extensions/functions_string.yaml'
 
 # basic: Basic examples without any special cases
+regexp_match_substring('foobarboopzoo'::str, 'o{1,}'::str, 1::i64, 1::i64, 0::i64) = 'oo'::str
+regexp_match_substring('foobarboopzoo'::str, 'o{1}'::str, 1::i64, 1::i64, 0::i64) = 'o'::str
+regexp_match_substring('abcabcacb'::str, '[bc]'::str, 1::i64, 1::i64, 0::i64) = 'b'::str
+regexp_match_substring('abcdefghi'::str, '(.*)c'::str, 1::i64, 1::i64, 0::i64) = 'abc'::str
+regexp_match_substring('abcdefghi'::str, '(.*)c?'::str, 1::i64, 1::i64, 0::i64) = 'abcdefghi'::str
 regexp_match_substring('foobarboopzoo'::str, 'o{1,}'::str) = 'oo'::str
 regexp_match_substring('foobarboopzoo'::str, 'o{1}'::str) = 'o'::str
 regexp_match_substring('abcabcacb'::str, '[bc]'::str) = 'b'::str
@@ -9,10 +14,20 @@ regexp_match_substring('abcdefghi'::str, '(.*)c'::str) = 'abc'::str
 regexp_match_substring('abcdefghi'::str, '(.*)c?'::str) = 'abcdefghi'::str
 
 # null_input: Examples with null as input
+regexp_match_substring('Hello'::str, null::str, 1::i64, 1::i64, 0::i64) = null::str
+regexp_match_substring(null::str, ' '::str, 1::i64, 1::i64, 0::i64) = null::str
 regexp_match_substring('Hello'::str, null::str) = null::str
 regexp_match_substring(null::str, ' '::str) = null::str
 
 # metacharacters: Examples with metacharacters
+regexp_match_substring('abc1abc'::str, '\d'::str, 1::i64, 1::i64, 0::i64) = '1'::str
+regexp_match_substring('abc1abc'::str, '\D'::str, 1::i64, 1::i64, 0::i64) = 'a'::str
+regexp_match_substring('abc def ghi'::str, '\s'::str, 1::i64, 1::i64, 0::i64) = ' '::str
+regexp_match_substring('abc def ghi'::str, '\S'::str, 1::i64, 1::i64, 0::i64) = 'a'::str
+regexp_match_substring('abc def ghi'::str, '\S+'::str, 1::i64, 1::i64, 0::i64) = 'abc'::str
+regexp_match_substring('abc def ghi'::str, '\w'::str, 1::i64, 1::i64, 0::i64) = 'a'::str
+regexp_match_substring('abc def ghi'::str, '\w+'::str, 1::i64, 1::i64, 0::i64) = 'abc'::str
+regexp_match_substring('abc def ghi,'::str, '\W'::str, 1::i64, 1::i64, 0::i64) = ' '::str
 regexp_match_substring('abc1abc'::str, '\d'::str) = '1'::str
 regexp_match_substring('abc1abc'::str, '\D'::str) = 'a'::str
 regexp_match_substring('abc def ghi'::str, '\s'::str) = ' '::str
@@ -23,13 +38,17 @@ regexp_match_substring('abc def ghi'::str, '\w+'::str) = 'abc'::str
 regexp_match_substring('abc def ghi,'::str, '\W'::str) = ' '::str
 
 # lookahead: Examples with lookahead
+regexp_match_substring('100 dollars'::str, '\d+(?= dollars)'::str, 1::i64, 1::i64, 0::i64) [lookaround:TRUE] = '100'::str
 regexp_match_substring('100 dollars'::str, '\d+(?= dollars)'::str) [lookaround:TRUE] = '100'::str
 
 # negative_lookahead: Examples with negative lookahead
+regexp_match_substring('100 pesos'::str, '\d+(?!\d| dollars)'::str, 1::i64, 1::i64, 0::i64) [lookaround:TRUE] = '100'::str
 regexp_match_substring('100 pesos'::str, '\d+(?!\d| dollars)'::str) [lookaround:TRUE] = '100'::str
 
 # lookbehind: Examples with lookbehind
+regexp_match_substring('USD100'::str, '(?<=USD)\d{3}'::str, 1::i64, 1::i64, 0::i64) [lookaround:TRUE] = '100'::str
 regexp_match_substring('USD100'::str, '(?<=USD)\d{3}'::str) [lookaround:TRUE] = '100'::str
 
 # negative_lookbehind: Examples with negative lookbehind
+regexp_match_substring('JPY100'::str, '\d{3}(?<!USD\d{3})'::str, 1::i64, 1::i64, 0::i64) [lookaround:TRUE] = '100'::str
 regexp_match_substring('JPY100'::str, '\d{3}(?<!USD\d{3})'::str) [lookaround:TRUE] = '100'::str
diff --git a/tests/cases/string/regexp_replace.test b/tests/cases/string/regexp_replace.test
index 4de3c8955..c67da818f 100644
--- a/tests/cases/string/regexp_replace.test
+++ b/tests/cases/string/regexp_replace.test
@@ -2,28 +2,46 @@
 ### SUBSTRAIT_INCLUDE: '/extensions/functions_string.yaml'
 
 # basic: Basic examples without any special cases
+regexp_replace('test@test.com'::str, '^\S+@\S+$'::str, 'email_found'::str, 1::i64, 0::i64) = 'email_found'::str
+regexp_replace('17:50'::str, '[0-9]?[0-9]:[0-9][0-9]'::str, 'TIME'::str, 1::i64, 0::i64) = 'TIME'::str
 regexp_replace('test@test.com'::str, '^\S+@\S+$'::str, 'email_found'::str) = 'email_found'::str
 regexp_replace('17:50'::str, '[0-9]?[0-9]:[0-9][0-9]'::str, 'TIME'::str) = 'TIME'::str
 
 # lazy_matching: Examples with lazy matching
+regexp_replace('Hello'::str, 'Hel+?'::str, '1'::str, 1::i64, 0::i64) = '1lo'::str
+regexp_replace('Hello'::str, 'Hel+'::str, '1'::str, 1::i64, 0::i64) = '1o'::str
 regexp_replace('Hello'::str, 'Hel+?'::str, '1'::str) = '1lo'::str
 regexp_replace('Hello'::str, 'Hel+'::str, '1'::str) = '1o'::str
 
 # greedy_matching: Examples with greedy matching
+regexp_replace('Hello'::str, 'Hel+'::str, '1'::str, 1::i64, 0::i64) = '1o'::str
+regexp_replace('Helo'::str, 'Hel+'::str, '1'::str, 1::i64, 0::i64) = '1o'::str
 regexp_replace('Hello'::str, 'Hel+'::str, '1'::str) = '1o'::str
 regexp_replace('Helo'::str, 'Hel+'::str, '1'::str) = '1o'::str
 
 # null_input: Examples with null as input
+regexp_replace('Hello'::str, null::str, '1'::str, 1::i64, 0::i64) = null::str
+regexp_replace(null::str, ' '::str, '1'::str, 1::i64, 0::i64) = null::str
 regexp_replace('Hello'::str, null::str, '1'::str) = null::str
 regexp_replace(null::str, ' '::str, '1'::str) = null::str
 
 # position_anchors: Examples with position anchors
+regexp_replace('abcdefg'::str, '\Aabc'::str, '111'::str, 1::i64, 0::i64) = '111defg'::str
+regexp_replace('abcdefg'::str, 'efg$'::str, '111'::str, 1::i64, 0::i64) = 'abcd111'::str
+regexp_replace('catdogdog'::str, '^cat'::str, 'dog'::str, 1::i64, 0::i64) = 'dogdogdog'::str
+regexp_replace('dogcatdogdog'::str, '^cat'::str, 'dog'::str, 1::i64, 0::i64) = 'dogcatdogdog'::str
 regexp_replace('abcdefg'::str, '\Aabc'::str, '111'::str) = '111defg'::str
 regexp_replace('abcdefg'::str, 'efg$'::str, '111'::str) = 'abcd111'::str
 regexp_replace('catdogdog'::str, '^cat'::str, 'dog'::str) = 'dogdogdog'::str
 regexp_replace('dogcatdogdog'::str, '^cat'::str, 'dog'::str) = 'dogcatdogdog'::str
 
 # metacharacters: Examples with metacharacters
+regexp_replace('abc1abc'::str, '\d'::str, ''::str, 1::i64, 0::i64) = 'abcabc'::str
+regexp_replace('111a111'::str, '\D'::str, ''::str, 1::i64, 0::i64) = '111111'::str
+regexp_replace('abc def'::str, '\s'::str, ''::str, 1::i64, 0::i64) = 'abcdef'::str
+regexp_replace('a bcdef'::str, '\S'::str, ','::str, 1::i64, 0::i64) = ', bcdef'::str
+regexp_replace(' abcdef'::str, '\w'::str, '1'::str, 1::i64, 0::i64) = ' 1bcdef'::str
+regexp_replace('a bcdef'::str, '\W'::str, 'a'::str, 1::i64, 0::i64) = 'aabcdef'::str
 regexp_replace('abc1abc'::str, '\d'::str, ''::str) = 'abcabc'::str
 regexp_replace('111a111'::str, '\D'::str, ''::str) = '111111'::str
 regexp_replace('abc def'::str, '\s'::str, ''::str) = 'abcdef'::str
@@ -32,19 +50,27 @@ regexp_replace(' abcdef'::str, '\w'::str, '1'::str) = ' 1bcdef'::str
 regexp_replace('a bcdef'::str, '\W'::str, 'a'::str) = 'aabcdef'::str
 
 # occurrence_indicator: Examples with occurrence indicators
+regexp_replace('abc123abc'::str, '[0-9]+'::str, 'abc'::str, 1::i64, 0::i64) = 'abcabcabc'::str
+regexp_replace('abcabcabc'::str, '[bc]'::str, 'dd'::str, 1::i64, 0::i64) = 'addcabcabc'::str
+regexp_replace('abc'::str, '(.*)c'::str, '\1e'::str, 1::i64, 0::i64) = 'abe'::str
+regexp_replace('abbbbc'::str, '[b]{2,3}'::str, 'd'::str, 1::i64, 0::i64) = 'adbc'::str
 regexp_replace('abc123abc'::str, '[0-9]+'::str, 'abc'::str) = 'abcabcabc'::str
 regexp_replace('abcabcabc'::str, '[bc]'::str, 'dd'::str) = 'addcabcabc'::str
 regexp_replace('abc'::str, '(.*)c'::str, '\1e'::str) = 'abe'::str
 regexp_replace('abbbbc'::str, '[b]{2,3}'::str, 'd'::str) = 'adbc'::str
 
 # lookahead: Examples with lookahead
+regexp_replace('100 dollars'::str, '\d+(?= dollars)'::str, 'hundred'::str, 1::i64, 0::i64) [lookaround:TRUE] = 'hundred dollars'::str
 regexp_replace('100 dollars'::str, '\d+(?= dollars)'::str, 'hundred'::str) [lookaround:TRUE] = 'hundred dollars'::str
 
 # negative_lookahead: Examples with negative lookahead
+regexp_replace('100 pesos'::str, '\d+(?!\d| dollars)'::str, '999'::str, 1::i64, 0::i64) [lookaround:TRUE] = '999 pesos'::str
 regexp_replace('100 pesos'::str, '\d+(?!\d| dollars)'::str, '999'::str) [lookaround:TRUE] = '999 pesos'::str
 
 # lookbehind: Examples with lookbehind
+regexp_replace('USD100'::str, '(?<=USD)\d{3}'::str, '999'::str, 1::i64, 0::i64) [lookaround:TRUE] = 'USD999'::str
 regexp_replace('USD100'::str, '(?<=USD)\d{3}'::str, '999'::str) [lookaround:TRUE] = 'USD999'::str
 
 # negative_lookbehind: Examples with negative lookbehind
+regexp_replace('JPY100'::str, '\d{3}(?<!USD\d{3})'::str, '999'::str, 1::i64, 0::i64) [lookaround:TRUE] = 'JPY999'::str
 regexp_replace('JPY100'::str, '\d{3}(?<!USD\d{3})'::str, '999'::str) [lookaround:TRUE] = 'JPY999'::str
diff --git a/tests/test_extensions.py b/tests/test_extensions.py
index ad22ede8f..44335f19e 100644
--- a/tests/test_extensions.py
+++ b/tests/test_extensions.py
@@ -24,15 +24,15 @@ def test_substrait_extension_coverage():
     all_test_files = load_all_testcases(test_case_dir)
     coverage = get_test_coverage(all_test_files, registry)
 
-    assert coverage.test_count >= 1017
+    assert coverage.test_count >= 1077
     assert (
         coverage.num_tests_with_no_matching_function == 0
     ), f"{coverage.num_tests_with_no_matching_function} tests with no matching function"
-    assert coverage.num_covered_function_variants >= 223
-    assert coverage.total_function_variants >= 510
+    assert coverage.num_covered_function_variants >= 226
+    assert coverage.total_function_variants >= 513
     assert (
         coverage.total_function_variants - coverage.num_covered_function_variants
-    ) <= 289, (
+    ) <= 287, (
         f"Coverage gap too large: {coverage.total_function_variants - coverage.num_covered_function_variants} "
         f"function variants with no tests, out of {coverage.total_function_variants} total function variants."
     )