Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add pluggable regex engine support #837

Merged
merged 19 commits into from
Jan 22, 2025
28 changes: 25 additions & 3 deletions fuzz/url_pattern.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,32 @@
#include "ada.cpp"
#include "ada.h"

extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
// Maps every byte of `source` onto an ASCII letter or digit.
//
// Each input byte is interpreted as an unsigned value and reduced modulo the
// alphabet size (26 lowercase + 26 uppercase + 10 digits = 62), so the output
// has exactly the same length as the input and only contains [a-zA-Z0-9].
//
// @param source arbitrary bytes
// @return a string of source.size() alphanumeric characters
std::string bytesToAlphanumeric(const std::string& source) {
  static const char kAlphabet[] =
      "abcdefghijklmnopqrstuvwxyz"
      "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
      "0123456789";
  // sizeof counts the trailing NUL of the literal; exclude it.
  const size_t alphabet_size = sizeof(kAlphabet) - 1;

  // Allocate the output once and overwrite each slot in place.
  std::string mapped(source.size(), '\0');
  for (size_t pos = 0; pos < source.size(); ++pos) {
    // Go through unsigned char so bytes >= 0x80 never yield a negative index.
    const unsigned char raw = static_cast<unsigned char>(source[pos]);
    mapped[pos] = kAlphabet[raw % alphabet_size];
  }
  return mapped;
}

extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
FuzzedDataProvider fdp(data, size);
std::string source = fdp.ConsumeRandomLengthString(256);
std::string base_source = fdp.ConsumeRandomLengthString(256);
// We do not want to trigger arbitrary regex matching.
std::string source =
"/" + bytesToAlphanumeric(fdp.ConsumeRandomLengthString(50)) + "/" +
bytesToAlphanumeric(fdp.ConsumeRandomLengthString(50));
std::string base_source =
"/" + bytesToAlphanumeric(fdp.ConsumeRandomLengthString(50)) + "/" +
bytesToAlphanumeric(fdp.ConsumeRandomLengthString(50));

// Without base or options
auto result = ada::parse_url_pattern(source, nullptr, nullptr);
Expand Down
5 changes: 3 additions & 2 deletions fuzz/url_pattern.options
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
[libfuzzer]
dict = url.dict
max_len = 512
rss_limit_mb = 3000
max_len = 100
rss_limit_mb = 16000
timeout = 60
1 change: 1 addition & 0 deletions include/ada.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include "ada/url_pattern-inl.h"
#include "ada/url_pattern_helpers.h"
#include "ada/url_pattern_helpers-inl.h"
#include "ada/url_pattern_regex.h"

// Public API
#include "ada/ada_version.h"
Expand Down
17 changes: 10 additions & 7 deletions include/ada/implementation.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,8 @@

#include "ada/parser.h"
#include "ada/common_defs.h"
#include "ada/encoding_type.h"
#include "ada/url.h"
#include "ada/state.h"
#include "ada/url_aggregator.h"
#include "ada/url_pattern_regex.h"

namespace ada {
enum class errors : uint8_t { type_error };
Expand Down Expand Up @@ -56,12 +54,17 @@ bool can_parse(std::string_view input,
* @param input valid UTF-8 string or URLPatternInit struct
* @param base_url an optional valid UTF-8 string
* @param options an optional url_pattern_options struct
* @param provider an optional regex provider instance. If not provided, it
* will use ada::url_pattern_regex::std_regex_provider
* @return url_pattern instance
*/
ada_warn_unused tl::expected<url_pattern, errors> parse_url_pattern(
std::variant<std::string_view, url_pattern_init> input,
const std::string_view* base_url = nullptr,
const url_pattern_options* options = nullptr);
template <url_pattern_regex::regex_concept regex_provider =
ada::url_pattern_regex::std_regex_provider>
ada_warn_unused tl::expected<url_pattern<regex_provider>, errors>
parse_url_pattern(std::variant<std::string_view, url_pattern_init> input,
const std::string_view* base_url = nullptr,
const url_pattern_options* options = nullptr,
std::optional<regex_provider> provider = std::nullopt);

/**
* Computes a href string from a file path. The function assumes
Expand Down
8 changes: 6 additions & 2 deletions include/ada/parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,15 @@
#include <variant>

#include "ada/expected.h"
#include "ada/url_pattern_regex.h"

/**
* @private
*/
namespace ada {
struct url_aggregator;
struct url;
template <url_pattern_regex::regex_concept regex_provider>
class url_pattern;
struct url_pattern_options;
struct url_pattern_init;
Expand Down Expand Up @@ -51,9 +53,11 @@ extern template url_aggregator parse_url_impl<url_aggregator>(
extern template url parse_url_impl<url>(std::string_view user_input,
const url* base_url);

tl::expected<url_pattern, errors> parse_url_pattern_impl(
template <url_pattern_regex::regex_concept regex_provider>
tl::expected<url_pattern<regex_provider>, errors> parse_url_pattern_impl(
std::variant<std::string_view, url_pattern_init> input,
const std::string_view* base_url, const url_pattern_options* options);
const std::string_view* base_url, const url_pattern_options* options,
regex_provider&& provider);

} // namespace ada::parser

Expand Down
8 changes: 5 additions & 3 deletions include/ada/url_aggregator.h
Original file line number Diff line number Diff line change
Expand Up @@ -222,9 +222,11 @@ struct url_aggregator : url_base {
friend url_aggregator parser::parse_url_impl<url_aggregator, false>(
std::string_view, const url_aggregator *);
// url_pattern methods
friend tl::expected<url_pattern, errors> parse_url_pattern_impl(
std::variant<std::string_view, url_pattern_init> input,
const std::string_view *base_url, const url_pattern_options *options);
template <url_pattern_regex::regex_concept regex_provider>
friend tl::expected<url_pattern<regex_provider>, errors>
parse_url_pattern_impl(std::variant<std::string_view, url_pattern_init> input,
const std::string_view *base_url,
const url_pattern_options *options);

std::string buffer{};
url_components components{};
Expand Down
76 changes: 45 additions & 31 deletions include/ada/url_pattern-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ inline bool url_pattern_component_result::operator==(
return input == other.input && groups == other.groups;
}

inline std::string url_pattern_component::to_string() const {
template <url_pattern_regex::regex_concept regex_provider>
std::string url_pattern_component<regex_provider>::to_string() const {
#ifdef ADA_HAS_FORMAT
return std::format(R"({{"pattern": "{}", "has_regexp_groups": {}}})", pattern,
has_regexp_groups ? "true" : "false" //,
Expand All @@ -34,43 +35,38 @@ inline std::string url_pattern_component::to_string() const {
#endif
}

inline url_pattern_component_result
url_pattern_component::create_component_match_result(
std::string_view input, const std::smatch& exec_result) {
template <url_pattern_regex::regex_concept regex_provider>
url_pattern_component_result
url_pattern_component<regex_provider>::create_component_match_result(
std::string_view input, std::vector<std::string>&& exec_result) {
// Let result be a new URLPatternComponentResult.
// Set result["input"] to input.
// Let groups be a record<USVString, (USVString or undefined)>.
auto result =
url_pattern_component_result{.input = std::string(input), .groups = {}};

// If input is empty, then groups will always be empty.
if (input.empty()) {
if (input.empty() || exec_result.empty()) {
return result;
}

// Optimization: Let's reserve the size.
result.groups.reserve(exec_result.size() - 1);

size_t group_index = 0;
// Let index be 1.
// While index is less than Get(execResult, "length"):
for (size_t index = 1; index < exec_result.size(); index++) {
// Let name be component’s group name list[index - 1].
// Let value be Get(execResult, ToString(index)).
// Set groups[name] to value.
auto exec = exec_result[index];
if (!exec.matched) continue;
// We explicitly start iterating from 0 even though the spec
// says we should start from 1. This case is handled by the
// std_regex_provider.
for (size_t index = 0; index < exec_result.size(); index++) {
result.groups.insert({
group_name_list[group_index],
exec.str(),
group_name_list[index],
std::move(exec_result[index]),
});

group_index++;
}
return result;
}

inline std::string url_pattern::to_string() const {
template <url_pattern_regex::regex_concept regex_provider>
std::string url_pattern<regex_provider>::to_string() const {
#ifdef ADA_HAS_FORMAT
return std::format(
R"({{"protocol_component": "{}", "username_component": {}, "password_component": {}, "hostname_component": {}, "port_component": {}, "pathname_component": {}, "search_component": {}, "hash_component": {}, "ignore_case": {}}})",
Expand All @@ -84,42 +80,60 @@ inline std::string url_pattern::to_string() const {
#endif
}

inline std::string_view url_pattern::get_protocol() const ada_lifetime_bound {
/**
 * Getter for the protocol part of the pattern.
 * @return a view of protocol_component.pattern
 */
template <url_pattern_regex::regex_concept regex_provider>
std::string_view url_pattern<regex_provider>::get_protocol() const
    ada_lifetime_bound {
  // Spec: the getter yields the protocol component's pattern string.
  const auto& component = protocol_component;
  return component.pattern;
}
inline std::string_view url_pattern::get_username() const ada_lifetime_bound {
/**
 * Getter for the username part of the pattern.
 * @return a view of username_component.pattern
 */
template <url_pattern_regex::regex_concept regex_provider>
std::string_view url_pattern<regex_provider>::get_username() const
    ada_lifetime_bound {
  // Spec: the getter yields the username component's pattern string.
  const auto& component = username_component;
  return component.pattern;
}
inline std::string_view url_pattern::get_password() const ada_lifetime_bound {
/**
 * Getter for the password part of the pattern.
 * @return a view of password_component.pattern
 */
template <url_pattern_regex::regex_concept regex_provider>
std::string_view url_pattern<regex_provider>::get_password() const
    ada_lifetime_bound {
  // Spec: the getter yields the password component's pattern string.
  const auto& component = password_component;
  return component.pattern;
}
inline std::string_view url_pattern::get_hostname() const ada_lifetime_bound {
/**
 * Getter for the hostname part of the pattern.
 * @return a view of hostname_component.pattern
 */
template <url_pattern_regex::regex_concept regex_provider>
std::string_view url_pattern<regex_provider>::get_hostname() const
    ada_lifetime_bound {
  // Spec: the getter yields the hostname component's pattern string.
  const auto& component = hostname_component;
  return component.pattern;
}
inline std::string_view url_pattern::get_port() const ada_lifetime_bound {
/**
 * Getter for the port part of the pattern.
 * @return a view of port_component.pattern
 */
template <url_pattern_regex::regex_concept regex_provider>
std::string_view url_pattern<regex_provider>::get_port() const
    ada_lifetime_bound {
  // Spec: the getter yields the port component's pattern string.
  const auto& component = port_component;
  return component.pattern;
}
inline std::string_view url_pattern::get_pathname() const ada_lifetime_bound {
/**
 * Getter for the pathname part of the pattern.
 * @return a view of pathname_component.pattern
 */
template <url_pattern_regex::regex_concept regex_provider>
std::string_view url_pattern<regex_provider>::get_pathname() const
    ada_lifetime_bound {
  // Spec: the getter yields the pathname component's pattern string.
  const auto& component = pathname_component;
  return component.pattern;
}
inline std::string_view url_pattern::get_search() const ada_lifetime_bound {
/**
 * Getter for the search part of the pattern.
 * @return a view of search_component.pattern
 */
template <url_pattern_regex::regex_concept regex_provider>
std::string_view url_pattern<regex_provider>::get_search() const
    ada_lifetime_bound {
  // Spec: the getter yields the search component's pattern string.
  const auto& component = search_component;
  return component.pattern;
}
inline std::string_view url_pattern::get_hash() const ada_lifetime_bound {
/**
 * Getter for the hash part of the pattern.
 * @return a view of hash_component.pattern
 */
template <url_pattern_regex::regex_concept regex_provider>
std::string_view url_pattern<regex_provider>::get_hash() const
    ada_lifetime_bound {
  // Spec: the getter yields the hash component's pattern string.
  const auto& component = hash_component;
  return component.pattern;
}

inline bool url_pattern::ignore_case() const { return ignore_case_; }

inline bool url_pattern::has_regexp_groups() const {
/**
 * Getter for the ignore-case setting.
 * @return the current value of the ignore_case_ member
 */
template <url_pattern_regex::regex_concept regex_provider>
bool url_pattern<regex_provider>::ignore_case() const {
  const bool flag = ignore_case_;
  return flag;
}
template <url_pattern_regex::regex_concept regex_provider>
bool url_pattern<regex_provider>::has_regexp_groups() const {
// If this's associated URL pattern's has regexp groups, then return true.
return protocol_component.has_regexp_groups ||
username_component.has_regexp_groups ||
Expand Down
Loading
Loading