Skip to content

Commit

Permalink
Adding ICU comparison.
Browse files Browse the repository at this point in the history
  • Loading branch information
lemire authored and anonrig committed Feb 14, 2023
1 parent cd22010 commit 699705f
Show file tree
Hide file tree
Showing 3 changed files with 226 additions and 2 deletions.
21 changes: 21 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,27 @@ According to our benchmarks, it can be faster than ICU.
}
```
## Benchmarks
You may build a benchmarking tool with the library as follows under macOS and Linux:
```
cmake -D ADA_IDNA_BENCHMARKS=ON -B build
cmake --build build
./build/benchmarks/to_ascii
```
The commands for users of Visual Studio are slightly different.
Sample result (LLVM 14, Apple M2 processor):
```
---------------------------------------------------------------------
Benchmark Time CPU Iterations UserCounters...
---------------------------------------------------------------------
Ada 1411 ns 1411 ns 491877 speed=50.3364M/s time/byte=19.8664ns time/domain=235.085ns url/s=4.25378M/s
Icu 2405 ns 2405 ns 299564 speed=29.5213M/s time/byte=33.8738ns time/url=400.84ns url/s=2.49476M/s
```
## Contributing
### Git hooks
Expand Down
37 changes: 37 additions & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ target_link_libraries(to_ascii PRIVATE ada-idna)
target_include_directories(to_ascii PUBLIC "$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>")
target_include_directories(to_ascii PUBLIC "$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/benchmarks>")



include(${PROJECT_SOURCE_DIR}/cmake/import.cmake)

set_off(BENCHMARK_ENABLE_TESTING)
Expand All @@ -20,3 +22,38 @@ message(STATUS "Compiler is " ${CMAKE_CXX_COMPILER_ID})
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
message(STATUS "Compiler version " ${CMAKE_CXX_COMPILER_VERSION})
endif()



# On macOS, ICU is usually installed through Homebrew as a keg-only formula,
# so CMake does not find it without help. Register the keg with CMake here
# rather than asking the user to figure that out.
if(CMAKE_SYSTEM_NAME MATCHES "Darwin")
  message(STATUS "Apple system detected.")
  # People who run macOS often use brew. Probe the two locations used above
  # (/opt/homebrew first, then /usr/local) and stop at the first hit.
  foreach(icu_root IN ITEMS /opt/homebrew/opt/icu4c /usr/local/opt/icu4c)
    if(EXISTS "${icu_root}")
      message(STATUS "icu is provided by homebrew at ${icu_root}.")
      # CMAKE_PREFIX_PATH entries are installation prefixes: the find_*
      # commands append include/, lib/ and bin/ themselves, so we register
      # the keg root rather than its include directory.
      list(APPEND CMAKE_PREFIX_PATH "${icu_root}")
      list(APPEND CMAKE_LIBRARY_PATH "${icu_root}/lib")
      break()
    endif()
  endforeach()
endif()

# ICU is optional: when it is found, the to_ascii benchmark links against it
# and is compiled with ICU_AVAILABLE=1; otherwise we only print a hint.
find_package(ICU COMPONENTS uc i18n)
if(ICU_FOUND)
  target_link_libraries(to_ascii PRIVATE ICU::uc ICU::i18n)
  target_compile_definitions(to_ascii PRIVATE ICU_AVAILABLE=1)
else()
  ### If the user does not have ICU, let us help them with instructions:
  if(CMAKE_SYSTEM_NAME MATCHES "Darwin")
    # Homebrew lives under /opt/homebrew on Apple silicon and keeps its
    # repository under /usr/local/Homebrew on Intel Macs; accept either
    # before telling the user to install brew itself.
    if(EXISTS /opt/homebrew OR EXISTS /usr/local/Homebrew)
      message(STATUS "Under macOS, you may install ICU with brew, using 'brew install icu4c'.")
    else()
      message(STATUS "Under macOS, you should install brew (see https://brew.sh) and then icu4c ('brew install icu4c').")
    endif()
  elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux")
    message(STATUS "Under Linux, you may be able to install ICU with a command such as 'apt-get install libicu-dev'.")
  endif()
endif()
170 changes: 168 additions & 2 deletions benchmarks/to_ascii.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@
#include <cstdlib>
#include <iostream>
#include <memory>
#if ICU_AVAILABLE
#include <unicode/utypes.h>
#include <unicode/uidna.h>
#include <unicode/utf8.h>
#endif

#include "idna.h"
#include "performancecounters/event_counter.h"
Expand All @@ -16,6 +21,8 @@ std::string inputs[] = {
"xn--zca.xn--zca",
"xn--mgba3gch31f060k",
"xn--1ch",
"x-.\xc3\x9f",
"\xd9\x86\xd8\xa7\xd9\x85\xd9\x87\xe2\x80\x8c\xd8\xa7\xdb\x8c"
};

double inputs_total_byte = []() -> double {
Expand Down Expand Up @@ -45,6 +52,124 @@ static void Ada(benchmark::State& state) {
event_count allocate_count = collector.end();
aggregate << allocate_count;
}
state.counters["cycles/domain"] = aggregate.best.cycles() / std::size(inputs);
state.counters["instructions/domain"] =
aggregate.best.instructions() / std::size(inputs);
state.counters["instructions/cycle"] =
aggregate.best.instructions() / aggregate.best.cycles();
state.counters["instructions/byte"] =
aggregate.best.instructions() / inputs_total_byte;
state.counters["instructions/ns"] =
aggregate.best.instructions() / aggregate.best.elapsed_ns();
state.counters["GHz"] =
aggregate.best.cycles() / aggregate.best.elapsed_ns();
state.counters["ns/domain"] = aggregate.best.elapsed_ns() / std::size(inputs);
state.counters["cycle/byte"] = aggregate.best.cycles() / inputs_total_byte;
}
state.counters["time/byte"] = benchmark::Counter(
inputs_total_byte, benchmark::Counter::kIsIterationInvariantRate |
benchmark::Counter::kInvert);
state.counters["time/domain"] = benchmark::Counter(
double(std::size(inputs)), benchmark::Counter::kIsIterationInvariantRate |
benchmark::Counter::kInvert);
state.counters["speed"] = benchmark::Counter(
inputs_total_byte, benchmark::Counter::kIsIterationInvariantRate);
state.counters["url/s"] = benchmark::Counter(
double(std::size(inputs)), benchmark::Counter::kIsIterationInvariantRate);
}

BENCHMARK(Ada);

#if ICU_AVAILABLE

// returns empty string on error
std::string icu_to_array(std::string_view input) {
static std::string error = "";

std::string out(255, 0);
constexpr bool be_strict = false;

UErrorCode status = U_ZERO_ERROR;
uint32_t options = UIDNA_CHECK_BIDI | UIDNA_CHECK_CONTEXTJ | UIDNA_NONTRANSITIONAL_TO_ASCII;

if (be_strict) {
options |= UIDNA_USE_STD3_RULES;
}

UIDNA* uidna = uidna_openUTS46(options, &status);
if (U_FAILURE(status)) {
return error;
}

UIDNAInfo info = UIDNA_INFO_INITIALIZER;
// RFC 1035 section 2.3.4.
// The domain name must be at most 255 octets.
// It cannot contain a label longer than 63 octets.
// Thus we should never need more than 255 octets, if we
// do the domain name is in error.
int32_t length = uidna_nameToASCII_UTF8(uidna,
input.data(),
int32_t(input.length()),
out.data(), 255,
&info,
&status);

if (status == U_BUFFER_OVERFLOW_ERROR) {
status = U_ZERO_ERROR;
out.resize(length);
// When be_strict is true, this should not be allowed!
length = uidna_nameToASCII_UTF8(uidna,
input.data(),
int32_t(input.length()),
out.data(), length,
&info,
&status);
}

// A label contains hyphen-minus ('-') in the third and fourth positions.
info.errors &= ~UIDNA_ERROR_HYPHEN_3_4;
// A label starts with a hyphen-minus ('-').
info.errors &= ~UIDNA_ERROR_LEADING_HYPHEN;
// A label ends with a hyphen-minus ('-').
info.errors &= ~UIDNA_ERROR_TRAILING_HYPHEN;

if (!be_strict) { // This seems to violate RFC 1035 section 2.3.4.
// A non-final domain name label (or the whole domain name) is empty.
info.errors &= ~UIDNA_ERROR_EMPTY_LABEL;
// A domain name label is longer than 63 bytes.
info.errors &= ~UIDNA_ERROR_LABEL_TOO_LONG;
// A domain name is longer than 255 bytes in its storage form.
info.errors &= ~UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
}

uidna_close(uidna);

if (U_FAILURE(status) || info.errors != 0 || length == 0) {
return error;
}
out.resize(length); // we possibly want to call :shrink_to_fit otherwise we use 255 bytes.
return out;
}

static void Icu(benchmark::State& state) {
for (auto _ : state) {
for (std::string& url_string : inputs) {
benchmark::DoNotOptimize(icu_to_array(url_string));
}
}

if (collector.has_events()) {
event_aggregate aggregate{};
for (size_t i = 0; i < N; i++) {
std::atomic_thread_fence(std::memory_order_acquire);
collector.start();
for (std::string& url_string : inputs) {
benchmark::DoNotOptimize(icu_to_array(url_string));
}
std::atomic_thread_fence(std::memory_order_release);
event_count allocate_count = collector.end();
aggregate << allocate_count;
}
state.counters["cycles/url"] = aggregate.best.cycles() / std::size(inputs);
state.counters["instructions/url"] =
aggregate.best.instructions() / std::size(inputs);
Expand All @@ -71,6 +196,47 @@ static void Ada(benchmark::State& state) {
double(std::size(inputs)), benchmark::Counter::kIsIterationInvariantRate);
}

BENCHMARK(Ada);
BENCHMARK(Icu);

BENCHMARK_MAIN();
// Runs every benchmark input through both ICU and ada::idna and compares the
// two results. Each disagreement is written to std::cerr; a one-line summary
// is written to std::cout. Returns true when all inputs agree.
bool verify() {
  bool all_match = true;
  for (const std::string& domain : inputs) {
    const std::string from_icu = icu_to_array(domain);
    const std::string from_ada = ada::idna::to_ascii(domain);
    if (from_icu == from_ada) {
      continue;
    }
    std::cerr << " ada/icu mismatch " << from_ada << " vs. " << from_icu
              << std::endl;
    all_match = false;
  }
  if (all_match) {
    std::cout << "ICU and ada/idna agree on all test inputs.\n";
  } else {
    std::cout << "\n\n\nWarning: errors found.\n\n\n\n";
  }
  return all_match;
}
#endif

// Benchmark entry point: optionally cross-checks ada::idna against ICU,
// records information about the inputs and the performance counters in the
// benchmark context, then runs the registered benchmarks.
// Returns 1 when the command line contains arguments that the benchmark
// library does not recognize, 0 otherwise.
int main(int argc, char **argv) {
#if ICU_AVAILABLE
  // A disagreement is reported on the console by verify() but does not
  // prevent the benchmarks from running.
  verify();
#endif
#if (__APPLE__ && __aarch64__) || defined(__linux__)
  if (!collector.has_events()) {
    benchmark::AddCustomContext("performance counters",
                                "No privileged access (sudo may help).");
  }
#else
  if (!collector.has_events()) {
    benchmark::AddCustomContext("performance counters", "Unsupported system.");
  }
#endif
  benchmark::AddCustomContext("input bytes",
                              std::to_string(size_t(inputs_total_byte)));
  benchmark::AddCustomContext("number of domains",
                              std::to_string(std::size(inputs)));
  benchmark::AddCustomContext(
      "bytes/domains", std::to_string(inputs_total_byte / std::size(inputs)));
  if (collector.has_events()) {
    benchmark::AddCustomContext("performance counters", "Enabled");
  }
  benchmark::Initialize(&argc, argv);
  // Initialize() strips the flags it understands; anything left over is a
  // typo or an unsupported option. Report it instead of silently ignoring
  // it, as the BENCHMARK_MAIN() macro does.
  if (benchmark::ReportUnrecognizedArguments(argc, argv)) {
    return 1;
  }
  benchmark::RunSpecifiedBenchmarks();
  benchmark::Shutdown();
  return 0;
}

0 comments on commit 699705f

Please sign in to comment.