diff --git a/.gitignore b/.gitignore
index fd1dcbd0b..41136fcb7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,6 +39,9 @@ nbactions.xml
 
 # Local environment
 .env
+# Direnv
+.envrc
+
 
 #JReleaser
 out/
@@ -47,3 +50,7 @@ out/
 /*.out
 out_expected.txt
 /*-timing.json
+
+# 1BRC test-unique output
+/logs
+/results
diff --git a/create_measurements_unique.sh b/create_measurements_unique.sh
new file mode 100755
index 000000000..6ea3e481d
--- /dev/null
+++ b/create_measurements_unique.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# Runs the CreateMeasurementsUnique generator from the built project jar.
+# Usage: create_measurements_unique.sh <number of records to create>
+
+# ${1+"$1"} forwards the argument quoted, but passes nothing at all if it is
+# missing, so that the Java program still prints its usage message.
+java --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CreateMeasurementsUnique ${1+"$1"}
diff --git a/reset-sdk.sh b/reset-sdk.sh
new file mode 100755
index 000000000..9b7e83fa1
--- /dev/null
+++ b/reset-sdk.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+#
+# Initializes SDKMAN! in this shell and runs 'sdk env install'.
+
+sdkman_init="${HOME}/.sdkman/bin/sdkman-init.sh"
+if [ ! -r "${sdkman_init}" ]; then
+  echo "SDKMAN! init script '${sdkman_init}' not found" >&2
+  exit 1
+fi
+
+# shellcheck source=/dev/null
+source "${sdkman_init}"
+sdk env install
diff --git a/scripts/review/README.adoc b/scripts/review/README.adoc
new file mode 100644
index 000000000..ebf980ba7
--- /dev/null
+++ b/scripts/review/README.adoc
@@ -0,0 +1,123 @@
+= Code review made easy
+:toc: left
+:icons: font
+:branch: feature/review-made-easy
+
+If you want to perform a local review of a GitHub pull-request, these scripts will help (checkout) prepare, test and rollback the code.
+
+== Pre-requisites
+
+Make sure the following assumptions are true
+
+. You can access GitHub via SSH (cf. GitHub docs)
+. 
You have the `gh` CLI tool installed (e.g., via `brew install gh` on macOS)
+. You have a current version of the original repository cloned locally on your machine (e.g., by `git clone git@github.com:gunnarmorling/1brc`)
+. You perform all future steps from this directory (i.e., `cd 1brc` in the beginning)
+
+== Preparation
+
+=== Copy scripts (mandatory for misc. use cases)
+
+As long as the code is not merged, you may need to copy the script files into your local directory (depending on the work mode).
+This is necessary since they may vanish when you switch tasks.
+
+[NOTE]
+====
+Once it is merged, or if you work on a subbranch of the current development branch, you may skip this step.
+If you want to use the current branch for these changes as baseline, we recommend setting the environment variable: `export REVIEW_BASE_BRANCH={branch}` (or whatever your local branch is named).
+====
+
+Use the latest version from my repository
+
+[source, bash, subs='normal']
+----
+git remote add ascheman git@github.com:ascheman/1brc
+git checkout {branch}
+----
+
+[[prepare-link]]
+Copy (or better link) the files to the base directory.
+
+[source, bash]
+.Copy/Link scripts to local directory
+----
+ln scripts/review/prepare.sh ./review-prepare.sh
+ln scripts/review/rollback.sh ./review-rollback.sh
+----
+
+[[sec:environment]]
+=== Environment variables
+
+Set the environment variable `+${EDITOR}+` to your favored editor, e.g., `vim` on Linux.
+This allows the prepare script to open the changed source files for review.
+
+[TIP]
+.Use IntelliJ (or other IDE)
+====
+If you use IntelliJ, you can point the editor variable to the IntelliJ executable.
+If you run the subsequent tasks from an existing IntelliJ project, it will then open the files in the IDE.
+This should work with other IDEs as well, e.g., Visual Studio Code.
+
+IntelliJ IDEA may install a convenience wrapper script `idea` directly to `/usr/local/bin` (or some other directory at your disposal).
+If you have that directory on your `+${PATH}+`, you may just set `export EDITOR=idea`.
+====
+
+== Usage
+
+=== Show existing PRs
+
+Check the open PRs
+
+[source, bash]
+----
+gh pr list
+----
+
+=== Check out PR
+
+[CAUTION]
+====
+The following may only work on open (or draft) PRs.
+====
+
+Pick one of the open PRs and call the _prepare_ script.
+
+[source, bash]
+----
+./review-prepare.sh <pr> # <1>
+----
+<1> Depending on whether you have executed the <<prepare-link,copy/link>> step (otherwise you may use `./scripts/review/prepare.sh` instead).
+
+The script will perform the following steps
+
+. Fetch repository for the PR
+. Prepare merge of PR branch (but do not commit)
+. Build jar (and other artifacts using the respective `prepare_...` script if it exists and is executable)
+. Execute default `test.sh`
+. Execute tests with 100.000 unique measure points to check for hash conflicts
+. Show PR and comments
+. Optionally open changed files in your editor/IDE (cf. <<sec:environment>>).
+
+[NOTE]
+====
+As the script prepares (but does not commit) a merge, you will find all changed files in the Git index (just execute `git status` to see the list of changed files).
+====
+
+=== Roll back PR
+
+[CAUTION]
+====
+Make sure you do not change any files (either from the PR or other existing files).
+Otherwise, automatic rollback is not so easy.
+
+In this case, we trust your Git knowledge to get rid of the changes.
+If all else fails, check the actions performed in the{nbsp}link:rollback.sh[] script.
+====
+
+Once you are finished with your work, you can roll back the checked-out files and get back to the state of your original branch.
+
+[source, bash]
+----
+./review-rollback.sh <pr> # <1>
+----
+<1> Depending on whether you have executed the <<prepare-link,copy/link>> step (otherwise you may use `./scripts/review/rollback.sh` instead).
\ No newline at end of file
diff --git a/scripts/review/prepare.sh b/scripts/review/prepare.sh
new file mode 100755
index 000000000..db2885726
--- /dev/null
+++ b/scripts/review/prepare.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+#
+# Prepares a local review of a GitHub pull request: adds the author's fork as
+# remote 'review-<author>', switches to branch 'review/<author>', merges the PR
+# branch without committing, builds, runs the tests and shows the PR.
+#
+# Usage: prepare.sh <pr>
+# Environment: EDITOR (optional) - if set, the changed files are opened with it.
+
+set -euo pipefail
+
+if ! command -v gh >/dev/null; then
+  echo "Please install the 'gh' tool, e.g., via Homebrew: brew install gh" >&2
+  exit 1
+fi
+
+if [ "$#" -ne 1 ]; then
+  echo "Usage: $0 <pr>" >&2
+  exit 1
+fi
+
+pr="${1}"
+
+author="$(gh pr view "${pr}" --json author -q .author.login)"
+repository="$(gh pr view "${pr}" --json headRepository -q .headRepository.name)"
+branch="$(gh pr view "${pr}" --json headRefName -q .headRefName)"
+
+# Ask Git for the exact remote instead of grepping the remote list, which would
+# also match other remotes whose name merely contains this one.
+remote="review-${author}"
+if ! git remote get-url "${remote}" >/dev/null 2>&1; then
+  git remote add "${remote}" "git@github.com:${author}/${repository}"
+fi
+
+current_branch="$(git rev-parse --abbrev-ref HEAD)"
+new_branch="review/${author}"
+if [ "${current_branch}" != "${new_branch}" ]; then
+  # Reuse the review branch if an earlier run left it behind.
+  if git show-ref --verify --quiet "refs/heads/${new_branch}"; then
+    git checkout "${new_branch}"
+  else
+    git checkout -b "${new_branch}"
+  fi
+fi
+
+git fetch "${remote}"
+#gh pr merge "${pr}"
+
+git merge --no-commit "${remote}/${branch}"
+
+./mvnw clean verify -Dlicense.skip
+
+./test.sh "${author}"
+if [ -x test-unique.sh ]; then
+  ./test-unique.sh "${author}"
+fi
+
+gh pr view "${pr}" -c
+
+if [ -n "${EDITOR:-}" ]; then
+  # gh prints one path per line; collect them in an array so that paths with
+  # whitespace or glob characters reach the editor unchanged.
+  file_list="$(gh pr view "${pr}" --json files -q '.files[].path')"
+  if [ -n "${file_list}" ]; then
+    files=()
+    while IFS= read -r path; do
+      files+=("${path}")
+    done <<<"${file_list}"
+    "${EDITOR}" "${files[@]}"
+  fi
+fi
diff --git a/scripts/review/rollback.sh b/scripts/review/rollback.sh
new file mode 100755
index 000000000..532ac4485
--- /dev/null
+++ b/scripts/review/rollback.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+#
+# Rolls back a prepared review: aborts the pending merge, returns to the base
+# branch and removes the review branch and remote of the PR author.
+#
+# Usage: rollback.sh <pr>
+# Environment: REVIEW_BASE_BRANCH - branch to return to (default: main).
+
+: "${REVIEW_BASE_BRANCH:=main}"
+
+set -euo pipefail
+
+# Validate tooling and arguments before touching the working tree.
+if ! command -v gh >/dev/null; then
+  echo "Please install the 'gh' tool, e.g., via Homebrew: brew install gh" >&2
+  exit 1
+fi
+
+if [ "$#" -ne 1 ]; then
+  echo "Usage: $0 <pr>" >&2
+  exit 1
+fi
+
+pr="${1}"
+
+author="$(gh pr view "${pr}" --json author -q .author.login)"
+
+git merge --abort || echo "Skipping merge rollback"
+
+# Only pop the stash if this run actually created an entry; otherwise
+# 'git stash pop' would fail or restore an unrelated, older stash.
+stash_before="$(git rev-parse -q --verify refs/stash || true)"
+git stash push
+stash_after="$(git rev-parse -q --verify refs/stash || true)"
+git checkout "${REVIEW_BASE_BRANCH}"
+if [ "${stash_before}" != "${stash_after}" ]; then
+  git stash pop
+fi
+
+review_branch="review/${author}"
+if git show-ref --verify --quiet "refs/heads/${review_branch}"; then
+  git branch -d "${review_branch}"
+fi
+
+remote="review-${author}"
+if git remote get-url "${remote}" >/dev/null 2>&1; then
+  git remote remove "${remote}"
+fi
+
+./reset-sdk.sh
+
+./mvnw clean verify -Dlicense.skip
diff --git a/src/main/java/dev/morling/onebrc/CreateMeasurementsUnique.java b/src/main/java/dev/morling/onebrc/CreateMeasurementsUnique.java
new file mode 100644
index 000000000..6b5cb9bae
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CreateMeasurementsUnique.java
@@ -0,0 +1,170 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.StringReader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.concurrent.ThreadLocalRandom;
+
+public class CreateMeasurementsUnique {
+
+    public static final int MAX_NAME_LEN = 100;
+    public static final int KEYSET_SIZE = 10_000;
+
+    public static void main(String[] args) throws Exception {
+        if (args.length != 1) {
+            System.out.println("Usage: create_measurements_unique.sh <number of records to create>");
+            System.exit(1);
+        }
+        int size = 0;
+        try {
+            size = Integer.parseInt(args[0]);
+        }
+        catch (NumberFormatException e) {
+            System.out.println("Invalid value for <number of records to create>");
+            System.out.println("Usage: create_measurements_unique.sh <number of records to create>");
+            System.exit(1);
+        }
+        final var weatherStations = generateWeatherStations();
+        final var start = System.currentTimeMillis();
+        final var rnd = ThreadLocalRandom.current();
+        final var numLength = args[0].length() + 2; // width of the zero-padded row index
+        final var fmt = "%0" + String.format("%d", numLength) + "d%s"; // "<padded index><station name>"
+
+        try (var out = new BufferedWriter(new FileWriter(String.format("measurements-unique-%d.txt", size)))) {
+            for (int i = 1; i <= size; i++) {
+                var station = weatherStations.get(rnd.nextInt(KEYSET_SIZE));
+                double temp = rnd.nextGaussian(station.avgTemp, 7.0);
+                var name = String.format(fmt, i, station.name); // the row index prefix makes every key unique
+                out.write(name.substring(0, Math.min(name.length(), 99))); // cap the key at 99 chars
+                out.write(';');
+                out.write(Double.toString(Math.round(temp * 10.0) / 10.0));
+                out.newLine();
+                if (i % 50_000_000 == 0) {
+                    System.out.printf("Wrote %,d measurements in %,d ms%n", i, System.currentTimeMillis() - start);
+                }
+            }
+        }
+    }
+
+    record WeatherStation(String name, float avgTemp) {
+    }
+
+    private static ArrayList<WeatherStation> generateWeatherStations() throws Exception {
+        // Use a public list of city names and concatenate them all into a long string,
+        // which we'll use as a "source of city name randomness"
+        var bigName = new StringBuilder(1 << 20);
+        try (var rows = new BufferedReader(new FileReader("data/weather_stations.csv"));) {
+            skipComments(rows);
+            while (true) {
+                var row = rows.readLine();
+                if (row == null) {
+                    break;
+                }
+                bigName.append(row, 0, row.indexOf(';'));
+            }
+        }
+        final var weatherStations = new ArrayList<WeatherStation>();
+        final var names = new HashSet<String>();
+        var minLen = Integer.MAX_VALUE;
+        var maxLen = Integer.MIN_VALUE;
+        try (var rows = new BufferedReader(new FileReader("data/weather_stations.csv"))) {
+            skipComments(rows);
+            final var nameSource = new StringReader(bigName.toString());
+            final var buf = new char[MAX_NAME_LEN];
+            final var rnd = ThreadLocalRandom.current();
+            final double yOffset = 4;
+            final double factor = 2500;
+            final double xOffset = 0.372;
+            final double power = 7;
+            for (int i = 0; i < KEYSET_SIZE; i++) {
+                var row = rows.readLine();
+                if (row == null) {
+                    break;
+                }
+                // Use a 7th-order curve to simulate the name length distribution.
+                // It gives us mostly short names, but with large outliers.
+                var nameLen = (int) (yOffset + factor * Math.pow(rnd.nextDouble() - xOffset, power));
+                var count = nameSource.read(buf, 0, nameLen);
+                if (count == -1) {
+                    throw new Exception("Name source exhausted");
+                }
+                var nameBuf = new StringBuilder(nameLen);
+                nameBuf.append(buf, 0, nameLen);
+                if (Character.isWhitespace(nameBuf.charAt(0))) {
+                    nameBuf.setCharAt(0, readNonSpace(nameSource));
+                }
+                if (Character.isWhitespace(nameBuf.charAt(nameBuf.length() - 1))) {
+                    nameBuf.setCharAt(nameBuf.length() - 1, readNonSpace(nameSource));
+                }
+                var name = nameBuf.toString();
+                while (names.contains(name)) {
+                    nameBuf.setCharAt(rnd.nextInt(nameBuf.length()), readNonSpace(nameSource));
+                    name = nameBuf.toString();
+                }
+                int actualLen;
+                while (true) {
+                    actualLen = name.getBytes(StandardCharsets.UTF_8).length;
+                    if (actualLen <= 100) {
+                        break;
+                    }
+                    nameBuf.deleteCharAt(nameBuf.length() - 1);
+                    if (Character.isWhitespace(nameBuf.charAt(nameBuf.length() - 1))) {
+                        nameBuf.setCharAt(nameBuf.length() - 1, readNonSpace(nameSource));
+                    }
+                    name = nameBuf.toString();
+                }
+                if (name.indexOf(';') != -1) {
+                    throw new Exception("Station name contains a semicolon!");
+                }
+                names.add(name);
+                minLen = Integer.min(minLen, actualLen);
+                maxLen = Integer.max(maxLen, actualLen);
+                var lat = Float.parseFloat(row.substring(row.indexOf(';') + 1));
+                // Guesstimate mean temperature using cosine of latitude
+                var avgTemp = (float) (30 * Math.cos(Math.toRadians(lat))) - 10;
+                weatherStations.add(new WeatherStation(name, avgTemp));
+            }
+        }
+        System.out.format("Generated %,d station names with length from %,d to %,d%n", KEYSET_SIZE, minLen, maxLen);
+        return weatherStations;
+    }
+
+    private static void skipComments(BufferedReader rows) throws IOException {
+        for (var line = rows.readLine(); line != null && line.startsWith("#"); line = rows.readLine()) {
+        } // stops at end of input instead of throwing; NOTE(review): the first non-comment line is consumed as well
+    }
+
+    private static char readNonSpace(StringReader nameSource) throws IOException {
+        while (true) {
+            var n = nameSource.read();
+            if (n == -1) {
+                throw new IOException("Name source exhausted");
+
}
+            var ch = (char) n;
+            if (ch != ' ') {
+                return ch;
+            }
+        }
+    }
+}
diff --git a/test-unique-all.sh b/test-unique-all.sh
new file mode 100755
index 000000000..f71f5440d
--- /dev/null
+++ b/test-unique-all.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+#
+# Runs test-unique.sh for every user listed in github_users.txt (first
+# ';'-separated column) and reports per user whether it worked. Stdout and
+# stderr of each run are stored under logs/.
+#
+# Environment: UNIQUE - number of measurement rows (default: 100000).
+
+: "${UNIQUE:=100000}"
+# Hand the effective value on to test-unique.sh so log names and data agree.
+export UNIQUE
+
+users="$(awk -F\; '{print $1}' github_users.txt | sort -u)"
+
+mkdir -p logs
+
+logs_prefix="logs/test-unique-${UNIQUE}"
+for user in ${users}; do
+  if ./test-unique.sh "${user}" > "${logs_prefix}-${user}.log" 2> "${logs_prefix}-${user}.err"; then
+    echo "Worked for user '${user}'"
+  else
+    echo "Failed for user '${user}'"
+  fi
+done
diff --git a/test-unique.sh b/test-unique.sh
new file mode 100755
index 000000000..a8a880983
--- /dev/null
+++ b/test-unique.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+#
+# Runs calculate_average_<user>.sh against the measurements-unique-<UNIQUE>.txt
+# data set and compares its output with the output of the baseline.
+#
+# Usage: test-unique.sh <user>
+# Environment: UNIQUE - number of measurement rows (default: 100000).
+# Exit status: 0 if the results match, 1 on usage error or differing results,
+#              2 if the start script is missing or not executable.
+
+set -euo pipefail
+
+: "${UNIQUE:=100000}"
+
+if [ $# -ne 1 ]; then
+  echo "usage: $0 <user>" >&2
+  exit 1
+fi
+user="${1}"
+
+exe="calculate_average_${user}.sh"
+
+if [ ! -x "${exe}" ]; then
+  echo "Start script '${exe}' does not exist" >&2
+  exit 2
+fi
+
+testdata="measurements-unique-${UNIQUE}.txt"
+if [ ! -r "${testdata}" ]; then
+  ./create_measurements_unique.sh "${UNIQUE}"
+fi
+
+ln -sfn "${testdata}" "measurements.txt"
+
+mkdir -p results
+
+baseline="results/result-unique-${UNIQUE}-baseline.txt"
+if [ ! -r "${baseline}" ]; then
+  echo "Baseline result '${baseline}' does not yet exist, we create it first!" >&2
+  ./prepare_baseline.sh
+  # Write to a temporary file first so that an aborted run cannot leave a
+  # truncated baseline behind that later runs would silently trust.
+  ./calculate_average_baseline.sh > "${baseline}.tmp"
+  mv "${baseline}.tmp" "${baseline}"
+fi
+
+prep="prepare_${user}.sh"
+if [ -r "${prep}" ]; then
+  bash "./${prep}"
+fi
+
+./reset-sdk.sh
+
+result="results/result-unique-${UNIQUE}-${user}.txt"
+"./${exe}" > "${result}"
+
+if ! diff -q "${result}" "${baseline}"; then
+  ls -l "${result}" "${baseline}"
+  # Signal the mismatch to callers such as test-unique-all.sh.
+  exit 1
+fi