Skip to content

Commit

Permalink
Fix #23471: fix an inconsistency between fast ASCII sort and slower u…
Browse files Browse the repository at this point in the history
…nicode-aware sort

git-svn-id: https://josm.openstreetmap.de/svn/trunk@18983 0c6e7542-c601-0410-84e7-c038aed88b3b
  • Loading branch information
taylor.smock committed Feb 14, 2024
1 parent 7e448a6 commit 67d83c4
Show file tree
Hide file tree
Showing 2 changed files with 113 additions and 16 deletions.
47 changes: 31 additions & 16 deletions src/org/openstreetmap/josm/tools/AlphanumComparator.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,6 @@
import java.util.Arrays;
import java.util.Comparator;

import org.openstreetmap.josm.gui.MainApplication;
import org.openstreetmap.josm.spi.lifecycle.Lifecycle;

/**
* The Alphanum Algorithm is an improved sorting algorithm for strings
* containing numbers: Instead of sorting numbers in ASCII order like a standard
Expand All @@ -51,13 +48,20 @@
*
*/
public final class AlphanumComparator implements Comparator<String>, Serializable {
/** {@code true} to use the faster ASCII sorting algorithm. Set to {@code false} when testing compatibility. */
static boolean useFastASCIISort = true;
/**
* The sort order for the fast ASCII sort method.
*/
static final String ASCII_SORT_ORDER =
" \r\t\n\f\u000b-_,;:!?/.`^~'\"()[]{}@$*\\&#%+<=>|0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

private static final long serialVersionUID = 1L;

private static final AlphanumComparator INSTANCE = new AlphanumComparator();
/**
* A mapping from ASCII characters to the default {@link Collator} order.
* At writing, the default rules can be found in {@link sun.util.locale.provider.CollationRules#DEFAULTRULES}.
* At writing, the default rules can be found in CollationRules#DEFAULTRULES.
*/
private static final byte[] ASCII_MAPPING = new byte[128];
static {
Expand All @@ -70,9 +74,8 @@ public final class AlphanumComparator implements Comparator<String>, Serializabl
// We have 37 order overrides for symbols; ASCII tables has control characters through 31. 32-47 are symbols.
// After the symbols, we have 0-9, and then aA-zZ.
// The character order
final String order = " \r\t\n\f\u000b-_,;:!?/.`^~'\"()[]{}@$*\\&#%+<=>|0123456789aAbBcCdDeEfFgGhHiIjJkKlLmMnNoOpPqQrRsStTuUvVwWxXyYzZ";
for (int i = 0; i < order.length(); i++) {
char c = order.charAt(i);
for (int i = 0; i < ASCII_SORT_ORDER.length(); i++) {
char c = ASCII_SORT_ORDER.charAt(i);
ASCII_MAPPING[c] = (byte) (i + 1);
}
}
Expand Down Expand Up @@ -100,15 +103,28 @@ private AlphanumComparator() {
* @return See {@link String#compareToIgnoreCase(String)} (e.g. {@code string1.compareToIgnoreCase(string2)}).
*/
private static int compareString(String string1, int len1, String string2, int len2) {
int lim = Math.min(len1, len2);
int k = 0;
while (k < lim) {
final int c1 = ASCII_MAPPING[string1.charAt(k)];
final int c2 = ASCII_MAPPING[string2.charAt(k)];
int loc1 = 0;
int loc2 = 0;
while (loc1 < len1 && loc2 < len2) {
// Ignore control symbols
while (loc1 < len1 - 1 && string1.charAt(loc1) <= 32) {
loc1++;
}
while (loc2 < len2 - 1 && string2.charAt(loc2) <= 32) {
loc2++;
}
if (loc1 >= len1 || loc2 >= len2) break;

char lower1 = Character.toLowerCase(string1.charAt(loc1));
char lower2 = Character.toLowerCase(string2.charAt(loc2));

final int c1 = ASCII_MAPPING[lower1];
final int c2 = ASCII_MAPPING[lower2];
if (c1 != c2) {
return c1 - c2;
}
k++;
loc1++;
loc2++;
}
return len1 - len2;
}
Expand Down Expand Up @@ -184,9 +200,8 @@ private static int compareChunk(String thisChunk, int thisChunkLength, String th
}
}
} else {
// Check if both chunks are ascii only
// FIXME: re-enable once #23471 is fixed (the exception at startup keeps JOSM from finishing startup)
if (false && isAscii(thisChunk, thisChunkLength) && isAscii(thatChunk, thatChunkLength)) {
// Check if both chunks are ascii only; if so, use a much faster sorting algorithm.
if (useFastASCIISort && isAscii(thisChunk, thisChunkLength) && isAscii(thatChunk, thatChunkLength)) {
return Utils.clamp(compareString(thisChunk, thisChunkLength, thatChunk, thatChunkLength), -1, 1);
}
// Instantiate the collator
Expand Down
82 changes: 82 additions & 0 deletions test/unit/org/openstreetmap/josm/tools/AlphanumComparatorTest.java
Original file line number Diff line number Diff line change
@@ -1,17 +1,27 @@
// License: GPL. For details, see LICENSE file.
package org.openstreetmap.josm.tools;

import static org.junit.jupiter.api.Assertions.assertAll;
import static org.junit.jupiter.api.Assertions.assertEquals;

import java.text.Collator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.function.BiFunction;
import java.util.stream.Stream;

import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Test;

/**
* Unit tests of {@link AlphanumComparator}.
*/
class AlphanumComparatorTest {
@AfterEach
void teardown() {
AlphanumComparator.useFastASCIISort = true;
}

/**
* Test numeric strings.
Expand All @@ -32,4 +42,76 @@ void testMixed() {
lst.sort(AlphanumComparator.getInstance());
assertEquals(Arrays.asList("a5", "a100", "a00999", "b1", "b20"), lst);
}

private static Stream<String[]> testNonRegression23471Arguments() {
List<String> testStrings = Arrays.asList(
"AMEN",
"Ameriabank",
"America First Credit Union",
"BAC Credomatic",
"BADR Banque",
"BAI",
"Banca Popolare di Cividale",
"Banca Popolare di Sondrio",
"Banca Sella",
"Banca Transilvania",
"Bancaribe",
"BancaStato",
"Banco Agrario",
"Banco AV Villas",
"Banco Azteca",
"Banco Bicentenario",
"Banco BISA",
"Banco BMG",
"Banco BPI (Portugal)",
"Banco BPM",
"Banco Caja Social",
"Banco Ciudad",
"Banco Continental (Paraguay)",
"Banco di Sardegna"
);
List<String> testChars = new ArrayList<>(AlphanumComparator.ASCII_SORT_ORDER.length());
for (char c : AlphanumComparator.ASCII_SORT_ORDER.toCharArray()) {
testChars.add(Character.toString(c));
}
BiFunction<List<String>, String, List<String>> subList = (list, string) -> list.subList(list.indexOf(string), list.size());
return Stream.concat(
testStrings.stream().flatMap(first -> subList.apply(testStrings, first).stream().map(second -> new String[]{first, second})),
testChars.stream().flatMap(first -> subList.apply(testChars, first).stream().map(second -> new String[]{first, second}))
);
}

/**
* Non-regression test for #23471
* This ensures that the comparison contract holds.
* There are ~5300 combinations run in <1s (as of 2024-02-14).
*/
@Test
void testNonRegression23471() {
assertAll(testNonRegression23471Arguments().map(strings -> () -> testNonRegression23471(strings[0], strings[1])));
}

private static void testNonRegression23471(String first, String second) {
AlphanumComparator.useFastASCIISort = true;
final AlphanumComparator instance = AlphanumComparator.getInstance();
assertEquals(-instance.compare(first, second), instance.compare(second, first));
// Ensure that the fast sort is equivalent to the slow sort
AlphanumComparator.useFastASCIISort = false;
final int slowFirstSecond = instance.compare(first, second);
final int slowSecondFirst = instance.compare(second, first);
AlphanumComparator.useFastASCIISort = true;
final int fastFirstSecond = instance.compare(first, second);
final int fastSecondFirst = instance.compare(second, first);
assertEquals(slowFirstSecond, fastFirstSecond);
assertEquals(slowSecondFirst, fastSecondFirst);

final Collator collator = Collator.getInstance();
collator.setStrength(Collator.SECONDARY);
// Check against the collator instance
assertEquals(Utils.clamp(collator.compare(first, second), -1, 1),
Utils.clamp(instance.compare(first, second), -1, 1));
assertEquals(Utils.clamp(collator.compare(second, first), -1, 1),
Utils.clamp(instance.compare(second, first), -1, 1));
}

}

0 comments on commit 67d83c4

Please sign in to comment.