mirror of
https://github.com/JetBrains/JetBrainsRuntime.git
synced 2025-12-06 01:19:28 +01:00
8365675: Add String Unicode Case-Folding Support
Reviewed-by: rriggs, naoto, ihse
This commit is contained in:
@@ -79,7 +79,7 @@ TOOL_GENERATEEXTRAPROPERTIES = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_too
|
||||
build.tools.generateextraproperties.GenerateExtraProperties
|
||||
|
||||
TOOL_GENERATECASEFOLDING = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \
|
||||
build.tools.generatecharacter.CaseFolding
|
||||
build.tools.generatecharacter.GenerateCaseFolding
|
||||
|
||||
TOOL_MAKEZIPREPRODUCIBLE = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \
|
||||
build.tools.makezipreproducible.MakeZipReproducible
|
||||
|
||||
@@ -1,73 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
package build.tools.generatecharacter;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
/**
 * Build tool that expands a {@code CaseFolding.java} template using the
 * Unicode {@code CaseFolding.txt} data file.
 *
 * <p>Every template line containing {@code %%%Entries} is replaced by a list of
 * {@code entry(0x.., 0x..)} items built from the C, T and S records of the data
 * file; all other template lines are copied through unchanged.
 */
public class CaseFolding {

    /**
     * Generates the output source file.
     *
     * @param args exactly three paths: the template file, {@code CaseFolding.txt},
     *             and the file to generate (created or truncated)
     * @throws Throwable if any of the files cannot be read or written, or if a
     *                   selected record does not contain hexadecimal code points
     */
    public static void main(String[] args) throws Throwable {
        if (args.length != 3) {
            System.err.println("Usage: java CaseFolding TemplateFile CaseFolding.txt CaseFolding.java");
            System.exit(1);
        }
        var templateFile = Paths.get(args[0]);
        var caseFoldingTxt = Paths.get(args[1]);
        var genSrcFile = Paths.get(args[2]);
        var supportedTypes = "^.*; [CTS]; .*$";

        // readAllLines closes the file before returning; the previous
        // Files.lines(...) streams were never closed and leaked the handle.
        var caseFoldingEntries = Files.readAllLines(caseFoldingTxt).stream()
            .filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
            .map(line -> {
                String[] cols = line.split("; ");
                return new String[] {cols[0], cols[1], cols[2]};
            })
            .filter(cols -> {
                // keep only pairs where the folding does not map back to the
                // original char through toUpperCase/toLowerCase
                var cp1 = Integer.parseInt(cols[0], 16);
                var cp2 = Integer.parseInt(cols[2], 16);
                return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
            })
            .map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2]))
            .collect(Collectors.joining(",\n", "", ""));

        // hack, hack, hack! the logic does not pick 0131. just add manually to support 'I's.
        // 0049; T; 0131; # LATIN CAPITAL LETTER I
        final String T_0x0131_0x49 = String.format(" entry(0x%04x, 0x%04x),\n", 0x0131, 0x49);
        final String allEntries = T_0x0131_0x49 + caseFoldingEntries;

        // Generate .java file
        Files.write(
            genSrcFile,
            Files.readAllLines(templateFile).stream()
                .map(line -> line.contains("%%%Entries") ? allEntries : line)
                .collect(Collectors.toList()),
            StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
    }
}
|
||||
@@ -0,0 +1,134 @@
|
||||
/*
|
||||
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
package build.tools.generatecharacter;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.Arrays;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
/**
 * Build tool that expands a {@code CaseFolding.java} template using the
 * Unicode {@code CaseFolding.txt} data file.
 *
 * <p>Two placeholders are recognised in the template:
 * <ul>
 * <li>{@code %%%Entries} is replaced by the {@code CASE_FOLDING_CPS} and
 *     {@code CASE_FOLDING_VALUES} array declarations built from the C and F
 *     (full, possibly 1:M) records;</li>
 * <li>{@code %%%Expanded_Case_Map_Entries} is replaced by a list of
 *     {@code entry(0x.., 0x..)} items built from the C, T and S records.</li>
 * </ul>
 * All other template lines are copied through unchanged.
 */
public class GenerateCaseFolding {

    /**
     * Generates the output source file.
     *
     * @param args exactly three paths: the template file, {@code CaseFolding.txt},
     *             and the file to generate (created or truncated)
     * @throws Throwable if any of the files cannot be read or written, or if a
     *                   selected record cannot be parsed or encoded
     */
    public static void main(String[] args) throws Throwable {
        if (args.length != 3) {
            System.err.println("Usage: java GenerateCaseFolding TemplateFile CaseFolding.txt CaseFolding.java");
            System.exit(1);
        }
        var templateFile = Paths.get(args[0]);
        var caseFoldingTxt = Paths.get(args[1]);
        var genSrcFile = Paths.get(args[2]);

        // Read the data file once. readAllLines closes the file before
        // returning; the previous Files.lines(...) streams were never closed.
        var dataLines = Files.readAllLines(caseFoldingTxt);

        // java.lang
        var supportedTypes = "^.*; [CF]; .*$";  // full/1:M case folding
        String[][] caseFoldings = dataLines.stream()
            .filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
            .map(line -> {
                var fields = line.split("; ");
                var cp = fields[0];
                // third column holds one or more space-separated folded code points
                fields = fields[2].trim().split(" ");
                var folding = new String[fields.length + 1];
                folding[0] = cp;
                System.arraycopy(fields, 0, folding, 1, fields.length);
                return folding;
            })
            .toArray(String[][]::new);

        // util.regex
        var expandedSupportedTypes = "^.*; [CTS]; .*$";
        var expanded_caseFoldingEntries = dataLines.stream()
            .filter(line -> !line.startsWith("#") && line.matches(expandedSupportedTypes))
            .map(line -> {
                String[] cols = line.split("; ");
                return new String[]{cols[0], cols[1], cols[2]};
            })
            .filter(cols -> {
                // keep only pairs where the folding does not map back to the
                // original char through toUpperCase/toLowerCase
                var cp1 = Integer.parseInt(cols[0], 16);
                var cp2 = Integer.parseInt(cols[2], 16);
                return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
            })
            .map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2]))
            .collect(Collectors.joining(",\n", "", ""));

        // hack, hack, hack! the logic does not pick 0131. just add manually to support 'I's.
        // 0049; T; 0131; # LATIN CAPITAL LETTER I
        final String T_0x0131_0x49 = String.format(" entry(0x%04x, 0x%04x),\n", 0x0131, 0x49);

        // Build both replacement texts once instead of once per template line.
        final String foldingEntries = genFoldingEntries(caseFoldings);
        final String expandedEntries = T_0x0131_0x49 + expanded_caseFoldingEntries;

        Files.write(
            genSrcFile,
            Files.readAllLines(templateFile).stream()
                .map(line -> {
                    if (line.contains("%%%Entries")) {
                        return foldingEntries;
                    }
                    if (line.contains("%%%Expanded_Case_Map_Entries")) {
                        return expandedEntries;
                    }
                    return line;
                })
                .collect(Collectors.toList()),
            StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
    }

    /**
     * Packs one folding record into a {@code long}.
     *
     * <p>The first folded code point occupies the low bits. When the source code
     * point is not supplementary and the record has more than one folded code
     * point, each further one is OR-ed in at the next 16-bit position and the
     * number of folded code points is stored from bit 48 upwards. For a
     * supplementary source code point only the first folded code point is used.
     *
     * @param folding the source code point followed by its folded code points,
     *                all as hexadecimal strings
     * @return the packed value
     * @throws IllegalArgumentException if a BMP source has more than three folded
     *         code points, which would not fit below the length field at bit 48
     */
    private static long foldingToLong(String[] folding) {
        int cp = Integer.parseInt(folding[0], 16);
        long value = (long) Integer.parseInt(folding[1], 16);
        if (!Character.isSupplementaryCodePoint(cp) && folding.length != 2) {
            if (folding.length > 4) {
                throw new IllegalArgumentException(
                    "Too many folded code points for " + folding[0] + ": " + (folding.length - 1));
            }
            for (int j = 2; j < folding.length; j++) {
                // j-th folded char goes into the (j-1)-th 16-bit group
                value |= ((long) Integer.parseInt(folding[j], 16)) << (16 * (j - 1));
            }
            value |= (long) (folding.length - 1) << 48;
        }
        return value;
    }

    /**
     * Renders the two parallel array declarations, {@code CASE_FOLDING_CPS}
     * (source code points) and {@code CASE_FOLDING_VALUES} (packed foldings, see
     * {@link #foldingToLong}), in the order of {@code foldings}.
     *
     * @param foldings one row per record: source code point followed by its
     *                 folded code points, as hexadecimal strings
     * @return the Java source text of both declarations
     */
    private static String genFoldingEntries(String[][] foldings) {
        StringBuilder sb = new StringBuilder();
        sb.append(" private static final int[] CASE_FOLDING_CPS = {\n");
        int width = 10;   // code points per output line
        for (int i = 0; i < foldings.length; i++) {
            if (i % width == 0) {
                sb.append(" ");
            }
            sb.append(String.format("0X%s", foldings[i][0]));
            if (i < foldings.length - 1) {
                sb.append(", ");
            }
            if (i % width == width - 1 || i == foldings.length - 1) {
                sb.append("\n");
            }
        }
        sb.append(" };\n\n");

        sb.append(" private static final long[] CASE_FOLDING_VALUES = {\n");
        width = 6;        // packed values per output line
        for (int i = 0; i < foldings.length; i++) {
            if (i % width == 0) {
                sb.append(" "); // indent
            }
            sb.append(String.format("0x%013xL", foldingToLong(foldings[i])));
            if (i < foldings.length - 1) {
                sb.append(", ");
            }
            if (i % width == width - 1 || i == foldings.length - 1) {
                sb.append("\n");
            }
        }
        sb.append(" };\n");
        return sb.toString();
    }
}
|
||||
@@ -72,5 +72,22 @@ TARGETS += $(GENSRC_CHARACTERDATA)
|
||||
|
||||
################################################################################
|
||||
|
||||
|
||||
GENSRC_STRINGCASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/lang/CaseFolding.java
|
||||
|
||||
STRINGCASEFOLDING_TEMPLATE := $(MODULE_SRC)/share/classes/jdk/internal/lang/CaseFolding.java.template
|
||||
CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt
|
||||
|
||||
$(GENSRC_STRINGCASEFOLDING): $(BUILD_TOOLS_JDK) $(STRINGCASEFOLDING_TEMPLATE) $(CASEFOLDINGTXT)
|
||||
$(call LogInfo, Generating $@)
|
||||
$(call MakeTargetDir)
|
||||
$(TOOL_GENERATECASEFOLDING) \
|
||||
$(STRINGCASEFOLDING_TEMPLATE) \
|
||||
$(CASEFOLDINGTXT) \
|
||||
$(GENSRC_STRINGCASEFOLDING)
|
||||
|
||||
TARGETS += $(GENSRC_STRINGCASEFOLDING)
|
||||
|
||||
|
||||
endif # include guard
|
||||
include MakeIncludeEnd.gmk
|
||||
|
||||
@@ -50,22 +50,5 @@ TARGETS += $(GENSRC_INDICCONJUNCTBREAK)
|
||||
|
||||
################################################################################
|
||||
|
||||
GENSRC_CASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/util/regex/CaseFolding.java
|
||||
|
||||
CASEFOLDINGTEMP := $(MODULE_SRC)/share/classes/jdk/internal/util/regex/CaseFolding.java.template
|
||||
CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt
|
||||
|
||||
$(GENSRC_CASEFOLDING): $(BUILD_TOOLS_JDK) $(CASEFOLDINGTEMP) $(CASEFOLDINGTXT)
|
||||
$(call LogInfo, Generating $@)
|
||||
$(call MakeTargetDir)
|
||||
$(TOOL_GENERATECASEFOLDING) \
|
||||
$(CASEFOLDINGTEMP) \
|
||||
$(CASEFOLDINGTXT) \
|
||||
$(GENSRC_CASEFOLDING)
|
||||
|
||||
TARGETS += $(GENSRC_CASEFOLDING)
|
||||
|
||||
################################################################################
|
||||
|
||||
endif # include guard
|
||||
include MakeIncludeEnd.gmk
|
||||
|
||||
@@ -117,9 +117,38 @@ import sun.nio.cs.UTF_8;
|
||||
* Unicode code points (i.e., characters), in addition to those for
|
||||
* dealing with Unicode code units (i.e., {@code char} values).
|
||||
*
|
||||
* <p>Unless otherwise noted, methods for comparing Strings do not take locale
|
||||
* into account. The {@link java.text.Collator} class provides methods for
|
||||
* finer-grain, locale-sensitive String comparison.
|
||||
* <p><b>String comparison and case-insensitive matching</b>
|
||||
*
|
||||
* <p>There are several related ways to compare {@code String} values; choose
|
||||
* the one whose semantics fit your purpose:
|
||||
*
|
||||
* <ul>
|
||||
* <li><b>Exact content equality</b> — {@link #equals(Object)} checks that two
|
||||
* strings contain the identical char sequence of UTF-16 code units. This is
|
||||
* a strict, case-sensitive comparison suitable for exact matching, hashing
|
||||
* and any situation that requires bit-for-bit stability.</li>
|
||||
*
|
||||
* <li><b>Simple case-insensitive equality</b> — {@link #equalsIgnoreCase(String)}
|
||||
* (and the corresponding {@link #compareToIgnoreCase(String)} and {@link #CASE_INSENSITIVE_ORDER})
|
||||
* performs a per-code-point, locale-independent comparison using
|
||||
* {@link Character#toUpperCase(int)} and {@link Character#toLowerCase(int)}.
|
||||
* It is convenient for many common case-insensitive checks.</li>
|
||||
*
|
||||
* <li><b>Unicode case-folded equivalence</b> — {@link #equalsFoldCase(String)}
|
||||
* (and the corresponding {@link #compareToFoldCase(String)} and {@link #UNICODE_CASEFOLD_ORDER})
|
||||
* implement the Unicode <em>{@index "full case folding"}</em> rules defined in
|
||||
* <a href="https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt">Unicode CaseFolding.txt</a>.
|
||||
* Case folding is locale-independent and language-neutral and may map a single code
|
||||
* point to multiple code points (1:M mappings). For example, the German sharp
|
||||
* s ({@code U+00DF}) is folded to the sequence {@code "ss"}.
|
||||
* Use these methods when you need Unicode-compliant
|
||||
* <a href="https://www.unicode.org/versions/latest/core-spec/chapter-5/#G21790">
|
||||
* caseless matching</a>, searching, or ordering.</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>Unless otherwise noted, methods for comparing Strings do not take locale into
|
||||
* account. The {@link java.text.Collator} class provides methods for finer-grain,
|
||||
* locale-sensitive String comparison.
|
||||
*
|
||||
* @implNote The implementation of the string concatenation operator is left to
|
||||
* the discretion of a Java compiler, as long as the compiler ultimately conforms
|
||||
@@ -2179,6 +2208,7 @@ public final class String
|
||||
* false} otherwise
|
||||
*
|
||||
* @see #equals(Object)
|
||||
* @see #equalsFoldCase(String)
|
||||
* @see #codePoints()
|
||||
*/
|
||||
public boolean equalsIgnoreCase(String anotherString) {
|
||||
@@ -2188,6 +2218,57 @@ public final class String
|
||||
&& regionMatches(true, 0, anotherString, 0, length());
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares this {@code String} to another {@code String} for equality,
|
||||
* using <em>{@index "Unicode case folding"}</em>. Two strings are considered equal
|
||||
* by this method if their case-folded forms are identical.
|
||||
* <p>
|
||||
* Case folding is defined by the Unicode Standard in
|
||||
* <a href="https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt">CaseFolding.txt</a>,
|
||||
* including 1:M mappings. For example, {@code "Fuß".equalsFoldCase("FUSS")}
|
||||
* returns {@code true}, since the character {@code U+00DF} (sharp s) folds
|
||||
* to {@code "ss"}.
|
||||
* <p>
|
||||
* Case folding is locale-independent and language-neutral, unlike
|
||||
* locale-sensitive transformations such as {@link #toLowerCase()} or
|
||||
* {@link #toUpperCase()}. It is intended for caseless matching,
|
||||
* searching, and indexing.
|
||||
*
|
||||
* @apiNote
|
||||
* This method is the Unicode-compliant alternative to
|
||||
* {@link #equalsIgnoreCase(String)}. It implements full case folding as
|
||||
* defined by the Unicode Standard, which may differ from the simpler
|
||||
* per-character mapping performed by {@code equalsIgnoreCase}.
|
||||
* For example:
|
||||
* {@snippet lang=java :
|
||||
* String a = "Fuß";
|
||||
* String b = "FUSS";
|
||||
* boolean equalsFoldCase = a.equalsFoldCase(b); // returns true
|
||||
* boolean equalsIgnoreCase = a.equalsIgnoreCase(b); // returns false
|
||||
* }
|
||||
*
|
||||
* @param anotherString
|
||||
* The {@code String} to compare this {@code String} against
|
||||
*
|
||||
* @return {@code true} if the given object is not {@code null} and represents
|
||||
* the same sequence of characters as this string under Unicode case
|
||||
* folding; {@code false} otherwise.
|
||||
*
|
||||
* @spec https://www.unicode.org/versions/latest/core-spec/chapter-5/#G21790 Unicode Caseless Matching
|
||||
* @see #compareToFoldCase(String)
|
||||
* @see #equalsIgnoreCase(String)
|
||||
* @since 26
|
||||
*/
|
||||
public boolean equalsFoldCase(String anotherString) {
|
||||
if (this == anotherString) {
|
||||
return true;
|
||||
}
|
||||
if (anotherString == null) {
|
||||
return false;
|
||||
}
|
||||
return UNICODE_CASEFOLD_ORDER.compare(this, anotherString) == 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares two strings lexicographically.
|
||||
* The comparison is based on the Unicode value of each character in
|
||||
@@ -2303,12 +2384,86 @@ public final class String
|
||||
* than this String, ignoring case considerations.
|
||||
* @see java.text.Collator
|
||||
* @see #codePoints()
|
||||
* @see #compareToFoldCase(String)
|
||||
* @since 1.2
|
||||
*/
|
||||
public int compareToIgnoreCase(String str) {
|
||||
return CASE_INSENSITIVE_ORDER.compare(this, str);
|
||||
}
|
||||
|
||||
/**
|
||||
* A Comparator that orders {@code String} objects as by
|
||||
* {@link #compareToFoldCase(String) compareToFoldCase()}.
|
||||
*
|
||||
* @see #compareToFoldCase(String)
|
||||
* @since 26
|
||||
*/
|
||||
public static final Comparator<String> UNICODE_CASEFOLD_ORDER
|
||||
= new FoldCaseComparator();
|
||||
|
||||
private static class FoldCaseComparator implements Comparator<String> {
|
||||
|
||||
@Override
|
||||
public int compare(String s1, String s2) {
|
||||
byte[] v1 = s1.value;
|
||||
byte[] v2 = s2.value;
|
||||
if (s1.coder == s2.coder()) {
|
||||
return s1.coder == LATIN1 ? StringLatin1.compareToFC(v1, v2)
|
||||
: StringUTF16.compareToFC(v1, v2);
|
||||
}
|
||||
return s1.coder == LATIN1 ? StringLatin1.compareToFC_UTF16(v1, v2)
|
||||
: StringUTF16.compareToFC_Latin1(v1, v2);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares two strings lexicographically using <em>{@index "Unicode case folding"}</em>.
|
||||
* This method returns an integer whose sign is that of calling {@code compareTo}
|
||||
* on the Unicode case folded version of the strings. Unicode Case folding
|
||||
* eliminates differences in case according to the Unicode Standard, using the
|
||||
* mappings defined in
|
||||
* <a href="https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt">CaseFolding.txt</a>,
|
||||
* including 1:M mappings, such as {@code "ß"} → {@code "ss"}.
|
||||
* <p>
|
||||
* Case folding is a locale-independent, language-neutral form of case mapping,
|
||||
* primarily intended for caseless matching. Unlike {@link #compareToIgnoreCase(String)},
|
||||
* which applies a simpler locale-insensitive uppercase mapping, this method
|
||||
* follows the Unicode <em>{@index "full"}</em> case folding, providing stable and
|
||||
* consistent results across all environments.
|
||||
* <p>
|
||||
* Note that this method does <em>not</em> take locale into account, and may
|
||||
* produce results that differ from locale-sensitive ordering. Use
|
||||
* {@link java.text.Collator} for locale-sensitive comparison.
|
||||
*
|
||||
* @apiNote
|
||||
* This method is the Unicode-compliant alternative to
|
||||
* {@link #compareToIgnoreCase(String)}. It implements the
|
||||
* <em>{@index "full case folding"}</em> as defined by the Unicode Standard, which
|
||||
* may differ from the simpler per-character mapping performed by
|
||||
* {@code compareToIgnoreCase}.
|
||||
* For example:
|
||||
* {@snippet lang=java :
|
||||
* String a = "Fuß";
|
||||
* String b = "FUSS";
|
||||
* int cmpFoldCase = a.compareToFoldCase(b); // returns 0
|
||||
* int cmpIgnoreCase = a.compareToIgnoreCase(b); // returns > 0
|
||||
* }
|
||||
*
|
||||
* @param str the {@code String} to be compared.
|
||||
* @return a negative integer, zero, or a positive integer as the specified
|
||||
* String is greater than, equal to, or less than this String,
|
||||
* ignoring case considerations by case folding.
|
||||
*
|
||||
* @spec https://www.unicode.org/versions/latest/core-spec/chapter-5/#G21790 Unicode Caseless Matching
|
||||
* @see java.text.Collator
|
||||
* @see #compareToIgnoreCase(String)
|
||||
* @see #equalsFoldCase(String)
|
||||
* @since 26
|
||||
*/
|
||||
public int compareToFoldCase(String str) {
|
||||
return UNICODE_CASEFOLD_ORDER.compare(this, str);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests if two string regions are equal.
|
||||
* <p>
|
||||
|
||||
@@ -32,6 +32,8 @@ import java.util.function.Consumer;
|
||||
import java.util.function.IntConsumer;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
import jdk.internal.lang.CaseFolding;
|
||||
import jdk.internal.util.ArraysSupport;
|
||||
import jdk.internal.vm.annotation.IntrinsicCandidate;
|
||||
|
||||
@@ -179,6 +181,128 @@ final class StringLatin1 {
|
||||
return len1 - len2;
|
||||
}
|
||||
|
||||
private static int compareToFC0(byte[] value, int off, int last, byte[] other, int ooff, int olast) {
|
||||
int k1 = off, k2 = ooff;
|
||||
boolean lo1 = false, lo2 = false; // true if we have a leftover 's' from u+00df -> ss
|
||||
while ((k1 < last || lo1) && (k2 < olast || lo2)) {
|
||||
int c1, c2;
|
||||
if (lo1) {
|
||||
c1 = 0x73; // leftover 's'
|
||||
lo1 = false;
|
||||
} else {
|
||||
c1 = getChar(value, k1++);
|
||||
if (c1 == 0xdf) {
|
||||
c1 = 0x73;
|
||||
lo1 = true;
|
||||
}
|
||||
}
|
||||
if (lo2) {
|
||||
c2 = 0x73; // 's'
|
||||
lo2 = false;
|
||||
} else {
|
||||
c2 = getChar(other, k2++);
|
||||
if (c2 == 0xdf) {
|
||||
c2 = 0x73;
|
||||
lo2 = true;
|
||||
}
|
||||
}
|
||||
if (!CharacterDataLatin1.equalsIgnoreCase((byte)c1, (byte)c2)) {
|
||||
return Character.toLowerCase(c1) - Character.toLowerCase(c2);
|
||||
}
|
||||
}
|
||||
if (k1 < last || lo1) {
|
||||
return 1;
|
||||
}
|
||||
if (k2 < olast || lo2) {
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int compareToFC(byte[] value, byte[] other) {
|
||||
int len = value.length;
|
||||
int olen = other.length;
|
||||
int lim = Math.min(len, olen);
|
||||
for (int k = 0; k < lim; k++) {
|
||||
byte b1 = value[k];
|
||||
byte b2 = other[k];
|
||||
if (!CharacterDataLatin1.equalsIgnoreCase(b1, b2)) {
|
||||
int c1 = b1 & 0xff;
|
||||
int c2 = b2 & 0xff;
|
||||
if (c1 == 0xdf || c2 == 0xdf) { // 0xdf is the only 1:M in latin1 range
|
||||
return compareToFC0(value, k, len, other, k, olen);
|
||||
}
|
||||
return Character.toLowerCase(c1) - Character.toLowerCase(c2);
|
||||
}
|
||||
}
|
||||
return len - olen;
|
||||
}
|
||||
|
||||
private static int compareToFC0_UTF16(byte[] value, int off, int last, byte[] other, int ooff, int olast) {
|
||||
int f1 = 0, f2 = 0;
|
||||
int k1 = off, k2 = ooff;
|
||||
while ((k1 < last || f1 != 0) && (k2 < olast || f2 != 0)) {
|
||||
int c1, c2;
|
||||
if (f1 != 0) {
|
||||
c1 = (f1 & 0xffff); f1 >>>= 16;
|
||||
} else {
|
||||
c1 = getChar(value, k1++);
|
||||
var f = CaseFolding.fold(c1);
|
||||
if (CaseFolding.isSingleCodePoint(f)) {
|
||||
c1 = (int)(f & 0xfffff);
|
||||
} else {
|
||||
c1 = (int)f & 0xffff;
|
||||
f1 = (int)(f >>> 16);
|
||||
}
|
||||
}
|
||||
if (f2 != 0) {
|
||||
c2 = f2 & 0xffff; f2 >>>= 16;
|
||||
} else {
|
||||
c2 = StringUTF16.codePointAt(other, k2, olast, true);
|
||||
k2 += Character.charCount(c2);
|
||||
var f = CaseFolding.fold(c2);
|
||||
if (CaseFolding.isSingleCodePoint(f)) {
|
||||
c2 = (int)(f & 0xfffff);
|
||||
} else {
|
||||
c2 = (int)(f & 0xffff);
|
||||
f2 = (int)(f >>> 16);
|
||||
}
|
||||
}
|
||||
if (c1 != c2) {
|
||||
return c1 - c2;
|
||||
}
|
||||
}
|
||||
if (k1 < last || f1 != 0) {
|
||||
return 1;
|
||||
}
|
||||
if (k2 < olast || f2 != 0) {
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// latin1 vs utf16
|
||||
static int compareToFC_UTF16(byte[] value, byte[] other) {
|
||||
int last = length(value);
|
||||
int olast = StringUTF16.length(other);
|
||||
int lim = Math.min(last, olast);
|
||||
for (int k = 0; k < lim; k++) {
|
||||
int cp1 = getChar(value, k);
|
||||
int cp2 = StringUTF16.codePointAt(other, k, olast, true);
|
||||
if (cp1 != cp2) {
|
||||
long cf1 = CaseFolding.fold(cp1);
|
||||
long cf2 = CaseFolding.fold(cp2);
|
||||
if (cf1 != cf2) {
|
||||
if (!CaseFolding.isSingleCodePoint(cf1) || !CaseFolding.isSingleCodePoint(cf2)) {
|
||||
return compareToFC0_UTF16(value, k, last, other, k, olast);
|
||||
}
|
||||
return (int)(cf1 - cf2);
|
||||
}
|
||||
}
|
||||
}
|
||||
return last - olast;
|
||||
}
|
||||
|
||||
static int hashCode(byte[] value) {
|
||||
return ArraysSupport.hashCodeOfUnsigned(value, 0, value.length, 0);
|
||||
}
|
||||
|
||||
@@ -34,6 +34,7 @@ import java.util.function.IntConsumer;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
import jdk.internal.lang.CaseFolding;
|
||||
import jdk.internal.misc.Unsafe;
|
||||
import jdk.internal.util.ArraysSupport;
|
||||
import jdk.internal.vm.annotation.ForceInline;
|
||||
@@ -93,7 +94,7 @@ final class StringUTF16 {
|
||||
return value.length >> 1;
|
||||
}
|
||||
|
||||
private static int codePointAt(byte[] value, int index, int end, boolean checked) {
|
||||
static int codePointAt(byte[] value, int index, int end, boolean checked) {
|
||||
assert index < end;
|
||||
if (checked) {
|
||||
checkIndex(index, value);
|
||||
@@ -592,6 +593,77 @@ final class StringUTF16 {
|
||||
return -StringLatin1.compareToCI_UTF16(other, value);
|
||||
}
|
||||
|
||||
public static int compareToFC_Latin1(byte[] value, byte[] other) {
|
||||
return -StringLatin1.compareToFC_UTF16(other, value);
|
||||
}
|
||||
|
||||
private static int compareToFC0(byte[] value, int off, int last, byte[] other, int ooff, int olast) {
|
||||
int f1 = 0, f2 = 0;
|
||||
int k1 = off, k2 = ooff;
|
||||
while ((k1 < last || f1 != 0) && (k2 < olast || f2 != 0)) {
|
||||
int c1, c2;
|
||||
if (f1 != 0) {
|
||||
c1 = f1 & 0xffff; f1 >>>= 16;
|
||||
} else {
|
||||
c1 = StringUTF16.codePointAt(value, k1, last, true);
|
||||
k1 += Character.charCount(c1);
|
||||
var f = CaseFolding.fold(c1);
|
||||
if (CaseFolding.isSingleCodePoint(f)) {
|
||||
c1 = (int)(f & 0xfffff);
|
||||
} else {
|
||||
c1 = (int)(f & 0xffff);
|
||||
f1 = (int)(f >> 16);
|
||||
}
|
||||
}
|
||||
if (f2 != 0) {
|
||||
c2 = f2 & 0xffff; f2 >>>= 16;
|
||||
} else {
|
||||
c2 = StringUTF16.codePointAt(other, k2, olast, true);
|
||||
k2 += Character.charCount(c2);
|
||||
var f = CaseFolding.fold(c2);
|
||||
if (CaseFolding.isSingleCodePoint(f)) {
|
||||
c2 = (int)(f & 0xfffff);
|
||||
} else {
|
||||
c2 = (int)(f & 0xffff);
|
||||
f2 = (int)(f >>> 16);
|
||||
}
|
||||
}
|
||||
if (c1 != c2) {
|
||||
return c1 - c2;
|
||||
}
|
||||
}
|
||||
if (k1 < last || f1 != 0) {
|
||||
return 1;
|
||||
}
|
||||
if (k2 < olast || f2 != 0) {
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
public static int compareToFC(byte[] value, byte[] other) {
|
||||
int tlast = length(value);
|
||||
int olast = length(other);
|
||||
int lim = Math.min(tlast, olast);
|
||||
int k = 0;
|
||||
while (k < lim) {
|
||||
int cp1 = codePointAt(value, k, tlast, true);
|
||||
int cp2 = codePointAt(other, k, olast, true);
|
||||
if (cp1 != cp2) {
|
||||
long cf1 = CaseFolding.fold(cp1);
|
||||
long cf2 = CaseFolding.fold(cp2);
|
||||
if (cf1 != cf2) {
|
||||
if (!CaseFolding.isSingleCodePoint(cf1) || !CaseFolding.isSingleCodePoint(cf2)) {
|
||||
return compareToFC0(value, k, tlast, other, k, olast);
|
||||
}
|
||||
return (int) cf1 - (int) cf2;
|
||||
}
|
||||
}
|
||||
k += Character.charCount(cp1);
|
||||
}
|
||||
return tlast - olast;
|
||||
}
|
||||
|
||||
static int hashCode(byte[] value) {
|
||||
return ArraysSupport.hashCodeOfUTF16(value, 0, value.length >> 1, 0);
|
||||
}
|
||||
|
||||
@@ -43,8 +43,8 @@ import java.util.function.Predicate;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
import jdk.internal.lang.CaseFolding;
|
||||
import jdk.internal.util.ArraysSupport;
|
||||
import jdk.internal.util.regex.CaseFolding;
|
||||
import jdk.internal.util.regex.Grapheme;
|
||||
|
||||
/**
|
||||
|
||||
@@ -0,0 +1,208 @@
|
||||
/*
|
||||
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
package jdk.internal.lang;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
import static java.util.Map.entry;
|
||||
|
||||
/**
|
||||
* Utility class that handles Unicode case folding properties defined in
|
||||
* CaseFolding.txt, including 1:M full case folding.
|
||||
*/
|
||||
public final class CaseFolding {
|
||||
|
||||
private CaseFolding() {}
|
||||
|
||||
/**
|
||||
* Tests whether the specified code point has a folding mapping entry defined.
|
||||
*
|
||||
* @param cp
|
||||
* the Unicode code point to test
|
||||
* @return {@code true} if the given code point has a case folding mapping entry
|
||||
* defined in {@code caseFoldingMap}, {@code false} otherwise
|
||||
*/
|
||||
public static boolean isDefined(int cp) {
|
||||
return getDefined(cp) != -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the case-folded form of the specified code point according
|
||||
* to the Unicode case folding mappings.
|
||||
* <p>
|
||||
* If the code point has no case folding mapping defined, this method returns
|
||||
* the original code point.
|
||||
*
|
||||
* Possible combinations of the returning case-folding form as a long value
|
||||
*
|
||||
* +---+---------+--------+---------+--------+--------+
|
||||
* | 1:1 mapping | 0000 | 0000 | 000x | xxxx | 0041 => 0061 or 1E921 => 1E943
|
||||
* +---+---------+--------+---------+--------+--------+
|
||||
* | 1:2 mapping | 0002 | 0000 | xxxx | xxxx | FB02 => 0066 006C
|
||||
* +---+---------+--------+---------+--------+--------+
|
||||
* | 1:3 mapping | 0003 | xxxx | xxxx | xxxx | FB03 => 0066 0066 0069
|
||||
* +---+---------+--------+---------+--------+--------+
|
||||
*
|
||||
* @param cp
|
||||
* the Unicode code point to fold
|
||||
* @return a long value representing the case-folded form of the input
|
||||
* code point, encoded as shown in the table above
|
||||
*/
|
||||
public static long fold(int cp) {
|
||||
var fold = getDefined(cp);
|
||||
return fold == -1 ? cp : fold;
|
||||
}
|
||||
|
||||
public static boolean isSingleCodePoint(long fold) {
|
||||
return (fold >> 48) == 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an expansion set to "close" a given regex Unicode character class range for case-sensitive
|
||||
* matching, according to the
|
||||
* <a href="https://www.unicode.org/reports/tr18/#Simple_Loose_Matches">Simple Loose Matches</a>
|
||||
* rule defined in Unicode Technical Standard #18: Unicode Regular Expressions.
|
||||
* <p>
|
||||
* To conform with Level 1 of UTS #18, specifically RL1.5: Simple Loose Matches, simple case folding must
|
||||
* be applied to literals and (optionally) to character classes. When applied to character classes, each
|
||||
* character class is expected to be closed under simple case folding. See the standard for the
|
||||
* detailed explanation and example of "closed".
|
||||
* <p>
|
||||
* RL1.5 states: To meet this requirement, an implementation that supports case-sensitive matching should
|
||||
* <ol>
|
||||
* <li>Provide at least the simple, default Unicode case-insensitive matching, and</li>
|
||||
* <li>Specify which character properties or constructs are closed under the matching.</li>
|
||||
* </ol>
|
||||
* <p>
|
||||
* In the {@code Pattern} implementation, 5 types of constructs maybe case-sensitive when matching:
|
||||
* back-refs, string slice (sequences), single, family(char-property) and class range. Single and
|
||||
* family may appears independently or within a class.
|
||||
* <p>
|
||||
* For loose/case-insensitive matching, the back-refs, slices and singles apply {@code toUpperCase} and
|
||||
* {@code toLowerCase} to both the pattern and the input string. This effectively 'close' the class for
|
||||
* matching.
|
||||
* <p>
|
||||
* The family/char-properties are not "closed" and should remain unchanged. This is acceptable per RL1.5,
|
||||
* if their behavior is clearly specified.
|
||||
* <p>
|
||||
* This method addresses that requirement for the "range" construct within in character class by computing
|
||||
* the additional characters that should be included to close the range under simple case folding:
|
||||
* <p>
|
||||
* For each character in the input range {@code [start, end]} (inclusive), if the character has a simple
|
||||
* case folding mapping in Unicode's CaseFolding.txt, the mapping is not a round-trip map, and the mapped
|
||||
* character is not already in the range, then that mapped character (typically lowercase) is added to
|
||||
* the expansion set.
|
||||
* <p>
|
||||
* This allows regex character class "range" implementation to use the returned expansion set to support
|
||||
* additional case-insensitive matching, without duplicating characters already covered by the existing
|
||||
* regex range implementation. The expectation is the matching is done using both the uppercase and
|
||||
* lowercase forms of the input character, for example
|
||||
*
|
||||
* <pre>{@code
|
||||
*
|
||||
* ch -> inRange(lower, Character.toUpperCase(ch), upper) ||
|
||||
* inRange(lower, Character.toLower(ch), upper) ||
|
||||
* additionalClosingCharacters.contains(Character.toUpperCase(ch)) ||
|
||||
* additionalClosingCharacters.contains(Character.toUpperCase(ch))
|
||||
* }</pre>
|
||||
*
|
||||
* @param start the starting code point of the character range
|
||||
* @param end the ending code point of the character range
|
||||
* @return a {@code int[]} containing the all simple case equivalents of characters in the range, excluding
|
||||
* those already in the range
|
||||
* @spec https://www.unicode.org/reports/tr18/#Simple_Loose_Matches
|
||||
*/
|
||||
public static int[] getClassRangeClosingCharacters(int start, int end) {
|
||||
int[] expanded = new int[expanded_case_cps.length];
|
||||
int off = 0;
|
||||
for (int cp : expanded_case_cps) {
|
||||
if (cp >= start && cp <= end) {
|
||||
int folding = expanded_case_map.get(cp);
|
||||
if (folding < start || folding > end) {
|
||||
expanded[off++] = folding;
|
||||
}
|
||||
}
|
||||
}
|
||||
return Arrays.copyOf(expanded, off);
|
||||
}
|
||||
|
||||
private static final Map<Integer, Integer> expanded_case_map = Map.ofEntries(
|
||||
%%%Expanded_Case_Map_Entries
|
||||
);
|
||||
|
||||
private static final int[] expanded_case_cps = expanded_case_map.keySet()
|
||||
.stream()
|
||||
.mapToInt(Integer::intValue)
|
||||
.toArray();
|
||||
|
||||
private static final int HASH_CP = 0;
|
||||
private static final int HASH_INDEX = 1;
|
||||
private static final int HASH_NEXT = 2;
|
||||
|
||||
private static int[][] hashKeys(int[] keys) {
|
||||
var hashes = new int[keys.length << 1][3]; // cp + hash + next
|
||||
var off = keys.length;
|
||||
for (int i = 0; i < keys.length; i++) {
|
||||
var cp = keys[i];
|
||||
var hash = cp % keys.length;
|
||||
while (hashes[hash][HASH_CP] != 0) {
|
||||
var next = hashes[hash][HASH_NEXT];
|
||||
if (next == 0) {
|
||||
hashes[hash][HASH_NEXT] = off;
|
||||
hash = off++;
|
||||
break;
|
||||
} else {
|
||||
hash = next;
|
||||
}
|
||||
}
|
||||
hashes[hash][HASH_CP] = cp;
|
||||
hashes[hash][HASH_INDEX] = i;
|
||||
}
|
||||
return Arrays.copyOf(hashes, off);
|
||||
}
|
||||
|
||||
private static long getDefined(int cp) {
|
||||
var hashes = CASE_FOLDING_HASHES;
|
||||
var length = CASE_FOLDING_CPS.length; // hashed based on total defined.
|
||||
var hash = cp % length;
|
||||
while (hashes[hash][HASH_CP] != cp) {
|
||||
var next = hashes[hash][HASH_NEXT];
|
||||
if (next == 0) {
|
||||
return -1; // hash miss
|
||||
}
|
||||
hash = next;
|
||||
}
|
||||
var index = hashes[hash][HASH_INDEX];
|
||||
return CASE_FOLDING_VALUES[index];
|
||||
}
|
||||
|
||||
%%%Entries
|
||||
|
||||
private static final int[][] CASE_FOLDING_HASHES = hashKeys(CASE_FOLDING_CPS);
|
||||
}
|
||||
@@ -1,116 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
package jdk.internal.util.regex;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
|
||||
import static java.util.Map.entry;
|
||||
|
||||
public final class CaseFolding {

    // Code point -> closing character; the entries are substituted by the generator tool.
    private static final Map<Integer, Integer> expanded_case_map = Map.ofEntries(
%%%Entries
    );

    // Unboxed snapshot of the keys of expanded_case_map, taken once at class initialization.
    private static final int[] expanded_case_cps = expanded_case_map.keySet()
        .stream()
        .mapToInt(Integer::intValue)
        .toArray();

    private CaseFolding() {}

    /**
     * Computes the characters that have to be added to a regex character class
     * range so that the range is "closed" under simple case folding, as described by the
     * <a href="https://www.unicode.org/reports/tr18/#Simple_Loose_Matches">Simple Loose Matches</a>
     * rule (RL1.5) of Unicode Technical Standard #18: Unicode Regular Expressions.
     * <p>
     * RL1.5 requires simple case folding to be applied to literals and, optionally, to
     * character classes; a character class to which it is applied is expected to be closed
     * under simple case folding. An implementation also has to specify which character
     * properties or constructs are closed under the matching. See the standard for the
     * detailed explanation and an example of "closed".
     * <p>
     * In the {@code Pattern} implementation five kinds of constructs may be case-sensitive
     * when matching: back-refs, string slices (sequences), singles, families
     * (char-properties) and class ranges. Singles and families may appear on their own or
     * within a class. For loose matching, back-refs, slices and singles apply
     * {@code toUpperCase} and {@code toLowerCase} to both the pattern and the input, which
     * effectively closes them. Families are not closed and remain unchanged, which RL1.5
     * permits as long as the behavior is clearly specified.
     * <p>
     * This method covers the remaining "range" construct. Every character in
     * {@code [start, end]} (inclusive) that has an entry in the expansion table, and whose
     * mapped character lies outside the range, contributes that mapped character to the
     * result. The range implementation is expected to test both case forms of the input
     * character against the range and against the returned set, for example
     *
     * <pre>{@code
     *
     *     ch -> inRange(lower, Character.toUpperCase(ch), upper) ||
     *           inRange(lower, Character.toLowerCase(ch), upper) ||
     *           additionalClosingCharacters.contains(Character.toUpperCase(ch)) ||
     *           additionalClosingCharacters.contains(Character.toLowerCase(ch))
     * }</pre>
     *
     * @param   start the starting code point of the character range
     * @param   end the ending code point of the character range
     * @return  an {@code int[]} containing all the simple case equivalents of characters in the range,
     *          excluding those already in the range
     * @spec https://www.unicode.org/reports/tr18/#Simple_Loose_Matches
     */
    public static int[] getClassRangeClosingCharacters(int start, int end) {
        return Arrays.stream(expanded_case_cps)
            // keep only the table keys that fall inside the requested range ...
            .filter(key -> key >= start && key <= end)
            // ... translate each of them to its closing character ...
            .map(key -> expanded_case_map.get(key))
            // ... and drop the ones the range already covers
            .filter(closing -> closing < start || closing > end)
            .toArray();
    }
}
|
||||
329
test/jdk/java/lang/String/UnicodeCaseFoldingTest.java
Normal file
329
test/jdk/java/lang/String/UnicodeCaseFoldingTest.java
Normal file
@@ -0,0 +1,329 @@
|
||||
/*
|
||||
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @summary tests unicode case-folding based String comparison and equality
|
||||
* @bug 4397357
|
||||
* @library /lib/testlibrary/java/lang
|
||||
* @modules java.base/jdk.internal.lang:+open
|
||||
* @run junit/othervm
|
||||
* UnicodeCaseFoldingTest
|
||||
*/
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.ArrayList;
|
||||
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.Arguments;
|
||||
import org.junit.jupiter.params.provider.MethodSource;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import jdk.internal.lang.CaseFolding;
|
||||
|
||||
public class UnicodeCaseFoldingTest {
|
||||
|
||||
@Test
|
||||
void testAllCommnFullCodePointsListedInCaseFoldinigTxt() throws Throwable {
|
||||
var filter = "^.*; [CF]; .*$"; // C=common, F=full, for full case folding
|
||||
var results = Files.lines(UCDFiles.CASEFOLDING)
|
||||
.filter(line -> !line.startsWith("#") && line.matches(filter))
|
||||
.map(line -> {
|
||||
var fields = line.split("; ");
|
||||
var cp = Integer.parseInt(fields[0], 16);
|
||||
fields = fields[2].trim().split(" ");
|
||||
var folding = new int[fields.length];
|
||||
for (int i = 0; i < folding.length; i++) {
|
||||
folding[i] = Integer.parseInt(fields[i], 16);
|
||||
}
|
||||
var source = new String(Character.toChars(cp));
|
||||
var expected = new String(folding, 0, folding.length);
|
||||
// (1) Verify the folding result matches expected
|
||||
assertEquals(expected, foldCase(source), "CaseFolding.fold(): ");
|
||||
|
||||
// (2) Verify compareToFoldCase() result
|
||||
assertEquals(0, source.compareToFoldCase(expected), "source.compareToFoldCase(expected)");
|
||||
assertEquals(0, expected.compareToFoldCase(source), "expected.compareToFoldCase(source)");
|
||||
|
||||
// (3) Verify equalsFoldCase() result
|
||||
assertEquals(true, source.equalsFoldCase(expected), "source.equalsFoldCase(expected)");
|
||||
assertEquals(true, expected.equalsFoldCase(source), "expected.equalsFoldCase(source)");
|
||||
return null;
|
||||
})
|
||||
.filter(error -> error != null)
|
||||
.toArray();
|
||||
assertEquals(0, results.length);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testAllSimpleCodePointsListedInCaseFoldinigTxt() throws Throwable {
|
||||
// S=simple, for simple case folding. The simple case folding should still matches
|
||||
var filter = "^.*; [S]; .*$";
|
||||
var results = Files.lines(UCDFiles.CASEFOLDING)
|
||||
.filter(line -> !line.startsWith("#") && line.matches(filter))
|
||||
.map(line -> {
|
||||
var fields = line.split("; ");
|
||||
var cp = Integer.parseInt(fields[0], 16);
|
||||
fields = fields[2].trim().split(" ");
|
||||
var folding = new int[fields.length];
|
||||
for (int i = 0; i < folding.length; i++) {
|
||||
folding[i] = Integer.parseInt(fields[i], 16);
|
||||
}
|
||||
var source = new String(Character.toChars(cp));
|
||||
var expected = new String(folding, 0, folding.length);
|
||||
|
||||
// (1) Verify compareToFoldCase() result
|
||||
assertEquals(0, source.compareToFoldCase(expected), "source.compareToFoldCase(expected)");
|
||||
assertEquals(0, expected.compareToFoldCase(source), "expected.compareToFoldCase(source)");
|
||||
|
||||
// (2) Verify equalsFoldCase() result
|
||||
assertEquals(true, source.equalsFoldCase(expected), "source.equalsFoldCase(expected)");
|
||||
assertEquals(true, expected.equalsFoldCase(source), "expected.equalsFoldCase(source)");
|
||||
return null;
|
||||
})
|
||||
.filter(error -> error != null)
|
||||
.toArray();
|
||||
assertEquals(0, results.length);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAllCodePointsFoldToThemselvesIfNotListed() throws Exception {
|
||||
// Collect all code points that appear in CaseFolding.txt
|
||||
var listed = Files.lines(UCDFiles.CASEFOLDING)
|
||||
.filter(line -> !line.startsWith("#") && line.matches("^.*; [CF]; .*$"))
|
||||
.map(line -> Integer.parseInt(line.split("; ")[0], 16))
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
var failures = new ArrayList<String>();
|
||||
|
||||
// Scan BMP + Supplementary Plane 1 (U+0000..U+1FFFF)
|
||||
for (int cp = Character.MIN_CODE_POINT; cp <= 0x1FFFF; cp++) {
|
||||
if (!Character.isDefined(cp)) {
|
||||
continue; // skip undefined
|
||||
}
|
||||
if (Character.isSurrogate((char) cp)) {
|
||||
continue; // skip surrogate code units
|
||||
}
|
||||
if (listed.contains(cp)) {
|
||||
continue; // already tested separately
|
||||
}
|
||||
String s = new String(Character.toChars(cp));
|
||||
String folded = foldCase(s);
|
||||
if (!s.equals(folded)) {
|
||||
failures.add(String.format("Unexpected folding: U+%04X '%s' → '%s'", cp, s, folded));
|
||||
}
|
||||
}
|
||||
|
||||
assertEquals(0, failures.size(),
|
||||
() -> "Some unlisted code points folded unexpectedly:\n"
|
||||
+ String.join("\n", failures));
|
||||
}
|
||||
|
||||
@ParameterizedTest(name = "CaseFold \"{0}\" → \"{1}\"")
|
||||
@MethodSource("caseFoldTestCases")
|
||||
void testIndividualCaseFolding(String input, String expected) {
|
||||
assertEquals(expected, foldCase(input));
|
||||
}
|
||||
|
||||
static Stream<Arguments> caseFoldTestCases() {
|
||||
return Stream.of(
|
||||
// ASCII simple cases
|
||||
Arguments.of("ABC", "abc"),
|
||||
Arguments.of("already", "already"),
|
||||
Arguments.of("MiXeD123", "mixed123"),
|
||||
// --- Latin-1 to non-Latin-1 fold ---
|
||||
Arguments.of("aBc\u00B5Efg", "abc\u03BCefg"), // "µ" → "μ"
|
||||
Arguments.of("test\u00B5\ud801\udc00X", "test\u03bc\ud801\udc28x"),
|
||||
// German Eszett
|
||||
Arguments.of("Stra\u00DFe", "strasse"), // "Straße"
|
||||
Arguments.of("\u1E9E", "ss"), // "ẞ" capital sharp S
|
||||
// Turkish dotted I / dotless i
|
||||
Arguments.of("I", "i"),
|
||||
Arguments.of("\u0130", "i\u0307"), // capital dotted I → "i + dot above"
|
||||
Arguments.of("\u0069\u0307", "i\u0307"), // small i + dot above remains
|
||||
Arguments.of("\u0131", "\u0131"), // "ı" (dotless i stays dotless)
|
||||
|
||||
// Greek special cases ---
|
||||
Arguments.of("\u039F\u03A3", "\u03BF\u03C3"), // "ΟΣ" → "οσ" final sigma always folds to normal sigma
|
||||
Arguments.of("\u1F88", "\u1F00\u03B9"), // "ᾈ" → "ἀι" Alpha with psili + ypogegrammeni
|
||||
Arguments.of("\u039C\u03AC\u03CA\u03BF\u03C2", "\u03BC\u03AC\u03CA\u03BF\u03C3"), // "Μάϊος" → "μάϊοσ"
|
||||
Arguments.of("\u1F08", "\u1F00"), // Ἀ (Capital Alpha with psili) → ἀ
|
||||
|
||||
// Supplementary Plane characters
|
||||
Arguments.of("\uD801\uDC00", "\uD801\uDC28"), // Deseret Capital Letter Long I → Small
|
||||
Arguments.of("\uD801\uDC01", "\uD801\uDC29"), // Deseret Capital Letter Long E → Small
|
||||
|
||||
// Supplementary inside ASCII
|
||||
Arguments.of("abc\uD801\uDC00def", "abc\uD801\uDC28def"),
|
||||
// Ligatures and compatibility folds
|
||||
Arguments.of("\uFB00", "ff"), // ff → ff
|
||||
Arguments.of("\uFB03", "ffi"), // ffi → ffi
|
||||
Arguments.of("\u212A", "k"), // Kelvin sign → k
|
||||
|
||||
Arguments.of("abc\uFB00def", "abcffdef"), // ff → ff
|
||||
Arguments.of("abc\uFB03def", "abcffidef"), // ffi → ffi
|
||||
Arguments.of("abc\u212Adef", "abckdef"), // Kelvin sign → k
|
||||
|
||||
// --- Fullwidth ---
|
||||
Arguments.of("\uFF21\uFF22\uFF23", "\uFF41\uFF42\uFF43"), // "ABC" → "abc"
|
||||
|
||||
// --- Armenian ---
|
||||
Arguments.of("\u0531", "\u0561"), // "Ա" → "ա"
|
||||
|
||||
// --- Cherokee ---
|
||||
Arguments.of("\u13A0", "\u13A0"), // Capital Cherokee A folds to itself
|
||||
Arguments.of("\uAB70", "\u13A0") // Small Cherokee A folds Capital Cherokee A
|
||||
);
|
||||
}
|
||||
|
||||
static Stream<Arguments> caseFoldEqualProvider() {
|
||||
return Stream.of(
|
||||
Arguments.of("abc", "ABC"),
|
||||
Arguments.of("aBcDe", "AbCdE"),
|
||||
Arguments.of("\u00C0\u00E7", "\u00E0\u00C7"), // Àç vs àÇ
|
||||
Arguments.of("straße", "STRASSE"), // ß → ss
|
||||
Arguments.of("\uD83C\uDDE6", "\uD83C\uDDE6"), // 🇦 vs 🇦
|
||||
Arguments.of("\u1E9E", "ss"), // ẞ (capital sharp S)
|
||||
Arguments.of("\u03A3", "\u03C3"), // Σ vs σ (Greek Sigma)
|
||||
Arguments.of("\u03C3", "\u03C2"), // σ vs ς (Greek sigma/final sigma)
|
||||
Arguments.of("\u212B", "\u00E5"), // Å (Angstrom sign) vs å
|
||||
Arguments.of("\uFB00", "ff"), // ff (ligature)
|
||||
Arguments.of("\u01C5", "\u01C5"), // Dž (Latin capital D with small z with caron)
|
||||
Arguments.of("Caf\u00E9", "CAF\u00C9"), // Café vs CAFÉ
|
||||
Arguments.of("\u03BA\u03B1\u03BB\u03B7\u03BC\u03AD\u03C1\u03B1", "\u039A\u0391\u039B\u0397\u039C\u0388\u03A1\u0391"), // καλημέρα vs ΚΑΛΗΜΕΡΑ
|
||||
Arguments.of("\u4E2D\u56FD", "\u4E2D\u56FD"), // 中国
|
||||
Arguments.of("\u03B1", "\u0391"), // α vs Α (Greek alpha)
|
||||
Arguments.of("\u212B", "\u00C5"), // Å vs Å
|
||||
// from StringCompareToIgnoreCase
|
||||
Arguments.of("\u0100\u0102\u0104\u0106\u0108", "\u0100\u0102\u0104\u0106\u0109"), // ĀĂĄĆĈ vs ĀĂĄĆĉ
|
||||
Arguments.of("\u0101\u0103\u0105\u0107\u0109", "\u0100\u0102\u0104\u0106\u0109"), // āăąćĉ vs ĀĂĄĆĉ
|
||||
Arguments.of("\ud801\udc00\ud801\udc01\ud801\udc02\ud801\udc03\ud801\udc04",
|
||||
"\ud801\udc00\ud801\udc01\ud801\udc02\ud801\udc03\ud801\udc2c"), // 𐐀𐐁𐐂𐐃𐐄 vs 𐐀𐐁𐐂𐐃𐐬
|
||||
Arguments.of("\ud801\udc28\ud801\udc29\ud801\udc2a\ud801\udc2b\ud801\udc2c",
|
||||
"\ud801\udc00\ud801\udc01\ud801\udc02\ud801\udc03\ud801\udc2c") // 𐐨𐐩𐐪𐐫𐐬 vs 𐐀𐐁𐐂𐐃𐐬
|
||||
);
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@MethodSource("caseFoldEqualProvider")
|
||||
void testcompareToFoldCaseEquals(String s1, String s2) {
|
||||
assertEquals(0, s1.compareToFoldCase(s2));
|
||||
assertEquals(0, s2.compareToFoldCase(s1));
|
||||
assertEquals(true, s1.equalsFoldCase(s2));
|
||||
assertEquals(true, s2.equalsFoldCase(s1));
|
||||
assertEquals(foldCase(s1), foldCase(s2));
|
||||
}
|
||||
|
||||
static Stream<Arguments> caseFoldOrderingProvider() {
|
||||
return Stream.of(
|
||||
Arguments.of("asa", "aß", -1), // ß → ss → "asa" < "ass"
|
||||
Arguments.of("aß", "asa", +1),
|
||||
Arguments.of("a\u00DF", "ass", 0), // aß vs ass
|
||||
Arguments.of("\uFB03", "ffi", 0), // ffi (ligature)
|
||||
Arguments.of("\u00C5", "Z", 1), // Å vs Z
|
||||
Arguments.of("A", "\u00C0", -1), // A vs À
|
||||
Arguments.of("\u03A9", "\u03C9", 0), // Ω vs ω
|
||||
Arguments.of("\u03C2", "\u03C3", 0), // ς vs σ
|
||||
Arguments.of("\uD835\uDD23", "R", 1), // 𝔯 (fraktur r) vs R
|
||||
Arguments.of("\uFF26", "E", 1), // F (full-width F) vs E
|
||||
Arguments.of("\u00C9clair", "Eclair", 1), // Éclair vs Eclair
|
||||
Arguments.of("\u03bc\u00df", "\u00b5s", 1),
|
||||
Arguments.of("\u00b5s", "\u03bc\u00df", -1)
|
||||
);
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@MethodSource("caseFoldOrderingProvider")
|
||||
void testcompareToFoldCaseOrdering(String s1, String s2, int expectedSign) {
|
||||
int cmp = s1.compareToFoldCase(s2);
|
||||
assertEquals(expectedSign, Integer.signum(cmp));
|
||||
}
|
||||
|
||||
static Stream<Arguments> roundTripProvider() {
|
||||
return Stream.of(
|
||||
Arguments.of("abc"),
|
||||
Arguments.of("ABC"),
|
||||
Arguments.of("straße"),
|
||||
Arguments.of("Àç"),
|
||||
Arguments.of("aß"),
|
||||
Arguments.of("\uFB02uff"), // fluff (ligature in "fluff")
|
||||
Arguments.of("\u00C9COLE") // ÉCOLE
|
||||
);
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@MethodSource("roundTripProvider")
|
||||
void testCaseFoldRoundTrip(String s) {
|
||||
String folded = foldCase(s);
|
||||
assertEquals(0, s.compareToFoldCase(folded));
|
||||
assertEquals(0, folded.compareToFoldCase(s));
|
||||
assertEquals(true, s.equalsFoldCase(folded));
|
||||
assertEquals(true, folded.equalsFoldCase(s));
|
||||
}
|
||||
|
||||
// helper to test the integrity of folding mapping
|
||||
private static int[] longToFolding(long value) {
|
||||
int len = (int) (value >>> 48);
|
||||
if (len == 0) {
|
||||
return new int[]{(int) (value & 0xFFFFF)};
|
||||
} else {
|
||||
var folding = new int[len];
|
||||
for (int i = 0; i < len; i++) {
|
||||
folding[i] = (int) (value & 0xFFFF);
|
||||
value >>= 16;
|
||||
}
|
||||
return folding;
|
||||
}
|
||||
}
|
||||
|
||||
private static String foldCase(String s) {
|
||||
int first;
|
||||
int len = s.length();
|
||||
int cpCnt = 1;
|
||||
for (first = 0; first < len; first += cpCnt) {
|
||||
int cp = s.codePointAt(first);
|
||||
if (CaseFolding.isDefined(cp)) {
|
||||
break;
|
||||
}
|
||||
cpCnt = Character.charCount(cp);
|
||||
}
|
||||
if (first == len) {
|
||||
return s;
|
||||
}
|
||||
StringBuilder sb = new StringBuilder(len);
|
||||
sb.append(s, 0, first);
|
||||
for (int i = first; i < len; i += cpCnt) {
|
||||
int cp = s.codePointAt(i);
|
||||
int[] folded = longToFolding(CaseFolding.fold(cp));
|
||||
for (int f : folded) {
|
||||
sb.appendCodePoint(f);
|
||||
}
|
||||
cpCnt = Character.charCount(cp);
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,200 @@
|
||||
/*
|
||||
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
package org.openjdk.bench.java.lang;
|
||||
|
||||
import org.openjdk.jmh.annotations.*;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/*
|
||||
* This benchmark naively explores String::compareToFoldCase performance
|
||||
*/
|
||||
@BenchmarkMode(Mode.AverageTime)
|
||||
@OutputTimeUnit(TimeUnit.NANOSECONDS)
|
||||
@State(Scope.Thread)
|
||||
@Warmup(iterations = 5, time = 1)
|
||||
@Measurement(iterations = 5, time = 1)
|
||||
@Fork(3)
|
||||
public class StringCompareToFoldCase {
|
||||
|
||||
private String asciiUpper = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
|
||||
private String asciiUpperLower = "ABCDEFGHIJKLMNOpqrstuvwxyz";
|
||||
private String asciiLower = "abcdefghijklmnopqrstuvwxyz";
|
||||
|
||||
private String asciiWithDF = "abcdßßßßßßßßßßßßßßßßWXYZ";
|
||||
private String asciiWithDFSS = "abcdssssssssssssssssßßßßßßßßWXYZ";
|
||||
|
||||
private String asciiLatine1 = "ABCDEFGHIJKLMNOpqrstuvwxyz0";
|
||||
private String asciiLatin1UTF16 = "abcdefghijklmnopqrstuvwxyz\u0391";
|
||||
|
||||
private String greekUpper = "\u0391\u0392\u0393\u0394\u0395\u0391\u0392\u0393\u0394\u0395"; // ΑΒΓΔΕ
|
||||
private String greekUpperLower = "\u0391\u0392\u0393\u0394\u0395\u0391\u0392\u0393\u0394\u03B5"; // ΑΒΓΔε
|
||||
private String greekLower = "\u03B1\u03B2\u03B3\u03B4\u03B5\u03B1\u03B2\u03B3\u03B4\u03B5"; // αβγδε
|
||||
|
||||
public String supUpper = "\ud801\udc00\ud801\udc01\ud801\udc02\ud801\udc03\ud801\udc04";
|
||||
public String supUpperLower = "\ud801\udc00\ud801\udc01\ud801\udc02\ud801\udc03\ud801\udc2c";
|
||||
public String supLower = "\ud801\udc28\ud801\udc29\ud801\udc2a\ud801\udc2b\ud801\udc2c";
|
||||
|
||||
@Benchmark
|
||||
public int asciiUpperLower() {
|
||||
return asciiUpper.compareToIgnoreCase(asciiUpperLower);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public int asciiLower() {
|
||||
return asciiUpper.compareToIgnoreCase(asciiLower);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public int greekUpperLower() {
|
||||
return greekUpper.compareToIgnoreCase(greekUpperLower);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public int greekLower() {
|
||||
return greekUpper.compareToIgnoreCase(greekLower);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public int latin1UTF16() {
|
||||
return asciiLatine1.compareToIgnoreCase(asciiLatin1UTF16);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public int supUpperLower() {
|
||||
return supUpper.compareToIgnoreCase(supUpperLower);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public int supLower() {
|
||||
return supUpper.compareToIgnoreCase(supLower);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public int asciiUpperLowerFC() {
|
||||
return asciiUpper.compareToFoldCase(asciiUpperLower);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public int asciiLowerFC() {
|
||||
return asciiUpper.compareToFoldCase(asciiLower);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public int asciiWithDFFC() {
|
||||
return asciiWithDF.compareToFoldCase(asciiWithDFSS);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public int greekUpperLowerFC() {
|
||||
return greekUpper.compareToFoldCase(greekUpperLower);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public int greekLowerFC() {
|
||||
return greekUpper.compareToFoldCase(greekLower);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public int latin1UTF16FC() {
|
||||
return asciiLatine1.compareToFoldCase(asciiLatin1UTF16); }
|
||||
|
||||
@Benchmark
|
||||
public int supUpperLowerFC() {
|
||||
return supUpper.compareToFoldCase(supUpperLower);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public int supLowerFC() {
|
||||
return supUpper.compareToFoldCase(supLower);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public boolean asciiUpperLowerEQ() {
|
||||
return asciiUpper.equalsIgnoreCase(asciiUpperLower);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public boolean asciiLowerEQ() {
|
||||
return asciiUpper.equalsIgnoreCase(asciiLower);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public boolean greekUpperLowerEQ() {
|
||||
return greekUpper.equalsIgnoreCase(greekUpperLower);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public boolean greekLowerEQ() {
|
||||
return greekUpper.equalsIgnoreCase(greekLower);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public boolean latin1UTF16EQ() {
|
||||
return asciiLatine1.equalsIgnoreCase(asciiLatin1UTF16);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public boolean supUpperLowerEQ() {
|
||||
return supUpper.equalsIgnoreCase(supUpperLower);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public boolean supLowerEQ() {
|
||||
return supUpper.equalsIgnoreCase(supLower);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public boolean asciiUpperLowerEQFC() {
|
||||
return asciiUpper.equalsFoldCase(asciiUpperLower);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public boolean asciiLowerEQFC() {
|
||||
return asciiUpper.equalsFoldCase(asciiLower);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public boolean greekUpperLowerEQFC() {
|
||||
return greekUpper.equalsFoldCase(greekUpperLower);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public boolean greekLowerEQFC() {
|
||||
return greekUpper.equalsFoldCase(greekLower);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public boolean latin1UTF16EQFC() {
|
||||
return asciiLatine1.equalsFoldCase(asciiLatin1UTF16);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public boolean supUpperLowerEQFC() {
|
||||
return supUpper.equalsFoldCase(supUpperLower);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public boolean supLowerEQFC() {
|
||||
return supUpper.equalsFoldCase(supLower);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user