8365675: Add String Unicode Case-Folding Support

Reviewed-by: rriggs, naoto, ihse
This commit is contained in:
Xueming Shen
2025-12-02 19:47:18 +00:00
parent 618732ffc0
commit b97ed667db
13 changed files with 1245 additions and 212 deletions

View File

@@ -79,7 +79,7 @@ TOOL_GENERATEEXTRAPROPERTIES = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_too
build.tools.generateextraproperties.GenerateExtraProperties
TOOL_GENERATECASEFOLDING = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \
build.tools.generatecharacter.CaseFolding
build.tools.generatecharacter.GenerateCaseFolding
TOOL_MAKEZIPREPRODUCIBLE = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \
build.tools.makezipreproducible.MakeZipReproducible

View File

@@ -1,73 +0,0 @@
/*
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package build.tools.generatecharacter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * Build tool that expands a Java source template with the case folding
 * entries read from a Unicode {@code CaseFolding.txt} data file.
 * <p>
 * Only the entries with status {@code C}, {@code T} or {@code S} are read, and
 * only those whose folded code point does not map back to the original code
 * point through {@code Character.toUpperCase}/{@code Character.toLowerCase}
 * are emitted.
 */
public class CaseFolding {

    /**
     * Generates the source file.
     *
     * @param args {@code args[0]} the template file, {@code args[1]} the
     *             {@code CaseFolding.txt} data file, {@code args[2]} the
     *             file to generate (created or truncated)
     * @throws Throwable if any of the files cannot be read or written, or
     *                   if a selected data line cannot be parsed
     */
    public static void main(String[] args) throws Throwable {
        if (args.length != 3) {
            System.err.println("Usage: java CaseFolding TemplateFile CaseFolding.txt CaseFolding.java");
            System.exit(1);
        }
        var templateFile = Paths.get(args[0]);
        var caseFoldingTxt = Paths.get(args[1]);
        var genSrcFile = Paths.get(args[2]);
        var supportedTypes = "^.*; [CTS]; .*$";

        // Files.lines keeps the file open until the stream is closed, so
        // every use is wrapped in try-with-resources.
        final String caseFoldingEntries;
        try (var lines = Files.lines(caseFoldingTxt)) {
            caseFoldingEntries = lines
                .filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
                .map(line -> {
                    String[] cols = line.split("; ");
                    return new String[] {cols[0], cols[1], cols[2]};
                })
                .filter(cols -> {
                    // the folding case doesn't map back to the original char.
                    var cp1 = Integer.parseInt(cols[0], 16);
                    var cp2 = Integer.parseInt(cols[2], 16);
                    return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
                })
                .map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2]))
                .collect(Collectors.joining(",\n", "", ""));
        }

        // hack, hack, hack! the logic does not pick 0131. just add manually to support 'I's.
        // 0049; T; 0131; # LATIN CAPITAL LETTER I
        final String T_0x0131_0x49 = String.format(" entry(0x%04x, 0x%04x),\n", 0x0131, 0x49);

        // Generate .java file
        try (var templateLines = Files.lines(templateFile)) {
            Files.write(
                genSrcFile,
                templateLines
                    .map(line -> line.contains("%%%Entries") ? T_0x0131_0x49 + caseFoldingEntries : line)
                    .collect(Collectors.toList()),
                StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
        }
    }
}

View File

@@ -0,0 +1,134 @@
/*
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package build.tools.generatecharacter;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.Arrays;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
/**
 * Build tool that expands a {@code CaseFolding.java} template with the data
 * read from a Unicode {@code CaseFolding.txt} file.
 * <p>
 * Two placeholders are replaced in the template:
 * <ul>
 * <li>{@code %%%Entries} becomes two parallel arrays, {@code CASE_FOLDING_CPS}
 *     and {@code CASE_FOLDING_VALUES}, built from the status {@code C} and
 *     {@code F} entries (full case folding, including 1:M mappings);</li>
 * <li>{@code %%%Expanded_Case_Map_Entries} becomes a list of
 *     {@code entry(cp, folding)} pairs built from the status {@code C},
 *     {@code T} and {@code S} entries whose folding does not map back to the
 *     original code point.</li>
 * </ul>
 */
public class GenerateCaseFolding {

    /** Largest number of 16-bit values a folding can hold in the packed long. */
    private static final int MAX_FOLDING_UNITS = 3;

    /**
     * Generates the source file.
     *
     * @param args {@code args[0]} the template file, {@code args[1]} the
     *             {@code CaseFolding.txt} data file, {@code args[2]} the
     *             file to generate (created or truncated)
     * @throws Throwable if any of the files cannot be read or written, or
     *                   if a selected data line cannot be parsed
     */
    public static void main(String[] args) throws Throwable {
        if (args.length != 3) {
            System.err.println("Usage: java GenerateCaseFolding TemplateFile CaseFolding.txt CaseFolding.java");
            System.exit(1);
        }
        var templateFile = Paths.get(args[0]);
        var caseFoldingTxt = Paths.get(args[1]);
        var genSrcFile = Paths.get(args[2]);

        // Files.lines keeps the file open until the stream is closed, so
        // every use below is wrapped in try-with-resources.

        // java.lang
        var supportedTypes = "^.*; [CF]; .*$"; // full/1:M case folding
        final String[][] caseFoldings;
        try (var lines = Files.lines(caseFoldingTxt)) {
            caseFoldings = lines
                .filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
                .map(line -> {
                    // "<cp>; <status>; <folding code points>; # <name>"
                    var fields = line.split("; ");
                    var cp = fields[0];
                    fields = fields[2].trim().split(" ");
                    // element 0 is the source code point, the rest its folding
                    var folding = new String[fields.length + 1];
                    folding[0] = cp;
                    System.arraycopy(fields, 0, folding, 1, fields.length);
                    return folding;
                })
                .toArray(size -> new String[size][]);
        }

        // util.regex
        var expandedSupportedTypes = "^.*; [CTS]; .*$";
        final String expanded_caseFoldingEntries;
        try (var lines = Files.lines(caseFoldingTxt)) {
            expanded_caseFoldingEntries = lines
                .filter(line -> !line.startsWith("#") && line.matches(expandedSupportedTypes))
                .map(line -> {
                    String[] cols = line.split("; ");
                    return new String[]{cols[0], cols[1], cols[2]};
                })
                .filter(cols -> {
                    // the folding case doesn't map back to the original char.
                    var cp1 = Integer.parseInt(cols[0], 16);
                    var cp2 = Integer.parseInt(cols[2], 16);
                    return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
                })
                .map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2]))
                .collect(Collectors.joining(",\n", "", ""));
        }

        // hack, hack, hack! the logic does not pick 0131. just add manually to support 'I's.
        // 0049; T; 0131; # LATIN CAPITAL LETTER I
        final String T_0x0131_0x49 = String.format(" entry(0x%04x, 0x%04x),\n", 0x0131, 0x49);

        try (var templateLines = Files.lines(templateFile)) {
            Files.write(
                genSrcFile,
                templateLines
                    .map(line -> line.contains("%%%Entries") ? genFoldingEntries(caseFoldings) : line)
                    .map(line -> line.contains("%%%Expanded_Case_Map_Entries") ? T_0x0131_0x49 + expanded_caseFoldingEntries : line)
                    .collect(Collectors.toList()),
                StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
        }
    }

    /**
     * Packs one folding into a long.
     * <p>
     * A 1:1 folding is stored as the folded code point itself. For a BMP
     * source code point with more than one folded value, the values occupy
     * successive 16-bit fields starting at bit 0 and the number of values is
     * stored from bit 48 upwards.
     *
     * @param folding element 0 is the source code point, elements 1.. are the
     *                folded code points, all as hexadecimal strings
     * @return the packed folding
     * @throws IllegalArgumentException if the folding has more values than
     *         fit below the count field
     */
    private static long foldingToLong(String[] folding) {
        int cp = Integer.parseInt(folding[0], 16);
        long value = (long)Integer.parseInt(folding[1], 16);
        if (!Character.isSupplementaryCodePoint(cp) && folding.length != 2) {
            if (folding.length - 1 > MAX_FOLDING_UNITS) {
                throw new IllegalArgumentException(
                    "Folding of " + folding[0] + " has more than "
                    + MAX_FOLDING_UNITS + " code points");
            }
            // Each further value goes into the next 16-bit field. The shift
            // advances by 16 (not by doubling) so that it can never wrap to 0.
            var shift = 16;
            for (int j = 2; j < folding.length; j++) {
                value |= (long)Integer.parseInt(folding[j], 16) << shift;
                shift += 16;
            }
            value = value | (long) (folding.length - 1) << 48;
        }
        return value;
    }

    /**
     * Renders the {@code CASE_FOLDING_CPS} and {@code CASE_FOLDING_VALUES}
     * array declarations; the two arrays are parallel, in the order of
     * {@code foldings}.
     *
     * @param foldings the parsed foldings, one per source code point
     * @return the Java source text of both array declarations
     */
    private static String genFoldingEntries(String[][] foldings) {
        StringBuilder sb = new StringBuilder();
        sb.append(" private static final int[] CASE_FOLDING_CPS = {\n");
        int width = 10;
        for (int i = 0; i < foldings.length; i++) {
            if (i % width == 0)
                sb.append(" ");
            sb.append(String.format("0X%s", foldings[i][0]));
            if (i < foldings.length - 1)
                sb.append(", ");
            if (i % width == width - 1 || i == foldings.length - 1)
                sb.append("\n");
        }
        sb.append(" };\n\n");
        sb.append(" private static final long[] CASE_FOLDING_VALUES = {\n");
        width = 6;
        for (int i = 0; i < foldings.length; i++) {
            if (i % width == 0)
                sb.append(" "); // indent
            sb.append(String.format("0x%013xL", foldingToLong(foldings[i])));
            if (i < foldings.length - 1)
                sb.append(", ");
            if (i % width == width - 1 || i == foldings.length - 1) {
                sb.append("\n");
            }
        }
        sb.append(" };\n");
        return sb.toString();
    }
}

View File

@@ -72,5 +72,22 @@ TARGETS += $(GENSRC_CHARACTERDATA)
################################################################################
GENSRC_STRINGCASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/lang/CaseFolding.java
STRINGCASEFOLDING_TEMPLATE := $(MODULE_SRC)/share/classes/jdk/internal/lang/CaseFolding.java.template
CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt
$(GENSRC_STRINGCASEFOLDING): $(BUILD_TOOLS_JDK) $(STRINGCASEFOLDING_TEMPLATE) $(CASEFOLDINGTXT)
$(call LogInfo, Generating $@)
$(call MakeTargetDir)
$(TOOL_GENERATECASEFOLDING) \
$(STRINGCASEFOLDING_TEMPLATE) \
$(CASEFOLDINGTXT) \
$(GENSRC_STRINGCASEFOLDING)
TARGETS += $(GENSRC_STRINGCASEFOLDING)
endif # include guard
include MakeIncludeEnd.gmk

View File

@@ -50,22 +50,5 @@ TARGETS += $(GENSRC_INDICCONJUNCTBREAK)
################################################################################
GENSRC_CASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/util/regex/CaseFolding.java
CASEFOLDINGTEMP := $(MODULE_SRC)/share/classes/jdk/internal/util/regex/CaseFolding.java.template
CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt
$(GENSRC_CASEFOLDING): $(BUILD_TOOLS_JDK) $(CASEFOLDINGTEMP) $(CASEFOLDINGTXT)
$(call LogInfo, Generating $@)
$(call MakeTargetDir)
$(TOOL_GENERATECASEFOLDING) \
$(CASEFOLDINGTEMP) \
$(CASEFOLDINGTXT) \
$(GENSRC_CASEFOLDING)
TARGETS += $(GENSRC_CASEFOLDING)
################################################################################
endif # include guard
include MakeIncludeEnd.gmk

View File

@@ -117,9 +117,38 @@ import sun.nio.cs.UTF_8;
* Unicode code points (i.e., characters), in addition to those for
* dealing with Unicode code units (i.e., {@code char} values).
*
* <p>Unless otherwise noted, methods for comparing Strings do not take locale
* into account. The {@link java.text.Collator} class provides methods for
* finer-grain, locale-sensitive String comparison.
* <p><b>String comparison and case-insensitive matching</b>
*
* <p>There are several related ways to compare {@code String} values; choose
* the one whose semantics fit your purpose:
*
* <ul>
* <li><b>Exact content equality</b> {@link #equals(Object)} checks that two
* strings contain the identical char sequence of UTF-16 code units. This is
* a strict, case-sensitive comparison suitable for exact matching, hashing
* and any situation that requires bit-for-bit stability.</li>
*
* <li><b>Simple case-insensitive equality</b> {@link #equalsIgnoreCase(String)}
* (and the corresponding {@link #compareToIgnoreCase(String)} and {@link #CASE_INSENSITIVE_ORDER})
* performs a per-code-point, locale-independent comparison using
* {@link Character#toUpperCase(int)} and {@link Character#toLowerCase(int)}.
* It is convenient for many common case-insensitive checks.</li>
*
* <li><b>Unicode case-folded equivalence</b> {@link #equalsFoldCase(String)}
* (and the corresponding {@link #compareToFoldCase(String)} and {@link #UNICODE_CASEFOLD_ORDER})
* implement the Unicode <em>{@index "full case folding"}</em> rules defined in
* <a href="https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt">Unicode CaseFolding.txt</a>.
* Case folding is locale-independent and language-neutral and may map a single code
* point to multiple code points (1:M mappings). For example, the German sharp
* s ({@code U+00DF}) is folded to the sequence {@code "ss"}.
* Use these methods when you need Unicode-compliant
* <a href="https://www.unicode.org/versions/latest/core-spec/chapter-5/#G21790">
* caseless matching</a>, searching, or ordering.</li>
* </ul>
*
* <p>Unless otherwise noted, methods for comparing Strings do not take locale into
* account. The {@link java.text.Collator} class provides methods for finer-grain,
* locale-sensitive String comparison.
*
* @implNote The implementation of the string concatenation operator is left to
* the discretion of a Java compiler, as long as the compiler ultimately conforms
@@ -2179,6 +2208,7 @@ public final class String
* false} otherwise
*
* @see #equals(Object)
* @see #equalsFoldCase(String)
* @see #codePoints()
*/
public boolean equalsIgnoreCase(String anotherString) {
@@ -2188,6 +2218,57 @@ public final class String
&& regionMatches(true, 0, anotherString, 0, length());
}
/**
* Compares this {@code String} to another {@code String} for equality,
* using <em>{@index "Unicode case folding"}</em>. Two strings are considered equal
* by this method if their case-folded forms are identical.
* <p>
* Case folding is defined by the Unicode Standard in
* <a href="https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt">CaseFolding.txt</a>,
* including 1:M mappings. For example, {@code "Fuß".equalsFoldCase("FUSS")}
* returns {@code true}, since the character {@code U+00DF} (sharp s) folds
* to {@code "ss"}.
* <p>
* Case folding is locale-independent and language-neutral, unlike
* locale-sensitive transformations such as {@link #toLowerCase()} or
* {@link #toUpperCase()}. It is intended for caseless matching,
* searching, and indexing.
*
* @apiNote
* This method is the Unicode-compliant alternative to
* {@link #equalsIgnoreCase(String)}. It implements full case folding as
* defined by the Unicode Standard, which may differ from the simpler
* per-character mapping performed by {@code equalsIgnoreCase}.
* For example:
* {@snippet lang=java :
* String a = "Fuß";
* String b = "FUSS";
* boolean equalsFoldCase = a.equalsFoldCase(b); // returns true
* boolean equalsIgnoreCase = a.equalsIgnoreCase(b); // returns false
* }
*
* @param anotherString
* The {@code String} to compare this {@code String} against
*
* @return {@code true} if the given object is not {@code null} and represents
* the same sequence of characters as this string under Unicode case
* folding; {@code false} otherwise.
*
* @spec https://www.unicode.org/versions/latest/core-spec/chapter-5/#G21790 Unicode Caseless Matching
* @see #compareToFoldCase(String)
* @see #equalsIgnoreCase(String)
* @since 26
*/
public boolean equalsFoldCase(String anotherString) {
    // The same reference is trivially equal and null never is; anything
    // else is decided by the case-folding comparator.
    return this == anotherString
            || (anotherString != null
                && UNICODE_CASEFOLD_ORDER.compare(this, anotherString) == 0);
}
/**
* Compares two strings lexicographically.
* The comparison is based on the Unicode value of each character in
@@ -2303,12 +2384,86 @@ public final class String
* than this String, ignoring case considerations.
* @see java.text.Collator
* @see #codePoints()
* @see #compareToFoldCase(String)
* @since 1.2
*/
public int compareToIgnoreCase(String str) {
return CASE_INSENSITIVE_ORDER.compare(this, str);
}
/**
* A Comparator that orders {@code String} objects as by
* {@link #compareToFoldCase(String) compareToFoldCase()}.
*
* @see #compareToFoldCase(String)
* @since 26
*/
public static final Comparator<String> UNICODE_CASEFOLD_ORDER
= new FoldCaseComparator();
/**
 * Comparator behind {@code UNICODE_CASEFOLD_ORDER}. It picks the comparison
 * routine that matches the coders of the two strings and hands their backing
 * byte arrays to it.
 */
private static class FoldCaseComparator implements Comparator<String> {
    @Override
    public int compare(String s1, String s2) {
        final byte[] left = s1.value;
        final byte[] right = s2.value;
        final boolean leftIsLatin1 = s1.coder == LATIN1;
        if (s1.coder != s2.coder()) {
            // Mixed representations: dispatch on the coder of the left string.
            return leftIsLatin1
                    ? StringLatin1.compareToFC_UTF16(left, right)
                    : StringUTF16.compareToFC_Latin1(left, right);
        }
        // Both strings share one representation.
        return leftIsLatin1
                ? StringLatin1.compareToFC(left, right)
                : StringUTF16.compareToFC(left, right);
    }
}
/**
* Compares two strings lexicographically using <em>{@index "Unicode case folding"}</em>.
* This method returns an integer whose sign is that of calling {@code compareTo}
* on the Unicode case folded version of the strings. Unicode Case folding
* eliminates differences in case according to the Unicode Standard, using the
* mappings defined in
* <a href="https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt">CaseFolding.txt</a>,
* including 1:M mappings, such as {@code "ß"} &rarr; {@code "ss"}.
* <p>
* Case folding is a locale-independent, language-neutral form of case mapping,
* primarily intended for caseless matching. Unlike {@link #compareToIgnoreCase(String)},
* which applies a simpler locale-insensitive uppercase mapping, this method
* follows the Unicode <em>{@index "full"}</em> case folding, providing stable and
* consistent results across all environments.
* <p>
* Note that this method does <em>not</em> take locale into account, and may
* produce results that differ from locale-sensitive ordering. Use
* {@link java.text.Collator} for locale-sensitive comparison.
*
* @apiNote
* This method is the Unicode-compliant alternative to
* {@link #compareToIgnoreCase(String)}. It implements the
* <em>{@index "full case folding"}</em> as defined by the Unicode Standard, which
* may differ from the simpler per-character mapping performed by
* {@code compareToIgnoreCase}.
* For example:
* {@snippet lang=java :
* String a = "Fuß";
* String b = "FUSS";
* int cmpFoldCase = a.compareToFoldCase(b); // returns 0
* int cmpIgnoreCase = a.compareToIgnoreCase(b); // returns > 0
* }
*
* @param str the {@code String} to be compared.
* @return a negative integer, zero, or a positive integer as the specified
* String is greater than, equal to, or less than this String,
* ignoring case considerations by case folding.
*
* @spec https://www.unicode.org/versions/latest/core-spec/chapter-5/#G21790 Unicode Caseless Matching
* @see java.text.Collator
* @see #compareToIgnoreCase(String)
* @see #equalsFoldCase(String)
* @since 26
*/
public int compareToFoldCase(String str) {
// Same comparator as UNICODE_CASEFOLD_ORDER, with this string on the left.
return UNICODE_CASEFOLD_ORDER.compare(this, str);
}
/**
* Tests if two string regions are equal.
* <p>

View File

@@ -32,6 +32,8 @@ import java.util.function.Consumer;
import java.util.function.IntConsumer;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import jdk.internal.lang.CaseFolding;
import jdk.internal.util.ArraysSupport;
import jdk.internal.vm.annotation.IntrinsicCandidate;
@@ -179,6 +181,128 @@ final class StringLatin1 {
return len1 - len2;
}
/**
 * Returns the value a Latin-1 character is ordered by under case folding.
 * <p>
 * For Latin-1 the folding is the lowercase mapping, with one exception that
 * matters for ordering: MICRO SIGN (U+00B5) folds to GREEK SMALL LETTER MU
 * (U+03BC). Using the folded value keeps the Latin-1 paths consistent with
 * the paths that go through {@code CaseFolding.fold}. U+00DF is expanded to
 * "ss" by the callers before this method is reached.
 */
private static int foldLatin1(int c) {
    return c == 0xb5 ? 0x3bc : Character.toLowerCase(c);
}

/**
 * Slow path of {@link #compareToFC(byte[], byte[])}, entered at the first
 * position where U+00DF is involved in a mismatch. Both arrays are walked
 * independently because U+00DF expands to two characters ("ss").
 *
 * @param value first Latin-1 array
 * @param off   index in {@code value} to start from
 * @param last  end index (exclusive) in {@code value}
 * @param other second Latin-1 array
 * @param ooff  index in {@code other} to start from
 * @param olast end index (exclusive) in {@code other}
 * @return negative, zero or positive as the folded remainder of
 *         {@code value} is less than, equal to or greater than that of
 *         {@code other}
 */
private static int compareToFC0(byte[] value, int off, int last, byte[] other, int ooff, int olast) {
    int k1 = off, k2 = ooff;
    boolean lo1 = false, lo2 = false; // true if we have a leftover 's' from u+00df -> ss
    while ((k1 < last || lo1) && (k2 < olast || lo2)) {
        int c1, c2;
        if (lo1) {
            c1 = 0x73; // leftover 's'
            lo1 = false;
        } else {
            c1 = getChar(value, k1++);
            if (c1 == 0xdf) {
                c1 = 0x73;
                lo1 = true;
            }
        }
        if (lo2) {
            c2 = 0x73; // leftover 's'
            lo2 = false;
        } else {
            c2 = getChar(other, k2++);
            if (c2 == 0xdf) {
                c2 = 0x73;
                lo2 = true;
            }
        }
        if (!CharacterDataLatin1.equalsIgnoreCase((byte)c1, (byte)c2)) {
            return foldLatin1(c1) - foldLatin1(c2);
        }
    }
    // One side is exhausted; the side with folded characters left is greater.
    if (k1 < last || lo1) {
        return 1;
    }
    if (k2 < olast || lo2) {
        return -1;
    }
    return 0;
}

/**
 * Compares two Latin-1 strings under Unicode case folding.
 *
 * @param value bytes of the first string
 * @param other bytes of the second string
 * @return negative, zero or positive as the folded form of {@code value}
 *         is less than, equal to or greater than that of {@code other}
 */
static int compareToFC(byte[] value, byte[] other) {
    int len = value.length;
    int olen = other.length;
    int lim = Math.min(len, olen);
    for (int k = 0; k < lim; k++) {
        byte b1 = value[k];
        byte b2 = other[k];
        if (!CharacterDataLatin1.equalsIgnoreCase(b1, b2)) {
            int c1 = b1 & 0xff;
            int c2 = b2 & 0xff;
            if (c1 == 0xdf || c2 == 0xdf) { // 0xdf is the only 1:M in latin1 range
                return compareToFC0(value, k, len, other, k, olen);
            }
            return foldLatin1(c1) - foldLatin1(c2);
        }
    }
    return len - olen;
}
/**
 * Slow path of {@link #compareToFC_UTF16(byte[], byte[])}, used once a 1:M
 * folding is involved. Each side is folded one code point at a time; when a
 * folding expands to several chars, the chars not yet compared are kept in
 * {@code f1}/{@code f2} (16 bits each, next char in the low bits).
 *
 * @param value Latin-1 array
 * @param off   index in {@code value} to start from
 * @param last  end index (exclusive) in {@code value}
 * @param other UTF-16 array
 * @param ooff  char index in {@code other} to start from
 * @param olast end char index (exclusive) in {@code other}
 * @return negative, zero or positive as the folded remainder of
 *         {@code value} is less than, equal to or greater than that of
 *         {@code other}
 */
private static int compareToFC0_UTF16(byte[] value, int off, int last, byte[] other, int ooff, int olast) {
    int f1 = 0, f2 = 0;
    int k1 = off, k2 = ooff;
    while ((k1 < last || f1 != 0) && (k2 < olast || f2 != 0)) {
        int c1, c2;
        if (f1 != 0) {
            c1 = f1 & 0xffff;
            f1 >>>= 16;
        } else {
            c1 = getChar(value, k1++);
            var f = CaseFolding.fold(c1);
            if (CaseFolding.isSingleCodePoint(f)) {
                // A single code point occupies the low bits of the long and
                // may need 21 of them; masking with 0xfffff would turn
                // U+100000..U+10FFFF into U+00000..U+FFFFF.
                c1 = (int) f;
            } else {
                c1 = (int) (f & 0xffff);
                f1 = (int) (f >>> 16);
            }
        }
        if (f2 != 0) {
            c2 = f2 & 0xffff;
            f2 >>>= 16;
        } else {
            c2 = StringUTF16.codePointAt(other, k2, olast, true);
            k2 += Character.charCount(c2);
            var f = CaseFolding.fold(c2);
            if (CaseFolding.isSingleCodePoint(f)) {
                c2 = (int) f; // keep all 21 bits, see above
            } else {
                c2 = (int) (f & 0xffff);
                f2 = (int) (f >>> 16);
            }
        }
        if (c1 != c2) {
            return c1 - c2;
        }
    }
    // One side is exhausted; the side with folded characters left is greater.
    if (k1 < last || f1 != 0) {
        return 1;
    }
    if (k2 < olast || f2 != 0) {
        return -1;
    }
    return 0;
}
/**
 * Compares a Latin-1 string with a UTF-16 string under Unicode case folding.
 * Positions are compared one by one; as soon as a mismatch involves a
 * folding that is not a single code point, the comparison is handed to
 * {@code compareToFC0_UTF16} from that position.
 *
 * @param value bytes of the Latin-1 string
 * @param other bytes of the UTF-16 string
 * @return negative, zero or positive as the folded form of {@code value}
 *         is less than, equal to or greater than that of {@code other}
 */
static int compareToFC_UTF16(byte[] value, byte[] other) {
    final int latin1Len = length(value);
    final int utf16Len = StringUTF16.length(other);
    final int common = Math.min(latin1Len, utf16Len);
    for (int i = 0; i < common; i++) {
        final int left = getChar(value, i);
        final int right = StringUTF16.codePointAt(other, i, utf16Len, true);
        if (left == right) {
            continue;
        }
        final long leftFold = CaseFolding.fold(left);
        final long rightFold = CaseFolding.fold(right);
        if (leftFold == rightFold) {
            continue;
        }
        if (CaseFolding.isSingleCodePoint(leftFold) && CaseFolding.isSingleCodePoint(rightFold)) {
            return (int) (leftFold - rightFold);
        }
        return compareToFC0_UTF16(value, i, latin1Len, other, i, utf16Len);
    }
    return latin1Len - utf16Len;
}
// Hash code of a Latin-1 byte array. The computation is delegated to
// ArraysSupport.hashCodeOfUnsigned, called with 0, value.length and 0
// (presumably start offset, length and initial hash value; confirm against
// ArraysSupport).
static int hashCode(byte[] value) {
return ArraysSupport.hashCodeOfUnsigned(value, 0, value.length, 0);
}

View File

@@ -34,6 +34,7 @@ import java.util.function.IntConsumer;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import jdk.internal.lang.CaseFolding;
import jdk.internal.misc.Unsafe;
import jdk.internal.util.ArraysSupport;
import jdk.internal.vm.annotation.ForceInline;
@@ -93,7 +94,7 @@ final class StringUTF16 {
return value.length >> 1;
}
private static int codePointAt(byte[] value, int index, int end, boolean checked) {
static int codePointAt(byte[] value, int index, int end, boolean checked) {
assert index < end;
if (checked) {
checkIndex(index, value);
@@ -592,6 +593,77 @@ final class StringUTF16 {
return -StringLatin1.compareToCI_UTF16(other, value);
}
/**
* Compares a UTF-16 string ({@code value}) with a Latin-1 string
* ({@code other}) under Unicode case folding.
* <p>
* The work is done by {@code StringLatin1.compareToFC_UTF16} with the
* operands swapped, so its result is negated here.
*/
public static int compareToFC_Latin1(byte[] value, byte[] other) {
return -StringLatin1.compareToFC_UTF16(other, value);
}
/**
 * Slow path of {@link #compareToFC(byte[], byte[])}, used once a 1:M folding
 * is involved. Each side is folded one code point at a time; when a folding
 * expands to several chars, the chars not yet compared are kept in
 * {@code f1}/{@code f2} (16 bits each, next char in the low bits).
 *
 * @param value first UTF-16 array
 * @param off   char index in {@code value} to start from
 * @param last  end char index (exclusive) in {@code value}
 * @param other second UTF-16 array
 * @param ooff  char index in {@code other} to start from
 * @param olast end char index (exclusive) in {@code other}
 * @return negative, zero or positive as the folded remainder of
 *         {@code value} is less than, equal to or greater than that of
 *         {@code other}
 */
private static int compareToFC0(byte[] value, int off, int last, byte[] other, int ooff, int olast) {
    int f1 = 0, f2 = 0;
    int k1 = off, k2 = ooff;
    while ((k1 < last || f1 != 0) && (k2 < olast || f2 != 0)) {
        int c1, c2;
        if (f1 != 0) {
            c1 = f1 & 0xffff;
            f1 >>>= 16;
        } else {
            c1 = StringUTF16.codePointAt(value, k1, last, true);
            k1 += Character.charCount(c1);
            var f = CaseFolding.fold(c1);
            if (CaseFolding.isSingleCodePoint(f)) {
                // A single code point occupies the low bits of the long and
                // may need 21 of them; masking with 0xfffff would turn
                // U+100000..U+10FFFF into U+00000..U+FFFFF.
                c1 = (int) f;
            } else {
                c1 = (int) (f & 0xffff);
                f1 = (int) (f >>> 16);
            }
        }
        if (f2 != 0) {
            c2 = f2 & 0xffff;
            f2 >>>= 16;
        } else {
            c2 = StringUTF16.codePointAt(other, k2, olast, true);
            k2 += Character.charCount(c2);
            var f = CaseFolding.fold(c2);
            if (CaseFolding.isSingleCodePoint(f)) {
                c2 = (int) f; // keep all 21 bits, see above
            } else {
                c2 = (int) (f & 0xffff);
                f2 = (int) (f >>> 16);
            }
        }
        if (c1 != c2) {
            return c1 - c2;
        }
    }
    // One side is exhausted; the side with folded characters left is greater.
    if (k1 < last || f1 != 0) {
        return 1;
    }
    if (k2 < olast || f2 != 0) {
        return -1;
    }
    return 0;
}
/**
 * Compares two UTF-16 strings under Unicode case folding.
 * Code points at the same char index are compared one by one; as soon as a
 * mismatch involves a folding that is not a single code point, the
 * comparison is handed to {@code compareToFC0} from that index.
 *
 * @param value bytes of the first string
 * @param other bytes of the second string
 * @return negative, zero or positive as the folded form of {@code value}
 *         is less than, equal to or greater than that of {@code other}
 */
public static int compareToFC(byte[] value, byte[] other) {
    final int thisLen = length(value);
    final int otherLen = length(other);
    final int common = Math.min(thisLen, otherLen);
    int pos = 0;
    while (pos < common) {
        final int left = codePointAt(value, pos, thisLen, true);
        final int right = codePointAt(other, pos, otherLen, true);
        if (left != right) {
            final long leftFold = CaseFolding.fold(left);
            final long rightFold = CaseFolding.fold(right);
            if (leftFold != rightFold) {
                final boolean bothSingle = CaseFolding.isSingleCodePoint(leftFold)
                        && CaseFolding.isSingleCodePoint(rightFold);
                return bothSingle
                        ? (int) leftFold - (int) rightFold
                        : compareToFC0(value, pos, thisLen, other, pos, otherLen);
            }
        }
        // Advance by the width of the code point just read from value.
        pos += Character.charCount(left);
    }
    return thisLen - otherLen;
}
// Hash code of a UTF-16 byte array. The computation is delegated to
// ArraysSupport.hashCodeOfUTF16, called with 0, value.length >> 1 (half the
// byte length, i.e. two bytes per char) and 0 (presumably start offset,
// length and initial hash value; confirm against ArraysSupport).
static int hashCode(byte[] value) {
return ArraysSupport.hashCodeOfUTF16(value, 0, value.length >> 1, 0);
}

View File

@@ -43,8 +43,8 @@ import java.util.function.Predicate;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import jdk.internal.lang.CaseFolding;
import jdk.internal.util.ArraysSupport;
import jdk.internal.util.regex.CaseFolding;
import jdk.internal.util.regex.Grapheme;
/**

View File

@@ -0,0 +1,208 @@
/*
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package jdk.internal.lang;
import java.util.Arrays;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import static java.util.Map.entry;
/**
* Utility class that handles Unicode case folding properties defined in
* CaseFolding.txt, including 1:M full case folding.
*/
public final class CaseFolding {
private CaseFolding() {}
/**
* Tests whether the specified code point has a folding mapping entry defined.
*
* @param cp
* the Unicode code point to test
* @return {@code true} if the given code point has a case folding mapping entry
* in the generated case folding table, {@code false} otherwise
*/
public static boolean isDefined(int cp) {
// getDefined reports a missing entry as -1.
return getDefined(cp) != -1;
}
/**
* Returns the case-folded form of the specified code point according
* to the Unicode case folding mappings.
* <p>
* If the code point has no case folding mapping defined, this method returns
* the original code point.
* <p>
* Possible combinations of the returned case-folding form as a long value,
* shown as four 16-bit fields from the most significant to the least
* significant:
*
* <pre>
* +---+---------+--------+---------+--------+--------+
* | 1:1 mapping | 0000 | 0000 | 000x | xxxx | 0041 => 0061 or 1E921 => 1E943
* +---+---------+--------+---------+--------+--------+
* | 1:2 mapping | 0002 | 0000 | xxxx | xxxx | FB02 => 0066 006C
* +---+---------+--------+---------+--------+--------+
* | 1:3 mapping | 0003 | xxxx | xxxx | xxxx | FB03 => 0066 0066 0069
* +---+---------+--------+---------+--------+--------+
* </pre>
*
* Use {@link #isSingleCodePoint(long)} to tell the first form from the
* other two.
*
* @param cp
* the Unicode code point to fold
* @return a long value representing the case-folded form of the input
* code point: either a single code point in the low bits (the top
* 16 bits are zero), or, for a 1:M mapping, the number of folded
* chars in the top 16 bits and one folded char per 16-bit field
* below it
*/
public static long fold(int cp) {
var fold = getDefined(cp);
// -1 means "no entry": the code point folds to itself.
return fold == -1 ? cp : fold;
}
/**
* Tests whether a value returned by {@link #fold(int)} is a single code
* point rather than a 1:M folding.
*
* @param fold
* a folding as returned by {@code fold}
* @return {@code true} if bits 48 and above of {@code fold} are all zero
*/
public static boolean isSingleCodePoint(long fold) {
return (fold >> 48) == 0;
}
/**
 * Returns an expansion set to "close" a given regex Unicode character class range for
 * case-insensitive matching, according to the
 * <a href="https://www.unicode.org/reports/tr18/#Simple_Loose_Matches">Simple Loose Matches</a>
 * rule defined in Unicode Technical Standard #18: Unicode Regular Expressions.
 * <p>
 * To conform with Level 1 of UTS #18, specifically RL1.5: Simple Loose Matches, simple case
 * folding must be applied to literals and (optionally) to character classes. When applied to
 * character classes, each character class is expected to be closed under simple case folding.
 * See the standard for the detailed explanation and an example of "closed".
 * <p>
 * RL1.5 states: To meet this requirement, an implementation that supports case-insensitive
 * matching should
 * <ol>
 * <li>Provide at least the simple, default Unicode case-insensitive matching, and</li>
 * <li>Specify which character properties or constructs are closed under the matching.</li>
 * </ol>
 * <p>
 * In the {@code Pattern} implementation, 5 types of constructs may be affected by
 * case-insensitive matching: back-refs, string slices (sequences), singles, families
 * (char-properties) and class ranges. Singles and families may appear independently or
 * within a class.
 * <p>
 * For loose/case-insensitive matching, the back-refs, slices and singles apply
 * {@code toUpperCase} and {@code toLowerCase} to both the pattern and the input string. This
 * effectively "closes" the class for matching.
 * <p>
 * The family/char-properties are not "closed" and should remain unchanged. This is
 * acceptable per RL1.5, if their behavior is clearly specified.
 * <p>
 * This method addresses that requirement for the "range" construct within a character class
 * by computing the additional characters that should be included to close the range under
 * simple case folding:
 * <p>
 * For each character in the input range {@code [start, end]} (inclusive), if the character
 * has a simple case folding mapping in Unicode's CaseFolding.txt, the mapping is not a
 * round-trip map, and the mapped character is not already in the range, then that mapped
 * character (typically lowercase) is added to the expansion set.
 * <p>
 * This allows the regex character class "range" implementation to use the returned expansion
 * set to support additional case-insensitive matching, without duplicating characters
 * already covered by the existing regex range implementation. The expectation is that the
 * matching is done using both the uppercase and lowercase forms of the input character, for
 * example
 *
 * <pre>{@code
 *
 *     ch -> inRange(lower, Character.toUpperCase(ch), upper) ||
 *           inRange(lower, Character.toLowerCase(ch), upper) ||
 *           additionalClosingCharacters.contains(Character.toUpperCase(ch)) ||
 *           additionalClosingCharacters.contains(Character.toLowerCase(ch))
 * }</pre>
 *
 * @param start the starting code point of the character range
 * @param end the ending code point of the character range
 * @return an {@code int[]} containing all the simple case equivalents of characters in the
 *         range, excluding those already in the range
 * @spec https://www.unicode.org/reports/tr18/#Simple_Loose_Matches
 */
public static int[] getClassRangeClosingCharacters(int start, int end) {
    return Arrays.stream(expanded_case_cps)
            // keep the mapped code points that lie inside the range ...
            .filter(cp -> cp >= start && cp <= end)
            .map(cp -> expanded_case_map.get(cp))
            // ... whose folding lies outside of it
            .filter(folding -> folding < start || folding > end)
            .toArray();
}
// Code point -> folding pairs used by getClassRangeClosingCharacters. The
// placeholder below is replaced with entry(...) pairs when the source is
// generated from this template.
private static final Map<Integer, Integer> expanded_case_map = Map.ofEntries(
%%%Expanded_Case_Map_Entries
);
// The keys of expanded_case_map as a primitive array.
private static final int[] expanded_case_cps = expanded_case_map.keySet()
.stream()
.mapToInt(Integer::intValue)
.toArray();
// Column indices of a row in the table built by hashKeys:
// the code point stored in the row,
private static final int HASH_CP = 0;
// the index of that code point in the key array,
private static final int HASH_INDEX = 1;
// and the row index of the next entry of the same chain (0 ends the chain).
private static final int HASH_NEXT = 2;
/**
 * Builds the lookup table used by {@code getDefined}.
 * <p>
 * Rows {@code [0, keys.length)} are the buckets, selected by
 * {@code cp % keys.length}. A key whose bucket is already taken is stored in
 * an overflow row after the buckets and linked to the end of the bucket's
 * chain through {@code HASH_NEXT}. A row whose {@code HASH_CP} is 0 is free.
 *
 * @param keys the code points to index
 * @return the table, trimmed to the rows actually used
 */
private static int[][] hashKeys(int[] keys) {
    final int buckets = keys.length;
    int[][] table = new int[buckets << 1][3]; // cp + index + next
    int nextFree = buckets; // first unused overflow row
    for (int i = 0; i < buckets; i++) {
        final int cp = keys[i];
        int row = cp % buckets;
        while (table[row][HASH_CP] != 0) {
            final int link = table[row][HASH_NEXT];
            if (link != 0) {
                row = link; // keep walking the chain
                continue;
            }
            // End of the chain: append a fresh overflow row.
            table[row][HASH_NEXT] = nextFree;
            row = nextFree++;
            break;
        }
        table[row][HASH_CP] = cp;
        table[row][HASH_INDEX] = i;
    }
    return Arrays.copyOf(table, nextFree);
}
/**
 * Looks up the folding stored for a code point.
 *
 * @param cp the code point to look up
 * @return the entry of {@code CASE_FOLDING_VALUES} for {@code cp}, or -1 if
 *         {@code cp} has no entry
 */
private static long getDefined(int cp) {
    // A free row of the table has 0 as its code point, so looking up
    // cp == 0 would match a free row and return the value at index 0.
    // A negative cp would produce a negative row index. Neither can be a
    // key, so both are reported as "not defined".
    if (cp <= 0) {
        return -1;
    }
    var hashes = CASE_FOLDING_HASHES;
    var length = CASE_FOLDING_CPS.length; // hashed based on total defined.
    var hash = cp % length;
    while (hashes[hash][HASH_CP] != cp) {
        var next = hashes[hash][HASH_NEXT];
        if (next == 0) {
            return -1; // hash miss
        }
        hash = next;
    }
    var index = hashes[hash][HASH_INDEX];
    return CASE_FOLDING_VALUES[index];
}
%%%Entries
private static final int[][] CASE_FOLDING_HASHES = hashKeys(CASE_FOLDING_CPS);
}

View File

@@ -1,116 +0,0 @@
/*
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package jdk.internal.util.regex;
import java.util.Arrays;
import java.util.Map;
import java.util.Objects;
import static java.util.Map.entry;
/**
* Static helper that computes the extra characters needed to close a regex character
* class range under simple case folding. Not instantiable.
*/
public final class CaseFolding {
// Code point -> mapped code point table read by getClassRangeClosingCharacters().
// NOTE(review): the %%% line is a template placeholder, presumably filled in by the
// build-time generator - confirm against the generator tool.
private static final Map<Integer, Integer> expanded_case_map = Map.ofEntries(
%%%Entries
);
// Keys of expanded_case_map as a primitive array; initialized after the map because
// static initializers run in textual order.
private static final int[] expanded_case_cps = expanded_case_map.keySet()
.stream()
.mapToInt(Integer::intValue)
.toArray();
// Utility class: no instances.
private CaseFolding() {}
/**
* Returns an expansion set to "close" a given regex Unicode character class range for
* case-insensitive matching, according to the
* <a href="https://www.unicode.org/reports/tr18/#Simple_Loose_Matches">Simple Loose Matches</a>
* rule defined in Unicode Technical Standard #18: Unicode Regular Expressions.
* <p>
* To conform with Level 1 of UTS #18, specifically RL1.5: Simple Loose Matches, simple case folding must
* be applied to literals and (optionally) to character classes. When applied to character classes, each
* character class is expected to be closed under simple case folding. See the standard for the
* detailed explanation and example of "closed".
* <p>
* RL1.5 states: To meet this requirement, an implementation that supports case-insensitive matching should
* <ol>
* <li>Provide at least the simple, default Unicode case-insensitive matching, and</li>
* <li>Specify which character properties or constructs are closed under the matching.</li>
* </ol>
* <p>
* In the {@code Pattern} implementation, 5 types of constructs may be case-insensitive when matching:
* back-refs, string slice (sequences), single, family (char-property) and class range. Single and
* family may appear independently or within a class.
* <p>
* For loose/case-insensitive matching, the back-refs, slices and singles apply {@code toUpperCase} and
* {@code toLowerCase} to both the pattern and the input string. This effectively closes the class for
* matching.
* <p>
* The family/char-properties are not "closed" and should remain unchanged. This is acceptable per RL1.5,
* if their behavior is clearly specified.
* <p>
* This method addresses that requirement for the "range" construct within a character class by computing
* the additional characters that should be included to close the range under simple case folding:
* <p>
* For each character in the input range {@code [start, end]} (inclusive), if the character has a simple
* case folding mapping in Unicode's CaseFolding.txt, the mapping is not a round-trip map, and the mapped
* character is not already in the range, then that mapped character (typically lowercase) is added to
* the expansion set.
* <p>
* This allows the regex character class "range" implementation to use the returned expansion set to
* support additional case-insensitive matching, without duplicating characters already covered by the
* existing regex range implementation. The expectation is that the matching is done using both the
* uppercase and lowercase forms of the input character, for example
*
* <pre>{@code
*
* ch -> inRange(lower, Character.toUpperCase(ch), upper) ||
* inRange(lower, Character.toLowerCase(ch), upper) ||
* additionalClosingCharacters.contains(Character.toUpperCase(ch)) ||
* additionalClosingCharacters.contains(Character.toLowerCase(ch))
* }</pre>
*
* @spec https://www.unicode.org/reports/tr18/#Simple_Loose_Matches
* @param start the starting code point of the character range
* @param end the ending code point of the character range
* @return a {@code int[]} containing all the simple case equivalents of characters in the range,
* excluding those already in the range
*/
public static int[] getClassRangeClosingCharacters(int start, int end) {
// Scratch buffer sized for the worst case: one result per table key.
int[] expanded = new int[expanded_case_cps.length];
int off = 0;
for (int cp : expanded_case_cps) {
if (cp >= start && cp <= end) {
int folding = expanded_case_map.get(cp);
// Keep the mapped character only when the range does not already contain it.
if (folding < start || folding > end) {
expanded[off++] = folding;
}
}
}
// Trim to the number of characters actually collected.
return Arrays.copyOf(expanded, off);
}
}

View File

@@ -0,0 +1,329 @@
/*
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
* @test
* @summary tests unicode case-folding based String comparison and equality
* @bug 4397357
* @library /lib/testlibrary/java/lang
* @modules java.base/jdk.internal.lang:+open
* @run junit/othervm
* UnicodeCaseFoldingTest
*/
import java.nio.file.Files;
import java.util.stream.Stream;
import java.util.stream.Collectors;
import java.util.ArrayList;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals;
import jdk.internal.lang.CaseFolding;
public class UnicodeCaseFoldingTest {

    /**
     * Verifies every common (C) and full (F) entry of CaseFolding.txt: the folded form must
     * match the listed mapping, and the source and its mapping must compare as equal under
     * {@code compareToFoldCase} / {@code equalsFoldCase} in both directions.
     */
    @Test
    void testAllCommnFullCodePointsListedInCaseFoldinigTxt() throws Throwable {
        var filter = "^.*; [CF]; .*$";  // C=common, F=full, for full case folding
        // Files.lines keeps the file open until the stream is closed.
        try (var lines = Files.lines(UCDFiles.CASEFOLDING)) {
            lines.filter(line -> !line.startsWith("#") && line.matches(filter))
                 .forEach(line -> {
                     var mapping = parseMapping(line);
                     var source = mapping[0];
                     var expected = mapping[1];
                     // (1) Verify the folding result matches expected
                     assertEquals(expected, foldCase(source), "CaseFolding.fold(): ");
                     // (2) Verify compareToFoldCase() result
                     assertEquals(0, source.compareToFoldCase(expected), "source.compareToFoldCase(expected)");
                     assertEquals(0, expected.compareToFoldCase(source), "expected.compareToFoldCase(source)");
                     // (3) Verify equalsFoldCase() result
                     assertEquals(true, source.equalsFoldCase(expected), "source.equalsFoldCase(expected)");
                     assertEquals(true, expected.equalsFoldCase(source), "expected.equalsFoldCase(source)");
                 });
        }
    }

    /**
     * Verifies every simple (S) entry of CaseFolding.txt: the source and its simple mapping
     * must still compare as equal under the fold-case comparison methods.
     */
    @Test
    void testAllSimpleCodePointsListedInCaseFoldinigTxt() throws Throwable {
        // S=simple, for simple case folding. The simple case folding should still match.
        var filter = "^.*; [S]; .*$";
        try (var lines = Files.lines(UCDFiles.CASEFOLDING)) {
            lines.filter(line -> !line.startsWith("#") && line.matches(filter))
                 .forEach(line -> {
                     var mapping = parseMapping(line);
                     var source = mapping[0];
                     var expected = mapping[1];
                     // (1) Verify compareToFoldCase() result
                     assertEquals(0, source.compareToFoldCase(expected), "source.compareToFoldCase(expected)");
                     assertEquals(0, expected.compareToFoldCase(source), "expected.compareToFoldCase(source)");
                     // (2) Verify equalsFoldCase() result
                     assertEquals(true, source.equalsFoldCase(expected), "source.equalsFoldCase(expected)");
                     assertEquals(true, expected.equalsFoldCase(source), "expected.equalsFoldCase(source)");
                 });
        }
    }

    /**
     * Verifies that defined code points in U+0000..U+1FFFF that have no C/F entry in
     * CaseFolding.txt fold to themselves.
     */
    @Test
    public void testAllCodePointsFoldToThemselvesIfNotListed() throws Exception {
        var failures = new ArrayList<String>();
        try (var lines = Files.lines(UCDFiles.CASEFOLDING)) {
            // Collect all code points that appear in CaseFolding.txt
            var listed = lines
                    .filter(line -> !line.startsWith("#") && line.matches("^.*; [CF]; .*$"))
                    .map(line -> Integer.parseInt(line.split("; ")[0], 16))
                    .collect(Collectors.toSet());

            // Scan BMP + Supplementary Plane 1 (U+0000..U+1FFFF)
            for (int cp = Character.MIN_CODE_POINT; cp <= 0x1FFFF; cp++) {
                if (!Character.isDefined(cp)) {
                    continue;    // skip undefined
                }
                // Compare the int directly: casting to char would truncate supplementary
                // code points and wrongly skip U+1D800..U+1DFFF as if they were surrogates.
                if (cp >= Character.MIN_SURROGATE && cp <= Character.MAX_SURROGATE) {
                    continue;    // skip surrogate code units
                }
                if (listed.contains(cp)) {
                    continue;    // already tested separately
                }
                String s = new String(Character.toChars(cp));
                String folded = foldCase(s);
                if (!s.equals(folded)) {
                    failures.add(String.format("Unexpected folding: U+%04X '%s' → '%s'", cp, s, folded));
                }
            }
        }
        assertEquals(0, failures.size(),
                () -> "Some unlisted code points folded unexpectedly:\n"
                        + String.join("\n", failures));
    }

    /** Checks the folded form of each hand-picked input from {@link #caseFoldTestCases()}. */
    @ParameterizedTest(name = "CaseFold \"{0}\"\"{1}\"")
    @MethodSource("caseFoldTestCases")
    void testIndividualCaseFolding(String input, String expected) {
        assertEquals(expected, foldCase(input));
    }

    /** Supplies (input, expected folded form) pairs. */
    static Stream<Arguments> caseFoldTestCases() {
        return Stream.of(
            // ASCII simple cases
            Arguments.of("ABC", "abc"),
            Arguments.of("already", "already"),
            Arguments.of("MiXeD123", "mixed123"),
            // --- Latin-1 to non-Latin-1 fold ---
            Arguments.of("aBc\u00B5Efg", "abc\u03BCefg"),    // micro sign folds to Greek small mu
            Arguments.of("test\u00B5\ud801\udc00X", "test\u03bc\ud801\udc28x"),
            // German Eszett
            Arguments.of("Stra\u00DFe", "strasse"),          // "Straße"
            Arguments.of("\u1E9E", "ss"),                    // capital sharp S
            // Turkish dotted I / dotless i
            Arguments.of("I", "i"),
            Arguments.of("\u0130", "i\u0307"),               // capital dotted I folds to "i + dot above"
            Arguments.of("\u0069\u0307", "i\u0307"),         // small i + dot above remains
            Arguments.of("\u0131", "\u0131"),                // "ı" (dotless i stays dotless)
            // --- Greek special cases ---
            Arguments.of("\u039F\u03A3", "\u03BF\u03C3"),    // "ΟΣ" folds to "οσ"
            Arguments.of("\u1F88", "\u1F00\u03B9"),          // Alpha with psili + ypogegrammeni folds to "ἀι"
            Arguments.of("\u039C\u03AC\u03CA\u03BF\u03C2", "\u03BC\u03AC\u03CA\u03BF\u03C3"), // "Μάϊος" folds to "μάϊοσ"; final sigma folds to normal sigma
            Arguments.of("\u1F08", "\u1F00"),                // Capital Alpha with psili
            // Supplementary Plane characters
            Arguments.of("\uD801\uDC00", "\uD801\uDC28"),    // Deseret Capital Letter Long I folds to Small
            Arguments.of("\uD801\uDC01", "\uD801\uDC29"),    // Deseret Capital Letter Long E folds to Small
            // Supplementary inside ASCII
            Arguments.of("abc\uD801\uDC00def", "abc\uD801\uDC28def"),
            // Ligatures and compatibility folds
            Arguments.of("\uFB00", "ff"),                    // ff ligature
            Arguments.of("\uFB03", "ffi"),                   // ffi ligature
            Arguments.of("\u212A", "k"),                     // Kelvin sign folds to k
            Arguments.of("abc\uFB00def", "abcffdef"),        // ff ligature
            Arguments.of("abc\uFB03def", "abcffidef"),       // ffi ligature
            Arguments.of("abc\u212Adef", "abckdef"),         // Kelvin sign folds to k
            // --- Fullwidth ---
            Arguments.of("\uFF21\uFF22\uFF23", "\uFF41\uFF42\uFF43"), // fullwidth "ABC" folds to fullwidth "abc"
            // --- Armenian ---
            Arguments.of("\u0531", "\u0561"),                // "Ա" folds to "ա"
            // --- Cherokee ---
            Arguments.of("\u13A0", "\u13A0"),                // Capital Cherokee A folds to itself
            Arguments.of("\uAB70", "\u13A0")                 // Small Cherokee A folds to Capital Cherokee A
        );
    }

    /** Supplies string pairs expected to be equal under case folding. */
    static Stream<Arguments> caseFoldEqualProvider() {
        return Stream.of(
            Arguments.of("abc", "ABC"),
            Arguments.of("aBcDe", "AbCdE"),
            Arguments.of("\u00C0\u00E7", "\u00E0\u00C7"),    // Àç vs àÇ
            Arguments.of("straße", "STRASSE"),               // ß folds to ss
            Arguments.of("\uD83C\uDDE6", "\uD83C\uDDE6"),    // 🇦 vs 🇦
            Arguments.of("\u1E9E", "ss"),                    // capital sharp S
            Arguments.of("\u03A3", "\u03C3"),                // Σ vs σ (Greek Sigma)
            Arguments.of("\u03C3", "\u03C2"),                // σ vs ς (Greek sigma/final sigma)
            Arguments.of("\u212B", "\u00E5"),                // Angstrom sign vs å
            Arguments.of("\uFB00", "ff"),                    // ff ligature
            Arguments.of("\u01C5", "\u01C5"),                // Dž (Latin capital D with small z with caron)
            Arguments.of("Caf\u00E9", "CAF\u00C9"),          // Café vs CAFÉ
            Arguments.of("\u03BA\u03B1\u03BB\u03B7\u03BC\u03AD\u03C1\u03B1", "\u039A\u0391\u039B\u0397\u039C\u0388\u03A1\u0391"), // καλημέρα vs ΚΑΛΗΜΕΡΑ
            Arguments.of("\u4E2D\u56FD", "\u4E2D\u56FD"),    // 中国
            Arguments.of("\u03B1", "\u0391"),                // α vs Α (Greek alpha)
            Arguments.of("\u212B", "\u00C5"),                // Angstrom sign vs Å
            // from StringCompareToIgnoreCase
            Arguments.of("\u0100\u0102\u0104\u0106\u0108", "\u0100\u0102\u0104\u0106\u0109"), // ĀĂĄĆĈ vs ĀĂĄĆĉ
            Arguments.of("\u0101\u0103\u0105\u0107\u0109", "\u0100\u0102\u0104\u0106\u0109"), // āăąćĉ vs ĀĂĄĆĉ
            Arguments.of("\ud801\udc00\ud801\udc01\ud801\udc02\ud801\udc03\ud801\udc04",
                         "\ud801\udc00\ud801\udc01\ud801\udc02\ud801\udc03\ud801\udc2c"), // 𐐀𐐁𐐂𐐃𐐄 vs 𐐀𐐁𐐂𐐃𐐬
            Arguments.of("\ud801\udc28\ud801\udc29\ud801\udc2a\ud801\udc2b\ud801\udc2c",
                         "\ud801\udc00\ud801\udc01\ud801\udc02\ud801\udc03\ud801\udc2c")  // 𐐨𐐩𐐪𐐫𐐬 vs 𐐀𐐁𐐂𐐃𐐬
        );
    }

    /** Each pair must compare equal, in both directions, and fold to the same string. */
    @ParameterizedTest
    @MethodSource("caseFoldEqualProvider")
    void testcompareToFoldCaseEquals(String s1, String s2) {
        assertEquals(0, s1.compareToFoldCase(s2));
        assertEquals(0, s2.compareToFoldCase(s1));
        assertEquals(true, s1.equalsFoldCase(s2));
        assertEquals(true, s2.equalsFoldCase(s1));
        assertEquals(foldCase(s1), foldCase(s2));
    }

    /** Supplies (s1, s2, expected sign of s1.compareToFoldCase(s2)) triples. */
    static Stream<Arguments> caseFoldOrderingProvider() {
        return Stream.of(
            // ß folds to "ss", so the comparison is "asa" vs "ass". The second operand has
            // to be "a" + U+00DF: an empty string could never sort after "asa".
            Arguments.of("asa", "a\u00DF", -1),
            Arguments.of("a\u00DF", "asa", +1),
            Arguments.of("a\u00DF", "ass", 0),               // aß vs ass
            Arguments.of("\uFB03", "ffi", 0),                // ffi ligature
            Arguments.of("\u00C5", "Z", 1),                  // Å vs Z
            Arguments.of("A", "\u00C0", -1),                 // A vs À
            Arguments.of("\u03A9", "\u03C9", 0),             // Ω vs ω
            Arguments.of("\u03C2", "\u03C3", 0),             // ς vs σ
            Arguments.of("\uD835\uDD23", "R", 1),            // supplementary Mathematical Fraktur letter (U+1D523) vs R
            Arguments.of("\uFF26", "E", 1),                  // full-width F vs E
            Arguments.of("\u00C9clair", "Eclair", 1),        // Éclair vs Eclair
            Arguments.of("\u03bc\u00df", "\u00b5s", 1),
            Arguments.of("\u00b5s", "\u03bc\u00df", -1)
        );
    }

    /** Only the sign of the comparison result is checked, not its magnitude. */
    @ParameterizedTest
    @MethodSource("caseFoldOrderingProvider")
    void testcompareToFoldCaseOrdering(String s1, String s2, int expectedSign) {
        int cmp = s1.compareToFoldCase(s2);
        assertEquals(expectedSign, Integer.signum(cmp));
    }

    /** Supplies strings that must compare equal to their own folded form. */
    static Stream<Arguments> roundTripProvider() {
        return Stream.of(
            Arguments.of("abc"),
            Arguments.of("ABC"),
            Arguments.of("straße"),
            Arguments.of("Àç"),
            Arguments.of(""),
            Arguments.of("\uFB02uff"),                       // fl ligature in "fluff"
            Arguments.of("\u00C9COLE")                       // ÉCOLE
        );
    }

    /** A string and its folded form must be equal under the fold-case comparisons. */
    @ParameterizedTest
    @MethodSource("roundTripProvider")
    void testCaseFoldRoundTrip(String s) {
        String folded = foldCase(s);
        assertEquals(0, s.compareToFoldCase(folded));
        assertEquals(0, folded.compareToFoldCase(s));
        assertEquals(true, s.equalsFoldCase(folded));
        assertEquals(true, folded.equalsFoldCase(s));
    }

    /**
     * Parses one CaseFolding.txt data line.
     * The fields are separated by "; ": field 0 is the source code point in hex and
     * field 2 is the space-separated hex mapping.
     *
     * @return a two-element array: the source as a string, then the listed mapping
     */
    private static String[] parseMapping(String line) {
        var fields = line.split("; ");
        var cp = Integer.parseInt(fields[0], 16);
        var hexCodes = fields[2].trim().split(" ");
        var folding = new int[hexCodes.length];
        for (int i = 0; i < folding.length; i++) {
            folding[i] = Integer.parseInt(hexCodes[i], 16);
        }
        return new String[] {
            new String(Character.toChars(cp)),
            new String(folding, 0, folding.length)
        };
    }

    // helper to test the integrity of folding mapping
    // The top 16 bits hold the number of code units; 0 means the low bits hold a single
    // code point, otherwise that many 16-bit values follow, lowest bits first.
    private static int[] longToFolding(long value) {
        int len = (int) (value >>> 48);
        if (len == 0) {
            return new int[]{(int) (value & 0xFFFFF)};
        } else {
            var folding = new int[len];
            for (int i = 0; i < len; i++) {
                folding[i] = (int) (value & 0xFFFF);
                value >>= 16;
            }
            return folding;
        }
    }

    /**
     * Folds a string code point by code point using {@code CaseFolding.fold}.
     * Returns the input instance itself when no code point has a defined folding.
     */
    private static String foldCase(String s) {
        int first;
        int len = s.length();
        int cpCnt = 1;
        // Find the first code point that has a folding; everything before it is copied as is.
        for (first = 0; first < len; first += cpCnt) {
            int cp = s.codePointAt(first);
            if (CaseFolding.isDefined(cp)) {
                break;
            }
            cpCnt = Character.charCount(cp);
        }
        if (first == len) {
            return s;
        }
        StringBuilder sb = new StringBuilder(len);
        sb.append(s, 0, first);
        for (int i = first; i < len; i += cpCnt) {
            int cp = s.codePointAt(i);
            int[] folded = longToFolding(CaseFolding.fold(cp));
            for (int f : folded) {
                sb.appendCodePoint(f);
            }
            cpCnt = Character.charCount(cp);
        }
        return sb.toString();
    }
}

View File

@@ -0,0 +1,200 @@
/*
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.java.lang;
import org.openjdk.jmh.annotations.*;
import java.util.concurrent.TimeUnit;
/*
* This benchmark naively explores String::compareToFoldCase performance.
*
* Four groups of benchmark methods run over the same inputs:
*   (no suffix)  String::compareToIgnoreCase
*   FC           String::compareToFoldCase
*   EQ           String::equalsIgnoreCase
*   EQFC         String::equalsFoldCase
*/
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Thread)
@Warmup(iterations = 5, time = 1)
@Measurement(iterations = 5, time = 1)
@Fork(3)
public class StringCompareToFoldCase {
// ASCII inputs: all upper case, upper case with a lower-case tail, all lower case.
private String asciiUpper = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
private String asciiUpperLower = "ABCDEFGHIJKLMNOpqrstuvwxyz";
private String asciiLower = "abcdefghijklmnopqrstuvwxyz";
// Inputs containing runs of U+00DF (sharp s); the second one mixes "ss" runs with sharp s.
private String asciiWithDF = "abcdßßßßßßßßßßßßßßßßWXYZ";
private String asciiWithDFSS = "abcdssssssssssssssssßßßßßßßßWXYZ";
// An all-ASCII string ending in '0' and one ending in U+0391.
// NOTE(review): the names suggest this pair is meant to mix Latin-1 and UTF-16 coded strings - confirm.
private String asciiLatine1 = "ABCDEFGHIJKLMNOpqrstuvwxyz0";
private String asciiLatin1UTF16 = "abcdefghijklmnopqrstuvwxyz\u0391";
// Greek inputs: upper case, upper case with a lower-case last letter, lower case.
private String greekUpper = "\u0391\u0392\u0393\u0394\u0395\u0391\u0392\u0393\u0394\u0395"; // ΑΒΓΔΕ
private String greekUpperLower = "\u0391\u0392\u0393\u0394\u0395\u0391\u0392\u0393\u0394\u03B5"; // ΑΒΓΔε
private String greekLower = "\u03B1\u03B2\u03B3\u03B4\u03B5\u03B1\u03B2\u03B3\u03B4\u03B5"; // αβγδε
// Supplementary code points written as surrogate pairs: U+10400..U+10404,
// U+10400..U+10403 followed by U+1042C, and U+10428..U+1042C.
public String supUpper = "\ud801\udc00\ud801\udc01\ud801\udc02\ud801\udc03\ud801\udc04";
public String supUpperLower = "\ud801\udc00\ud801\udc01\ud801\udc02\ud801\udc03\ud801\udc2c";
public String supLower = "\ud801\udc28\ud801\udc29\ud801\udc2a\ud801\udc2b\ud801\udc2c";
// ---- String::compareToIgnoreCase ----
@Benchmark
public int asciiUpperLower() {
return asciiUpper.compareToIgnoreCase(asciiUpperLower);
}
@Benchmark
public int asciiLower() {
return asciiUpper.compareToIgnoreCase(asciiLower);
}
@Benchmark
public int greekUpperLower() {
return greekUpper.compareToIgnoreCase(greekUpperLower);
}
@Benchmark
public int greekLower() {
return greekUpper.compareToIgnoreCase(greekLower);
}
@Benchmark
public int latin1UTF16() {
return asciiLatine1.compareToIgnoreCase(asciiLatin1UTF16);
}
@Benchmark
public int supUpperLower() {
return supUpper.compareToIgnoreCase(supUpperLower);
}
@Benchmark
public int supLower() {
return supUpper.compareToIgnoreCase(supLower);
}
// ---- String::compareToFoldCase (FC) ----
@Benchmark
public int asciiUpperLowerFC() {
return asciiUpper.compareToFoldCase(asciiUpperLower);
}
@Benchmark
public int asciiLowerFC() {
return asciiUpper.compareToFoldCase(asciiLower);
}
// Sharp-s inputs are only exercised with compareToFoldCase.
@Benchmark
public int asciiWithDFFC() {
return asciiWithDF.compareToFoldCase(asciiWithDFSS);
}
@Benchmark
public int greekUpperLowerFC() {
return greekUpper.compareToFoldCase(greekUpperLower);
}
@Benchmark
public int greekLowerFC() {
return greekUpper.compareToFoldCase(greekLower);
}
@Benchmark
public int latin1UTF16FC() {
return asciiLatine1.compareToFoldCase(asciiLatin1UTF16); }
@Benchmark
public int supUpperLowerFC() {
return supUpper.compareToFoldCase(supUpperLower);
}
@Benchmark
public int supLowerFC() {
return supUpper.compareToFoldCase(supLower);
}
// ---- String::equalsIgnoreCase (EQ) ----
@Benchmark
public boolean asciiUpperLowerEQ() {
return asciiUpper.equalsIgnoreCase(asciiUpperLower);
}
@Benchmark
public boolean asciiLowerEQ() {
return asciiUpper.equalsIgnoreCase(asciiLower);
}
@Benchmark
public boolean greekUpperLowerEQ() {
return greekUpper.equalsIgnoreCase(greekUpperLower);
}
@Benchmark
public boolean greekLowerEQ() {
return greekUpper.equalsIgnoreCase(greekLower);
}
@Benchmark
public boolean latin1UTF16EQ() {
return asciiLatine1.equalsIgnoreCase(asciiLatin1UTF16);
}
@Benchmark
public boolean supUpperLowerEQ() {
return supUpper.equalsIgnoreCase(supUpperLower);
}
@Benchmark
public boolean supLowerEQ() {
return supUpper.equalsIgnoreCase(supLower);
}
// ---- String::equalsFoldCase (EQFC) ----
@Benchmark
public boolean asciiUpperLowerEQFC() {
return asciiUpper.equalsFoldCase(asciiUpperLower);
}
@Benchmark
public boolean asciiLowerEQFC() {
return asciiUpper.equalsFoldCase(asciiLower);
}
@Benchmark
public boolean greekUpperLowerEQFC() {
return greekUpper.equalsFoldCase(greekUpperLower);
}
@Benchmark
public boolean greekLowerEQFC() {
return greekUpper.equalsFoldCase(greekLower);
}
@Benchmark
public boolean latin1UTF16EQFC() {
return asciiLatine1.equalsFoldCase(asciiLatin1UTF16);
}
@Benchmark
public boolean supUpperLowerEQFC() {
return supUpper.equalsFoldCase(supUpperLower);
}
@Benchmark
public boolean supLowerEQFC() {
return supUpper.equalsFoldCase(supLower);
}
}