Mirror of https://github.com/JetBrains/JetBrainsRuntime.git
Synced 2025-12-08 02:19:37 +01:00

Compare commits (58 commits)
| Author | SHA1 | Date |
|---|---|---|
| | cf948548c3 | |
| | 7ece9e90c0 | |
| | 9320ef9b29 | |
| | 2a565ff368 | |
| | 493b5bd2fd | |
| | f573f6d233 | |
| | 8a0a6f8c25 | |
| | 3d9d353edb | |
| | 1b621f5527 | |
| | 5463c9cd9a | |
| | ac07355f55 | |
| | 4fb5c12813 | |
| | d5a96e3f49 | |
| | aadf36809c | |
| | a3447ec656 | |
| | b25ed57b76 | |
| | df4ed7eff7 | |
| | 5718039a46 | |
| | c51685267c | |
| | 7d903964fb | |
| | 6f4824068d | |
| | e1fd663f22 | |
| | d5214a4288 | |
| | 2611a49ea1 | |
| | b8c0b2fd8c | |
| | 973bcdab81 | |
| | 6359b4ec23 | |
| | ce4b257fa5 | |
| | b270f30d10 | |
| | 486594d427 | |
| | ce8399fd60 | |
| | 3c6459e1de | |
| | 92fd490f22 | |
| | d13302f8b0 | |
| | ce108446ca | |
| | 5c12a182e3 | |
| | 71800884f6 | |
| | 0c178beb69 | |
| | 6c13a3032f | |
| | 5e6bfc5eaa | |
| | 2c2d4d2cde | |
| | 0eb299af79 | |
| | b893a2b2f7 | |
| | 05f950934e | |
| | 701bc3bbbe | |
| | 9e48b90c7f | |
| | bad5edf146 | |
| | f577385fc8 | |
| | 86623aa41d | |
| | af5c49226c | |
| | cb7e3d263a | |
| | 25dc4762b4 | |
| | 11e4a925be | |
| | 354ea4c28f | |
| | 959a443a9e | |
| | 4ed38f5ad5 | |
| | fe4c0a2f04 | |
| | 519ecd352a | |
@@ -1,7 +1,7 @@
 [general]
 project=jdk
 jbs=JDK
-version=22
+version=23
 
 [checks]
 error=author,committer,reviewers,merge,issues,executable,symlink,message,hg-tag,whitespace,problemlists

@@ -58,7 +58,7 @@ DEMO_MANIFEST := $(SUPPORT_OUTPUTDIR)/demos/java-main-manifest.mf
 # This rule will be depended on due to the MANIFEST line in SetupBuildDemo
 # and SetupBuildJvmtiDemo.
 $(eval $(call SetupTextFileProcessing, BUILD_JAVA_MANIFEST, \
-    SOURCE_FILES := $(TOPDIR)/make/data/mainmanifest/manifest.mf, \
+    SOURCE_FILES := $(TOPDIR)/make/data/mainmanifest/manifest.mf.template, \
     OUTPUT_FILE := $(DEMO_MANIFEST), \
     REPLACEMENTS := \
         @@VERSION_SPECIFICATION@@ => $(VERSION_SPECIFICATION) ; \

@@ -33,7 +33,7 @@ include TextFileProcessing.gmk
 
 # This rule will be depended on due to the MANIFEST line
 $(eval $(call SetupTextFileProcessing, BUILD_JAVA_MANIFEST, \
-    SOURCE_FILES := $(TOPDIR)/make/data/mainmanifest/manifest.mf, \
+    SOURCE_FILES := $(TOPDIR)/make/data/mainmanifest/manifest.mf.template, \
     OUTPUT_FILE := $(SUPPORT_OUTPUTDIR)/java-main-manifest.mf, \
     REPLACEMENTS := \
         @@VERSION_SPECIFICATION@@ => $(VERSION_SPECIFICATION) ; \

@@ -69,7 +69,7 @@ ifeq ($(call isTargetOs, macosx), true)
   ))
 
   $(eval $(call SetupTextFileProcessing, BUILD_JDK_PLIST, \
-      SOURCE_FILES := $(MACOSX_PLIST_SRC)/JDK-Info.plist, \
+      SOURCE_FILES := $(MACOSX_PLIST_SRC)/JDK-Info.plist.template, \
       OUTPUT_FILE := $(JDK_MACOSX_CONTENTS_DIR)/Info.plist, \
       REPLACEMENTS := \
           @@ID@@ => $(MACOSX_BUNDLE_ID_BASE).jdk ; \

@@ -82,7 +82,7 @@ ifeq ($(call isTargetOs, macosx), true)
   ))
 
   $(eval $(call SetupTextFileProcessing, BUILD_JRE_PLIST, \
-      SOURCE_FILES := $(MACOSX_PLIST_SRC)/JRE-Info.plist, \
+      SOURCE_FILES := $(MACOSX_PLIST_SRC)/JRE-Info.plist.template, \
      OUTPUT_FILE := $(JRE_MACOSX_CONTENTS_DIR)/Info.plist, \
      REPLACEMENTS := \
          @@ID@@ => $(MACOSX_BUNDLE_ID_BASE).jre ; \
@@ -744,9 +744,16 @@ endif
 
 $(eval $(call SetupTarget, build-test-lib, \
     MAKEFILE := test/BuildTestLib, \
+    TARGET := build-test-lib, \
     DEPS := exploded-image, \
 ))
 
+$(eval $(call SetupTarget, test-image-lib, \
+    MAKEFILE := test/BuildTestLib, \
+    TARGET := test-image-lib, \
+    DEPS := build-test-lib, \
+))
+
 ifeq ($(BUILD_FAILURE_HANDLER), true)
   # Builds the failure handler jtreg extension
   $(eval $(call SetupTarget, build-test-failure-handler, \

@@ -781,7 +788,7 @@ endif
 
 $(eval $(call SetupTarget, build-microbenchmark, \
     MAKEFILE := test/BuildMicrobenchmark, \
-    DEPS := interim-langtools exploded-image, \
+    DEPS := interim-langtools exploded-image build-test-lib, \
 ))
 
 ################################################################################

@@ -1264,7 +1271,7 @@ all-docs-bundles: docs-jdk-bundles docs-javase-bundles docs-reference-bundles
 # This target builds the test image
 test-image: prepare-test-image test-image-jdk-jtreg-native \
     test-image-demos-jdk test-image-libtest-jtreg-native \
-    test-image-lib-native
+    test-image-lib test-image-lib-native
 
 ifneq ($(JVM_TEST_IMAGE_TARGETS), )
   # If JVM_TEST_IMAGE_TARGETS is externally defined, use it instead of the
@@ -448,17 +448,17 @@ AC_DEFUN_ONCE([BASIC_SETUP_OUTPUT_DIR],
   AC_SUBST(CONFIGURESUPPORT_OUTPUTDIR)
 
   # The spec.gmk file contains all variables for the make system.
-  AC_CONFIG_FILES([$OUTPUTDIR/spec.gmk:$AUTOCONF_DIR/spec.gmk.in])
+  AC_CONFIG_FILES([$OUTPUTDIR/spec.gmk:$AUTOCONF_DIR/spec.gmk.template])
   # The bootcycle-spec.gmk file contains support for boot cycle builds.
-  AC_CONFIG_FILES([$OUTPUTDIR/bootcycle-spec.gmk:$AUTOCONF_DIR/bootcycle-spec.gmk.in])
+  AC_CONFIG_FILES([$OUTPUTDIR/bootcycle-spec.gmk:$AUTOCONF_DIR/bootcycle-spec.gmk.template])
   # The buildjdk-spec.gmk file contains support for building a buildjdk when cross compiling.
-  AC_CONFIG_FILES([$OUTPUTDIR/buildjdk-spec.gmk:$AUTOCONF_DIR/buildjdk-spec.gmk.in])
+  AC_CONFIG_FILES([$OUTPUTDIR/buildjdk-spec.gmk:$AUTOCONF_DIR/buildjdk-spec.gmk.template])
   # The compare.sh is used to compare the build output to other builds.
-  AC_CONFIG_FILES([$OUTPUTDIR/compare.sh:$AUTOCONF_DIR/compare.sh.in])
+  AC_CONFIG_FILES([$OUTPUTDIR/compare.sh:$AUTOCONF_DIR/compare.sh.template])
   # The generated Makefile knows where the spec.gmk is and where the source is.
   # You can run make from the OUTPUTDIR, or from the top-level Makefile
   # which will look for generated configurations
-  AC_CONFIG_FILES([$OUTPUTDIR/Makefile:$AUTOCONF_DIR/Makefile.in])
+  AC_CONFIG_FILES([$OUTPUTDIR/Makefile:$AUTOCONF_DIR/Makefile.template])
 ])
 
 ###############################################################################
@@ -110,6 +110,15 @@ AC_DEFUN_ONCE([JDKVER_SETUP_JDK_VERSION_NUMBERS],
       CHECK_VALUE: [UTIL_CHECK_STRING_NON_EMPTY_PRINTABLE])
   AC_SUBST(COMPANY_NAME)
 
+  # Set the JDK RC Company name
+  # Otherwise uses the value set for "vendor-name".
+  UTIL_ARG_WITH(NAME: jdk-rc-company-name, TYPE: string,
+      DEFAULT: $COMPANY_NAME,
+      DESC: [Set JDK RC company name. This is used for CompanyName properties of MS Windows binaries.],
+      DEFAULT_DESC: [from branding.conf],
+      CHECK_VALUE: [UTIL_CHECK_STRING_NON_EMPTY_PRINTABLE])
+  AC_SUBST(JDK_RC_COMPANY_NAME)
+
   # The vendor URL, if any
   # Only set VENDOR_URL if '--with-vendor-url' was used and is not empty.
   # Otherwise we will use the value from "branding.conf" included above.

@@ -191,6 +191,7 @@ PRODUCT_NAME := @PRODUCT_NAME@
 PRODUCT_SUFFIX := @PRODUCT_SUFFIX@
 JDK_RC_PLATFORM_NAME := @JDK_RC_PLATFORM_NAME@
 JDK_RC_NAME := @JDK_RC_NAME@
+JDK_RC_COMPANY_NAME:=@JDK_RC_COMPANY_NAME@
 COMPANY_NAME := @COMPANY_NAME@
 HOTSPOT_VM_DISTRO := @HOTSPOT_VM_DISTRO@
 MACOSX_BUNDLE_NAME_BASE := @MACOSX_BUNDLE_NAME_BASE@
@@ -98,7 +98,7 @@ GLOBAL_VERSION_INFO_RESOURCE := $(TOPDIR)/src/java.base/windows/native/common/ve
 
 JDK_RCFLAGS=$(RCFLAGS) \
     -D"JDK_VERSION_STRING=$(VERSION_STRING)" \
-    -D"JDK_COMPANY=$(COMPANY_NAME)" \
+    -D"JDK_COMPANY=$(JDK_RC_COMPANY_NAME)" \
     -D"JDK_VER=$(VERSION_NUMBER_FOUR_POSITIONS)" \
     -D"JDK_COPYRIGHT=Copyright \xA9 $(COPYRIGHT_YEAR)" \
     -D"JDK_NAME=$(JDK_RC_NAME) $(VERSION_SHORT)" \

@@ -112,7 +112,7 @@ define SetupBuildLauncherBody
   $1_PLIST_FILE := $$(SUPPORT_OUTPUTDIR)/native/$$(MODULE)/$1/Info.plist
 
   $$(eval $$(call SetupTextFileProcessing, BUILD_PLIST_$1, \
-      SOURCE_FILES := $(TOPDIR)/make/data/bundle/cmdline-Info.plist, \
+      SOURCE_FILES := $(TOPDIR)/make/data/bundle/cmdline-Info.plist.template, \
      OUTPUT_FILE := $$($1_PLIST_FILE), \
      REPLACEMENTS := \
        @@ID@@ => $(MACOSX_BUNDLE_ID_BASE).$1 ; \

@@ -1206,7 +1206,7 @@ var getJibProfilesDependencies = function (input, common) {
 
         jcov: {
             organization: common.organization,
-            revision: "3.0-15-jdk-asm+1.0",
+            revision: "3.0-16-jdk-asm+1.0",
             ext: "zip",
            environment_name: "JCOV_HOME",
        },
@@ -26,17 +26,17 @@
 # Default version, product, and vendor information to use,
 # unless overridden by configure
 
-DEFAULT_VERSION_FEATURE=22
+DEFAULT_VERSION_FEATURE=23
 DEFAULT_VERSION_INTERIM=0
 DEFAULT_VERSION_UPDATE=0
 DEFAULT_VERSION_PATCH=0
 DEFAULT_VERSION_EXTRA1=0
 DEFAULT_VERSION_EXTRA2=0
 DEFAULT_VERSION_EXTRA3=0
-DEFAULT_VERSION_DATE=2024-03-19
-DEFAULT_VERSION_CLASSFILE_MAJOR=66 # "`$EXPR $DEFAULT_VERSION_FEATURE + 44`"
+DEFAULT_VERSION_DATE=2024-09-17
+DEFAULT_VERSION_CLASSFILE_MAJOR=67 # "`$EXPR $DEFAULT_VERSION_FEATURE + 44`"
 DEFAULT_VERSION_CLASSFILE_MINOR=0
 DEFAULT_VERSION_DOCS_API_SINCE=11
-DEFAULT_ACCEPTABLE_BOOT_VERSIONS="21 22"
-DEFAULT_JDK_SOURCE_TARGET_VERSION=22
+DEFAULT_ACCEPTABLE_BOOT_VERSIONS="21 22 23"
+DEFAULT_JDK_SOURCE_TARGET_VERSION=23
 DEFAULT_PROMOTED_VERSION_PRE=ea
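The classfile bump from 66 to 67 follows the rule already noted in the comment above: the class-file major version is the feature release plus 44. A trivial illustrative check in Java (the class name is made up for this sketch and is not part of the commit):

```java
public class ClassFileMajor {
    public static void main(String[] args) {
        int feature = Runtime.version().feature(); // 23 on a JDK 23 build
        // DEFAULT_VERSION_CLASSFILE_MAJOR above is feature + 44, so 23 -> 67.
        System.out.println("class-file major version: " + (feature + 44));
    }
}
```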
@@ -48,7 +48,7 @@ $(eval $(call IncludeCustomExtension, hotspot/gensrc/GenerateSources.gmk))
 
 # Setup the hotspot launcher script for developer use
 $(eval $(call SetupTextFileProcessing, CREATE_HOTSPOT_LAUNCHER, \
-    SOURCE_FILES := $(TOPDIR)/make/scripts/hotspot.sh, \
+    SOURCE_FILES := $(TOPDIR)/make/scripts/hotspot.sh.template, \
     OUTPUT_FILE := $(JVM_OUTPUTDIR)/hotspot, \
     REPLACEMENTS := \
         @@LIBARCH@@ => $(OPENJDK_TARGET_CPU_LEGACY_LIB) ; \

@@ -245,7 +245,7 @@ ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2)
       TOOLCHAIN := TOOLCHAIN_LINK_CXX, \
       OPTIMIZATION := HIGH, \
       CFLAGS := $(CFLAGS_JDKLIB), \
-      CXXFLAGS := $(CXXFLAGS_JDKLIB), \
+      CXXFLAGS := $(CXXFLAGS_JDKLIB) -std=c++17, \
       LDFLAGS := $(LDFLAGS_JDKLIB) \
          $(call SET_SHARED_LIBRARY_ORIGIN), \
      LIBS := $(LIBCXX), \
@@ -53,11 +53,10 @@ JMH_UNPACKED_DIR := $(MICROBENCHMARK_OUTPUT)/jmh_jars
 JMH_UNPACKED_JARS_DONE := $(JMH_UNPACKED_DIR)/_unpacked.marker
 
 # External dependencies
-JMH_COMPILE_JARS := $(JMH_CORE_JAR) $(JMH_GENERATOR_JAR)
+WHITEBOX_JAR := $(SUPPORT_OUTPUTDIR)/test/lib/wb.jar
+JMH_COMPILE_JARS := $(JMH_CORE_JAR) $(JMH_GENERATOR_JAR) $(WHITEBOX_JAR)
 JMH_RUNTIME_JARS := $(JMH_CORE_JAR) $(JMH_COMMONS_MATH_JAR) $(JMH_JOPT_SIMPLE_JAR)
 
-MICROBENCHMARK_CLASSPATH := $(call PathList, $(JMH_COMPILE_JARS))
-
 # Native dependencies
 MICROBENCHMARK_NATIVE_SRC_DIRS := $(MICROBENCHMARK_SRC)
 MICROBENCHMARK_NATIVE_OUTPUT := $(MICROBENCHMARK_OUTPUT)/native

@@ -92,24 +91,28 @@ $(eval $(call SetupJavaCompilation, BUILD_INDIFY, \
 $(eval $(call SetupJavaCompilation, BUILD_JDK_MICROBENCHMARK, \
     TARGET_RELEASE := $(TARGET_RELEASE_NEWJDK_UPGRADED), \
     SMALL_JAVA := false, \
-    CLASSPATH := $(MICROBENCHMARK_CLASSPATH), \
-    DISABLED_WARNINGS := restricted this-escape processing rawtypes cast serial preview, \
+    CLASSPATH := $(JMH_COMPILE_JARS), \
+    DISABLED_WARNINGS := restricted this-escape processing rawtypes cast \
+        serial preview, \
     SRC := $(MICROBENCHMARK_SRC), \
     BIN := $(MICROBENCHMARK_CLASSES), \
-    JAVAC_FLAGS := --add-exports java.base/sun.security.util=ALL-UNNAMED \
-        --add-exports java.base/sun.invoke.util=ALL-UNNAMED \
-        --add-exports java.base/jdk.internal.org.objectweb.asm=ALL-UNNAMED \
-        --add-exports java.base/jdk.internal.org.objectweb.asm.tree=ALL-UNNAMED \
-        --add-exports java.base/jdk.internal.vm=ALL-UNNAMED \
-        --add-exports java.base/jdk.internal.misc=ALL-UNNAMED \
+    JAVAC_FLAGS := \
+        --add-exports java.base/jdk.internal.classfile.impl=ALL-UNNAMED \
+        --add-exports java.base/jdk.internal.event=ALL-UNNAMED \
+        --add-exports java.base/jdk.internal.foreign=ALL-UNNAMED \
+        --add-exports java.base/jdk.internal.misc=ALL-UNNAMED \
+        --add-exports java.base/jdk.internal.org.objectweb.asm.tree=ALL-UNNAMED \
+        --add-exports java.base/jdk.internal.org.objectweb.asm=ALL-UNNAMED \
+        --add-exports java.base/jdk.internal.vm=ALL-UNNAMED \
+        --add-exports java.base/sun.invoke.util=ALL-UNNAMED \
+        --add-exports java.base/sun.security.util=ALL-UNNAMED \
         --enable-preview \
         -processor org.openjdk.jmh.generators.BenchmarkProcessor, \
-    JAVA_FLAGS := --add-modules jdk.unsupported --limit-modules java.management \
-        --enable-preview, \
+    JAVA_FLAGS := \
+        --add-exports java.base/jdk.internal.vm=ALL-UNNAMED \
+        --add-modules jdk.unsupported \
+        --enable-preview \
+        --limit-modules java.management, \
 ))
 
 $(BUILD_JDK_MICROBENCHMARK): $(JMH_COMPILE_JARS)
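Adding wb.jar to JMH_COMPILE_JARS (and build-test-lib to the microbenchmark DEPS) lets JMH benchmarks compile against the test-library WhiteBox API. As a rough illustration only — the package and class below are hypothetical, and the run assumes the usual WhiteBox launch flags (-Xbootclasspath/a:wb.jar -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI) — such a benchmark could look like this:

```java
package org.openjdk.bench.example; // hypothetical package, for illustration only

import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.State;

import jdk.test.whitebox.WhiteBox;

@State(Scope.Benchmark)
public class WhiteBoxAssistedBench {
    // Resolved from wb.jar on the boot class path; requires -XX:+WhiteBoxAPI at run time.
    private static final WhiteBox WB = WhiteBox.getWhiteBox();

    @Benchmark
    public void forcedYoungGC() {
        // Uses a VM hook that plain benchmark code cannot reach.
        WB.youngGC();
    }
}
```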
@@ -23,12 +23,22 @@
 # questions.
 #
 
+################################################################################
+# This file builds the Java components of testlib.
+# It also covers the test-image part, where the built files are copied to the
+# test image.
+################################################################################
+
 default: all
 
 include $(SPEC)
 include MakeBase.gmk
 include JavaCompilation.gmk
 
+################################################################################
+# Targets for building the test lib jars
+################################################################################
+
 TARGETS :=
 
 TEST_LIB_SOURCE_DIR := $(TOPDIR)/test/lib

@@ -63,8 +73,21 @@ $(eval $(call SetupJavaCompilation, BUILD_TEST_LIB_JAR, \
 
 TARGETS += $(BUILD_TEST_LIB_JAR)
 
-##########################################################################################
+build-test-lib: $(TARGETS)
 
-all: $(TARGETS)
+################################################################################
+# Targets for building test-image.
+################################################################################
 
-.PHONY: default all
+# Copy the jars to the test image.
+$(eval $(call SetupCopyFiles, COPY_LIBTEST_JARS, \
+    DEST := $(TEST_IMAGE_DIR)/lib-test, \
+    FILES := $(BUILD_WB_JAR_JAR) $(BUILD_TEST_LIB_JAR_JAR), \
+))
+#
+
+test-image-lib: $(COPY_LIBTEST_JARS)
+
+all: build-test-lib
+
+.PHONY: default all build-test-lib test-image-lib
@@ -193,4 +193,9 @@
     }
   }
 
+  // Is SIMD sort supported for this CPU?
+  static bool supports_simd_sort(BasicType bt) {
+    return false;
+  }
+
 #endif // CPU_AARCH64_MATCHER_AARCH64_HPP

@@ -303,15 +303,19 @@ void InterpreterMacroAssembler::load_field_entry(Register cache, Register index,
 }
 
 void InterpreterMacroAssembler::load_method_entry(Register cache, Register index, int bcp_offset) {
   assert_different_registers(cache, index);
 
   // Get index out of bytecode pointer
   get_index_at_bcp(index, bcp_offset, cache /* as tmp */, sizeof(u2));
 
+  // sizeof(ResolvedMethodEntry) is not a power of 2 on Arm, so can't use shift
+  mov(cache, sizeof(ResolvedMethodEntry));
+  mul(index, index, cache); // Scale the index to be the entry index * sizeof(ResolvedMethodEntry)
+
   // load constant pool cache pointer
   ldr(cache, Address(FP, frame::interpreter_frame_cache_offset * wordSize));
   // Get address of method entries array
-  ldr(cache, Address(cache, ConstantPoolCache::method_entries_offset()));
+  ldr(cache, Address(cache, in_bytes(ConstantPoolCache::method_entries_offset())));
   add(cache, cache, Array<ResolvedMethodEntry>::base_offset_in_bytes());
   add(cache, cache, index);
 }

@@ -186,4 +186,9 @@
     }
   }
 
+  // Is SIMD sort supported for this CPU?
+  static bool supports_simd_sort(BasicType bt) {
+    return false;
+  }
+
 #endif // CPU_ARM_MATCHER_ARM_HPP
@@ -370,17 +370,16 @@ address TemplateInterpreterGenerator::generate_return_entry_for(TosState state,
   if (index_size == sizeof(u4)) {
     __ load_resolved_indy_entry(Rcache, Rindex);
     __ ldrh(Rcache, Address(Rcache, in_bytes(ResolvedIndyEntry::num_parameters_offset())));
-    __ check_stack_top();
-    __ add(Rstack_top, Rstack_top, AsmOperand(Rcache, lsl, Interpreter::logStackElementSize));
   } else {
     // Pop N words from the stack
     assert(index_size == sizeof(u2), "Can only be u2");
     __ load_method_entry(Rcache, Rindex);
-    __ ldrh(Rcache, Address(Rcache, in_bytes(ResolvedIndyEntry::num_parameters_offset())));
-    __ check_stack_top();
-    __ add(Rstack_top, Rstack_top, AsmOperand(Rcache, lsl, Interpreter::logStackElementSize));
+    __ ldrh(Rcache, Address(Rcache, in_bytes(ResolvedMethodEntry::num_parameters_offset())));
   }
 
+  __ check_stack_top();
+  __ add(Rstack_top, Rstack_top, AsmOperand(Rcache, lsl, Interpreter::logStackElementSize));
+
   __ convert_retval_to_tos(state);
 
   __ check_and_handle_popframe();

@@ -3666,15 +3666,15 @@ void TemplateTable::prepare_invoke(Register Rcache, Register recv) {
   // load receiver if needed (after extra argument is pushed so parameter size is correct)
   if (load_receiver) {
     __ ldrh(recv, Address(Rcache, in_bytes(ResolvedMethodEntry::num_parameters_offset())));
-    Address recv_addr = __ receiver_argument_address(Rstack_top, Rtemp, recv);
-    __ ldr(recv, recv_addr);
+    __ add(recv, Rstack_top, AsmOperand(recv, lsl, Interpreter::logStackElementSize));
+    __ ldr(recv, Address(recv, -Interpreter::stackElementSize));
     __ verify_oop(recv);
   }
 
   // load return address
   { const address table = (address) Interpreter::invoke_return_entry_table_for(code);
-    __ mov_slow(Rtemp, table);
-    __ ldr(LR, Address::indexed_ptr(Rtemp, ret_type));
+    __ mov_slow(LR, table);
+    __ ldr(LR, Address::indexed_ptr(LR, ret_type));
   }
 }

@@ -3744,10 +3744,13 @@ void TemplateTable::invokevirtual(int byte_no) {
 void TemplateTable::invokespecial(int byte_no) {
   transition(vtos, vtos);
   assert(byte_no == f1_byte, "use this argument");
+
   const Register Rrecv = R2_tmp;
-  load_resolved_method_entry_special_or_static(R2_tmp, // ResolvedMethodEntry*
+  const Register Rflags = R3_tmp;
+
+  load_resolved_method_entry_special_or_static(Rrecv,   // ResolvedMethodEntry*
                                                Rmethod, // Method*
-                                               R3_tmp); // Flags
+                                               Rflags); // Flags
   prepare_invoke(Rrecv, Rrecv);
   __ verify_oop(Rrecv);
   __ null_check(Rrecv, Rtemp);

@@ -3760,12 +3763,16 @@ void TemplateTable::invokespecial(int byte_no) {
 void TemplateTable::invokestatic(int byte_no) {
   transition(vtos, vtos);
   assert(byte_no == f1_byte, "use this argument");
-  load_resolved_method_entry_special_or_static(R2_tmp, // ResolvedMethodEntry*
+
+  const Register Rrecv = R2_tmp;
+  const Register Rflags = R3_tmp;
+
+  load_resolved_method_entry_special_or_static(Rrecv,   // ResolvedMethodEntry*
                                                Rmethod, // Method*
-                                               R3_tmp); // Flags
-  prepare_invoke(R2_tmp, R2_tmp);
+                                               Rflags); // Flags
+  prepare_invoke(Rrecv, Rrecv);
   // do the call
-  __ profile_call(R2_tmp);
+  __ profile_call(Rrecv);
   __ jump_from_interpreted(Rmethod);
 }

@@ -3788,10 +3795,10 @@ void TemplateTable::invokeinterface(int byte_no) {
   const Register Rflags = R3_tmp;
   const Register Rklass = R2_tmp; // Note! Same register with Rrecv
 
-  load_resolved_method_entry_interface(R2_tmp, // ResolvedMethodEntry*
-                                       R1_tmp, // Klass*
+  load_resolved_method_entry_interface(Rrecv,   // ResolvedMethodEntry*
+                                       Rinterf, // Klass*
                                        Rmethod, // Method* or itable/vtable index
-                                       R3_tmp); // Flags
+                                       Rflags); // Flags
   prepare_invoke(Rrecv, Rrecv);
 
   // First check for Object case, then private interface method,
@@ -195,4 +195,9 @@
     }
   }
 
+  // Is SIMD sort supported for this CPU?
+  static bool supports_simd_sort(BasicType bt) {
+    return false;
+  }
+
 #endif // CPU_PPC_MATCHER_PPC_HPP
@@ -1459,6 +1459,112 @@ void C2_MacroAssembler::string_equals(Register a1, Register a2,
   BLOCK_COMMENT("} string_equals");
 }
 
+// jdk.internal.util.ArraysSupport.vectorizedHashCode
+void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
+                                        Register tmp1, Register tmp2, Register tmp3,
+                                        Register tmp4, Register tmp5, Register tmp6,
+                                        BasicType eltype)
+{
+  assert_different_registers(ary, cnt, result, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, t0, t1);
+
+  const int elsize = arrays_hashcode_elsize(eltype);
+  const int chunks_end_shift = exact_log2(elsize);
+
+  switch (eltype) {
+  case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
+  case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
+  case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
+  case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
+  case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
+  default:
+    ShouldNotReachHere();
+  }
+
+  const int stride = 4;
+  const Register pow31_4 = tmp1;
+  const Register pow31_3 = tmp2;
+  const Register pow31_2 = tmp3;
+  const Register chunks = tmp4;
+  const Register chunks_end = chunks;
+
+  Label DONE, TAIL, TAIL_LOOP, WIDE_LOOP;
+
+  // result has a value initially
+
+  beqz(cnt, DONE);
+
+  andi(chunks, cnt, ~(stride-1));
+  beqz(chunks, TAIL);
+
+  mv(pow31_4, 923521); // [31^^4]
+  mv(pow31_3, 29791);  // [31^^3]
+  mv(pow31_2, 961);    // [31^^2]
+
+  slli(chunks_end, chunks, chunks_end_shift);
+  add(chunks_end, ary, chunks_end);
+  andi(cnt, cnt, stride-1); // don't forget about tail!
+
+  bind(WIDE_LOOP);
+  mulw(result, result, pow31_4); // 31^^4 * h
+  arrays_hashcode_elload(t0,   Address(ary, 0 * elsize), eltype);
+  arrays_hashcode_elload(t1,   Address(ary, 1 * elsize), eltype);
+  arrays_hashcode_elload(tmp5, Address(ary, 2 * elsize), eltype);
+  arrays_hashcode_elload(tmp6, Address(ary, 3 * elsize), eltype);
+  mulw(t0, t0, pow31_3); // 31^^3 * ary[i+0]
+  addw(result, result, t0);
+  mulw(t1, t1, pow31_2); // 31^^2 * ary[i+1]
+  addw(result, result, t1);
+  slli(t0, tmp5, 5);     // optimize 31^^1 * ary[i+2]
+  subw(tmp5, t0, tmp5);  // with ary[i+2]<<5 - ary[i+2]
+  addw(result, result, tmp5);
+  addw(result, result, tmp6); // 31^^4 * h + 31^^3 * ary[i+0] + 31^^2 * ary[i+1]
+                              //           + 31^^1 * ary[i+2] + 31^^0 * ary[i+3]
+  addi(ary, ary, elsize * stride);
+  bne(ary, chunks_end, WIDE_LOOP);
+  beqz(cnt, DONE);
+
+  bind(TAIL);
+  slli(chunks_end, cnt, chunks_end_shift);
+  add(chunks_end, ary, chunks_end);
+
+  bind(TAIL_LOOP);
+  arrays_hashcode_elload(t0, Address(ary), eltype);
+  slli(t1, result, 5);      // optimize 31 * result
+  subw(result, t1, result); // with result<<5 - result
+  addw(result, result, t0);
+  addi(ary, ary, elsize);
+  bne(ary, chunks_end, TAIL_LOOP);
+
+  bind(DONE);
+  BLOCK_COMMENT("} // arrays_hashcode");
+}
+
+int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
+  switch (eltype) {
+  case T_BOOLEAN: return sizeof(jboolean);
+  case T_BYTE:    return sizeof(jbyte);
+  case T_SHORT:   return sizeof(jshort);
+  case T_CHAR:    return sizeof(jchar);
+  case T_INT:     return sizeof(jint);
+  default:
+    ShouldNotReachHere();
+    return -1;
+  }
+}
+
+void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
+  switch (eltype) {
+  // T_BOOLEAN used as surrogate for unsigned byte
+  case T_BOOLEAN: lbu(dst, src); break;
+  case T_BYTE:    lb(dst, src);  break;
+  case T_SHORT:   lh(dst, src);  break;
+  case T_CHAR:    lhu(dst, src); break;
+  case T_INT:     lw(dst, src);  break;
+  default:
+    ShouldNotReachHere();
+  }
+}
+
 typedef void (Assembler::*conditional_branch_insn)(Register op1, Register op2, Label& label, bool is_far);
 typedef void (MacroAssembler::*float_conditional_branch_insn)(FloatRegister op1, FloatRegister op2, Label& label,
                                                               bool is_far, bool is_unordered);
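For reference, the wide loop above evaluates the same 31-based polynomial hash as jdk.internal.util.ArraysSupport.vectorizedHashCode, folding four elements per iteration with the precomputed powers 31^4 = 923521, 31^3 = 29791 and 31^2 = 961. A scalar Java sketch of that recurrence (illustrative only, not the intrinsic itself):

```java
// Scalar model of the unrolled loop: per iteration
//   h = 31^4*h + 31^3*a[i] + 31^2*a[i+1] + 31*a[i+2] + a[i+3],
// followed by an element-at-a-time tail, matching the WIDE_LOOP/TAIL_LOOP split above.
static int polyHash(int initialValue, int[] a) {
    int h = initialValue;
    int i = 0;
    for (; i + 4 <= a.length; i += 4) {
        h = 923521 * h        // 31^4 * h
          + 29791 * a[i]      // 31^3 * a[i]
          + 961 * a[i + 1]    // 31^2 * a[i+1]
          + 31 * a[i + 2]
          + a[i + 3];
    }
    for (; i < a.length; i++) {
        h = 31 * h + a[i];    // same as (h << 5) - h + a[i]
    }
    return h;
}
```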
@@ -82,6 +82,15 @@
                      Register result, Register cnt1,
                      int elem_size);
 
+  void arrays_hashcode(Register ary, Register cnt, Register result,
+                       Register tmp1, Register tmp2,
+                       Register tmp3, Register tmp4,
+                       Register tmp5, Register tmp6,
+                       BasicType eltype);
+  // helper function for arrays_hashcode
+  int arrays_hashcode_elsize(BasicType eltype);
+  void arrays_hashcode_elload(Register dst, Address src, BasicType eltype);
+
   void string_equals(Register r1, Register r2,
                      Register result, Register cnt1,
                      int elem_size);

@@ -192,4 +192,9 @@
     }
   }
 
+  // Is SIMD sort supported for this CPU?
+  static bool supports_simd_sort(BasicType bt) {
+    return false;
+  }
+
 #endif // CPU_RISCV_MATCHER_RISCV_HPP
@@ -10371,6 +10371,26 @@ instruct array_equalsC(iRegP_R11 ary1, iRegP_R12 ary2, iRegI_R10 result,
   ins_pipe(pipe_class_memory);
 %}
 
+// fast ArraysSupport.vectorizedHashCode
+instruct arrays_hashcode(iRegP_R11 ary, iRegI_R12 cnt, iRegI_R10 result, immI basic_type,
+                         iRegLNoSp tmp1, iRegLNoSp tmp2,
+                         iRegLNoSp tmp3, iRegLNoSp tmp4,
+                         iRegLNoSp tmp5, iRegLNoSp tmp6, rFlagsReg cr)
+%{
+  match(Set result (VectorizedHashCode (Binary ary cnt) (Binary result basic_type)));
+  effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6,
+         USE_KILL ary, USE_KILL cnt, USE basic_type, KILL cr);
+
+  format %{ "Array HashCode array[] $ary,$cnt,$result,$basic_type -> $result // KILL all" %}
+  ins_encode %{
+    __ arrays_hashcode($ary$$Register, $cnt$$Register, $result$$Register,
+                       $tmp1$$Register, $tmp2$$Register, $tmp3$$Register,
+                       $tmp4$$Register, $tmp5$$Register, $tmp6$$Register,
+                       (BasicType)$basic_type$$constant);
+  %}
+  ins_pipe(pipe_class_memory);
+%}
+
 // ============================================================================
 // Safepoint Instructions

@@ -315,6 +315,10 @@ void VM_Version::c2_initialize() {
     }
   }
 
+  if (FLAG_IS_DEFAULT(UseVectorizedHashCodeIntrinsic)) {
+    FLAG_SET_DEFAULT(UseVectorizedHashCodeIntrinsic, true);
+  }
+
   if (!UseZicbop) {
     if (!FLAG_IS_DEFAULT(AllocatePrefetchStyle)) {
       warning("Zicbop is not available on this CPU");
@@ -184,4 +184,9 @@
     }
   }
 
+  // Is SIMD sort supported for this CPU?
+  static bool supports_simd_sort(BasicType bt) {
+    return false;
+  }
+
 #endif // CPU_S390_MATCHER_S390_HPP
@@ -920,6 +920,7 @@ address Assembler::locate_operand(address inst, WhichOperand which) {
       case 0x11: // movups
       case 0x12: // movlps
       case 0x28: // movaps
+      case 0x29: // movaps
       case 0x2E: // ucomiss
       case 0x2F: // comiss
       case 0x54: // andps

@@ -969,7 +970,7 @@ address Assembler::locate_operand(address inst, WhichOperand which) {
         assert(which == call32_operand, "jcc has no disp32 or imm");
         return ip;
       default:
-        ShouldNotReachHere();
+        fatal("not handled: 0x0F%2X", 0xFF & *(ip-1));
       }
       break;
@@ -248,4 +248,17 @@
     }
   }
 
+  // Is SIMD sort supported for this CPU?
+  static bool supports_simd_sort(BasicType bt) {
+    if (VM_Version::supports_avx512dq()) {
+      return true;
+    }
+    else if (VM_Version::supports_avx2() && !is_double_word_type(bt)) {
+      return true;
+    }
+    else {
+      return false;
+    }
+  }
+
 #endif // CPU_X86_MATCHER_X86_HPP

@@ -4193,22 +4193,23 @@ void StubGenerator::generate_compiler_stubs() {
       = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
   }
 
-  // Load x86_64_sort library on supported hardware to enable avx512 sort and partition intrinsics
-  if (VM_Version::is_intel() && VM_Version::supports_avx512dq()) {
+  // Load x86_64_sort library on supported hardware to enable SIMD sort and partition intrinsics
+
+  if (VM_Version::is_intel() && (VM_Version::supports_avx512dq() || VM_Version::supports_avx2())) {
     void *libsimdsort = nullptr;
     char ebuf_[1024];
    char dll_name_simd_sort[JVM_MAXPATHLEN];
    if (os::dll_locate_lib(dll_name_simd_sort, sizeof(dll_name_simd_sort), Arguments::get_dll_dir(), "simdsort")) {
      libsimdsort = os::dll_load(dll_name_simd_sort, ebuf_, sizeof ebuf_);
    }
-    // Get addresses for avx512 sort and partition routines
+    // Get addresses for SIMD sort and partition routines
    if (libsimdsort != nullptr) {
      log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "simdsort" JNI_LIB_SUFFIX, p2i(libsimdsort));
 
-      snprintf(ebuf_, sizeof(ebuf_), "avx512_sort");
+      snprintf(ebuf_, sizeof(ebuf_), VM_Version::supports_avx512dq() ? "avx512_sort" : "avx2_sort");
      StubRoutines::_array_sort = (address)os::dll_lookup(libsimdsort, ebuf_);
 
-      snprintf(ebuf_, sizeof(ebuf_), "avx512_partition");
+      snprintf(ebuf_, sizeof(ebuf_), VM_Version::supports_avx512dq() ? "avx512_partition" : "avx2_partition");
      StubRoutines::_array_partition = (address)os::dll_lookup(libsimdsort, ebuf_);
    }
  }
@@ -858,7 +858,7 @@ void VM_Version::get_processor_features() {
 
   // Check if processor has Intel Ecore
   if (FLAG_IS_DEFAULT(EnableX86ECoreOpts) && is_intel() && cpu_family() == 6 &&
-    (_model == 0x97 || _model == 0xAC || _model == 0xAF)) {
+    (_model == 0x97 || _model == 0xAA || _model == 0xAC || _model == 0xAF)) {
     FLAG_SET_DEFAULT(EnableX86ECoreOpts, true);
   }

@@ -1130,6 +1130,7 @@ void VM_Version::get_processor_features() {
     FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
   }
 
+#ifdef _LP64
   // ChaCha20 Intrinsics
   // As long as the system supports AVX as a baseline we can do a
   // SIMD-enabled block function. StubGenerator makes the determination

@@ -1145,6 +1146,13 @@ void VM_Version::get_processor_features() {
     }
     FLAG_SET_DEFAULT(UseChaCha20Intrinsics, false);
   }
+#else
+  // No support currently for ChaCha20 intrinsics on 32-bit platforms
+  if (UseChaCha20Intrinsics) {
+    warning("ChaCha20 intrinsics are not available on this CPU.");
+    FLAG_SET_DEFAULT(UseChaCha20Intrinsics, false);
+  }
+#endif // _LP64
 
   // Base64 Intrinsics (Check the condition for which the intrinsic will be active)
   if (UseAVX >= 2) {
@@ -52,7 +52,7 @@ public:
   static void initialize() NOT_CDS_RETURN;
   static void check_system_property(const char* key, const char* value) NOT_CDS_RETURN;
   static void check_unsupported_dumping_properties() NOT_CDS_RETURN;
-  static bool check_vm_args_consistency(bool patch_mod_javabase, bool mode_flag_cmd_line) NOT_CDS_RETURN_(false);
+  static bool check_vm_args_consistency(bool patch_mod_javabase, bool mode_flag_cmd_line) NOT_CDS_RETURN_(true);
 
   // Basic CDS features
   static bool is_dumping_archive() { return is_dumping_static_archive() || is_dumping_dynamic_archive(); }

@@ -129,7 +129,23 @@ CDSHeapVerifier::CDSHeapVerifier() : _archived_objs(0), _problems(0)
   // This just points to an empty Map
   ADD_EXCL("jdk/internal/reflect/Reflection", "methodFilterMap"); // E
   ADD_EXCL("jdk/internal/util/StaticProperty", "FILE_ENCODING", // C
-           "JAVA_LOCALE_USE_OLD_ISO_CODES"); // C
+           "JAVA_LOCALE_USE_OLD_ISO_CODES", // C
+           "USER_LANGUAGE", // C
+           "USER_LANGUAGE_DISPLAY", // C
+           "USER_LANGUAGE_FORMAT", // C
+           "USER_SCRIPT", // C
+           "USER_SCRIPT_DISPLAY", // C
+           "USER_SCRIPT_FORMAT", // C
+           "USER_COUNTRY", // C
+           "USER_COUNTRY_DISPLAY", // C
+           "USER_COUNTRY_FORMAT", // C
+           "USER_VARIANT", // C
+           "USER_VARIANT_DISPLAY", // C
+           "USER_VARIANT_FORMAT", // C
+           "USER_EXTENSIONS", // C
+           "USER_EXTENSIONS_DISPLAY", // C
+           "USER_EXTENSIONS_FORMAT", // C
+           "USER_REGION"); // C
 
   // Integer for 0 and 1 are in java/lang/Integer$IntegerCache and are archived
   ADD_EXCL("sun/invoke/util/ValueConversions", "ONE_INT", // E
@@ -1465,7 +1465,7 @@ BitMapView FileMapRegion::ptrmap_view() {
   return bitmap_view(false);
 }
 
-bool FileMapRegion::check_region_crc() const {
+bool FileMapRegion::check_region_crc(char* base) const {
   // This function should be called after the region has been properly
   // loaded into memory via FileMapInfo::map_region() or FileMapInfo::read_region().
   // I.e., this->mapped_base() must be valid.

@@ -1474,8 +1474,8 @@ bool FileMapRegion::check_region_crc() const {
     return true;
   }
 
-  assert(mapped_base() != nullptr, "must be initialized");
-  int crc = ClassLoader::crc32(0, mapped_base(), (jint)sz);
+  assert(base != nullptr, "must be initialized");
+  int crc = ClassLoader::crc32(0, base, (jint)sz);
   if (crc != this->crc()) {
     log_warning(cds)("Checksum verification failed.");
     return false;

@@ -1760,13 +1760,13 @@ bool FileMapInfo::read_region(int i, char* base, size_t size, bool do_commit) {
     return false;
   }
 
-  r->set_mapped_from_file(false);
-  r->set_mapped_base(base);
-
-  if (VerifySharedSpaces && !r->check_region_crc()) {
+  if (VerifySharedSpaces && !r->check_region_crc(base)) {
     return false;
   }
 
+  r->set_mapped_from_file(false);
+  r->set_mapped_base(base);
+
   return true;
 }

@@ -1803,6 +1803,7 @@ MapArchiveResult FileMapInfo::map_region(int i, intx addr_delta, char* mapped_ba
       return MAP_ARCHIVE_OTHER_FAILURE; // oom or I/O error.
     } else {
       assert(r->mapped_base() != nullptr, "must be initialized");
+      return MAP_ARCHIVE_SUCCESS;
     }
   } else {
     // Note that this may either be a "fresh" mapping into unreserved address

@@ -1817,15 +1818,16 @@ MapArchiveResult FileMapInfo::map_region(int i, intx addr_delta, char* mapped_ba
      _memory_mapping_failed = true;
      return MAP_ARCHIVE_MMAP_FAILURE;
    }
+
+    if (VerifySharedSpaces && !r->check_region_crc(requested_addr)) {
+      return MAP_ARCHIVE_OTHER_FAILURE;
+    }
+
    r->set_mapped_from_file(true);
    r->set_mapped_base(requested_addr);
  }
 
-  if (VerifySharedSpaces && !r->check_region_crc()) {
-    return MAP_ARCHIVE_OTHER_FAILURE;
-  }
-
   return MAP_ARCHIVE_SUCCESS;
 }
 
 // The return value is the location of the archive relocation bitmap.

@@ -1843,8 +1845,7 @@ char* FileMapInfo::map_bitmap_region() {
     return nullptr;
   }
 
-  r->set_mapped_base(bitmap_base);
-  if (VerifySharedSpaces && !r->check_region_crc()) {
+  if (VerifySharedSpaces && !r->check_region_crc(bitmap_base)) {
     log_error(cds)("relocation bitmap CRC error");
     if (!os::unmap_memory(bitmap_base, r->used_aligned())) {
       fatal("os::unmap_memory of relocation bitmap failed");

@@ -1853,6 +1854,7 @@ char* FileMapInfo::map_bitmap_region() {
   }
 
   r->set_mapped_from_file(true);
+  r->set_mapped_base(bitmap_base);
   log_info(cds)("Mapped %s region #%d at base " INTPTR_FORMAT " top " INTPTR_FORMAT " (%s)",
                 is_static() ? "static " : "dynamic",
                 MetaspaceShared::bm, p2i(r->mapped_base()), p2i(r->mapped_end()),

@@ -2128,13 +2130,14 @@ bool FileMapInfo::map_heap_region_impl() {
     return false;
   }
 
-  r->set_mapped_base(base);
-  if (VerifySharedSpaces && !r->check_region_crc()) {
+  if (VerifySharedSpaces && !r->check_region_crc(base)) {
     dealloc_heap_region();
     log_info(cds)("UseSharedSpaces: mapped heap region is corrupt");
     return false;
   }
 
+  r->set_mapped_base(base);
+
   // If the requested range is different from the range allocated by GC, then
   // the pointers need to be patched.
   address mapped_start = (address) _mapped_heap_memregion.start();

@@ -170,7 +170,7 @@ public:
   BitMapView ptrmap_view();
   bool has_ptrmap() { return _ptrmap_size_in_bits != 0; }
 
-  bool check_region_crc() const;
+  bool check_region_crc(char* base) const;
   void print(outputStream* st, int region_index);
 };
@@ -149,6 +149,8 @@
 
 #define JAVA_22_VERSION 66
 
+#define JAVA_23_VERSION 67
+
 void ClassFileParser::set_class_bad_constant_seen(short bad_constant) {
   assert((bad_constant == JVM_CONSTANT_Module ||
           bad_constant == JVM_CONSTANT_Package) && _major_version >= JAVA_9_VERSION,
@@ -175,7 +175,7 @@ G1ConcurrentRefine::G1ConcurrentRefine(G1Policy* policy) :
 {}
 
 jint G1ConcurrentRefine::initialize() {
-  return _thread_control.initialize(this, max_num_threads());
+  return _thread_control.initialize(this, G1ConcRefinementThreads);
 }
 
 G1ConcurrentRefine* G1ConcurrentRefine::create(G1Policy* policy, jint* ecode) {

@@ -199,10 +199,6 @@ void G1ConcurrentRefine::threads_do(ThreadClosure *tc) {
   _thread_control.worker_threads_do(tc);
 }
 
-uint G1ConcurrentRefine::max_num_threads() {
-  return G1ConcRefinementThreads;
-}
-
 void G1ConcurrentRefine::update_pending_cards_target(double logged_cards_time_ms,
                                                      size_t processed_logged_cards,
                                                      size_t predicted_thread_buffer_cards,

@@ -215,9 +215,6 @@ public:
 
   // Iterate over all concurrent refinement threads applying the given closure.
   void threads_do(ThreadClosure *tc);
 
-  // Maximum number of refinement threads.
-  static uint max_num_threads();
-
 };
 
 #endif // SHARE_GC_G1_G1CONCURRENTREFINE_HPP
@@ -81,7 +81,7 @@ void G1FromCardCache::print(outputStream* out) {
 #endif
 
 uint G1FromCardCache::num_par_rem_sets() {
-  return G1DirtyCardQueueSet::num_par_ids() + G1ConcurrentRefine::max_num_threads() + MAX2(ConcGCThreads, ParallelGCThreads);
+  return G1DirtyCardQueueSet::num_par_ids() + G1ConcRefinementThreads + MAX2(ConcGCThreads, ParallelGCThreads);
 }
 
 void G1FromCardCache::clear(uint region_idx) {

@@ -91,11 +91,6 @@ class G1RemSetScanState : public CHeapObj<mtGC> {
 
   size_t _max_reserved_regions;
 
-  // Has this region that is part of the regions in the collection set been processed yet.
-  typedef bool G1RemsetIterState;
-
-  G1RemsetIterState volatile* _collection_set_iter_state;
-
   // Card table iteration claim for each heap region, from 0 (completely unscanned)
   // to (>=) HeapRegion::CardsPerRegion (completely scanned).
   uint volatile* _card_table_scan_state;

@@ -67,7 +67,7 @@ double G1RemSetSummary::rs_thread_vtime(uint thread) const {
 }
 
 G1RemSetSummary::G1RemSetSummary(bool should_update) :
-    _num_vtimes(G1ConcurrentRefine::max_num_threads()),
+    _num_vtimes(G1ConcRefinementThreads),
     _rs_threads_vtimes(NEW_C_HEAP_ARRAY(double, _num_vtimes, mtGC)) {
 
   memset(_rs_threads_vtimes, 0, sizeof(double) * _num_vtimes);
@@ -38,18 +38,16 @@ bool G1RemSetTrackingPolicy::needs_scan_for_rebuild(HeapRegion* r) const {
 }
 
 void G1RemSetTrackingPolicy::update_at_allocate(HeapRegion* r) {
-  if (r->is_young()) {
-    // Always collect remembered set for young regions.
-    r->rem_set()->set_state_complete();
-  } else if (r->is_humongous()) {
-    // Collect remembered sets for humongous regions by default to allow eager reclaim.
-    r->rem_set()->set_state_complete();
-  } else if (r->is_old()) {
+  assert(r->is_young() || r->is_humongous() || r->is_old(),
+         "Region %u with unexpected heap region type %s", r->hrm_index(), r->get_type_str());
+  if (r->is_old()) {
     // By default, do not create remembered set for new old regions.
     r->rem_set()->set_state_untracked();
-  } else {
-    guarantee(false, "Unhandled region %u with heap region type %s", r->hrm_index(), r->get_type_str());
+    return;
   }
+  // Always collect remembered set for young regions and for humongous regions.
+  // Humongous regions need that for eager reclaim.
+  r->rem_set()->set_state_complete();
 }
 
 void G1RemSetTrackingPolicy::update_at_free(HeapRegion* r) {
@@ -113,11 +113,11 @@
           "of the optimal occupancy to start marking.") \
           range(1, max_intx) \
           \
-  product(uint, G1ConfidencePercent, 50, \
+  product(uint, G1ConfidencePercent, 50, \
           "Confidence level for MMU/pause predictions") \
           range(0, 100) \
           \
-  product(intx, G1SummarizeRSetStatsPeriod, 0, DIAGNOSTIC, \
+  product(uintx, G1SummarizeRSetStatsPeriod, 0, DIAGNOSTIC, \
          "The period (in number of GCs) at which we will generate " \
          "update buffer processing info " \
          "(0 means do not periodically generate this info); " \

@@ -148,7 +148,7 @@
           "Number of entries in an SATB log buffer.") \
           constraint(G1SATBBufferSizeConstraintFunc, AtParse) \
           \
-  develop(intx, G1SATBProcessCompletedThreshold, 20, \
+  develop(uintx, G1SATBProcessCompletedThreshold, 20, \
           "Number of completed buffers that triggers log processing.") \
           range(0, max_jint) \
          \
@@ -344,17 +344,11 @@ class AdaptiveSizePolicy : public CHeapObj<mtGC> {
   AdaptiveWeightedAverage* avg_eden_live() const { return _avg_eden_live; }
   AdaptiveWeightedAverage* avg_old_live() const { return _avg_old_live; }
 
-  AdaptivePaddedAverage* avg_survived() const { return _avg_survived; }
-  AdaptivePaddedNoZeroDevAverage* avg_pretenured() { return _avg_pretenured; }
-
   // Methods indicating events of interest to the adaptive size policy,
   // called by GC algorithms. It is the responsibility of users of this
   // policy to call these methods at the correct times!
   virtual void minor_collection_begin();
   virtual void minor_collection_end(GCCause::Cause gc_cause);
-  virtual LinearLeastSquareFit* minor_pause_old_estimator() const {
-    return _minor_pause_old_estimator;
-  }
-
   LinearLeastSquareFit* minor_pause_young_estimator() {
     return _minor_pause_young_estimator;

@@ -404,10 +398,6 @@ class AdaptiveSizePolicy : public CHeapObj<mtGC> {
     _overhead_checker.set_gc_overhead_limit_exceeded(v);
   }
 
-  bool gc_overhead_limit_near() {
-    return _overhead_checker.gc_overhead_limit_near();
-  }
-
   void reset_gc_overhead_limit_count() {
     _overhead_checker.reset_gc_overhead_limit_count();
   }
@@ -105,13 +105,15 @@ static void commit(HelperType& helper) {
   assert(thread != nullptr, "invariant");
   if (thread->is_Java_thread()) {
     JavaThread* jt = JavaThread::cast(thread);
-    if (jt->thread_state() != _thread_in_vm) {
-      assert(jt->thread_state() == _thread_in_native, "invariant");
+    if (jt->thread_state() == _thread_in_native) {
       // For a JavaThread to take a JFR stacktrace, it must be in _thread_in_vm. Can safepoint here.
       ThreadInVMfromNative transition(jt);
       event.commit();
       return;
     }
+    // If a thread comes here still _thread_in_Java, which can happen for example
+    // when loading the disassembler library in response to traps in JIT code - all is ok.
+    // Since there is no ljf, an event will be committed without a stacktrace.
   }
   event.commit();
 }
@@ -53,8 +53,8 @@
 // * store_at: Store a value in an internal pointer relative to a base object.
 // * atomic_cmpxchg: Atomically compare-and-swap a new value at an address if previous value matched the compared value.
 // * atomic_cmpxchg_at: Atomically compare-and-swap a new value at an internal pointer address if previous value matched the compared value.
-// * atomic_xchg: Atomically swap a new value at an address if previous value matched the compared value.
-// * atomic_xchg_at: Atomically swap a new value at an internal pointer address if previous value matched the compared value.
+// * atomic_xchg: Atomically swap a new value at an address without checking the previous value.
+// * atomic_xchg_at: Atomically swap a new value at an internal pointer address without checking the previous value.
 // * arraycopy: Copy data from one heap array to another heap array. The ArrayAccess class has convenience functions for this.
 // * clone: Clone the contents of an object to a newly allocated object.
 //

@@ -83,12 +83,11 @@
 // and whether the access is performed on the heap or outside. Then the
 // appropriate BarrierSet::AccessBarrier is called to perform the access.
 //
-// The implementation of step 1-4 resides in in accessBackend.hpp, to allow selected
+// The implementation of step 1-4 resides in accessBackend.hpp, to allow selected
 // accesses to be accessible from only access.hpp, as opposed to access.inline.hpp.
 // Steps 5.a and 5.b require knowledge about the GC backends, and therefore needs to
 // include the various GC backend .inline.hpp headers. Their implementation resides in
-// access.inline.hpp. The accesses that are allowed through the access.hpp file
-// must be instantiated in access.cpp using the INSTANTIATE_HPP_ACCESS macro.
+// access.inline.hpp.
 
 template <DecoratorSet decorators = DECORATORS_NONE>
 class Access: public AllStatic {
@@ -365,10 +365,10 @@
           "Level of detail of the ideal graph printout. " \
           "System-wide value, -1=printing is disabled, " \
           "0=print nothing except IGVPrintLevel directives, " \
-          "5=all details printed. " \
+          "6=all details printed. " \
           "Level of detail of printouts can be set on a per-method level " \
           "as well by using CompileCommand=option.") \
-          range(-1, 5) \
+          range(-1, 6) \
          \
  notproduct(intx, PrintIdealGraphPort, 4444, \
          "Ideal graph printer to network port") \
@@ -1041,6 +1041,10 @@ void Compile::Init(bool aliasing) {
   Copy::zero_to_bytes(_trap_hist, sizeof(_trap_hist));
   set_decompile_count(0);
 
+#ifndef PRODUCT
+  Copy::zero_to_bytes(_igv_phase_iter, sizeof(_igv_phase_iter));
+#endif
+
   set_do_freq_based_layout(_directive->BlockLayoutByFrequencyOption);
   _loop_opts_cnt = LoopOptsCount;
   set_do_inlining(Inline);

@@ -2397,6 +2401,7 @@ void Compile::Optimize() {
   if (failing()) return;
 
   // Conditional Constant Propagation;
+  print_method(PHASE_BEFORE_CCP1, 2);
   PhaseCCP ccp( &igvn );
   assert( true, "Break here to ccp.dump_nodes_and_types(_root,999,1)");
   {

@@ -2972,6 +2977,8 @@ void Compile::Code_Gen() {
     if (failing()) {
       return;
     }
+
+    print_method(PHASE_REGISTER_ALLOCATION, 2);
   }
 
   // Prior to register allocation we kept empty basic blocks in case the

@@ -2989,6 +2996,7 @@ void Compile::Code_Gen() {
     cfg.fixup_flow();
     cfg.remove_unreachable_blocks();
     cfg.verify_dominator_tree();
+    print_method(PHASE_BLOCK_ORDERING, 3);
   }
 
   // Apply peephole optimizations

@@ -2996,12 +3004,14 @@ void Compile::Code_Gen() {
     TracePhase tp("peephole", &timers[_t_peephole]);
     PhasePeephole peep( _regalloc, cfg);
     peep.do_transform();
+    print_method(PHASE_PEEPHOLE, 3);
   }
 
   // Do late expand if CPU requires this.
   if (Matcher::require_postalloc_expand) {
     TracePhase tp("postalloc_expand", &timers[_t_postalloc_expand]);
     cfg.postalloc_expand(_regalloc);
+    print_method(PHASE_POSTALLOC_EXPAND, 3);
   }
 
   // Convert Nodes to instruction bits in a buffer

@@ -5102,6 +5112,10 @@ void Compile::print_method(CompilerPhaseType cpt, int level, Node* n) {
   ResourceMark rm;
   stringStream ss;
   ss.print_raw(CompilerPhaseTypeHelper::to_description(cpt));
+  int iter = ++_igv_phase_iter[cpt];
+  if (iter > 1) {
+    ss.print(" %d", iter);
+  }
   if (n != nullptr) {
     ss.print(": %d %s ", n->_idx, NodeClassNames[n->Opcode()]);
   }
@@ -343,6 +343,7 @@ class Compile : public Phase {
   bool _print_intrinsics; // True if we should print intrinsics for this compilation
 #ifndef PRODUCT
   uint _igv_idx; // Counter for IGV node identifiers
+  uint _igv_phase_iter[PHASE_NUM_TYPES]; // Counters for IGV phase iterations
   bool _trace_opto_output;
   bool _parsed_irreducible_loop; // True if ciTypeFlow detected irreducible loops during parsing
 #endif

@@ -531,6 +532,7 @@ private:
 
 #ifndef PRODUCT
   IdealGraphPrinter* igv_printer() { return _igv_printer; }
+  void reset_igv_phase_iter(CompilerPhaseType cpt) { _igv_phase_iter[cpt] = 0; }
 #endif
 
   void log_late_inline(CallGenerator* cg);
@@ -1563,6 +1563,11 @@ Node* GraphKit::make_load(Node* ctl, Node* adr, const Type* t, BasicType bt,
   if (((bt == T_OBJECT) && C->do_escape_analysis()) || C->eliminate_boxing()) {
     // Improve graph before escape analysis and boxing elimination.
     record_for_igvn(ld);
+    if (ld->is_DecodeN()) {
+      // Also record the actual load (LoadN) in case ld is DecodeN
+      assert(ld->in(1)->Opcode() == Op_LoadN, "Assumption invalid: input to DecodeN is not LoadN");
+      record_for_igvn(ld->in(1));
+    }
   }
   return ld;
 }
@@ -5387,6 +5387,10 @@ bool LibraryCallKit::inline_array_partition() {
   const TypeInstPtr* elem_klass = gvn().type(elementType)->isa_instptr();
   ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type();
   BasicType bt = elem_type->basic_type();
+  // Disable the intrinsic if the CPU does not support SIMD sort
+  if (!Matcher::supports_simd_sort(bt)) {
+    return false;
+  }
   address stubAddr = nullptr;
   stubAddr = StubRoutines::select_array_partition_function();
   // stub not loaded

@@ -5440,6 +5444,10 @@ bool LibraryCallKit::inline_array_sort() {
   const TypeInstPtr* elem_klass = gvn().type(elementType)->isa_instptr();
   ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type();
   BasicType bt = elem_type->basic_type();
+  // Disable the intrinsic if the CPU does not support SIMD sort
+  if (!Matcher::supports_simd_sort(bt)) {
+    return false;
+  }
   address stubAddr = nullptr;
   stubAddr = StubRoutines::select_arraysort_function();
   //stub not loaded
@@ -1180,6 +1180,7 @@ bool PhaseIdealLoop::loop_predication_impl_helper(IdealLoopTree* loop, IfProjNod
   }
   BoolNode* bol = test->as_Bool();
   if (invar.is_invariant(bol)) {
+    C->print_method(PHASE_BEFORE_LOOP_PREDICATION_IC, 4, iff);
     // Invariant test
     new_predicate_proj = create_new_if_for_predicate(parse_predicate_proj, nullptr,
                                                      reason,

@@ -1197,6 +1198,9 @@ bool PhaseIdealLoop::loop_predication_impl_helper(IdealLoopTree* loop, IfProjNod
     IfNode* new_predicate_iff = new_predicate_proj->in(0)->as_If();
     _igvn.hash_delete(new_predicate_iff);
     new_predicate_iff->set_req(1, new_predicate_bol);
+
+    C->print_method(PHASE_AFTER_LOOP_PREDICATION_IC, 4, new_predicate_proj->in(0));
+
 #ifndef PRODUCT
     if (TraceLoopPredicate) {
       tty->print("Predicate invariant if%s: %d ", negated ? " negated" : "", new_predicate_iff->_idx);

@@ -1207,6 +1211,7 @@ bool PhaseIdealLoop::loop_predication_impl_helper(IdealLoopTree* loop, IfProjNod
     }
 #endif
   } else if (cl != nullptr && loop->is_range_check_if(if_success_proj, this, invar DEBUG_ONLY(COMMA parse_predicate_proj))) {
+    C->print_method(PHASE_BEFORE_LOOP_PREDICATION_RC, 4, iff);
     // Range check for counted loops
     assert(if_success_proj->is_IfTrue(), "trap must be on false projection for a range check");
     const Node* cmp = bol->in(1)->as_Cmp();

@@ -1270,6 +1275,8 @@ bool PhaseIdealLoop::loop_predication_impl_helper(IdealLoopTree* loop, IfProjNod
     new_predicate_proj = add_template_assertion_predicate(iff, loop, if_success_proj, parse_predicate_proj, upper_bound_proj, scale,
                                                           offset, init, limit, stride, rng, overflow, reason);
 
+    C->print_method(PHASE_AFTER_LOOP_PREDICATION_RC, 4, new_predicate_proj->in(0));
+
 #ifndef PRODUCT
     if (TraceLoopOpts && !TraceLoopPredicate) {
       tty->print("Predicate RC ");
@@ -703,6 +703,9 @@ void PhaseIdealLoop::do_peeling(IdealLoopTree *loop, Node_List &old_new) {
   }
 #endif
   LoopNode* head = loop->_head->as_Loop();
+
+  C->print_method(PHASE_BEFORE_LOOP_PEELING, 4, head);
+
   bool counted_loop = head->is_CountedLoop();
   if (counted_loop) {
     CountedLoopNode *cl = head->as_CountedLoop();

@@ -795,6 +798,8 @@ void PhaseIdealLoop::do_peeling(IdealLoopTree *loop, Node_List &old_new) {
   peeled_dom_test_elim(loop,old_new);
 
   loop->record_for_igvn();
+
+  C->print_method(PHASE_AFTER_LOOP_PEELING, 4, new_head);
 }
 
 //------------------------------policy_maximally_unroll------------------------

@@ -1629,6 +1634,8 @@ void PhaseIdealLoop::insert_pre_post_loops(IdealLoopTree *loop, Node_List &old_n
   CountedLoopEndNode *main_end = main_head->loopexit();
   assert(main_end->outcnt() == 2, "1 true, 1 false path only");
 
+  C->print_method(PHASE_BEFORE_PRE_MAIN_POST, 4, main_head);
+
   Node *pre_header= main_head->in(LoopNode::EntryControl);
   Node *init = main_head->init_trip();
   Node *incr = main_end ->incr();

@@ -1825,6 +1832,8 @@ void PhaseIdealLoop::insert_pre_post_loops(IdealLoopTree *loop, Node_List &old_n
   // finds some, but we _know_ they are all useless.
   peeled_dom_test_elim(loop,old_new);
   loop->record_for_igvn();
+
+  C->print_method(PHASE_AFTER_PRE_MAIN_POST, 4, main_head);
 }
 
 //------------------------------insert_vector_post_loop------------------------

@@ -2127,6 +2136,9 @@ void PhaseIdealLoop::do_unroll(IdealLoopTree *loop, Node_List &old_new, bool adj
   assert(LoopUnrollLimit, "");
   CountedLoopNode *loop_head = loop->_head->as_CountedLoop();
   CountedLoopEndNode *loop_end = loop_head->loopexit();
+
+  C->print_method(PHASE_BEFORE_LOOP_UNROLLING, 4, loop_head);
+
 #ifndef PRODUCT
   if (PrintOpto && VerifyLoopOptimizations) {
     tty->print("Unrolling ");

@@ -2374,6 +2386,8 @@ void PhaseIdealLoop::do_unroll(IdealLoopTree *loop, Node_List &old_new, bool adj
     }
   }
 #endif
+
+  C->print_method(PHASE_AFTER_LOOP_UNROLLING, 4, clone_head);
 }
 
 //------------------------------do_maximally_unroll----------------------------
@@ -3003,6 +3017,8 @@ void PhaseIdealLoop::do_range_check(IdealLoopTree *loop, Node_List &old_new) {
|
||||
// stride_con and scale_con can be negative which will flip about the
|
||||
// sense of the test.
|
||||
|
||||
C->print_method(PHASE_BEFORE_RANGE_CHECK_ELIMINATION, 4, iff);
|
||||
|
||||
// Perform the limit computations in jlong to avoid overflow
|
||||
jlong lscale_con = scale_con;
|
||||
Node* int_offset = offset;
|
||||
@@ -3103,6 +3119,9 @@ void PhaseIdealLoop::do_range_check(IdealLoopTree *loop, Node_List &old_new) {
|
||||
--imax;
|
||||
}
|
||||
}
|
||||
|
||||
C->print_method(PHASE_AFTER_RANGE_CHECK_ELIMINATION, 4, cl);
|
||||
|
||||
} // End of is IF
|
||||
}
|
||||
if (loop_entry != cl->skip_strip_mined()->in(LoopNode::EntryControl)) {
|
||||
|
||||
@@ -134,6 +134,8 @@ void PhaseIdealLoop::do_unswitching(IdealLoopTree *loop, Node_List &old_new) {
|
||||
}
|
||||
#endif
|
||||
|
||||
C->print_method(PHASE_BEFORE_LOOP_UNSWITCHING, 4, head);
|
||||
|
||||
// Need to revert back to normal loop
|
||||
if (head->is_CountedLoop() && !head->as_CountedLoop()->is_normal_loop()) {
|
||||
head->as_CountedLoop()->set_normal_loop();
|
||||
@@ -200,6 +202,8 @@ void PhaseIdealLoop::do_unswitching(IdealLoopTree *loop, Node_List &old_new) {
|
||||
}
|
||||
#endif
|
||||
|
||||
C->print_method(PHASE_AFTER_LOOP_UNSWITCHING, 4, head_clone);
|
||||
|
||||
C->set_major_progress();
|
||||
}
|
||||
|
||||
|
||||
@@ -1446,7 +1446,12 @@ void PhaseIdealLoop::split_if_with_blocks_post(Node *n) {
|
||||
}
|
||||
|
||||
// Now split the IF
|
||||
C->print_method(PHASE_BEFORE_SPLIT_IF, 4, iff);
|
||||
if ((PrintOpto && VerifyLoopOptimizations) || TraceLoopOpts) {
|
||||
tty->print_cr("Split-If");
|
||||
}
|
||||
do_split_if(iff);
|
||||
C->print_method(PHASE_AFTER_SPLIT_IF, 4, iff);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -3625,6 +3630,9 @@ bool PhaseIdealLoop::partial_peel( IdealLoopTree *loop, Node_List &old_new ) {
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
C->print_method(PHASE_BEFORE_PARTIAL_PEELING, 4, head);
|
||||
|
||||
VectorSet peel;
|
||||
VectorSet not_peel;
|
||||
Node_List peel_list;
|
||||
@@ -3919,6 +3927,9 @@ bool PhaseIdealLoop::partial_peel( IdealLoopTree *loop, Node_List &old_new ) {
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
C->print_method(PHASE_AFTER_PARTIAL_PEELING, 4, new_head_clone);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -2779,7 +2779,7 @@ void Parse::do_one_bytecode() {
|
||||
}
|
||||
|
||||
#ifndef PRODUCT
|
||||
constexpr int perBytecode = 5;
|
||||
constexpr int perBytecode = 6;
|
||||
if (C->should_print_igv(perBytecode)) {
|
||||
IdealGraphPrinter* printer = C->igv_printer();
|
||||
char buffer[256];
|
||||
|
||||
@@ -894,7 +894,7 @@ void PhaseIterGVN::verify_step(Node* n) {
|
||||
void PhaseIterGVN::trace_PhaseIterGVN(Node* n, Node* nn, const Type* oldtype) {
|
||||
const Type* newtype = type_or_null(n);
|
||||
if (nn != n || oldtype != newtype) {
|
||||
C->print_method(PHASE_AFTER_ITER_GVN_STEP, 4, n);
|
||||
C->print_method(PHASE_AFTER_ITER_GVN_STEP, 5, n);
|
||||
}
|
||||
if (TraceIterativeGVN) {
|
||||
uint wlsize = _worklist.size();
|
||||
@@ -1025,6 +1025,7 @@ void PhaseIterGVN::trace_PhaseIterGVN_verbose(Node* n, int num_processed) {
|
||||
void PhaseIterGVN::optimize() {
|
||||
DEBUG_ONLY(uint num_processed = 0;)
|
||||
NOT_PRODUCT(init_verifyPhaseIterGVN();)
|
||||
NOT_PRODUCT(C->reset_igv_phase_iter(PHASE_AFTER_ITER_GVN_STEP);)
|
||||
C->print_method(PHASE_BEFORE_ITER_GVN, 3);
|
||||
if (StressIGVN) {
|
||||
shuffle_worklist();
|
||||
|
||||
@@ -28,51 +28,77 @@
|
||||
#include "utilities/bitMap.inline.hpp"
|
||||
|
||||
#define COMPILER_PHASES(flags) \
|
||||
flags(BEFORE_STRINGOPTS, "Before StringOpts") \
|
||||
flags(AFTER_STRINGOPTS, "After StringOpts") \
|
||||
flags(BEFORE_REMOVEUSELESS, "Before RemoveUseless") \
|
||||
flags(AFTER_PARSING, "After Parsing") \
|
||||
flags(BEFORE_ITER_GVN, "Before Iter GVN") \
|
||||
flags(ITER_GVN1, "Iter GVN 1") \
|
||||
flags(AFTER_ITER_GVN_STEP, "After Iter GVN Step") \
|
||||
flags(AFTER_ITER_GVN, "After Iter GVN") \
|
||||
flags(INCREMENTAL_INLINE_STEP, "Incremental Inline Step") \
|
||||
flags(INCREMENTAL_INLINE_CLEANUP, "Incremental Inline Cleanup") \
|
||||
flags(INCREMENTAL_INLINE, "Incremental Inline") \
|
||||
flags(INCREMENTAL_BOXING_INLINE, "Incremental Boxing Inline") \
|
||||
flags(EXPAND_VUNBOX, "Expand VectorUnbox") \
|
||||
flags(SCALARIZE_VBOX, "Scalarize VectorBox") \
|
||||
flags(INLINE_VECTOR_REBOX, "Inline Vector Rebox Calls") \
|
||||
flags(EXPAND_VBOX, "Expand VectorBox") \
|
||||
flags(ELIMINATE_VBOX_ALLOC, "Eliminate VectorBoxAllocate") \
|
||||
flags(ITER_GVN_BEFORE_EA, "Iter GVN before EA") \
|
||||
flags(ITER_GVN_AFTER_VECTOR, "Iter GVN after vector box elimination") \
|
||||
flags(BEFORE_BEAUTIFY_LOOPS, "Before beautify loops") \
|
||||
flags(AFTER_BEAUTIFY_LOOPS, "After beautify loops") \
|
||||
flags(BEFORE_CLOOPS, "Before CountedLoop") \
|
||||
flags(AFTER_CLOOPS, "After CountedLoop") \
|
||||
flags(PHASEIDEAL_BEFORE_EA, "PhaseIdealLoop before EA") \
|
||||
flags(AFTER_EA, "After Escape Analysis") \
|
||||
flags(ITER_GVN_AFTER_EA, "Iter GVN after EA") \
|
||||
flags(ITER_GVN_AFTER_ELIMINATION, "Iter GVN after eliminating allocations and locks") \
|
||||
flags(PHASEIDEALLOOP1, "PhaseIdealLoop 1") \
|
||||
flags(PHASEIDEALLOOP2, "PhaseIdealLoop 2") \
|
||||
flags(PHASEIDEALLOOP3, "PhaseIdealLoop 3") \
|
||||
flags(CCP1, "PhaseCCP 1") \
|
||||
flags(ITER_GVN2, "Iter GVN 2") \
|
||||
flags(PHASEIDEALLOOP_ITERATIONS, "PhaseIdealLoop iterations") \
|
||||
flags(MACRO_EXPANSION, "Macro expand") \
|
||||
flags(BARRIER_EXPANSION, "Barrier expand") \
|
||||
flags(OPTIMIZE_FINISHED, "Optimize finished") \
|
||||
flags(BEFORE_MATCHING, "Before matching") \
|
||||
flags(MATCHING, "After matching") \
|
||||
flags(GLOBAL_CODE_MOTION, "Global code motion") \
|
||||
flags(MACH_ANALYSIS, "After mach analysis") \
|
||||
flags(FINAL_CODE, "Final Code") \
|
||||
flags(END, "End") \
|
||||
flags(FAILURE, "Failure") \
|
||||
flags(ALL, "All") \
|
||||
flags(DEBUG, "Debug")
|
||||
flags(BEFORE_STRINGOPTS, "Before StringOpts") \
|
||||
flags(AFTER_STRINGOPTS, "After StringOpts") \
|
||||
flags(BEFORE_REMOVEUSELESS, "Before RemoveUseless") \
|
||||
flags(AFTER_PARSING, "After Parsing") \
|
||||
flags(BEFORE_ITER_GVN, "Before Iter GVN") \
|
||||
flags(ITER_GVN1, "Iter GVN 1") \
|
||||
flags(AFTER_ITER_GVN_STEP, "After Iter GVN Step") \
|
||||
flags(AFTER_ITER_GVN, "After Iter GVN") \
|
||||
flags(INCREMENTAL_INLINE_STEP, "Incremental Inline Step") \
|
||||
flags(INCREMENTAL_INLINE_CLEANUP, "Incremental Inline Cleanup") \
|
||||
flags(INCREMENTAL_INLINE, "Incremental Inline") \
|
||||
flags(INCREMENTAL_BOXING_INLINE, "Incremental Boxing Inline") \
|
||||
flags(EXPAND_VUNBOX, "Expand VectorUnbox") \
|
||||
flags(SCALARIZE_VBOX, "Scalarize VectorBox") \
|
||||
flags(INLINE_VECTOR_REBOX, "Inline Vector Rebox Calls") \
|
||||
flags(EXPAND_VBOX, "Expand VectorBox") \
|
||||
flags(ELIMINATE_VBOX_ALLOC, "Eliminate VectorBoxAllocate") \
|
||||
flags(ITER_GVN_BEFORE_EA, "Iter GVN before EA") \
|
||||
flags(ITER_GVN_AFTER_VECTOR, "Iter GVN after vector box elimination") \
|
||||
flags(BEFORE_BEAUTIFY_LOOPS, "Before beautify loops") \
|
||||
flags(AFTER_BEAUTIFY_LOOPS, "After beautify loops") \
|
||||
flags(BEFORE_LOOP_UNROLLING, "Before Loop Unrolling") \
|
||||
flags(AFTER_LOOP_UNROLLING, "After Loop Unrolling") \
|
||||
flags(BEFORE_SPLIT_IF, "Before Split-If") \
|
||||
flags(AFTER_SPLIT_IF, "After Split-If") \
|
||||
flags(BEFORE_LOOP_PREDICATION_IC, "Before Loop Predication IC") \
|
||||
flags(AFTER_LOOP_PREDICATION_IC, "After Loop Predication IC") \
|
||||
flags(BEFORE_LOOP_PREDICATION_RC, "Before Loop Predication RC") \
|
||||
flags(AFTER_LOOP_PREDICATION_RC, "After Loop Predication RC") \
|
||||
flags(BEFORE_PARTIAL_PEELING, "Before Partial Peeling") \
|
||||
flags(AFTER_PARTIAL_PEELING, "After Partial Peeling") \
|
||||
flags(BEFORE_LOOP_PEELING, "Before Loop Peeling") \
|
||||
flags(AFTER_LOOP_PEELING, "After Loop Peeling") \
|
||||
flags(BEFORE_LOOP_UNSWITCHING, "Before Loop Unswitching") \
|
||||
flags(AFTER_LOOP_UNSWITCHING, "After Loop Unswitching") \
|
||||
flags(BEFORE_RANGE_CHECK_ELIMINATION, "Before Range Check Elimination") \
|
||||
flags(AFTER_RANGE_CHECK_ELIMINATION, "After Range Check Elimination") \
|
||||
flags(BEFORE_PRE_MAIN_POST, "Before Pre/Main/Post Loops") \
|
||||
flags(AFTER_PRE_MAIN_POST, "After Pre/Main/Post Loops") \
|
||||
flags(SUPERWORD1_BEFORE_SCHEDULE, "Superword 1, Before Schedule") \
|
||||
flags(SUPERWORD2_BEFORE_OUTPUT, "Superword 2, Before Output") \
|
||||
flags(SUPERWORD3_AFTER_OUTPUT, "Superword 3, After Output") \
|
||||
flags(BEFORE_CLOOPS, "Before CountedLoop") \
|
||||
flags(AFTER_CLOOPS, "After CountedLoop") \
|
||||
flags(PHASEIDEAL_BEFORE_EA, "PhaseIdealLoop before EA") \
|
||||
flags(AFTER_EA, "After Escape Analysis") \
|
||||
flags(ITER_GVN_AFTER_EA, "Iter GVN after EA") \
|
||||
flags(ITER_GVN_AFTER_ELIMINATION, "Iter GVN after eliminating allocations and locks") \
|
||||
flags(PHASEIDEALLOOP1, "PhaseIdealLoop 1") \
|
||||
flags(PHASEIDEALLOOP2, "PhaseIdealLoop 2") \
|
||||
flags(PHASEIDEALLOOP3, "PhaseIdealLoop 3") \
|
||||
flags(BEFORE_CCP1, "Before PhaseCCP 1") \
|
||||
flags(CCP1, "PhaseCCP 1") \
|
||||
flags(ITER_GVN2, "Iter GVN 2") \
|
||||
flags(PHASEIDEALLOOP_ITERATIONS, "PhaseIdealLoop iterations") \
|
||||
flags(MACRO_EXPANSION, "Macro expand") \
|
||||
flags(BARRIER_EXPANSION, "Barrier expand") \
|
||||
flags(OPTIMIZE_FINISHED, "Optimize finished") \
|
||||
flags(BEFORE_MATCHING, "Before matching") \
|
||||
flags(MATCHING, "After matching") \
|
||||
flags(GLOBAL_CODE_MOTION, "Global code motion") \
|
||||
flags(REGISTER_ALLOCATION, "Register Allocation") \
|
||||
flags(BLOCK_ORDERING, "Block Ordering") \
|
||||
flags(PEEPHOLE, "Peephole") \
|
||||
flags(POSTALLOC_EXPAND, "Post-Allocation Expand") \
|
||||
flags(MACH_ANALYSIS, "After mach analysis") \
|
||||
flags(FINAL_CODE, "Final Code") \
|
||||
flags(END, "End") \
|
||||
flags(FAILURE, "Failure") \
|
||||
flags(ALL, "All") \
|
||||
flags(DEBUG, "Debug")
|
||||
|
||||
#define table_entry(name, description) PHASE_##name,
|
||||
enum CompilerPhaseType {
|
||||
|
||||
@@ -591,12 +591,6 @@ void PhaseIdealLoop::handle_use( Node *use, Node *def, small_cache *cache, Node
|
||||
// Found an If getting its condition-code input from a Phi in the same block.
|
||||
// Split thru the Region.
|
||||
void PhaseIdealLoop::do_split_if(Node* iff, RegionNode** new_false_region, RegionNode** new_true_region) {
|
||||
if (PrintOpto && VerifyLoopOptimizations) {
|
||||
tty->print_cr("Split-if");
|
||||
}
|
||||
if (TraceLoopOpts) {
|
||||
tty->print_cr("SplitIf");
|
||||
}
|
||||
|
||||
C->set_major_progress();
|
||||
RegionNode *region = iff->in(0)->as_Region();
|
||||
|
||||
@@ -2381,6 +2381,9 @@ void SuperWord::schedule() {
|
||||
}
|
||||
#endif
|
||||
|
||||
CountedLoopNode* cl = lpt()->_head->as_CountedLoop();
|
||||
_phase->C->print_method(PHASE_SUPERWORD1_BEFORE_SCHEDULE, 4, cl);
|
||||
|
||||
// (4) Use the memops_schedule to re-order the memops in all slices.
|
||||
schedule_reorder_memops(memops_schedule);
|
||||
}
|
||||
@@ -2488,6 +2491,7 @@ bool SuperWord::output() {
|
||||
lpt()->dump_head();
|
||||
}
|
||||
#endif
|
||||
_phase->C->print_method(PHASE_SUPERWORD2_BEFORE_OUTPUT, 4, cl);
|
||||
|
||||
// Ensure main loop's initial value is properly aligned
|
||||
// (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0
|
||||
@@ -2808,6 +2812,8 @@ bool SuperWord::output() {
|
||||
}
|
||||
}
|
||||
|
||||
_phase->C->print_method(PHASE_SUPERWORD3_AFTER_OUTPUT, 4, cl);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -390,7 +390,10 @@ UNSAFE_ENTRY_SCOPED(void, Unsafe_SetMemory0(JNIEnv *env, jobject unsafe, jobject
|
||||
oop base = JNIHandles::resolve(obj);
|
||||
void* p = index_oop_from_field_offset_long(base, offset);
|
||||
|
||||
Copy::fill_to_memory_atomic(p, sz, value);
|
||||
{
|
||||
GuardUnsafeAccess guard(thread);
|
||||
Copy::fill_to_memory_atomic(p, sz, value);
|
||||
}
|
||||
} UNSAFE_END
|
||||
|
||||
UNSAFE_ENTRY_SCOPED(void, Unsafe_CopyMemory0(JNIEnv *env, jobject unsafe, jobject srcObj, jlong srcOffset, jobject dstObj, jlong dstOffset, jlong size)) {
|
||||
|
||||
@@ -35,12 +35,6 @@
|
||||
|
||||
class Prefetch : AllStatic {
|
||||
public:
|
||||
enum style {
|
||||
do_none, // Do no prefetching
|
||||
do_read, // Do read prefetching
|
||||
do_write // Do write prefetching
|
||||
};
|
||||
|
||||
// Prefetch anticipating read; must not fault, semantically a no-op
|
||||
static void read(const void* loc, intx interval);
|
||||
|
||||
|
||||
@@ -1473,6 +1473,25 @@ void SymbolTableDumper::do_symbol(Symbol** p) {
|
||||
}
|
||||
}
|
||||
|
||||
// Support class used to generate HPROF_GC_CLASS_DUMP records
|
||||
|
||||
class ClassDumper : public KlassClosure {
|
||||
private:
|
||||
AbstractDumpWriter* _writer;
|
||||
AbstractDumpWriter* writer() const { return _writer; }
|
||||
|
||||
public:
|
||||
ClassDumper(AbstractDumpWriter* writer) : _writer(writer) {}
|
||||
|
||||
void do_klass(Klass* k) {
|
||||
if (k->is_instance_klass()) {
|
||||
DumperSupport::dump_instance_class(writer(), k);
|
||||
} else {
|
||||
DumperSupport::dump_array_class(writer(), k);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Support class used to generate HPROF_GC_ROOT_JNI_LOCAL records
|
||||
|
||||
class JNILocalsDumper : public OopClosure {
|
||||
@@ -1860,21 +1879,25 @@ vframe* ThreadDumper::get_top_frame() const {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Callback to dump thread-related data for unmounted virtual threads;
|
||||
// implemented by VM_HeapDumper.
|
||||
class UnmountedVThreadDumper {
|
||||
public:
|
||||
virtual void dump_vthread(oop vt, AbstractDumpWriter* segment_writer) = 0;
|
||||
};
|
||||
|
||||
class VM_HeapDumper;
|
||||
|
||||
// Support class using when iterating over the heap.
|
||||
// Support class used when iterating over the heap.
|
||||
class HeapObjectDumper : public ObjectClosure {
|
||||
private:
|
||||
AbstractDumpWriter* _writer;
|
||||
AbstractDumpWriter* writer() { return _writer; }
|
||||
UnmountedVThreadDumper* _vthread_dumper;
|
||||
|
||||
DumperClassCacheTable _class_cache;
|
||||
|
||||
public:
|
||||
HeapObjectDumper(AbstractDumpWriter* writer) {
|
||||
_writer = writer;
|
||||
}
|
||||
HeapObjectDumper(AbstractDumpWriter* writer, UnmountedVThreadDumper* vthread_dumper)
|
||||
: _writer(writer), _vthread_dumper(vthread_dumper) {}
|
||||
|
||||
// called for each object in the heap
|
||||
void do_object(oop o);
|
||||
@@ -1895,6 +1918,9 @@ void HeapObjectDumper::do_object(oop o) {
|
||||
if (o->is_instance()) {
|
||||
// create a HPROF_GC_INSTANCE record for each object
|
||||
DumperSupport::dump_instance(writer(), o, &_class_cache);
|
||||
if (java_lang_VirtualThread::is_instance(o) && ThreadDumper::should_dump_vthread(o)) {
|
||||
_vthread_dumper->dump_vthread(o, writer());
|
||||
}
|
||||
} else if (o->is_objArray()) {
|
||||
// create a HPROF_GC_OBJ_ARRAY_DUMP record for each object array
|
||||
DumperSupport::dump_object_array(writer(), objArrayOop(o));
|
||||
@@ -1908,16 +1934,52 @@ void HeapObjectDumper::do_object(oop o) {
|
||||
class DumperController : public CHeapObj<mtInternal> {
|
||||
private:
|
||||
Monitor* _lock;
|
||||
Mutex* _global_writer_lock;
|
||||
|
||||
const uint _dumper_number;
|
||||
uint _complete_number;
|
||||
|
||||
bool _started; // VM dumper started and acquired global writer lock
|
||||
|
||||
public:
|
||||
DumperController(uint number) :
|
||||
_lock(new (std::nothrow) PaddedMonitor(Mutex::safepoint, "DumperController_lock")),
|
||||
// _lock and _global_writer_lock are used for synchronization between GC worker threads inside safepoint,
|
||||
// so we lock with _no_safepoint_check_flag.
|
||||
// signal_start() acquires _lock when global writer is locked,
|
||||
// its rank must be less than _global_writer_lock rank.
|
||||
_lock(new (std::nothrow) PaddedMonitor(Mutex::nosafepoint - 1, "DumperController_lock")),
|
||||
_global_writer_lock(new (std::nothrow) Mutex(Mutex::nosafepoint, "DumpWriter_lock")),
|
||||
_dumper_number(number),
|
||||
_complete_number(0) { }
|
||||
_complete_number(0),
|
||||
_started(false)
|
||||
{}
|
||||
|
||||
~DumperController() { delete _lock; }
|
||||
~DumperController() {
|
||||
delete _lock;
|
||||
delete _global_writer_lock;
|
||||
}
|
||||
|
||||
// parallel (non VM) dumpers must wait until VM dumper acquires global writer lock
|
||||
void wait_for_start_signal() {
|
||||
MonitorLocker ml(_lock, Mutex::_no_safepoint_check_flag);
|
||||
while (_started == false) {
|
||||
ml.wait();
|
||||
}
|
||||
}
|
||||
|
||||
void signal_start() {
|
||||
MonitorLocker ml(_lock, Mutex::_no_safepoint_check_flag);
|
||||
_started = true;
|
||||
ml.notify_all();
|
||||
}
|
||||
|
||||
void lock_global_writer() {
|
||||
_global_writer_lock->lock_without_safepoint_check();
|
||||
}
|
||||
|
||||
void unlock_global_writer() {
|
||||
_global_writer_lock->unlock();
|
||||
}
|
||||
|
||||
void dumper_complete(DumpWriter* local_writer, DumpWriter* global_writer) {
|
||||
MonitorLocker ml(_lock, Mutex::_no_safepoint_check_flag);
|
||||
@@ -1946,7 +2008,7 @@ private:
|
||||
int _dump_seq;
|
||||
|
||||
private:
|
||||
void merge_file(char* path);
|
||||
void merge_file(const char* path);
|
||||
void merge_done();
|
||||
void set_error(const char* msg);
|
||||
|
||||
@@ -1958,8 +2020,28 @@ public:
|
||||
_dump_seq(dump_seq) {}
|
||||
|
||||
void do_merge();
|
||||
|
||||
// returns path for the parallel DumpWriter (resource allocated)
|
||||
static char* get_writer_path(const char* base_path, int seq);
|
||||
|
||||
};
|
||||
|
||||
char* DumpMerger::get_writer_path(const char* base_path, int seq) {
|
||||
// approximate required buffer size
|
||||
size_t buf_size = strlen(base_path)
|
||||
+ 2 // ".p"
|
||||
+ 10 // number (that's enough for 2^32 parallel dumpers)
|
||||
+ 1; // '\0'
|
||||
|
||||
char* path = NEW_RESOURCE_ARRAY(char, buf_size);
|
||||
memset(path, 0, buf_size);
|
||||
|
||||
os::snprintf(path, buf_size, "%s.p%d", base_path, seq);
|
||||
|
||||
return path;
|
||||
}
|
||||
|
||||
|
||||
void DumpMerger::merge_done() {
|
||||
// Writes the HPROF_HEAP_DUMP_END record.
|
||||
if (!_has_error) {
|
||||
@@ -1980,8 +2062,7 @@ void DumpMerger::set_error(const char* msg) {
|
||||
// Merge segmented heap files via sendfile, it's more efficient than the
|
||||
// read+write combination, which would require transferring data to and from
|
||||
// user space.
|
||||
void DumpMerger::merge_file(char* path) {
|
||||
assert(!SafepointSynchronize::is_at_safepoint(), "merging happens outside safepoint");
|
||||
void DumpMerger::merge_file(const char* path) {
|
||||
TraceTime timer("Merge segmented heap file directly", TRACETIME_LOG(Info, heapdump));
|
||||
|
||||
int segment_fd = os::open(path, O_RDONLY, 0);
|
||||
@@ -2018,8 +2099,7 @@ void DumpMerger::merge_file(char* path) {
|
||||
}
|
||||
#else
|
||||
// Generic implementation using read+write
|
||||
void DumpMerger::merge_file(char* path) {
|
||||
assert(!SafepointSynchronize::is_at_safepoint(), "merging happens outside safepoint");
|
||||
void DumpMerger::merge_file(const char* path) {
|
||||
TraceTime timer("Merge segmented heap file", TRACETIME_LOG(Info, heapdump));
|
||||
|
||||
fileStream segment_fs(path, "rb");
|
||||
@@ -2044,7 +2124,6 @@ void DumpMerger::merge_file(char* path) {
|
||||
#endif
|
||||
|
||||
void DumpMerger::do_merge() {
|
||||
assert(!SafepointSynchronize::is_at_safepoint(), "merging happens outside safepoint");
|
||||
TraceTime timer("Merge heap files complete", TRACETIME_LOG(Info, heapdump));
|
||||
|
||||
// Since contents in segmented heap file were already zipped, we don't need to zip
|
||||
@@ -2054,10 +2133,9 @@ void DumpMerger::do_merge() {
|
||||
|
||||
// Merge the content of the remaining files into base file. Regardless of whether
|
||||
// the merge process is successful or not, these segmented files will be deleted.
|
||||
char path[JVM_MAXPATHLEN];
|
||||
for (int i = 0; i < _dump_seq; i++) {
|
||||
memset(path, 0, JVM_MAXPATHLEN);
|
||||
os::snprintf(path, JVM_MAXPATHLEN, "%s.p%d", _path, i);
|
||||
ResourceMark rm;
|
||||
const char* path = get_writer_path(_path, i);
|
||||
if (!_has_error) {
|
||||
merge_file(path);
|
||||
}
|
||||
@@ -2087,7 +2165,7 @@ public:
|
||||
};
|
||||
|
||||
// The VM operation that performs the heap dump
|
||||
class VM_HeapDumper : public VM_GC_Operation, public WorkerTask {
|
||||
class VM_HeapDumper : public VM_GC_Operation, public WorkerTask, public UnmountedVThreadDumper {
|
||||
private:
|
||||
static VM_HeapDumper* _global_dumper;
|
||||
static DumpWriter* _global_writer;
|
||||
@@ -2107,10 +2185,15 @@ class VM_HeapDumper : public VM_GC_Operation, public WorkerTask {
|
||||
uint _num_dumper_threads;
|
||||
DumperController* _dumper_controller;
|
||||
ParallelObjectIterator* _poi;
|
||||
// worker id of VMDumper thread.
|
||||
static const size_t VMDumperWorkerId = 0;
|
||||
|
||||
// Dumper id of VMDumper thread.
|
||||
static const int VMDumperId = 0;
|
||||
// VM dumper dumps both heap and non-heap data, other dumpers dump heap-only data.
|
||||
static bool is_vm_dumper(uint worker_id) { return worker_id == VMDumperWorkerId; }
|
||||
static bool is_vm_dumper(int dumper_id) { return dumper_id == VMDumperId; }
|
||||
// the 1st dumper calling get_next_dumper_id becomes VM dumper
|
||||
int get_next_dumper_id() {
|
||||
return Atomic::fetch_then_add(&_dump_seq, 1);
|
||||
}
|
||||
|
||||
// accessors and setters
|
||||
static VM_HeapDumper* dumper() { assert(_global_dumper != nullptr, "Error"); return _global_dumper; }
|
||||
@@ -2129,17 +2212,11 @@ class VM_HeapDumper : public VM_GC_Operation, public WorkerTask {
|
||||
|
||||
bool skip_operation() const;
|
||||
|
||||
// create dump writer for every parallel dump thread
|
||||
DumpWriter* create_local_writer();
|
||||
|
||||
// writes a HPROF_LOAD_CLASS record
|
||||
// writes a HPROF_LOAD_CLASS record to global writer
|
||||
static void do_load_class(Klass* k);
|
||||
|
||||
// writes a HPROF_GC_CLASS_DUMP record for the given class
|
||||
static void do_class_dump(Klass* k);
|
||||
|
||||
// HPROF_GC_ROOT_THREAD_OBJ records for platform and mounted virtual threads
|
||||
void dump_threads();
|
||||
void dump_threads(AbstractDumpWriter* writer);
|
||||
|
||||
void add_class_serial_number(Klass* k, int serial_num) {
|
||||
_klass_map->at_put_grow(serial_num, k);
|
||||
@@ -2150,7 +2227,7 @@ class VM_HeapDumper : public VM_GC_Operation, public WorkerTask {
|
||||
}
|
||||
|
||||
// HPROF_TRACE and HPROF_FRAME records for platform and mounted virtual threads
|
||||
void dump_stack_traces();
|
||||
void dump_stack_traces(AbstractDumpWriter* writer);
|
||||
|
||||
public:
|
||||
VM_HeapDumper(DumpWriter* writer, bool gc_before_heap_dump, bool oome, uint num_dump_threads) :
|
||||
@@ -2168,7 +2245,7 @@ class VM_HeapDumper : public VM_GC_Operation, public WorkerTask {
|
||||
_thread_serial_num = 1;
|
||||
_frame_serial_num = 1;
|
||||
|
||||
_dump_seq = 0;
|
||||
_dump_seq = VMDumperId;
|
||||
_num_dumper_threads = num_dump_threads;
|
||||
_dumper_controller = nullptr;
|
||||
_poi = nullptr;
|
||||
@@ -2202,12 +2279,15 @@ class VM_HeapDumper : public VM_GC_Operation, public WorkerTask {
|
||||
}
|
||||
int dump_seq() { return _dump_seq; }
|
||||
bool is_parallel_dump() { return _num_dumper_threads > 1; }
|
||||
bool can_parallel_dump(WorkerThreads* workers);
|
||||
void prepare_parallel_dump(WorkerThreads* workers);
|
||||
|
||||
VMOp_Type type() const { return VMOp_HeapDumper; }
|
||||
virtual bool doit_prologue();
|
||||
void doit();
|
||||
void work(uint worker_id);
|
||||
|
||||
// UnmountedVThreadDumper implementation
|
||||
void dump_vthread(oop vt, AbstractDumpWriter* segment_writer);
|
||||
};
|
||||
|
||||
VM_HeapDumper* VM_HeapDumper::_global_dumper = nullptr;
|
||||
@@ -2251,22 +2331,13 @@ void VM_HeapDumper::do_load_class(Klass* k) {
|
||||
writer()->write_symbolID(name);
|
||||
}
|
||||
|
||||
// writes a HPROF_GC_CLASS_DUMP record for the given class
|
||||
void VM_HeapDumper::do_class_dump(Klass* k) {
|
||||
if (k->is_instance_klass()) {
|
||||
DumperSupport::dump_instance_class(writer(), k);
|
||||
} else {
|
||||
DumperSupport::dump_array_class(writer(), k);
|
||||
}
|
||||
}
|
||||
|
||||
// Write a HPROF_GC_ROOT_THREAD_OBJ record for platform/carrier and mounted virtual threads.
|
||||
// Then walk the stack so that locals and JNI locals are dumped.
|
||||
void VM_HeapDumper::dump_threads() {
|
||||
for (int i = 0; i < _thread_dumpers_count; i++) {
|
||||
_thread_dumpers[i]->dump_thread_obj(writer());
|
||||
_thread_dumpers[i]->dump_stack_refs(writer());
|
||||
}
|
||||
void VM_HeapDumper::dump_threads(AbstractDumpWriter* writer) {
|
||||
for (int i = 0; i < _thread_dumpers_count; i++) {
|
||||
_thread_dumpers[i]->dump_thread_obj(writer);
|
||||
_thread_dumpers[i]->dump_stack_refs(writer);
|
||||
}
|
||||
}
|
||||
|
||||
bool VM_HeapDumper::doit_prologue() {
|
||||
@@ -2280,31 +2351,21 @@ bool VM_HeapDumper::doit_prologue() {
|
||||
return VM_GC_Operation::doit_prologue();
|
||||
}
|
||||
|
||||
bool VM_HeapDumper::can_parallel_dump(WorkerThreads* workers) {
|
||||
bool can_parallel = true;
|
||||
void VM_HeapDumper::prepare_parallel_dump(WorkerThreads* workers) {
|
||||
uint num_active_workers = workers != nullptr ? workers->active_workers() : 0;
|
||||
uint num_requested_dump_threads = _num_dumper_threads;
|
||||
// check if we can dump in parallel based on requested and active threads
|
||||
if (num_active_workers <= 1 || num_requested_dump_threads <= 1) {
|
||||
_num_dumper_threads = 1;
|
||||
can_parallel = false;
|
||||
} else {
|
||||
// check if we have extra path room to accommodate segmented heap files
|
||||
const char* base_path = writer()->get_file_path();
|
||||
assert(base_path != nullptr, "sanity check");
|
||||
if ((strlen(base_path) + 7/*.p\d\d\d\d\0*/) >= JVM_MAXPATHLEN) {
|
||||
_num_dumper_threads = 1;
|
||||
can_parallel = false;
|
||||
} else {
|
||||
_num_dumper_threads = clamp(num_requested_dump_threads, 2U, num_active_workers);
|
||||
}
|
||||
_num_dumper_threads = clamp(num_requested_dump_threads, 2U, num_active_workers);
|
||||
}
|
||||
|
||||
_dumper_controller = new (std::nothrow) DumperController(_num_dumper_threads);
|
||||
bool can_parallel = _num_dumper_threads > 1;
|
||||
log_info(heapdump)("Requested dump threads %u, active dump threads %u, "
|
||||
"actual dump threads %u, parallelism %s",
|
||||
num_requested_dump_threads, num_active_workers,
|
||||
_num_dumper_threads, can_parallel ? "true" : "false");
|
||||
return can_parallel;
|
||||
}
|
||||
|
||||
// The VM operation that dumps the heap. The dump consists of the following
|
||||
@@ -2352,11 +2413,11 @@ void VM_HeapDumper::doit() {
|
||||
set_global_writer();
|
||||
|
||||
WorkerThreads* workers = ch->safepoint_workers();
|
||||
if (!can_parallel_dump(workers)) {
|
||||
work(VMDumperWorkerId);
|
||||
prepare_parallel_dump(workers);
|
||||
|
||||
if (!is_parallel_dump()) {
|
||||
work(VMDumperId);
|
||||
} else {
|
||||
uint heap_only_dumper_threads = _num_dumper_threads - 1 /* VMDumper thread */;
|
||||
_dumper_controller = new (std::nothrow) DumperController(heap_only_dumper_threads);
|
||||
ParallelObjectIterator poi(_num_dumper_threads);
|
||||
_poi = &poi;
|
||||
workers->run_task(this, _num_dumper_threads);
|
||||
@@ -2368,26 +2429,19 @@ void VM_HeapDumper::doit() {
|
||||
clear_global_writer();
|
||||
}
|
||||
|
||||
// prepare DumpWriter for every parallel dump thread
|
||||
DumpWriter* VM_HeapDumper::create_local_writer() {
|
||||
char* path = NEW_RESOURCE_ARRAY(char, JVM_MAXPATHLEN);
|
||||
memset(path, 0, JVM_MAXPATHLEN);
|
||||
|
||||
// generate segmented heap file path
|
||||
const char* base_path = writer()->get_file_path();
|
||||
// share global compressor, local DumpWriter is not responsible for its life cycle
|
||||
AbstractCompressor* compressor = writer()->compressor();
|
||||
int seq = Atomic::fetch_then_add(&_dump_seq, 1);
|
||||
os::snprintf(path, JVM_MAXPATHLEN, "%s.p%d", base_path, seq);
|
||||
|
||||
// create corresponding writer for that
|
||||
DumpWriter* local_writer = new DumpWriter(path, writer()->is_overwrite(), compressor);
|
||||
return local_writer;
|
||||
}
|
||||
|
||||
void VM_HeapDumper::work(uint worker_id) {
|
||||
// VM Dumper works on all non-heap data dumping and part of heap iteration.
|
||||
if (is_vm_dumper(worker_id)) {
|
||||
int dumper_id = get_next_dumper_id();
|
||||
|
||||
if (is_vm_dumper(dumper_id)) {
|
||||
// lock global writer, it will be unlocked after VM Dumper finishes with non-heap data
|
||||
_dumper_controller->lock_global_writer();
|
||||
_dumper_controller->signal_start();
|
||||
} else {
|
||||
_dumper_controller->wait_for_start_signal();
|
||||
}
|
||||
|
||||
if (is_vm_dumper(dumper_id)) {
|
||||
TraceTime timer("Dump non-objects", TRACETIME_LOG(Info, heapdump));
|
||||
// Write the file header - we always use 1.0.2
|
||||
const char* header = "JAVA PROFILE 1.0.2";
|
||||
@@ -2409,79 +2463,82 @@ void VM_HeapDumper::work(uint worker_id) {
|
||||
|
||||
// write HPROF_FRAME and HPROF_TRACE records
|
||||
// this must be called after _klass_map is built when iterating the classes above.
|
||||
dump_stack_traces();
|
||||
dump_stack_traces(writer());
|
||||
|
||||
// HPROF_HEAP_DUMP/HPROF_HEAP_DUMP_SEGMENT starts here
|
||||
|
||||
// Writes HPROF_GC_CLASS_DUMP records
|
||||
{
|
||||
LockedClassesDo locked_dump_class(&do_class_dump);
|
||||
ClassLoaderDataGraph::classes_do(&locked_dump_class);
|
||||
}
|
||||
|
||||
// HPROF_GC_ROOT_THREAD_OBJ + frames + jni locals
|
||||
dump_threads();
|
||||
|
||||
// HPROF_GC_ROOT_JNI_GLOBAL
|
||||
JNIGlobalsDumper jni_dumper(writer());
|
||||
JNIHandles::oops_do(&jni_dumper);
|
||||
// technically not jni roots, but global roots
|
||||
// for things like preallocated throwable backtraces
|
||||
Universe::vm_global()->oops_do(&jni_dumper);
|
||||
// HPROF_GC_ROOT_STICKY_CLASS
|
||||
// These should be classes in the null class loader data, and not all classes
|
||||
// if !ClassUnloading
|
||||
StickyClassDumper class_dumper(writer());
|
||||
ClassLoaderData::the_null_class_loader_data()->classes_do(&class_dumper);
|
||||
// unlock global writer, so parallel dumpers can dump stack traces of unmounted virtual threads
|
||||
_dumper_controller->unlock_global_writer();
|
||||
}
|
||||
|
||||
// Heap iteration.
|
||||
// writes HPROF_GC_INSTANCE_DUMP records.
|
||||
// After each sub-record is written check_segment_length will be invoked
|
||||
// to check if the current segment exceeds a threshold. If so, a new
|
||||
// segment is started.
|
||||
// The HPROF_GC_CLASS_DUMP and HPROF_GC_INSTANCE_DUMP are the vast bulk
|
||||
// of the heap dump.
|
||||
if (!is_parallel_dump()) {
|
||||
assert(is_vm_dumper(worker_id), "must be");
|
||||
// == Serial dump
|
||||
ResourceMark rm;
|
||||
TraceTime timer("Dump heap objects", TRACETIME_LOG(Info, heapdump));
|
||||
HeapObjectDumper obj_dumper(writer());
|
||||
Universe::heap()->object_iterate(&obj_dumper);
|
||||
writer()->finish_dump_segment();
|
||||
// Writes the HPROF_HEAP_DUMP_END record because merge does not happen in serial dump
|
||||
DumperSupport::end_of_dump(writer());
|
||||
writer()->flush();
|
||||
} else {
|
||||
// == Parallel dump
|
||||
ResourceMark rm;
|
||||
TraceTime timer("Dump heap objects in parallel", TRACETIME_LOG(Info, heapdump));
|
||||
DumpWriter* local_writer = is_vm_dumper(worker_id) ? writer() : create_local_writer();
|
||||
if (!local_writer->has_error()) {
|
||||
HeapObjectDumper obj_dumper(local_writer);
|
||||
_poi->object_iterate(&obj_dumper, worker_id);
|
||||
local_writer->finish_dump_segment();
|
||||
local_writer->flush();
|
||||
// HPROF_HEAP_DUMP/HPROF_HEAP_DUMP_SEGMENT starts here
|
||||
|
||||
ResourceMark rm;
|
||||
// share global compressor, local DumpWriter is not responsible for its life cycle
|
||||
DumpWriter segment_writer(DumpMerger::get_writer_path(writer()->get_file_path(), dumper_id),
|
||||
writer()->is_overwrite(), writer()->compressor());
|
||||
if (!segment_writer.has_error()) {
|
||||
if (is_vm_dumper(dumper_id)) {
|
||||
// dump some non-heap subrecords to heap dump segment
|
||||
TraceTime timer("Dump non-objects (part 2)", TRACETIME_LOG(Info, heapdump));
|
||||
// Writes HPROF_GC_CLASS_DUMP records
|
||||
ClassDumper class_dumper(&segment_writer);
|
||||
ClassLoaderDataGraph::classes_do(&class_dumper);
|
||||
|
||||
// HPROF_GC_ROOT_THREAD_OBJ + frames + jni locals
|
||||
dump_threads(&segment_writer);
|
||||
|
||||
// HPROF_GC_ROOT_JNI_GLOBAL
|
||||
JNIGlobalsDumper jni_dumper(&segment_writer);
|
||||
JNIHandles::oops_do(&jni_dumper);
|
||||
// technically not jni roots, but global roots
|
||||
// for things like preallocated throwable backtraces
|
||||
Universe::vm_global()->oops_do(&jni_dumper);
|
||||
// HPROF_GC_ROOT_STICKY_CLASS
|
||||
// These should be classes in the null class loader data, and not all classes
|
||||
// if !ClassUnloading
|
||||
StickyClassDumper stiky_class_dumper(&segment_writer);
|
||||
ClassLoaderData::the_null_class_loader_data()->classes_do(&stiky_class_dumper);
|
||||
}
|
||||
if (is_vm_dumper(worker_id)) {
|
||||
_dumper_controller->wait_all_dumpers_complete();
|
||||
|
||||
// Heap iteration.
|
||||
// writes HPROF_GC_INSTANCE_DUMP records.
|
||||
// After each sub-record is written check_segment_length will be invoked
|
||||
// to check if the current segment exceeds a threshold. If so, a new
|
||||
// segment is started.
|
||||
// The HPROF_GC_CLASS_DUMP and HPROF_GC_INSTANCE_DUMP are the vast bulk
|
||||
// of the heap dump.
|
||||
|
||||
TraceTime timer(is_parallel_dump() ? "Dump heap objects in parallel" : "Dump heap objects", TRACETIME_LOG(Info, heapdump));
|
||||
HeapObjectDumper obj_dumper(&segment_writer, this);
|
||||
if (!is_parallel_dump()) {
|
||||
Universe::heap()->object_iterate(&obj_dumper);
|
||||
} else {
|
||||
_dumper_controller->dumper_complete(local_writer, writer());
|
||||
delete local_writer;
|
||||
return;
|
||||
// == Parallel dump
|
||||
_poi->object_iterate(&obj_dumper, worker_id);
|
||||
}
|
||||
|
||||
segment_writer.finish_dump_segment();
|
||||
segment_writer.flush();
|
||||
}
|
||||
|
||||
_dumper_controller->dumper_complete(&segment_writer, writer());
|
||||
|
||||
if (is_vm_dumper(dumper_id)) {
|
||||
_dumper_controller->wait_all_dumpers_complete();
|
||||
|
||||
// flush global writer
|
||||
writer()->flush();
|
||||
|
||||
// At this point, all fragments of the heapdump have been written to separate files.
|
||||
// We need to merge them into a complete heapdump and write HPROF_HEAP_DUMP_END at that time.
|
||||
}
|
||||
// At this point, all fragments of the heapdump have been written to separate files.
|
||||
// We need to merge them into a complete heapdump and write HPROF_HEAP_DUMP_END at that time.
|
||||
}
|
||||
|
||||
void VM_HeapDumper::dump_stack_traces() {
|
||||
void VM_HeapDumper::dump_stack_traces(AbstractDumpWriter* writer) {
|
||||
// write a HPROF_TRACE record without any frames to be referenced as object alloc sites
|
||||
DumperSupport::write_header(writer(), HPROF_TRACE, 3 * sizeof(u4));
|
||||
writer()->write_u4((u4)STACK_TRACE_ID);
|
||||
writer()->write_u4(0); // thread number
|
||||
writer()->write_u4(0); // frame count
|
||||
DumperSupport::write_header(writer, HPROF_TRACE, 3 * sizeof(u4));
|
||||
writer->write_u4((u4)STACK_TRACE_ID);
|
||||
writer->write_u4(0); // thread number
|
||||
writer->write_u4(0); // frame count
|
||||
|
||||
// max number if every platform thread is carrier with mounted virtual thread
|
||||
_thread_dumpers = NEW_C_HEAP_ARRAY(ThreadDumper*, Threads::number_of_threads() * 2, mtInternal);
|
||||
@@ -2505,7 +2562,7 @@ void VM_HeapDumper::dump_stack_traces() {
|
||||
add_oom_frame = false;
|
||||
}
|
||||
thread_dumper->init_serial_nums(&_thread_serial_num, &_frame_serial_num);
|
||||
thread_dumper->dump_stack_traces(writer(), _klass_map);
|
||||
thread_dumper->dump_stack_traces(writer, _klass_map);
|
||||
}
|
||||
|
||||
// platform or carrier thread
|
||||
@@ -2515,11 +2572,27 @@ void VM_HeapDumper::dump_stack_traces() {
|
||||
thread_dumper->add_oom_frame(_oome_constructor);
|
||||
}
|
||||
thread_dumper->init_serial_nums(&_thread_serial_num, &_frame_serial_num);
|
||||
thread_dumper->dump_stack_traces(writer(), _klass_map);
|
||||
thread_dumper->dump_stack_traces(writer, _klass_map);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void VM_HeapDumper::dump_vthread(oop vt, AbstractDumpWriter* segment_writer) {
|
||||
// unmounted vthread has no JavaThread
|
||||
ThreadDumper thread_dumper(ThreadDumper::ThreadType::UnmountedVirtual, nullptr, vt);
|
||||
thread_dumper.init_serial_nums(&_thread_serial_num, &_frame_serial_num);
|
||||
|
||||
// write HPROF_TRACE/HPROF_FRAME records to global writer
|
||||
_dumper_controller->lock_global_writer();
|
||||
thread_dumper.dump_stack_traces(writer(), _klass_map);
|
||||
_dumper_controller->unlock_global_writer();
|
||||
|
||||
// write HPROF_GC_ROOT_THREAD_OBJ/HPROF_GC_ROOT_JAVA_FRAME/HPROF_GC_ROOT_JNI_LOCAL subrecord
|
||||
// to segment writer
|
||||
thread_dumper.dump_thread_obj(segment_writer);
|
||||
thread_dumper.dump_stack_refs(segment_writer);
|
||||
}
|
||||
|
||||
// dump the heap to given path.
|
||||
int HeapDumper::dump(const char* path, outputStream* out, int compression, bool overwrite, uint num_dump_threads) {
|
||||
assert(path != nullptr && strlen(path) > 0, "path missing");
|
||||
@@ -2561,28 +2634,27 @@ int HeapDumper::dump(const char* path, outputStream* out, int compression, bool
|
||||
// record any error that the writer may have encountered
|
||||
set_error(writer.error());
|
||||
|
||||
// For serial dump, once VM_HeapDumper completes, the whole heap dump process
|
||||
// is done, no further phases needed. For parallel dump, the whole heap dump
|
||||
// process is done in two phases
|
||||
// Heap dump process is done in two phases
|
||||
//
|
||||
// Phase 1: Concurrent threads directly write heap data to multiple heap files.
|
||||
// This is done by VM_HeapDumper, which is performed within safepoint.
|
||||
//
|
||||
// Phase 2: Merge multiple heap files into one complete heap dump file.
|
||||
// This is done by DumpMerger, which is performed outside safepoint
|
||||
if (dumper.is_parallel_dump()) {
|
||||
DumpMerger merger(path, &writer, dumper.dump_seq());
|
||||
Thread* current_thread = Thread::current();
|
||||
if (current_thread->is_AttachListener_thread()) {
|
||||
// perform heapdump file merge operation in the current thread prevents us
|
||||
// from occupying the VM Thread, which in turn affects the occurrence of
|
||||
// GC and other VM operations.
|
||||
merger.do_merge();
|
||||
} else {
|
||||
// otherwise, performs it by VM thread
|
||||
VM_HeapDumpMerge op(&merger);
|
||||
VMThread::execute(&op);
|
||||
}
|
||||
|
||||
DumpMerger merger(path, &writer, dumper.dump_seq());
|
||||
Thread* current_thread = Thread::current();
|
||||
if (current_thread->is_AttachListener_thread()) {
|
||||
// perform heapdump file merge operation in the current thread prevents us
|
||||
// from occupying the VM Thread, which in turn affects the occurrence of
|
||||
// GC and other VM operations.
|
||||
merger.do_merge();
|
||||
} else {
|
||||
// otherwise, performs it by VM thread
|
||||
VM_HeapDumpMerge op(&merger);
|
||||
VMThread::execute(&op);
|
||||
}
|
||||
if (writer.error() != nullptr) {
|
||||
set_error(writer.error());
|
||||
}
|
||||
|
||||
|
||||
367
src/java.base/linux/native/libsimdsort/avx2-32bit-qsort.hpp
Normal file
367
src/java.base/linux/native/libsimdsort/avx2-32bit-qsort.hpp
Normal file
@@ -0,0 +1,367 @@
|
||||
/*
|
||||
* Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
|
||||
* Copyright (c) 2021 Serge Sans Paille. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*
|
||||
*/
|
||||
|
||||
// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort)
|
||||
|
||||
#ifndef AVX2_QSORT_32BIT
|
||||
#define AVX2_QSORT_32BIT
|
||||
|
||||
#include "avx2-emu-funcs.hpp"
|
||||
#include "xss-common-qsort.h"
|
||||
|
||||
/*
|
||||
* Constants used in sorting 8 elements in a ymm registers. Based on Bitonic
|
||||
* sorting network (see
|
||||
* https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg)
|
||||
*/
|
||||
|
||||
// ymm 7, 6, 5, 4, 3, 2, 1, 0
|
||||
#define NETWORK_32BIT_AVX2_1 4, 5, 6, 7, 0, 1, 2, 3
|
||||
#define NETWORK_32BIT_AVX2_2 0, 1, 2, 3, 4, 5, 6, 7
|
||||
#define NETWORK_32BIT_AVX2_3 5, 4, 7, 6, 1, 0, 3, 2
|
||||
#define NETWORK_32BIT_AVX2_4 3, 2, 1, 0, 7, 6, 5, 4
|
||||
|
||||
/*
|
||||
* Assumes ymm is random and performs a full sorting network defined in
|
||||
* https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
|
||||
*/
|
||||
template <typename vtype, typename reg_t = typename vtype::reg_t>
|
||||
X86_SIMD_SORT_INLINE reg_t sort_ymm_32bit(reg_t ymm) {
|
||||
const typename vtype::opmask_t oxAA = _mm256_set_epi32(
|
||||
0xFFFFFFFF, 0, 0xFFFFFFFF, 0, 0xFFFFFFFF, 0, 0xFFFFFFFF, 0);
|
||||
const typename vtype::opmask_t oxCC = _mm256_set_epi32(
|
||||
0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
|
||||
const typename vtype::opmask_t oxF0 = _mm256_set_epi32(
|
||||
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0, 0);
|
||||
|
||||
const typename vtype::ymmi_t rev_index = vtype::seti(NETWORK_32BIT_AVX2_2);
|
||||
ymm = cmp_merge<vtype>(
|
||||
ymm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(ymm), oxAA);
|
||||
ymm = cmp_merge<vtype>(
|
||||
ymm, vtype::permutexvar(vtype::seti(NETWORK_32BIT_AVX2_1), ymm), oxCC);
|
||||
ymm = cmp_merge<vtype>(
|
||||
ymm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(ymm), oxAA);
|
||||
ymm = cmp_merge<vtype>(ymm, vtype::permutexvar(rev_index, ymm), oxF0);
|
||||
ymm = cmp_merge<vtype>(
|
||||
ymm, vtype::permutexvar(vtype::seti(NETWORK_32BIT_AVX2_3), ymm), oxCC);
|
||||
ymm = cmp_merge<vtype>(
|
||||
ymm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(ymm), oxAA);
|
||||
return ymm;
|
||||
}
|
||||
|
||||
struct avx2_32bit_swizzle_ops;
|
||||
|
||||
template <>
|
||||
struct avx2_vector<int32_t> {
|
||||
using type_t = int32_t;
|
||||
using reg_t = __m256i;
|
||||
using ymmi_t = __m256i;
|
||||
using opmask_t = __m256i;
|
||||
static const uint8_t numlanes = 8;
|
||||
#ifdef XSS_MINIMAL_NETWORK_SORT
|
||||
static constexpr int network_sort_threshold = numlanes;
|
||||
#else
|
||||
static constexpr int network_sort_threshold = 256;
|
||||
#endif
|
||||
static constexpr int partition_unroll_factor = 4;
|
||||
|
||||
using swizzle_ops = avx2_32bit_swizzle_ops;
|
||||
|
||||
static type_t type_max() { return X86_SIMD_SORT_MAX_INT32; }
|
||||
static type_t type_min() { return X86_SIMD_SORT_MIN_INT32; }
|
||||
static reg_t zmm_max() {
|
||||
return _mm256_set1_epi32(type_max());
|
||||
} // TODO: this should broadcast bits as is?
|
||||
static opmask_t get_partial_loadmask(uint64_t num_to_read) {
|
||||
auto mask = ((0x1ull << num_to_read) - 0x1ull);
|
||||
return convert_int_to_avx2_mask(mask);
|
||||
}
|
||||
static ymmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7,
|
||||
int v8) {
|
||||
return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8);
|
||||
}
|
||||
static opmask_t kxor_opmask(opmask_t x, opmask_t y) {
|
||||
return _mm256_xor_si256(x, y);
|
||||
}
|
||||
static opmask_t ge(reg_t x, reg_t y) {
|
||||
opmask_t equal = eq(x, y);
|
||||
opmask_t greater = _mm256_cmpgt_epi32(x, y);
|
||||
return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(equal),
|
||||
_mm256_castsi256_ps(greater)));
|
||||
}
|
||||
static opmask_t gt(reg_t x, reg_t y) { return _mm256_cmpgt_epi32(x, y); }
|
||||
static opmask_t eq(reg_t x, reg_t y) { return _mm256_cmpeq_epi32(x, y); }
|
||||
template <int scale>
|
||||
static reg_t mask_i64gather(reg_t src, opmask_t mask, __m256i index,
|
||||
void const *base) {
|
||||
return _mm256_mask_i32gather_epi32(src, base, index, mask, scale);
|
||||
}
|
||||
template <int scale>
|
||||
static reg_t i64gather(__m256i index, void const *base) {
|
||||
return _mm256_i32gather_epi32((int const *)base, index, scale);
|
||||
}
|
||||
static reg_t loadu(void const *mem) {
|
||||
return _mm256_loadu_si256((reg_t const *)mem);
|
||||
}
|
||||
static reg_t max(reg_t x, reg_t y) { return _mm256_max_epi32(x, y); }
|
||||
static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) {
|
||||
return avx2_emu_mask_compressstoreu32<type_t>(mem, mask, x);
|
||||
}
|
||||
static reg_t maskz_loadu(opmask_t mask, void const *mem) {
|
||||
return _mm256_maskload_epi32((const int *)mem, mask);
|
||||
}
|
||||
static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) {
|
||||
reg_t dst = _mm256_maskload_epi32((type_t *)mem, mask);
|
||||
return mask_mov(x, mask, dst);
|
||||
}
|
||||
static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) {
|
||||
return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(x),
|
||||
_mm256_castsi256_ps(y),
|
||||
_mm256_castsi256_ps(mask)));
|
||||
}
|
||||
static void mask_storeu(void *mem, opmask_t mask, reg_t x) {
|
||||
return _mm256_maskstore_epi32((type_t *)mem, mask, x);
|
||||
}
|
||||
static reg_t min(reg_t x, reg_t y) { return _mm256_min_epi32(x, y); }
|
||||
static reg_t permutexvar(__m256i idx, reg_t ymm) {
|
||||
return _mm256_permutevar8x32_epi32(ymm, idx);
|
||||
// return avx2_emu_permutexvar_epi32(idx, ymm);
|
||||
}
|
||||
static reg_t permutevar(reg_t ymm, __m256i idx) {
|
||||
return _mm256_permutevar8x32_epi32(ymm, idx);
|
||||
}
|
||||
static reg_t reverse(reg_t ymm) {
|
||||
const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2);
|
||||
return permutexvar(rev_index, ymm);
|
||||
}
|
||||
static type_t reducemax(reg_t v) {
|
||||
return avx2_emu_reduce_max32<type_t>(v);
|
||||
}
|
||||
static type_t reducemin(reg_t v) {
|
||||
return avx2_emu_reduce_min32<type_t>(v);
|
||||
}
|
||||
static reg_t set1(type_t v) { return _mm256_set1_epi32(v); }
|
||||
template <uint8_t mask>
|
||||
static reg_t shuffle(reg_t ymm) {
|
||||
return _mm256_shuffle_epi32(ymm, mask);
|
||||
}
|
||||
static void storeu(void *mem, reg_t x) {
|
||||
_mm256_storeu_si256((__m256i *)mem, x);
|
||||
}
|
||||
static reg_t sort_vec(reg_t x) {
|
||||
return sort_ymm_32bit<avx2_vector<type_t>>(x);
|
||||
}
|
||||
static reg_t cast_from(__m256i v) { return v; }
|
||||
static __m256i cast_to(reg_t v) { return v; }
|
||||
static int double_compressstore(type_t *left_addr, type_t *right_addr,
|
||||
opmask_t k, reg_t reg) {
|
||||
return avx2_double_compressstore32<type_t>(left_addr, right_addr, k,
|
||||
reg);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct avx2_vector<float> {
|
||||
using type_t = float;
|
||||
using reg_t = __m256;
|
||||
using ymmi_t = __m256i;
|
||||
using opmask_t = __m256i;
|
||||
static const uint8_t numlanes = 8;
|
||||
#ifdef XSS_MINIMAL_NETWORK_SORT
|
||||
static constexpr int network_sort_threshold = numlanes;
|
||||
#else
|
||||
static constexpr int network_sort_threshold = 256;
|
||||
#endif
|
||||
static constexpr int partition_unroll_factor = 4;
|
||||
|
||||
using swizzle_ops = avx2_32bit_swizzle_ops;
|
||||
|
||||
static type_t type_max() { return X86_SIMD_SORT_INFINITYF; }
|
||||
static type_t type_min() { return -X86_SIMD_SORT_INFINITYF; }
|
||||
static reg_t zmm_max() { return _mm256_set1_ps(type_max()); }
|
||||
|
||||
static ymmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7,
|
||||
int v8) {
|
||||
return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8);
|
||||
}
|
||||
|
||||
static reg_t maskz_loadu(opmask_t mask, void const *mem) {
|
||||
return _mm256_maskload_ps((const float *)mem, mask);
|
||||
}
|
||||
static opmask_t ge(reg_t x, reg_t y) {
|
||||
return _mm256_castps_si256(_mm256_cmp_ps(x, y, _CMP_GE_OQ));
|
||||
}
|
||||
static opmask_t gt(reg_t x, reg_t y) {
|
||||
return _mm256_castps_si256(_mm256_cmp_ps(x, y, _CMP_GT_OQ));
|
||||
}
|
||||
static opmask_t eq(reg_t x, reg_t y) {
|
||||
return _mm256_castps_si256(_mm256_cmp_ps(x, y, _CMP_EQ_OQ));
|
||||
}
|
||||
static opmask_t get_partial_loadmask(uint64_t num_to_read) {
|
||||
auto mask = ((0x1ull << num_to_read) - 0x1ull);
|
||||
return convert_int_to_avx2_mask(mask);
|
||||
}
|
||||
static int32_t convert_mask_to_int(opmask_t mask) {
|
||||
return convert_avx2_mask_to_int(mask);
|
||||
}
|
||||
template <int type>
|
||||
static opmask_t fpclass(reg_t x) {
|
||||
if constexpr (type == (0x01 | 0x80)) {
|
||||
return _mm256_castps_si256(_mm256_cmp_ps(x, x, _CMP_UNORD_Q));
|
||||
} else {
|
||||
static_assert(type == (0x01 | 0x80), "should not reach here");
|
||||
}
|
||||
}
|
||||
template <int scale>
|
||||
static reg_t mask_i64gather(reg_t src, opmask_t mask, __m256i index,
|
||||
void const *base) {
|
||||
return _mm256_mask_i32gather_ps(src, base, index,
|
||||
_mm256_castsi256_ps(mask), scale);
|
||||
;
|
||||
}
|
||||
template <int scale>
|
||||
static reg_t i64gather(__m256i index, void const *base) {
|
||||
return _mm256_i32gather_ps((float *)base, index, scale);
|
||||
}
|
||||
static reg_t loadu(void const *mem) {
|
||||
return _mm256_loadu_ps((float const *)mem);
|
||||
}
|
||||
static reg_t max(reg_t x, reg_t y) { return _mm256_max_ps(x, y); }
|
||||
static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) {
|
||||
return avx2_emu_mask_compressstoreu32<type_t>(mem, mask, x);
|
||||
}
|
||||
static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) {
|
||||
reg_t dst = _mm256_maskload_ps((type_t *)mem, mask);
|
||||
return mask_mov(x, mask, dst);
|
||||
}
|
||||
static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) {
|
||||
return _mm256_blendv_ps(x, y, _mm256_castsi256_ps(mask));
|
||||
}
|
||||
static void mask_storeu(void *mem, opmask_t mask, reg_t x) {
|
||||
return _mm256_maskstore_ps((type_t *)mem, mask, x);
|
||||
}
|
||||
static reg_t min(reg_t x, reg_t y) { return _mm256_min_ps(x, y); }
|
||||
static reg_t permutexvar(__m256i idx, reg_t ymm) {
|
||||
return _mm256_permutevar8x32_ps(ymm, idx);
|
||||
}
|
||||
static reg_t permutevar(reg_t ymm, __m256i idx) {
|
||||
return _mm256_permutevar8x32_ps(ymm, idx);
|
||||
}
|
||||
static reg_t reverse(reg_t ymm) {
|
||||
const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2);
|
||||
return permutexvar(rev_index, ymm);
|
||||
}
|
||||
static type_t reducemax(reg_t v) {
|
||||
return avx2_emu_reduce_max32<type_t>(v);
|
||||
}
|
||||
static type_t reducemin(reg_t v) {
|
||||
return avx2_emu_reduce_min32<type_t>(v);
|
||||
}
|
||||
static reg_t set1(type_t v) { return _mm256_set1_ps(v); }
|
||||
template <uint8_t mask>
|
||||
static reg_t shuffle(reg_t ymm) {
|
||||
return _mm256_castsi256_ps(
|
||||
_mm256_shuffle_epi32(_mm256_castps_si256(ymm), mask));
|
||||
}
|
||||
static void storeu(void *mem, reg_t x) {
|
||||
_mm256_storeu_ps((float *)mem, x);
|
||||
}
|
||||
static reg_t sort_vec(reg_t x) {
|
||||
return sort_ymm_32bit<avx2_vector<type_t>>(x);
|
||||
}
|
||||
static reg_t cast_from(__m256i v) { return _mm256_castsi256_ps(v); }
|
||||
static __m256i cast_to(reg_t v) { return _mm256_castps_si256(v); }
|
||||
static int double_compressstore(type_t *left_addr, type_t *right_addr,
|
||||
opmask_t k, reg_t reg) {
|
||||
return avx2_double_compressstore32<type_t>(left_addr, right_addr, k,
|
||||
reg);
|
||||
}
|
||||
};
|
||||
|
||||
struct avx2_32bit_swizzle_ops {
|
||||
template <typename vtype, int scale>
|
||||
X86_SIMD_SORT_INLINE typename vtype::reg_t swap_n(
|
||||
typename vtype::reg_t reg) {
|
||||
__m256i v = vtype::cast_to(reg);
|
||||
|
||||
if constexpr (scale == 2) {
|
||||
__m256 vf = _mm256_castsi256_ps(v);
|
||||
vf = _mm256_permute_ps(vf, 0b10110001);
|
||||
v = _mm256_castps_si256(vf);
|
||||
} else if constexpr (scale == 4) {
|
||||
__m256 vf = _mm256_castsi256_ps(v);
|
||||
vf = _mm256_permute_ps(vf, 0b01001110);
|
||||
v = _mm256_castps_si256(vf);
|
||||
} else if constexpr (scale == 8) {
|
||||
v = _mm256_permute2x128_si256(v, v, 0b00000001);
|
||||
} else {
|
||||
static_assert(scale == -1, "should not be reached");
|
||||
}
|
||||
|
||||
return vtype::cast_from(v);
|
||||
}
|
||||
|
||||
template <typename vtype, int scale>
|
||||
X86_SIMD_SORT_INLINE typename vtype::reg_t reverse_n(
|
||||
typename vtype::reg_t reg) {
|
||||
__m256i v = vtype::cast_to(reg);
|
||||
|
||||
if constexpr (scale == 2) {
|
||||
return swap_n<vtype, 2>(reg);
|
||||
} else if constexpr (scale == 4) {
|
||||
constexpr uint64_t mask = 0b00011011;
|
||||
__m256 vf = _mm256_castsi256_ps(v);
|
||||
vf = _mm256_permute_ps(vf, mask);
|
||||
v = _mm256_castps_si256(vf);
|
||||
} else if constexpr (scale == 8) {
|
||||
return vtype::reverse(reg);
|
||||
} else {
|
||||
static_assert(scale == -1, "should not be reached");
|
||||
}
|
||||
|
||||
return vtype::cast_from(v);
|
||||
}
|
||||
|
||||
template <typename vtype, int scale>
|
||||
X86_SIMD_SORT_INLINE typename vtype::reg_t merge_n(
|
||||
typename vtype::reg_t reg, typename vtype::reg_t other) {
|
||||
__m256i v1 = vtype::cast_to(reg);
|
||||
__m256i v2 = vtype::cast_to(other);
|
||||
|
||||
if constexpr (scale == 2) {
|
||||
v1 = _mm256_blend_epi32(v1, v2, 0b01010101);
|
||||
} else if constexpr (scale == 4) {
|
||||
v1 = _mm256_blend_epi32(v1, v2, 0b00110011);
|
||||
} else if constexpr (scale == 8) {
|
||||
v1 = _mm256_blend_epi32(v1, v2, 0b00001111);
|
||||
} else {
|
||||
static_assert(scale == -1, "should not be reached");
|
||||
}
|
||||
|
||||
return vtype::cast_from(v1);
|
||||
}
|
||||
};
|
||||
|
||||
#endif // AVX2_QSORT_32BIT
|
||||
183
src/java.base/linux/native/libsimdsort/avx2-emu-funcs.hpp
Normal file
183
src/java.base/linux/native/libsimdsort/avx2-emu-funcs.hpp
Normal file
@@ -0,0 +1,183 @@
|
||||
/*
|
||||
* Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
|
||||
* Copyright (c) 2021 Serge Sans Paille. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*
|
||||
*/
|
||||
|
||||
// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort)

#ifndef AVX2_EMU_FUNCS
#define AVX2_EMU_FUNCS

#include <array>
#include <utility>

#include "xss-common-qsort.h"

constexpr auto avx2_mask_helper_lut32 = [] {
    std::array<std::array<int32_t, 8>, 256> lut{};
    for (int64_t i = 0; i <= 0xFF; i++) {
        std::array<int32_t, 8> entry{};
        for (int j = 0; j < 8; j++) {
            if (((i >> j) & 1) == 1)
                entry[j] = 0xFFFFFFFF;
            else
                entry[j] = 0;
        }
        lut[i] = entry;
    }
    return lut;
}();

constexpr auto avx2_compressstore_lut32_gen = [] {
    std::array<std::array<std::array<int32_t, 8>, 256>, 2> lutPair{};
    auto &permLut = lutPair[0];
    auto &leftLut = lutPair[1];
    for (int64_t i = 0; i <= 0xFF; i++) {
        std::array<int32_t, 8> indices{};
        std::array<int32_t, 8> leftEntry = {0, 0, 0, 0, 0, 0, 0, 0};
        int right = 7;
        int left = 0;
        for (int j = 0; j < 8; j++) {
            bool ge = (i >> j) & 1;
            if (ge) {
                indices[right] = j;
                right--;
            } else {
                indices[left] = j;
                leftEntry[left] = 0xFFFFFFFF;
                left++;
            }
        }
        permLut[i] = indices;
        leftLut[i] = leftEntry;
    }
    return lutPair;
}();

constexpr auto avx2_compressstore_lut32_perm = avx2_compressstore_lut32_gen[0];
constexpr auto avx2_compressstore_lut32_left = avx2_compressstore_lut32_gen[1];


X86_SIMD_SORT_INLINE
__m256i convert_int_to_avx2_mask(int32_t m) {
    return _mm256_loadu_si256(
        (const __m256i *)avx2_mask_helper_lut32[m].data());
}

X86_SIMD_SORT_INLINE
int32_t convert_avx2_mask_to_int(__m256i m) {
    return _mm256_movemask_ps(_mm256_castsi256_ps(m));
}
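
A small usage sketch of the two mask conversions above (illustration only; it assumes this header and its dependencies are on the include path and the file is compiled with AVX2 enabled): an 8-bit scalar mask expands to a full 256-bit lane mask via the LUT and collapses back to the same integer.

#include <cassert>
#include <cstdint>
#include "avx2-emu-funcs.hpp"   // assumed include path, compile with -mavx2

int main() {
    int32_t m = 0b10110010;                       // lanes 1, 4, 5 and 7 selected
    __m256i wide = convert_int_to_avx2_mask(m);   // selected lanes become 0xFFFFFFFF
    assert(convert_avx2_mask_to_int(wide) == m);  // round-trips to the same mask
    return 0;
}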

// Emulators for intrinsics missing from AVX2 compared to AVX512
template <typename T>
T avx2_emu_reduce_max32(typename avx2_vector<T>::reg_t x) {
    using vtype = avx2_vector<T>;
    using reg_t = typename vtype::reg_t;

    reg_t inter1 =
        vtype::max(x, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(x));
    reg_t inter2 = vtype::max(
        inter1, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(inter1));
    T arr[vtype::numlanes];
    vtype::storeu(arr, inter2);
    return std::max(arr[0], arr[7]);
}

template <typename T>
T avx2_emu_reduce_min32(typename avx2_vector<T>::reg_t x) {
    using vtype = avx2_vector<T>;
    using reg_t = typename vtype::reg_t;

    reg_t inter1 =
        vtype::min(x, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(x));
    reg_t inter2 = vtype::min(
        inter1, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(inter1));
    T arr[vtype::numlanes];
    vtype::storeu(arr, inter2);
    return std::min(arr[0], arr[7]);
}

template <typename T>
void avx2_emu_mask_compressstoreu32(void *base_addr,
                                    typename avx2_vector<T>::opmask_t k,
                                    typename avx2_vector<T>::reg_t reg) {
    using vtype = avx2_vector<T>;

    T *leftStore = (T *)base_addr;

    int32_t shortMask = convert_avx2_mask_to_int(k);
    const __m256i &perm = _mm256_loadu_si256(
        (const __m256i *)avx2_compressstore_lut32_perm[shortMask].data());
    const __m256i &left = _mm256_loadu_si256(
        (const __m256i *)avx2_compressstore_lut32_left[shortMask].data());

    typename vtype::reg_t temp = vtype::permutevar(reg, perm);

    vtype::mask_storeu(leftStore, left, temp);
}


template <typename T>
int avx2_double_compressstore32(void *left_addr, void *right_addr,
                                typename avx2_vector<T>::opmask_t k,
                                typename avx2_vector<T>::reg_t reg) {
    using vtype = avx2_vector<T>;

    T *leftStore = (T *)left_addr;
    T *rightStore = (T *)right_addr;

    int32_t shortMask = convert_avx2_mask_to_int(k);
    const __m256i &perm = _mm256_loadu_si256(
        (const __m256i *)avx2_compressstore_lut32_perm[shortMask].data());

    typename vtype::reg_t temp = vtype::permutevar(reg, perm);

    vtype::storeu(leftStore, temp);
    vtype::storeu(rightStore, temp);

    return _mm_popcnt_u32(shortMask);
}
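
For reference, a scalar sketch (illustration only, not part of the change) of what the double compress-store above computes: lanes whose mask bit is clear are packed to the front of the permuted vector, lanes whose bit is set fill the back, the permuted vector is written to both destinations, and the popcount of the mask is returned so the caller can advance its store cursors.

#include <cstdint>

// Scalar analogue of avx2_double_compressstore32 for 8 lanes (illustration only).
template <typename T>
int double_compressstore32_scalar(T *left_addr, T *right_addr,
                                  uint8_t mask, const T (&reg)[8]) {
    T packed[8];
    int lo = 0, hi = 7, count = 0;
    for (int j = 0; j < 8; j++) {
        if ((mask >> j) & 1) { packed[hi--] = reg[j]; count++; }   // selected lanes go to the back
        else                 { packed[lo++] = reg[j]; }            // the rest pack to the front
    }
    for (int j = 0; j < 8; j++) {
        left_addr[j] = packed[j];
        right_addr[j] = packed[j];
    }
    return count;   // number of set mask bits, as _mm_popcnt_u32 returns
}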


template <typename T>
typename avx2_vector<T>::reg_t avx2_emu_max(typename avx2_vector<T>::reg_t x,
                                            typename avx2_vector<T>::reg_t y) {
    using vtype = avx2_vector<T>;
    typename vtype::opmask_t nlt = vtype::gt(x, y);
    return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(y),
                                                _mm256_castsi256_pd(x),
                                                _mm256_castsi256_pd(nlt)));
}

template <typename T>
typename avx2_vector<T>::reg_t avx2_emu_min(typename avx2_vector<T>::reg_t x,
                                            typename avx2_vector<T>::reg_t y) {
    using vtype = avx2_vector<T>;
    typename vtype::opmask_t nlt = vtype::gt(x, y);
    return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(x),
                                                _mm256_castsi256_pd(y),
                                                _mm256_castsi256_pd(nlt)));
}

#endif
src/java.base/linux/native/libsimdsort/avx2-linux-qsort.cpp (new file, 66 lines)
@@ -0,0 +1,66 @@
|
||||
/*
|
||||
* Copyright (c) 2023 Intel Corporation. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*
|
||||
*/
|
||||

#include "simdsort-support.hpp"
#ifdef __SIMDSORT_SUPPORTED_LINUX

#pragma GCC target("avx2")
#include "avx2-32bit-qsort.hpp"
#include "classfile_constants.h"


#define DLL_PUBLIC __attribute__((visibility("default")))
#define INSERTION_SORT_THRESHOLD_32BIT 16

extern "C" {

DLL_PUBLIC void avx2_sort(void *array, int elem_type, int32_t from_index, int32_t to_index) {
    switch(elem_type) {
        case JVM_T_INT:
            avx2_fast_sort((int32_t*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT);
            break;
        case JVM_T_FLOAT:
            avx2_fast_sort((float*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT);
            break;
        default:
            assert(false, "Unexpected type");
    }
}

DLL_PUBLIC void avx2_partition(void *array, int elem_type, int32_t from_index, int32_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) {
    switch(elem_type) {
        case JVM_T_INT:
            avx2_fast_partition((int32_t*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
            break;
        case JVM_T_FLOAT:
            avx2_fast_partition((float*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
            break;
        default:
            assert(false, "Unexpected type");
    }
}

}

#endif
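
A minimal caller sketch for the exported entry point (illustration only; in the JDK the caller is the intrinsified sort path, and this snippet only assumes the avx2_sort signature shown above, the JVM_T_INT constant from classfile_constants.h, and linking against the built libsimdsort library):

#include <cstdint>
#include <cstdio>
#include "classfile_constants.h"   // assumed on the include path, provides JVM_T_INT

extern "C" void avx2_sort(void *array, int elem_type, int32_t from_index, int32_t to_index);

int main() {
    int32_t data[] = {5, 3, 9, 1, 4, 8, 2, 7, 6, 0};
    avx2_sort(data, JVM_T_INT, 0, 10);   // sorts data[0..10)
    for (int32_t v : data) printf("%d ", v);
    printf("\n");
    return 0;
}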
|
||||
@@ -28,7 +28,7 @@
|
||||
#ifndef AVX512_QSORT_32BIT
|
||||
#define AVX512_QSORT_32BIT
|
||||
|
||||
#include "avx512-common-qsort.h"
|
||||
#include "xss-common-qsort.h"
|
||||
|
||||
/*
|
||||
* Constants used in sorting 16 elements in a ZMM registers. Based on Bitonic
|
||||
@@ -43,130 +43,204 @@
|
||||
#define NETWORK_32BIT_6 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4
|
||||
#define NETWORK_32BIT_7 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
|
||||
|
||||
template <typename vtype, typename reg_t>
|
||||
X86_SIMD_SORT_INLINE reg_t sort_zmm_32bit(reg_t zmm);
|
||||
|
||||
struct avx512_32bit_swizzle_ops;
|
||||
|
||||
template <>
|
||||
struct zmm_vector<int32_t> {
|
||||
using type_t = int32_t;
|
||||
using zmm_t = __m512i;
|
||||
using ymm_t = __m256i;
|
||||
using reg_t = __m512i;
|
||||
using halfreg_t = __m256i;
|
||||
using opmask_t = __mmask16;
|
||||
static const uint8_t numlanes = 16;
|
||||
#ifdef XSS_MINIMAL_NETWORK_SORT
|
||||
static constexpr int network_sort_threshold = numlanes;
|
||||
#else
|
||||
static constexpr int network_sort_threshold = 512;
|
||||
#endif
|
||||
static constexpr int partition_unroll_factor = 8;
|
||||
|
||||
using swizzle_ops = avx512_32bit_swizzle_ops;
|
||||
|
||||
static type_t type_max() { return X86_SIMD_SORT_MAX_INT32; }
|
||||
static type_t type_min() { return X86_SIMD_SORT_MIN_INT32; }
|
||||
static zmm_t zmm_max() { return _mm512_set1_epi32(type_max()); }
|
||||
static reg_t zmm_max() { return _mm512_set1_epi32(type_max()); }
|
||||
|
||||
static opmask_t knot_opmask(opmask_t x) { return _mm512_knot(x); }
|
||||
static opmask_t ge(zmm_t x, zmm_t y) {
|
||||
|
||||
static opmask_t ge(reg_t x, reg_t y) {
|
||||
return _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_NLT);
|
||||
}
|
||||
static opmask_t gt(zmm_t x, zmm_t y) {
|
||||
|
||||
static opmask_t gt(reg_t x, reg_t y) {
|
||||
return _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_GT);
|
||||
}
|
||||
|
||||
static opmask_t get_partial_loadmask(uint64_t num_to_read) {
|
||||
return ((0x1ull << num_to_read) - 0x1ull);
|
||||
}
|
||||
template <int scale>
|
||||
static ymm_t i64gather(__m512i index, void const *base) {
|
||||
static halfreg_t i64gather(__m512i index, void const *base) {
|
||||
return _mm512_i64gather_epi32(index, base, scale);
|
||||
}
|
||||
static zmm_t merge(ymm_t y1, ymm_t y2) {
|
||||
zmm_t z1 = _mm512_castsi256_si512(y1);
|
||||
static reg_t merge(halfreg_t y1, halfreg_t y2) {
|
||||
reg_t z1 = _mm512_castsi256_si512(y1);
|
||||
return _mm512_inserti32x8(z1, y2, 1);
|
||||
}
|
||||
static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); }
|
||||
static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) {
|
||||
static reg_t loadu(void const *mem) { return _mm512_loadu_si512(mem); }
|
||||
static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) {
|
||||
return _mm512_mask_compressstoreu_epi32(mem, mask, x);
|
||||
}
|
||||
static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) {
|
||||
static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) {
|
||||
return _mm512_mask_loadu_epi32(x, mask, mem);
|
||||
}
|
||||
static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) {
|
||||
static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) {
|
||||
return _mm512_mask_mov_epi32(x, mask, y);
|
||||
}
|
||||
static void mask_storeu(void *mem, opmask_t mask, zmm_t x) {
|
||||
static void mask_storeu(void *mem, opmask_t mask, reg_t x) {
|
||||
return _mm512_mask_storeu_epi32(mem, mask, x);
|
||||
}
|
||||
static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epi32(x, y); }
|
||||
static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epi32(x, y); }
|
||||
static zmm_t permutexvar(__m512i idx, zmm_t zmm) {
|
||||
static reg_t min(reg_t x, reg_t y) { return _mm512_min_epi32(x, y); }
|
||||
static reg_t max(reg_t x, reg_t y) { return _mm512_max_epi32(x, y); }
|
||||
static reg_t permutexvar(__m512i idx, reg_t zmm) {
|
||||
return _mm512_permutexvar_epi32(idx, zmm);
|
||||
}
|
||||
static type_t reducemax(zmm_t v) { return _mm512_reduce_max_epi32(v); }
|
||||
static type_t reducemin(zmm_t v) { return _mm512_reduce_min_epi32(v); }
|
||||
static zmm_t set1(type_t v) { return _mm512_set1_epi32(v); }
|
||||
static type_t reducemax(reg_t v) { return _mm512_reduce_max_epi32(v); }
|
||||
static type_t reducemin(reg_t v) { return _mm512_reduce_min_epi32(v); }
|
||||
static reg_t set1(type_t v) { return _mm512_set1_epi32(v); }
|
||||
template <uint8_t mask>
|
||||
static zmm_t shuffle(zmm_t zmm) {
|
||||
static reg_t shuffle(reg_t zmm) {
|
||||
return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask);
|
||||
}
|
||||
static void storeu(void *mem, zmm_t x) {
|
||||
static void storeu(void *mem, reg_t x) {
|
||||
return _mm512_storeu_si512(mem, x);
|
||||
}
|
||||
|
||||
static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_epi32(x, y); }
|
||||
static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_epi32(x, y); }
|
||||
static halfreg_t max(halfreg_t x, halfreg_t y) {
|
||||
return _mm256_max_epi32(x, y);
|
||||
}
|
||||
static halfreg_t min(halfreg_t x, halfreg_t y) {
|
||||
return _mm256_min_epi32(x, y);
|
||||
}
|
||||
static reg_t reverse(reg_t zmm) {
|
||||
const auto rev_index = _mm512_set_epi32(NETWORK_32BIT_5);
|
||||
return permutexvar(rev_index, zmm);
|
||||
}
|
||||
static reg_t sort_vec(reg_t x) {
|
||||
return sort_zmm_32bit<zmm_vector<type_t>>(x);
|
||||
}
|
||||
static reg_t cast_from(__m512i v) { return v; }
|
||||
static __m512i cast_to(reg_t v) { return v; }
|
||||
static int double_compressstore(type_t *left_addr, type_t *right_addr,
|
||||
opmask_t k, reg_t reg) {
|
||||
return avx512_double_compressstore<zmm_vector<type_t>>(
|
||||
left_addr, right_addr, k, reg);
|
||||
}
|
||||
};
|
||||
template <>
|
||||
struct zmm_vector<float> {
|
||||
using type_t = float;
|
||||
using zmm_t = __m512;
|
||||
using ymm_t = __m256;
|
||||
using reg_t = __m512;
|
||||
using halfreg_t = __m256;
|
||||
using opmask_t = __mmask16;
|
||||
static const uint8_t numlanes = 16;
|
||||
#ifdef XSS_MINIMAL_NETWORK_SORT
|
||||
static constexpr int network_sort_threshold = numlanes;
|
||||
#else
|
||||
static constexpr int network_sort_threshold = 512;
|
||||
#endif
|
||||
static constexpr int partition_unroll_factor = 8;
|
||||
|
||||
using swizzle_ops = avx512_32bit_swizzle_ops;
|
||||
|
||||
static type_t type_max() { return X86_SIMD_SORT_INFINITYF; }
|
||||
static type_t type_min() { return -X86_SIMD_SORT_INFINITYF; }
|
||||
static zmm_t zmm_max() { return _mm512_set1_ps(type_max()); }
|
||||
static reg_t zmm_max() { return _mm512_set1_ps(type_max()); }
|
||||
|
||||
static opmask_t knot_opmask(opmask_t x) { return _mm512_knot(x); }
|
||||
static opmask_t ge(zmm_t x, zmm_t y) {
|
||||
static opmask_t ge(reg_t x, reg_t y) {
|
||||
return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ);
|
||||
}
|
||||
static opmask_t gt(zmm_t x, zmm_t y) {
|
||||
static opmask_t gt(reg_t x, reg_t y) {
|
||||
return _mm512_cmp_ps_mask(x, y, _CMP_GT_OQ);
|
||||
}
|
||||
static opmask_t get_partial_loadmask(uint64_t num_to_read) {
|
||||
return ((0x1ull << num_to_read) - 0x1ull);
|
||||
}
|
||||
static int32_t convert_mask_to_int(opmask_t mask) { return mask; }
|
||||
template <int type>
|
||||
static opmask_t fpclass(reg_t x) {
|
||||
return _mm512_fpclass_ps_mask(x, type);
|
||||
}
|
||||
template <int scale>
|
||||
static ymm_t i64gather(__m512i index, void const *base) {
|
||||
static halfreg_t i64gather(__m512i index, void const *base) {
|
||||
return _mm512_i64gather_ps(index, base, scale);
|
||||
}
|
||||
static zmm_t merge(ymm_t y1, ymm_t y2) {
|
||||
zmm_t z1 = _mm512_castsi512_ps(
|
||||
static reg_t merge(halfreg_t y1, halfreg_t y2) {
|
||||
reg_t z1 = _mm512_castsi512_ps(
|
||||
_mm512_castsi256_si512(_mm256_castps_si256(y1)));
|
||||
return _mm512_insertf32x8(z1, y2, 1);
|
||||
}
|
||||
static zmm_t loadu(void const *mem) { return _mm512_loadu_ps(mem); }
|
||||
static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_ps(x, y); }
|
||||
static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) {
|
||||
static reg_t loadu(void const *mem) { return _mm512_loadu_ps(mem); }
|
||||
static reg_t max(reg_t x, reg_t y) { return _mm512_max_ps(x, y); }
|
||||
static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) {
|
||||
return _mm512_mask_compressstoreu_ps(mem, mask, x);
|
||||
}
|
||||
static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) {
|
||||
static reg_t maskz_loadu(opmask_t mask, void const *mem) {
|
||||
return _mm512_maskz_loadu_ps(mask, mem);
|
||||
}
|
||||
static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) {
|
||||
return _mm512_mask_loadu_ps(x, mask, mem);
|
||||
}
|
||||
static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) {
|
||||
static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) {
|
||||
return _mm512_mask_mov_ps(x, mask, y);
|
||||
}
|
||||
static void mask_storeu(void *mem, opmask_t mask, zmm_t x) {
|
||||
static void mask_storeu(void *mem, opmask_t mask, reg_t x) {
|
||||
return _mm512_mask_storeu_ps(mem, mask, x);
|
||||
}
|
||||
static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_ps(x, y); }
|
||||
static zmm_t permutexvar(__m512i idx, zmm_t zmm) {
|
||||
static reg_t min(reg_t x, reg_t y) { return _mm512_min_ps(x, y); }
|
||||
static reg_t permutexvar(__m512i idx, reg_t zmm) {
|
||||
return _mm512_permutexvar_ps(idx, zmm);
|
||||
}
|
||||
static type_t reducemax(zmm_t v) { return _mm512_reduce_max_ps(v); }
|
||||
static type_t reducemin(zmm_t v) { return _mm512_reduce_min_ps(v); }
|
||||
static zmm_t set1(type_t v) { return _mm512_set1_ps(v); }
|
||||
static type_t reducemax(reg_t v) { return _mm512_reduce_max_ps(v); }
|
||||
static type_t reducemin(reg_t v) { return _mm512_reduce_min_ps(v); }
|
||||
static reg_t set1(type_t v) { return _mm512_set1_ps(v); }
|
||||
template <uint8_t mask>
|
||||
static zmm_t shuffle(zmm_t zmm) {
|
||||
static reg_t shuffle(reg_t zmm) {
|
||||
return _mm512_shuffle_ps(zmm, zmm, (_MM_PERM_ENUM)mask);
|
||||
}
|
||||
static void storeu(void *mem, zmm_t x) { return _mm512_storeu_ps(mem, x); }
|
||||
static void storeu(void *mem, reg_t x) { return _mm512_storeu_ps(mem, x); }
|
||||
|
||||
static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_ps(x, y); }
|
||||
static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_ps(x, y); }
|
||||
static halfreg_t max(halfreg_t x, halfreg_t y) {
|
||||
return _mm256_max_ps(x, y);
|
||||
}
|
||||
static halfreg_t min(halfreg_t x, halfreg_t y) {
|
||||
return _mm256_min_ps(x, y);
|
||||
}
|
||||
static reg_t reverse(reg_t zmm) {
|
||||
const auto rev_index = _mm512_set_epi32(NETWORK_32BIT_5);
|
||||
return permutexvar(rev_index, zmm);
|
||||
}
|
||||
static reg_t sort_vec(reg_t x) {
|
||||
return sort_zmm_32bit<zmm_vector<type_t>>(x);
|
||||
}
|
||||
static reg_t cast_from(__m512i v) { return _mm512_castsi512_ps(v); }
|
||||
static __m512i cast_to(reg_t v) { return _mm512_castps_si512(v); }
|
||||
static int double_compressstore(type_t *left_addr, type_t *right_addr,
|
||||
opmask_t k, reg_t reg) {
|
||||
return avx512_double_compressstore<zmm_vector<type_t>>(
|
||||
left_addr, right_addr, k, reg);
|
||||
}
|
||||
};
|
||||
|
||||
/*
|
||||
* Assumes zmm is random and performs a full sorting network defined in
|
||||
* https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
|
||||
*/
|
||||
template <typename vtype, typename zmm_t = typename vtype::zmm_t>
|
||||
X86_SIMD_SORT_INLINE zmm_t sort_zmm_32bit(zmm_t zmm) {
|
||||
template <typename vtype, typename reg_t = typename vtype::reg_t>
|
||||
X86_SIMD_SORT_INLINE reg_t sort_zmm_32bit(reg_t zmm) {
|
||||
zmm = cmp_merge<vtype>(
|
||||
zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm), 0xAAAA);
|
||||
zmm = cmp_merge<vtype>(
|
||||
@@ -193,249 +267,71 @@ X86_SIMD_SORT_INLINE zmm_t sort_zmm_32bit(zmm_t zmm) {
|
||||
return zmm;
|
||||
}
|
||||
|
||||
// Assumes zmm is bitonic and performs a recursive half cleaner
|
||||
template <typename vtype, typename zmm_t = typename vtype::zmm_t>
|
||||
X86_SIMD_SORT_INLINE zmm_t bitonic_merge_zmm_32bit(zmm_t zmm) {
|
||||
// 1) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc ..
|
||||
zmm = cmp_merge<vtype>(
|
||||
zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_7), zmm),
|
||||
0xFF00);
|
||||
// 2) half_cleaner[8]: compare 1-5, 2-6, 3-7 etc ..
|
||||
zmm = cmp_merge<vtype>(
|
||||
zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_6), zmm),
|
||||
0xF0F0);
|
||||
// 3) half_cleaner[4]
|
||||
zmm = cmp_merge<vtype>(
|
||||
zmm, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(zmm), 0xCCCC);
|
||||
// 3) half_cleaner[1]
|
||||
zmm = cmp_merge<vtype>(
|
||||
zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm), 0xAAAA);
|
||||
return zmm;
|
||||
}
|
||||
struct avx512_32bit_swizzle_ops {
|
||||
template <typename vtype, int scale>
|
||||
X86_SIMD_SORT_INLINE typename vtype::reg_t swap_n(
|
||||
typename vtype::reg_t reg) {
|
||||
__m512i v = vtype::cast_to(reg);
|
||||
|
||||
// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner
|
||||
template <typename vtype, typename zmm_t = typename vtype::zmm_t>
|
||||
X86_SIMD_SORT_INLINE void bitonic_merge_two_zmm_32bit(zmm_t *zmm1,
|
||||
zmm_t *zmm2) {
|
||||
// 1) First step of a merging network: coex of zmm1 and zmm2 reversed
|
||||
*zmm2 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), *zmm2);
|
||||
zmm_t zmm3 = vtype::min(*zmm1, *zmm2);
|
||||
zmm_t zmm4 = vtype::max(*zmm1, *zmm2);
|
||||
// 2) Recursive half cleaner for each
|
||||
*zmm1 = bitonic_merge_zmm_32bit<vtype>(zmm3);
|
||||
*zmm2 = bitonic_merge_zmm_32bit<vtype>(zmm4);
|
||||
}
|
||||
if constexpr (scale == 2) {
|
||||
v = _mm512_shuffle_epi32(v, (_MM_PERM_ENUM)0b10110001);
|
||||
} else if constexpr (scale == 4) {
|
||||
v = _mm512_shuffle_epi32(v, (_MM_PERM_ENUM)0b01001110);
|
||||
} else if constexpr (scale == 8) {
|
||||
v = _mm512_shuffle_i64x2(v, v, 0b10110001);
|
||||
} else if constexpr (scale == 16) {
|
||||
v = _mm512_shuffle_i64x2(v, v, 0b01001110);
|
||||
} else {
|
||||
static_assert(scale == -1, "should not be reached");
|
||||
}
|
||||
|
||||
// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive
|
||||
// half cleaner
|
||||
template <typename vtype, typename zmm_t = typename vtype::zmm_t>
|
||||
X86_SIMD_SORT_INLINE void bitonic_merge_four_zmm_32bit(zmm_t *zmm) {
|
||||
zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[2]);
|
||||
zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[3]);
|
||||
zmm_t zmm_t1 = vtype::min(zmm[0], zmm3r);
|
||||
zmm_t zmm_t2 = vtype::min(zmm[1], zmm2r);
|
||||
zmm_t zmm_t3 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5),
|
||||
vtype::max(zmm[1], zmm2r));
|
||||
zmm_t zmm_t4 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5),
|
||||
vtype::max(zmm[0], zmm3r));
|
||||
zmm_t zmm0 = vtype::min(zmm_t1, zmm_t2);
|
||||
zmm_t zmm1 = vtype::max(zmm_t1, zmm_t2);
|
||||
zmm_t zmm2 = vtype::min(zmm_t3, zmm_t4);
|
||||
zmm_t zmm3 = vtype::max(zmm_t3, zmm_t4);
|
||||
zmm[0] = bitonic_merge_zmm_32bit<vtype>(zmm0);
|
||||
zmm[1] = bitonic_merge_zmm_32bit<vtype>(zmm1);
|
||||
zmm[2] = bitonic_merge_zmm_32bit<vtype>(zmm2);
|
||||
zmm[3] = bitonic_merge_zmm_32bit<vtype>(zmm3);
|
||||
}
|
||||
|
||||
template <typename vtype, typename zmm_t = typename vtype::zmm_t>
|
||||
X86_SIMD_SORT_INLINE void bitonic_merge_eight_zmm_32bit(zmm_t *zmm) {
|
||||
zmm_t zmm4r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[4]);
|
||||
zmm_t zmm5r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[5]);
|
||||
zmm_t zmm6r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[6]);
|
||||
zmm_t zmm7r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[7]);
|
||||
zmm_t zmm_t1 = vtype::min(zmm[0], zmm7r);
|
||||
zmm_t zmm_t2 = vtype::min(zmm[1], zmm6r);
|
||||
zmm_t zmm_t3 = vtype::min(zmm[2], zmm5r);
|
||||
zmm_t zmm_t4 = vtype::min(zmm[3], zmm4r);
|
||||
zmm_t zmm_t5 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5),
|
||||
vtype::max(zmm[3], zmm4r));
|
||||
zmm_t zmm_t6 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5),
|
||||
vtype::max(zmm[2], zmm5r));
|
||||
zmm_t zmm_t7 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5),
|
||||
vtype::max(zmm[1], zmm6r));
|
||||
zmm_t zmm_t8 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5),
|
||||
vtype::max(zmm[0], zmm7r));
|
||||
COEX<vtype>(zmm_t1, zmm_t3);
|
||||
COEX<vtype>(zmm_t2, zmm_t4);
|
||||
COEX<vtype>(zmm_t5, zmm_t7);
|
||||
COEX<vtype>(zmm_t6, zmm_t8);
|
||||
COEX<vtype>(zmm_t1, zmm_t2);
|
||||
COEX<vtype>(zmm_t3, zmm_t4);
|
||||
COEX<vtype>(zmm_t5, zmm_t6);
|
||||
COEX<vtype>(zmm_t7, zmm_t8);
|
||||
zmm[0] = bitonic_merge_zmm_32bit<vtype>(zmm_t1);
|
||||
zmm[1] = bitonic_merge_zmm_32bit<vtype>(zmm_t2);
|
||||
zmm[2] = bitonic_merge_zmm_32bit<vtype>(zmm_t3);
|
||||
zmm[3] = bitonic_merge_zmm_32bit<vtype>(zmm_t4);
|
||||
zmm[4] = bitonic_merge_zmm_32bit<vtype>(zmm_t5);
|
||||
zmm[5] = bitonic_merge_zmm_32bit<vtype>(zmm_t6);
|
||||
zmm[6] = bitonic_merge_zmm_32bit<vtype>(zmm_t7);
|
||||
zmm[7] = bitonic_merge_zmm_32bit<vtype>(zmm_t8);
|
||||
}
|
||||
|
||||
template <typename vtype, typename type_t>
|
||||
X86_SIMD_SORT_INLINE void sort_16_32bit(type_t *arr, int32_t N) {
|
||||
typename vtype::opmask_t load_mask = (0x0001 << N) - 0x0001;
|
||||
typename vtype::zmm_t zmm =
|
||||
vtype::mask_loadu(vtype::zmm_max(), load_mask, arr);
|
||||
vtype::mask_storeu(arr, load_mask, sort_zmm_32bit<vtype>(zmm));
|
||||
}
|
||||
|
||||
template <typename vtype, typename type_t>
|
||||
X86_SIMD_SORT_INLINE void sort_32_32bit(type_t *arr, int32_t N) {
|
||||
if (N <= 16) {
|
||||
sort_16_32bit<vtype>(arr, N);
|
||||
return;
|
||||
}
|
||||
using zmm_t = typename vtype::zmm_t;
|
||||
zmm_t zmm1 = vtype::loadu(arr);
|
||||
typename vtype::opmask_t load_mask = (0x0001 << (N - 16)) - 0x0001;
|
||||
zmm_t zmm2 = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr + 16);
|
||||
zmm1 = sort_zmm_32bit<vtype>(zmm1);
|
||||
zmm2 = sort_zmm_32bit<vtype>(zmm2);
|
||||
bitonic_merge_two_zmm_32bit<vtype>(&zmm1, &zmm2);
|
||||
vtype::storeu(arr, zmm1);
|
||||
vtype::mask_storeu(arr + 16, load_mask, zmm2);
|
||||
}
|
||||
|
||||
template <typename vtype, typename type_t>
|
||||
X86_SIMD_SORT_INLINE void sort_64_32bit(type_t *arr, int32_t N) {
|
||||
if (N <= 32) {
|
||||
sort_32_32bit<vtype>(arr, N);
|
||||
return;
|
||||
}
|
||||
using zmm_t = typename vtype::zmm_t;
|
||||
using opmask_t = typename vtype::opmask_t;
|
||||
zmm_t zmm[4];
|
||||
zmm[0] = vtype::loadu(arr);
|
||||
zmm[1] = vtype::loadu(arr + 16);
|
||||
opmask_t load_mask1 = 0xFFFF, load_mask2 = 0xFFFF;
|
||||
uint64_t combined_mask = (0x1ull << (N - 32)) - 0x1ull;
|
||||
load_mask1 &= combined_mask & 0xFFFF;
|
||||
load_mask2 &= (combined_mask >> 16) & 0xFFFF;
|
||||
zmm[2] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 32);
|
||||
zmm[3] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 48);
|
||||
zmm[0] = sort_zmm_32bit<vtype>(zmm[0]);
|
||||
zmm[1] = sort_zmm_32bit<vtype>(zmm[1]);
|
||||
zmm[2] = sort_zmm_32bit<vtype>(zmm[2]);
|
||||
zmm[3] = sort_zmm_32bit<vtype>(zmm[3]);
|
||||
bitonic_merge_two_zmm_32bit<vtype>(&zmm[0], &zmm[1]);
|
||||
bitonic_merge_two_zmm_32bit<vtype>(&zmm[2], &zmm[3]);
|
||||
bitonic_merge_four_zmm_32bit<vtype>(zmm);
|
||||
vtype::storeu(arr, zmm[0]);
|
||||
vtype::storeu(arr + 16, zmm[1]);
|
||||
vtype::mask_storeu(arr + 32, load_mask1, zmm[2]);
|
||||
vtype::mask_storeu(arr + 48, load_mask2, zmm[3]);
|
||||
}
|
||||
|
||||
template <typename vtype, typename type_t>
|
||||
X86_SIMD_SORT_INLINE void sort_128_32bit(type_t *arr, int32_t N) {
|
||||
if (N <= 64) {
|
||||
sort_64_32bit<vtype>(arr, N);
|
||||
return;
|
||||
}
|
||||
using zmm_t = typename vtype::zmm_t;
|
||||
using opmask_t = typename vtype::opmask_t;
|
||||
zmm_t zmm[8];
|
||||
zmm[0] = vtype::loadu(arr);
|
||||
zmm[1] = vtype::loadu(arr + 16);
|
||||
zmm[2] = vtype::loadu(arr + 32);
|
||||
zmm[3] = vtype::loadu(arr + 48);
|
||||
zmm[0] = sort_zmm_32bit<vtype>(zmm[0]);
|
||||
zmm[1] = sort_zmm_32bit<vtype>(zmm[1]);
|
||||
zmm[2] = sort_zmm_32bit<vtype>(zmm[2]);
|
||||
zmm[3] = sort_zmm_32bit<vtype>(zmm[3]);
|
||||
opmask_t load_mask1 = 0xFFFF, load_mask2 = 0xFFFF;
|
||||
opmask_t load_mask3 = 0xFFFF, load_mask4 = 0xFFFF;
|
||||
if (N != 128) {
|
||||
uint64_t combined_mask = (0x1ull << (N - 64)) - 0x1ull;
|
||||
load_mask1 &= combined_mask & 0xFFFF;
|
||||
load_mask2 &= (combined_mask >> 16) & 0xFFFF;
|
||||
load_mask3 &= (combined_mask >> 32) & 0xFFFF;
|
||||
load_mask4 &= (combined_mask >> 48) & 0xFFFF;
|
||||
}
|
||||
zmm[4] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 64);
|
||||
zmm[5] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 80);
|
||||
zmm[6] = vtype::mask_loadu(vtype::zmm_max(), load_mask3, arr + 96);
|
||||
zmm[7] = vtype::mask_loadu(vtype::zmm_max(), load_mask4, arr + 112);
|
||||
zmm[4] = sort_zmm_32bit<vtype>(zmm[4]);
|
||||
zmm[5] = sort_zmm_32bit<vtype>(zmm[5]);
|
||||
zmm[6] = sort_zmm_32bit<vtype>(zmm[6]);
|
||||
zmm[7] = sort_zmm_32bit<vtype>(zmm[7]);
|
||||
bitonic_merge_two_zmm_32bit<vtype>(&zmm[0], &zmm[1]);
|
||||
bitonic_merge_two_zmm_32bit<vtype>(&zmm[2], &zmm[3]);
|
||||
bitonic_merge_two_zmm_32bit<vtype>(&zmm[4], &zmm[5]);
|
||||
bitonic_merge_two_zmm_32bit<vtype>(&zmm[6], &zmm[7]);
|
||||
bitonic_merge_four_zmm_32bit<vtype>(zmm);
|
||||
bitonic_merge_four_zmm_32bit<vtype>(zmm + 4);
|
||||
bitonic_merge_eight_zmm_32bit<vtype>(zmm);
|
||||
vtype::storeu(arr, zmm[0]);
|
||||
vtype::storeu(arr + 16, zmm[1]);
|
||||
vtype::storeu(arr + 32, zmm[2]);
|
||||
vtype::storeu(arr + 48, zmm[3]);
|
||||
vtype::mask_storeu(arr + 64, load_mask1, zmm[4]);
|
||||
vtype::mask_storeu(arr + 80, load_mask2, zmm[5]);
|
||||
vtype::mask_storeu(arr + 96, load_mask3, zmm[6]);
|
||||
vtype::mask_storeu(arr + 112, load_mask4, zmm[7]);
|
||||
}
|
||||
|
||||
|
||||
template <typename vtype, typename type_t>
|
||||
static void qsort_32bit_(type_t *arr, int64_t left, int64_t right,
|
||||
int64_t max_iters) {
|
||||
/*
|
||||
* Resort to std::sort if quicksort isnt making any progress
|
||||
*/
|
||||
if (max_iters <= 0) {
|
||||
std::sort(arr + left, arr + right + 1);
|
||||
return;
|
||||
}
|
||||
/*
|
||||
* Base case: use bitonic networks to sort arrays <= 128
|
||||
*/
|
||||
if (right + 1 - left <= 128) {
|
||||
sort_128_32bit<vtype>(arr + left, (int32_t)(right + 1 - left));
|
||||
return;
|
||||
return vtype::cast_from(v);
|
||||
}
|
||||
|
||||
type_t pivot = get_pivot_scalar<type_t>(arr, left, right);
|
||||
type_t smallest = vtype::type_max();
|
||||
type_t biggest = vtype::type_min();
|
||||
int64_t pivot_index = partition_avx512_unrolled<vtype, 2>(
|
||||
arr, left, right + 1, pivot, &smallest, &biggest, false);
|
||||
if (pivot != smallest)
|
||||
qsort_32bit_<vtype>(arr, left, pivot_index - 1, max_iters - 1);
|
||||
if (pivot != biggest)
|
||||
qsort_32bit_<vtype>(arr, pivot_index, right, max_iters - 1);
|
||||
}
|
||||
template <typename vtype, int scale>
|
||||
X86_SIMD_SORT_INLINE typename vtype::reg_t reverse_n(
|
||||
typename vtype::reg_t reg) {
|
||||
__m512i v = vtype::cast_to(reg);
|
||||
|
||||
template <>
|
||||
void inline avx512_qsort<int32_t>(int32_t *arr, int64_t fromIndex, int64_t toIndex) {
|
||||
int64_t arrsize = toIndex - fromIndex;
|
||||
if (arrsize > 1) {
|
||||
qsort_32bit_<zmm_vector<int32_t>, int32_t>(arr, fromIndex, toIndex - 1,
|
||||
2 * (int64_t)log2(arrsize));
|
||||
}
|
||||
}
|
||||
if constexpr (scale == 2) {
|
||||
return swap_n<vtype, 2>(reg);
|
||||
} else if constexpr (scale == 4) {
|
||||
__m512i mask = _mm512_set_epi32(12, 13, 14, 15, 8, 9, 10, 11, 4, 5,
|
||||
6, 7, 0, 1, 2, 3);
|
||||
v = _mm512_permutexvar_epi32(mask, v);
|
||||
} else if constexpr (scale == 8) {
|
||||
__m512i mask = _mm512_set_epi32(8, 9, 10, 11, 12, 13, 14, 15, 0, 1,
|
||||
2, 3, 4, 5, 6, 7);
|
||||
v = _mm512_permutexvar_epi32(mask, v);
|
||||
} else if constexpr (scale == 16) {
|
||||
return vtype::reverse(reg);
|
||||
} else {
|
||||
static_assert(scale == -1, "should not be reached");
|
||||
}
|
||||
|
||||
template <>
|
||||
void inline avx512_qsort<float>(float *arr, int64_t fromIndex, int64_t toIndex) {
|
||||
int64_t arrsize = toIndex - fromIndex;
|
||||
if (arrsize > 1) {
|
||||
qsort_32bit_<zmm_vector<float>, float>(arr, fromIndex, toIndex - 1,
|
||||
2 * (int64_t)log2(arrsize));
|
||||
return vtype::cast_from(v);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename vtype, int scale>
|
||||
X86_SIMD_SORT_INLINE typename vtype::reg_t merge_n(
|
||||
typename vtype::reg_t reg, typename vtype::reg_t other) {
|
||||
__m512i v1 = vtype::cast_to(reg);
|
||||
__m512i v2 = vtype::cast_to(other);
|
||||
|
||||
if constexpr (scale == 2) {
|
||||
v1 = _mm512_mask_blend_epi32(0b0101010101010101, v1, v2);
|
||||
} else if constexpr (scale == 4) {
|
||||
v1 = _mm512_mask_blend_epi32(0b0011001100110011, v1, v2);
|
||||
} else if constexpr (scale == 8) {
|
||||
v1 = _mm512_mask_blend_epi32(0b0000111100001111, v1, v2);
|
||||
} else if constexpr (scale == 16) {
|
||||
v1 = _mm512_mask_blend_epi32(0b0000000011111111, v1, v2);
|
||||
} else {
|
||||
static_assert(scale == -1, "should not be reached");
|
||||
}
|
||||
|
||||
return vtype::cast_from(v1);
|
||||
}
|
||||
};
|
||||
|
||||
#endif // AVX512_QSORT_32BIT
|
||||
|
||||
@@ -1,212 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*
|
||||
*/
|
||||
|
||||
// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort)
|
||||
|
||||
#ifndef AVX512_64BIT_COMMON
|
||||
#define AVX512_64BIT_COMMON
|
||||
#include "avx512-common-qsort.h"
|
||||
|
||||
/*
|
||||
* Constants used in sorting 8 elements in a ZMM registers. Based on Bitonic
|
||||
* sorting network (see
|
||||
* https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg)
|
||||
*/
|
||||
// ZMM 7, 6, 5, 4, 3, 2, 1, 0
|
||||
#define NETWORK_64BIT_1 4, 5, 6, 7, 0, 1, 2, 3
|
||||
#define NETWORK_64BIT_2 0, 1, 2, 3, 4, 5, 6, 7
|
||||
#define NETWORK_64BIT_3 5, 4, 7, 6, 1, 0, 3, 2
|
||||
#define NETWORK_64BIT_4 3, 2, 1, 0, 7, 6, 5, 4
|
||||
|
||||
template <>
|
||||
struct zmm_vector<int64_t> {
|
||||
using type_t = int64_t;
|
||||
using zmm_t = __m512i;
|
||||
using zmmi_t = __m512i;
|
||||
using ymm_t = __m512i;
|
||||
using opmask_t = __mmask8;
|
||||
static const uint8_t numlanes = 8;
|
||||
|
||||
static type_t type_max() { return X86_SIMD_SORT_MAX_INT64; }
|
||||
static type_t type_min() { return X86_SIMD_SORT_MIN_INT64; }
|
||||
static zmm_t zmm_max() {
|
||||
return _mm512_set1_epi64(type_max());
|
||||
} // TODO: this should broadcast bits as is?
|
||||
|
||||
static zmmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7,
|
||||
int v8) {
|
||||
return _mm512_set_epi64(v1, v2, v3, v4, v5, v6, v7, v8);
|
||||
}
|
||||
static opmask_t kxor_opmask(opmask_t x, opmask_t y) {
|
||||
return _kxor_mask8(x, y);
|
||||
}
|
||||
static opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); }
|
||||
static opmask_t le(zmm_t x, zmm_t y) {
|
||||
return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_LE);
|
||||
}
|
||||
static opmask_t ge(zmm_t x, zmm_t y) {
|
||||
return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_NLT);
|
||||
}
|
||||
static opmask_t gt(zmm_t x, zmm_t y) {
|
||||
return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_GT);
|
||||
}
|
||||
static opmask_t eq(zmm_t x, zmm_t y) {
|
||||
return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_EQ);
|
||||
}
|
||||
template <int scale>
|
||||
static zmm_t mask_i64gather(zmm_t src, opmask_t mask, __m512i index,
|
||||
void const *base) {
|
||||
return _mm512_mask_i64gather_epi64(src, mask, index, base, scale);
|
||||
}
|
||||
template <int scale>
|
||||
static zmm_t i64gather(__m512i index, void const *base) {
|
||||
return _mm512_i64gather_epi64(index, base, scale);
|
||||
}
|
||||
static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); }
|
||||
static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epi64(x, y); }
|
||||
static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) {
|
||||
return _mm512_mask_compressstoreu_epi64(mem, mask, x);
|
||||
}
|
||||
static zmm_t maskz_loadu(opmask_t mask, void const *mem) {
|
||||
return _mm512_maskz_loadu_epi64(mask, mem);
|
||||
}
|
||||
static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) {
|
||||
return _mm512_mask_loadu_epi64(x, mask, mem);
|
||||
}
|
||||
static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) {
|
||||
return _mm512_mask_mov_epi64(x, mask, y);
|
||||
}
|
||||
static void mask_storeu(void *mem, opmask_t mask, zmm_t x) {
|
||||
return _mm512_mask_storeu_epi64(mem, mask, x);
|
||||
}
|
||||
static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epi64(x, y); }
|
||||
static zmm_t permutexvar(__m512i idx, zmm_t zmm) {
|
||||
return _mm512_permutexvar_epi64(idx, zmm);
|
||||
}
|
||||
static type_t reducemax(zmm_t v) { return _mm512_reduce_max_epi64(v); }
|
||||
static type_t reducemin(zmm_t v) { return _mm512_reduce_min_epi64(v); }
|
||||
static zmm_t set1(type_t v) { return _mm512_set1_epi64(v); }
|
||||
template <uint8_t mask>
|
||||
static zmm_t shuffle(zmm_t zmm) {
|
||||
__m512d temp = _mm512_castsi512_pd(zmm);
|
||||
return _mm512_castpd_si512(
|
||||
_mm512_shuffle_pd(temp, temp, (_MM_PERM_ENUM)mask));
|
||||
}
|
||||
static void storeu(void *mem, zmm_t x) { _mm512_storeu_si512(mem, x); }
|
||||
};
|
||||
template <>
|
||||
struct zmm_vector<double> {
|
||||
using type_t = double;
|
||||
using zmm_t = __m512d;
|
||||
using zmmi_t = __m512i;
|
||||
using ymm_t = __m512d;
|
||||
using opmask_t = __mmask8;
|
||||
static const uint8_t numlanes = 8;
|
||||
|
||||
static type_t type_max() { return X86_SIMD_SORT_INFINITY; }
|
||||
static type_t type_min() { return -X86_SIMD_SORT_INFINITY; }
|
||||
static zmm_t zmm_max() { return _mm512_set1_pd(type_max()); }
|
||||
|
||||
static zmmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7,
|
||||
int v8) {
|
||||
return _mm512_set_epi64(v1, v2, v3, v4, v5, v6, v7, v8);
|
||||
}
|
||||
|
||||
static zmm_t maskz_loadu(opmask_t mask, void const *mem) {
|
||||
return _mm512_maskz_loadu_pd(mask, mem);
|
||||
}
|
||||
static opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); }
|
||||
static opmask_t ge(zmm_t x, zmm_t y) {
|
||||
return _mm512_cmp_pd_mask(x, y, _CMP_GE_OQ);
|
||||
}
|
||||
static opmask_t gt(zmm_t x, zmm_t y) {
|
||||
return _mm512_cmp_pd_mask(x, y, _CMP_GT_OQ);
|
||||
}
|
||||
static opmask_t eq(zmm_t x, zmm_t y) {
|
||||
return _mm512_cmp_pd_mask(x, y, _CMP_EQ_OQ);
|
||||
}
|
||||
template <int type>
|
||||
static opmask_t fpclass(zmm_t x) {
|
||||
return _mm512_fpclass_pd_mask(x, type);
|
||||
}
|
||||
template <int scale>
|
||||
static zmm_t mask_i64gather(zmm_t src, opmask_t mask, __m512i index,
|
||||
void const *base) {
|
||||
return _mm512_mask_i64gather_pd(src, mask, index, base, scale);
|
||||
}
|
||||
template <int scale>
|
||||
static zmm_t i64gather(__m512i index, void const *base) {
|
||||
return _mm512_i64gather_pd(index, base, scale);
|
||||
}
|
||||
static zmm_t loadu(void const *mem) { return _mm512_loadu_pd(mem); }
|
||||
static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_pd(x, y); }
|
||||
static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) {
|
||||
return _mm512_mask_compressstoreu_pd(mem, mask, x);
|
||||
}
|
||||
static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) {
|
||||
return _mm512_mask_loadu_pd(x, mask, mem);
|
||||
}
|
||||
static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) {
|
||||
return _mm512_mask_mov_pd(x, mask, y);
|
||||
}
|
||||
static void mask_storeu(void *mem, opmask_t mask, zmm_t x) {
|
||||
return _mm512_mask_storeu_pd(mem, mask, x);
|
||||
}
|
||||
static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_pd(x, y); }
|
||||
static zmm_t permutexvar(__m512i idx, zmm_t zmm) {
|
||||
return _mm512_permutexvar_pd(idx, zmm);
|
||||
}
|
||||
static type_t reducemax(zmm_t v) { return _mm512_reduce_max_pd(v); }
|
||||
static type_t reducemin(zmm_t v) { return _mm512_reduce_min_pd(v); }
|
||||
static zmm_t set1(type_t v) { return _mm512_set1_pd(v); }
|
||||
template <uint8_t mask>
|
||||
static zmm_t shuffle(zmm_t zmm) {
|
||||
return _mm512_shuffle_pd(zmm, zmm, (_MM_PERM_ENUM)mask);
|
||||
}
|
||||
static void storeu(void *mem, zmm_t x) { _mm512_storeu_pd(mem, x); }
|
||||
};
|
||||
|
||||
/*
|
||||
* Assumes zmm is random and performs a full sorting network defined in
|
||||
* https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
|
||||
*/
|
||||
template <typename vtype, typename zmm_t = typename vtype::zmm_t>
|
||||
X86_SIMD_SORT_INLINE zmm_t sort_zmm_64bit(zmm_t zmm) {
|
||||
const typename vtype::zmmi_t rev_index = vtype::seti(NETWORK_64BIT_2);
|
||||
zmm = cmp_merge<vtype>(
|
||||
zmm, vtype::template shuffle<SHUFFLE_MASK(1, 1, 1, 1)>(zmm), 0xAA);
|
||||
zmm = cmp_merge<vtype>(
|
||||
zmm, vtype::permutexvar(vtype::seti(NETWORK_64BIT_1), zmm), 0xCC);
|
||||
zmm = cmp_merge<vtype>(
|
||||
zmm, vtype::template shuffle<SHUFFLE_MASK(1, 1, 1, 1)>(zmm), 0xAA);
|
||||
zmm = cmp_merge<vtype>(zmm, vtype::permutexvar(rev_index, zmm), 0xF0);
|
||||
zmm = cmp_merge<vtype>(
|
||||
zmm, vtype::permutexvar(vtype::seti(NETWORK_64BIT_3), zmm), 0xCC);
|
||||
zmm = cmp_merge<vtype>(
|
||||
zmm, vtype::template shuffle<SHUFFLE_MASK(1, 1, 1, 1)>(zmm), 0xAA);
|
||||
return zmm;
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
File diff suppressed because it is too large
@@ -1,483 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
|
||||
* Copyright (c) 2021 Serge Sans Paille. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*
|
||||
*/
|
||||
|
||||
// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort)
|
||||
#ifndef AVX512_QSORT_COMMON
|
||||
#define AVX512_QSORT_COMMON
|
||||
|
||||
/*
 * Quicksort using AVX-512. The ideas and code are based on these two research
 * papers [1] and [2]. On a high level, the idea is to vectorize quicksort
 * partitioning using AVX-512 compressstore instructions. If the array size is
 * < 128, then use Bitonic sorting network implemented on 512-bit registers.
 * The precise network definitions depend on the dtype and are defined in
 * separate files: avx512-16bit-qsort.hpp, avx512-32bit-qsort.hpp and
 * avx512-64bit-qsort.hpp. Article [4] is a good resource for bitonic sorting
 * network. The core implementations of the vectorized qsort functions
 * avx512_qsort<T>(T*, int64_t) are modified versions of avx2 quicksort
 * presented in the paper [2] and source code associated with that paper [3].
 *
 * [1] Fast and Robust Vectorized In-Place Sorting of Primitive Types
 *     https://drops.dagstuhl.de/opus/volltexte/2021/13775/
 *
 * [2] A Novel Hybrid Quicksort Algorithm Vectorized using AVX-512 on Intel
 *     Skylake https://arxiv.org/pdf/1704.08579.pdf
 *
 * [3] https://github.com/simd-sorting/fast-and-robust: SPDX-License-Identifier:
 *     MIT
 *
 * [4]
 *     http://mitp-content-server.mit.edu:18180/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030
 *
 */
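
To make the partitioning idea in the comment concrete, here is a scalar sketch (illustration only, not part of the change) of one vectorized partition step: a block of lanes is compared against the pivot, lanes below the pivot are appended at the left store cursor and the remaining lanes are placed just before the right store cursor, which is what the AVX-512 code does per register with masked compress-store instructions.

#include <cstdint>

// Scalar analogue of partitioning one register's worth of elements
// (illustration only; mirrors partition_vec further down in this file).
template <typename T>
int32_t partition_block_scalar(T *arr, int64_t l_store, int64_t r_store_end,
                               const T *block, int lanes, T pivot) {
    int32_t ge_count = 0;
    for (int i = 0; i < lanes; i++) {
        if (block[i] >= pivot) ge_count++;
    }
    int64_t lo = l_store;                 // destination for elements < pivot
    int64_t hi = r_store_end - ge_count;  // destination for elements >= pivot
    for (int i = 0; i < lanes; i++) {
        if (block[i] >= pivot) arr[hi++] = block[i];
        else                   arr[lo++] = block[i];
    }
    return ge_count;   // caller advances l_store by lanes - ge_count
                       // and moves r_store_end down by ge_count
}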
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
|
||||
/*
|
||||
Workaround for the bug in GCC12 (that was fixed in GCC 12.3.1).
|
||||
More details are available at: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
|
||||
*/
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
|
||||
#pragma GCC diagnostic ignored "-Wuninitialized"
|
||||
#include <immintrin.h>
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
#define X86_SIMD_SORT_INFINITY std::numeric_limits<double>::infinity()
|
||||
#define X86_SIMD_SORT_INFINITYF std::numeric_limits<float>::infinity()
|
||||
#define X86_SIMD_SORT_INFINITYH 0x7c00
|
||||
#define X86_SIMD_SORT_NEGINFINITYH 0xfc00
|
||||
#define X86_SIMD_SORT_MAX_UINT16 std::numeric_limits<uint16_t>::max()
|
||||
#define X86_SIMD_SORT_MAX_INT16 std::numeric_limits<int16_t>::max()
|
||||
#define X86_SIMD_SORT_MIN_INT16 std::numeric_limits<int16_t>::min()
|
||||
#define X86_SIMD_SORT_MAX_UINT32 std::numeric_limits<uint32_t>::max()
|
||||
#define X86_SIMD_SORT_MAX_INT32 std::numeric_limits<int32_t>::max()
|
||||
#define X86_SIMD_SORT_MIN_INT32 std::numeric_limits<int32_t>::min()
|
||||
#define X86_SIMD_SORT_MAX_UINT64 std::numeric_limits<uint64_t>::max()
|
||||
#define X86_SIMD_SORT_MAX_INT64 std::numeric_limits<int64_t>::max()
|
||||
#define X86_SIMD_SORT_MIN_INT64 std::numeric_limits<int64_t>::min()
|
||||
#define ZMM_MAX_DOUBLE _mm512_set1_pd(X86_SIMD_SORT_INFINITY)
|
||||
#define ZMM_MAX_UINT64 _mm512_set1_epi64(X86_SIMD_SORT_MAX_UINT64)
|
||||
#define ZMM_MAX_INT64 _mm512_set1_epi64(X86_SIMD_SORT_MAX_INT64)
|
||||
#define ZMM_MAX_FLOAT _mm512_set1_ps(X86_SIMD_SORT_INFINITYF)
|
||||
#define ZMM_MAX_UINT _mm512_set1_epi32(X86_SIMD_SORT_MAX_UINT32)
|
||||
#define ZMM_MAX_INT _mm512_set1_epi32(X86_SIMD_SORT_MAX_INT32)
|
||||
#define ZMM_MAX_HALF _mm512_set1_epi16(X86_SIMD_SORT_INFINITYH)
|
||||
#define YMM_MAX_HALF _mm256_set1_epi16(X86_SIMD_SORT_INFINITYH)
|
||||
#define ZMM_MAX_UINT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_UINT16)
|
||||
#define ZMM_MAX_INT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_INT16)
|
||||
#define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define X86_SIMD_SORT_INLINE static inline
|
||||
#define X86_SIMD_SORT_FINLINE static __forceinline
|
||||
#elif defined(__CYGWIN__)
|
||||
/*
|
||||
* Force inline in cygwin to work around a compiler bug. See
|
||||
* https://github.com/numpy/numpy/pull/22315#issuecomment-1267757584
|
||||
*/
|
||||
#define X86_SIMD_SORT_INLINE static __attribute__((always_inline))
|
||||
#define X86_SIMD_SORT_FINLINE static __attribute__((always_inline))
|
||||
#elif defined(__GNUC__)
|
||||
#define X86_SIMD_SORT_INLINE static inline
|
||||
#define X86_SIMD_SORT_FINLINE static __attribute__((always_inline))
|
||||
#else
|
||||
#define X86_SIMD_SORT_INLINE static
|
||||
#define X86_SIMD_SORT_FINLINE static
|
||||
#endif
|
||||
|
||||
#define LIKELY(x) __builtin_expect((x), 1)
|
||||
#define UNLIKELY(x) __builtin_expect((x), 0)
|
||||
|
||||
template <typename type>
|
||||
struct zmm_vector;
|
||||
|
||||
template <typename type>
|
||||
struct ymm_vector;
|
||||
|
||||
// Regular quicksort routines:
|
||||
template <typename T>
|
||||
void avx512_qsort(T *arr, int64_t arrsize);
|
||||
|
||||
template <typename T>
|
||||
void inline avx512_qsort(T *arr, int64_t from_index, int64_t to_index);
|
||||
|
||||
template <typename T>
|
||||
bool is_a_nan(T elem) {
|
||||
return std::isnan(elem);
|
||||
}
|
||||
|
||||
template <typename T>
X86_SIMD_SORT_INLINE T get_pivot_scalar(T *arr, const int64_t left, const int64_t right) {
    // median of 8 equally spaced elements
    int64_t NUM_ELEMENTS = 8;
    int64_t MID = NUM_ELEMENTS / 2;
    int64_t size = (right - left) / NUM_ELEMENTS;
    T temp[NUM_ELEMENTS];
    for (int64_t i = 0; i < NUM_ELEMENTS; i++) temp[i] = arr[left + (i * size)];
    std::sort(temp, temp + NUM_ELEMENTS);
    return temp[MID];
}
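
A worked example of the sampling (illustration only): for left = 0 and right = 800 the stride is 100, so the candidates are arr[0], arr[100], ..., arr[700], and the returned pivot is the fifth smallest of those eight samples (temp[4]). A standalone sketch of the same sampling:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Mirrors the median-of-eight sampling in get_pivot_scalar (illustration only).
int main() {
    int32_t arr[800];
    for (int i = 0; i < 800; i++) arr[i] = (i * 389) % 800;   // arbitrary data
    int64_t left = 0, right = 800;
    int64_t size = (right - left) / 8;
    int32_t temp[8];
    for (int64_t i = 0; i < 8; i++) temp[i] = arr[left + i * size];
    std::sort(temp, temp + 8);
    printf("pivot = %d\n", temp[4]);   // fifth smallest sample
    return 0;
}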

template <typename vtype, typename T = typename vtype::type_t>
bool comparison_func_ge(const T &a, const T &b) {
    return a < b;
}

template <typename vtype, typename T = typename vtype::type_t>
bool comparison_func_gt(const T &a, const T &b) {
    return a <= b;
}

/*
 * COEX == Compare and Exchange two registers by swapping min and max values
 */
template <typename vtype, typename mm_t>
static void COEX(mm_t &a, mm_t &b) {
    mm_t temp = a;
    a = vtype::min(a, b);
    b = vtype::max(temp, b);
}
template <typename vtype, typename zmm_t = typename vtype::zmm_t,
          typename opmask_t = typename vtype::opmask_t>
static inline zmm_t cmp_merge(zmm_t in1, zmm_t in2, opmask_t mask) {
    zmm_t min = vtype::min(in2, in1);
    zmm_t max = vtype::max(in2, in1);
    return vtype::mask_mov(min, mask, max); // 0 -> min, 1 -> max
}
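
A scalar illustration (not part of the change) of cmp_merge, the compare-exchange step that the bitonic sorting networks are built from: for each lane, a 0 bit in the mask keeps the minimum of the two inputs and a 1 bit keeps the maximum.

#include <algorithm>
#include <cstdio>

// Scalar analogue of cmp_merge on 4 lanes (illustration only).
int main() {
    int in1[4] = {5, 1, 7, 2}, in2[4] = {3, 4, 6, 8}, out[4];
    unsigned mask = 0b1010;   // lanes 1 and 3 keep the max, lanes 0 and 2 keep the min
    for (int i = 0; i < 4; i++)
        out[i] = ((mask >> i) & 1) ? std::max(in1[i], in2[i]) : std::min(in1[i], in2[i]);
    for (int v : out) printf("%d ", v);   // prints: 3 4 6 8
    printf("\n");
    return 0;
}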

/*
 * Partition one ZMM register based on the pivot and return the
 * number of elements that are greater than or equal to the pivot.
 */
template <typename vtype, typename type_t, typename zmm_t>
static inline int32_t partition_vec(type_t *arr, int64_t left, int64_t right,
                                    const zmm_t curr_vec, const zmm_t pivot_vec,
                                    zmm_t *smallest_vec, zmm_t *biggest_vec, bool use_gt) {
    /* which elements are larger than or equal to the pivot */
    typename vtype::opmask_t mask;
    if (use_gt) mask = vtype::gt(curr_vec, pivot_vec);
    else mask = vtype::ge(curr_vec, pivot_vec);
    int32_t amount_ge_pivot = _mm_popcnt_u32((int32_t)mask);
    vtype::mask_compressstoreu(arr + left, vtype::knot_opmask(mask),
                               curr_vec);
    vtype::mask_compressstoreu(arr + right - amount_ge_pivot, mask,
                               curr_vec);
    *smallest_vec = vtype::min(curr_vec, *smallest_vec);
    *biggest_vec = vtype::max(curr_vec, *biggest_vec);
    return amount_ge_pivot;
}
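
A worked trace of one call (illustration only), using an eight-lane vector type and pivot 10 under the default ge comparison:

// curr_vec  = {12, 3, 10, 7, 15, 1, 10, 4}   (lane 0 listed first), pivot_vec = set1(10)
// mask (ge) = 0b01010101                     -> amount_ge_pivot = 4
// lanes <  pivot: {3, 7, 1, 4}     compress-stored at arr + left
// lanes >= pivot: {12, 10, 15, 10} compress-stored at arr + right - 4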
|
||||
/*
|
||||
* Parition an array based on the pivot and returns the index of the
|
||||
* first element that is greater than or equal to the pivot.
|
||||
*/
|
||||
template <typename vtype, typename type_t>
|
||||
static inline int64_t partition_avx512(type_t *arr, int64_t left, int64_t right,
|
||||
type_t pivot, type_t *smallest,
|
||||
type_t *biggest, bool use_gt) {
|
||||
auto comparison_func = use_gt ? comparison_func_gt<vtype> : comparison_func_ge<vtype>;
|
||||
/* make array length divisible by vtype::numlanes , shortening the array */
|
||||
for (int32_t i = (right - left) % vtype::numlanes; i > 0; --i) {
|
||||
*smallest = std::min(*smallest, arr[left], comparison_func);
|
||||
*biggest = std::max(*biggest, arr[left], comparison_func);
|
||||
if (!comparison_func(arr[left], pivot)) {
|
||||
std::swap(arr[left], arr[--right]);
|
||||
} else {
|
||||
++left;
|
||||
}
|
||||
}
|
||||
|
||||
if (left == right)
|
||||
return left; /* less than vtype::numlanes elements in the array */
|
||||
|
||||
using zmm_t = typename vtype::zmm_t;
|
||||
zmm_t pivot_vec = vtype::set1(pivot);
|
||||
zmm_t min_vec = vtype::set1(*smallest);
|
||||
zmm_t max_vec = vtype::set1(*biggest);
|
||||
|
||||
if (right - left == vtype::numlanes) {
|
||||
zmm_t vec = vtype::loadu(arr + left);
|
||||
int32_t amount_ge_pivot =
|
||||
partition_vec<vtype>(arr, left, left + vtype::numlanes, vec,
|
||||
pivot_vec, &min_vec, &max_vec, use_gt);
|
||||
*smallest = vtype::reducemin(min_vec);
|
||||
*biggest = vtype::reducemax(max_vec);
|
||||
return left + (vtype::numlanes - amount_ge_pivot);
|
||||
}
|
||||
|
||||
// first and last vtype::numlanes values are partitioned at the end
|
||||
zmm_t vec_left = vtype::loadu(arr + left);
|
||||
zmm_t vec_right = vtype::loadu(arr + (right - vtype::numlanes));
|
||||
// store points of the vectors
|
||||
int64_t r_store = right - vtype::numlanes;
|
||||
int64_t l_store = left;
|
||||
// indices for loading the elements
|
||||
left += vtype::numlanes;
|
||||
right -= vtype::numlanes;
|
||||
while (right - left != 0) {
|
||||
zmm_t curr_vec;
|
||||
/*
|
||||
* if fewer elements are stored on the right side of the array,
|
||||
* then next elements are loaded from the right side,
|
||||
* otherwise from the left side
|
||||
*/
|
||||
if ((r_store + vtype::numlanes) - right < left - l_store) {
|
||||
right -= vtype::numlanes;
|
||||
curr_vec = vtype::loadu(arr + right);
|
||||
} else {
|
||||
curr_vec = vtype::loadu(arr + left);
|
||||
left += vtype::numlanes;
|
||||
}
|
||||
// partition the current vector and save it on both sides of the array
|
||||
int32_t amount_ge_pivot =
|
||||
partition_vec<vtype>(arr, l_store, r_store + vtype::numlanes,
|
||||
curr_vec, pivot_vec, &min_vec, &max_vec, use_gt);
|
||||
;
|
||||
r_store -= amount_ge_pivot;
|
||||
l_store += (vtype::numlanes - amount_ge_pivot);
|
||||
}
|
||||
|
||||
/* partition and save vec_left and vec_right */
|
||||
int32_t amount_ge_pivot =
|
||||
partition_vec<vtype>(arr, l_store, r_store + vtype::numlanes, vec_left,
|
||||
pivot_vec, &min_vec, &max_vec, use_gt);
|
||||
l_store += (vtype::numlanes - amount_ge_pivot);
|
||||
amount_ge_pivot =
|
||||
partition_vec<vtype>(arr, l_store, l_store + vtype::numlanes, vec_right,
|
||||
pivot_vec, &min_vec, &max_vec, use_gt);
|
||||
l_store += (vtype::numlanes - amount_ge_pivot);
|
||||
*smallest = vtype::reducemin(min_vec);
|
||||
*biggest = vtype::reducemax(max_vec);
|
||||
return l_store;
|
||||
}
|
||||

template <typename vtype, int num_unroll,
          typename type_t = typename vtype::type_t>
static inline int64_t partition_avx512_unrolled(type_t *arr, int64_t left,
                                                int64_t right, type_t pivot,
                                                type_t *smallest,
                                                type_t *biggest, bool use_gt) {
    if (right - left <= 2 * num_unroll * vtype::numlanes) {
        return partition_avx512<vtype>(arr, left, right, pivot, smallest,
                                       biggest, use_gt);
    }

    auto comparison_func = use_gt ? comparison_func_gt<vtype> : comparison_func_ge<vtype>;
    /* make array length divisible by num_unroll * vtype::numlanes, shortening
     * the array */
    for (int32_t i = ((right - left) % (num_unroll * vtype::numlanes)); i > 0;
         --i) {
        *smallest = std::min(*smallest, arr[left], comparison_func);
        *biggest = std::max(*biggest, arr[left], comparison_func);
        if (!comparison_func(arr[left], pivot)) {
            std::swap(arr[left], arr[--right]);
        } else {
            ++left;
        }
    }

    if (left == right)
        return left; /* less than vtype::numlanes elements in the array */

    using zmm_t = typename vtype::zmm_t;
    zmm_t pivot_vec = vtype::set1(pivot);
    zmm_t min_vec = vtype::set1(*smallest);
    zmm_t max_vec = vtype::set1(*biggest);

    // We now have at least 2 * num_unroll registers' worth of data to process:
    // the left and right num_unroll * vtype::numlanes values are partitioned at the end
    zmm_t vec_left[num_unroll], vec_right[num_unroll];
#pragma GCC unroll 8
    for (int ii = 0; ii < num_unroll; ++ii) {
        vec_left[ii] = vtype::loadu(arr + left + vtype::numlanes * ii);
        vec_right[ii] =
            vtype::loadu(arr + (right - vtype::numlanes * (num_unroll - ii)));
    }
    // store points of the vectors
    int64_t r_store = right - vtype::numlanes;
    int64_t l_store = left;
    // indices for loading the elements
    left += num_unroll * vtype::numlanes;
    right -= num_unroll * vtype::numlanes;
    while (right - left != 0) {
        zmm_t curr_vec[num_unroll];
        /*
         * if fewer elements are stored on the right side of the array,
         * then the next elements are loaded from the right side,
         * otherwise from the left side
         */
        if ((r_store + vtype::numlanes) - right < left - l_store) {
            right -= num_unroll * vtype::numlanes;
#pragma GCC unroll 8
            for (int ii = 0; ii < num_unroll; ++ii) {
                curr_vec[ii] = vtype::loadu(arr + right + ii * vtype::numlanes);
            }
        } else {
#pragma GCC unroll 8
            for (int ii = 0; ii < num_unroll; ++ii) {
                curr_vec[ii] = vtype::loadu(arr + left + ii * vtype::numlanes);
            }
            left += num_unroll * vtype::numlanes;
        }
        // partition the current vectors and save them on both sides of the array
#pragma GCC unroll 8
        for (int ii = 0; ii < num_unroll; ++ii) {
            int32_t amount_ge_pivot = partition_vec<vtype>(
                arr, l_store, r_store + vtype::numlanes, curr_vec[ii],
                pivot_vec, &min_vec, &max_vec, use_gt);
            l_store += (vtype::numlanes - amount_ge_pivot);
            r_store -= amount_ge_pivot;
        }
    }

    /* partition and save vec_left[num_unroll] and vec_right[num_unroll] */
#pragma GCC unroll 8
    for (int ii = 0; ii < num_unroll; ++ii) {
        int32_t amount_ge_pivot =
            partition_vec<vtype>(arr, l_store, r_store + vtype::numlanes,
                                 vec_left[ii], pivot_vec, &min_vec, &max_vec, use_gt);
        l_store += (vtype::numlanes - amount_ge_pivot);
        r_store -= amount_ge_pivot;
    }
#pragma GCC unroll 8
    for (int ii = 0; ii < num_unroll; ++ii) {
        int32_t amount_ge_pivot =
            partition_vec<vtype>(arr, l_store, r_store + vtype::numlanes,
                                 vec_right[ii], pivot_vec, &min_vec, &max_vec, use_gt);
        l_store += (vtype::numlanes - amount_ge_pivot);
        r_store -= amount_ge_pivot;
    }
    *smallest = vtype::reducemin(min_vec);
    *biggest = vtype::reducemax(max_vec);
    return l_store;
}
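
A quick worked example (illustrative only; the lane count is an assumption for 32-bit elements in 512-bit registers) of how many elements the scalar prologue above peels off before the vector loop starts:

#include <cstdint>
#include <cstdio>

int main() {
    // Assumed values: 16 lanes for int32_t in a zmm register, num_unroll = 2.
    int64_t left = 0, right = 1000;
    int64_t numlanes = 16, num_unroll = 2;
    int64_t peeled = (right - left) % (num_unroll * numlanes);   // 1000 % 32 = 8
    std::printf("scalar prologue handles %lld elements\n", (long long)peeled);
    return 0;
}
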

// to_index (exclusive)
template <typename vtype, typename type_t>
static int64_t vectorized_partition(type_t *arr, int64_t from_index, int64_t to_index, type_t pivot, bool use_gt) {
    type_t smallest = vtype::type_max();
    type_t biggest = vtype::type_min();
    int64_t pivot_index = partition_avx512_unrolled<vtype, 2>(
        arr, from_index, to_index, pivot, &smallest, &biggest, use_gt);
    return pivot_index;
}

// partitioning functions
template <typename T>
void avx512_dual_pivot_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot1, int64_t index_pivot2) {
    const T pivot1 = arr[index_pivot1];
    const T pivot2 = arr[index_pivot2];

    const int64_t low = from_index;
    const int64_t high = to_index;
    const int64_t start = low + 1;
    const int64_t end = high - 1;

    std::swap(arr[index_pivot1], arr[low]);
    std::swap(arr[index_pivot2], arr[end]);

    const int64_t pivot_index2 = vectorized_partition<zmm_vector<T>, T>(arr, start, end, pivot2, true); // use_gt = true
    std::swap(arr[end], arr[pivot_index2]);
    int64_t upper = pivot_index2;

    // if all other elements are greater than pivot2 (and pivot1), no need to do further partitioning
    if (upper == start) {
        pivot_indices[0] = low;
        pivot_indices[1] = upper;
        return;
    }

    const int64_t pivot_index1 = vectorized_partition<zmm_vector<T>, T>(arr, start, upper, pivot1, false); // use_ge (use_gt = false)
    int64_t lower = pivot_index1 - 1;
    std::swap(arr[low], arr[lower]);

    pivot_indices[0] = lower;
    pivot_indices[1] = upper;
}
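
The layout this produces can be modelled with std::partition on a plain array (illustrative only; the real routine additionally swaps the two pivots into the boundary slots and reports those slots through pivot_indices):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    int32_t a[] = {9, 1, 7, 3, 8, 2, 6, 4, 5, 0};
    const int32_t pivot1 = 3, pivot2 = 6;
    // Region 1: x < pivot1, Region 2: pivot1 <= x <= pivot2, Region 3: x > pivot2.
    int32_t *mid = std::partition(a, a + 10, [&](int32_t x) { return x < pivot1; });
    int32_t *hi  = std::partition(mid, a + 10, [&](int32_t x) { return x <= pivot2; });
    std::printf("middle region is [%td, %td)\n", mid - a, hi - a);
    return 0;
}
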

template <typename T>
void avx512_single_pivot_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot) {
    const T pivot = arr[index_pivot];

    const int64_t low = from_index;
    const int64_t high = to_index;
    const int64_t end = high - 1;

    const int64_t pivot_index1 = vectorized_partition<zmm_vector<T>, T>(arr, low, high, pivot, false); // use_gt = false (use_ge)
    int64_t lower = pivot_index1;

    const int64_t pivot_index2 = vectorized_partition<zmm_vector<T>, T>(arr, pivot_index1, high, pivot, true); // use_gt = true
    int64_t upper = pivot_index2;

    pivot_indices[0] = lower;
    pivot_indices[1] = upper;
}

template <typename T>
void inline avx512_fast_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot1, int64_t index_pivot2) {
    if (index_pivot1 != index_pivot2) {
        avx512_dual_pivot_partition<T>(arr, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
    } else {
        avx512_single_pivot_partition<T>(arr, from_index, to_index, pivot_indices, index_pivot1);
    }
}

template <typename T>
void inline insertion_sort(T *arr, int32_t from_index, int32_t to_index) {
    for (int i, k = from_index; ++k < to_index; ) {
        T ai = arr[i = k];
        if (ai < arr[i - 1]) {
            while (--i >= from_index && ai < arr[i]) {
                arr[i + 1] = arr[i];
            }
            arr[i + 1] = ai;
        }
    }
}

template <typename T>
void inline avx512_fast_sort(T *arr, int64_t from_index, int64_t to_index, const int32_t INS_SORT_THRESHOLD) {
    int32_t size = to_index - from_index;

    if (size <= INS_SORT_THRESHOLD) {
        insertion_sort<T>(arr, from_index, to_index);
    } else {
        avx512_qsort<T>(arr, from_index, to_index);
    }
}

#endif // AVX512_QSORT_COMMON
@@ -21,12 +21,15 @@
 * questions.
 *
 */
#include "simdsort-support.hpp"
#ifdef __SIMDSORT_SUPPORTED_LINUX

#pragma GCC target("avx512dq", "avx512f")
#include "avx512-32bit-qsort.hpp"
#include "avx512-64bit-qsort.hpp"
#include "classfile_constants.h"


#define DLL_PUBLIC __attribute__((visibility("default")))
#define INSERTION_SORT_THRESHOLD_32BIT 16
#define INSERTION_SORT_THRESHOLD_64BIT 20
@@ -36,35 +39,41 @@ extern "C" {
DLL_PUBLIC void avx512_sort(void *array, int elem_type, int32_t from_index, int32_t to_index) {
    switch(elem_type) {
        case JVM_T_INT:
            avx512_fast_sort<int32_t>((int32_t*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT);
            avx512_fast_sort((int32_t*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT);
            break;
        case JVM_T_LONG:
            avx512_fast_sort<int64_t>((int64_t*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT);
            avx512_fast_sort((int64_t*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT);
            break;
        case JVM_T_FLOAT:
            avx512_fast_sort<float>((float*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT);
            avx512_fast_sort((float*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT);
            break;
        case JVM_T_DOUBLE:
            avx512_fast_sort<double>((double*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT);
            avx512_fast_sort((double*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT);
            break;
        default:
            assert(false, "Unexpected type");
    }
}
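
For context, a hypothetical harness (not part of the patch) that calls the exported entry point directly from C++; the JVM normally reaches it through an intrinsic stub. It assumes the object file is linked in and that classfile_constants.h provides JVM_T_INT, as the include above suggests.

#include <cstdint>
#include "classfile_constants.h"

extern "C" void avx512_sort(void *array, int elem_type, int32_t from_index, int32_t to_index);

void sort_ints(int32_t *data, int32_t len) {
    avx512_sort(data, JVM_T_INT, 0, len);   // sorts data[0, len) in place
}
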

DLL_PUBLIC void avx512_partition(void *array, int elem_type, int32_t from_index, int32_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) {
    switch(elem_type) {
        case JVM_T_INT:
            avx512_fast_partition<int32_t>((int32_t*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
            avx512_fast_partition((int32_t*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
            break;
        case JVM_T_LONG:
            avx512_fast_partition<int64_t>((int64_t*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
            avx512_fast_partition((int64_t*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
            break;
        case JVM_T_FLOAT:
            avx512_fast_partition<float>((float*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
            avx512_fast_partition((float*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
            break;
        case JVM_T_DOUBLE:
            avx512_fast_partition<double>((double*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
            avx512_fast_partition((double*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
            break;
        default:
            assert(false, "Unexpected type");
    }
}

}

#endif

src/java.base/linux/native/libsimdsort/simdsort-support.hpp (new file, 39 lines)
@@ -0,0 +1,39 @@
/*
 * Copyright (c) 2023 Intel Corporation. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef SIMDSORT_SUPPORT_HPP
#define SIMDSORT_SUPPORT_HPP
#include <stdio.h>
#include <stdlib.h>

#undef assert
#define assert(cond, msg) { if (!(cond)) { fprintf(stderr, "assert fails %s %d: %s\n", __FILE__, __LINE__, msg); abort(); }}


// GCC >= 7.5 is needed to build AVX2 portions of libsimdsort using C++17 features
#if defined(_LP64) && (defined(__GNUC__) && ((__GNUC__ > 7) || ((__GNUC__ == 7) && (__GNUC_MINOR__ >= 5))))
#define __SIMDSORT_SUPPORTED_LINUX
#endif

#endif //SIMDSORT_SUPPORT_HPP
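
A small sketch (illustrative only) of how the guard macro and the replacement assert above are meant to be used by the translation units in this library:

#include "simdsort-support.hpp"

#ifdef __SIMDSORT_SUPPORTED_LINUX
// SIMD code is only compiled on 64-bit Linux with GCC >= 7.5.
static void check_length(int n) {
    assert(n > 0, "array length must be positive");   // prints file/line and aborts on failure
}
#endif
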
src/java.base/linux/native/libsimdsort/xss-common-includes.h (new file, 101 lines)
@@ -0,0 +1,101 @@
/*
 * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
 * Copyright (c) 2021 Serge Sans Paille. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort)

#ifndef XSS_COMMON_INCLUDES
#define XSS_COMMON_INCLUDES
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
/*
Workaround for the bug in GCC12 (that was fixed in GCC 12.3.1).
More details are available at:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
*/
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include <immintrin.h>
#pragma GCC diagnostic pop
#include <limits>
#include <vector>

#define X86_SIMD_SORT_INFINITY std::numeric_limits<double>::infinity()
#define X86_SIMD_SORT_INFINITYF std::numeric_limits<float>::infinity()
#define X86_SIMD_SORT_INFINITYH 0x7c00
#define X86_SIMD_SORT_NEGINFINITYH 0xfc00
#define X86_SIMD_SORT_MAX_UINT16 std::numeric_limits<uint16_t>::max()
#define X86_SIMD_SORT_MAX_INT16 std::numeric_limits<int16_t>::max()
#define X86_SIMD_SORT_MIN_INT16 std::numeric_limits<int16_t>::min()
#define X86_SIMD_SORT_MAX_UINT32 std::numeric_limits<uint32_t>::max()
#define X86_SIMD_SORT_MAX_INT32 std::numeric_limits<int32_t>::max()
#define X86_SIMD_SORT_MIN_INT32 std::numeric_limits<int32_t>::min()
#define X86_SIMD_SORT_MAX_UINT64 std::numeric_limits<uint64_t>::max()
#define X86_SIMD_SORT_MAX_INT64 std::numeric_limits<int64_t>::max()
#define X86_SIMD_SORT_MIN_INT64 std::numeric_limits<int64_t>::min()
#define ZMM_MAX_DOUBLE _mm512_set1_pd(X86_SIMD_SORT_INFINITY)
#define ZMM_MAX_UINT64 _mm512_set1_epi64(X86_SIMD_SORT_MAX_UINT64)
#define ZMM_MAX_INT64 _mm512_set1_epi64(X86_SIMD_SORT_MAX_INT64)
#define ZMM_MAX_FLOAT _mm512_set1_ps(X86_SIMD_SORT_INFINITYF)
#define ZMM_MAX_UINT _mm512_set1_epi32(X86_SIMD_SORT_MAX_UINT32)
#define ZMM_MAX_INT _mm512_set1_epi32(X86_SIMD_SORT_MAX_INT32)
#define ZMM_MAX_HALF _mm512_set1_epi16(X86_SIMD_SORT_INFINITYH)
#define YMM_MAX_HALF _mm256_set1_epi16(X86_SIMD_SORT_INFINITYH)
#define ZMM_MAX_UINT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_UINT16)
#define ZMM_MAX_INT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_INT16)
#define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d

#define PRAGMA(x) _Pragma(#x)
#define UNUSED(x) (void)(x)

/* Compiler-specific macros */
#if defined(__GNUC__)
#define X86_SIMD_SORT_INLINE static inline
#define X86_SIMD_SORT_FINLINE static inline __attribute__((always_inline))
#else
#define X86_SIMD_SORT_INLINE static
#define X86_SIMD_SORT_FINLINE static
#endif

#if __GNUC__ >= 8
#define X86_SIMD_SORT_UNROLL_LOOP(num) PRAGMA(GCC unroll num)
#else
#define X86_SIMD_SORT_UNROLL_LOOP(num)
#endif

typedef size_t arrsize_t;

template <typename type>
struct zmm_vector;

template <typename type>
struct ymm_vector;

template <typename type>
struct avx2_vector;

#endif // XSS_COMMON_INCLUDES
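
A small check (illustrative only, assuming the header above is on the include path) of what SHUFFLE_MASK encodes: it packs four 2-bit lane indices into one shuffle immediate, the same encoding _MM_SHUFFLE produces. Because the macro body is not wrapped in parentheses as a whole, parenthesize uses that combine it with other operators, as done here.

#include "xss-common-includes.h"

static_assert((SHUFFLE_MASK(3, 2, 1, 0)) == _MM_SHUFFLE(3, 2, 1, 0), "identity mask 0xE4");
static_assert((SHUFFLE_MASK(0, 1, 2, 3)) == 0x1B, "full reversal mask");
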
src/java.base/linux/native/libsimdsort/xss-common-qsort.h (new file, 528 lines)
@@ -0,0 +1,528 @@
|
||||
/*
|
||||
* Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
|
||||
* Copyright (c) 2021 Serge Sans Paille. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*
|
||||
*/
|
||||
|
||||
// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort)
|
||||
|
||||
#ifndef XSS_COMMON_QSORT
|
||||
#define XSS_COMMON_QSORT
|
||||
|
||||
/*
 * Quicksort using AVX-512. The ideas and code are based on these two research
 * papers [1] and [2]. On a high level, the idea is to vectorize quicksort
 * partitioning using AVX-512 compressstore instructions. If the array size is
 * < 128, then use Bitonic sorting network implemented on 512-bit registers.
 * The precise network definitions depend on the dtype and are defined in
 * separate files: avx512-16bit-qsort.hpp, avx512-32bit-qsort.hpp and
 * avx512-64bit-qsort.hpp. Article [4] is a good resource for bitonic sorting
 * network. The core implementations of the vectorized qsort functions
 * avx512_qsort<T>(T*, arrsize_t) are modified versions of avx2 quicksort
 * presented in the paper [2] and source code associated with that paper [3].
 *
 * [1] Fast and Robust Vectorized In-Place Sorting of Primitive Types
 *     https://drops.dagstuhl.de/opus/volltexte/2021/13775/
 *
 * [2] A Novel Hybrid Quicksort Algorithm Vectorized using AVX-512 on Intel
 *     Skylake https://arxiv.org/pdf/1704.08579.pdf
 *
 * [3] https://github.com/simd-sorting/fast-and-robust: SPDX-License-Identifier:
 *     MIT
 *
 * [4] http://mitp-content-server.mit.edu:18180/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030
 *
 */
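
A scalar sketch (illustrative only; std::partition stands in for the vectorized partition, std::sort for both the bitonic-network base case and the depth-limit fallback) of the control flow described above, mirroring the qsort_() routine defined later in this header:

#include <algorithm>
#include <cstddef>

template <typename T>
void hybrid_qsort_sketch(T *first, T *last, int depth_budget,
                         std::ptrdiff_t network_threshold = 128) {
    if (last - first <= network_threshold) {   // small range: sorting network in the real code
        std::sort(first, last);
        return;
    }
    if (depth_budget <= 0) {                   // pivots keep degenerating: fall back
        std::sort(first, last);
        return;
    }
    T pivot = first[(last - first) / 2];       // the real code uses median-based pivot selection
    T *mid1 = std::partition(first, last, [&](const T &x) { return x < pivot; });
    T *mid2 = std::partition(mid1, last, [&](const T &x) { return !(pivot < x); });
    hybrid_qsort_sketch(first, mid1, depth_budget - 1, network_threshold);  // elements < pivot
    hybrid_qsort_sketch(mid2, last, depth_budget - 1, network_threshold);   // elements > pivot
}
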
|
||||
#include "xss-common-includes.h"
|
||||
#include "xss-pivot-selection.hpp"
|
||||
#include "xss-network-qsort.hpp"
|
||||
|
||||
|
||||
template <typename T>
|
||||
bool is_a_nan(T elem) {
|
||||
return std::isnan(elem);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
X86_SIMD_SORT_INLINE T get_pivot_scalar(T *arr, const int64_t left, const int64_t right) {
|
||||
// median of 8 equally spaced elements
|
||||
int64_t NUM_ELEMENTS = 8;
|
||||
int64_t MID = NUM_ELEMENTS / 2;
|
||||
int64_t size = (right - left) / NUM_ELEMENTS;
|
||||
T temp[NUM_ELEMENTS];
|
||||
for (int64_t i = 0; i < NUM_ELEMENTS; i++) temp[i] = arr[left + (i * size)];
|
||||
std::sort(temp, temp + NUM_ELEMENTS);
|
||||
return temp[MID];
|
||||
}
|
||||
|
||||
template <typename vtype, typename T = typename vtype::type_t>
|
||||
bool comparison_func_ge(const T &a, const T &b) {
|
||||
return a < b;
|
||||
}
|
||||
|
||||
template <typename vtype, typename T = typename vtype::type_t>
|
||||
bool comparison_func_gt(const T &a, const T &b) {
|
||||
return a <= b;
|
||||
}
|
||||
|
||||
/*
|
||||
* COEX == Compare and Exchange two registers by swapping min and max values
|
||||
*/
|
||||
template <typename vtype, typename mm_t>
|
||||
X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b) {
|
||||
mm_t temp = a;
|
||||
a = vtype::min(a, b);
|
||||
b = vtype::max(temp, b);
|
||||
}
|
||||
|
||||
template <typename vtype, typename reg_t = typename vtype::reg_t,
|
||||
typename opmask_t = typename vtype::opmask_t>
|
||||
X86_SIMD_SORT_INLINE reg_t cmp_merge(reg_t in1, reg_t in2, opmask_t mask) {
|
||||
reg_t min = vtype::min(in2, in1);
|
||||
reg_t max = vtype::max(in2, in1);
|
||||
return vtype::mask_mov(min, mask, max); // 0 -> min, 1 -> max
|
||||
}
|
||||
|
||||
template <typename vtype, typename type_t, typename reg_t>
|
||||
int avx512_double_compressstore(type_t *left_addr, type_t *right_addr,
|
||||
typename vtype::opmask_t k, reg_t reg) {
|
||||
int amount_ge_pivot = _mm_popcnt_u32((int)k);
|
||||
|
||||
vtype::mask_compressstoreu(left_addr, vtype::knot_opmask(k), reg);
|
||||
vtype::mask_compressstoreu(right_addr + vtype::numlanes - amount_ge_pivot,
|
||||
k, reg);
|
||||
|
||||
return amount_ge_pivot;
|
||||
}
|
||||
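
For readers unfamiliar with compress-stores, a minimal standalone sketch (illustrative only, assumes AVX-512F and 32-bit lanes; build with -mavx512f or under the GCC target pragma) of the idea behind avx512_double_compressstore: lanes whose mask bit is 0 are packed to the left output, lanes whose bit is 1 to the right output, in one pass.

#include <immintrin.h>
#include <cstdint>

static inline int partition_one_vec_i32(int32_t *left_out, int32_t *right_out,
                                        __m512i v, __m512i pivot) {
    __mmask16 ge = _mm512_cmpge_epi32_mask(v, pivot);               // one bit per lane >= pivot
    int n_ge = _mm_popcnt_u32((unsigned)ge);
    _mm512_mask_compressstoreu_epi32(left_out, (__mmask16)~ge, v);          // < pivot lanes
    _mm512_mask_compressstoreu_epi32(right_out + 16 - n_ge, ge, v);         // >= pivot lanes
    return n_ge;                                                     // like amount_ge_pivot above
}
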
|
||||
// Generic function dispatches to AVX2 or AVX512 code
|
||||
template <typename vtype, typename type_t,
|
||||
typename reg_t = typename vtype::reg_t>
|
||||
X86_SIMD_SORT_INLINE arrsize_t partition_vec(type_t *l_store, type_t *r_store,
|
||||
const reg_t curr_vec,
|
||||
const reg_t pivot_vec,
|
||||
reg_t &smallest_vec,
|
||||
reg_t &biggest_vec, bool use_gt) {
|
||||
//typename vtype::opmask_t ge_mask = vtype::ge(curr_vec, pivot_vec);
|
||||
typename vtype::opmask_t mask;
|
||||
if (use_gt) mask = vtype::gt(curr_vec, pivot_vec);
|
||||
else mask = vtype::ge(curr_vec, pivot_vec);
|
||||
|
||||
int amount_ge_pivot =
|
||||
vtype::double_compressstore(l_store, r_store, mask, curr_vec);
|
||||
|
||||
smallest_vec = vtype::min(curr_vec, smallest_vec);
|
||||
biggest_vec = vtype::max(curr_vec, biggest_vec);
|
||||
|
||||
return amount_ge_pivot;
|
||||
}
|
||||
|
||||
/*
|
||||
* Parition an array based on the pivot and returns the index of the
|
||||
* first element that is greater than or equal to the pivot.
|
||||
*/
|
||||
template <typename vtype, typename type_t>
|
||||
X86_SIMD_SORT_INLINE arrsize_t partition_avx512(type_t *arr, arrsize_t left,
|
||||
arrsize_t right, type_t pivot,
|
||||
type_t *smallest,
|
||||
type_t *biggest,
|
||||
bool use_gt) {
|
||||
auto comparison_func = use_gt ? comparison_func_gt<vtype> : comparison_func_ge<vtype>;
|
||||
/* make array length divisible by vtype::numlanes , shortening the array */
|
||||
for (int32_t i = (right - left) % vtype::numlanes; i > 0; --i) {
|
||||
*smallest = std::min(*smallest, arr[left], comparison_func);
|
||||
*biggest = std::max(*biggest, arr[left], comparison_func);
|
||||
if (!comparison_func(arr[left], pivot)) {
|
||||
std::swap(arr[left], arr[--right]);
|
||||
} else {
|
||||
++left;
|
||||
}
|
||||
}
|
||||
|
||||
if (left == right)
|
||||
return left; /* less than vtype::numlanes elements in the array */
|
||||
|
||||
using reg_t = typename vtype::reg_t;
|
||||
reg_t pivot_vec = vtype::set1(pivot);
|
||||
reg_t min_vec = vtype::set1(*smallest);
|
||||
reg_t max_vec = vtype::set1(*biggest);
|
||||
|
||||
if (right - left == vtype::numlanes) {
|
||||
reg_t vec = vtype::loadu(arr + left);
|
||||
arrsize_t unpartitioned = right - left - vtype::numlanes;
|
||||
arrsize_t l_store = left;
|
||||
|
||||
arrsize_t amount_ge_pivot =
|
||||
partition_vec<vtype>(arr + l_store, arr + l_store + unpartitioned,
|
||||
vec, pivot_vec, min_vec, max_vec, use_gt);
|
||||
l_store += (vtype::numlanes - amount_ge_pivot);
|
||||
*smallest = vtype::reducemin(min_vec);
|
||||
*biggest = vtype::reducemax(max_vec);
|
||||
return l_store;
|
||||
}
|
||||
|
||||
// first and last vtype::numlanes values are partitioned at the end
|
||||
reg_t vec_left = vtype::loadu(arr + left);
|
||||
reg_t vec_right = vtype::loadu(arr + (right - vtype::numlanes));
|
||||
// store points of the vectors
|
||||
arrsize_t unpartitioned = right - left - vtype::numlanes;
|
||||
arrsize_t l_store = left;
|
||||
// indices for loading the elements
|
||||
left += vtype::numlanes;
|
||||
right -= vtype::numlanes;
|
||||
while (right - left != 0) {
|
||||
reg_t curr_vec;
|
||||
/*
|
||||
* if fewer elements are stored on the right side of the array,
|
||||
* then next elements are loaded from the right side,
|
||||
* otherwise from the left side
|
||||
*/
|
||||
if ((l_store + unpartitioned + vtype::numlanes) - right <
|
||||
left - l_store) {
|
||||
right -= vtype::numlanes;
|
||||
curr_vec = vtype::loadu(arr + right);
|
||||
} else {
|
||||
curr_vec = vtype::loadu(arr + left);
|
||||
left += vtype::numlanes;
|
||||
}
|
||||
// partition the current vector and save it on both sides of the array
|
||||
arrsize_t amount_ge_pivot =
|
||||
partition_vec<vtype>(arr + l_store, arr + l_store + unpartitioned,
|
||||
curr_vec, pivot_vec, min_vec, max_vec, use_gt);
|
||||
l_store += (vtype::numlanes - amount_ge_pivot);
|
||||
unpartitioned -= vtype::numlanes;
|
||||
}
|
||||
|
||||
/* partition and save vec_left and vec_right */
|
||||
arrsize_t amount_ge_pivot =
|
||||
partition_vec<vtype>(arr + l_store, arr + l_store + unpartitioned,
|
||||
vec_left, pivot_vec, min_vec, max_vec, use_gt);
|
||||
l_store += (vtype::numlanes - amount_ge_pivot);
|
||||
unpartitioned -= vtype::numlanes;
|
||||
|
||||
amount_ge_pivot =
|
||||
partition_vec<vtype>(arr + l_store, arr + l_store + unpartitioned,
|
||||
vec_right, pivot_vec, min_vec, max_vec, use_gt);
|
||||
l_store += (vtype::numlanes - amount_ge_pivot);
|
||||
unpartitioned -= vtype::numlanes;
|
||||
|
||||
*smallest = vtype::reducemin(min_vec);
|
||||
*biggest = vtype::reducemax(max_vec);
|
||||
return l_store;
|
||||
}
|
||||
|
||||
template <typename vtype, int num_unroll,
|
||||
typename type_t = typename vtype::type_t>
|
||||
X86_SIMD_SORT_INLINE arrsize_t
|
||||
partition_avx512_unrolled(type_t *arr, arrsize_t left, arrsize_t right,
|
||||
type_t pivot, type_t *smallest, type_t *biggest, bool use_gt) {
|
||||
if constexpr (num_unroll == 0) {
|
||||
return partition_avx512<vtype>(arr, left, right, pivot, smallest,
|
||||
biggest, use_gt);
|
||||
}
|
||||
|
||||
/* Use regular partition_avx512 for smaller arrays */
|
||||
if (right - left < 3 * num_unroll * vtype::numlanes) {
|
||||
return partition_avx512<vtype>(arr, left, right, pivot, smallest,
|
||||
biggest, use_gt);
|
||||
}
|
||||
|
||||
auto comparison_func = use_gt ? comparison_func_gt<vtype> : comparison_func_ge<vtype>;
|
||||
/* make array length divisible by vtype::numlanes, shortening the array */
|
||||
for (int32_t i = ((right - left) % (vtype::numlanes)); i > 0; --i) {
|
||||
*smallest = std::min(*smallest, arr[left], comparison_func);
|
||||
*biggest = std::max(*biggest, arr[left], comparison_func);
|
||||
if (!comparison_func(arr[left], pivot)) {
|
||||
std::swap(arr[left], arr[--right]);
|
||||
} else {
|
||||
++left;
|
||||
}
|
||||
}
|
||||
|
||||
arrsize_t unpartitioned = right - left - vtype::numlanes;
|
||||
arrsize_t l_store = left;
|
||||
|
||||
using reg_t = typename vtype::reg_t;
|
||||
reg_t pivot_vec = vtype::set1(pivot);
|
||||
reg_t min_vec = vtype::set1(*smallest);
|
||||
reg_t max_vec = vtype::set1(*biggest);
|
||||
|
||||
/* Calculate and load more registers to make the rest of the array a
|
||||
* multiple of num_unroll. These registers will be partitioned at the very
|
||||
* end. */
|
||||
int vecsToPartition = ((right - left) / vtype::numlanes) % num_unroll;
|
||||
reg_t vec_align[num_unroll];
|
||||
for (int i = 0; i < vecsToPartition; i++) {
|
||||
vec_align[i] = vtype::loadu(arr + left + i * vtype::numlanes);
|
||||
}
|
||||
left += vecsToPartition * vtype::numlanes;
|
||||
|
||||
    /* We will now have at least 3 * num_unroll registers' worth of data to
     * process. Load left and right vtype::numlanes * num_unroll values into
     * registers to make space for the in-place partition. The vec_left and
     * vec_right registers are partitioned at the end */
|
||||
reg_t vec_left[num_unroll], vec_right[num_unroll];
|
||||
X86_SIMD_SORT_UNROLL_LOOP(8)
|
||||
for (int ii = 0; ii < num_unroll; ++ii) {
|
||||
vec_left[ii] = vtype::loadu(arr + left + vtype::numlanes * ii);
|
||||
vec_right[ii] =
|
||||
vtype::loadu(arr + (right - vtype::numlanes * (num_unroll - ii)));
|
||||
}
|
||||
/* indices for loading the elements */
|
||||
left += num_unroll * vtype::numlanes;
|
||||
right -= num_unroll * vtype::numlanes;
|
||||
while (right - left != 0) {
|
||||
reg_t curr_vec[num_unroll];
|
||||
/*
|
||||
* if fewer elements are stored on the right side of the array,
|
||||
* then next elements are loaded from the right side,
|
||||
* otherwise from the left side
|
||||
*/
|
||||
if ((l_store + unpartitioned + vtype::numlanes) - right <
|
||||
left - l_store) {
|
||||
right -= num_unroll * vtype::numlanes;
|
||||
X86_SIMD_SORT_UNROLL_LOOP(8)
|
||||
for (int ii = 0; ii < num_unroll; ++ii) {
|
||||
curr_vec[ii] = vtype::loadu(arr + right + ii * vtype::numlanes);
|
||||
/*
|
||||
* error: '_mm_prefetch' needs target feature mmx on clang-cl
|
||||
*/
|
||||
#if !(defined(_MSC_VER) && defined(__clang__))
|
||||
_mm_prefetch((char *)(arr + right + ii * vtype::numlanes -
|
||||
num_unroll * vtype::numlanes),
|
||||
_MM_HINT_T0);
|
||||
#endif
|
||||
}
|
||||
} else {
|
||||
X86_SIMD_SORT_UNROLL_LOOP(8)
|
||||
for (int ii = 0; ii < num_unroll; ++ii) {
|
||||
curr_vec[ii] = vtype::loadu(arr + left + ii * vtype::numlanes);
|
||||
/*
|
||||
* error: '_mm_prefetch' needs target feature mmx on clang-cl
|
||||
*/
|
||||
#if !(defined(_MSC_VER) && defined(__clang__))
|
||||
_mm_prefetch((char *)(arr + left + ii * vtype::numlanes +
|
||||
num_unroll * vtype::numlanes),
|
||||
_MM_HINT_T0);
|
||||
#endif
|
||||
}
|
||||
left += num_unroll * vtype::numlanes;
|
||||
}
|
||||
/* partition the current vector and save it on both sides of the array
|
||||
* */
|
||||
X86_SIMD_SORT_UNROLL_LOOP(8)
|
||||
for (int ii = 0; ii < num_unroll; ++ii) {
|
||||
arrsize_t amount_ge_pivot = partition_vec<vtype>(
|
||||
arr + l_store, arr + l_store + unpartitioned, curr_vec[ii],
|
||||
pivot_vec, min_vec, max_vec, use_gt);
|
||||
l_store += (vtype::numlanes - amount_ge_pivot);
|
||||
unpartitioned -= vtype::numlanes;
|
||||
}
|
||||
}
|
||||
|
||||
/* partition and save vec_left[num_unroll] and vec_right[num_unroll] */
|
||||
X86_SIMD_SORT_UNROLL_LOOP(8)
|
||||
for (int ii = 0; ii < num_unroll; ++ii) {
|
||||
arrsize_t amount_ge_pivot =
|
||||
partition_vec<vtype>(arr + l_store, arr + l_store + unpartitioned,
|
||||
vec_left[ii], pivot_vec, min_vec, max_vec, use_gt);
|
||||
l_store += (vtype::numlanes - amount_ge_pivot);
|
||||
unpartitioned -= vtype::numlanes;
|
||||
}
|
||||
X86_SIMD_SORT_UNROLL_LOOP(8)
|
||||
for (int ii = 0; ii < num_unroll; ++ii) {
|
||||
arrsize_t amount_ge_pivot =
|
||||
partition_vec<vtype>(arr + l_store, arr + l_store + unpartitioned,
|
||||
vec_right[ii], pivot_vec, min_vec, max_vec, use_gt);
|
||||
l_store += (vtype::numlanes - amount_ge_pivot);
|
||||
unpartitioned -= vtype::numlanes;
|
||||
}
|
||||
|
||||
/* partition and save vec_align[vecsToPartition] */
|
||||
X86_SIMD_SORT_UNROLL_LOOP(8)
|
||||
for (int ii = 0; ii < vecsToPartition; ++ii) {
|
||||
arrsize_t amount_ge_pivot =
|
||||
partition_vec<vtype>(arr + l_store, arr + l_store + unpartitioned,
|
||||
vec_align[ii], pivot_vec, min_vec, max_vec, use_gt);
|
||||
l_store += (vtype::numlanes - amount_ge_pivot);
|
||||
unpartitioned -= vtype::numlanes;
|
||||
}
|
||||
|
||||
*smallest = vtype::reducemin(min_vec);
|
||||
*biggest = vtype::reducemax(max_vec);
|
||||
return l_store;
|
||||
}
|
||||
|
||||
template <typename vtype, int maxN>
|
||||
void sort_n(typename vtype::type_t *arr, int N);
|
||||
|
||||
template <typename vtype, typename type_t>
|
||||
static void qsort_(type_t *arr, arrsize_t left, arrsize_t right,
|
||||
arrsize_t max_iters) {
|
||||
    /*
     * Resort to std::sort if quicksort isn't making any progress
     */
|
||||
if (max_iters <= 0) {
|
||||
std::sort(arr + left, arr + right + 1, comparison_func_ge<vtype>);
|
||||
return;
|
||||
}
|
||||
/*
|
||||
* Base case: use bitonic networks to sort arrays <=
|
||||
* vtype::network_sort_threshold
|
||||
*/
|
||||
if (right + 1 - left <= vtype::network_sort_threshold) {
|
||||
sort_n<vtype, vtype::network_sort_threshold>(
|
||||
arr + left, (int32_t)(right + 1 - left));
|
||||
return;
|
||||
}
|
||||
|
||||
type_t pivot = get_pivot_blocks<vtype, type_t>(arr, left, right);
|
||||
type_t smallest = vtype::type_max();
|
||||
type_t biggest = vtype::type_min();
|
||||
|
||||
arrsize_t pivot_index =
|
||||
partition_avx512_unrolled<vtype, vtype::partition_unroll_factor>(
|
||||
arr, left, right + 1, pivot, &smallest, &biggest, false);
|
||||
|
||||
if (pivot != smallest)
|
||||
qsort_<vtype>(arr, left, pivot_index - 1, max_iters - 1);
|
||||
if (pivot != biggest) qsort_<vtype>(arr, pivot_index, right, max_iters - 1);
|
||||
}
|
||||
|
||||
// Hooks for OpenJDK sort
|
||||
// to_index (exclusive)
|
||||
template <typename vtype, typename type_t>
|
||||
static int64_t vectorized_partition(type_t *arr, int64_t from_index, int64_t to_index, type_t pivot, bool use_gt) {
|
||||
type_t smallest = vtype::type_max();
|
||||
type_t biggest = vtype::type_min();
|
||||
int64_t pivot_index = partition_avx512_unrolled<vtype, 2>(
|
||||
arr, from_index, to_index, pivot, &smallest, &biggest, use_gt);
|
||||
return pivot_index;
|
||||
}
|
||||
|
||||
// partitioning functions
|
||||
template <typename vtype, typename T>
|
||||
X86_SIMD_SORT_INLINE void simd_dual_pivot_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot1, int64_t index_pivot2){
|
||||
const T pivot1 = arr[index_pivot1];
|
||||
const T pivot2 = arr[index_pivot2];
|
||||
|
||||
const int64_t low = from_index;
|
||||
const int64_t high = to_index;
|
||||
const int64_t start = low + 1;
|
||||
const int64_t end = high - 1;
|
||||
|
||||
|
||||
std::swap(arr[index_pivot1], arr[low]);
|
||||
std::swap(arr[index_pivot2], arr[end]);
|
||||
|
||||
|
||||
const int64_t pivot_index2 = vectorized_partition<vtype, T>(arr, start, end, pivot2, true); // use_gt = true
|
||||
std::swap(arr[end], arr[pivot_index2]);
|
||||
int64_t upper = pivot_index2;
|
||||
|
||||
// if all other elements are greater than pivot2 (and pivot1), no need to do further partitioning
|
||||
if (upper == start) {
|
||||
pivot_indices[0] = low;
|
||||
pivot_indices[1] = upper;
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t pivot_index1 = vectorized_partition<vtype, T>(arr, start, upper, pivot1, false); // use_ge (use_gt = false)
|
||||
int64_t lower = pivot_index1 - 1;
|
||||
std::swap(arr[low], arr[lower]);
|
||||
|
||||
pivot_indices[0] = lower;
|
||||
pivot_indices[1] = upper;
|
||||
}
|
||||
|
||||
template <typename vtype, typename T>
|
||||
X86_SIMD_SORT_INLINE void simd_single_pivot_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot) {
|
||||
const T pivot = arr[index_pivot];
|
||||
|
||||
const int64_t low = from_index;
|
||||
const int64_t high = to_index;
|
||||
const int64_t end = high - 1;
|
||||
|
||||
|
||||
const int64_t pivot_index1 = vectorized_partition<vtype, T>(arr, low, high, pivot, false); // use_gt = false (use_ge)
|
||||
int64_t lower = pivot_index1;
|
||||
|
||||
const int64_t pivot_index2 = vectorized_partition<vtype, T>(arr, pivot_index1, high, pivot, true); // use_gt = true
|
||||
int64_t upper = pivot_index2;
|
||||
|
||||
pivot_indices[0] = lower;
|
||||
pivot_indices[1] = upper;
|
||||
}
|
||||
|
||||
template <typename vtype, typename T>
|
||||
X86_SIMD_SORT_INLINE void simd_fast_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot1, int64_t index_pivot2) {
|
||||
if (index_pivot1 != index_pivot2) {
|
||||
simd_dual_pivot_partition<vtype, T>(arr, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
|
||||
}
|
||||
else {
|
||||
simd_single_pivot_partition<vtype, T>(arr, from_index, to_index, pivot_indices, index_pivot1);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
X86_SIMD_SORT_INLINE void insertion_sort(T *arr, int32_t from_index, int32_t to_index) {
|
||||
for (int i, k = from_index; ++k < to_index; ) {
|
||||
T ai = arr[i = k];
|
||||
if (ai < arr[i - 1]) {
|
||||
while (--i >= from_index && ai < arr[i]) {
|
||||
arr[i + 1] = arr[i];
|
||||
}
|
||||
arr[i + 1] = ai;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename vtype, typename T>
|
||||
X86_SIMD_SORT_INLINE void simd_fast_sort(T *arr, arrsize_t from_index, arrsize_t to_index, const arrsize_t INS_SORT_THRESHOLD)
|
||||
{
|
||||
arrsize_t arrsize = to_index - from_index;
|
||||
if (arrsize <= INS_SORT_THRESHOLD) {
|
||||
insertion_sort<T>(arr, from_index, to_index);
|
||||
} else {
|
||||
qsort_<vtype, T>(arr, from_index, to_index - 1, 2 * (arrsize_t)log2(arrsize));
|
||||
}
|
||||
}
|
||||
|
||||
#define DEFINE_METHODS(ISA, VTYPE) \
|
||||
template <typename T> \
|
||||
X86_SIMD_SORT_INLINE void ISA##_fast_sort( \
|
||||
T *arr, arrsize_t from_index, arrsize_t to_index, const arrsize_t INS_SORT_THRESHOLD) \
|
||||
{ \
|
||||
simd_fast_sort<VTYPE, T>(arr, from_index, to_index, INS_SORT_THRESHOLD); \
|
||||
} \
|
||||
template <typename T> \
|
||||
X86_SIMD_SORT_INLINE void ISA##_fast_partition( \
|
||||
T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot1, int64_t index_pivot2) \
|
||||
{ \
|
||||
simd_fast_partition<VTYPE, T>(arr, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); \
|
||||
}
|
||||
|
||||
DEFINE_METHODS(avx2, avx2_vector<T>)
|
||||
DEFINE_METHODS(avx512, zmm_vector<T>)
|
||||
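
Shown only to illustrate what the second DEFINE_METHODS invocation above expands to after preprocessing (it is not additional code in the patch): callers get plain avx512_* entry points that forward to the generic simd_* templates with zmm_vector<T> as the vector traits type.

template <typename T>
X86_SIMD_SORT_INLINE void avx512_fast_sort(T *arr, arrsize_t from_index, arrsize_t to_index,
                                           const arrsize_t INS_SORT_THRESHOLD) {
    simd_fast_sort<zmm_vector<T>, T>(arr, from_index, to_index, INS_SORT_THRESHOLD);
}
template <typename T>
X86_SIMD_SORT_INLINE void avx512_fast_partition(T *arr, int64_t from_index, int64_t to_index,
                                                int32_t *pivot_indices, int64_t index_pivot1,
                                                int64_t index_pivot2) {
    simd_fast_partition<zmm_vector<T>, T>(arr, from_index, to_index, pivot_indices,
                                          index_pivot1, index_pivot2);
}
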
|
||||
#endif // XSS_COMMON_QSORT
|
||||
src/java.base/linux/native/libsimdsort/xss-network-qsort.hpp (new file, 209 lines)
@@ -0,0 +1,209 @@
|
||||
/*
|
||||
* Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
|
||||
* Copyright (c) 2021 Serge Sans Paille. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*
|
||||
*/
|
||||
|
||||
// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort)
|
||||
|
||||
#ifndef XSS_NETWORK_QSORT
|
||||
#define XSS_NETWORK_QSORT
|
||||
|
||||
#include "xss-common-qsort.h"
|
||||
#include "xss-optimal-networks.hpp"
|
||||
|
||||
template <typename vtype, int numVecs, typename reg_t = typename vtype::reg_t>
|
||||
X86_SIMD_SORT_FINLINE void bitonic_sort_n_vec(reg_t *regs) {
|
||||
if constexpr (numVecs == 1) {
|
||||
UNUSED(regs);
|
||||
return;
|
||||
} else if constexpr (numVecs == 2) {
|
||||
COEX<vtype>(regs[0], regs[1]);
|
||||
} else if constexpr (numVecs == 4) {
|
||||
optimal_sort_4<vtype>(regs);
|
||||
} else if constexpr (numVecs == 8) {
|
||||
optimal_sort_8<vtype>(regs);
|
||||
} else if constexpr (numVecs == 16) {
|
||||
optimal_sort_16<vtype>(regs);
|
||||
} else if constexpr (numVecs == 32) {
|
||||
optimal_sort_32<vtype>(regs);
|
||||
} else {
|
||||
static_assert(numVecs == -1, "should not reach here");
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Swizzle ops explained:
 *
 * swap_n<scale>: swap neighbouring blocks of size <scale/2> within each block
 * of size <scale>. For reg i = [7,6,5,4,3,2,1,0]:
 *   swap_n<2>: [[6,7],[4,5],[2,3],[0,1]]
 *   swap_n<4>: [[5,4,7,6],[1,0,3,2]]
 *   swap_n<8>: [[3,2,1,0,7,6,5,4]]
 *
 * reverse_n<scale>: reverse the elements within each block of size <scale>.
 * For reg i = [7,6,5,4,3,2,1,0]:
 *   rev_n<2>: [[6,7],[4,5],[2,3],[0,1]]
 *   rev_n<4>: [[4,5,6,7],[0,1,2,3]]
 *   rev_n<8>: [[0,1,2,3,4,5,6,7]]
 *
 * merge_n<scale>: merge blocks of <scale/2> elements from two regs.
 * For reg b,a = [a,a,a,a,a,a,a,a], [b,b,b,b,b,b,b,b]:
 *   merge_n<2>: [a,b,a,b,a,b,a,b]
 *   merge_n<4>: [a,a,b,b,a,a,b,b]
 *   merge_n<8>: [a,a,a,a,b,b,b,b]
 */
|
||||
|
||||
template <typename vtype, int numVecs, int scale, bool first = true>
|
||||
X86_SIMD_SORT_FINLINE void internal_merge_n_vec(typename vtype::reg_t *reg) {
|
||||
using reg_t = typename vtype::reg_t;
|
||||
using swizzle = typename vtype::swizzle_ops;
|
||||
if constexpr (scale <= 1) {
|
||||
UNUSED(reg);
|
||||
return;
|
||||
} else {
|
||||
if constexpr (first) {
|
||||
// Use reverse then merge
|
||||
X86_SIMD_SORT_UNROLL_LOOP(64)
|
||||
for (int i = 0; i < numVecs; i++) {
|
||||
reg_t &v = reg[i];
|
||||
reg_t rev = swizzle::template reverse_n<vtype, scale>(v);
|
||||
COEX<vtype>(rev, v);
|
||||
v = swizzle::template merge_n<vtype, scale>(v, rev);
|
||||
}
|
||||
} else {
|
||||
// Use swap then merge
|
||||
X86_SIMD_SORT_UNROLL_LOOP(64)
|
||||
for (int i = 0; i < numVecs; i++) {
|
||||
reg_t &v = reg[i];
|
||||
reg_t swap = swizzle::template swap_n<vtype, scale>(v);
|
||||
COEX<vtype>(swap, v);
|
||||
v = swizzle::template merge_n<vtype, scale>(v, swap);
|
||||
}
|
||||
}
|
||||
internal_merge_n_vec<vtype, numVecs, scale / 2, false>(reg);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename vtype, int numVecs, int scale,
|
||||
typename reg_t = typename vtype::reg_t>
|
||||
X86_SIMD_SORT_FINLINE void merge_substep_n_vec(reg_t *regs) {
|
||||
using swizzle = typename vtype::swizzle_ops;
|
||||
if constexpr (numVecs <= 1) {
|
||||
UNUSED(regs);
|
||||
return;
|
||||
}
|
||||
|
||||
// Reverse upper half of vectors
|
||||
X86_SIMD_SORT_UNROLL_LOOP(64)
|
||||
for (int i = numVecs / 2; i < numVecs; i++) {
|
||||
regs[i] = swizzle::template reverse_n<vtype, scale>(regs[i]);
|
||||
}
|
||||
// Do compare exchanges
|
||||
X86_SIMD_SORT_UNROLL_LOOP(64)
|
||||
for (int i = 0; i < numVecs / 2; i++) {
|
||||
COEX<vtype>(regs[i], regs[numVecs - 1 - i]);
|
||||
}
|
||||
|
||||
merge_substep_n_vec<vtype, numVecs / 2, scale>(regs);
|
||||
merge_substep_n_vec<vtype, numVecs / 2, scale>(regs + numVecs / 2);
|
||||
}
|
||||
|
||||
template <typename vtype, int numVecs, int scale,
|
||||
typename reg_t = typename vtype::reg_t>
|
||||
X86_SIMD_SORT_FINLINE void merge_step_n_vec(reg_t *regs) {
|
||||
// Do cross vector merges
|
||||
merge_substep_n_vec<vtype, numVecs, scale>(regs);
|
||||
|
||||
// Do internal vector merges
|
||||
internal_merge_n_vec<vtype, numVecs, scale>(regs);
|
||||
}
|
||||
|
||||
template <typename vtype, int numVecs, int numPer = 2,
|
||||
typename reg_t = typename vtype::reg_t>
|
||||
X86_SIMD_SORT_FINLINE void merge_n_vec(reg_t *regs) {
|
||||
if constexpr (numPer > vtype::numlanes) {
|
||||
UNUSED(regs);
|
||||
return;
|
||||
} else {
|
||||
merge_step_n_vec<vtype, numVecs, numPer>(regs);
|
||||
merge_n_vec<vtype, numVecs, numPer * 2>(regs);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename vtype, int numVecs, typename reg_t = typename vtype::reg_t>
|
||||
X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int N) {
|
||||
static_assert(numVecs > 0, "numVecs should be > 0");
|
||||
if constexpr (numVecs > 1) {
|
||||
if (N * 2 <= numVecs * vtype::numlanes) {
|
||||
sort_n_vec<vtype, numVecs / 2>(arr, N);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
reg_t vecs[numVecs];
|
||||
|
||||
// Generate masks for loading and storing
|
||||
typename vtype::opmask_t ioMasks[numVecs - numVecs / 2];
|
||||
X86_SIMD_SORT_UNROLL_LOOP(64)
|
||||
for (int i = numVecs / 2, j = 0; i < numVecs; i++, j++) {
|
||||
uint64_t num_to_read =
|
||||
std::min((uint64_t)std::max(0, N - i * vtype::numlanes),
|
||||
(uint64_t)vtype::numlanes);
|
||||
ioMasks[j] = vtype::get_partial_loadmask(num_to_read);
|
||||
}
|
||||
|
||||
// Unmasked part of the load
|
||||
X86_SIMD_SORT_UNROLL_LOOP(64)
|
||||
for (int i = 0; i < numVecs / 2; i++) {
|
||||
vecs[i] = vtype::loadu(arr + i * vtype::numlanes);
|
||||
}
|
||||
// Masked part of the load
|
||||
X86_SIMD_SORT_UNROLL_LOOP(64)
|
||||
for (int i = numVecs / 2, j = 0; i < numVecs; i++, j++) {
|
||||
vecs[i] = vtype::mask_loadu(vtype::zmm_max(), ioMasks[j],
|
||||
arr + i * vtype::numlanes);
|
||||
}
|
||||
|
||||
/* Run the initial sorting network to sort the columns of the [numVecs x
|
||||
* num_lanes] matrix
|
||||
*/
|
||||
bitonic_sort_n_vec<vtype, numVecs>(vecs);
|
||||
|
||||
// Merge the vectors using bitonic merging networks
|
||||
merge_n_vec<vtype, numVecs>(vecs);
|
||||
|
||||
// Unmasked part of the store
|
||||
X86_SIMD_SORT_UNROLL_LOOP(64)
|
||||
for (int i = 0; i < numVecs / 2; i++) {
|
||||
vtype::storeu(arr + i * vtype::numlanes, vecs[i]);
|
||||
}
|
||||
// Masked part of the store
|
||||
X86_SIMD_SORT_UNROLL_LOOP(64)
|
||||
for (int i = numVecs / 2, j = 0; i < numVecs; i++, j++) {
|
||||
vtype::mask_storeu(arr + i * vtype::numlanes, ioMasks[j], vecs[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename vtype, int maxN>
|
||||
X86_SIMD_SORT_INLINE void sort_n(typename vtype::type_t *arr, int N) {
|
||||
constexpr int numVecs = maxN / vtype::numlanes;
|
||||
constexpr bool isMultiple = (maxN == (vtype::numlanes * numVecs));
|
||||
constexpr bool powerOfTwo = (numVecs != 0 && !(numVecs & (numVecs - 1)));
|
||||
static_assert(powerOfTwo == true && isMultiple == true,
|
||||
"maxN must be vtype::numlanes times a power of 2");
|
||||
|
||||
sort_n_vec<vtype, numVecs>(arr, N);
|
||||
}
|
||||
#endif
|
||||
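
A compile-time restatement (illustrative only; the 16-lane figure assumes 32-bit elements in a 512-bit register) of the constraint sort_n enforces through its static_assert: maxN must be a power-of-two multiple of the lane count.

#include <cstddef>

constexpr bool legal_maxN(std::size_t maxN, std::size_t numlanes) {
    std::size_t numVecs = maxN / numlanes;
    bool isMultiple = (maxN == numlanes * numVecs);
    bool powerOfTwo = (numVecs != 0) && !(numVecs & (numVecs - 1));
    return isMultiple && powerOfTwo;
}
static_assert(legal_maxN(256, 16), "16 vectors of 16 lanes is allowed");
static_assert(!legal_maxN(48, 16), "3 vectors is not a power of two");
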
src/java.base/linux/native/libsimdsort/xss-optimal-networks.hpp (new file, 342 lines)
@@ -0,0 +1,342 @@
|
||||
/*
|
||||
* Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
|
||||
* Copyright (c) 2021 Serge Sans Paille. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*
|
||||
*/
|
||||
|
||||
// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort) All of these sources
|
||||
// files are generated from the optimal networks described in
|
||||
// https://bertdobbelaere.github.io/sorting_networks.html
|
||||
|
||||
template <typename vtype, typename reg_t = typename vtype::reg_t>
|
||||
X86_SIMD_SORT_FINLINE void optimal_sort_4(reg_t *vecs) {
|
||||
COEX<vtype>(vecs[0], vecs[2]);
|
||||
COEX<vtype>(vecs[1], vecs[3]);
|
||||
|
||||
COEX<vtype>(vecs[0], vecs[1]);
|
||||
COEX<vtype>(vecs[2], vecs[3]);
|
||||
|
||||
COEX<vtype>(vecs[1], vecs[2]);
|
||||
}
|
||||
|
||||
template <typename vtype, typename reg_t = typename vtype::reg_t>
|
||||
X86_SIMD_SORT_FINLINE void optimal_sort_8(reg_t *vecs) {
|
||||
COEX<vtype>(vecs[0], vecs[2]);
|
||||
COEX<vtype>(vecs[1], vecs[3]);
|
||||
COEX<vtype>(vecs[4], vecs[6]);
|
||||
COEX<vtype>(vecs[5], vecs[7]);
|
||||
|
||||
COEX<vtype>(vecs[0], vecs[4]);
|
||||
COEX<vtype>(vecs[1], vecs[5]);
|
||||
COEX<vtype>(vecs[2], vecs[6]);
|
||||
COEX<vtype>(vecs[3], vecs[7]);
|
||||
|
||||
COEX<vtype>(vecs[0], vecs[1]);
|
||||
COEX<vtype>(vecs[2], vecs[3]);
|
||||
COEX<vtype>(vecs[4], vecs[5]);
|
||||
COEX<vtype>(vecs[6], vecs[7]);
|
||||
|
||||
COEX<vtype>(vecs[2], vecs[4]);
|
||||
COEX<vtype>(vecs[3], vecs[5]);
|
||||
|
||||
COEX<vtype>(vecs[1], vecs[4]);
|
||||
COEX<vtype>(vecs[3], vecs[6]);
|
||||
|
||||
COEX<vtype>(vecs[1], vecs[2]);
|
||||
COEX<vtype>(vecs[3], vecs[4]);
|
||||
COEX<vtype>(vecs[5], vecs[6]);
|
||||
}
|
||||
|
||||
template <typename vtype, typename reg_t = typename vtype::reg_t>
|
||||
X86_SIMD_SORT_FINLINE void optimal_sort_16(reg_t *vecs) {
|
||||
COEX<vtype>(vecs[0], vecs[13]);
|
||||
COEX<vtype>(vecs[1], vecs[12]);
|
||||
COEX<vtype>(vecs[2], vecs[15]);
|
||||
COEX<vtype>(vecs[3], vecs[14]);
|
||||
COEX<vtype>(vecs[4], vecs[8]);
|
||||
COEX<vtype>(vecs[5], vecs[6]);
|
||||
COEX<vtype>(vecs[7], vecs[11]);
|
||||
COEX<vtype>(vecs[9], vecs[10]);
|
||||
|
||||
COEX<vtype>(vecs[0], vecs[5]);
|
||||
COEX<vtype>(vecs[1], vecs[7]);
|
||||
COEX<vtype>(vecs[2], vecs[9]);
|
||||
COEX<vtype>(vecs[3], vecs[4]);
|
||||
COEX<vtype>(vecs[6], vecs[13]);
|
||||
COEX<vtype>(vecs[8], vecs[14]);
|
||||
COEX<vtype>(vecs[10], vecs[15]);
|
||||
COEX<vtype>(vecs[11], vecs[12]);
|
||||
|
||||
COEX<vtype>(vecs[0], vecs[1]);
|
||||
COEX<vtype>(vecs[2], vecs[3]);
|
||||
COEX<vtype>(vecs[4], vecs[5]);
|
||||
COEX<vtype>(vecs[6], vecs[8]);
|
||||
COEX<vtype>(vecs[7], vecs[9]);
|
||||
COEX<vtype>(vecs[10], vecs[11]);
|
||||
COEX<vtype>(vecs[12], vecs[13]);
|
||||
COEX<vtype>(vecs[14], vecs[15]);
|
||||
|
||||
COEX<vtype>(vecs[0], vecs[2]);
|
||||
COEX<vtype>(vecs[1], vecs[3]);
|
||||
COEX<vtype>(vecs[4], vecs[10]);
|
||||
COEX<vtype>(vecs[5], vecs[11]);
|
||||
COEX<vtype>(vecs[6], vecs[7]);
|
||||
COEX<vtype>(vecs[8], vecs[9]);
|
||||
COEX<vtype>(vecs[12], vecs[14]);
|
||||
COEX<vtype>(vecs[13], vecs[15]);
|
||||
|
||||
COEX<vtype>(vecs[1], vecs[2]);
|
||||
COEX<vtype>(vecs[3], vecs[12]);
|
||||
COEX<vtype>(vecs[4], vecs[6]);
|
||||
COEX<vtype>(vecs[5], vecs[7]);
|
||||
COEX<vtype>(vecs[8], vecs[10]);
|
||||
COEX<vtype>(vecs[9], vecs[11]);
|
||||
COEX<vtype>(vecs[13], vecs[14]);
|
||||
|
||||
COEX<vtype>(vecs[1], vecs[4]);
|
||||
COEX<vtype>(vecs[2], vecs[6]);
|
||||
COEX<vtype>(vecs[5], vecs[8]);
|
||||
COEX<vtype>(vecs[7], vecs[10]);
|
||||
COEX<vtype>(vecs[9], vecs[13]);
|
||||
COEX<vtype>(vecs[11], vecs[14]);
|
||||
|
||||
COEX<vtype>(vecs[2], vecs[4]);
|
||||
COEX<vtype>(vecs[3], vecs[6]);
|
||||
COEX<vtype>(vecs[9], vecs[12]);
|
||||
COEX<vtype>(vecs[11], vecs[13]);
|
||||
|
||||
COEX<vtype>(vecs[3], vecs[5]);
|
||||
COEX<vtype>(vecs[6], vecs[8]);
|
||||
COEX<vtype>(vecs[7], vecs[9]);
|
||||
COEX<vtype>(vecs[10], vecs[12]);
|
||||
|
||||
COEX<vtype>(vecs[3], vecs[4]);
|
||||
COEX<vtype>(vecs[5], vecs[6]);
|
||||
COEX<vtype>(vecs[7], vecs[8]);
|
||||
COEX<vtype>(vecs[9], vecs[10]);
|
||||
COEX<vtype>(vecs[11], vecs[12]);
|
||||
|
||||
COEX<vtype>(vecs[6], vecs[7]);
|
||||
COEX<vtype>(vecs[8], vecs[9]);
|
||||
}
|
||||
|
||||
template <typename vtype, typename reg_t = typename vtype::reg_t>
|
||||
X86_SIMD_SORT_FINLINE void optimal_sort_32(reg_t *vecs) {
|
||||
COEX<vtype>(vecs[0], vecs[1]);
|
||||
COEX<vtype>(vecs[2], vecs[3]);
|
||||
COEX<vtype>(vecs[4], vecs[5]);
|
||||
COEX<vtype>(vecs[6], vecs[7]);
|
||||
COEX<vtype>(vecs[8], vecs[9]);
|
||||
COEX<vtype>(vecs[10], vecs[11]);
|
||||
COEX<vtype>(vecs[12], vecs[13]);
|
||||
COEX<vtype>(vecs[14], vecs[15]);
|
||||
COEX<vtype>(vecs[16], vecs[17]);
|
||||
COEX<vtype>(vecs[18], vecs[19]);
|
||||
COEX<vtype>(vecs[20], vecs[21]);
|
||||
COEX<vtype>(vecs[22], vecs[23]);
|
||||
COEX<vtype>(vecs[24], vecs[25]);
|
||||
COEX<vtype>(vecs[26], vecs[27]);
|
||||
COEX<vtype>(vecs[28], vecs[29]);
|
||||
COEX<vtype>(vecs[30], vecs[31]);
|
||||
|
||||
COEX<vtype>(vecs[0], vecs[2]);
|
||||
COEX<vtype>(vecs[1], vecs[3]);
|
||||
COEX<vtype>(vecs[4], vecs[6]);
|
||||
COEX<vtype>(vecs[5], vecs[7]);
COEX<vtype>(vecs[8], vecs[10]);
COEX<vtype>(vecs[9], vecs[11]);
COEX<vtype>(vecs[12], vecs[14]);
COEX<vtype>(vecs[13], vecs[15]);
COEX<vtype>(vecs[16], vecs[18]);
COEX<vtype>(vecs[17], vecs[19]);
COEX<vtype>(vecs[20], vecs[22]);
COEX<vtype>(vecs[21], vecs[23]);
COEX<vtype>(vecs[24], vecs[26]);
COEX<vtype>(vecs[25], vecs[27]);
COEX<vtype>(vecs[28], vecs[30]);
COEX<vtype>(vecs[29], vecs[31]);

COEX<vtype>(vecs[0], vecs[4]);
COEX<vtype>(vecs[1], vecs[5]);
COEX<vtype>(vecs[2], vecs[6]);
COEX<vtype>(vecs[3], vecs[7]);
COEX<vtype>(vecs[8], vecs[12]);
COEX<vtype>(vecs[9], vecs[13]);
COEX<vtype>(vecs[10], vecs[14]);
COEX<vtype>(vecs[11], vecs[15]);
COEX<vtype>(vecs[16], vecs[20]);
COEX<vtype>(vecs[17], vecs[21]);
COEX<vtype>(vecs[18], vecs[22]);
COEX<vtype>(vecs[19], vecs[23]);
COEX<vtype>(vecs[24], vecs[28]);
COEX<vtype>(vecs[25], vecs[29]);
COEX<vtype>(vecs[26], vecs[30]);
COEX<vtype>(vecs[27], vecs[31]);

COEX<vtype>(vecs[0], vecs[8]);
COEX<vtype>(vecs[1], vecs[9]);
COEX<vtype>(vecs[2], vecs[10]);
COEX<vtype>(vecs[3], vecs[11]);
COEX<vtype>(vecs[4], vecs[12]);
COEX<vtype>(vecs[5], vecs[13]);
COEX<vtype>(vecs[6], vecs[14]);
COEX<vtype>(vecs[7], vecs[15]);
COEX<vtype>(vecs[16], vecs[24]);
COEX<vtype>(vecs[17], vecs[25]);
COEX<vtype>(vecs[18], vecs[26]);
COEX<vtype>(vecs[19], vecs[27]);
COEX<vtype>(vecs[20], vecs[28]);
COEX<vtype>(vecs[21], vecs[29]);
COEX<vtype>(vecs[22], vecs[30]);
COEX<vtype>(vecs[23], vecs[31]);

COEX<vtype>(vecs[0], vecs[16]);
COEX<vtype>(vecs[1], vecs[8]);
COEX<vtype>(vecs[2], vecs[4]);
COEX<vtype>(vecs[3], vecs[12]);
COEX<vtype>(vecs[5], vecs[10]);
COEX<vtype>(vecs[6], vecs[9]);
COEX<vtype>(vecs[7], vecs[14]);
COEX<vtype>(vecs[11], vecs[13]);
COEX<vtype>(vecs[15], vecs[31]);
COEX<vtype>(vecs[17], vecs[24]);
COEX<vtype>(vecs[18], vecs[20]);
COEX<vtype>(vecs[19], vecs[28]);
COEX<vtype>(vecs[21], vecs[26]);
COEX<vtype>(vecs[22], vecs[25]);
COEX<vtype>(vecs[23], vecs[30]);
COEX<vtype>(vecs[27], vecs[29]);

COEX<vtype>(vecs[1], vecs[2]);
COEX<vtype>(vecs[3], vecs[5]);
COEX<vtype>(vecs[4], vecs[8]);
COEX<vtype>(vecs[6], vecs[22]);
COEX<vtype>(vecs[7], vecs[11]);
COEX<vtype>(vecs[9], vecs[25]);
COEX<vtype>(vecs[10], vecs[12]);
COEX<vtype>(vecs[13], vecs[14]);
COEX<vtype>(vecs[17], vecs[18]);
COEX<vtype>(vecs[19], vecs[21]);
COEX<vtype>(vecs[20], vecs[24]);
COEX<vtype>(vecs[23], vecs[27]);
COEX<vtype>(vecs[26], vecs[28]);
COEX<vtype>(vecs[29], vecs[30]);

COEX<vtype>(vecs[1], vecs[17]);
COEX<vtype>(vecs[2], vecs[18]);
COEX<vtype>(vecs[3], vecs[19]);
COEX<vtype>(vecs[4], vecs[20]);
COEX<vtype>(vecs[5], vecs[10]);
COEX<vtype>(vecs[7], vecs[23]);
COEX<vtype>(vecs[8], vecs[24]);
COEX<vtype>(vecs[11], vecs[27]);
COEX<vtype>(vecs[12], vecs[28]);
COEX<vtype>(vecs[13], vecs[29]);
COEX<vtype>(vecs[14], vecs[30]);
COEX<vtype>(vecs[21], vecs[26]);

COEX<vtype>(vecs[3], vecs[17]);
COEX<vtype>(vecs[4], vecs[16]);
COEX<vtype>(vecs[5], vecs[21]);
COEX<vtype>(vecs[6], vecs[18]);
COEX<vtype>(vecs[7], vecs[9]);
COEX<vtype>(vecs[8], vecs[20]);
COEX<vtype>(vecs[10], vecs[26]);
COEX<vtype>(vecs[11], vecs[23]);
COEX<vtype>(vecs[13], vecs[25]);
COEX<vtype>(vecs[14], vecs[28]);
COEX<vtype>(vecs[15], vecs[27]);
COEX<vtype>(vecs[22], vecs[24]);

COEX<vtype>(vecs[1], vecs[4]);
COEX<vtype>(vecs[3], vecs[8]);
COEX<vtype>(vecs[5], vecs[16]);
COEX<vtype>(vecs[7], vecs[17]);
COEX<vtype>(vecs[9], vecs[21]);
COEX<vtype>(vecs[10], vecs[22]);
COEX<vtype>(vecs[11], vecs[19]);
COEX<vtype>(vecs[12], vecs[20]);
COEX<vtype>(vecs[14], vecs[24]);
COEX<vtype>(vecs[15], vecs[26]);
COEX<vtype>(vecs[23], vecs[28]);
COEX<vtype>(vecs[27], vecs[30]);

COEX<vtype>(vecs[2], vecs[5]);
COEX<vtype>(vecs[7], vecs[8]);
COEX<vtype>(vecs[9], vecs[18]);
COEX<vtype>(vecs[11], vecs[17]);
COEX<vtype>(vecs[12], vecs[16]);
COEX<vtype>(vecs[13], vecs[22]);
COEX<vtype>(vecs[14], vecs[20]);
COEX<vtype>(vecs[15], vecs[19]);
COEX<vtype>(vecs[23], vecs[24]);
COEX<vtype>(vecs[26], vecs[29]);

COEX<vtype>(vecs[2], vecs[4]);
COEX<vtype>(vecs[6], vecs[12]);
COEX<vtype>(vecs[9], vecs[16]);
COEX<vtype>(vecs[10], vecs[11]);
COEX<vtype>(vecs[13], vecs[17]);
COEX<vtype>(vecs[14], vecs[18]);
COEX<vtype>(vecs[15], vecs[22]);
COEX<vtype>(vecs[19], vecs[25]);
COEX<vtype>(vecs[20], vecs[21]);
COEX<vtype>(vecs[27], vecs[29]);

COEX<vtype>(vecs[5], vecs[6]);
COEX<vtype>(vecs[8], vecs[12]);
COEX<vtype>(vecs[9], vecs[10]);
COEX<vtype>(vecs[11], vecs[13]);
COEX<vtype>(vecs[14], vecs[16]);
COEX<vtype>(vecs[15], vecs[17]);
COEX<vtype>(vecs[18], vecs[20]);
COEX<vtype>(vecs[19], vecs[23]);
COEX<vtype>(vecs[21], vecs[22]);
COEX<vtype>(vecs[25], vecs[26]);

COEX<vtype>(vecs[3], vecs[5]);
COEX<vtype>(vecs[6], vecs[7]);
COEX<vtype>(vecs[8], vecs[9]);
COEX<vtype>(vecs[10], vecs[12]);
COEX<vtype>(vecs[11], vecs[14]);
COEX<vtype>(vecs[13], vecs[16]);
COEX<vtype>(vecs[15], vecs[18]);
COEX<vtype>(vecs[17], vecs[20]);
COEX<vtype>(vecs[19], vecs[21]);
COEX<vtype>(vecs[22], vecs[23]);
COEX<vtype>(vecs[24], vecs[25]);
COEX<vtype>(vecs[26], vecs[28]);

COEX<vtype>(vecs[3], vecs[4]);
COEX<vtype>(vecs[5], vecs[6]);
COEX<vtype>(vecs[7], vecs[8]);
COEX<vtype>(vecs[9], vecs[10]);
COEX<vtype>(vecs[11], vecs[12]);
COEX<vtype>(vecs[13], vecs[14]);
COEX<vtype>(vecs[15], vecs[16]);
COEX<vtype>(vecs[17], vecs[18]);
COEX<vtype>(vecs[19], vecs[20]);
COEX<vtype>(vecs[21], vecs[22]);
COEX<vtype>(vecs[23], vecs[24]);
COEX<vtype>(vecs[25], vecs[26]);
COEX<vtype>(vecs[27], vecs[28]);
}
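Each COEX<vtype>(a, b) call above is a vector compare-exchange: after it, a holds the lane-wise minima and b the lane-wise maxima, so the block as a whole forms a sorting network over 32 registers. For orientation only, a scalar sketch of the same primitive (Java, not part of the change):

// Illustrative sketch: scalar analogue of the SIMD COEX primitive.
public class CoexSketch {
    // Compare-exchange: order the pair (v[i], v[j]) so that v[i] <= v[j].
    static void coex(int[] v, int i, int j) {
        if (v[i] > v[j]) {
            int tmp = v[i];
            v[i] = v[j];
            v[j] = tmp;
        }
    }

    public static void main(String[] args) {
        int[] v = {3, 1, 2, 0};
        // A tiny 4-element sorting network, same shape as the vector version.
        coex(v, 0, 2); coex(v, 1, 3);
        coex(v, 0, 1); coex(v, 2, 3);
        coex(v, 1, 2);
        System.out.println(java.util.Arrays.toString(v)); // [0, 1, 2, 3]
    }
}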
@@ -0,0 +1,88 @@
/*
* Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
* Copyright (c) 2021 Serge Sans Paille. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/

// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort)

template <typename vtype, typename mm_t>
X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b);

template <typename vtype, typename type_t>
X86_SIMD_SORT_INLINE type_t get_pivot(type_t *arr, const arrsize_t left,
const arrsize_t right) {
using reg_t = typename vtype::reg_t;
type_t samples[vtype::numlanes];
arrsize_t delta = (right - left) / vtype::numlanes;
for (int i = 0; i < vtype::numlanes; i++) {
samples[i] = arr[left + i * delta];
}
reg_t rand_vec = vtype::loadu(samples);
reg_t sort = vtype::sort_vec(rand_vec);

return ((type_t *)&sort)[vtype::numlanes / 2];
}

template <typename vtype, typename type_t>
X86_SIMD_SORT_INLINE type_t get_pivot_blocks(type_t *arr, const arrsize_t left,
const arrsize_t right) {
if (right - left <= 1024) {
return get_pivot<vtype>(arr, left, right);
}

using reg_t = typename vtype::reg_t;
constexpr int numVecs = 5;

arrsize_t width = (right - vtype::numlanes) - left;
arrsize_t delta = width / numVecs;

reg_t vecs[numVecs];
// Load data
for (int i = 0; i < numVecs; i++) {
vecs[i] = vtype::loadu(arr + left + delta * i);
}

// Implement sorting network (from
// https://bertdobbelaere.github.io/sorting_networks.html)
COEX<vtype>(vecs[0], vecs[3]);
COEX<vtype>(vecs[1], vecs[4]);

COEX<vtype>(vecs[0], vecs[2]);
COEX<vtype>(vecs[1], vecs[3]);

COEX<vtype>(vecs[0], vecs[1]);
COEX<vtype>(vecs[2], vecs[4]);

COEX<vtype>(vecs[1], vecs[2]);
COEX<vtype>(vecs[3], vecs[4]);

COEX<vtype>(vecs[2], vecs[3]);

// Calculate median of the middle vector
reg_t &vec = vecs[numVecs / 2];
vec = vtype::sort_vec(vec);

type_t data[vtype::numlanes];
vtype::storeu(data, vec);
return data[vtype::numlanes / 2];
}
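Stripped of the SIMD details, get_pivot and get_pivot_blocks do the same thing: sample the range at evenly spaced positions, sort the samples, and use their median as the quicksort pivot. A minimal scalar sketch of that idea (Java; the sample count of 8 stands in for vtype::numlanes, and all names here are illustrative):

import java.util.Arrays;

// Illustrative sketch only, not the JDK/x86-simd-sort implementation.
final class PivotSketch {
    // Pick a pivot for arr[left, right) by sampling 8 evenly spaced elements
    // and taking the median of the sorted samples.
    static int medianOfSamples(int[] arr, int left, int right) {
        final int samples = 8;                  // analogous to vtype::numlanes
        int delta = (right - left) / samples;   // spacing between samples
        int[] s = new int[samples];
        for (int i = 0; i < samples; i++) {
            s[i] = arr[left + i * delta];
        }
        Arrays.sort(s);                         // stands in for vtype::sort_vec
        return s[samples / 2];
    }

    public static void main(String[] args) {
        int[] data = {9, 1, 7, 3, 8, 2, 6, 4, 5, 0, 11, 10, 13, 12, 15, 14};
        System.out.println(medianOfSamples(data, 0, data.length)); // 9
    }
}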
@@ -42,15 +42,10 @@ import static java.lang.String.LATIN1;

final class StringUTF16 {

// Return a new byte array for a UTF16-coded string for len chars
// Throw an exception if out of range
public static byte[] newBytesFor(int len) {
if (len < 0) {
throw new NegativeArraySizeException();
}
if (len > MAX_LENGTH) {
throw new OutOfMemoryError("UTF16 String size is " + len +
", should be less than " + MAX_LENGTH);
}
return new byte[len << 1];
return new byte[newBytesLength(len)];
}

// Check the size of a UTF16-coded string
@@ -59,7 +54,7 @@ final class StringUTF16 {
if (len < 0) {
throw new NegativeArraySizeException();
}
if (len > MAX_LENGTH) {
if (len >= MAX_LENGTH) {
throw new OutOfMemoryError("UTF16 String size is " + len +
", should be less than " + MAX_LENGTH);
}
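The second hunk tightens the bound from > to >= so the check finally agrees with its own error message ("should be less than " + MAX_LENGTH): a length exactly equal to MAX_LENGTH is now rejected as well. A standalone sketch of the resulting guard pattern (illustrative only, with the bound passed in rather than taken from StringUTF16):

// Illustrative sketch of the guard pattern after the change; not JDK source.
public class Utf16SizeGuard {
    static byte[] newBytesForSketch(int len, int maxLength) {
        if (len < 0) {
            throw new NegativeArraySizeException();
        }
        if (len >= maxLength) { // '>=' so len == maxLength is rejected, matching the message
            throw new OutOfMemoryError("UTF16 String size is " + len
                    + ", should be less than " + maxLength);
        }
        return new byte[len << 1]; // two bytes per UTF-16 code unit
    }

    public static void main(String[] args) {
        System.out.println(newBytesForSketch(4, 8).length); // 8
    }
}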
@@ -1647,7 +1647,7 @@ public class Thread implements Runnable {
* interrupt the wait.
* For more information, see
* <a href="{@docRoot}/java.base/java/lang/doc-files/threadPrimitiveDeprecation.html">Why
* are Thread.stop, Thread.suspend and Thread.resume Deprecated?</a>.
* is Thread.stop deprecated and the ability to stop a thread removed?</a>.
*/
@Deprecated(since="1.2", forRemoval=true)
public final void stop() {
@@ -1788,44 +1788,6 @@ public class Thread implements Runnable {
return eetop != 0;
}

/**
* Throws {@code UnsupportedOperationException}.
*
* @throws UnsupportedOperationException always
*
* @deprecated This method was originally specified to suspend a thread.
* It was inherently deadlock-prone. If the target thread held a lock on
* a monitor protecting a critical system resource when it was suspended,
* no thread could access the resource until the target thread was resumed.
* If the thread intending to resume the target thread attempted to lock
* the monitor prior to calling {@code resume}, deadlock would result.
* Such deadlocks typically manifested themselves as "frozen" processes.
* For more information, see
* <a href="{@docRoot}/java.base/java/lang/doc-files/threadPrimitiveDeprecation.html">Why
* are Thread.stop, Thread.suspend and Thread.resume Deprecated?</a>.
*/
@Deprecated(since="1.2", forRemoval=true)
public final void suspend() {
throw new UnsupportedOperationException();
}

/**
* Throws {@code UnsupportedOperationException}.
*
* @throws UnsupportedOperationException always
*
* @deprecated This method was originally specified to resume a thread
* suspended with {@link #suspend()}. Suspending a thread was
* inherently deadlock-prone.
* For more information, see
* <a href="{@docRoot}/java.base/java/lang/doc-files/threadPrimitiveDeprecation.html">Why
* are Thread.stop, Thread.suspend and Thread.resume Deprecated?</a>.
*/
@Deprecated(since="1.2", forRemoval=true)
public final void resume() {
throw new UnsupportedOperationException();
}

/**
* Changes the priority of this thread.
*
@@ -1,5 +1,5 @@
/*
* Copyright (c) 1995, 2022, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1995, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -554,17 +554,6 @@ public class ThreadGroup implements Thread.UncaughtExceptionHandler {
return i;
}

/**
* Throws {@code UnsupportedOperationException}.
*
* @deprecated This method was originally specified to stop all threads in
* the thread group. It was inherently unsafe.
*/
@Deprecated(since="1.2", forRemoval=true)
public final void stop() {
throw new UnsupportedOperationException();
}

/**
* Interrupts all {@linkplain Thread#isAlive() live} platform threads in
* this thread group and its subgroups.
@@ -587,28 +576,6 @@ public class ThreadGroup implements Thread.UncaughtExceptionHandler {
}
}

/**
* Throws {@code UnsupportedOperationException}.
*
* @deprecated This method was originally specified to suspend all threads
* in the thread group.
*/
@Deprecated(since="1.2", forRemoval=true)
public final void suspend() {
throw new UnsupportedOperationException();
}

/**
* Throws {@code UnsupportedOperationException}.
*
* @deprecated This method was originally specified to resume all threads
* in the thread group.
*/
@Deprecated(since="1.2", forRemoval=true)
public final void resume() {
throw new UnsupportedOperationException();
}

/**
* Does nothing.
*
@@ -1475,6 +1475,9 @@ public sealed interface ClassFile
/** The class major version of JAVA_22. */
int JAVA_22_VERSION = 66;

/** 67 */
int JAVA_23_VERSION = 67;

/**
* A minor version number indicating a class uses preview features
* of a Java SE version since 12, for major versions {@value
@@ -1486,7 +1489,7 @@ public sealed interface ClassFile
* {@return the latest major Java version}
*/
static int latestMajorVersion() {
return JAVA_22_VERSION;
return JAVA_23_VERSION;
}

/**
@@ -202,7 +202,7 @@ public sealed interface ModuleAttribute
}

/**
* Sets the module flags
* Sets the module version
* @param version the module version
* @return this builder
*/
@@ -1,6 +1,6 @@
<!doctype html>
<!--
Copyright (c) 2005, 2022, Oracle and/or its affiliates. All rights reserved.
Copyright (c) 2005, 2023, Oracle and/or its affiliates. All rights reserved.
DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.

This code is free software; you can redistribute it and/or modify it
@@ -158,173 +158,5 @@ wouldn't respond to <code>Thread.stop</code> either.</em> Such
cases include deliberate denial-of-service attacks, and I/O
operations for which thread.stop and thread.interrupt do not work
properly.</p>
<hr>
<h2>Why are <code>Thread.suspend</code> and
<code>Thread.resume</code> deprecated and the ability to suspend or
resume a thread removed?</h2>
<p><code>Thread.suspend</code> was inherently deadlock-prone. If the
target thread held a lock on a monitor protecting a critical
system resource when it is suspended, no thread could access the
resource until the target thread was resumed. If the thread intending
to resume the target thread attempted to lock the monitor prior
to calling <code>resume</code>, deadlock resulted. Such deadlocks
typically manifest themselves as "frozen" processes.</p>
<hr>
<h2>What should I use instead of <code>Thread.suspend</code> and
<code>Thread.resume</code>?</h2>
<p>As with <code>Thread.stop</code>, the prudent approach is to
have the "target thread" poll a variable indicating the desired
state of the thread (active or suspended). When the desired state
is suspended, the thread waits using <code>Object.wait</code>. When
the thread is resumed, the target thread is notified using
<code>Object.notify</code>.</p>
<p>For example, suppose your applet contains the following
mousePressed event handler, which toggles the state of a thread
called <code>blinker</code>:</p>
<pre>
private boolean threadSuspended;

public void mousePressed(MouseEvent e) {
e.consume();

if (threadSuspended)
blinker.resume();
else
blinker.suspend(); // DEADLOCK-PRONE!

threadSuspended = !threadSuspended;
}
</pre>
You can avoid the use of <code>Thread.suspend</code> and
<code>Thread.resume</code> by replacing the event handler above
with:
<pre>
public synchronized void mousePressed(MouseEvent e) {
e.consume();

threadSuspended = !threadSuspended;

if (!threadSuspended)
notify();
}
</pre>
and adding the following code to the "run loop":
<pre>
synchronized(this) {
while (threadSuspended)
wait();
}
</pre>
The <code>wait</code> method throws the
<code>InterruptedException</code>, so it must be inside a <code>try
... catch</code> clause. It's fine to put it in the same clause as
the <code>sleep</code>. The check should follow (rather than
precede) the <code>sleep</code> so the window is immediately
repainted when the thread is "resumed." The resulting
<code>run</code> method follows:
<pre>
public void run() {
while (true) {
try {
Thread.sleep(interval);

synchronized(this) {
while (threadSuspended)
wait();
}
} catch (InterruptedException e){
}
repaint();
}
}
</pre>
Note that the <code>notify</code> in the <code>mousePressed</code>
method and the <code>wait</code> in the <code>run</code> method are
inside <code>synchronized</code> blocks. This is required by the
language, and ensures that <code>wait</code> and
<code>notify</code> are properly serialized. In practical terms,
this eliminates race conditions that could cause the "suspended"
thread to miss a <code>notify</code> and remain suspended
indefinitely.
<p>While the cost of synchronization in Java is decreasing as the
platform matures, it will never be free. A simple trick can be used
to remove the synchronization that we've added to each iteration of
the "run loop." The synchronized block that was added is replaced
by a slightly more complex piece of code that enters a synchronized
block only if the thread has actually been suspended:</p>
<pre>
if (threadSuspended) {
synchronized(this) {
while (threadSuspended)
wait();
}
}
</pre>
<p>In the absence of explicit synchronization,
<code>threadSuspended</code> must be made <code>volatile</code> to ensure
prompt communication of the suspend-request.</p>
The resulting <code>run</code> method is:
<pre>
private volatile boolean threadSuspended;

public void run() {
while (true) {
try {
Thread.sleep(interval);

if (threadSuspended) {
synchronized(this) {
while (threadSuspended)
wait();
}
}
} catch (InterruptedException e){
}
repaint();
}
}
</pre>
<hr>
<h2>Can I combine the two techniques to produce a thread that may
be safely "stopped" or "suspended"?</h2>
Yes, it's reasonably straightforward. The one subtlety is that the
target thread may already be suspended at the time that another
thread tries to stop it. If the <code>stop</code> method merely sets
the state variable (<code>blinker</code>) to null, the target thread
will remain suspended (waiting on the monitor), rather than exiting
gracefully as it should. If the applet is restarted, multiple
threads could end up waiting on the monitor at the same time,
resulting in erratic behavior.
<p>To rectify this situation, the <code>stop</code> method must ensure
that the target thread resumes immediately if it is suspended. Once
the target thread resumes, it must recognize immediately that it
has been stopped, and exit gracefully. Here's how the resulting
<code>run</code> and <code>stop</code> methods look:</p>
<pre>
public void run() {
Thread thisThread = Thread.currentThread();
while (blinker == thisThread) {
try {
Thread.sleep(interval);

synchronized(this) {
while (threadSuspended && blinker==thisThread)
wait();
}
} catch (InterruptedException e){
}
repaint();
}
}

public synchronized void stop() {
blinker = null;
notify();
}
</pre>
If the <code>stop</code> method calls <code>Thread.interrupt</code>, as
described above, it needn't call <code>notify</code> as well, but it
still must be synchronized. This ensures that the target thread
won't miss an interrupt due to a race condition.
</body>
</html>
@@ -631,6 +631,9 @@ public sealed interface MemoryLayout
* <li>The accessed memory segment must be
* {@link MemorySegment#isAccessibleBy(Thread) accessible} from the thread
* performing the access operation, or a {@link WrongThreadException} is thrown.</li>
* <li>For write operations, the accessed memory segment must not be
* {@link MemorySegment#isReadOnly() read only}, or an
* {@link IllegalArgumentException} is thrown.</li>
* <li>The {@linkplain MemorySegment#scope() scope} associated with the accessed
* segment must be {@linkplain MemorySegment.Scope#isAlive() alive}, or an
* {@link IllegalStateException} is thrown.</li>
@@ -869,7 +869,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* this segment is not {@linkplain Scope#isAlive() alive}
* @throws WrongThreadException if this method is called from a thread {@code T},
* such that {@code isAccessibleBy(T) == false}
* @throws UnsupportedOperationException if this segment is
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
*/
MemorySegment fill(byte value);
@@ -894,7 +894,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* {@code src} is not {@linkplain Scope#isAlive() alive}
* @throws WrongThreadException if this method is called from a thread {@code T},
* such that {@code src.isAccessibleBy(T) == false}
* @throws UnsupportedOperationException if this segment is
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
* @return this segment
*/
@@ -1269,6 +1269,8 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* this segment is not {@linkplain Scope#isAlive() alive}
* @throws WrongThreadException if this method is called from a thread {@code T},
* such that {@code isAccessibleBy(T) == false}
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
*/
void setString(long offset, String str);

@@ -1306,6 +1308,8 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* such that {@code isAccessibleBy(T) == false}
* @throws IllegalArgumentException if {@code charset} is not a
* {@linkplain StandardCharsets standard charset}
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
*/
void setString(long offset, String str, Charset charset);

@@ -1493,7 +1497,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* @throws IndexOutOfBoundsException if {@code dstOffset > dstSegment.byteSize() - bytes}
* @throws IndexOutOfBoundsException if either {@code srcOffset},
* {@code dstOffset} or {@code bytes} are {@code < 0}
* @throws UnsupportedOperationException if {@code dstSegment} is
* @throws IllegalArgumentException if {@code dstSegment} is
* {@linkplain #isReadOnly() read-only}
*/
@ForceInline
@@ -1552,7 +1556,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* {@code dstSegment} is not {@linkplain Scope#isAlive() alive}
* @throws WrongThreadException if this method is called from a thread {@code T},
* such that {@code dstSegment.isAccessibleBy(T) == false}
* @throws UnsupportedOperationException if {@code dstSegment} is {@linkplain #isReadOnly() read-only}
* @throws IllegalArgumentException if {@code dstSegment} is {@linkplain #isReadOnly() read-only}
* @throws IndexOutOfBoundsException if {@code elementCount * srcLayout.byteSize()} overflows
* @throws IndexOutOfBoundsException if {@code elementCount * dtsLayout.byteSize()} overflows
* @throws IndexOutOfBoundsException if {@code srcOffset > srcSegment.byteSize() - (elementCount * srcLayout.byteSize())}
@@ -1605,7 +1609,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* <a href="MemorySegment.html#segment-alignment">incompatible with the alignment constraint</a>
* in the provided layout
* @throws IndexOutOfBoundsException if {@code offset > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
*/
void set(ValueLayout.OfByte layout, long offset, byte value);
@@ -1643,7 +1647,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* <a href="MemorySegment.html#segment-alignment">incompatible with the alignment constraint</a>
* in the provided layout
* @throws IndexOutOfBoundsException if {@code offset > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
*/
void set(ValueLayout.OfBoolean layout, long offset, boolean value);
@@ -1681,7 +1685,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* <a href="MemorySegment.html#segment-alignment">incompatible with the alignment constraint</a>
* in the provided layout
* @throws IndexOutOfBoundsException if {@code offset > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
*/
void set(ValueLayout.OfChar layout, long offset, char value);
@@ -1719,7 +1723,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* <a href="MemorySegment.html#segment-alignment">incompatible with the alignment constraint</a>
* in the provided layout
* @throws IndexOutOfBoundsException if {@code offset > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
*/
void set(ValueLayout.OfShort layout, long offset, short value);
@@ -1757,7 +1761,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* <a href="MemorySegment.html#segment-alignment">incompatible with the alignment constraint</a>
* in the provided layout
* @throws IndexOutOfBoundsException if {@code offset > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
*/
void set(ValueLayout.OfInt layout, long offset, int value);
@@ -1795,7 +1799,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* <a href="MemorySegment.html#segment-alignment">incompatible with the alignment constraint</a>
* in the provided layout
* @throws IndexOutOfBoundsException if {@code offset > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
*/
void set(ValueLayout.OfFloat layout, long offset, float value);
@@ -1833,7 +1837,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* <a href="MemorySegment.html#segment-alignment">incompatible with the alignment constraint</a>
* in the provided layout
* @throws IndexOutOfBoundsException if {@code offset > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
*/
void set(ValueLayout.OfLong layout, long offset, long value);
@@ -1871,7 +1875,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* <a href="MemorySegment.html#segment-alignment">incompatible with the alignment constraint</a>
* in the provided layout
* @throws IndexOutOfBoundsException if {@code offset > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
*/
void set(ValueLayout.OfDouble layout, long offset, double value);
@@ -1921,8 +1925,10 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* @throws IndexOutOfBoundsException if {@code offset > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is
* {@linkplain #isReadOnly() read-only}
* @throws UnsupportedOperationException if {@code value} is not a
* @throws IllegalArgumentException if {@code value} is not a
* {@linkplain #isNative() native} segment
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
*/
void set(AddressLayout layout, long offset, MemorySegment value);

@@ -2055,7 +2061,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* @throws IllegalArgumentException if {@code layout.byteAlignment() > layout.byteSize()}
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize()} overflows
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize() > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is {@linkplain #isReadOnly() read-only}
* @throws IllegalArgumentException if this segment is {@linkplain #isReadOnly() read-only}
*/
void setAtIndex(ValueLayout.OfByte layout, long index, byte value);

@@ -2078,7 +2084,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* @throws IllegalArgumentException if {@code layout.byteAlignment() > layout.byteSize()}
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize()} overflows
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize() > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is {@linkplain #isReadOnly() read-only}
* @throws IllegalArgumentException if this segment is {@linkplain #isReadOnly() read-only}
*/
void setAtIndex(ValueLayout.OfBoolean layout, long index, boolean value);

@@ -2101,7 +2107,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* @throws IllegalArgumentException if {@code layout.byteAlignment() > layout.byteSize()}
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize()} overflows
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize() > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is {@linkplain #isReadOnly() read-only}
* @throws IllegalArgumentException if this segment is {@linkplain #isReadOnly() read-only}
*/
void setAtIndex(ValueLayout.OfShort layout, long index, short value);

@@ -2146,7 +2152,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* @throws IllegalArgumentException if {@code layout.byteAlignment() > layout.byteSize()}
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize()} overflows
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize() > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is {@linkplain #isReadOnly() read-only}
* @throws IllegalArgumentException if this segment is {@linkplain #isReadOnly() read-only}
*/
void setAtIndex(ValueLayout.OfInt layout, long index, int value);

@@ -2191,7 +2197,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* @throws IllegalArgumentException if {@code layout.byteAlignment() > layout.byteSize()}
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize()} overflows
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize() > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is {@linkplain #isReadOnly() read-only}
* @throws IllegalArgumentException if this segment is {@linkplain #isReadOnly() read-only}
*/
void setAtIndex(ValueLayout.OfFloat layout, long index, float value);

@@ -2236,7 +2242,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* @throws IllegalArgumentException if {@code layout.byteAlignment() > layout.byteSize()}
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize()} overflows
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize() > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is {@linkplain #isReadOnly() read-only}
* @throws IllegalArgumentException if this segment is {@linkplain #isReadOnly() read-only}
*/
void setAtIndex(ValueLayout.OfLong layout, long index, long value);

@@ -2281,7 +2287,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* @throws IllegalArgumentException if {@code layout.byteAlignment() > layout.byteSize()}
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize()} overflows
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize() > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is {@linkplain #isReadOnly() read-only}
* @throws IllegalArgumentException if this segment is {@linkplain #isReadOnly() read-only}
*/
void setAtIndex(ValueLayout.OfDouble layout, long index, double value);

@@ -2336,7 +2342,9 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize()} overflows
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize() > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is {@linkplain #isReadOnly() read-only}
* @throws UnsupportedOperationException if {@code value} is not a {@linkplain #isNative() native} segment
* @throws IllegalArgumentException if {@code value} is not a {@linkplain #isNative() native} segment
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
*/
void setAtIndex(AddressLayout layout, long index, MemorySegment value);

@@ -2460,7 +2468,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* <a href="MemorySegment.html#segment-alignment">incompatible with the alignment constraint</a>
* in the source element layout
* @throws IllegalArgumentException if {@code dstLayout.byteAlignment() > dstLayout.byteSize()}
* @throws UnsupportedOperationException if {@code dstSegment} is {@linkplain #isReadOnly() read-only}
* @throws IllegalArgumentException if {@code dstSegment} is {@linkplain #isReadOnly() read-only}
* @throws IndexOutOfBoundsException if {@code elementCount * dstLayout.byteSize()} overflows
* @throws IndexOutOfBoundsException if {@code dstOffset > dstSegment.byteSize() - (elementCount * dstLayout.byteSize())}
* @throws IndexOutOfBoundsException if {@code srcIndex > srcArray.length - elementCount}
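The common thread in these MemorySegment hunks is a single spec change: writing through a read-only segment is now documented to throw IllegalArgumentException rather than UnsupportedOperationException. A small self-contained sketch of the access pattern being re-specified (java.lang.foreign as of JDK 22+; the exception type named in the comment follows the updated @throws clauses above):

import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;

// Illustrative sketch, not JDK test code.
public class ReadOnlySegmentDemo {
    public static void main(String[] args) {
        try (Arena arena = Arena.ofConfined()) {
            MemorySegment segment = arena.allocate(ValueLayout.JAVA_INT);
            segment.set(ValueLayout.JAVA_INT, 0, 42);        // writable segment: fine

            MemorySegment readOnly = segment.asReadOnlySegment();
            try {
                readOnly.set(ValueLayout.JAVA_INT, 0, 7);    // write to a read-only view
            } catch (RuntimeException e) {
                // IllegalArgumentException per the updated javadoc
                // (UnsupportedOperationException in earlier releases).
                System.out.println(e.getClass().getSimpleName());
            }
        }
    }
}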
@@ -350,7 +350,7 @@ public interface SegmentAllocator {
*
* @param layout the layout of the block of memory to be allocated
* @param value the value to be set in the newly allocated memory segment
* @throws UnsupportedOperationException if {@code value} is not
* @throws IllegalArgumentException if {@code value} is not
* a {@linkplain MemorySegment#isNative() native} segment
*/
default MemorySegment allocateFrom(AddressLayout layout, MemorySegment value) {
@@ -670,9 +670,11 @@ public interface SegmentAllocator {
*
* @param segment the segment from which the returned allocator should slice from
* @return a new slicing allocator
* @throws IllegalArgumentException if the {@code segment} is
* {@linkplain MemorySegment#isReadOnly() read-only}
*/
static SegmentAllocator slicingAllocator(MemorySegment segment) {
Objects.requireNonNull(segment);
assertWritable(segment);
return new SlicingAllocator(segment);
}

@@ -700,9 +702,19 @@ public interface SegmentAllocator {
* @param segment the memory segment to be recycled by the returned allocator
* @return an allocator that recycles an existing segment upon each new
* allocation request
* @throws IllegalArgumentException if the {@code segment} is
* {@linkplain MemorySegment#isReadOnly() read-only}
*/
static SegmentAllocator prefixAllocator(MemorySegment segment) {
return (AbstractMemorySegmentImpl)Objects.requireNonNull(segment);
assertWritable(segment);
return (AbstractMemorySegmentImpl)segment;
}

private static void assertWritable(MemorySegment segment) {
// Implicit null check
if (segment.isReadOnly()) {
throw new IllegalArgumentException("read-only segment");
}
}

@ForceInline
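With assertWritable in place, both factories now reject a read-only backing segment eagerly, at construction time. A short usage sketch (illustrative only; assumes the java.lang.foreign API as in JDK 22+):

import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.SegmentAllocator;
import java.lang.foreign.ValueLayout;

// Illustrative sketch, not JDK test code.
public class SlicingAllocatorDemo {
    public static void main(String[] args) {
        try (Arena arena = Arena.ofConfined()) {
            MemorySegment backing = arena.allocate(64);
            SegmentAllocator allocator = SegmentAllocator.slicingAllocator(backing);
            MemorySegment a = allocator.allocate(ValueLayout.JAVA_LONG); // first 8 bytes
            MemorySegment b = allocator.allocate(ValueLayout.JAVA_LONG); // next 8 bytes
            System.out.println(a.byteSize() + " + " + b.byteSize());

            try {
                SegmentAllocator.slicingAllocator(backing.asReadOnlySegment());
            } catch (IllegalArgumentException e) {
                // Rejected up front, per the @throws clause added in the hunk above.
                System.out.println("read-only backing segment rejected");
            }
        }
    }
}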
@@ -1841,7 +1841,7 @@ public class MethodHandles {
* <a href="MethodHandles.Lookup.html#secmgr">refuses access</a>
* @throws NullPointerException if {@code bytes} is {@code null}
* @since 9
* @see Lookup#privateLookupIn
* @see MethodHandles#privateLookupIn
* @see Lookup#dropLookupMode
* @see ClassLoader#defineClass(String,byte[],int,int,ProtectionDomain)
*/
@@ -294,6 +294,18 @@ public enum ClassFileFormatVersion {
* <cite>The Java Virtual Machine Specification, Java SE 22 Edition</cite></a>
*/
RELEASE_22(66),

/**
* The version introduced by the Java Platform, Standard Edition
* 23.
*
* @since 23
*
* @see <a
* href="https://docs.oracle.com/javase/specs/jvms/se23/html/index.html">
* <cite>The Java Virtual Machine Specification, Java SE 23 Edition</cite></a>
*/
RELEASE_23(67),
; // Reduce code churn when appending new constants

// Note to maintainers: when adding constants for newer releases,
@@ -309,7 +321,7 @@ public enum ClassFileFormatVersion {
* {@return the latest class file format version}
*/
public static ClassFileFormatVersion latest() {
return RELEASE_22;
return RELEASE_23;
}

/**
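Class-file major versions track the Java SE release as release + 44, so RELEASE_22 maps to 66 and RELEASE_23 to 67, matching the ClassFile constants changed earlier in this diff. The bump is observable through the public reflection API; a quick check (output shown assumes a JDK 23 build that contains this change):

import java.lang.reflect.ClassFileFormatVersion;

public class LatestFormatVersion {
    public static void main(String[] args) {
        ClassFileFormatVersion latest = ClassFileFormatVersion.latest();
        // major() == release + 44; prints "RELEASE_23 major=67" on a JDK 23 build.
        System.out.println(latest + " major=" + latest.major());
    }
}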
@@ -29,6 +29,7 @@ package java.nio;

import java.lang.foreign.MemorySegment;
import java.util.Objects;
import jdk.internal.util.ArraysSupport;

/**
#if[rw]
@@ -705,6 +706,9 @@ class Heap$Type$Buffer$RW$
addr, segment)));
}

public int hashCode() {
return ArraysSupport.vectorizedHashCode(hb, ix(position()), remaining(), 1, ArraysSupport.T_BYTE);
}

#end[byte]

@@ -733,6 +737,9 @@ class Heap$Type$Buffer$RW$
offset, segment);
}

public int hashCode() {
return ArraysSupport.vectorizedHashCode(hb, ix(position()), remaining(), 1, ArraysSupport.T_CHAR);
}
#end[char]
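This template change routes the heap byte- and char-buffer hash computation through ArraysSupport.vectorizedHashCode instead of the inherited element-by-element loop. The observable contract is unchanged: a buffer's hash code depends only on its remaining elements. A quick illustration of that contract using only public java.nio API (no internal classes):

import java.nio.ByteBuffer;

public class BufferHashCodeDemo {
    public static void main(String[] args) {
        ByteBuffer a = ByteBuffer.wrap(new byte[] {1, 2, 3, 4});
        ByteBuffer b = ByteBuffer.wrap(new byte[] {0, 1, 2, 3, 4});
        b.position(1); // same remaining content as a

        // Hash codes are defined by the remaining elements only, so the
        // vectorized override must (and does) preserve this equality.
        System.out.println(a.hashCode() == b.hashCode()); // true
        System.out.println(a.equals(b));                  // true
    }
}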
@@ -88,7 +88,6 @@ import java.util.Arrays;
* <p>
* Below is an example of constructing a ChoiceFormat with arrays to format
* and parse values:
* <blockquote>
* {@snippet lang=java :
* double[] limits = {1,2,3,4,5,6,7};
* String[] dayOfWeekNames = {"Sun","Mon","Tue","Wed","Thur","Fri","Sat"};
@@ -100,34 +99,27 @@ import java.util.Arrays;
* + form.parse(form.format(i),status));
* }
* }
* </blockquote>
*
* <p>
* For more sophisticated patterns, {@code ChoiceFormat} can be used with
* {@link MessageFormat} to produce accurate forms for singular and plural:
* <blockquote>
* {@snippet lang=java :
* double[] filelimits = {0,1,2};
* String[] filepart = {"are no files","is one file","are {2} files"};
* ChoiceFormat fileform = new ChoiceFormat(filelimits, filepart);
* Format[] testFormats = {fileform, null, NumberFormat.getInstance()};
* MessageFormat pattform = new MessageFormat("There {0} on {1}");
* pattform.setFormats(testFormats);
* Object[] testArgs = {null, "ADisk", null};
* for (int i = 0; i < 4; ++i) {
* testArgs[0] = Integer.valueOf(i);
* testArgs[2] = testArgs[0];
* System.out.println(pattform.format(testArgs));
* MessageFormat msgFmt = new MessageFormat("The disk \"{0}\" contains {1}.");
* double[] fileLimits = {0,1,2};
* String[] filePart = {"no files","one file","{1,number} files"};
* ChoiceFormat fileChoices = new ChoiceFormat(fileLimits, filePart);
* msgFmt.setFormatByArgumentIndex(1, fileChoices);
* Object[] args = {"MyDisk", 1273};
* System.out.println(msgFmt.format(args));
* }
* }
* </blockquote>
* Would output the following:
* <blockquote>
* <pre>{@code
* There are no files on ADisk
* There is one file on ADisk
* There are 2 files on ADisk
* There are 3 files on ADisk
* }</pre>
* </blockquote>
* The output with different values for {@code fileCount}:
* <blockquote><pre>
* The disk "MyDisk" contains no files.
* The disk "MyDisk" contains one file.
* The disk "MyDisk" contains 1,273 files.
* </pre></blockquote>
* See {@link MessageFormat##pattern_caveats MessageFormat} for caveats regarding
* {@code MessageFormat} patterns within a {@code ChoiceFormat} pattern.
*
* <h2><a id="patterns">Patterns</a></h2>
* A {@code ChoiceFormat} pattern has the following syntax:
@@ -194,7 +186,6 @@ import java.util.Arrays;
* {@code new ChoiceFormat("1# ''one'' ").format(1)} returns {@code " 'one' "}.
*
* <p>Below is an example of constructing a ChoiceFormat with a pattern:
* <blockquote>
* {@snippet lang=java :
* ChoiceFormat fmt = new ChoiceFormat(
* "-1#is negative| 0#is zero or fraction | 1#is one |1.0<is 1+ |2#is two |2<is more than 2.");
@@ -210,7 +201,6 @@ import java.util.Arrays;
* System.out.println(fmt.format(Double.NaN)); // outputs "is negative"
* System.out.println(fmt.format(Double.POSITIVE_INFINITY)); // outputs "is more than 2."
* }
* </blockquote>
*
* <h2><a id="synchronization">Synchronization</a></h2>
*
Some files were not shown because too many files have changed in this diff.