Compare commits

...

58 Commits

Author SHA1 Message Date
Alex Menkov
cf948548c3 8321565: [REDO] Heap dump does not contain virtual Thread stack references
Reviewed-by: sspitsyn, yyang, dholmes
2023-12-13 18:47:04 +00:00
Jorn Vernee
7ece9e90c0 8321400: java/foreign/TestStubAllocFailure.java fails with code cache exhaustion
Reviewed-by: mcimadamore
2023-12-13 17:34:37 +00:00
Albert Mingkun Yang
9320ef9b29 8321973: Parallel: Remove unused methods in AdaptiveSizePolicy
Reviewed-by: tschatzl
2023-12-13 12:43:41 +00:00
Albert Mingkun Yang
2a565ff368 8321808: G1: Use unsigned type for non-negative G1 flags
Reviewed-by: tschatzl, iwalulya
2023-12-13 11:18:51 +00:00
Lei Zaakjyu
493b5bd2fd 8293622: Cleanup use of G1ConcRefinementThreads
Reviewed-by: ayang, iwalulya
2023-12-13 11:18:38 +00:00
Aleksei Voitylov
f573f6d233 8321515: ARM32: Move method resolution information out of the cpCache properly
Reviewed-by: shade
2023-12-13 11:04:11 +00:00
Sergey Tsypanov
8a0a6f8c25 8321279: Implement hashCode() in Heap-X-Buffer.java.template
Reviewed-by: alanb, bpb
2023-12-13 09:10:11 +00:00
David Holmes
3d9d353edb 8321825: Remove runtime/CompressedOops/CompressedClassPointers.java from the ProblemList
Reviewed-by: ccheung
2023-12-12 23:00:48 +00:00
Matias Saavedra Silva
1b621f5527 8321474: TestAutoCreateSharedArchiveUpgrade.java should be updated with JDK 21
Reviewed-by: dholmes, iklam
2023-12-12 22:49:41 +00:00
Erik Joelsson
5463c9cd9a 8321557: Move SOURCE line verification for OracleJDK out of OpenJDK
Reviewed-by: ihse
2023-12-12 21:31:41 +00:00
Joe Darcy
ac07355f55 8320200: Use Elements predicates for record constructors to improve print output
Reviewed-by: vromero
2023-12-12 21:00:50 +00:00
Roger Riggs
4fb5c12813 8321180: Condition for non-latin1 string size too large exception is off by one
Reviewed-by: rgiulietti
2023-12-12 20:55:17 +00:00
Alexandre Iline
d5a96e3f49 8321621: Update JCov version to 3.0.16
Reviewed-by: erikj, alanb, ihse
2023-12-12 20:41:18 +00:00
Justin Lu
aadf36809c 6230751: [Fmt-Ch] Recursive MessageFormats in ChoiceFormats ignore indicated subformats
Reviewed-by: naoto
2023-12-12 19:25:20 +00:00
Joe Darcy
a3447ec656 8321827: Remove unnecessary suppress warnings annotations from the printing processor
Reviewed-by: jlaskey
2023-12-12 18:44:43 +00:00
Sergey Bylokhov
b25ed57b76 8270269: Desktop.browse method fails if earlier CoInitialize call as COINIT_MULTITHREADED
Reviewed-by: aivanov
2023-12-12 18:30:41 +00:00
Christian Stein
df4ed7eff7 8321739: Source launcher fails with "Not a directory" error
Reviewed-by: jlahoda
2023-12-12 15:26:21 +00:00
Jamil Nimeh
5718039a46 8321542: C2: Missing ChaCha20 stub for x86_32 leads to crashes
Reviewed-by: chagedorn, shade
2023-12-12 14:36:58 +00:00
Hannes Wallnöfer
c51685267c 8321889: JavaDoc method references with wrong (nested) type
Reviewed-by: alanb
2023-12-12 11:27:31 +00:00
Thomas Schatzl
7d903964fb 8321422: Test gc/g1/pinnedobjs/TestPinnedObjectTypes.java times out after completion
Reviewed-by: iwalulya, ayang
2023-12-12 10:35:40 +00:00
Kevin Walls
6f4824068d 8321729: Remove 'orb' field in RMIConnector
Reviewed-by: rriggs, dfuchs
2023-12-12 10:02:01 +00:00
Kevin Walls
e1fd663f22 8311306: Test com/sun/management/ThreadMXBean/ThreadCpuTimeArray.java failed: out of expected range
Reviewed-by: sspitsyn
2023-12-12 09:58:41 +00:00
Albert Mingkun Yang
d5214a4288 8321814: G1: Remove unused G1RemSetScanState::_collection_set_iter_state
Reviewed-by: tschatzl
2023-12-12 09:45:27 +00:00
Albert Mingkun Yang
2611a49ea1 8321287: Remove unused enum style in Prefetch
Reviewed-by: fparain, gziemski
2023-12-12 08:36:55 +00:00
Alan Bateman
b8c0b2fd8c 8321594: NativeThreadSet should use placeholder for virtual threads
Reviewed-by: bpb
2023-12-12 07:55:56 +00:00
Guoxiong Li
973bcdab81 8321631: Fix comments in access.hpp
Reviewed-by: eosterlund, stefank
2023-12-12 07:19:50 +00:00
Yuri Gaevsky
6359b4ec23 8318217: RISC-V: C2 VectorizedHashCode
Reviewed-by: mli, fyang
2023-12-12 06:35:09 +00:00
Jorn Vernee
ce4b257fa5 8320886: Unsafe_SetMemory0 is not guarded
Reviewed-by: dholmes, fparain
2023-12-11 19:05:40 +00:00
Hamlin Li
b270f30d10 8318629: G1: Refine code a bit in G1RemSetTrackingPolicy::update_at_allocate
Reviewed-by: ayang, tschatzl
2023-12-11 15:45:47 +00:00
Magnus Ihse Bursie
486594d427 8316657: Support whitebox testing in microbenchmarks
Reviewed-by: erikj, redestad
2023-12-11 14:17:38 +00:00
Jan Lahoda
ce8399fd60 8321582: yield <primitive-type>.class not parsed correctly.
Reviewed-by: vromero
2023-12-11 12:20:22 +00:00
Adam Sotona
3c6459e1de 8321641: ClassFile ModuleAttribute.ModuleAttributeBuilder::moduleVersion spec contains a copy-paste error
Reviewed-by: alanb
2023-12-11 10:08:42 +00:00
Anton Bobrov
92fd490f22 8321176: [Screencast] make a second attempt on screencast failure
Reviewed-by: azvegint, prr
2023-12-11 08:29:40 +00:00
Per Minborg
d13302f8b0 8321387: SegmentAllocator:allocateFrom(AddressLayout, MemorySegment) does not throw stated UnsupportedOperationException
Reviewed-by: mcimadamore
2023-12-11 07:52:31 +00:00
vamsi-parasa
ce108446ca 8319577: x86_64 AVX2 intrinsics for Arrays.sort methods (int, float arrays)
Reviewed-by: sviswanathan, ihse, jbhateja, kvn
2023-12-08 22:52:48 +00:00
Joe Darcy
5c12a182e3 8320790: Update --release 22 symbol information for JDK 22 build 27
Reviewed-by: iris, jjg
2023-12-08 19:33:48 +00:00
Brian Burkhalter
71800884f6 8321429: (fc) FileChannel.lock creates a FileKey containing two long index values, they could be stored as int values
Reviewed-by: alanb
2023-12-08 19:19:01 +00:00
Naoto Sato
0c178beb69 8321206: Make Locale related system properties StaticProperty
Reviewed-by: rriggs
2023-12-08 18:47:40 +00:00
Phil Race
6c13a3032f 8312307: Obsoleted code in hb-jdk-font.cc
Reviewed-by: serb
2023-12-08 18:47:30 +00:00
Ioi Lam
5e6bfc5eaa 8321539: Minimal build is broken by JDK-8320935
Reviewed-by: matsaave, ccheung, mbaesken
2023-12-08 17:25:22 +00:00
Chris Plummer
2c2d4d2cde 8321485: remove serviceability/attach/ConcAttachTest.java from problemlist on macosx
Reviewed-by: jpai, amenkov
2023-12-08 17:02:35 +00:00
Lance Andersen
0eb299af79 8316141: Improve CEN header validation checking
Reviewed-by: alanb
2023-12-08 16:37:53 +00:00
Magnus Ihse Bursie
b893a2b2f7 8321597: Use .template consistently for files treated as templates by the build
Reviewed-by: erikj
2023-12-08 15:46:02 +00:00
Frederic Thevenet
05f950934e 8321374: Add a configure option to explicitly set CompanyName property in VersionInfo resource for Windows exe/dll
Reviewed-by: erikj, ihse
2023-12-08 14:09:01 +00:00
Daniel Lundén
701bc3bbbe 8295166: IGV: dump graph at more locations
Reviewed-by: thartmann, rcastanedalo, chagedorn
2023-12-08 11:08:08 +00:00
Daniel Lundén
9e48b90c7f 8310524: C2: record parser-generated LoadN nodes for IGVN
Reviewed-by: chagedorn, rcastanedalo, thartmann
2023-12-08 11:04:39 +00:00
sunguoyun
bad5edf146 8320959: jdk/jfr/event/runtime/TestShutdownEvent.java crash with CONF=fastdebug -Xcomp
Co-authored-by: Markus Grönlund <mgronlun@openjdk.org>
Reviewed-by: mgronlun
2023-12-08 10:47:58 +00:00
Jaikiran Pai
f577385fc8 8316738: java/net/httpclient/HttpClientLocalAddrTest.java failed in timeout
Reviewed-by: dfuchs
2023-12-08 10:21:07 +00:00
Alan Bateman
86623aa41d 8320786: Remove ThreadGroup.stop
Reviewed-by: rriggs, dholmes, jpai
2023-12-08 08:04:38 +00:00
Alan Bateman
af5c49226c 8320532: Remove Thread/ThreadGroup suspend/resume
Reviewed-by: dholmes, jpai, sspitsyn, smarks
2023-12-08 07:10:20 +00:00
Alex Menkov
cb7e3d263a 8321560: [BACKOUT] 8299426: Heap dump does not contain virtual Thread stack references
Reviewed-by: cjplummer, dholmes
2023-12-08 01:24:25 +00:00
Phil Race
25dc4762b4 8286827: BogusColorSpace methods return wrong array
Reviewed-by: bpb, serb
2023-12-07 23:33:56 +00:00
Weijun Wang
11e4a925be 8320597: RSA signature verification fails on signed data that does not encode params correctly
Reviewed-by: mullan, valeriep
2023-12-07 23:25:56 +00:00
Alex Menkov
354ea4c28f 8299426: Heap dump does not contain virtual Thread stack references
Reviewed-by: cjplummer, sspitsyn, lmesnik
2023-12-07 23:18:23 +00:00
Phil Race
959a443a9e 8288712: Typo in javadoc in javax.imageio.ImageReader.java
Reviewed-by: iris
2023-12-07 21:05:38 +00:00
Naoto Sato
4ed38f5ad5 8321409: Console read line with zero out should zero out underlying buffer in JLine (redux)
Reviewed-by: alanb
2023-12-07 19:46:18 +00:00
Matias Saavedra Silva
fe4c0a2f04 8302790: Set FileMapRegion::mapped_base() to null if mapping fails
Reviewed-by: iklam, ccheung
2023-12-07 19:32:55 +00:00
Joe Darcy
519ecd352a 8319413: Start of release updates for JDK 23
8319414: Add SourceVersion.RELEASE_23
8319416: Add source 23 and target 23 to javac

Reviewed-by: iris, erikj, alanb, vromero
2023-12-07 17:01:29 +00:00
261 changed files with 15928 additions and 2938 deletions

View File

@@ -1,7 +1,7 @@
[general]
project=jdk
jbs=JDK
version=22
version=23
[checks]
error=author,committer,reviewers,merge,issues,executable,symlink,message,hg-tag,whitespace,problemlists

View File

@@ -58,7 +58,7 @@ DEMO_MANIFEST := $(SUPPORT_OUTPUTDIR)/demos/java-main-manifest.mf
# This rule will be depended on due to the MANIFEST line in SetupBuildDemo
# and SetupBuildJvmtiDemo.
$(eval $(call SetupTextFileProcessing, BUILD_JAVA_MANIFEST, \
SOURCE_FILES := $(TOPDIR)/make/data/mainmanifest/manifest.mf, \
SOURCE_FILES := $(TOPDIR)/make/data/mainmanifest/manifest.mf.template, \
OUTPUT_FILE := $(DEMO_MANIFEST), \
REPLACEMENTS := \
@@VERSION_SPECIFICATION@@ => $(VERSION_SPECIFICATION) ; \

View File

@@ -33,7 +33,7 @@ include TextFileProcessing.gmk
# This rule will be depended on due to the MANIFEST line
$(eval $(call SetupTextFileProcessing, BUILD_JAVA_MANIFEST, \
SOURCE_FILES := $(TOPDIR)/make/data/mainmanifest/manifest.mf, \
SOURCE_FILES := $(TOPDIR)/make/data/mainmanifest/manifest.mf.template, \
OUTPUT_FILE := $(SUPPORT_OUTPUTDIR)/java-main-manifest.mf, \
REPLACEMENTS := \
@@VERSION_SPECIFICATION@@ => $(VERSION_SPECIFICATION) ; \

View File

@@ -69,7 +69,7 @@ ifeq ($(call isTargetOs, macosx), true)
))
$(eval $(call SetupTextFileProcessing, BUILD_JDK_PLIST, \
SOURCE_FILES := $(MACOSX_PLIST_SRC)/JDK-Info.plist, \
SOURCE_FILES := $(MACOSX_PLIST_SRC)/JDK-Info.plist.template, \
OUTPUT_FILE := $(JDK_MACOSX_CONTENTS_DIR)/Info.plist, \
REPLACEMENTS := \
@@ID@@ => $(MACOSX_BUNDLE_ID_BASE).jdk ; \
@@ -82,7 +82,7 @@ ifeq ($(call isTargetOs, macosx), true)
))
$(eval $(call SetupTextFileProcessing, BUILD_JRE_PLIST, \
SOURCE_FILES := $(MACOSX_PLIST_SRC)/JRE-Info.plist, \
SOURCE_FILES := $(MACOSX_PLIST_SRC)/JRE-Info.plist.template, \
OUTPUT_FILE := $(JRE_MACOSX_CONTENTS_DIR)/Info.plist, \
REPLACEMENTS := \
@@ID@@ => $(MACOSX_BUNDLE_ID_BASE).jre ; \

View File

@@ -744,9 +744,16 @@ endif
$(eval $(call SetupTarget, build-test-lib, \
MAKEFILE := test/BuildTestLib, \
TARGET := build-test-lib, \
DEPS := exploded-image, \
))
$(eval $(call SetupTarget, test-image-lib, \
MAKEFILE := test/BuildTestLib, \
TARGET := test-image-lib, \
DEPS := build-test-lib, \
))
ifeq ($(BUILD_FAILURE_HANDLER), true)
# Builds the failure handler jtreg extension
$(eval $(call SetupTarget, build-test-failure-handler, \
@@ -781,7 +788,7 @@ endif
$(eval $(call SetupTarget, build-microbenchmark, \
MAKEFILE := test/BuildMicrobenchmark, \
DEPS := interim-langtools exploded-image, \
DEPS := interim-langtools exploded-image build-test-lib, \
))
################################################################################
@@ -1264,7 +1271,7 @@ all-docs-bundles: docs-jdk-bundles docs-javase-bundles docs-reference-bundles
# This target builds the test image
test-image: prepare-test-image test-image-jdk-jtreg-native \
test-image-demos-jdk test-image-libtest-jtreg-native \
test-image-lib-native
test-image-lib test-image-lib-native
ifneq ($(JVM_TEST_IMAGE_TARGETS), )
# If JVM_TEST_IMAGE_TARGETS is externally defined, use it instead of the

View File

@@ -448,17 +448,17 @@ AC_DEFUN_ONCE([BASIC_SETUP_OUTPUT_DIR],
AC_SUBST(CONFIGURESUPPORT_OUTPUTDIR)
# The spec.gmk file contains all variables for the make system.
AC_CONFIG_FILES([$OUTPUTDIR/spec.gmk:$AUTOCONF_DIR/spec.gmk.in])
AC_CONFIG_FILES([$OUTPUTDIR/spec.gmk:$AUTOCONF_DIR/spec.gmk.template])
# The bootcycle-spec.gmk file contains support for boot cycle builds.
AC_CONFIG_FILES([$OUTPUTDIR/bootcycle-spec.gmk:$AUTOCONF_DIR/bootcycle-spec.gmk.in])
AC_CONFIG_FILES([$OUTPUTDIR/bootcycle-spec.gmk:$AUTOCONF_DIR/bootcycle-spec.gmk.template])
# The buildjdk-spec.gmk file contains support for building a buildjdk when cross compiling.
AC_CONFIG_FILES([$OUTPUTDIR/buildjdk-spec.gmk:$AUTOCONF_DIR/buildjdk-spec.gmk.in])
AC_CONFIG_FILES([$OUTPUTDIR/buildjdk-spec.gmk:$AUTOCONF_DIR/buildjdk-spec.gmk.template])
# The compare.sh is used to compare the build output to other builds.
AC_CONFIG_FILES([$OUTPUTDIR/compare.sh:$AUTOCONF_DIR/compare.sh.in])
AC_CONFIG_FILES([$OUTPUTDIR/compare.sh:$AUTOCONF_DIR/compare.sh.template])
# The generated Makefile knows where the spec.gmk is and where the source is.
# You can run make from the OUTPUTDIR, or from the top-level Makefile
# which will look for generated configurations
AC_CONFIG_FILES([$OUTPUTDIR/Makefile:$AUTOCONF_DIR/Makefile.in])
AC_CONFIG_FILES([$OUTPUTDIR/Makefile:$AUTOCONF_DIR/Makefile.template])
])
###############################################################################

View File

@@ -110,6 +110,15 @@ AC_DEFUN_ONCE([JDKVER_SETUP_JDK_VERSION_NUMBERS],
CHECK_VALUE: [UTIL_CHECK_STRING_NON_EMPTY_PRINTABLE])
AC_SUBST(COMPANY_NAME)
# Set the JDK RC Company name
# Otherwise uses the value set for "vendor-name".
UTIL_ARG_WITH(NAME: jdk-rc-company-name, TYPE: string,
DEFAULT: $COMPANY_NAME,
DESC: [Set JDK RC company name. This is used for CompanyName properties of MS Windows binaries.],
DEFAULT_DESC: [from branding.conf],
CHECK_VALUE: [UTIL_CHECK_STRING_NON_EMPTY_PRINTABLE])
AC_SUBST(JDK_RC_COMPANY_NAME)
# The vendor URL, if any
# Only set VENDOR_URL if '--with-vendor-url' was used and is not empty.
# Otherwise we will use the value from "branding.conf" included above.

View File

@@ -191,6 +191,7 @@ PRODUCT_NAME := @PRODUCT_NAME@
PRODUCT_SUFFIX := @PRODUCT_SUFFIX@
JDK_RC_PLATFORM_NAME := @JDK_RC_PLATFORM_NAME@
JDK_RC_NAME := @JDK_RC_NAME@
JDK_RC_COMPANY_NAME:=@JDK_RC_COMPANY_NAME@
COMPANY_NAME := @COMPANY_NAME@
HOTSPOT_VM_DISTRO := @HOTSPOT_VM_DISTRO@
MACOSX_BUNDLE_NAME_BASE := @MACOSX_BUNDLE_NAME_BASE@

View File

@@ -98,7 +98,7 @@ GLOBAL_VERSION_INFO_RESOURCE := $(TOPDIR)/src/java.base/windows/native/common/ve
JDK_RCFLAGS=$(RCFLAGS) \
-D"JDK_VERSION_STRING=$(VERSION_STRING)" \
-D"JDK_COMPANY=$(COMPANY_NAME)" \
-D"JDK_COMPANY=$(JDK_RC_COMPANY_NAME)" \
-D"JDK_VER=$(VERSION_NUMBER_FOUR_POSITIONS)" \
-D"JDK_COPYRIGHT=Copyright \xA9 $(COPYRIGHT_YEAR)" \
-D"JDK_NAME=$(JDK_RC_NAME) $(VERSION_SHORT)" \

View File

@@ -112,7 +112,7 @@ define SetupBuildLauncherBody
$1_PLIST_FILE := $$(SUPPORT_OUTPUTDIR)/native/$$(MODULE)/$1/Info.plist
$$(eval $$(call SetupTextFileProcessing, BUILD_PLIST_$1, \
SOURCE_FILES := $(TOPDIR)/make/data/bundle/cmdline-Info.plist, \
SOURCE_FILES := $(TOPDIR)/make/data/bundle/cmdline-Info.plist.template, \
OUTPUT_FILE := $$($1_PLIST_FILE), \
REPLACEMENTS := \
@@ID@@ => $(MACOSX_BUNDLE_ID_BASE).$1 ; \

View File

@@ -1206,7 +1206,7 @@ var getJibProfilesDependencies = function (input, common) {
jcov: {
organization: common.organization,
revision: "3.0-15-jdk-asm+1.0",
revision: "3.0-16-jdk-asm+1.0",
ext: "zip",
environment_name: "JCOV_HOME",
},

View File

@@ -26,17 +26,17 @@
# Default version, product, and vendor information to use,
# unless overridden by configure
DEFAULT_VERSION_FEATURE=22
DEFAULT_VERSION_FEATURE=23
DEFAULT_VERSION_INTERIM=0
DEFAULT_VERSION_UPDATE=0
DEFAULT_VERSION_PATCH=0
DEFAULT_VERSION_EXTRA1=0
DEFAULT_VERSION_EXTRA2=0
DEFAULT_VERSION_EXTRA3=0
DEFAULT_VERSION_DATE=2024-03-19
DEFAULT_VERSION_CLASSFILE_MAJOR=66 # "`$EXPR $DEFAULT_VERSION_FEATURE + 44`"
DEFAULT_VERSION_DATE=2024-09-17
DEFAULT_VERSION_CLASSFILE_MAJOR=67 # "`$EXPR $DEFAULT_VERSION_FEATURE + 44`"
DEFAULT_VERSION_CLASSFILE_MINOR=0
DEFAULT_VERSION_DOCS_API_SINCE=11
DEFAULT_ACCEPTABLE_BOOT_VERSIONS="21 22"
DEFAULT_JDK_SOURCE_TARGET_VERSION=22
DEFAULT_ACCEPTABLE_BOOT_VERSIONS="21 22 23"
DEFAULT_JDK_SOURCE_TARGET_VERSION=23
DEFAULT_PROMOTED_VERSION_PRE=ea

View File

@@ -48,7 +48,7 @@ $(eval $(call IncludeCustomExtension, hotspot/gensrc/GenerateSources.gmk))
# Setup the hotspot launcher script for developer use
$(eval $(call SetupTextFileProcessing, CREATE_HOTSPOT_LAUNCHER, \
SOURCE_FILES := $(TOPDIR)/make/scripts/hotspot.sh, \
SOURCE_FILES := $(TOPDIR)/make/scripts/hotspot.sh.template, \
OUTPUT_FILE := $(JVM_OUTPUTDIR)/hotspot, \
REPLACEMENTS := \
@@LIBARCH@@ => $(OPENJDK_TARGET_CPU_LEGACY_LIB) ; \

View File

@@ -245,7 +245,7 @@ ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2)
TOOLCHAIN := TOOLCHAIN_LINK_CXX, \
OPTIMIZATION := HIGH, \
CFLAGS := $(CFLAGS_JDKLIB), \
CXXFLAGS := $(CXXFLAGS_JDKLIB), \
CXXFLAGS := $(CXXFLAGS_JDKLIB) -std=c++17, \
LDFLAGS := $(LDFLAGS_JDKLIB) \
$(call SET_SHARED_LIBRARY_ORIGIN), \
LIBS := $(LIBCXX), \

View File

@@ -53,11 +53,10 @@ JMH_UNPACKED_DIR := $(MICROBENCHMARK_OUTPUT)/jmh_jars
JMH_UNPACKED_JARS_DONE := $(JMH_UNPACKED_DIR)/_unpacked.marker
# External dependencies
JMH_COMPILE_JARS := $(JMH_CORE_JAR) $(JMH_GENERATOR_JAR)
WHITEBOX_JAR := $(SUPPORT_OUTPUTDIR)/test/lib/wb.jar
JMH_COMPILE_JARS := $(JMH_CORE_JAR) $(JMH_GENERATOR_JAR) $(WHITEBOX_JAR)
JMH_RUNTIME_JARS := $(JMH_CORE_JAR) $(JMH_COMMONS_MATH_JAR) $(JMH_JOPT_SIMPLE_JAR)
MICROBENCHMARK_CLASSPATH := $(call PathList, $(JMH_COMPILE_JARS))
# Native dependencies
MICROBENCHMARK_NATIVE_SRC_DIRS := $(MICROBENCHMARK_SRC)
MICROBENCHMARK_NATIVE_OUTPUT := $(MICROBENCHMARK_OUTPUT)/native
@@ -92,24 +91,28 @@ $(eval $(call SetupJavaCompilation, BUILD_INDIFY, \
$(eval $(call SetupJavaCompilation, BUILD_JDK_MICROBENCHMARK, \
TARGET_RELEASE := $(TARGET_RELEASE_NEWJDK_UPGRADED), \
SMALL_JAVA := false, \
CLASSPATH := $(MICROBENCHMARK_CLASSPATH), \
DISABLED_WARNINGS := restricted this-escape processing rawtypes cast serial preview, \
CLASSPATH := $(JMH_COMPILE_JARS), \
DISABLED_WARNINGS := restricted this-escape processing rawtypes cast \
serial preview, \
SRC := $(MICROBENCHMARK_SRC), \
BIN := $(MICROBENCHMARK_CLASSES), \
JAVAC_FLAGS := --add-exports java.base/sun.security.util=ALL-UNNAMED \
--add-exports java.base/sun.invoke.util=ALL-UNNAMED \
JAVAC_FLAGS := \
--add-exports java.base/jdk.internal.classfile.impl=ALL-UNNAMED \
--add-exports java.base/jdk.internal.org.objectweb.asm=ALL-UNNAMED \
--add-exports java.base/jdk.internal.org.objectweb.asm.tree=ALL-UNNAMED \
--add-exports java.base/jdk.internal.vm=ALL-UNNAMED \
--add-exports java.base/jdk.internal.misc=ALL-UNNAMED \
--add-exports java.base/jdk.internal.event=ALL-UNNAMED \
--add-exports java.base/jdk.internal.foreign=ALL-UNNAMED \
--add-exports java.base/jdk.internal.misc=ALL-UNNAMED \
--add-exports java.base/jdk.internal.org.objectweb.asm.tree=ALL-UNNAMED \
--add-exports java.base/jdk.internal.org.objectweb.asm=ALL-UNNAMED \
--add-exports java.base/jdk.internal.vm=ALL-UNNAMED \
--add-exports java.base/sun.invoke.util=ALL-UNNAMED \
--add-exports java.base/sun.security.util=ALL-UNNAMED \
--enable-preview \
-processor org.openjdk.jmh.generators.BenchmarkProcessor, \
JAVA_FLAGS := --add-modules jdk.unsupported --limit-modules java.management \
JAVA_FLAGS := \
--add-exports java.base/jdk.internal.vm=ALL-UNNAMED \
--enable-preview, \
--add-modules jdk.unsupported \
--enable-preview \
--limit-modules java.management, \
))
$(BUILD_JDK_MICROBENCHMARK): $(JMH_COMPILE_JARS)

View File

@@ -23,12 +23,22 @@
# questions.
#
################################################################################
# This file builds the Java components of testlib.
# It also covers the test-image part, where the built files are copied to the
# test image.
################################################################################
default: all
include $(SPEC)
include MakeBase.gmk
include JavaCompilation.gmk
################################################################################
# Targets for building the test lib jars
################################################################################
TARGETS :=
TEST_LIB_SOURCE_DIR := $(TOPDIR)/test/lib
@@ -63,8 +73,21 @@ $(eval $(call SetupJavaCompilation, BUILD_TEST_LIB_JAR, \
TARGETS += $(BUILD_TEST_LIB_JAR)
##########################################################################################
build-test-lib: $(TARGETS)
all: $(TARGETS)
################################################################################
# Targets for building test-image.
################################################################################
.PHONY: default all
# Copy the jars to the test image.
$(eval $(call SetupCopyFiles, COPY_LIBTEST_JARS, \
DEST := $(TEST_IMAGE_DIR)/lib-test, \
FILES := $(BUILD_WB_JAR_JAR) $(BUILD_TEST_LIB_JAR_JAR), \
))
#
test-image-lib: $(COPY_LIBTEST_JARS)
all: build-test-lib
.PHONY: default all build-test-lib test-image-lib

View File

@@ -193,4 +193,9 @@
}
}
// Is SIMD sort supported for this CPU?
static bool supports_simd_sort(BasicType bt) {
return false;
}
#endif // CPU_AARCH64_MATCHER_AARCH64_HPP

View File

@@ -303,15 +303,19 @@ void InterpreterMacroAssembler::load_field_entry(Register cache, Register index,
}
void InterpreterMacroAssembler::load_method_entry(Register cache, Register index, int bcp_offset) {
assert_different_registers(cache, index);
// Get index out of bytecode pointer
get_index_at_bcp(index, bcp_offset, cache /* as tmp */, sizeof(u2));
// sizeof(ResolvedMethodEntry) is not a power of 2 on Arm, so can't use shift
mov(cache, sizeof(ResolvedMethodEntry));
mul(index, index, cache); // Scale the index to be the entry index * sizeof(ResolvedMethodEntry)
// load constant pool cache pointer
ldr(cache, Address(FP, frame::interpreter_frame_cache_offset * wordSize));
// Get address of method entries array
ldr(cache, Address(cache, ConstantPoolCache::method_entries_offset()));
ldr(cache, Address(cache, in_bytes(ConstantPoolCache::method_entries_offset())));
add(cache, cache, Array<ResolvedMethodEntry>::base_offset_in_bytes());
add(cache, cache, index);
}
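
For illustration, a minimal C++ sketch (not JDK code) of the address arithmetic the ARM32 sequence above performs: because sizeof(ResolvedMethodEntry) is not a power of two on Arm, the index is scaled with a multiply rather than a shift.

#include <cstddef>
#include <cstdint>

// Hypothetical helper, for illustration only: compute the address of entry
// 'index' in a method-entries array whose element size is not a power of two.
static char* method_entry_address(char* entries_base, size_t base_offset,
                                  uint32_t index, size_t entry_size) {
  // index * entry_size must be a real multiply here; a left shift would only
  // work if entry_size were a power of two.
  return entries_base + base_offset + (size_t)index * entry_size;
}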

View File

@@ -186,4 +186,9 @@
}
}
// Is SIMD sort supported for this CPU?
static bool supports_simd_sort(BasicType bt) {
return false;
}
#endif // CPU_ARM_MATCHER_ARM_HPP

View File

@@ -370,17 +370,16 @@ address TemplateInterpreterGenerator::generate_return_entry_for(TosState state,
if (index_size == sizeof(u4)) {
__ load_resolved_indy_entry(Rcache, Rindex);
__ ldrh(Rcache, Address(Rcache, in_bytes(ResolvedIndyEntry::num_parameters_offset())));
__ check_stack_top();
__ add(Rstack_top, Rstack_top, AsmOperand(Rcache, lsl, Interpreter::logStackElementSize));
} else {
// Pop N words from the stack
assert(index_size == sizeof(u2), "Can only be u2");
__ load_method_entry(Rcache, Rindex);
__ ldrh(Rcache, Address(Rcache, in_bytes(ResolvedIndyEntry::num_parameters_offset())));
__ check_stack_top();
__ add(Rstack_top, Rstack_top, AsmOperand(Rcache, lsl, Interpreter::logStackElementSize));
__ ldrh(Rcache, Address(Rcache, in_bytes(ResolvedMethodEntry::num_parameters_offset())));
}
__ check_stack_top();
__ add(Rstack_top, Rstack_top, AsmOperand(Rcache, lsl, Interpreter::logStackElementSize));
__ convert_retval_to_tos(state);
__ check_and_handle_popframe();

View File

@@ -3666,15 +3666,15 @@ void TemplateTable::prepare_invoke(Register Rcache, Register recv) {
// load receiver if needed (after extra argument is pushed so parameter size is correct)
if (load_receiver) {
__ ldrh(recv, Address(Rcache, in_bytes(ResolvedMethodEntry::num_parameters_offset())));
Address recv_addr = __ receiver_argument_address(Rstack_top, Rtemp, recv);
__ ldr(recv, recv_addr);
__ add(recv, Rstack_top, AsmOperand(recv, lsl, Interpreter::logStackElementSize));
__ ldr(recv, Address(recv, -Interpreter::stackElementSize));
__ verify_oop(recv);
}
// load return address
{ const address table = (address) Interpreter::invoke_return_entry_table_for(code);
__ mov_slow(Rtemp, table);
__ ldr(LR, Address::indexed_ptr(Rtemp, ret_type));
__ mov_slow(LR, table);
__ ldr(LR, Address::indexed_ptr(LR, ret_type));
}
}
@@ -3744,10 +3744,13 @@ void TemplateTable::invokevirtual(int byte_no) {
void TemplateTable::invokespecial(int byte_no) {
transition(vtos, vtos);
assert(byte_no == f1_byte, "use this argument");
const Register Rrecv = R2_tmp;
load_resolved_method_entry_special_or_static(R2_tmp, // ResolvedMethodEntry*
const Register Rflags = R3_tmp;
load_resolved_method_entry_special_or_static(Rrecv, // ResolvedMethodEntry*
Rmethod, // Method*
R3_tmp); // Flags
Rflags); // Flags
prepare_invoke(Rrecv, Rrecv);
__ verify_oop(Rrecv);
__ null_check(Rrecv, Rtemp);
@@ -3760,12 +3763,16 @@ void TemplateTable::invokespecial(int byte_no) {
void TemplateTable::invokestatic(int byte_no) {
transition(vtos, vtos);
assert(byte_no == f1_byte, "use this argument");
load_resolved_method_entry_special_or_static(R2_tmp, // ResolvedMethodEntry*
const Register Rrecv = R2_tmp;
const Register Rflags = R3_tmp;
load_resolved_method_entry_special_or_static(Rrecv, // ResolvedMethodEntry*
Rmethod, // Method*
R3_tmp); // Flags
prepare_invoke(R2_tmp, R2_tmp);
Rflags); // Flags
prepare_invoke(Rrecv, Rrecv);
// do the call
__ profile_call(R2_tmp);
__ profile_call(Rrecv);
__ jump_from_interpreted(Rmethod);
}
@@ -3788,10 +3795,10 @@ void TemplateTable::invokeinterface(int byte_no) {
const Register Rflags = R3_tmp;
const Register Rklass = R2_tmp; // Note! Same register with Rrecv
load_resolved_method_entry_interface(R2_tmp, // ResolvedMethodEntry*
R1_tmp, // Klass*
load_resolved_method_entry_interface(Rrecv, // ResolvedMethodEntry*
Rinterf, // Klass*
Rmethod, // Method* or itable/vtable index
R3_tmp); // Flags
Rflags); // Flags
prepare_invoke(Rrecv, Rrecv);
// First check for Object case, then private interface method,

View File

@@ -195,4 +195,9 @@
}
}
// Is SIMD sort supported for this CPU?
static bool supports_simd_sort(BasicType bt) {
return false;
}
#endif // CPU_PPC_MATCHER_PPC_HPP

View File

@@ -1459,6 +1459,112 @@ void C2_MacroAssembler::string_equals(Register a1, Register a2,
BLOCK_COMMENT("} string_equals");
}
// jdk.internal.util.ArraysSupport.vectorizedHashCode
void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
Register tmp1, Register tmp2, Register tmp3,
Register tmp4, Register tmp5, Register tmp6,
BasicType eltype)
{
assert_different_registers(ary, cnt, result, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, t0, t1);
const int elsize = arrays_hashcode_elsize(eltype);
const int chunks_end_shift = exact_log2(elsize);
switch (eltype) {
case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break;
case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break;
case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break;
case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break;
default:
ShouldNotReachHere();
}
const int stride = 4;
const Register pow31_4 = tmp1;
const Register pow31_3 = tmp2;
const Register pow31_2 = tmp3;
const Register chunks = tmp4;
const Register chunks_end = chunks;
Label DONE, TAIL, TAIL_LOOP, WIDE_LOOP;
// result has a value initially
beqz(cnt, DONE);
andi(chunks, cnt, ~(stride-1));
beqz(chunks, TAIL);
mv(pow31_4, 923521); // [31^^4]
mv(pow31_3, 29791); // [31^^3]
mv(pow31_2, 961); // [31^^2]
slli(chunks_end, chunks, chunks_end_shift);
add(chunks_end, ary, chunks_end);
andi(cnt, cnt, stride-1); // don't forget about tail!
bind(WIDE_LOOP);
mulw(result, result, pow31_4); // 31^^4 * h
arrays_hashcode_elload(t0, Address(ary, 0 * elsize), eltype);
arrays_hashcode_elload(t1, Address(ary, 1 * elsize), eltype);
arrays_hashcode_elload(tmp5, Address(ary, 2 * elsize), eltype);
arrays_hashcode_elload(tmp6, Address(ary, 3 * elsize), eltype);
mulw(t0, t0, pow31_3); // 31^^3 * ary[i+0]
addw(result, result, t0);
mulw(t1, t1, pow31_2); // 31^^2 * ary[i+1]
addw(result, result, t1);
slli(t0, tmp5, 5); // optimize 31^^1 * ary[i+2]
subw(tmp5, t0, tmp5); // with ary[i+2]<<5 - ary[i+2]
addw(result, result, tmp5);
addw(result, result, tmp6); // 31^^4 * h + 31^^3 * ary[i+0] + 31^^2 * ary[i+1]
// + 31^^1 * ary[i+2] + 31^^0 * ary[i+3]
addi(ary, ary, elsize * stride);
bne(ary, chunks_end, WIDE_LOOP);
beqz(cnt, DONE);
bind(TAIL);
slli(chunks_end, cnt, chunks_end_shift);
add(chunks_end, ary, chunks_end);
bind(TAIL_LOOP);
arrays_hashcode_elload(t0, Address(ary), eltype);
slli(t1, result, 5); // optimize 31 * result
subw(result, t1, result); // with result<<5 - result
addw(result, result, t0);
addi(ary, ary, elsize);
bne(ary, chunks_end, TAIL_LOOP);
bind(DONE);
BLOCK_COMMENT("} // arrays_hashcode");
}
int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
switch (eltype) {
case T_BOOLEAN: return sizeof(jboolean);
case T_BYTE: return sizeof(jbyte);
case T_SHORT: return sizeof(jshort);
case T_CHAR: return sizeof(jchar);
case T_INT: return sizeof(jint);
default:
ShouldNotReachHere();
return -1;
}
}
void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
switch (eltype) {
// T_BOOLEAN used as surrogate for unsigned byte
case T_BOOLEAN: lbu(dst, src); break;
case T_BYTE: lb(dst, src); break;
case T_SHORT: lh(dst, src); break;
case T_CHAR: lhu(dst, src); break;
case T_INT: lw(dst, src); break;
default:
ShouldNotReachHere();
}
}
typedef void (Assembler::*conditional_branch_insn)(Register op1, Register op2, Label& label, bool is_far);
typedef void (MacroAssembler::*float_conditional_branch_insn)(FloatRegister op1, FloatRegister op2, Label& label,
bool is_far, bool is_unordered);
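
For reference, a minimal C++ sketch (illustrative only, not JDK code) of the 4-way unrolled polynomial hash that the stub's comments describe; unsigned 32-bit arithmetic is used so that wrap-around matches Java's int semantics.

#include <cstddef>
#include <cstdint>

// Scalar form of the unrolled update above:
//   h = 31^4*h + 31^3*a[i] + 31^2*a[i+1] + 31*a[i+2] + a[i+3]
// followed by the plain h = 31*h + a[i] recurrence for the tail.
static uint32_t poly_hash(const int32_t* a, size_t n, uint32_t h) {
  size_t i = 0;
  for (; i + 4 <= n; i += 4) {
    h = 923521u * h                      // 31^4 * h
      + 29791u  * (uint32_t)a[i]         // 31^3 * a[i+0]
      + 961u    * (uint32_t)a[i + 1]     // 31^2 * a[i+1]
      + 31u     * (uint32_t)a[i + 2]     // 31^1 * a[i+2]
      +           (uint32_t)a[i + 3];    // 31^0 * a[i+3]
  }
  for (; i < n; i++) {
    h = 31u * h + (uint32_t)a[i];        // scalar tail
  }
  return h;
}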

View File

@@ -82,6 +82,15 @@
Register result, Register cnt1,
int elem_size);
void arrays_hashcode(Register ary, Register cnt, Register result,
Register tmp1, Register tmp2,
Register tmp3, Register tmp4,
Register tmp5, Register tmp6,
BasicType eltype);
// helper function for arrays_hashcode
int arrays_hashcode_elsize(BasicType eltype);
void arrays_hashcode_elload(Register dst, Address src, BasicType eltype);
void string_equals(Register r1, Register r2,
Register result, Register cnt1,
int elem_size);

View File

@@ -192,4 +192,9 @@
}
}
// Is SIMD sort supported for this CPU?
static bool supports_simd_sort(BasicType bt) {
return false;
}
#endif // CPU_RISCV_MATCHER_RISCV_HPP

View File

@@ -10371,6 +10371,26 @@ instruct array_equalsC(iRegP_R11 ary1, iRegP_R12 ary2, iRegI_R10 result,
ins_pipe(pipe_class_memory);
%}
// fast ArraysSupport.vectorizedHashCode
instruct arrays_hashcode(iRegP_R11 ary, iRegI_R12 cnt, iRegI_R10 result, immI basic_type,
iRegLNoSp tmp1, iRegLNoSp tmp2,
iRegLNoSp tmp3, iRegLNoSp tmp4,
iRegLNoSp tmp5, iRegLNoSp tmp6, rFlagsReg cr)
%{
match(Set result (VectorizedHashCode (Binary ary cnt) (Binary result basic_type)));
effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6,
USE_KILL ary, USE_KILL cnt, USE basic_type, KILL cr);
format %{ "Array HashCode array[] $ary,$cnt,$result,$basic_type -> $result // KILL all" %}
ins_encode %{
__ arrays_hashcode($ary$$Register, $cnt$$Register, $result$$Register,
$tmp1$$Register, $tmp2$$Register, $tmp3$$Register,
$tmp4$$Register, $tmp5$$Register, $tmp6$$Register,
(BasicType)$basic_type$$constant);
%}
ins_pipe(pipe_class_memory);
%}
// ============================================================================
// Safepoint Instructions

View File

@@ -315,6 +315,10 @@ void VM_Version::c2_initialize() {
}
}
if (FLAG_IS_DEFAULT(UseVectorizedHashCodeIntrinsic)) {
FLAG_SET_DEFAULT(UseVectorizedHashCodeIntrinsic, true);
}
if (!UseZicbop) {
if (!FLAG_IS_DEFAULT(AllocatePrefetchStyle)) {
warning("Zicbop is not available on this CPU");

View File

@@ -184,4 +184,9 @@
}
}
// Is SIMD sort supported for this CPU?
static bool supports_simd_sort(BasicType bt) {
return false;
}
#endif // CPU_S390_MATCHER_S390_HPP

View File

@@ -920,6 +920,7 @@ address Assembler::locate_operand(address inst, WhichOperand which) {
case 0x11: // movups
case 0x12: // movlps
case 0x28: // movaps
case 0x29: // movaps
case 0x2E: // ucomiss
case 0x2F: // comiss
case 0x54: // andps
@@ -969,7 +970,7 @@ address Assembler::locate_operand(address inst, WhichOperand which) {
assert(which == call32_operand, "jcc has no disp32 or imm");
return ip;
default:
ShouldNotReachHere();
fatal("not handled: 0x0F%2X", 0xFF & *(ip-1));
}
break;

View File

@@ -248,4 +248,17 @@
}
}
// Is SIMD sort supported for this CPU?
static bool supports_simd_sort(BasicType bt) {
if (VM_Version::supports_avx512dq()) {
return true;
}
else if (VM_Version::supports_avx2() && !is_double_word_type(bt)) {
return true;
}
else {
return false;
}
}
#endif // CPU_X86_MATCHER_X86_HPP
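
A small self-contained check (hypothetical wrapper, not JDK code) spelling out what the predicate above returns for the two CPU-feature cases: AVX-512 DQ enables all element types, while AVX2 alone excludes the double-word types (long/double).

#include <cassert>

// Hypothetical stand-in for the predicate above: 'double_word' models
// is_double_word_type(bt), i.e. T_LONG / T_DOUBLE elements.
static bool supports(bool avx512dq, bool avx2, bool double_word) {
  if (avx512dq) return true;
  if (avx2 && !double_word) return true;
  return false;
}

int main() {
  assert(supports(true,  true,  true));   // AVX-512 DQ: all element types
  assert(supports(false, true,  false));  // AVX2 only: int/float sort supported
  assert(!supports(false, true,  true));  // AVX2 only: long/double not supported
  assert(!supports(false, false, false)); // neither: intrinsic disabled
  return 0;
}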

View File

@@ -4193,22 +4193,23 @@ void StubGenerator::generate_compiler_stubs() {
= CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
}
// Load x86_64_sort library on supported hardware to enable avx512 sort and partition intrinsics
if (VM_Version::is_intel() && VM_Version::supports_avx512dq()) {
// Load x86_64_sort library on supported hardware to enable SIMD sort and partition intrinsics
if (VM_Version::is_intel() && (VM_Version::supports_avx512dq() || VM_Version::supports_avx2())) {
void *libsimdsort = nullptr;
char ebuf_[1024];
char dll_name_simd_sort[JVM_MAXPATHLEN];
if (os::dll_locate_lib(dll_name_simd_sort, sizeof(dll_name_simd_sort), Arguments::get_dll_dir(), "simdsort")) {
libsimdsort = os::dll_load(dll_name_simd_sort, ebuf_, sizeof ebuf_);
}
// Get addresses for avx512 sort and partition routines
// Get addresses for SIMD sort and partition routines
if (libsimdsort != nullptr) {
log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "simdsort" JNI_LIB_SUFFIX, p2i(libsimdsort));
snprintf(ebuf_, sizeof(ebuf_), "avx512_sort");
snprintf(ebuf_, sizeof(ebuf_), VM_Version::supports_avx512dq() ? "avx512_sort" : "avx2_sort");
StubRoutines::_array_sort = (address)os::dll_lookup(libsimdsort, ebuf_);
snprintf(ebuf_, sizeof(ebuf_), "avx512_partition");
snprintf(ebuf_, sizeof(ebuf_), VM_Version::supports_avx512dq() ? "avx512_partition" : "avx2_partition");
StubRoutines::_array_partition = (address)os::dll_lookup(libsimdsort, ebuf_);
}
}
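
The lookup pattern above, sketched with plain POSIX dlopen/dlsym instead of HotSpot's os:: wrappers (analogy only; the library and symbol names follow the hunk, the function signature is a placeholder).

#include <dlfcn.h>

typedef void (*sort_stub_t)();  // placeholder signature

// Load the optional SIMD-sort library and pick the symbol that matches the
// strongest available CPU feature, returning null if anything is missing.
static sort_stub_t load_sort_stub(bool has_avx512dq) {
  void* lib = dlopen("libsimdsort.so", RTLD_NOW);  // Linux name for "simdsort"
  if (lib == nullptr) {
    return nullptr;  // caller keeps the non-intrinsic sort path
  }
  const char* sym = has_avx512dq ? "avx512_sort" : "avx2_sort";
  return (sort_stub_t)dlsym(lib, sym);
}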

View File

@@ -858,7 +858,7 @@ void VM_Version::get_processor_features() {
// Check if processor has Intel Ecore
if (FLAG_IS_DEFAULT(EnableX86ECoreOpts) && is_intel() && cpu_family() == 6 &&
(_model == 0x97 || _model == 0xAC || _model == 0xAF)) {
(_model == 0x97 || _model == 0xAA || _model == 0xAC || _model == 0xAF)) {
FLAG_SET_DEFAULT(EnableX86ECoreOpts, true);
}
@@ -1130,6 +1130,7 @@ void VM_Version::get_processor_features() {
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
}
#ifdef _LP64
// ChaCha20 Intrinsics
// As long as the system supports AVX as a baseline we can do a
// SIMD-enabled block function. StubGenerator makes the determination
@@ -1145,6 +1146,13 @@ void VM_Version::get_processor_features() {
}
FLAG_SET_DEFAULT(UseChaCha20Intrinsics, false);
}
#else
// No support currently for ChaCha20 intrinsics on 32-bit platforms
if (UseChaCha20Intrinsics) {
warning("ChaCha20 intrinsics are not available on this CPU.");
FLAG_SET_DEFAULT(UseChaCha20Intrinsics, false);
}
#endif // _LP64
// Base64 Intrinsics (Check the condition for which the intrinsic will be active)
if (UseAVX >= 2) {

View File

@@ -52,7 +52,7 @@ public:
static void initialize() NOT_CDS_RETURN;
static void check_system_property(const char* key, const char* value) NOT_CDS_RETURN;
static void check_unsupported_dumping_properties() NOT_CDS_RETURN;
static bool check_vm_args_consistency(bool patch_mod_javabase, bool mode_flag_cmd_line) NOT_CDS_RETURN_(false);
static bool check_vm_args_consistency(bool patch_mod_javabase, bool mode_flag_cmd_line) NOT_CDS_RETURN_(true);
// Basic CDS features
static bool is_dumping_archive() { return is_dumping_static_archive() || is_dumping_dynamic_archive(); }

View File

@@ -129,7 +129,23 @@ CDSHeapVerifier::CDSHeapVerifier() : _archived_objs(0), _problems(0)
// This just points to an empty Map
ADD_EXCL("jdk/internal/reflect/Reflection", "methodFilterMap"); // E
ADD_EXCL("jdk/internal/util/StaticProperty", "FILE_ENCODING", // C
"JAVA_LOCALE_USE_OLD_ISO_CODES"); // C
"JAVA_LOCALE_USE_OLD_ISO_CODES", // C
"USER_LANGUAGE", // C
"USER_LANGUAGE_DISPLAY", // C
"USER_LANGUAGE_FORMAT", // C
"USER_SCRIPT", // C
"USER_SCRIPT_DISPLAY", // C
"USER_SCRIPT_FORMAT", // C
"USER_COUNTRY", // C
"USER_COUNTRY_DISPLAY", // C
"USER_COUNTRY_FORMAT", // C
"USER_VARIANT", // C
"USER_VARIANT_DISPLAY", // C
"USER_VARIANT_FORMAT", // C
"USER_EXTENSIONS", // C
"USER_EXTENSIONS_DISPLAY", // C
"USER_EXTENSIONS_FORMAT", // C
"USER_REGION"); // C
// Integer for 0 and 1 are in java/lang/Integer$IntegerCache and are archived
ADD_EXCL("sun/invoke/util/ValueConversions", "ONE_INT", // E

View File

@@ -1465,7 +1465,7 @@ BitMapView FileMapRegion::ptrmap_view() {
return bitmap_view(false);
}
bool FileMapRegion::check_region_crc() const {
bool FileMapRegion::check_region_crc(char* base) const {
// This function should be called after the region has been properly
// loaded into memory via FileMapInfo::map_region() or FileMapInfo::read_region().
// I.e., this->mapped_base() must be valid.
@@ -1474,8 +1474,8 @@ bool FileMapRegion::check_region_crc() const {
return true;
}
assert(mapped_base() != nullptr, "must be initialized");
int crc = ClassLoader::crc32(0, mapped_base(), (jint)sz);
assert(base != nullptr, "must be initialized");
int crc = ClassLoader::crc32(0, base, (jint)sz);
if (crc != this->crc()) {
log_warning(cds)("Checksum verification failed.");
return false;
@@ -1760,13 +1760,13 @@ bool FileMapInfo::read_region(int i, char* base, size_t size, bool do_commit) {
return false;
}
r->set_mapped_from_file(false);
r->set_mapped_base(base);
if (VerifySharedSpaces && !r->check_region_crc()) {
if (VerifySharedSpaces && !r->check_region_crc(base)) {
return false;
}
r->set_mapped_from_file(false);
r->set_mapped_base(base);
return true;
}
@@ -1803,6 +1803,7 @@ MapArchiveResult FileMapInfo::map_region(int i, intx addr_delta, char* mapped_ba
return MAP_ARCHIVE_OTHER_FAILURE; // oom or I/O error.
} else {
assert(r->mapped_base() != nullptr, "must be initialized");
return MAP_ARCHIVE_SUCCESS;
}
} else {
// Note that this may either be a "fresh" mapping into unreserved address
@@ -1817,15 +1818,16 @@ MapArchiveResult FileMapInfo::map_region(int i, intx addr_delta, char* mapped_ba
_memory_mapping_failed = true;
return MAP_ARCHIVE_MMAP_FAILURE;
}
if (VerifySharedSpaces && !r->check_region_crc(requested_addr)) {
return MAP_ARCHIVE_OTHER_FAILURE;
}
r->set_mapped_from_file(true);
r->set_mapped_base(requested_addr);
}
if (VerifySharedSpaces && !r->check_region_crc()) {
return MAP_ARCHIVE_OTHER_FAILURE;
return MAP_ARCHIVE_SUCCESS;
}
return MAP_ARCHIVE_SUCCESS;
}
// The return value is the location of the archive relocation bitmap.
@@ -1843,8 +1845,7 @@ char* FileMapInfo::map_bitmap_region() {
return nullptr;
}
r->set_mapped_base(bitmap_base);
if (VerifySharedSpaces && !r->check_region_crc()) {
if (VerifySharedSpaces && !r->check_region_crc(bitmap_base)) {
log_error(cds)("relocation bitmap CRC error");
if (!os::unmap_memory(bitmap_base, r->used_aligned())) {
fatal("os::unmap_memory of relocation bitmap failed");
@@ -1853,6 +1854,7 @@ char* FileMapInfo::map_bitmap_region() {
}
r->set_mapped_from_file(true);
r->set_mapped_base(bitmap_base);
log_info(cds)("Mapped %s region #%d at base " INTPTR_FORMAT " top " INTPTR_FORMAT " (%s)",
is_static() ? "static " : "dynamic",
MetaspaceShared::bm, p2i(r->mapped_base()), p2i(r->mapped_end()),
@@ -2128,13 +2130,14 @@ bool FileMapInfo::map_heap_region_impl() {
return false;
}
r->set_mapped_base(base);
if (VerifySharedSpaces && !r->check_region_crc()) {
if (VerifySharedSpaces && !r->check_region_crc(base)) {
dealloc_heap_region();
log_info(cds)("UseSharedSpaces: mapped heap region is corrupt");
return false;
}
r->set_mapped_base(base);
// If the requested range is different from the range allocated by GC, then
// the pointers need to be patched.
address mapped_start = (address) _mapped_heap_memregion.start();
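
A minimal sketch (hypothetical Region type, not the real FileMapRegion) of the ordering these hunks establish: the CRC is checked against the freshly mapped address first, and only then is it published as the region's mapped base, so a failed mapping never looks mapped.

// Stand-in CRC check; the real code recomputes the CRC over the mapped bytes.
static bool crc_ok(const char* /*base*/) { return true; }

struct Region {
  char* mapped_base = nullptr;
  bool  mapped_from_file = false;
};

static bool map_and_verify(Region& r, char* base, bool verify) {
  if (verify && !crc_ok(base)) {
    return false;               // r.mapped_base stays null on failure
  }
  r.mapped_from_file = true;
  r.mapped_base = base;         // published only after verification succeeds
  return true;
}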

View File

@@ -170,7 +170,7 @@ public:
BitMapView ptrmap_view();
bool has_ptrmap() { return _ptrmap_size_in_bits != 0; }
bool check_region_crc() const;
bool check_region_crc(char* base) const;
void print(outputStream* st, int region_index);
};

View File

@@ -149,6 +149,8 @@
#define JAVA_22_VERSION 66
#define JAVA_23_VERSION 67
void ClassFileParser::set_class_bad_constant_seen(short bad_constant) {
assert((bad_constant == JVM_CONSTANT_Module ||
bad_constant == JVM_CONSTANT_Package) && _major_version >= JAVA_9_VERSION,

View File

@@ -175,7 +175,7 @@ G1ConcurrentRefine::G1ConcurrentRefine(G1Policy* policy) :
{}
jint G1ConcurrentRefine::initialize() {
return _thread_control.initialize(this, max_num_threads());
return _thread_control.initialize(this, G1ConcRefinementThreads);
}
G1ConcurrentRefine* G1ConcurrentRefine::create(G1Policy* policy, jint* ecode) {
@@ -199,10 +199,6 @@ void G1ConcurrentRefine::threads_do(ThreadClosure *tc) {
_thread_control.worker_threads_do(tc);
}
uint G1ConcurrentRefine::max_num_threads() {
return G1ConcRefinementThreads;
}
void G1ConcurrentRefine::update_pending_cards_target(double logged_cards_time_ms,
size_t processed_logged_cards,
size_t predicted_thread_buffer_cards,

View File

@@ -215,9 +215,6 @@ public:
// Iterate over all concurrent refinement threads applying the given closure.
void threads_do(ThreadClosure *tc);
// Maximum number of refinement threads.
static uint max_num_threads();
};
#endif // SHARE_GC_G1_G1CONCURRENTREFINE_HPP

View File

@@ -81,7 +81,7 @@ void G1FromCardCache::print(outputStream* out) {
#endif
uint G1FromCardCache::num_par_rem_sets() {
return G1DirtyCardQueueSet::num_par_ids() + G1ConcurrentRefine::max_num_threads() + MAX2(ConcGCThreads, ParallelGCThreads);
return G1DirtyCardQueueSet::num_par_ids() + G1ConcRefinementThreads + MAX2(ConcGCThreads, ParallelGCThreads);
}
void G1FromCardCache::clear(uint region_idx) {

View File

@@ -91,11 +91,6 @@ class G1RemSetScanState : public CHeapObj<mtGC> {
size_t _max_reserved_regions;
// Has this region that is part of the regions in the collection set been processed yet.
typedef bool G1RemsetIterState;
G1RemsetIterState volatile* _collection_set_iter_state;
// Card table iteration claim for each heap region, from 0 (completely unscanned)
// to (>=) HeapRegion::CardsPerRegion (completely scanned).
uint volatile* _card_table_scan_state;

View File

@@ -67,7 +67,7 @@ double G1RemSetSummary::rs_thread_vtime(uint thread) const {
}
G1RemSetSummary::G1RemSetSummary(bool should_update) :
_num_vtimes(G1ConcurrentRefine::max_num_threads()),
_num_vtimes(G1ConcRefinementThreads),
_rs_threads_vtimes(NEW_C_HEAP_ARRAY(double, _num_vtimes, mtGC)) {
memset(_rs_threads_vtimes, 0, sizeof(double) * _num_vtimes);

View File

@@ -38,18 +38,16 @@ bool G1RemSetTrackingPolicy::needs_scan_for_rebuild(HeapRegion* r) const {
}
void G1RemSetTrackingPolicy::update_at_allocate(HeapRegion* r) {
if (r->is_young()) {
// Always collect remembered set for young regions.
r->rem_set()->set_state_complete();
} else if (r->is_humongous()) {
// Collect remembered sets for humongous regions by default to allow eager reclaim.
r->rem_set()->set_state_complete();
} else if (r->is_old()) {
assert(r->is_young() || r->is_humongous() || r->is_old(),
"Region %u with unexpected heap region type %s", r->hrm_index(), r->get_type_str());
if (r->is_old()) {
// By default, do not create remembered set for new old regions.
r->rem_set()->set_state_untracked();
} else {
guarantee(false, "Unhandled region %u with heap region type %s", r->hrm_index(), r->get_type_str());
return;
}
// Always collect remembered set for young regions and for humongous regions.
// Humongous regions need that for eager reclaim.
r->rem_set()->set_state_complete();
}
void G1RemSetTrackingPolicy::update_at_free(HeapRegion* r) {

View File

@@ -113,11 +113,11 @@
"of the optimal occupancy to start marking.") \
range(1, max_intx) \
\
product(uint, G1ConfidencePercent, 50, \
product(uint, G1ConfidencePercent, 50, \
"Confidence level for MMU/pause predictions") \
range(0, 100) \
\
product(intx, G1SummarizeRSetStatsPeriod, 0, DIAGNOSTIC, \
product(uintx, G1SummarizeRSetStatsPeriod, 0, DIAGNOSTIC, \
"The period (in number of GCs) at which we will generate " \
"update buffer processing info " \
"(0 means do not periodically generate this info); " \
@@ -148,7 +148,7 @@
"Number of entries in an SATB log buffer.") \
constraint(G1SATBBufferSizeConstraintFunc, AtParse) \
\
develop(intx, G1SATBProcessCompletedThreshold, 20, \
develop(uintx, G1SATBProcessCompletedThreshold, 20, \
"Number of completed buffers that triggers log processing.") \
range(0, max_jint) \
\

View File

@@ -344,17 +344,11 @@ class AdaptiveSizePolicy : public CHeapObj<mtGC> {
AdaptiveWeightedAverage* avg_eden_live() const { return _avg_eden_live; }
AdaptiveWeightedAverage* avg_old_live() const { return _avg_old_live; }
AdaptivePaddedAverage* avg_survived() const { return _avg_survived; }
AdaptivePaddedNoZeroDevAverage* avg_pretenured() { return _avg_pretenured; }
// Methods indicating events of interest to the adaptive size policy,
// called by GC algorithms. It is the responsibility of users of this
// policy to call these methods at the correct times!
virtual void minor_collection_begin();
virtual void minor_collection_end(GCCause::Cause gc_cause);
virtual LinearLeastSquareFit* minor_pause_old_estimator() const {
return _minor_pause_old_estimator;
}
LinearLeastSquareFit* minor_pause_young_estimator() {
return _minor_pause_young_estimator;
@@ -404,10 +398,6 @@ class AdaptiveSizePolicy : public CHeapObj<mtGC> {
_overhead_checker.set_gc_overhead_limit_exceeded(v);
}
bool gc_overhead_limit_near() {
return _overhead_checker.gc_overhead_limit_near();
}
void reset_gc_overhead_limit_count() {
_overhead_checker.reset_gc_overhead_limit_count();
}

View File

@@ -105,13 +105,15 @@ static void commit(HelperType& helper) {
assert(thread != nullptr, "invariant");
if (thread->is_Java_thread()) {
JavaThread* jt = JavaThread::cast(thread);
if (jt->thread_state() != _thread_in_vm) {
assert(jt->thread_state() == _thread_in_native, "invariant");
if (jt->thread_state() == _thread_in_native) {
// For a JavaThread to take a JFR stacktrace, it must be in _thread_in_vm. Can safepoint here.
ThreadInVMfromNative transition(jt);
event.commit();
return;
}
// If a thread comes here still _thread_in_Java, which can happen for example
// when loading the disassembler library in response to traps in JIT code - all is ok.
// Since there is no ljf, an event will be committed without a stacktrace.
}
event.commit();
}

View File

@@ -53,8 +53,8 @@
// * store_at: Store a value in an internal pointer relative to a base object.
// * atomic_cmpxchg: Atomically compare-and-swap a new value at an address if previous value matched the compared value.
// * atomic_cmpxchg_at: Atomically compare-and-swap a new value at an internal pointer address if previous value matched the compared value.
// * atomic_xchg: Atomically swap a new value at an address if previous value matched the compared value.
// * atomic_xchg_at: Atomically swap a new value at an internal pointer address if previous value matched the compared value.
// * atomic_xchg: Atomically swap a new value at an address without checking the previous value.
// * atomic_xchg_at: Atomically swap a new value at an internal pointer address without checking the previous value.
// * arraycopy: Copy data from one heap array to another heap array. The ArrayAccess class has convenience functions for this.
// * clone: Clone the contents of an object to a newly allocated object.
//
@@ -83,12 +83,11 @@
// and whether the access is performed on the heap or outside. Then the
// appropriate BarrierSet::AccessBarrier is called to perform the access.
//
// The implementation of step 1-4 resides in in accessBackend.hpp, to allow selected
// The implementation of step 1-4 resides in accessBackend.hpp, to allow selected
// accesses to be accessible from only access.hpp, as opposed to access.inline.hpp.
// Steps 5.a and 5.b require knowledge about the GC backends, and therefore needs to
// include the various GC backend .inline.hpp headers. Their implementation resides in
// access.inline.hpp. The accesses that are allowed through the access.hpp file
// must be instantiated in access.cpp using the INSTANTIATE_HPP_ACCESS macro.
// access.inline.hpp.
template <DecoratorSet decorators = DECORATORS_NONE>
class Access: public AllStatic {
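
To make the corrected wording concrete, a small analogy using std::atomic rather than the HotSpot Access API: compare-and-swap stores only when the previous value matches, while exchange stores unconditionally and simply returns the previous value.

#include <atomic>

static int cas_vs_xchg(std::atomic<int>& slot) {
  int expected = 0;
  slot.compare_exchange_strong(expected, 1);  // store 1 only if slot == 0
  int prev = slot.exchange(2);                // store 2 unconditionally
  return prev;                                // previous value, not compared
}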

View File

@@ -365,10 +365,10 @@
"Level of detail of the ideal graph printout. " \
"System-wide value, -1=printing is disabled, " \
"0=print nothing except IGVPrintLevel directives, " \
"5=all details printed. " \
"6=all details printed. " \
"Level of detail of printouts can be set on a per-method level " \
"as well by using CompileCommand=option.") \
range(-1, 5) \
range(-1, 6) \
\
notproduct(intx, PrintIdealGraphPort, 4444, \
"Ideal graph printer to network port") \

View File

@@ -1041,6 +1041,10 @@ void Compile::Init(bool aliasing) {
Copy::zero_to_bytes(_trap_hist, sizeof(_trap_hist));
set_decompile_count(0);
#ifndef PRODUCT
Copy::zero_to_bytes(_igv_phase_iter, sizeof(_igv_phase_iter));
#endif
set_do_freq_based_layout(_directive->BlockLayoutByFrequencyOption);
_loop_opts_cnt = LoopOptsCount;
set_do_inlining(Inline);
@@ -2397,6 +2401,7 @@ void Compile::Optimize() {
if (failing()) return;
// Conditional Constant Propagation;
print_method(PHASE_BEFORE_CCP1, 2);
PhaseCCP ccp( &igvn );
assert( true, "Break here to ccp.dump_nodes_and_types(_root,999,1)");
{
@@ -2972,6 +2977,8 @@ void Compile::Code_Gen() {
if (failing()) {
return;
}
print_method(PHASE_REGISTER_ALLOCATION, 2);
}
// Prior to register allocation we kept empty basic blocks in case the
@@ -2989,6 +2996,7 @@ void Compile::Code_Gen() {
cfg.fixup_flow();
cfg.remove_unreachable_blocks();
cfg.verify_dominator_tree();
print_method(PHASE_BLOCK_ORDERING, 3);
}
// Apply peephole optimizations
@@ -2996,12 +3004,14 @@ void Compile::Code_Gen() {
TracePhase tp("peephole", &timers[_t_peephole]);
PhasePeephole peep( _regalloc, cfg);
peep.do_transform();
print_method(PHASE_PEEPHOLE, 3);
}
// Do late expand if CPU requires this.
if (Matcher::require_postalloc_expand) {
TracePhase tp("postalloc_expand", &timers[_t_postalloc_expand]);
cfg.postalloc_expand(_regalloc);
print_method(PHASE_POSTALLOC_EXPAND, 3);
}
// Convert Nodes to instruction bits in a buffer
@@ -5102,6 +5112,10 @@ void Compile::print_method(CompilerPhaseType cpt, int level, Node* n) {
ResourceMark rm;
stringStream ss;
ss.print_raw(CompilerPhaseTypeHelper::to_description(cpt));
int iter = ++_igv_phase_iter[cpt];
if (iter > 1) {
ss.print(" %d", iter);
}
if (n != nullptr) {
ss.print(": %d %s ", n->_idx, NodeClassNames[n->Opcode()]);
}
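
A short sketch (not JDK code) of the naming scheme the counter above produces, so repeated dumps of the same phase appear in IGV as "Phase", "Phase 2", "Phase 3", and so on.

#include <string>

// Hypothetical helper mirroring the counter logic: the first dump keeps the
// bare phase name, later dumps of the same phase get a running number appended.
static std::string phase_label(const std::string& name, int& iter_counter) {
  int iter = ++iter_counter;
  return iter > 1 ? name + " " + std::to_string(iter) : name;
}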

View File

@@ -343,6 +343,7 @@ class Compile : public Phase {
bool _print_intrinsics; // True if we should print intrinsics for this compilation
#ifndef PRODUCT
uint _igv_idx; // Counter for IGV node identifiers
uint _igv_phase_iter[PHASE_NUM_TYPES]; // Counters for IGV phase iterations
bool _trace_opto_output;
bool _parsed_irreducible_loop; // True if ciTypeFlow detected irreducible loops during parsing
#endif
@@ -531,6 +532,7 @@ private:
#ifndef PRODUCT
IdealGraphPrinter* igv_printer() { return _igv_printer; }
void reset_igv_phase_iter(CompilerPhaseType cpt) { _igv_phase_iter[cpt] = 0; }
#endif
void log_late_inline(CallGenerator* cg);

View File

@@ -1563,6 +1563,11 @@ Node* GraphKit::make_load(Node* ctl, Node* adr, const Type* t, BasicType bt,
if (((bt == T_OBJECT) && C->do_escape_analysis()) || C->eliminate_boxing()) {
// Improve graph before escape analysis and boxing elimination.
record_for_igvn(ld);
if (ld->is_DecodeN()) {
// Also record the actual load (LoadN) in case ld is DecodeN
assert(ld->in(1)->Opcode() == Op_LoadN, "Assumption invalid: input to DecodeN is not LoadN");
record_for_igvn(ld->in(1));
}
}
return ld;
}

View File

@@ -5387,6 +5387,10 @@ bool LibraryCallKit::inline_array_partition() {
const TypeInstPtr* elem_klass = gvn().type(elementType)->isa_instptr();
ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type();
BasicType bt = elem_type->basic_type();
// Disable the intrinsic if the CPU does not support SIMD sort
if (!Matcher::supports_simd_sort(bt)) {
return false;
}
address stubAddr = nullptr;
stubAddr = StubRoutines::select_array_partition_function();
// stub not loaded
@@ -5440,6 +5444,10 @@ bool LibraryCallKit::inline_array_sort() {
const TypeInstPtr* elem_klass = gvn().type(elementType)->isa_instptr();
ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type();
BasicType bt = elem_type->basic_type();
// Disable the intrinsic if the CPU does not support SIMD sort
if (!Matcher::supports_simd_sort(bt)) {
return false;
}
address stubAddr = nullptr;
stubAddr = StubRoutines::select_arraysort_function();
//stub not loaded

View File

@@ -1180,6 +1180,7 @@ bool PhaseIdealLoop::loop_predication_impl_helper(IdealLoopTree* loop, IfProjNod
}
BoolNode* bol = test->as_Bool();
if (invar.is_invariant(bol)) {
C->print_method(PHASE_BEFORE_LOOP_PREDICATION_IC, 4, iff);
// Invariant test
new_predicate_proj = create_new_if_for_predicate(parse_predicate_proj, nullptr,
reason,
@@ -1197,6 +1198,9 @@ bool PhaseIdealLoop::loop_predication_impl_helper(IdealLoopTree* loop, IfProjNod
IfNode* new_predicate_iff = new_predicate_proj->in(0)->as_If();
_igvn.hash_delete(new_predicate_iff);
new_predicate_iff->set_req(1, new_predicate_bol);
C->print_method(PHASE_AFTER_LOOP_PREDICATION_IC, 4, new_predicate_proj->in(0));
#ifndef PRODUCT
if (TraceLoopPredicate) {
tty->print("Predicate invariant if%s: %d ", negated ? " negated" : "", new_predicate_iff->_idx);
@@ -1207,6 +1211,7 @@ bool PhaseIdealLoop::loop_predication_impl_helper(IdealLoopTree* loop, IfProjNod
}
#endif
} else if (cl != nullptr && loop->is_range_check_if(if_success_proj, this, invar DEBUG_ONLY(COMMA parse_predicate_proj))) {
C->print_method(PHASE_BEFORE_LOOP_PREDICATION_RC, 4, iff);
// Range check for counted loops
assert(if_success_proj->is_IfTrue(), "trap must be on false projection for a range check");
const Node* cmp = bol->in(1)->as_Cmp();
@@ -1270,6 +1275,8 @@ bool PhaseIdealLoop::loop_predication_impl_helper(IdealLoopTree* loop, IfProjNod
new_predicate_proj = add_template_assertion_predicate(iff, loop, if_success_proj, parse_predicate_proj, upper_bound_proj, scale,
offset, init, limit, stride, rng, overflow, reason);
C->print_method(PHASE_AFTER_LOOP_PREDICATION_RC, 4, new_predicate_proj->in(0));
#ifndef PRODUCT
if (TraceLoopOpts && !TraceLoopPredicate) {
tty->print("Predicate RC ");

View File

@@ -703,6 +703,9 @@ void PhaseIdealLoop::do_peeling(IdealLoopTree *loop, Node_List &old_new) {
}
#endif
LoopNode* head = loop->_head->as_Loop();
C->print_method(PHASE_BEFORE_LOOP_PEELING, 4, head);
bool counted_loop = head->is_CountedLoop();
if (counted_loop) {
CountedLoopNode *cl = head->as_CountedLoop();
@@ -795,6 +798,8 @@ void PhaseIdealLoop::do_peeling(IdealLoopTree *loop, Node_List &old_new) {
peeled_dom_test_elim(loop,old_new);
loop->record_for_igvn();
C->print_method(PHASE_AFTER_LOOP_PEELING, 4, new_head);
}
//------------------------------policy_maximally_unroll------------------------
@@ -1629,6 +1634,8 @@ void PhaseIdealLoop::insert_pre_post_loops(IdealLoopTree *loop, Node_List &old_n
CountedLoopEndNode *main_end = main_head->loopexit();
assert(main_end->outcnt() == 2, "1 true, 1 false path only");
C->print_method(PHASE_BEFORE_PRE_MAIN_POST, 4, main_head);
Node *pre_header= main_head->in(LoopNode::EntryControl);
Node *init = main_head->init_trip();
Node *incr = main_end ->incr();
@@ -1825,6 +1832,8 @@ void PhaseIdealLoop::insert_pre_post_loops(IdealLoopTree *loop, Node_List &old_n
// finds some, but we _know_ they are all useless.
peeled_dom_test_elim(loop,old_new);
loop->record_for_igvn();
C->print_method(PHASE_AFTER_PRE_MAIN_POST, 4, main_head);
}
//------------------------------insert_vector_post_loop------------------------
@@ -2127,6 +2136,9 @@ void PhaseIdealLoop::do_unroll(IdealLoopTree *loop, Node_List &old_new, bool adj
assert(LoopUnrollLimit, "");
CountedLoopNode *loop_head = loop->_head->as_CountedLoop();
CountedLoopEndNode *loop_end = loop_head->loopexit();
C->print_method(PHASE_BEFORE_LOOP_UNROLLING, 4, loop_head);
#ifndef PRODUCT
if (PrintOpto && VerifyLoopOptimizations) {
tty->print("Unrolling ");
@@ -2374,6 +2386,8 @@ void PhaseIdealLoop::do_unroll(IdealLoopTree *loop, Node_List &old_new, bool adj
}
}
#endif
C->print_method(PHASE_AFTER_LOOP_UNROLLING, 4, clone_head);
}
//------------------------------do_maximally_unroll----------------------------
@@ -3003,6 +3017,8 @@ void PhaseIdealLoop::do_range_check(IdealLoopTree *loop, Node_List &old_new) {
// stride_con and scale_con can be negative which will flip about the
// sense of the test.
C->print_method(PHASE_BEFORE_RANGE_CHECK_ELIMINATION, 4, iff);
// Perform the limit computations in jlong to avoid overflow
jlong lscale_con = scale_con;
Node* int_offset = offset;
@@ -3103,6 +3119,9 @@ void PhaseIdealLoop::do_range_check(IdealLoopTree *loop, Node_List &old_new) {
--imax;
}
}
C->print_method(PHASE_AFTER_RANGE_CHECK_ELIMINATION, 4, cl);
} // End of is IF
}
if (loop_entry != cl->skip_strip_mined()->in(LoopNode::EntryControl)) {


@@ -134,6 +134,8 @@ void PhaseIdealLoop::do_unswitching(IdealLoopTree *loop, Node_List &old_new) {
}
#endif
C->print_method(PHASE_BEFORE_LOOP_UNSWITCHING, 4, head);
// Need to revert back to normal loop
if (head->is_CountedLoop() && !head->as_CountedLoop()->is_normal_loop()) {
head->as_CountedLoop()->set_normal_loop();
@@ -200,6 +202,8 @@ void PhaseIdealLoop::do_unswitching(IdealLoopTree *loop, Node_List &old_new) {
}
#endif
C->print_method(PHASE_AFTER_LOOP_UNSWITCHING, 4, head_clone);
C->set_major_progress();
}


@@ -1446,7 +1446,12 @@ void PhaseIdealLoop::split_if_with_blocks_post(Node *n) {
}
// Now split the IF
C->print_method(PHASE_BEFORE_SPLIT_IF, 4, iff);
if ((PrintOpto && VerifyLoopOptimizations) || TraceLoopOpts) {
tty->print_cr("Split-If");
}
do_split_if(iff);
C->print_method(PHASE_AFTER_SPLIT_IF, 4, iff);
return;
}
@@ -3625,6 +3630,9 @@ bool PhaseIdealLoop::partial_peel( IdealLoopTree *loop, Node_List &old_new ) {
}
}
#endif
C->print_method(PHASE_BEFORE_PARTIAL_PEELING, 4, head);
VectorSet peel;
VectorSet not_peel;
Node_List peel_list;
@@ -3919,6 +3927,9 @@ bool PhaseIdealLoop::partial_peel( IdealLoopTree *loop, Node_List &old_new ) {
}
}
#endif
C->print_method(PHASE_AFTER_PARTIAL_PEELING, 4, new_head_clone);
return true;
}


@@ -2779,7 +2779,7 @@ void Parse::do_one_bytecode() {
}
#ifndef PRODUCT
constexpr int perBytecode = 5;
constexpr int perBytecode = 6;
if (C->should_print_igv(perBytecode)) {
IdealGraphPrinter* printer = C->igv_printer();
char buffer[256];


@@ -894,7 +894,7 @@ void PhaseIterGVN::verify_step(Node* n) {
void PhaseIterGVN::trace_PhaseIterGVN(Node* n, Node* nn, const Type* oldtype) {
const Type* newtype = type_or_null(n);
if (nn != n || oldtype != newtype) {
C->print_method(PHASE_AFTER_ITER_GVN_STEP, 4, n);
C->print_method(PHASE_AFTER_ITER_GVN_STEP, 5, n);
}
if (TraceIterativeGVN) {
uint wlsize = _worklist.size();
@@ -1025,6 +1025,7 @@ void PhaseIterGVN::trace_PhaseIterGVN_verbose(Node* n, int num_processed) {
void PhaseIterGVN::optimize() {
DEBUG_ONLY(uint num_processed = 0;)
NOT_PRODUCT(init_verifyPhaseIterGVN();)
NOT_PRODUCT(C->reset_igv_phase_iter(PHASE_AFTER_ITER_GVN_STEP);)
C->print_method(PHASE_BEFORE_ITER_GVN, 3);
if (StressIGVN) {
shuffle_worklist();


@@ -28,51 +28,77 @@
#include "utilities/bitMap.inline.hpp"
#define COMPILER_PHASES(flags) \
flags(BEFORE_STRINGOPTS, "Before StringOpts") \
flags(AFTER_STRINGOPTS, "After StringOpts") \
flags(BEFORE_REMOVEUSELESS, "Before RemoveUseless") \
flags(AFTER_PARSING, "After Parsing") \
flags(BEFORE_ITER_GVN, "Before Iter GVN") \
flags(ITER_GVN1, "Iter GVN 1") \
flags(AFTER_ITER_GVN_STEP, "After Iter GVN Step") \
flags(AFTER_ITER_GVN, "After Iter GVN") \
flags(INCREMENTAL_INLINE_STEP, "Incremental Inline Step") \
flags(INCREMENTAL_INLINE_CLEANUP, "Incremental Inline Cleanup") \
flags(INCREMENTAL_INLINE, "Incremental Inline") \
flags(INCREMENTAL_BOXING_INLINE, "Incremental Boxing Inline") \
flags(EXPAND_VUNBOX, "Expand VectorUnbox") \
flags(SCALARIZE_VBOX, "Scalarize VectorBox") \
flags(INLINE_VECTOR_REBOX, "Inline Vector Rebox Calls") \
flags(EXPAND_VBOX, "Expand VectorBox") \
flags(ELIMINATE_VBOX_ALLOC, "Eliminate VectorBoxAllocate") \
flags(ITER_GVN_BEFORE_EA, "Iter GVN before EA") \
flags(ITER_GVN_AFTER_VECTOR, "Iter GVN after vector box elimination") \
flags(BEFORE_BEAUTIFY_LOOPS, "Before beautify loops") \
flags(AFTER_BEAUTIFY_LOOPS, "After beautify loops") \
flags(BEFORE_CLOOPS, "Before CountedLoop") \
flags(AFTER_CLOOPS, "After CountedLoop") \
flags(PHASEIDEAL_BEFORE_EA, "PhaseIdealLoop before EA") \
flags(AFTER_EA, "After Escape Analysis") \
flags(ITER_GVN_AFTER_EA, "Iter GVN after EA") \
flags(ITER_GVN_AFTER_ELIMINATION, "Iter GVN after eliminating allocations and locks") \
flags(PHASEIDEALLOOP1, "PhaseIdealLoop 1") \
flags(PHASEIDEALLOOP2, "PhaseIdealLoop 2") \
flags(PHASEIDEALLOOP3, "PhaseIdealLoop 3") \
flags(CCP1, "PhaseCCP 1") \
flags(ITER_GVN2, "Iter GVN 2") \
flags(PHASEIDEALLOOP_ITERATIONS, "PhaseIdealLoop iterations") \
flags(MACRO_EXPANSION, "Macro expand") \
flags(BARRIER_EXPANSION, "Barrier expand") \
flags(OPTIMIZE_FINISHED, "Optimize finished") \
flags(BEFORE_MATCHING, "Before matching") \
flags(MATCHING, "After matching") \
flags(GLOBAL_CODE_MOTION, "Global code motion") \
flags(MACH_ANALYSIS, "After mach analysis") \
flags(FINAL_CODE, "Final Code") \
flags(END, "End") \
flags(FAILURE, "Failure") \
flags(ALL, "All") \
flags(DEBUG, "Debug")
flags(BEFORE_STRINGOPTS, "Before StringOpts") \
flags(AFTER_STRINGOPTS, "After StringOpts") \
flags(BEFORE_REMOVEUSELESS, "Before RemoveUseless") \
flags(AFTER_PARSING, "After Parsing") \
flags(BEFORE_ITER_GVN, "Before Iter GVN") \
flags(ITER_GVN1, "Iter GVN 1") \
flags(AFTER_ITER_GVN_STEP, "After Iter GVN Step") \
flags(AFTER_ITER_GVN, "After Iter GVN") \
flags(INCREMENTAL_INLINE_STEP, "Incremental Inline Step") \
flags(INCREMENTAL_INLINE_CLEANUP, "Incremental Inline Cleanup") \
flags(INCREMENTAL_INLINE, "Incremental Inline") \
flags(INCREMENTAL_BOXING_INLINE, "Incremental Boxing Inline") \
flags(EXPAND_VUNBOX, "Expand VectorUnbox") \
flags(SCALARIZE_VBOX, "Scalarize VectorBox") \
flags(INLINE_VECTOR_REBOX, "Inline Vector Rebox Calls") \
flags(EXPAND_VBOX, "Expand VectorBox") \
flags(ELIMINATE_VBOX_ALLOC, "Eliminate VectorBoxAllocate") \
flags(ITER_GVN_BEFORE_EA, "Iter GVN before EA") \
flags(ITER_GVN_AFTER_VECTOR, "Iter GVN after vector box elimination") \
flags(BEFORE_BEAUTIFY_LOOPS, "Before beautify loops") \
flags(AFTER_BEAUTIFY_LOOPS, "After beautify loops") \
flags(BEFORE_LOOP_UNROLLING, "Before Loop Unrolling") \
flags(AFTER_LOOP_UNROLLING, "After Loop Unrolling") \
flags(BEFORE_SPLIT_IF, "Before Split-If") \
flags(AFTER_SPLIT_IF, "After Split-If") \
flags(BEFORE_LOOP_PREDICATION_IC, "Before Loop Predication IC") \
flags(AFTER_LOOP_PREDICATION_IC, "After Loop Predication IC") \
flags(BEFORE_LOOP_PREDICATION_RC, "Before Loop Predication RC") \
flags(AFTER_LOOP_PREDICATION_RC, "After Loop Predication RC") \
flags(BEFORE_PARTIAL_PEELING, "Before Partial Peeling") \
flags(AFTER_PARTIAL_PEELING, "After Partial Peeling") \
flags(BEFORE_LOOP_PEELING, "Before Loop Peeling") \
flags(AFTER_LOOP_PEELING, "After Loop Peeling") \
flags(BEFORE_LOOP_UNSWITCHING, "Before Loop Unswitching") \
flags(AFTER_LOOP_UNSWITCHING, "After Loop Unswitching") \
flags(BEFORE_RANGE_CHECK_ELIMINATION, "Before Range Check Elimination") \
flags(AFTER_RANGE_CHECK_ELIMINATION, "After Range Check Elimination") \
flags(BEFORE_PRE_MAIN_POST, "Before Pre/Main/Post Loops") \
flags(AFTER_PRE_MAIN_POST, "After Pre/Main/Post Loops") \
flags(SUPERWORD1_BEFORE_SCHEDULE, "Superword 1, Before Schedule") \
flags(SUPERWORD2_BEFORE_OUTPUT, "Superword 2, Before Output") \
flags(SUPERWORD3_AFTER_OUTPUT, "Superword 3, After Output") \
flags(BEFORE_CLOOPS, "Before CountedLoop") \
flags(AFTER_CLOOPS, "After CountedLoop") \
flags(PHASEIDEAL_BEFORE_EA, "PhaseIdealLoop before EA") \
flags(AFTER_EA, "After Escape Analysis") \
flags(ITER_GVN_AFTER_EA, "Iter GVN after EA") \
flags(ITER_GVN_AFTER_ELIMINATION, "Iter GVN after eliminating allocations and locks") \
flags(PHASEIDEALLOOP1, "PhaseIdealLoop 1") \
flags(PHASEIDEALLOOP2, "PhaseIdealLoop 2") \
flags(PHASEIDEALLOOP3, "PhaseIdealLoop 3") \
flags(BEFORE_CCP1, "Before PhaseCCP 1") \
flags(CCP1, "PhaseCCP 1") \
flags(ITER_GVN2, "Iter GVN 2") \
flags(PHASEIDEALLOOP_ITERATIONS, "PhaseIdealLoop iterations") \
flags(MACRO_EXPANSION, "Macro expand") \
flags(BARRIER_EXPANSION, "Barrier expand") \
flags(OPTIMIZE_FINISHED, "Optimize finished") \
flags(BEFORE_MATCHING, "Before matching") \
flags(MATCHING, "After matching") \
flags(GLOBAL_CODE_MOTION, "Global code motion") \
flags(REGISTER_ALLOCATION, "Register Allocation") \
flags(BLOCK_ORDERING, "Block Ordering") \
flags(PEEPHOLE, "Peephole") \
flags(POSTALLOC_EXPAND, "Post-Allocation Expand") \
flags(MACH_ANALYSIS, "After mach analysis") \
flags(FINAL_CODE, "Final Code") \
flags(END, "End") \
flags(FAILURE, "Failure") \
flags(ALL, "All") \
flags(DEBUG, "Debug")
#define table_entry(name, description) PHASE_##name,
enum CompilerPhaseType {
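Editor's note: for readers unfamiliar with the X-macro idiom used here, each flags(NAME, "description") entry is expanded once per consumer macro, so a single list drives both the CompilerPhaseType enum constants and any matching name table. A minimal standalone sketch of the expansion (only the table_entry expansion appears in the hunk above; the name table below is a hypothetical second consumer added for illustration):

#include <cstdio>

#define COMPILER_PHASES(flags)                        \
  flags(BEFORE_STRINGOPTS, "Before StringOpts")       \
  flags(AFTER_STRINGOPTS,  "After StringOpts")

// Expansion 1: enum constants PHASE_BEFORE_STRINGOPTS, PHASE_AFTER_STRINGOPTS, ...
#define table_entry(name, description) PHASE_##name,
enum CompilerPhaseTypeSketch { COMPILER_PHASES(table_entry) PHASE_NUM_TYPES };
#undef table_entry

// Expansion 2 (hypothetical here): matching human-readable names.
#define table_desc(name, description) description,
static const char* phase_names[] = { COMPILER_PHASES(table_desc) };
#undef table_desc

int main() {
  std::printf("%d -> %s\n", PHASE_AFTER_STRINGOPTS, phase_names[PHASE_AFTER_STRINGOPTS]);
  return 0;
}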


@@ -591,12 +591,6 @@ void PhaseIdealLoop::handle_use( Node *use, Node *def, small_cache *cache, Node
// Found an If getting its condition-code input from a Phi in the same block.
// Split thru the Region.
void PhaseIdealLoop::do_split_if(Node* iff, RegionNode** new_false_region, RegionNode** new_true_region) {
if (PrintOpto && VerifyLoopOptimizations) {
tty->print_cr("Split-if");
}
if (TraceLoopOpts) {
tty->print_cr("SplitIf");
}
C->set_major_progress();
RegionNode *region = iff->in(0)->as_Region();


@@ -2381,6 +2381,9 @@ void SuperWord::schedule() {
}
#endif
CountedLoopNode* cl = lpt()->_head->as_CountedLoop();
_phase->C->print_method(PHASE_SUPERWORD1_BEFORE_SCHEDULE, 4, cl);
// (4) Use the memops_schedule to re-order the memops in all slices.
schedule_reorder_memops(memops_schedule);
}
@@ -2488,6 +2491,7 @@ bool SuperWord::output() {
lpt()->dump_head();
}
#endif
_phase->C->print_method(PHASE_SUPERWORD2_BEFORE_OUTPUT, 4, cl);
// Ensure main loop's initial value is properly aligned
// (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0
@@ -2808,6 +2812,8 @@ bool SuperWord::output() {
}
}
_phase->C->print_method(PHASE_SUPERWORD3_AFTER_OUTPUT, 4, cl);
return true;
}


@@ -390,7 +390,10 @@ UNSAFE_ENTRY_SCOPED(void, Unsafe_SetMemory0(JNIEnv *env, jobject unsafe, jobject
oop base = JNIHandles::resolve(obj);
void* p = index_oop_from_field_offset_long(base, offset);
Copy::fill_to_memory_atomic(p, sz, value);
{
GuardUnsafeAccess guard(thread);
Copy::fill_to_memory_atomic(p, sz, value);
}
} UNSAFE_END
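Editor's note: the change above wraps the raw fill in a GuardUnsafeAccess scope, an RAII helper that marks the current thread as performing an unsafe memory access for the duration of the copy (so a fault in that window can be reported to Java instead of taking down the VM; that purpose is stated here from general HotSpot knowledge, not from this hunk). A standalone sketch of the RAII pattern, using a hypothetical stand-in class:

#include <cstdio>

// Hypothetical stand-in for GuardUnsafeAccess: set a per-thread flag on
// entry, clear it again when the scope exits.
class ScopedUnsafeAccessMark {
 public:
  explicit ScopedUnsafeAccessMark(bool& flag) : _flag(flag) { _flag = true; }
  ~ScopedUnsafeAccessMark() { _flag = false; }
 private:
  bool& _flag;
};

static bool in_unsafe_access = false;   // models a field on the current thread

int main() {
  {
    ScopedUnsafeAccessMark guard(in_unsafe_access);
    std::printf("filling memory, flag=%d\n", in_unsafe_access);   // flag=1
  }
  std::printf("after the fill, flag=%d\n", in_unsafe_access);     // flag=0
  return 0;
}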
UNSAFE_ENTRY_SCOPED(void, Unsafe_CopyMemory0(JNIEnv *env, jobject unsafe, jobject srcObj, jlong srcOffset, jobject dstObj, jlong dstOffset, jlong size)) {


@@ -35,12 +35,6 @@
class Prefetch : AllStatic {
public:
enum style {
do_none, // Do no prefetching
do_read, // Do read prefetching
do_write // Do write prefetching
};
// Prefetch anticipating read; must not fault, semantically a no-op
static void read(const void* loc, intx interval);


@@ -1473,6 +1473,25 @@ void SymbolTableDumper::do_symbol(Symbol** p) {
}
}
// Support class used to generate HPROF_GC_CLASS_DUMP records
class ClassDumper : public KlassClosure {
private:
AbstractDumpWriter* _writer;
AbstractDumpWriter* writer() const { return _writer; }
public:
ClassDumper(AbstractDumpWriter* writer) : _writer(writer) {}
void do_klass(Klass* k) {
if (k->is_instance_klass()) {
DumperSupport::dump_instance_class(writer(), k);
} else {
DumperSupport::dump_array_class(writer(), k);
}
}
};
// Support class used to generate HPROF_GC_ROOT_JNI_LOCAL records
class JNILocalsDumper : public OopClosure {
@@ -1860,21 +1879,25 @@ vframe* ThreadDumper::get_top_frame() const {
return nullptr;
}
// Callback to dump thread-related data for unmounted virtual threads;
// implemented by VM_HeapDumper.
class UnmountedVThreadDumper {
public:
virtual void dump_vthread(oop vt, AbstractDumpWriter* segment_writer) = 0;
};
class VM_HeapDumper;
// Support class using when iterating over the heap.
// Support class used when iterating over the heap.
class HeapObjectDumper : public ObjectClosure {
private:
AbstractDumpWriter* _writer;
AbstractDumpWriter* writer() { return _writer; }
UnmountedVThreadDumper* _vthread_dumper;
DumperClassCacheTable _class_cache;
public:
HeapObjectDumper(AbstractDumpWriter* writer) {
_writer = writer;
}
HeapObjectDumper(AbstractDumpWriter* writer, UnmountedVThreadDumper* vthread_dumper)
: _writer(writer), _vthread_dumper(vthread_dumper) {}
// called for each object in the heap
void do_object(oop o);
@@ -1895,6 +1918,9 @@ void HeapObjectDumper::do_object(oop o) {
if (o->is_instance()) {
// create a HPROF_GC_INSTANCE record for each object
DumperSupport::dump_instance(writer(), o, &_class_cache);
if (java_lang_VirtualThread::is_instance(o) && ThreadDumper::should_dump_vthread(o)) {
_vthread_dumper->dump_vthread(o, writer());
}
} else if (o->is_objArray()) {
// create a HPROF_GC_OBJ_ARRAY_DUMP record for each object array
DumperSupport::dump_object_array(writer(), objArrayOop(o));
@@ -1908,16 +1934,52 @@ void HeapObjectDumper::do_object(oop o) {
class DumperController : public CHeapObj<mtInternal> {
private:
Monitor* _lock;
Mutex* _global_writer_lock;
const uint _dumper_number;
uint _complete_number;
bool _started; // VM dumper started and acquired global writer lock
public:
DumperController(uint number) :
_lock(new (std::nothrow) PaddedMonitor(Mutex::safepoint, "DumperController_lock")),
// _lock and _global_writer_lock are used for synchronization between GC worker threads inside safepoint,
// so we lock with _no_safepoint_check_flag.
// signal_start() acquires _lock when global writer is locked,
// its rank must be less than _global_writer_lock rank.
_lock(new (std::nothrow) PaddedMonitor(Mutex::nosafepoint - 1, "DumperController_lock")),
_global_writer_lock(new (std::nothrow) Mutex(Mutex::nosafepoint, "DumpWriter_lock")),
_dumper_number(number),
_complete_number(0) { }
_complete_number(0),
_started(false)
{}
~DumperController() { delete _lock; }
~DumperController() {
delete _lock;
delete _global_writer_lock;
}
// parallel (non VM) dumpers must wait until VM dumper acquires global writer lock
void wait_for_start_signal() {
MonitorLocker ml(_lock, Mutex::_no_safepoint_check_flag);
while (_started == false) {
ml.wait();
}
}
void signal_start() {
MonitorLocker ml(_lock, Mutex::_no_safepoint_check_flag);
_started = true;
ml.notify_all();
}
void lock_global_writer() {
_global_writer_lock->lock_without_safepoint_check();
}
void unlock_global_writer() {
_global_writer_lock->unlock();
}
void dumper_complete(DumpWriter* local_writer, DumpWriter* global_writer) {
MonitorLocker ml(_lock, Mutex::_no_safepoint_check_flag);
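Editor's note: the new _started flag plus signal_start()/wait_for_start_signal() form a start handshake: heap-only dumpers block until the VM dumper has taken the global writer lock and signalled start, matching the comment in the hunk above. A standalone sketch of that handshake, with std::mutex and std::condition_variable standing in for HotSpot's Mutex/Monitor (an assumption of this illustration):

#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

struct StartHandshake {
  std::mutex lock;                 // plays the role of _lock
  std::condition_variable cv;
  std::mutex global_writer_lock;   // plays the role of _global_writer_lock
  bool started = false;            // plays the role of _started

  void wait_for_start_signal() {
    std::unique_lock<std::mutex> ml(lock);
    cv.wait(ml, [&] { return started; });
  }
  void signal_start() {
    std::lock_guard<std::mutex> ml(lock);
    started = true;
    cv.notify_all();
  }
};

int main() {
  StartHandshake c;
  std::vector<std::thread> workers;
  for (int i = 1; i <= 3; i++) {
    workers.emplace_back([&c, i] {
      c.wait_for_start_signal();            // heap-only dumpers block here
      std::printf("dumper %d proceeds\n", i);
    });
  }
  c.global_writer_lock.lock();              // VM dumper claims the global writer...
  c.signal_start();                         // ...and only then releases the others
  std::printf("vm dumper writes non-heap data\n");
  c.global_writer_lock.unlock();
  for (auto& t : workers) t.join();
  return 0;
}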
@@ -1946,7 +2008,7 @@ private:
int _dump_seq;
private:
void merge_file(char* path);
void merge_file(const char* path);
void merge_done();
void set_error(const char* msg);
@@ -1958,8 +2020,28 @@ public:
_dump_seq(dump_seq) {}
void do_merge();
// returns path for the parallel DumpWriter (resource allocated)
static char* get_writer_path(const char* base_path, int seq);
};
char* DumpMerger::get_writer_path(const char* base_path, int seq) {
// approximate required buffer size
size_t buf_size = strlen(base_path)
+ 2 // ".p"
+ 10 // number (that's enough for 2^32 parallel dumpers)
+ 1; // '\0'
char* path = NEW_RESOURCE_ARRAY(char, buf_size);
memset(path, 0, buf_size);
os::snprintf(path, buf_size, "%s.p%d", base_path, seq);
return path;
}
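Editor's note: get_writer_path() simply derives per-dumper segment file names from the base heap-dump path; do_merge() later walks the same "<base>.p<seq>" sequence when concatenating the fragments. A tiny standalone illustration of the naming scheme (the base path below is made up):

#include <cstdio>

int main() {
  const char* base = "java_pid1234.hprof";   // hypothetical base path
  char path[64];
  for (int seq = 0; seq < 3; seq++) {
    std::snprintf(path, sizeof(path), "%s.p%d", base, seq);
    std::printf("%s\n", path);               // java_pid1234.hprof.p0 / .p1 / .p2
  }
  return 0;
}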
void DumpMerger::merge_done() {
// Writes the HPROF_HEAP_DUMP_END record.
if (!_has_error) {
@@ -1980,8 +2062,7 @@ void DumpMerger::set_error(const char* msg) {
// Merge segmented heap files via sendfile, it's more efficient than the
// read+write combination, which would require transferring data to and from
// user space.
void DumpMerger::merge_file(char* path) {
assert(!SafepointSynchronize::is_at_safepoint(), "merging happens outside safepoint");
void DumpMerger::merge_file(const char* path) {
TraceTime timer("Merge segmented heap file directly", TRACETIME_LOG(Info, heapdump));
int segment_fd = os::open(path, O_RDONLY, 0);
@@ -2018,8 +2099,7 @@ void DumpMerger::merge_file(char* path) {
}
#else
// Generic implementation using read+write
void DumpMerger::merge_file(char* path) {
assert(!SafepointSynchronize::is_at_safepoint(), "merging happens outside safepoint");
void DumpMerger::merge_file(const char* path) {
TraceTime timer("Merge segmented heap file", TRACETIME_LOG(Info, heapdump));
fileStream segment_fs(path, "rb");
@@ -2044,7 +2124,6 @@ void DumpMerger::merge_file(char* path) {
#endif
void DumpMerger::do_merge() {
assert(!SafepointSynchronize::is_at_safepoint(), "merging happens outside safepoint");
TraceTime timer("Merge heap files complete", TRACETIME_LOG(Info, heapdump));
// Since contents in segmented heap file were already zipped, we don't need to zip
@@ -2054,10 +2133,9 @@ void DumpMerger::do_merge() {
// Merge the content of the remaining files into base file. Regardless of whether
// the merge process is successful or not, these segmented files will be deleted.
char path[JVM_MAXPATHLEN];
for (int i = 0; i < _dump_seq; i++) {
memset(path, 0, JVM_MAXPATHLEN);
os::snprintf(path, JVM_MAXPATHLEN, "%s.p%d", _path, i);
ResourceMark rm;
const char* path = get_writer_path(_path, i);
if (!_has_error) {
merge_file(path);
}
@@ -2087,7 +2165,7 @@ public:
};
// The VM operation that performs the heap dump
class VM_HeapDumper : public VM_GC_Operation, public WorkerTask {
class VM_HeapDumper : public VM_GC_Operation, public WorkerTask, public UnmountedVThreadDumper {
private:
static VM_HeapDumper* _global_dumper;
static DumpWriter* _global_writer;
@@ -2107,10 +2185,15 @@ class VM_HeapDumper : public VM_GC_Operation, public WorkerTask {
uint _num_dumper_threads;
DumperController* _dumper_controller;
ParallelObjectIterator* _poi;
// worker id of VMDumper thread.
static const size_t VMDumperWorkerId = 0;
// Dumper id of VMDumper thread.
static const int VMDumperId = 0;
// VM dumper dumps both heap and non-heap data, other dumpers dump heap-only data.
static bool is_vm_dumper(uint worker_id) { return worker_id == VMDumperWorkerId; }
static bool is_vm_dumper(int dumper_id) { return dumper_id == VMDumperId; }
// the 1st dumper calling get_next_dumper_id becomes VM dumper
int get_next_dumper_id() {
return Atomic::fetch_then_add(&_dump_seq, 1);
}
// accessors and setters
static VM_HeapDumper* dumper() { assert(_global_dumper != nullptr, "Error"); return _global_dumper; }
@@ -2129,17 +2212,11 @@ class VM_HeapDumper : public VM_GC_Operation, public WorkerTask {
bool skip_operation() const;
// create dump writer for every parallel dump thread
DumpWriter* create_local_writer();
// writes a HPROF_LOAD_CLASS record
// writes a HPROF_LOAD_CLASS record to global writer
static void do_load_class(Klass* k);
// writes a HPROF_GC_CLASS_DUMP record for the given class
static void do_class_dump(Klass* k);
// HPROF_GC_ROOT_THREAD_OBJ records for platform and mounted virtual threads
void dump_threads();
void dump_threads(AbstractDumpWriter* writer);
void add_class_serial_number(Klass* k, int serial_num) {
_klass_map->at_put_grow(serial_num, k);
@@ -2150,7 +2227,7 @@ class VM_HeapDumper : public VM_GC_Operation, public WorkerTask {
}
// HPROF_TRACE and HPROF_FRAME records for platform and mounted virtual threads
void dump_stack_traces();
void dump_stack_traces(AbstractDumpWriter* writer);
public:
VM_HeapDumper(DumpWriter* writer, bool gc_before_heap_dump, bool oome, uint num_dump_threads) :
@@ -2168,7 +2245,7 @@ class VM_HeapDumper : public VM_GC_Operation, public WorkerTask {
_thread_serial_num = 1;
_frame_serial_num = 1;
_dump_seq = 0;
_dump_seq = VMDumperId;
_num_dumper_threads = num_dump_threads;
_dumper_controller = nullptr;
_poi = nullptr;
@@ -2202,12 +2279,15 @@ class VM_HeapDumper : public VM_GC_Operation, public WorkerTask {
}
int dump_seq() { return _dump_seq; }
bool is_parallel_dump() { return _num_dumper_threads > 1; }
bool can_parallel_dump(WorkerThreads* workers);
void prepare_parallel_dump(WorkerThreads* workers);
VMOp_Type type() const { return VMOp_HeapDumper; }
virtual bool doit_prologue();
void doit();
void work(uint worker_id);
// UnmountedVThreadDumper implementation
void dump_vthread(oop vt, AbstractDumpWriter* segment_writer);
};
VM_HeapDumper* VM_HeapDumper::_global_dumper = nullptr;
@@ -2251,22 +2331,13 @@ void VM_HeapDumper::do_load_class(Klass* k) {
writer()->write_symbolID(name);
}
// writes a HPROF_GC_CLASS_DUMP record for the given class
void VM_HeapDumper::do_class_dump(Klass* k) {
if (k->is_instance_klass()) {
DumperSupport::dump_instance_class(writer(), k);
} else {
DumperSupport::dump_array_class(writer(), k);
}
}
// Write a HPROF_GC_ROOT_THREAD_OBJ record for platform/carrier and mounted virtual threads.
// Then walk the stack so that locals and JNI locals are dumped.
void VM_HeapDumper::dump_threads() {
for (int i = 0; i < _thread_dumpers_count; i++) {
_thread_dumpers[i]->dump_thread_obj(writer());
_thread_dumpers[i]->dump_stack_refs(writer());
}
void VM_HeapDumper::dump_threads(AbstractDumpWriter* writer) {
for (int i = 0; i < _thread_dumpers_count; i++) {
_thread_dumpers[i]->dump_thread_obj(writer);
_thread_dumpers[i]->dump_stack_refs(writer);
}
}
bool VM_HeapDumper::doit_prologue() {
@@ -2280,31 +2351,21 @@ bool VM_HeapDumper::doit_prologue() {
return VM_GC_Operation::doit_prologue();
}
bool VM_HeapDumper::can_parallel_dump(WorkerThreads* workers) {
bool can_parallel = true;
void VM_HeapDumper::prepare_parallel_dump(WorkerThreads* workers) {
uint num_active_workers = workers != nullptr ? workers->active_workers() : 0;
uint num_requested_dump_threads = _num_dumper_threads;
// check if we can dump in parallel based on requested and active threads
if (num_active_workers <= 1 || num_requested_dump_threads <= 1) {
_num_dumper_threads = 1;
can_parallel = false;
} else {
// check if we have extra path room to accommodate segmented heap files
const char* base_path = writer()->get_file_path();
assert(base_path != nullptr, "sanity check");
if ((strlen(base_path) + 7/*.p\d\d\d\d\0*/) >= JVM_MAXPATHLEN) {
_num_dumper_threads = 1;
can_parallel = false;
} else {
_num_dumper_threads = clamp(num_requested_dump_threads, 2U, num_active_workers);
}
_num_dumper_threads = clamp(num_requested_dump_threads, 2U, num_active_workers);
}
_dumper_controller = new (std::nothrow) DumperController(_num_dumper_threads);
bool can_parallel = _num_dumper_threads > 1;
log_info(heapdump)("Requested dump threads %u, active dump threads %u, "
"actual dump threads %u, parallelism %s",
num_requested_dump_threads, num_active_workers,
_num_dumper_threads, can_parallel ? "true" : "false");
return can_parallel;
}
// The VM operation that dumps the heap. The dump consists of the following
@@ -2352,11 +2413,11 @@ void VM_HeapDumper::doit() {
set_global_writer();
WorkerThreads* workers = ch->safepoint_workers();
if (!can_parallel_dump(workers)) {
work(VMDumperWorkerId);
prepare_parallel_dump(workers);
if (!is_parallel_dump()) {
work(VMDumperId);
} else {
uint heap_only_dumper_threads = _num_dumper_threads - 1 /* VMDumper thread */;
_dumper_controller = new (std::nothrow) DumperController(heap_only_dumper_threads);
ParallelObjectIterator poi(_num_dumper_threads);
_poi = &poi;
workers->run_task(this, _num_dumper_threads);
@@ -2368,26 +2429,19 @@ void VM_HeapDumper::doit() {
clear_global_writer();
}
// prepare DumpWriter for every parallel dump thread
DumpWriter* VM_HeapDumper::create_local_writer() {
char* path = NEW_RESOURCE_ARRAY(char, JVM_MAXPATHLEN);
memset(path, 0, JVM_MAXPATHLEN);
// generate segmented heap file path
const char* base_path = writer()->get_file_path();
// share global compressor, local DumpWriter is not responsible for its life cycle
AbstractCompressor* compressor = writer()->compressor();
int seq = Atomic::fetch_then_add(&_dump_seq, 1);
os::snprintf(path, JVM_MAXPATHLEN, "%s.p%d", base_path, seq);
// create corresponding writer for that
DumpWriter* local_writer = new DumpWriter(path, writer()->is_overwrite(), compressor);
return local_writer;
}
void VM_HeapDumper::work(uint worker_id) {
// VM Dumper works on all non-heap data dumping and part of heap iteration.
if (is_vm_dumper(worker_id)) {
int dumper_id = get_next_dumper_id();
if (is_vm_dumper(dumper_id)) {
// lock global writer, it will be unlocked after VM Dumper finishes with non-heap data
_dumper_controller->lock_global_writer();
_dumper_controller->signal_start();
} else {
_dumper_controller->wait_for_start_signal();
}
if (is_vm_dumper(dumper_id)) {
TraceTime timer("Dump non-objects", TRACETIME_LOG(Info, heapdump));
// Write the file header - we always use 1.0.2
const char* header = "JAVA PROFILE 1.0.2";
@@ -2409,79 +2463,82 @@ void VM_HeapDumper::work(uint worker_id) {
// write HPROF_FRAME and HPROF_TRACE records
// this must be called after _klass_map is built when iterating the classes above.
dump_stack_traces();
dump_stack_traces(writer());
// HPROF_HEAP_DUMP/HPROF_HEAP_DUMP_SEGMENT starts here
// Writes HPROF_GC_CLASS_DUMP records
{
LockedClassesDo locked_dump_class(&do_class_dump);
ClassLoaderDataGraph::classes_do(&locked_dump_class);
}
// HPROF_GC_ROOT_THREAD_OBJ + frames + jni locals
dump_threads();
// HPROF_GC_ROOT_JNI_GLOBAL
JNIGlobalsDumper jni_dumper(writer());
JNIHandles::oops_do(&jni_dumper);
// technically not jni roots, but global roots
// for things like preallocated throwable backtraces
Universe::vm_global()->oops_do(&jni_dumper);
// HPROF_GC_ROOT_STICKY_CLASS
// These should be classes in the null class loader data, and not all classes
// if !ClassUnloading
StickyClassDumper class_dumper(writer());
ClassLoaderData::the_null_class_loader_data()->classes_do(&class_dumper);
// unlock global writer, so parallel dumpers can dump stack traces of unmounted virtual threads
_dumper_controller->unlock_global_writer();
}
// Heap iteration.
// writes HPROF_GC_INSTANCE_DUMP records.
// After each sub-record is written check_segment_length will be invoked
// to check if the current segment exceeds a threshold. If so, a new
// segment is started.
// The HPROF_GC_CLASS_DUMP and HPROF_GC_INSTANCE_DUMP are the vast bulk
// of the heap dump.
if (!is_parallel_dump()) {
assert(is_vm_dumper(worker_id), "must be");
// == Serial dump
ResourceMark rm;
TraceTime timer("Dump heap objects", TRACETIME_LOG(Info, heapdump));
HeapObjectDumper obj_dumper(writer());
Universe::heap()->object_iterate(&obj_dumper);
writer()->finish_dump_segment();
// Writes the HPROF_HEAP_DUMP_END record because merge does not happen in serial dump
DumperSupport::end_of_dump(writer());
writer()->flush();
} else {
// == Parallel dump
ResourceMark rm;
TraceTime timer("Dump heap objects in parallel", TRACETIME_LOG(Info, heapdump));
DumpWriter* local_writer = is_vm_dumper(worker_id) ? writer() : create_local_writer();
if (!local_writer->has_error()) {
HeapObjectDumper obj_dumper(local_writer);
_poi->object_iterate(&obj_dumper, worker_id);
local_writer->finish_dump_segment();
local_writer->flush();
// HPROF_HEAP_DUMP/HPROF_HEAP_DUMP_SEGMENT starts here
ResourceMark rm;
// share global compressor, local DumpWriter is not responsible for its life cycle
DumpWriter segment_writer(DumpMerger::get_writer_path(writer()->get_file_path(), dumper_id),
writer()->is_overwrite(), writer()->compressor());
if (!segment_writer.has_error()) {
if (is_vm_dumper(dumper_id)) {
// dump some non-heap subrecords to heap dump segment
TraceTime timer("Dump non-objects (part 2)", TRACETIME_LOG(Info, heapdump));
// Writes HPROF_GC_CLASS_DUMP records
ClassDumper class_dumper(&segment_writer);
ClassLoaderDataGraph::classes_do(&class_dumper);
// HPROF_GC_ROOT_THREAD_OBJ + frames + jni locals
dump_threads(&segment_writer);
// HPROF_GC_ROOT_JNI_GLOBAL
JNIGlobalsDumper jni_dumper(&segment_writer);
JNIHandles::oops_do(&jni_dumper);
// technically not jni roots, but global roots
// for things like preallocated throwable backtraces
Universe::vm_global()->oops_do(&jni_dumper);
// HPROF_GC_ROOT_STICKY_CLASS
// These should be classes in the null class loader data, and not all classes
// if !ClassUnloading
StickyClassDumper stiky_class_dumper(&segment_writer);
ClassLoaderData::the_null_class_loader_data()->classes_do(&stiky_class_dumper);
}
if (is_vm_dumper(worker_id)) {
_dumper_controller->wait_all_dumpers_complete();
// Heap iteration.
// writes HPROF_GC_INSTANCE_DUMP records.
// After each sub-record is written check_segment_length will be invoked
// to check if the current segment exceeds a threshold. If so, a new
// segment is started.
// The HPROF_GC_CLASS_DUMP and HPROF_GC_INSTANCE_DUMP are the vast bulk
// of the heap dump.
TraceTime timer(is_parallel_dump() ? "Dump heap objects in parallel" : "Dump heap objects", TRACETIME_LOG(Info, heapdump));
HeapObjectDumper obj_dumper(&segment_writer, this);
if (!is_parallel_dump()) {
Universe::heap()->object_iterate(&obj_dumper);
} else {
_dumper_controller->dumper_complete(local_writer, writer());
delete local_writer;
return;
// == Parallel dump
_poi->object_iterate(&obj_dumper, worker_id);
}
segment_writer.finish_dump_segment();
segment_writer.flush();
}
_dumper_controller->dumper_complete(&segment_writer, writer());
if (is_vm_dumper(dumper_id)) {
_dumper_controller->wait_all_dumpers_complete();
// flush global writer
writer()->flush();
// At this point, all fragments of the heapdump have been written to separate files.
// We need to merge them into a complete heapdump and write HPROF_HEAP_DUMP_END at that time.
}
// At this point, all fragments of the heapdump have been written to separate files.
// We need to merge them into a complete heapdump and write HPROF_HEAP_DUMP_END at that time.
}
void VM_HeapDumper::dump_stack_traces() {
void VM_HeapDumper::dump_stack_traces(AbstractDumpWriter* writer) {
// write a HPROF_TRACE record without any frames to be referenced as object alloc sites
DumperSupport::write_header(writer(), HPROF_TRACE, 3 * sizeof(u4));
writer()->write_u4((u4)STACK_TRACE_ID);
writer()->write_u4(0); // thread number
writer()->write_u4(0); // frame count
DumperSupport::write_header(writer, HPROF_TRACE, 3 * sizeof(u4));
writer->write_u4((u4)STACK_TRACE_ID);
writer->write_u4(0); // thread number
writer->write_u4(0); // frame count
// max number if every platform thread is carrier with mounted virtual thread
_thread_dumpers = NEW_C_HEAP_ARRAY(ThreadDumper*, Threads::number_of_threads() * 2, mtInternal);
@@ -2505,7 +2562,7 @@ void VM_HeapDumper::dump_stack_traces() {
add_oom_frame = false;
}
thread_dumper->init_serial_nums(&_thread_serial_num, &_frame_serial_num);
thread_dumper->dump_stack_traces(writer(), _klass_map);
thread_dumper->dump_stack_traces(writer, _klass_map);
}
// platform or carrier thread
@@ -2515,11 +2572,27 @@ void VM_HeapDumper::dump_stack_traces() {
thread_dumper->add_oom_frame(_oome_constructor);
}
thread_dumper->init_serial_nums(&_thread_serial_num, &_frame_serial_num);
thread_dumper->dump_stack_traces(writer(), _klass_map);
thread_dumper->dump_stack_traces(writer, _klass_map);
}
}
}
void VM_HeapDumper::dump_vthread(oop vt, AbstractDumpWriter* segment_writer) {
// unmounted vthread has no JavaThread
ThreadDumper thread_dumper(ThreadDumper::ThreadType::UnmountedVirtual, nullptr, vt);
thread_dumper.init_serial_nums(&_thread_serial_num, &_frame_serial_num);
// write HPROF_TRACE/HPROF_FRAME records to global writer
_dumper_controller->lock_global_writer();
thread_dumper.dump_stack_traces(writer(), _klass_map);
_dumper_controller->unlock_global_writer();
// write HPROF_GC_ROOT_THREAD_OBJ/HPROF_GC_ROOT_JAVA_FRAME/HPROF_GC_ROOT_JNI_LOCAL subrecord
// to segment writer
thread_dumper.dump_thread_obj(segment_writer);
thread_dumper.dump_stack_refs(segment_writer);
}
// dump the heap to given path.
int HeapDumper::dump(const char* path, outputStream* out, int compression, bool overwrite, uint num_dump_threads) {
assert(path != nullptr && strlen(path) > 0, "path missing");
@@ -2561,28 +2634,27 @@ int HeapDumper::dump(const char* path, outputStream* out, int compression, bool
// record any error that the writer may have encountered
set_error(writer.error());
// For serial dump, once VM_HeapDumper completes, the whole heap dump process
// is done, no further phases needed. For parallel dump, the whole heap dump
// process is done in two phases
// Heap dump process is done in two phases
//
// Phase 1: Concurrent threads directly write heap data to multiple heap files.
// This is done by VM_HeapDumper, which is performed within safepoint.
//
// Phase 2: Merge multiple heap files into one complete heap dump file.
// This is done by DumpMerger, which is performed outside safepoint
if (dumper.is_parallel_dump()) {
DumpMerger merger(path, &writer, dumper.dump_seq());
Thread* current_thread = Thread::current();
if (current_thread->is_AttachListener_thread()) {
// perform heapdump file merge operation in the current thread prevents us
// from occupying the VM Thread, which in turn affects the occurrence of
// GC and other VM operations.
merger.do_merge();
} else {
// otherwise, performs it by VM thread
VM_HeapDumpMerge op(&merger);
VMThread::execute(&op);
}
DumpMerger merger(path, &writer, dumper.dump_seq());
Thread* current_thread = Thread::current();
if (current_thread->is_AttachListener_thread()) {
// perform heapdump file merge operation in the current thread prevents us
// from occupying the VM Thread, which in turn affects the occurrence of
// GC and other VM operations.
merger.do_merge();
} else {
// otherwise, performs it by VM thread
VM_HeapDumpMerge op(&merger);
VMThread::execute(&op);
}
if (writer.error() != nullptr) {
set_error(writer.error());
}


@@ -0,0 +1,367 @@
/*
* Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
* Copyright (c) 2021 Serge Sans Paille. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort)
#ifndef AVX2_QSORT_32BIT
#define AVX2_QSORT_32BIT
#include "avx2-emu-funcs.hpp"
#include "xss-common-qsort.h"
/*
* Constants used in sorting 8 elements in a ymm registers. Based on Bitonic
* sorting network (see
* https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg)
*/
// ymm 7, 6, 5, 4, 3, 2, 1, 0
#define NETWORK_32BIT_AVX2_1 4, 5, 6, 7, 0, 1, 2, 3
#define NETWORK_32BIT_AVX2_2 0, 1, 2, 3, 4, 5, 6, 7
#define NETWORK_32BIT_AVX2_3 5, 4, 7, 6, 1, 0, 3, 2
#define NETWORK_32BIT_AVX2_4 3, 2, 1, 0, 7, 6, 5, 4
/*
* Assumes ymm is random and performs a full sorting network defined in
* https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
*/
template <typename vtype, typename reg_t = typename vtype::reg_t>
X86_SIMD_SORT_INLINE reg_t sort_ymm_32bit(reg_t ymm) {
const typename vtype::opmask_t oxAA = _mm256_set_epi32(
0xFFFFFFFF, 0, 0xFFFFFFFF, 0, 0xFFFFFFFF, 0, 0xFFFFFFFF, 0);
const typename vtype::opmask_t oxCC = _mm256_set_epi32(
0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
const typename vtype::opmask_t oxF0 = _mm256_set_epi32(
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0, 0);
const typename vtype::ymmi_t rev_index = vtype::seti(NETWORK_32BIT_AVX2_2);
ymm = cmp_merge<vtype>(
ymm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(ymm), oxAA);
ymm = cmp_merge<vtype>(
ymm, vtype::permutexvar(vtype::seti(NETWORK_32BIT_AVX2_1), ymm), oxCC);
ymm = cmp_merge<vtype>(
ymm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(ymm), oxAA);
ymm = cmp_merge<vtype>(ymm, vtype::permutexvar(rev_index, ymm), oxF0);
ymm = cmp_merge<vtype>(
ymm, vtype::permutexvar(vtype::seti(NETWORK_32BIT_AVX2_3), ymm), oxCC);
ymm = cmp_merge<vtype>(
ymm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(ymm), oxAA);
return ymm;
}
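Editor's note: sort_ymm_32bit() is a fixed bitonic sorting network over the 8 lanes of a ymm register. Each cmp_merge() stage pairs the register with a shuffled or permuted copy and keeps the smaller value in unmasked lanes and the larger value in masked lanes (that is how cmp_merge is defined in upstream x86-simd-sort; its definition is not shown in this hunk). A scalar sketch of the first stage, which swaps adjacent lanes and merges with mask 0xAA so each (even, odd) pair ends up as (min, max):

#include <algorithm>
#include <array>
#include <cstdio>

// Scalar model of cmp_merge(v, swapped_pairs(v), 0xAA) for 8 lanes.
static std::array<int, 8> cmp_merge_adjacent_pairs(std::array<int, 8> v) {
  for (int i = 0; i < 8; i += 2) {
    int lo = std::min(v[i], v[i + 1]);
    int hi = std::max(v[i], v[i + 1]);
    v[i] = lo;       // unmasked (even) lane keeps the minimum
    v[i + 1] = hi;   // masked (odd) lane keeps the maximum
  }
  return v;
}

int main() {
  auto out = cmp_merge_adjacent_pairs({5, 1, 7, 3, 2, 8, 6, 4});
  for (int x : out) std::printf("%d ", x);   // prints: 1 5 3 7 2 8 4 6
  std::printf("\n");
  return 0;
}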
struct avx2_32bit_swizzle_ops;
template <>
struct avx2_vector<int32_t> {
using type_t = int32_t;
using reg_t = __m256i;
using ymmi_t = __m256i;
using opmask_t = __m256i;
static const uint8_t numlanes = 8;
#ifdef XSS_MINIMAL_NETWORK_SORT
static constexpr int network_sort_threshold = numlanes;
#else
static constexpr int network_sort_threshold = 256;
#endif
static constexpr int partition_unroll_factor = 4;
using swizzle_ops = avx2_32bit_swizzle_ops;
static type_t type_max() { return X86_SIMD_SORT_MAX_INT32; }
static type_t type_min() { return X86_SIMD_SORT_MIN_INT32; }
static reg_t zmm_max() {
return _mm256_set1_epi32(type_max());
} // TODO: this should broadcast bits as is?
static opmask_t get_partial_loadmask(uint64_t num_to_read) {
auto mask = ((0x1ull << num_to_read) - 0x1ull);
return convert_int_to_avx2_mask(mask);
}
static ymmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7,
int v8) {
return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8);
}
static opmask_t kxor_opmask(opmask_t x, opmask_t y) {
return _mm256_xor_si256(x, y);
}
static opmask_t ge(reg_t x, reg_t y) {
opmask_t equal = eq(x, y);
opmask_t greater = _mm256_cmpgt_epi32(x, y);
return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(equal),
_mm256_castsi256_ps(greater)));
}
static opmask_t gt(reg_t x, reg_t y) { return _mm256_cmpgt_epi32(x, y); }
static opmask_t eq(reg_t x, reg_t y) { return _mm256_cmpeq_epi32(x, y); }
template <int scale>
static reg_t mask_i64gather(reg_t src, opmask_t mask, __m256i index,
void const *base) {
return _mm256_mask_i32gather_epi32(src, base, index, mask, scale);
}
template <int scale>
static reg_t i64gather(__m256i index, void const *base) {
return _mm256_i32gather_epi32((int const *)base, index, scale);
}
static reg_t loadu(void const *mem) {
return _mm256_loadu_si256((reg_t const *)mem);
}
static reg_t max(reg_t x, reg_t y) { return _mm256_max_epi32(x, y); }
static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) {
return avx2_emu_mask_compressstoreu32<type_t>(mem, mask, x);
}
static reg_t maskz_loadu(opmask_t mask, void const *mem) {
return _mm256_maskload_epi32((const int *)mem, mask);
}
static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) {
reg_t dst = _mm256_maskload_epi32((type_t *)mem, mask);
return mask_mov(x, mask, dst);
}
static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) {
return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(x),
_mm256_castsi256_ps(y),
_mm256_castsi256_ps(mask)));
}
static void mask_storeu(void *mem, opmask_t mask, reg_t x) {
return _mm256_maskstore_epi32((type_t *)mem, mask, x);
}
static reg_t min(reg_t x, reg_t y) { return _mm256_min_epi32(x, y); }
static reg_t permutexvar(__m256i idx, reg_t ymm) {
return _mm256_permutevar8x32_epi32(ymm, idx);
// return avx2_emu_permutexvar_epi32(idx, ymm);
}
static reg_t permutevar(reg_t ymm, __m256i idx) {
return _mm256_permutevar8x32_epi32(ymm, idx);
}
static reg_t reverse(reg_t ymm) {
const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2);
return permutexvar(rev_index, ymm);
}
static type_t reducemax(reg_t v) {
return avx2_emu_reduce_max32<type_t>(v);
}
static type_t reducemin(reg_t v) {
return avx2_emu_reduce_min32<type_t>(v);
}
static reg_t set1(type_t v) { return _mm256_set1_epi32(v); }
template <uint8_t mask>
static reg_t shuffle(reg_t ymm) {
return _mm256_shuffle_epi32(ymm, mask);
}
static void storeu(void *mem, reg_t x) {
_mm256_storeu_si256((__m256i *)mem, x);
}
static reg_t sort_vec(reg_t x) {
return sort_ymm_32bit<avx2_vector<type_t>>(x);
}
static reg_t cast_from(__m256i v) { return v; }
static __m256i cast_to(reg_t v) { return v; }
static int double_compressstore(type_t *left_addr, type_t *right_addr,
opmask_t k, reg_t reg) {
return avx2_double_compressstore32<type_t>(left_addr, right_addr, k,
reg);
}
};
template <>
struct avx2_vector<float> {
using type_t = float;
using reg_t = __m256;
using ymmi_t = __m256i;
using opmask_t = __m256i;
static const uint8_t numlanes = 8;
#ifdef XSS_MINIMAL_NETWORK_SORT
static constexpr int network_sort_threshold = numlanes;
#else
static constexpr int network_sort_threshold = 256;
#endif
static constexpr int partition_unroll_factor = 4;
using swizzle_ops = avx2_32bit_swizzle_ops;
static type_t type_max() { return X86_SIMD_SORT_INFINITYF; }
static type_t type_min() { return -X86_SIMD_SORT_INFINITYF; }
static reg_t zmm_max() { return _mm256_set1_ps(type_max()); }
static ymmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7,
int v8) {
return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8);
}
static reg_t maskz_loadu(opmask_t mask, void const *mem) {
return _mm256_maskload_ps((const float *)mem, mask);
}
static opmask_t ge(reg_t x, reg_t y) {
return _mm256_castps_si256(_mm256_cmp_ps(x, y, _CMP_GE_OQ));
}
static opmask_t gt(reg_t x, reg_t y) {
return _mm256_castps_si256(_mm256_cmp_ps(x, y, _CMP_GT_OQ));
}
static opmask_t eq(reg_t x, reg_t y) {
return _mm256_castps_si256(_mm256_cmp_ps(x, y, _CMP_EQ_OQ));
}
static opmask_t get_partial_loadmask(uint64_t num_to_read) {
auto mask = ((0x1ull << num_to_read) - 0x1ull);
return convert_int_to_avx2_mask(mask);
}
static int32_t convert_mask_to_int(opmask_t mask) {
return convert_avx2_mask_to_int(mask);
}
template <int type>
static opmask_t fpclass(reg_t x) {
if constexpr (type == (0x01 | 0x80)) {
return _mm256_castps_si256(_mm256_cmp_ps(x, x, _CMP_UNORD_Q));
} else {
static_assert(type == (0x01 | 0x80), "should not reach here");
}
}
template <int scale>
static reg_t mask_i64gather(reg_t src, opmask_t mask, __m256i index,
void const *base) {
return _mm256_mask_i32gather_ps(src, base, index,
_mm256_castsi256_ps(mask), scale);
;
}
template <int scale>
static reg_t i64gather(__m256i index, void const *base) {
return _mm256_i32gather_ps((float *)base, index, scale);
}
static reg_t loadu(void const *mem) {
return _mm256_loadu_ps((float const *)mem);
}
static reg_t max(reg_t x, reg_t y) { return _mm256_max_ps(x, y); }
static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) {
return avx2_emu_mask_compressstoreu32<type_t>(mem, mask, x);
}
static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) {
reg_t dst = _mm256_maskload_ps((type_t *)mem, mask);
return mask_mov(x, mask, dst);
}
static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) {
return _mm256_blendv_ps(x, y, _mm256_castsi256_ps(mask));
}
static void mask_storeu(void *mem, opmask_t mask, reg_t x) {
return _mm256_maskstore_ps((type_t *)mem, mask, x);
}
static reg_t min(reg_t x, reg_t y) { return _mm256_min_ps(x, y); }
static reg_t permutexvar(__m256i idx, reg_t ymm) {
return _mm256_permutevar8x32_ps(ymm, idx);
}
static reg_t permutevar(reg_t ymm, __m256i idx) {
return _mm256_permutevar8x32_ps(ymm, idx);
}
static reg_t reverse(reg_t ymm) {
const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2);
return permutexvar(rev_index, ymm);
}
static type_t reducemax(reg_t v) {
return avx2_emu_reduce_max32<type_t>(v);
}
static type_t reducemin(reg_t v) {
return avx2_emu_reduce_min32<type_t>(v);
}
static reg_t set1(type_t v) { return _mm256_set1_ps(v); }
template <uint8_t mask>
static reg_t shuffle(reg_t ymm) {
return _mm256_castsi256_ps(
_mm256_shuffle_epi32(_mm256_castps_si256(ymm), mask));
}
static void storeu(void *mem, reg_t x) {
_mm256_storeu_ps((float *)mem, x);
}
static reg_t sort_vec(reg_t x) {
return sort_ymm_32bit<avx2_vector<type_t>>(x);
}
static reg_t cast_from(__m256i v) { return _mm256_castsi256_ps(v); }
static __m256i cast_to(reg_t v) { return _mm256_castps_si256(v); }
static int double_compressstore(type_t *left_addr, type_t *right_addr,
opmask_t k, reg_t reg) {
return avx2_double_compressstore32<type_t>(left_addr, right_addr, k,
reg);
}
};
struct avx2_32bit_swizzle_ops {
template <typename vtype, int scale>
X86_SIMD_SORT_INLINE typename vtype::reg_t swap_n(
typename vtype::reg_t reg) {
__m256i v = vtype::cast_to(reg);
if constexpr (scale == 2) {
__m256 vf = _mm256_castsi256_ps(v);
vf = _mm256_permute_ps(vf, 0b10110001);
v = _mm256_castps_si256(vf);
} else if constexpr (scale == 4) {
__m256 vf = _mm256_castsi256_ps(v);
vf = _mm256_permute_ps(vf, 0b01001110);
v = _mm256_castps_si256(vf);
} else if constexpr (scale == 8) {
v = _mm256_permute2x128_si256(v, v, 0b00000001);
} else {
static_assert(scale == -1, "should not be reached");
}
return vtype::cast_from(v);
}
template <typename vtype, int scale>
X86_SIMD_SORT_INLINE typename vtype::reg_t reverse_n(
typename vtype::reg_t reg) {
__m256i v = vtype::cast_to(reg);
if constexpr (scale == 2) {
return swap_n<vtype, 2>(reg);
} else if constexpr (scale == 4) {
constexpr uint64_t mask = 0b00011011;
__m256 vf = _mm256_castsi256_ps(v);
vf = _mm256_permute_ps(vf, mask);
v = _mm256_castps_si256(vf);
} else if constexpr (scale == 8) {
return vtype::reverse(reg);
} else {
static_assert(scale == -1, "should not be reached");
}
return vtype::cast_from(v);
}
template <typename vtype, int scale>
X86_SIMD_SORT_INLINE typename vtype::reg_t merge_n(
typename vtype::reg_t reg, typename vtype::reg_t other) {
__m256i v1 = vtype::cast_to(reg);
__m256i v2 = vtype::cast_to(other);
if constexpr (scale == 2) {
v1 = _mm256_blend_epi32(v1, v2, 0b01010101);
} else if constexpr (scale == 4) {
v1 = _mm256_blend_epi32(v1, v2, 0b00110011);
} else if constexpr (scale == 8) {
v1 = _mm256_blend_epi32(v1, v2, 0b00001111);
} else {
static_assert(scale == -1, "should not be reached");
}
return vtype::cast_from(v1);
}
};
#endif // AVX2_QSORT_32BIT


@@ -0,0 +1,183 @@
/*
* Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
* Copyright (c) 2021 Serge Sans Paille. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort)
#ifndef AVX2_EMU_FUNCS
#define AVX2_EMU_FUNCS
#include <array>
#include <utility>
#include "xss-common-qsort.h"
constexpr auto avx2_mask_helper_lut32 = [] {
std::array<std::array<int32_t, 8>, 256> lut{};
for (int64_t i = 0; i <= 0xFF; i++) {
std::array<int32_t, 8> entry{};
for (int j = 0; j < 8; j++) {
if (((i >> j) & 1) == 1)
entry[j] = 0xFFFFFFFF;
else
entry[j] = 0;
}
lut[i] = entry;
}
return lut;
}();
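Editor's note: avx2_mask_helper_lut32 is a 256-entry table that turns an 8-bit scalar mask into the per-lane all-ones/zero form that AVX2 blends and mask loads expect, since AVX2 has no dedicated opmask registers. A plain-C++ check of what one entry encodes:

#include <array>
#include <cstdint>
#include <cstdio>

// Same rule as the lambda above: bit j of the scalar mask selects whether
// lane j of the emulated vector mask is 0xFFFFFFFF or 0.
static std::array<uint32_t, 8> lanes_for(uint32_t m) {
  std::array<uint32_t, 8> entry{};
  for (int j = 0; j < 8; j++) {
    entry[j] = ((m >> j) & 1) ? 0xFFFFFFFFu : 0u;
  }
  return entry;
}

int main() {
  auto entry = lanes_for(0b10100101u);
  for (int j = 0; j < 8; j++) {
    std::printf("lane %d: %08x\n", j, entry[j]);
  }
  return 0;
}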
constexpr auto avx2_compressstore_lut32_gen = [] {
std::array<std::array<std::array<int32_t, 8>, 256>, 2> lutPair{};
auto &permLut = lutPair[0];
auto &leftLut = lutPair[1];
for (int64_t i = 0; i <= 0xFF; i++) {
std::array<int32_t, 8> indices{};
std::array<int32_t, 8> leftEntry = {0, 0, 0, 0, 0, 0, 0, 0};
int right = 7;
int left = 0;
for (int j = 0; j < 8; j++) {
bool ge = (i >> j) & 1;
if (ge) {
indices[right] = j;
right--;
} else {
indices[left] = j;
leftEntry[left] = 0xFFFFFFFF;
left++;
}
}
permLut[i] = indices;
leftLut[i] = leftEntry;
}
return lutPair;
}();
constexpr auto avx2_compressstore_lut32_perm = avx2_compressstore_lut32_gen[0];
constexpr auto avx2_compressstore_lut32_left = avx2_compressstore_lut32_gen[1];
X86_SIMD_SORT_INLINE
__m256i convert_int_to_avx2_mask(int32_t m) {
return _mm256_loadu_si256(
(const __m256i *)avx2_mask_helper_lut32[m].data());
}
X86_SIMD_SORT_INLINE
int32_t convert_avx2_mask_to_int(__m256i m) {
return _mm256_movemask_ps(_mm256_castsi256_ps(m));
}
// Emulators for intrinsics missing from AVX2 compared to AVX512
template <typename T>
T avx2_emu_reduce_max32(typename avx2_vector<T>::reg_t x) {
using vtype = avx2_vector<T>;
using reg_t = typename vtype::reg_t;
reg_t inter1 =
vtype::max(x, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(x));
reg_t inter2 = vtype::max(
inter1, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(inter1));
T arr[vtype::numlanes];
vtype::storeu(arr, inter2);
return std::max(arr[0], arr[7]);
}
template <typename T>
T avx2_emu_reduce_min32(typename avx2_vector<T>::reg_t x) {
using vtype = avx2_vector<T>;
using reg_t = typename vtype::reg_t;
reg_t inter1 =
vtype::min(x, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(x));
reg_t inter2 = vtype::min(
inter1, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(inter1));
T arr[vtype::numlanes];
vtype::storeu(arr, inter2);
return std::min(arr[0], arr[7]);
}
template <typename T>
void avx2_emu_mask_compressstoreu32(void *base_addr,
typename avx2_vector<T>::opmask_t k,
typename avx2_vector<T>::reg_t reg) {
using vtype = avx2_vector<T>;
T *leftStore = (T *)base_addr;
int32_t shortMask = convert_avx2_mask_to_int(k);
const __m256i &perm = _mm256_loadu_si256(
(const __m256i *)avx2_compressstore_lut32_perm[shortMask].data());
const __m256i &left = _mm256_loadu_si256(
(const __m256i *)avx2_compressstore_lut32_left[shortMask].data());
typename vtype::reg_t temp = vtype::permutevar(reg, perm);
vtype::mask_storeu(leftStore, left, temp);
}
template <typename T>
int avx2_double_compressstore32(void *left_addr, void *right_addr,
typename avx2_vector<T>::opmask_t k,
typename avx2_vector<T>::reg_t reg) {
using vtype = avx2_vector<T>;
T *leftStore = (T *)left_addr;
T *rightStore = (T *)right_addr;
int32_t shortMask = convert_avx2_mask_to_int(k);
const __m256i &perm = _mm256_loadu_si256(
(const __m256i *)avx2_compressstore_lut32_perm[shortMask].data());
typename vtype::reg_t temp = vtype::permutevar(reg, perm);
vtype::storeu(leftStore, temp);
vtype::storeu(rightStore, temp);
return _mm_popcnt_u32(shortMask);
}
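Editor's note: avx2_double_compressstore32() partitions the 8 lanes by the comparison mask in a single permute, stores the same permuted vector to both destinations, and returns the popcount of the mask; the caller is expected to position the left/right pointers so that only the relevant part of each store survives. A simplified scalar reference of that contract (glossing over the exact lane ordering the permutation LUT produces):

#include <cstdint>
#include <cstdio>

// Lanes whose mask bit is 0 are packed towards the left output, lanes whose
// bit is 1 towards the right output; the return value is how many went right
// (the popcount of the mask), matching the AVX2 version above.
static int double_compressstore_scalar(int32_t* left, int32_t* right,
                                       uint32_t mask, const int32_t (&reg)[8]) {
  int l = 0, r = 0;
  for (int j = 0; j < 8; j++) {
    if ((mask >> j) & 1) {
      right[r++] = reg[j];
    } else {
      left[l++] = reg[j];
    }
  }
  return r;
}

int main() {
  int32_t v[8] = {10, 11, 12, 13, 14, 15, 16, 17};
  int32_t left[8] = {0}, right[8] = {0};
  int n = double_compressstore_scalar(left, right, 0b10100101u, v);
  std::printf("lanes moved right: %d\n", n);   // 4
  return 0;
}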
template <typename T>
typename avx2_vector<T>::reg_t avx2_emu_max(typename avx2_vector<T>::reg_t x,
typename avx2_vector<T>::reg_t y) {
using vtype = avx2_vector<T>;
typename vtype::opmask_t nlt = vtype::gt(x, y);
return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(y),
_mm256_castsi256_pd(x),
_mm256_castsi256_pd(nlt)));
}
template <typename T>
typename avx2_vector<T>::reg_t avx2_emu_min(typename avx2_vector<T>::reg_t x,
typename avx2_vector<T>::reg_t y) {
using vtype = avx2_vector<T>;
typename vtype::opmask_t nlt = vtype::gt(x, y);
return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(x),
_mm256_castsi256_pd(y),
_mm256_castsi256_pd(nlt)));
}
#endif


@@ -0,0 +1,66 @@
/*
* Copyright (c) 2023 Intel Corporation. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "simdsort-support.hpp"
#ifdef __SIMDSORT_SUPPORTED_LINUX
#pragma GCC target("avx2")
#include "avx2-32bit-qsort.hpp"
#include "classfile_constants.h"
#define DLL_PUBLIC __attribute__((visibility("default")))
#define INSERTION_SORT_THRESHOLD_32BIT 16
extern "C" {
DLL_PUBLIC void avx2_sort(void *array, int elem_type, int32_t from_index, int32_t to_index) {
switch(elem_type) {
case JVM_T_INT:
avx2_fast_sort((int32_t*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT);
break;
case JVM_T_FLOAT:
avx2_fast_sort((float*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT);
break;
default:
assert(false, "Unexpected type");
}
}
DLL_PUBLIC void avx2_partition(void *array, int elem_type, int32_t from_index, int32_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) {
switch(elem_type) {
case JVM_T_INT:
avx2_fast_partition((int32_t*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
break;
case JVM_T_FLOAT:
avx2_fast_partition((float*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
break;
default:
assert(false, "Unexpected type");
}
}
}
#endif


@@ -28,7 +28,7 @@
#ifndef AVX512_QSORT_32BIT
#define AVX512_QSORT_32BIT
#include "avx512-common-qsort.h"
#include "xss-common-qsort.h"
/*
* Constants used in sorting 16 elements in a ZMM registers. Based on Bitonic
@@ -43,130 +43,204 @@
#define NETWORK_32BIT_6 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4
#define NETWORK_32BIT_7 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
template <typename vtype, typename reg_t>
X86_SIMD_SORT_INLINE reg_t sort_zmm_32bit(reg_t zmm);
struct avx512_32bit_swizzle_ops;
template <>
struct zmm_vector<int32_t> {
using type_t = int32_t;
using zmm_t = __m512i;
using ymm_t = __m256i;
using reg_t = __m512i;
using halfreg_t = __m256i;
using opmask_t = __mmask16;
static const uint8_t numlanes = 16;
#ifdef XSS_MINIMAL_NETWORK_SORT
static constexpr int network_sort_threshold = numlanes;
#else
static constexpr int network_sort_threshold = 512;
#endif
static constexpr int partition_unroll_factor = 8;
using swizzle_ops = avx512_32bit_swizzle_ops;
static type_t type_max() { return X86_SIMD_SORT_MAX_INT32; }
static type_t type_min() { return X86_SIMD_SORT_MIN_INT32; }
static zmm_t zmm_max() { return _mm512_set1_epi32(type_max()); }
static reg_t zmm_max() { return _mm512_set1_epi32(type_max()); }
static opmask_t knot_opmask(opmask_t x) { return _mm512_knot(x); }
static opmask_t ge(zmm_t x, zmm_t y) {
static opmask_t ge(reg_t x, reg_t y) {
return _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_NLT);
}
static opmask_t gt(zmm_t x, zmm_t y) {
static opmask_t gt(reg_t x, reg_t y) {
return _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_GT);
}
static opmask_t get_partial_loadmask(uint64_t num_to_read) {
return ((0x1ull << num_to_read) - 0x1ull);
}
template <int scale>
static ymm_t i64gather(__m512i index, void const *base) {
static halfreg_t i64gather(__m512i index, void const *base) {
return _mm512_i64gather_epi32(index, base, scale);
}
static zmm_t merge(ymm_t y1, ymm_t y2) {
zmm_t z1 = _mm512_castsi256_si512(y1);
static reg_t merge(halfreg_t y1, halfreg_t y2) {
reg_t z1 = _mm512_castsi256_si512(y1);
return _mm512_inserti32x8(z1, y2, 1);
}
static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); }
static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) {
static reg_t loadu(void const *mem) { return _mm512_loadu_si512(mem); }
static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) {
return _mm512_mask_compressstoreu_epi32(mem, mask, x);
}
static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) {
static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) {
return _mm512_mask_loadu_epi32(x, mask, mem);
}
static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) {
static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) {
return _mm512_mask_mov_epi32(x, mask, y);
}
static void mask_storeu(void *mem, opmask_t mask, zmm_t x) {
static void mask_storeu(void *mem, opmask_t mask, reg_t x) {
return _mm512_mask_storeu_epi32(mem, mask, x);
}
static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epi32(x, y); }
static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epi32(x, y); }
static zmm_t permutexvar(__m512i idx, zmm_t zmm) {
static reg_t min(reg_t x, reg_t y) { return _mm512_min_epi32(x, y); }
static reg_t max(reg_t x, reg_t y) { return _mm512_max_epi32(x, y); }
static reg_t permutexvar(__m512i idx, reg_t zmm) {
return _mm512_permutexvar_epi32(idx, zmm);
}
static type_t reducemax(zmm_t v) { return _mm512_reduce_max_epi32(v); }
static type_t reducemin(zmm_t v) { return _mm512_reduce_min_epi32(v); }
static zmm_t set1(type_t v) { return _mm512_set1_epi32(v); }
static type_t reducemax(reg_t v) { return _mm512_reduce_max_epi32(v); }
static type_t reducemin(reg_t v) { return _mm512_reduce_min_epi32(v); }
static reg_t set1(type_t v) { return _mm512_set1_epi32(v); }
template <uint8_t mask>
static zmm_t shuffle(zmm_t zmm) {
static reg_t shuffle(reg_t zmm) {
return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask);
}
static void storeu(void *mem, zmm_t x) {
static void storeu(void *mem, reg_t x) {
return _mm512_storeu_si512(mem, x);
}
static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_epi32(x, y); }
static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_epi32(x, y); }
static halfreg_t max(halfreg_t x, halfreg_t y) {
return _mm256_max_epi32(x, y);
}
static halfreg_t min(halfreg_t x, halfreg_t y) {
return _mm256_min_epi32(x, y);
}
static reg_t reverse(reg_t zmm) {
const auto rev_index = _mm512_set_epi32(NETWORK_32BIT_5);
return permutexvar(rev_index, zmm);
}
static reg_t sort_vec(reg_t x) {
return sort_zmm_32bit<zmm_vector<type_t>>(x);
}
static reg_t cast_from(__m512i v) { return v; }
static __m512i cast_to(reg_t v) { return v; }
static int double_compressstore(type_t *left_addr, type_t *right_addr,
opmask_t k, reg_t reg) {
return avx512_double_compressstore<zmm_vector<type_t>>(
left_addr, right_addr, k, reg);
}
};
template <>
struct zmm_vector<float> {
using type_t = float;
using zmm_t = __m512;
using ymm_t = __m256;
using reg_t = __m512;
using halfreg_t = __m256;
using opmask_t = __mmask16;
static const uint8_t numlanes = 16;
#ifdef XSS_MINIMAL_NETWORK_SORT
static constexpr int network_sort_threshold = numlanes;
#else
static constexpr int network_sort_threshold = 512;
#endif
static constexpr int partition_unroll_factor = 8;
using swizzle_ops = avx512_32bit_swizzle_ops;
static type_t type_max() { return X86_SIMD_SORT_INFINITYF; }
static type_t type_min() { return -X86_SIMD_SORT_INFINITYF; }
static zmm_t zmm_max() { return _mm512_set1_ps(type_max()); }
static reg_t zmm_max() { return _mm512_set1_ps(type_max()); }
static opmask_t knot_opmask(opmask_t x) { return _mm512_knot(x); }
static opmask_t ge(zmm_t x, zmm_t y) {
static opmask_t ge(reg_t x, reg_t y) {
return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ);
}
static opmask_t gt(zmm_t x, zmm_t y) {
static opmask_t gt(reg_t x, reg_t y) {
return _mm512_cmp_ps_mask(x, y, _CMP_GT_OQ);
}
static opmask_t get_partial_loadmask(uint64_t num_to_read) {
return ((0x1ull << num_to_read) - 0x1ull);
}
static int32_t convert_mask_to_int(opmask_t mask) { return mask; }
template <int type>
static opmask_t fpclass(reg_t x) {
return _mm512_fpclass_ps_mask(x, type);
}
template <int scale>
static ymm_t i64gather(__m512i index, void const *base) {
static halfreg_t i64gather(__m512i index, void const *base) {
return _mm512_i64gather_ps(index, base, scale);
}
static zmm_t merge(ymm_t y1, ymm_t y2) {
zmm_t z1 = _mm512_castsi512_ps(
static reg_t merge(halfreg_t y1, halfreg_t y2) {
reg_t z1 = _mm512_castsi512_ps(
_mm512_castsi256_si512(_mm256_castps_si256(y1)));
return _mm512_insertf32x8(z1, y2, 1);
}
static zmm_t loadu(void const *mem) { return _mm512_loadu_ps(mem); }
static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_ps(x, y); }
static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) {
static reg_t loadu(void const *mem) { return _mm512_loadu_ps(mem); }
static reg_t max(reg_t x, reg_t y) { return _mm512_max_ps(x, y); }
static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) {
return _mm512_mask_compressstoreu_ps(mem, mask, x);
}
static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) {
static reg_t maskz_loadu(opmask_t mask, void const *mem) {
return _mm512_maskz_loadu_ps(mask, mem);
}
static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) {
return _mm512_mask_loadu_ps(x, mask, mem);
}
static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) {
static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) {
return _mm512_mask_mov_ps(x, mask, y);
}
static void mask_storeu(void *mem, opmask_t mask, zmm_t x) {
static void mask_storeu(void *mem, opmask_t mask, reg_t x) {
return _mm512_mask_storeu_ps(mem, mask, x);
}
static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_ps(x, y); }
static zmm_t permutexvar(__m512i idx, zmm_t zmm) {
static reg_t min(reg_t x, reg_t y) { return _mm512_min_ps(x, y); }
static reg_t permutexvar(__m512i idx, reg_t zmm) {
return _mm512_permutexvar_ps(idx, zmm);
}
static type_t reducemax(zmm_t v) { return _mm512_reduce_max_ps(v); }
static type_t reducemin(zmm_t v) { return _mm512_reduce_min_ps(v); }
static zmm_t set1(type_t v) { return _mm512_set1_ps(v); }
static type_t reducemax(reg_t v) { return _mm512_reduce_max_ps(v); }
static type_t reducemin(reg_t v) { return _mm512_reduce_min_ps(v); }
static reg_t set1(type_t v) { return _mm512_set1_ps(v); }
template <uint8_t mask>
static zmm_t shuffle(zmm_t zmm) {
static reg_t shuffle(reg_t zmm) {
return _mm512_shuffle_ps(zmm, zmm, (_MM_PERM_ENUM)mask);
}
static void storeu(void *mem, zmm_t x) { return _mm512_storeu_ps(mem, x); }
static void storeu(void *mem, reg_t x) { return _mm512_storeu_ps(mem, x); }
static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_ps(x, y); }
static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_ps(x, y); }
static halfreg_t max(halfreg_t x, halfreg_t y) {
return _mm256_max_ps(x, y);
}
static halfreg_t min(halfreg_t x, halfreg_t y) {
return _mm256_min_ps(x, y);
}
static reg_t reverse(reg_t zmm) {
const auto rev_index = _mm512_set_epi32(NETWORK_32BIT_5);
return permutexvar(rev_index, zmm);
}
static reg_t sort_vec(reg_t x) {
return sort_zmm_32bit<zmm_vector<type_t>>(x);
}
static reg_t cast_from(__m512i v) { return _mm512_castsi512_ps(v); }
static __m512i cast_to(reg_t v) { return _mm512_castps_si512(v); }
static int double_compressstore(type_t *left_addr, type_t *right_addr,
opmask_t k, reg_t reg) {
return avx512_double_compressstore<zmm_vector<type_t>>(
left_addr, right_addr, k, reg);
}
};
/*
* Assumes zmm is random and performs a full sorting network defined in
* https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
*/
template <typename vtype, typename zmm_t = typename vtype::zmm_t>
X86_SIMD_SORT_INLINE zmm_t sort_zmm_32bit(zmm_t zmm) {
template <typename vtype, typename reg_t = typename vtype::reg_t>
X86_SIMD_SORT_INLINE reg_t sort_zmm_32bit(reg_t zmm) {
zmm = cmp_merge<vtype>(
zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm), 0xAAAA);
zmm = cmp_merge<vtype>(
@@ -193,249 +267,71 @@ X86_SIMD_SORT_INLINE zmm_t sort_zmm_32bit(zmm_t zmm) {
return zmm;
}
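For readers new to the masked compare-exchange used throughout these networks, the following scalar sketch (not part of the patch) models one cmp_merge step on 16 int32 lanes; the `shuffled` argument and the 16-bit `mask` are stand-ins for the permuted copy and the literal masks such as 0xAAAA used above.
// Illustrative sketch only: a scalar model of one cmp_merge step.
// A set mask bit means the lane keeps the larger value, a clear bit the smaller one.
#include <algorithm>
#include <array>
#include <cstdint>

static std::array<int32_t, 16> cmp_merge_scalar(
        const std::array<int32_t, 16> &v,
        const std::array<int32_t, 16> &shuffled, uint16_t mask) {
    std::array<int32_t, 16> out{};
    for (int lane = 0; lane < 16; ++lane) {
        int32_t lo = std::min(v[lane], shuffled[lane]);
        int32_t hi = std::max(v[lane], shuffled[lane]);
        out[lane] = ((mask >> lane) & 1) ? hi : lo;   // 0 -> min, 1 -> max
    }
    return out;
}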
// Assumes zmm is bitonic and performs a recursive half cleaner
template <typename vtype, typename zmm_t = typename vtype::zmm_t>
X86_SIMD_SORT_INLINE zmm_t bitonic_merge_zmm_32bit(zmm_t zmm) {
// 1) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc ..
zmm = cmp_merge<vtype>(
zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_7), zmm),
0xFF00);
// 2) half_cleaner[8]: compare 1-5, 2-6, 3-7 etc ..
zmm = cmp_merge<vtype>(
zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_6), zmm),
0xF0F0);
// 3) half_cleaner[4]
zmm = cmp_merge<vtype>(
zmm, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(zmm), 0xCCCC);
// 3) half_cleaner[1]
zmm = cmp_merge<vtype>(
zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm), 0xAAAA);
return zmm;
}
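As a rough scalar analogue (an illustration, not code from this change), the first half-cleaner step above pairs lane i with lane i + 8 via NETWORK_32BIT_7 and mask 0xFF00, so the low half keeps the minima and the high half the maxima:
// Illustrative only: scalar model of the distance-8 half-cleaner step.
#include <algorithm>
#include <cstdint>

static void half_cleaner8_scalar(int32_t v[16]) {
    for (int i = 0; i < 8; ++i) {
        int32_t lo = std::min(v[i], v[i + 8]);
        int32_t hi = std::max(v[i], v[i + 8]);
        v[i] = lo;       // lanes 0-7: mask bits are 0 -> keep min
        v[i + 8] = hi;   // lanes 8-15: mask bits are 1 -> keep max
    }
}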
struct avx512_32bit_swizzle_ops {
template <typename vtype, int scale>
X86_SIMD_SORT_INLINE typename vtype::reg_t swap_n(
typename vtype::reg_t reg) {
__m512i v = vtype::cast_to(reg);
// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner
template <typename vtype, typename zmm_t = typename vtype::zmm_t>
X86_SIMD_SORT_INLINE void bitonic_merge_two_zmm_32bit(zmm_t *zmm1,
zmm_t *zmm2) {
// 1) First step of a merging network: coex of zmm1 and zmm2 reversed
*zmm2 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), *zmm2);
zmm_t zmm3 = vtype::min(*zmm1, *zmm2);
zmm_t zmm4 = vtype::max(*zmm1, *zmm2);
// 2) Recursive half cleaner for each
*zmm1 = bitonic_merge_zmm_32bit<vtype>(zmm3);
*zmm2 = bitonic_merge_zmm_32bit<vtype>(zmm4);
}
if constexpr (scale == 2) {
v = _mm512_shuffle_epi32(v, (_MM_PERM_ENUM)0b10110001);
} else if constexpr (scale == 4) {
v = _mm512_shuffle_epi32(v, (_MM_PERM_ENUM)0b01001110);
} else if constexpr (scale == 8) {
v = _mm512_shuffle_i64x2(v, v, 0b10110001);
} else if constexpr (scale == 16) {
v = _mm512_shuffle_i64x2(v, v, 0b01001110);
} else {
static_assert(scale == -1, "should not be reached");
}
// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive
// half cleaner
template <typename vtype, typename zmm_t = typename vtype::zmm_t>
X86_SIMD_SORT_INLINE void bitonic_merge_four_zmm_32bit(zmm_t *zmm) {
zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[2]);
zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[3]);
zmm_t zmm_t1 = vtype::min(zmm[0], zmm3r);
zmm_t zmm_t2 = vtype::min(zmm[1], zmm2r);
zmm_t zmm_t3 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5),
vtype::max(zmm[1], zmm2r));
zmm_t zmm_t4 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5),
vtype::max(zmm[0], zmm3r));
zmm_t zmm0 = vtype::min(zmm_t1, zmm_t2);
zmm_t zmm1 = vtype::max(zmm_t1, zmm_t2);
zmm_t zmm2 = vtype::min(zmm_t3, zmm_t4);
zmm_t zmm3 = vtype::max(zmm_t3, zmm_t4);
zmm[0] = bitonic_merge_zmm_32bit<vtype>(zmm0);
zmm[1] = bitonic_merge_zmm_32bit<vtype>(zmm1);
zmm[2] = bitonic_merge_zmm_32bit<vtype>(zmm2);
zmm[3] = bitonic_merge_zmm_32bit<vtype>(zmm3);
}
template <typename vtype, typename zmm_t = typename vtype::zmm_t>
X86_SIMD_SORT_INLINE void bitonic_merge_eight_zmm_32bit(zmm_t *zmm) {
zmm_t zmm4r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[4]);
zmm_t zmm5r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[5]);
zmm_t zmm6r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[6]);
zmm_t zmm7r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[7]);
zmm_t zmm_t1 = vtype::min(zmm[0], zmm7r);
zmm_t zmm_t2 = vtype::min(zmm[1], zmm6r);
zmm_t zmm_t3 = vtype::min(zmm[2], zmm5r);
zmm_t zmm_t4 = vtype::min(zmm[3], zmm4r);
zmm_t zmm_t5 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5),
vtype::max(zmm[3], zmm4r));
zmm_t zmm_t6 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5),
vtype::max(zmm[2], zmm5r));
zmm_t zmm_t7 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5),
vtype::max(zmm[1], zmm6r));
zmm_t zmm_t8 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5),
vtype::max(zmm[0], zmm7r));
COEX<vtype>(zmm_t1, zmm_t3);
COEX<vtype>(zmm_t2, zmm_t4);
COEX<vtype>(zmm_t5, zmm_t7);
COEX<vtype>(zmm_t6, zmm_t8);
COEX<vtype>(zmm_t1, zmm_t2);
COEX<vtype>(zmm_t3, zmm_t4);
COEX<vtype>(zmm_t5, zmm_t6);
COEX<vtype>(zmm_t7, zmm_t8);
zmm[0] = bitonic_merge_zmm_32bit<vtype>(zmm_t1);
zmm[1] = bitonic_merge_zmm_32bit<vtype>(zmm_t2);
zmm[2] = bitonic_merge_zmm_32bit<vtype>(zmm_t3);
zmm[3] = bitonic_merge_zmm_32bit<vtype>(zmm_t4);
zmm[4] = bitonic_merge_zmm_32bit<vtype>(zmm_t5);
zmm[5] = bitonic_merge_zmm_32bit<vtype>(zmm_t6);
zmm[6] = bitonic_merge_zmm_32bit<vtype>(zmm_t7);
zmm[7] = bitonic_merge_zmm_32bit<vtype>(zmm_t8);
}
template <typename vtype, typename type_t>
X86_SIMD_SORT_INLINE void sort_16_32bit(type_t *arr, int32_t N) {
typename vtype::opmask_t load_mask = (0x0001 << N) - 0x0001;
typename vtype::zmm_t zmm =
vtype::mask_loadu(vtype::zmm_max(), load_mask, arr);
vtype::mask_storeu(arr, load_mask, sort_zmm_32bit<vtype>(zmm));
}
template <typename vtype, typename type_t>
X86_SIMD_SORT_INLINE void sort_32_32bit(type_t *arr, int32_t N) {
if (N <= 16) {
sort_16_32bit<vtype>(arr, N);
return;
}
using zmm_t = typename vtype::zmm_t;
zmm_t zmm1 = vtype::loadu(arr);
typename vtype::opmask_t load_mask = (0x0001 << (N - 16)) - 0x0001;
zmm_t zmm2 = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr + 16);
zmm1 = sort_zmm_32bit<vtype>(zmm1);
zmm2 = sort_zmm_32bit<vtype>(zmm2);
bitonic_merge_two_zmm_32bit<vtype>(&zmm1, &zmm2);
vtype::storeu(arr, zmm1);
vtype::mask_storeu(arr + 16, load_mask, zmm2);
}
template <typename vtype, typename type_t>
X86_SIMD_SORT_INLINE void sort_64_32bit(type_t *arr, int32_t N) {
if (N <= 32) {
sort_32_32bit<vtype>(arr, N);
return;
}
using zmm_t = typename vtype::zmm_t;
using opmask_t = typename vtype::opmask_t;
zmm_t zmm[4];
zmm[0] = vtype::loadu(arr);
zmm[1] = vtype::loadu(arr + 16);
opmask_t load_mask1 = 0xFFFF, load_mask2 = 0xFFFF;
uint64_t combined_mask = (0x1ull << (N - 32)) - 0x1ull;
load_mask1 &= combined_mask & 0xFFFF;
load_mask2 &= (combined_mask >> 16) & 0xFFFF;
zmm[2] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 32);
zmm[3] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 48);
zmm[0] = sort_zmm_32bit<vtype>(zmm[0]);
zmm[1] = sort_zmm_32bit<vtype>(zmm[1]);
zmm[2] = sort_zmm_32bit<vtype>(zmm[2]);
zmm[3] = sort_zmm_32bit<vtype>(zmm[3]);
bitonic_merge_two_zmm_32bit<vtype>(&zmm[0], &zmm[1]);
bitonic_merge_two_zmm_32bit<vtype>(&zmm[2], &zmm[3]);
bitonic_merge_four_zmm_32bit<vtype>(zmm);
vtype::storeu(arr, zmm[0]);
vtype::storeu(arr + 16, zmm[1]);
vtype::mask_storeu(arr + 32, load_mask1, zmm[2]);
vtype::mask_storeu(arr + 48, load_mask2, zmm[3]);
}
template <typename vtype, typename type_t>
X86_SIMD_SORT_INLINE void sort_128_32bit(type_t *arr, int32_t N) {
if (N <= 64) {
sort_64_32bit<vtype>(arr, N);
return;
}
using zmm_t = typename vtype::zmm_t;
using opmask_t = typename vtype::opmask_t;
zmm_t zmm[8];
zmm[0] = vtype::loadu(arr);
zmm[1] = vtype::loadu(arr + 16);
zmm[2] = vtype::loadu(arr + 32);
zmm[3] = vtype::loadu(arr + 48);
zmm[0] = sort_zmm_32bit<vtype>(zmm[0]);
zmm[1] = sort_zmm_32bit<vtype>(zmm[1]);
zmm[2] = sort_zmm_32bit<vtype>(zmm[2]);
zmm[3] = sort_zmm_32bit<vtype>(zmm[3]);
opmask_t load_mask1 = 0xFFFF, load_mask2 = 0xFFFF;
opmask_t load_mask3 = 0xFFFF, load_mask4 = 0xFFFF;
if (N != 128) {
uint64_t combined_mask = (0x1ull << (N - 64)) - 0x1ull;
load_mask1 &= combined_mask & 0xFFFF;
load_mask2 &= (combined_mask >> 16) & 0xFFFF;
load_mask3 &= (combined_mask >> 32) & 0xFFFF;
load_mask4 &= (combined_mask >> 48) & 0xFFFF;
}
zmm[4] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 64);
zmm[5] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 80);
zmm[6] = vtype::mask_loadu(vtype::zmm_max(), load_mask3, arr + 96);
zmm[7] = vtype::mask_loadu(vtype::zmm_max(), load_mask4, arr + 112);
zmm[4] = sort_zmm_32bit<vtype>(zmm[4]);
zmm[5] = sort_zmm_32bit<vtype>(zmm[5]);
zmm[6] = sort_zmm_32bit<vtype>(zmm[6]);
zmm[7] = sort_zmm_32bit<vtype>(zmm[7]);
bitonic_merge_two_zmm_32bit<vtype>(&zmm[0], &zmm[1]);
bitonic_merge_two_zmm_32bit<vtype>(&zmm[2], &zmm[3]);
bitonic_merge_two_zmm_32bit<vtype>(&zmm[4], &zmm[5]);
bitonic_merge_two_zmm_32bit<vtype>(&zmm[6], &zmm[7]);
bitonic_merge_four_zmm_32bit<vtype>(zmm);
bitonic_merge_four_zmm_32bit<vtype>(zmm + 4);
bitonic_merge_eight_zmm_32bit<vtype>(zmm);
vtype::storeu(arr, zmm[0]);
vtype::storeu(arr + 16, zmm[1]);
vtype::storeu(arr + 32, zmm[2]);
vtype::storeu(arr + 48, zmm[3]);
vtype::mask_storeu(arr + 64, load_mask1, zmm[4]);
vtype::mask_storeu(arr + 80, load_mask2, zmm[5]);
vtype::mask_storeu(arr + 96, load_mask3, zmm[6]);
vtype::mask_storeu(arr + 112, load_mask4, zmm[7]);
}
template <typename vtype, typename type_t>
static void qsort_32bit_(type_t *arr, int64_t left, int64_t right,
int64_t max_iters) {
/*
 * Resort to std::sort if quicksort isn't making any progress
*/
if (max_iters <= 0) {
std::sort(arr + left, arr + right + 1);
return;
}
/*
 * Base case: use bitonic sorting networks to sort arrays of <= 128 elements
*/
if (right + 1 - left <= 128) {
sort_128_32bit<vtype>(arr + left, (int32_t)(right + 1 - left));
return;
return vtype::cast_from(v);
}
type_t pivot = get_pivot_scalar<type_t>(arr, left, right);
type_t smallest = vtype::type_max();
type_t biggest = vtype::type_min();
int64_t pivot_index = partition_avx512_unrolled<vtype, 2>(
arr, left, right + 1, pivot, &smallest, &biggest, false);
if (pivot != smallest)
qsort_32bit_<vtype>(arr, left, pivot_index - 1, max_iters - 1);
if (pivot != biggest)
qsort_32bit_<vtype>(arr, pivot_index, right, max_iters - 1);
}
template <typename vtype, int scale>
X86_SIMD_SORT_INLINE typename vtype::reg_t reverse_n(
typename vtype::reg_t reg) {
__m512i v = vtype::cast_to(reg);
template <>
void inline avx512_qsort<int32_t>(int32_t *arr, int64_t fromIndex, int64_t toIndex) {
int64_t arrsize = toIndex - fromIndex;
if (arrsize > 1) {
qsort_32bit_<zmm_vector<int32_t>, int32_t>(arr, fromIndex, toIndex - 1,
2 * (int64_t)log2(arrsize));
}
}
if constexpr (scale == 2) {
return swap_n<vtype, 2>(reg);
} else if constexpr (scale == 4) {
__m512i mask = _mm512_set_epi32(12, 13, 14, 15, 8, 9, 10, 11, 4, 5,
6, 7, 0, 1, 2, 3);
v = _mm512_permutexvar_epi32(mask, v);
} else if constexpr (scale == 8) {
__m512i mask = _mm512_set_epi32(8, 9, 10, 11, 12, 13, 14, 15, 0, 1,
2, 3, 4, 5, 6, 7);
v = _mm512_permutexvar_epi32(mask, v);
} else if constexpr (scale == 16) {
return vtype::reverse(reg);
} else {
static_assert(scale == -1, "should not be reached");
}
template <>
void inline avx512_qsort<float>(float *arr, int64_t fromIndex, int64_t toIndex) {
int64_t arrsize = toIndex - fromIndex;
if (arrsize > 1) {
qsort_32bit_<zmm_vector<float>, float>(arr, fromIndex, toIndex - 1,
2 * (int64_t)log2(arrsize));
return vtype::cast_from(v);
}
}
template <typename vtype, int scale>
X86_SIMD_SORT_INLINE typename vtype::reg_t merge_n(
typename vtype::reg_t reg, typename vtype::reg_t other) {
__m512i v1 = vtype::cast_to(reg);
__m512i v2 = vtype::cast_to(other);
if constexpr (scale == 2) {
v1 = _mm512_mask_blend_epi32(0b0101010101010101, v1, v2);
} else if constexpr (scale == 4) {
v1 = _mm512_mask_blend_epi32(0b0011001100110011, v1, v2);
} else if constexpr (scale == 8) {
v1 = _mm512_mask_blend_epi32(0b0000111100001111, v1, v2);
} else if constexpr (scale == 16) {
v1 = _mm512_mask_blend_epi32(0b0000000011111111, v1, v2);
} else {
static_assert(scale == -1, "should not be reached");
}
return vtype::cast_from(v1);
}
};
#endif // AVX512_QSORT_32BIT
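A minimal usage sketch for the avx512_qsort<int32_t> specialization shown above (illustrative only; it assumes an AVX-512 capable CPU and that this header is included):
#include <cstdint>
#include <vector>

// Sort the sub-range [from, to) of an int32_t vector with the specialization above.
void sort_range(std::vector<int32_t> &v, int64_t from, int64_t to) {
    if (to - from > 1) {
        avx512_qsort<int32_t>(v.data(), from, to);
    }
}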


@@ -1,212 +0,0 @@
/*
* Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort)
#ifndef AVX512_64BIT_COMMON
#define AVX512_64BIT_COMMON
#include "avx512-common-qsort.h"
/*
 * Constants used in sorting 8 elements in a ZMM register. Based on the
 * Bitonic sorting network (see
* https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg)
*/
// ZMM 7, 6, 5, 4, 3, 2, 1, 0
#define NETWORK_64BIT_1 4, 5, 6, 7, 0, 1, 2, 3
#define NETWORK_64BIT_2 0, 1, 2, 3, 4, 5, 6, 7
#define NETWORK_64BIT_3 5, 4, 7, 6, 1, 0, 3, 2
#define NETWORK_64BIT_4 3, 2, 1, 0, 7, 6, 5, 4
template <>
struct zmm_vector<int64_t> {
using type_t = int64_t;
using zmm_t = __m512i;
using zmmi_t = __m512i;
using ymm_t = __m512i;
using opmask_t = __mmask8;
static const uint8_t numlanes = 8;
static type_t type_max() { return X86_SIMD_SORT_MAX_INT64; }
static type_t type_min() { return X86_SIMD_SORT_MIN_INT64; }
static zmm_t zmm_max() {
return _mm512_set1_epi64(type_max());
} // TODO: this should broadcast bits as is?
static zmmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7,
int v8) {
return _mm512_set_epi64(v1, v2, v3, v4, v5, v6, v7, v8);
}
static opmask_t kxor_opmask(opmask_t x, opmask_t y) {
return _kxor_mask8(x, y);
}
static opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); }
static opmask_t le(zmm_t x, zmm_t y) {
return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_LE);
}
static opmask_t ge(zmm_t x, zmm_t y) {
return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_NLT);
}
static opmask_t gt(zmm_t x, zmm_t y) {
return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_GT);
}
static opmask_t eq(zmm_t x, zmm_t y) {
return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_EQ);
}
template <int scale>
static zmm_t mask_i64gather(zmm_t src, opmask_t mask, __m512i index,
void const *base) {
return _mm512_mask_i64gather_epi64(src, mask, index, base, scale);
}
template <int scale>
static zmm_t i64gather(__m512i index, void const *base) {
return _mm512_i64gather_epi64(index, base, scale);
}
static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); }
static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epi64(x, y); }
static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) {
return _mm512_mask_compressstoreu_epi64(mem, mask, x);
}
static zmm_t maskz_loadu(opmask_t mask, void const *mem) {
return _mm512_maskz_loadu_epi64(mask, mem);
}
static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) {
return _mm512_mask_loadu_epi64(x, mask, mem);
}
static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) {
return _mm512_mask_mov_epi64(x, mask, y);
}
static void mask_storeu(void *mem, opmask_t mask, zmm_t x) {
return _mm512_mask_storeu_epi64(mem, mask, x);
}
static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epi64(x, y); }
static zmm_t permutexvar(__m512i idx, zmm_t zmm) {
return _mm512_permutexvar_epi64(idx, zmm);
}
static type_t reducemax(zmm_t v) { return _mm512_reduce_max_epi64(v); }
static type_t reducemin(zmm_t v) { return _mm512_reduce_min_epi64(v); }
static zmm_t set1(type_t v) { return _mm512_set1_epi64(v); }
template <uint8_t mask>
static zmm_t shuffle(zmm_t zmm) {
__m512d temp = _mm512_castsi512_pd(zmm);
return _mm512_castpd_si512(
_mm512_shuffle_pd(temp, temp, (_MM_PERM_ENUM)mask));
}
static void storeu(void *mem, zmm_t x) { _mm512_storeu_si512(mem, x); }
};
template <>
struct zmm_vector<double> {
using type_t = double;
using zmm_t = __m512d;
using zmmi_t = __m512i;
using ymm_t = __m512d;
using opmask_t = __mmask8;
static const uint8_t numlanes = 8;
static type_t type_max() { return X86_SIMD_SORT_INFINITY; }
static type_t type_min() { return -X86_SIMD_SORT_INFINITY; }
static zmm_t zmm_max() { return _mm512_set1_pd(type_max()); }
static zmmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7,
int v8) {
return _mm512_set_epi64(v1, v2, v3, v4, v5, v6, v7, v8);
}
static zmm_t maskz_loadu(opmask_t mask, void const *mem) {
return _mm512_maskz_loadu_pd(mask, mem);
}
static opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); }
static opmask_t ge(zmm_t x, zmm_t y) {
return _mm512_cmp_pd_mask(x, y, _CMP_GE_OQ);
}
static opmask_t gt(zmm_t x, zmm_t y) {
return _mm512_cmp_pd_mask(x, y, _CMP_GT_OQ);
}
static opmask_t eq(zmm_t x, zmm_t y) {
return _mm512_cmp_pd_mask(x, y, _CMP_EQ_OQ);
}
template <int type>
static opmask_t fpclass(zmm_t x) {
return _mm512_fpclass_pd_mask(x, type);
}
template <int scale>
static zmm_t mask_i64gather(zmm_t src, opmask_t mask, __m512i index,
void const *base) {
return _mm512_mask_i64gather_pd(src, mask, index, base, scale);
}
template <int scale>
static zmm_t i64gather(__m512i index, void const *base) {
return _mm512_i64gather_pd(index, base, scale);
}
static zmm_t loadu(void const *mem) { return _mm512_loadu_pd(mem); }
static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_pd(x, y); }
static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) {
return _mm512_mask_compressstoreu_pd(mem, mask, x);
}
static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) {
return _mm512_mask_loadu_pd(x, mask, mem);
}
static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) {
return _mm512_mask_mov_pd(x, mask, y);
}
static void mask_storeu(void *mem, opmask_t mask, zmm_t x) {
return _mm512_mask_storeu_pd(mem, mask, x);
}
static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_pd(x, y); }
static zmm_t permutexvar(__m512i idx, zmm_t zmm) {
return _mm512_permutexvar_pd(idx, zmm);
}
static type_t reducemax(zmm_t v) { return _mm512_reduce_max_pd(v); }
static type_t reducemin(zmm_t v) { return _mm512_reduce_min_pd(v); }
static zmm_t set1(type_t v) { return _mm512_set1_pd(v); }
template <uint8_t mask>
static zmm_t shuffle(zmm_t zmm) {
return _mm512_shuffle_pd(zmm, zmm, (_MM_PERM_ENUM)mask);
}
static void storeu(void *mem, zmm_t x) { _mm512_storeu_pd(mem, x); }
};
/*
* Assumes zmm is random and performs a full sorting network defined in
* https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
*/
template <typename vtype, typename zmm_t = typename vtype::zmm_t>
X86_SIMD_SORT_INLINE zmm_t sort_zmm_64bit(zmm_t zmm) {
const typename vtype::zmmi_t rev_index = vtype::seti(NETWORK_64BIT_2);
zmm = cmp_merge<vtype>(
zmm, vtype::template shuffle<SHUFFLE_MASK(1, 1, 1, 1)>(zmm), 0xAA);
zmm = cmp_merge<vtype>(
zmm, vtype::permutexvar(vtype::seti(NETWORK_64BIT_1), zmm), 0xCC);
zmm = cmp_merge<vtype>(
zmm, vtype::template shuffle<SHUFFLE_MASK(1, 1, 1, 1)>(zmm), 0xAA);
zmm = cmp_merge<vtype>(zmm, vtype::permutexvar(rev_index, zmm), 0xF0);
zmm = cmp_merge<vtype>(
zmm, vtype::permutexvar(vtype::seti(NETWORK_64BIT_3), zmm), 0xCC);
zmm = cmp_merge<vtype>(
zmm, vtype::template shuffle<SHUFFLE_MASK(1, 1, 1, 1)>(zmm), 0xAA);
return zmm;
}
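For reference, the 8-lane network in this removed header could be exercised as in the sketch below (illustrative only, assuming AVX-512 support at runtime):
#include <cstdint>

// Sort exactly 8 int64_t values in place using the bitonic network above.
void sort8_int64(int64_t *data) {
    __m512i v = zmm_vector<int64_t>::loadu(data);
    v = sort_zmm_64bit<zmm_vector<int64_t>>(v);
    zmm_vector<int64_t>::storeu(data, v);
}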
#endif

File diff suppressed because it is too large


@@ -1,483 +0,0 @@
/*
* Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
* Copyright (c) 2021 Serge Sans Paille. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort)
#ifndef AVX512_QSORT_COMMON
#define AVX512_QSORT_COMMON
/*
 * Quicksort using AVX-512. The ideas and code are based on the two research
 * papers [1] and [2]. At a high level, the idea is to vectorize quicksort
 * partitioning using AVX-512 compressstore instructions. If the array size is
 * < 128, a bitonic sorting network implemented on 512-bit registers is used
 * instead. The precise network definitions depend on the dtype and are defined
 * in separate files: avx512-16bit-qsort.hpp, avx512-32bit-qsort.hpp and
 * avx512-64bit-qsort.hpp. Article [4] is a good resource on bitonic sorting
 * networks. The core implementations of the vectorized qsort functions
 * avx512_qsort<T>(T*, int64_t) are modified versions of the AVX2 quicksort
 * presented in paper [2] and the source code associated with that paper [3].
*
* [1] Fast and Robust Vectorized In-Place Sorting of Primitive Types
* https://drops.dagstuhl.de/opus/volltexte/2021/13775/
*
* [2] A Novel Hybrid Quicksort Algorithm Vectorized using AVX-512 on Intel
* Skylake https://arxiv.org/pdf/1704.08579.pdf
*
* [3] https://github.com/simd-sorting/fast-and-robust: SPDX-License-Identifier:
* MIT
*
* [4]
* http://mitp-content-server.mit.edu:18180/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030
*
*/
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>
/*
Workaround for the bug in GCC12 (that was fixed in GCC 12.3.1).
More details are available at: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
*/
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include <immintrin.h>
#pragma GCC diagnostic pop
#define X86_SIMD_SORT_INFINITY std::numeric_limits<double>::infinity()
#define X86_SIMD_SORT_INFINITYF std::numeric_limits<float>::infinity()
#define X86_SIMD_SORT_INFINITYH 0x7c00
#define X86_SIMD_SORT_NEGINFINITYH 0xfc00
#define X86_SIMD_SORT_MAX_UINT16 std::numeric_limits<uint16_t>::max()
#define X86_SIMD_SORT_MAX_INT16 std::numeric_limits<int16_t>::max()
#define X86_SIMD_SORT_MIN_INT16 std::numeric_limits<int16_t>::min()
#define X86_SIMD_SORT_MAX_UINT32 std::numeric_limits<uint32_t>::max()
#define X86_SIMD_SORT_MAX_INT32 std::numeric_limits<int32_t>::max()
#define X86_SIMD_SORT_MIN_INT32 std::numeric_limits<int32_t>::min()
#define X86_SIMD_SORT_MAX_UINT64 std::numeric_limits<uint64_t>::max()
#define X86_SIMD_SORT_MAX_INT64 std::numeric_limits<int64_t>::max()
#define X86_SIMD_SORT_MIN_INT64 std::numeric_limits<int64_t>::min()
#define ZMM_MAX_DOUBLE _mm512_set1_pd(X86_SIMD_SORT_INFINITY)
#define ZMM_MAX_UINT64 _mm512_set1_epi64(X86_SIMD_SORT_MAX_UINT64)
#define ZMM_MAX_INT64 _mm512_set1_epi64(X86_SIMD_SORT_MAX_INT64)
#define ZMM_MAX_FLOAT _mm512_set1_ps(X86_SIMD_SORT_INFINITYF)
#define ZMM_MAX_UINT _mm512_set1_epi32(X86_SIMD_SORT_MAX_UINT32)
#define ZMM_MAX_INT _mm512_set1_epi32(X86_SIMD_SORT_MAX_INT32)
#define ZMM_MAX_HALF _mm512_set1_epi16(X86_SIMD_SORT_INFINITYH)
#define YMM_MAX_HALF _mm256_set1_epi16(X86_SIMD_SORT_INFINITYH)
#define ZMM_MAX_UINT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_UINT16)
#define ZMM_MAX_INT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_INT16)
#define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d
#ifdef _MSC_VER
#define X86_SIMD_SORT_INLINE static inline
#define X86_SIMD_SORT_FINLINE static __forceinline
#elif defined(__CYGWIN__)
/*
* Force inline in cygwin to work around a compiler bug. See
* https://github.com/numpy/numpy/pull/22315#issuecomment-1267757584
*/
#define X86_SIMD_SORT_INLINE static __attribute__((always_inline))
#define X86_SIMD_SORT_FINLINE static __attribute__((always_inline))
#elif defined(__GNUC__)
#define X86_SIMD_SORT_INLINE static inline
#define X86_SIMD_SORT_FINLINE static __attribute__((always_inline))
#else
#define X86_SIMD_SORT_INLINE static
#define X86_SIMD_SORT_FINLINE static
#endif
#define LIKELY(x) __builtin_expect((x), 1)
#define UNLIKELY(x) __builtin_expect((x), 0)
template <typename type>
struct zmm_vector;
template <typename type>
struct ymm_vector;
// Regular quicksort routines:
template <typename T>
void avx512_qsort(T *arr, int64_t arrsize);
template <typename T>
void inline avx512_qsort(T *arr, int64_t from_index, int64_t to_index);
template <typename T>
bool is_a_nan(T elem) {
return std::isnan(elem);
}
template <typename T>
X86_SIMD_SORT_INLINE T get_pivot_scalar(T *arr, const int64_t left, const int64_t right) {
// median of 8 equally spaced elements
int64_t NUM_ELEMENTS = 8;
int64_t MID = NUM_ELEMENTS / 2;
int64_t size = (right - left) / NUM_ELEMENTS;
T temp[NUM_ELEMENTS];
for (int64_t i = 0; i < NUM_ELEMENTS; i++) temp[i] = arr[left + (i * size)];
std::sort(temp, temp + NUM_ELEMENTS);
return temp[MID];
}
template <typename vtype, typename T = typename vtype::type_t>
bool comparison_func_ge(const T &a, const T &b) {
return a < b;
}
template <typename vtype, typename T = typename vtype::type_t>
bool comparison_func_gt(const T &a, const T &b) {
return a <= b;
}
/*
* COEX == Compare and Exchange two registers by swapping min and max values
*/
template <typename vtype, typename mm_t>
static void COEX(mm_t &a, mm_t &b) {
mm_t temp = a;
a = vtype::min(a, b);
b = vtype::max(temp, b);
}
template <typename vtype, typename zmm_t = typename vtype::zmm_t,
typename opmask_t = typename vtype::opmask_t>
static inline zmm_t cmp_merge(zmm_t in1, zmm_t in2, opmask_t mask) {
zmm_t min = vtype::min(in2, in1);
zmm_t max = vtype::max(in2, in1);
return vtype::mask_mov(min, mask, max); // 0 -> min, 1 -> max
}
/*
 * Partition one ZMM register based on the pivot and return the
* number of elements that are greater than or equal to the pivot.
*/
template <typename vtype, typename type_t, typename zmm_t>
static inline int32_t partition_vec(type_t *arr, int64_t left, int64_t right,
const zmm_t curr_vec, const zmm_t pivot_vec,
zmm_t *smallest_vec, zmm_t *biggest_vec, bool use_gt) {
/* which elements are larger than or equal to the pivot */
typename vtype::opmask_t mask;
if (use_gt) mask = vtype::gt(curr_vec, pivot_vec);
else mask = vtype::ge(curr_vec, pivot_vec);
//mask = vtype::ge(curr_vec, pivot_vec);
int32_t amount_ge_pivot = _mm_popcnt_u32((int32_t)mask);
vtype::mask_compressstoreu(arr + left, vtype::knot_opmask(mask),
curr_vec);
vtype::mask_compressstoreu(arr + right - amount_ge_pivot, mask,
curr_vec);
*smallest_vec = vtype::min(curr_vec, *smallest_vec);
*biggest_vec = vtype::max(curr_vec, *biggest_vec);
return amount_ge_pivot;
}
/*
 * Partition an array based on the pivot and return the index of the
* first element that is greater than or equal to the pivot.
*/
template <typename vtype, typename type_t>
static inline int64_t partition_avx512(type_t *arr, int64_t left, int64_t right,
type_t pivot, type_t *smallest,
type_t *biggest, bool use_gt) {
auto comparison_func = use_gt ? comparison_func_gt<vtype> : comparison_func_ge<vtype>;
/* make array length divisible by vtype::numlanes, shortening the array */
for (int32_t i = (right - left) % vtype::numlanes; i > 0; --i) {
*smallest = std::min(*smallest, arr[left], comparison_func);
*biggest = std::max(*biggest, arr[left], comparison_func);
if (!comparison_func(arr[left], pivot)) {
std::swap(arr[left], arr[--right]);
} else {
++left;
}
}
if (left == right)
return left; /* less than vtype::numlanes elements in the array */
using zmm_t = typename vtype::zmm_t;
zmm_t pivot_vec = vtype::set1(pivot);
zmm_t min_vec = vtype::set1(*smallest);
zmm_t max_vec = vtype::set1(*biggest);
if (right - left == vtype::numlanes) {
zmm_t vec = vtype::loadu(arr + left);
int32_t amount_ge_pivot =
partition_vec<vtype>(arr, left, left + vtype::numlanes, vec,
pivot_vec, &min_vec, &max_vec, use_gt);
*smallest = vtype::reducemin(min_vec);
*biggest = vtype::reducemax(max_vec);
return left + (vtype::numlanes - amount_ge_pivot);
}
// first and last vtype::numlanes values are partitioned at the end
zmm_t vec_left = vtype::loadu(arr + left);
zmm_t vec_right = vtype::loadu(arr + (right - vtype::numlanes));
// store points of the vectors
int64_t r_store = right - vtype::numlanes;
int64_t l_store = left;
// indices for loading the elements
left += vtype::numlanes;
right -= vtype::numlanes;
while (right - left != 0) {
zmm_t curr_vec;
/*
* if fewer elements are stored on the right side of the array,
 * then the next elements are loaded from the right side,
* otherwise from the left side
*/
if ((r_store + vtype::numlanes) - right < left - l_store) {
right -= vtype::numlanes;
curr_vec = vtype::loadu(arr + right);
} else {
curr_vec = vtype::loadu(arr + left);
left += vtype::numlanes;
}
// partition the current vector and save it on both sides of the array
int32_t amount_ge_pivot =
partition_vec<vtype>(arr, l_store, r_store + vtype::numlanes,
curr_vec, pivot_vec, &min_vec, &max_vec, use_gt);
;
r_store -= amount_ge_pivot;
l_store += (vtype::numlanes - amount_ge_pivot);
}
/* partition and save vec_left and vec_right */
int32_t amount_ge_pivot =
partition_vec<vtype>(arr, l_store, r_store + vtype::numlanes, vec_left,
pivot_vec, &min_vec, &max_vec, use_gt);
l_store += (vtype::numlanes - amount_ge_pivot);
amount_ge_pivot =
partition_vec<vtype>(arr, l_store, l_store + vtype::numlanes, vec_right,
pivot_vec, &min_vec, &max_vec, use_gt);
l_store += (vtype::numlanes - amount_ge_pivot);
*smallest = vtype::reducemin(min_vec);
*biggest = vtype::reducemax(max_vec);
return l_store;
}
template <typename vtype, int num_unroll,
typename type_t = typename vtype::type_t>
static inline int64_t partition_avx512_unrolled(type_t *arr, int64_t left,
int64_t right, type_t pivot,
type_t *smallest,
type_t *biggest, bool use_gt) {
if (right - left <= 2 * num_unroll * vtype::numlanes) {
return partition_avx512<vtype>(arr, left, right, pivot, smallest,
biggest, use_gt);
}
auto comparison_func = use_gt ? comparison_func_gt<vtype> : comparison_func_ge<vtype>;
/* make array length divisible by 8*vtype::numlanes, shortening the array */
for (int32_t i = ((right - left) % (num_unroll * vtype::numlanes)); i > 0;
--i) {
*smallest = std::min(*smallest, arr[left], comparison_func);
*biggest = std::max(*biggest, arr[left], comparison_func);
if (!comparison_func(arr[left], pivot)) {
std::swap(arr[left], arr[--right]);
} else {
++left;
}
}
if (left == right)
return left; /* less than vtype::numlanes elements in the array */
using zmm_t = typename vtype::zmm_t;
zmm_t pivot_vec = vtype::set1(pivot);
zmm_t min_vec = vtype::set1(*smallest);
zmm_t max_vec = vtype::set1(*biggest);
// We will now have at least 16 registers' worth of data to process:
// left and right vtype::numlanes values are partitioned at the end
zmm_t vec_left[num_unroll], vec_right[num_unroll];
#pragma GCC unroll 8
for (int ii = 0; ii < num_unroll; ++ii) {
vec_left[ii] = vtype::loadu(arr + left + vtype::numlanes * ii);
vec_right[ii] =
vtype::loadu(arr + (right - vtype::numlanes * (num_unroll - ii)));
}
// store points of the vectors
int64_t r_store = right - vtype::numlanes;
int64_t l_store = left;
// indices for loading the elements
left += num_unroll * vtype::numlanes;
right -= num_unroll * vtype::numlanes;
while (right - left != 0) {
zmm_t curr_vec[num_unroll];
/*
* if fewer elements are stored on the right side of the array,
 * then the next elements are loaded from the right side,
* otherwise from the left side
*/
if ((r_store + vtype::numlanes) - right < left - l_store) {
right -= num_unroll * vtype::numlanes;
#pragma GCC unroll 8
for (int ii = 0; ii < num_unroll; ++ii) {
curr_vec[ii] = vtype::loadu(arr + right + ii * vtype::numlanes);
}
} else {
#pragma GCC unroll 8
for (int ii = 0; ii < num_unroll; ++ii) {
curr_vec[ii] = vtype::loadu(arr + left + ii * vtype::numlanes);
}
left += num_unroll * vtype::numlanes;
}
// partition the current vector and save it on both sides of the array
#pragma GCC unroll 8
for (int ii = 0; ii < num_unroll; ++ii) {
int32_t amount_ge_pivot = partition_vec<vtype>(
arr, l_store, r_store + vtype::numlanes, curr_vec[ii],
pivot_vec, &min_vec, &max_vec, use_gt);
l_store += (vtype::numlanes - amount_ge_pivot);
r_store -= amount_ge_pivot;
}
}
/* partition and save vec_left[8] and vec_right[8] */
#pragma GCC unroll 8
for (int ii = 0; ii < num_unroll; ++ii) {
int32_t amount_ge_pivot =
partition_vec<vtype>(arr, l_store, r_store + vtype::numlanes,
vec_left[ii], pivot_vec, &min_vec, &max_vec, use_gt);
l_store += (vtype::numlanes - amount_ge_pivot);
r_store -= amount_ge_pivot;
}
#pragma GCC unroll 8
for (int ii = 0; ii < num_unroll; ++ii) {
int32_t amount_ge_pivot =
partition_vec<vtype>(arr, l_store, r_store + vtype::numlanes,
vec_right[ii], pivot_vec, &min_vec, &max_vec, use_gt);
l_store += (vtype::numlanes - amount_ge_pivot);
r_store -= amount_ge_pivot;
}
*smallest = vtype::reducemin(min_vec);
*biggest = vtype::reducemax(max_vec);
return l_store;
}
// to_index (exclusive)
template <typename vtype, typename type_t>
static int64_t vectorized_partition(type_t *arr, int64_t from_index, int64_t to_index, type_t pivot, bool use_gt) {
type_t smallest = vtype::type_max();
type_t biggest = vtype::type_min();
int64_t pivot_index = partition_avx512_unrolled<vtype, 2>(
arr, from_index, to_index, pivot, &smallest, &biggest, use_gt);
return pivot_index;
}
// partitioning functions
template <typename T>
void avx512_dual_pivot_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot1, int64_t index_pivot2){
const T pivot1 = arr[index_pivot1];
const T pivot2 = arr[index_pivot2];
const int64_t low = from_index;
const int64_t high = to_index;
const int64_t start = low + 1;
const int64_t end = high - 1;
std::swap(arr[index_pivot1], arr[low]);
std::swap(arr[index_pivot2], arr[end]);
const int64_t pivot_index2 = vectorized_partition<zmm_vector<T>, T>(arr, start, end, pivot2, true); // use_gt = true
std::swap(arr[end], arr[pivot_index2]);
int64_t upper = pivot_index2;
// if all other elements are greater than pivot2 (and pivot1), no need to do further partitioning
if (upper == start) {
pivot_indices[0] = low;
pivot_indices[1] = upper;
return;
}
const int64_t pivot_index1 = vectorized_partition<zmm_vector<T>, T>(arr, start, upper, pivot1, false); // use_ge (use_gt = false)
int64_t lower = pivot_index1 - 1;
std::swap(arr[low], arr[lower]);
pivot_indices[0] = lower;
pivot_indices[1] = upper;
}
template <typename T>
void avx512_single_pivot_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot){
const T pivot = arr[index_pivot];
const int64_t low = from_index;
const int64_t high = to_index;
const int64_t end = high - 1;
const int64_t pivot_index1 = vectorized_partition<zmm_vector<T>, T>(arr, low, high, pivot, false); // use_gt = false (use_ge)
int64_t lower = pivot_index1;
const int64_t pivot_index2 = vectorized_partition<zmm_vector<T>, T>(arr, pivot_index1, high, pivot, true); // use_gt = true
int64_t upper = pivot_index2;
pivot_indices[0] = lower;
pivot_indices[1] = upper;
}
template <typename T>
void inline avx512_fast_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot1, int64_t index_pivot2) {
if (index_pivot1 != index_pivot2) {
avx512_dual_pivot_partition<T>(arr, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
}
else {
avx512_single_pivot_partition<T>(arr, from_index, to_index, pivot_indices, index_pivot1);
}
}
template <typename T>
void inline insertion_sort(T *arr, int32_t from_index, int32_t to_index) {
for (int i, k = from_index; ++k < to_index; ) {
T ai = arr[i = k];
if (ai < arr[i - 1]) {
while (--i >= from_index && ai < arr[i]) {
arr[i + 1] = arr[i];
}
arr[i + 1] = ai;
}
}
}
template <typename T>
void inline avx512_fast_sort(T *arr, int64_t from_index, int64_t to_index, const int32_t INS_SORT_THRESHOLD) {
int32_t size = to_index - from_index;
if (size <= INS_SORT_THRESHOLD) {
insertion_sort<T>(arr, from_index, to_index);
}
else {
avx512_qsort<T>(arr, from_index, to_index);
}
}
#endif // AVX512_QSORT_COMMON


@@ -21,12 +21,15 @@
* questions.
*
*/
#include "simdsort-support.hpp"
#ifdef __SIMDSORT_SUPPORTED_LINUX
#pragma GCC target("avx512dq", "avx512f")
#include "avx512-32bit-qsort.hpp"
#include "avx512-64bit-qsort.hpp"
#include "classfile_constants.h"
#define DLL_PUBLIC __attribute__((visibility("default")))
#define INSERTION_SORT_THRESHOLD_32BIT 16
#define INSERTION_SORT_THRESHOLD_64BIT 20
@@ -36,35 +39,41 @@ extern "C" {
DLL_PUBLIC void avx512_sort(void *array, int elem_type, int32_t from_index, int32_t to_index) {
switch(elem_type) {
case JVM_T_INT:
avx512_fast_sort<int32_t>((int32_t*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT);
avx512_fast_sort((int32_t*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT);
break;
case JVM_T_LONG:
avx512_fast_sort<int64_t>((int64_t*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT);
avx512_fast_sort((int64_t*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT);
break;
case JVM_T_FLOAT:
avx512_fast_sort<float>((float*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT);
avx512_fast_sort((float*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT);
break;
case JVM_T_DOUBLE:
avx512_fast_sort<double>((double*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT);
avx512_fast_sort((double*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT);
break;
default:
assert(false, "Unexpected type");
}
}
DLL_PUBLIC void avx512_partition(void *array, int elem_type, int32_t from_index, int32_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) {
switch(elem_type) {
case JVM_T_INT:
avx512_fast_partition<int32_t>((int32_t*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
avx512_fast_partition((int32_t*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
break;
case JVM_T_LONG:
avx512_fast_partition<int64_t>((int64_t*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
avx512_fast_partition((int64_t*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
break;
case JVM_T_FLOAT:
avx512_fast_partition<float>((float*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
avx512_fast_partition((float*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
break;
case JVM_T_DOUBLE:
avx512_fast_partition<double>((double*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
avx512_fast_partition((double*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
break;
default:
assert(false, "Unexpected type");
}
}
}
#endif
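For context, a hypothetical caller of the exported entry point might look like the sketch below (illustrative only; it assumes the JVM_T_INT tag from classfile_constants.h and an AVX-512 capable Linux build):
#include <cstdint>
#include "classfile_constants.h"

extern "C" void avx512_sort(void *array, int elem_type, int32_t from_index,
                            int32_t to_index);

// Sort the first n elements of an int array through the libsimdsort entry point.
void sort_ints(int32_t *data, int32_t n) {
    avx512_sort(data, JVM_T_INT, 0, n);
}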


@@ -0,0 +1,39 @@
/*
* Copyright (c) 2023 Intel Corporation. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#ifndef SIMDSORT_SUPPORT_HPP
#define SIMDSORT_SUPPORT_HPP
#include <stdio.h>
#include <stdlib.h>
#undef assert
#define assert(cond, msg) { if (!(cond)) { fprintf(stderr, "assert fails %s %d: %s\n", __FILE__, __LINE__, msg); abort(); }}
// GCC >= 7.5 is needed to build AVX2 portions of libsimdsort using C++17 features
#if defined(_LP64) && (defined(__GNUC__) && ((__GNUC__ > 7) || ((__GNUC__ == 7) && (__GNUC_MINOR__ >= 5))))
#define __SIMDSORT_SUPPORTED_LINUX
#endif
#endif //SIMDSORT_SUPPORT_HPP


@@ -0,0 +1,101 @@
/*
* Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
* Copyright (c) 2021 Serge Sans Paille. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort)
#ifndef XSS_COMMON_INCLUDES
#define XSS_COMMON_INCLUDES
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
/*
Workaround for the bug in GCC12 (that was fixed in GCC 12.3.1).
More details are available at:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
*/
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include <immintrin.h>
#pragma GCC diagnostic pop
#include <limits>
#include <vector>
#define X86_SIMD_SORT_INFINITY std::numeric_limits<double>::infinity()
#define X86_SIMD_SORT_INFINITYF std::numeric_limits<float>::infinity()
#define X86_SIMD_SORT_INFINITYH 0x7c00
#define X86_SIMD_SORT_NEGINFINITYH 0xfc00
#define X86_SIMD_SORT_MAX_UINT16 std::numeric_limits<uint16_t>::max()
#define X86_SIMD_SORT_MAX_INT16 std::numeric_limits<int16_t>::max()
#define X86_SIMD_SORT_MIN_INT16 std::numeric_limits<int16_t>::min()
#define X86_SIMD_SORT_MAX_UINT32 std::numeric_limits<uint32_t>::max()
#define X86_SIMD_SORT_MAX_INT32 std::numeric_limits<int32_t>::max()
#define X86_SIMD_SORT_MIN_INT32 std::numeric_limits<int32_t>::min()
#define X86_SIMD_SORT_MAX_UINT64 std::numeric_limits<uint64_t>::max()
#define X86_SIMD_SORT_MAX_INT64 std::numeric_limits<int64_t>::max()
#define X86_SIMD_SORT_MIN_INT64 std::numeric_limits<int64_t>::min()
#define ZMM_MAX_DOUBLE _mm512_set1_pd(X86_SIMD_SORT_INFINITY)
#define ZMM_MAX_UINT64 _mm512_set1_epi64(X86_SIMD_SORT_MAX_UINT64)
#define ZMM_MAX_INT64 _mm512_set1_epi64(X86_SIMD_SORT_MAX_INT64)
#define ZMM_MAX_FLOAT _mm512_set1_ps(X86_SIMD_SORT_INFINITYF)
#define ZMM_MAX_UINT _mm512_set1_epi32(X86_SIMD_SORT_MAX_UINT32)
#define ZMM_MAX_INT _mm512_set1_epi32(X86_SIMD_SORT_MAX_INT32)
#define ZMM_MAX_HALF _mm512_set1_epi16(X86_SIMD_SORT_INFINITYH)
#define YMM_MAX_HALF _mm256_set1_epi16(X86_SIMD_SORT_INFINITYH)
#define ZMM_MAX_UINT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_UINT16)
#define ZMM_MAX_INT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_INT16)
#define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d
#define PRAGMA(x) _Pragma(#x)
#define UNUSED(x) (void)(x)
/* Compiler-specific macros */
#if defined(__GNUC__)
#define X86_SIMD_SORT_INLINE static inline
#define X86_SIMD_SORT_FINLINE static inline __attribute__((always_inline))
#else
#define X86_SIMD_SORT_INLINE static
#define X86_SIMD_SORT_FINLINE static
#endif
#if __GNUC__ >= 8
#define X86_SIMD_SORT_UNROLL_LOOP(num) PRAGMA(GCC unroll num)
#else
#define X86_SIMD_SORT_UNROLL_LOOP(num)
#endif
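A small usage sketch (illustrative only): on GCC 8+ the macro expands to a "#pragma GCC unroll" request in front of the loop and to nothing on older compilers, so call sites stay portable.
// Illustrative only: requesting 4x unrolling where supported.
inline void double_all(int *v, int n) {
    X86_SIMD_SORT_UNROLL_LOOP(4)
    for (int i = 0; i < n; ++i) {
        v[i] *= 2;   // stands in for per-vector partition work
    }
}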
typedef size_t arrsize_t;
template <typename type>
struct zmm_vector;
template <typename type>
struct ymm_vector;
template <typename type>
struct avx2_vector;
#endif // XSS_COMMON_INCLUDES


@@ -0,0 +1,528 @@
/*
* Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
* Copyright (c) 2021 Serge Sans Paille. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort)
#ifndef XSS_COMMON_QSORT
#define XSS_COMMON_QSORT
/*
 * Quicksort using AVX-512. The ideas and code are based on the two research
 * papers [1] and [2]. At a high level, the idea is to vectorize quicksort
 * partitioning using AVX-512 compressstore instructions. If the array size is
 * < 128, a bitonic sorting network implemented on 512-bit registers is used
 * instead. The precise network definitions depend on the dtype and are defined
 * in separate files: avx512-16bit-qsort.hpp, avx512-32bit-qsort.hpp and
 * avx512-64bit-qsort.hpp. Article [4] is a good resource on bitonic sorting
 * networks. The core implementations of the vectorized qsort functions
 * avx512_qsort<T>(T*, arrsize_t) are modified versions of the AVX2 quicksort
 * presented in paper [2] and the source code associated with that paper [3].
*
* [1] Fast and Robust Vectorized In-Place Sorting of Primitive Types
* https://drops.dagstuhl.de/opus/volltexte/2021/13775/
*
* [2] A Novel Hybrid Quicksort Algorithm Vectorized using AVX-512 on Intel
* Skylake https://arxiv.org/pdf/1704.08579.pdf
*
* [3] https://github.com/simd-sorting/fast-and-robust: SPDX-License-Identifier:
* MIT
*
* [4] http://mitp-content-server.mit.edu:18180/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030
*
*/
#include "xss-common-includes.h"
#include "xss-pivot-selection.hpp"
#include "xss-network-qsort.hpp"
template <typename T>
bool is_a_nan(T elem) {
return std::isnan(elem);
}
template <typename T>
X86_SIMD_SORT_INLINE T get_pivot_scalar(T *arr, const int64_t left, const int64_t right) {
// median of 8 equally spaced elements
int64_t NUM_ELEMENTS = 8;
int64_t MID = NUM_ELEMENTS / 2;
int64_t size = (right - left) / NUM_ELEMENTS;
T temp[NUM_ELEMENTS];
for (int64_t i = 0; i < NUM_ELEMENTS; i++) temp[i] = arr[left + (i * size)];
std::sort(temp, temp + NUM_ELEMENTS);
return temp[MID];
}
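A worked example (illustrative, not part of the sources): calling get_pivot_scalar directly on a 64-element ascending range samples indices 0, 8, ..., 56 and returns the fifth-smallest sample.
#include <cassert>
#include <numeric>
#include <vector>

void pivot_example() {
    std::vector<int> v(64);
    std::iota(v.begin(), v.end(), 0);   // 0, 1, ..., 63
    // size = (64 - 0) / 8 = 8, so the samples are 0, 8, 16, 24, 32, 40, 48, 56;
    // after sorting, temp[4] == 32 is returned as the pivot.
    assert(get_pivot_scalar<int>(v.data(), 0, 64) == 32);
}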
template <typename vtype, typename T = typename vtype::type_t>
bool comparison_func_ge(const T &a, const T &b) {
return a < b;
}
template <typename vtype, typename T = typename vtype::type_t>
bool comparison_func_gt(const T &a, const T &b) {
return a <= b;
}
/*
* COEX == Compare and Exchange two registers by swapping min and max values
*/
template <typename vtype, typename mm_t>
X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b) {
mm_t temp = a;
a = vtype::min(a, b);
b = vtype::max(temp, b);
}
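To make the compare-and-exchange contract concrete, here is a sketch with a trivial scalar stand-in for vtype (a hypothetical type, for illustration only): after the call, a holds the minimum and b the maximum.
struct scalar_ops {   // hypothetical stand-in providing min/max like a vtype
    static int min(int x, int y) { return x < y ? x : y; }
    static int max(int x, int y) { return x > y ? x : y; }
};

void coex_example() {
    int a = 7, b = 3;
    COEX<scalar_ops>(a, b);   // now a == 3 and b == 7
}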
template <typename vtype, typename reg_t = typename vtype::reg_t,
typename opmask_t = typename vtype::opmask_t>
X86_SIMD_SORT_INLINE reg_t cmp_merge(reg_t in1, reg_t in2, opmask_t mask) {
reg_t min = vtype::min(in2, in1);
reg_t max = vtype::max(in2, in1);
return vtype::mask_mov(min, mask, max); // 0 -> min, 1 -> max
}
template <typename vtype, typename type_t, typename reg_t>
int avx512_double_compressstore(type_t *left_addr, type_t *right_addr,
typename vtype::opmask_t k, reg_t reg) {
int amount_ge_pivot = _mm_popcnt_u32((int)k);
vtype::mask_compressstoreu(left_addr, vtype::knot_opmask(k), reg);
vtype::mask_compressstoreu(right_addr + vtype::numlanes - amount_ge_pivot,
k, reg);
return amount_ge_pivot;
}
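A scalar model of what the double compress-store does for one 8-lane vector may help (illustrative only; the real code uses the masked compress-store intrinsics above): lanes whose mask bit is clear are packed to the left destination, lanes whose bit is set are packed to the tail of the right window, and the popcount of the mask is returned.
#include <cstdint>

static int double_compressstore_scalar(int32_t *left_addr, int32_t *right_addr,
                                       uint8_t k, const int32_t (&reg)[8]) {
    int amount_ge_pivot = __builtin_popcount(k);
    int l = 0;
    int r = 8 - amount_ge_pivot;          // right window is filled from its tail
    for (int lane = 0; lane < 8; ++lane) {
        if ((k >> lane) & 1) {
            right_addr[r++] = reg[lane];  // lane >= pivot
        } else {
            left_addr[l++] = reg[lane];   // lane < pivot
        }
    }
    return amount_ge_pivot;
}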
// Generic function dispatches to AVX2 or AVX512 code
template <typename vtype, typename type_t,
typename reg_t = typename vtype::reg_t>
X86_SIMD_SORT_INLINE arrsize_t partition_vec(type_t *l_store, type_t *r_store,
const reg_t curr_vec,
const reg_t pivot_vec,
reg_t &smallest_vec,
reg_t &biggest_vec, bool use_gt) {
//typename vtype::opmask_t ge_mask = vtype::ge(curr_vec, pivot_vec);
typename vtype::opmask_t mask;
if (use_gt) mask = vtype::gt(curr_vec, pivot_vec);
else mask = vtype::ge(curr_vec, pivot_vec);
int amount_ge_pivot =
vtype::double_compressstore(l_store, r_store, mask, curr_vec);
smallest_vec = vtype::min(curr_vec, smallest_vec);
biggest_vec = vtype::max(curr_vec, biggest_vec);
return amount_ge_pivot;
}
/*
 * Partition an array based on the pivot and return the index of the
* first element that is greater than or equal to the pivot.
*/
template <typename vtype, typename type_t>
X86_SIMD_SORT_INLINE arrsize_t partition_avx512(type_t *arr, arrsize_t left,
arrsize_t right, type_t pivot,
type_t *smallest,
type_t *biggest,
bool use_gt) {
auto comparison_func = use_gt ? comparison_func_gt<vtype> : comparison_func_ge<vtype>;
/* make array length divisible by vtype::numlanes, shortening the array */
for (int32_t i = (right - left) % vtype::numlanes; i > 0; --i) {
*smallest = std::min(*smallest, arr[left], comparison_func);
*biggest = std::max(*biggest, arr[left], comparison_func);
if (!comparison_func(arr[left], pivot)) {
std::swap(arr[left], arr[--right]);
} else {
++left;
}
}
if (left == right)
return left; /* less than vtype::numlanes elements in the array */
using reg_t = typename vtype::reg_t;
reg_t pivot_vec = vtype::set1(pivot);
reg_t min_vec = vtype::set1(*smallest);
reg_t max_vec = vtype::set1(*biggest);
if (right - left == vtype::numlanes) {
reg_t vec = vtype::loadu(arr + left);
arrsize_t unpartitioned = right - left - vtype::numlanes;
arrsize_t l_store = left;
arrsize_t amount_ge_pivot =
partition_vec<vtype>(arr + l_store, arr + l_store + unpartitioned,
vec, pivot_vec, min_vec, max_vec, use_gt);
l_store += (vtype::numlanes - amount_ge_pivot);
*smallest = vtype::reducemin(min_vec);
*biggest = vtype::reducemax(max_vec);
return l_store;
}
// first and last vtype::numlanes values are partitioned at the end
reg_t vec_left = vtype::loadu(arr + left);
reg_t vec_right = vtype::loadu(arr + (right - vtype::numlanes));
// store positions for the partitioned vectors
arrsize_t unpartitioned = right - left - vtype::numlanes;
arrsize_t l_store = left;
// indices for loading the elements
left += vtype::numlanes;
right -= vtype::numlanes;
while (right - left != 0) {
reg_t curr_vec;
/*
 * if fewer elements are stored on the right side of the array,
 * the next elements are loaded from the right side,
 * otherwise from the left side
 */
if ((l_store + unpartitioned + vtype::numlanes) - right <
left - l_store) {
right -= vtype::numlanes;
curr_vec = vtype::loadu(arr + right);
} else {
curr_vec = vtype::loadu(arr + left);
left += vtype::numlanes;
}
// partition the current vector and save it on both sides of the array
arrsize_t amount_ge_pivot =
partition_vec<vtype>(arr + l_store, arr + l_store + unpartitioned,
curr_vec, pivot_vec, min_vec, max_vec, use_gt);
l_store += (vtype::numlanes - amount_ge_pivot);
unpartitioned -= vtype::numlanes;
}
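/* Why the loading rule above is safe: the next vector is always read from
 * whichever end has the smaller gap between its read cursor and its store
 * cursor, so both ends keep at least one full vector of already-loaded slack
 * and the compress-stores in partition_vec never overwrite elements that
 * have not been read yet. */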
/* partition and save vec_left and vec_right */
arrsize_t amount_ge_pivot =
partition_vec<vtype>(arr + l_store, arr + l_store + unpartitioned,
vec_left, pivot_vec, min_vec, max_vec, use_gt);
l_store += (vtype::numlanes - amount_ge_pivot);
unpartitioned -= vtype::numlanes;
amount_ge_pivot =
partition_vec<vtype>(arr + l_store, arr + l_store + unpartitioned,
vec_right, pivot_vec, min_vec, max_vec, use_gt);
l_store += (vtype::numlanes - amount_ge_pivot);
unpartitioned -= vtype::numlanes;
*smallest = vtype::reducemin(min_vec);
*biggest = vtype::reducemax(max_vec);
return l_store;
}
template <typename vtype, int num_unroll,
typename type_t = typename vtype::type_t>
X86_SIMD_SORT_INLINE arrsize_t
partition_avx512_unrolled(type_t *arr, arrsize_t left, arrsize_t right,
type_t pivot, type_t *smallest, type_t *biggest, bool use_gt) {
if constexpr (num_unroll == 0) {
return partition_avx512<vtype>(arr, left, right, pivot, smallest,
biggest, use_gt);
}
/* Use regular partition_avx512 for smaller arrays */
if (right - left < 3 * num_unroll * vtype::numlanes) {
return partition_avx512<vtype>(arr, left, right, pivot, smallest,
biggest, use_gt);
}
auto comparison_func = use_gt ? comparison_func_gt<vtype> : comparison_func_ge<vtype>;
/* make array length divisible by vtype::numlanes, shortening the array */
for (int32_t i = ((right - left) % (vtype::numlanes)); i > 0; --i) {
*smallest = std::min(*smallest, arr[left], comparison_func);
*biggest = std::max(*biggest, arr[left], comparison_func);
if (!comparison_func(arr[left], pivot)) {
std::swap(arr[left], arr[--right]);
} else {
++left;
}
}
arrsize_t unpartitioned = right - left - vtype::numlanes;
arrsize_t l_store = left;
using reg_t = typename vtype::reg_t;
reg_t pivot_vec = vtype::set1(pivot);
reg_t min_vec = vtype::set1(*smallest);
reg_t max_vec = vtype::set1(*biggest);
/* Calculate and load more registers to make the rest of the array a
* multiple of num_unroll. These registers will be partitioned at the very
* end. */
int vecsToPartition = ((right - left) / vtype::numlanes) % num_unroll;
reg_t vec_align[num_unroll];
for (int i = 0; i < vecsToPartition; i++) {
vec_align[i] = vtype::loadu(arr + left + i * vtype::numlanes);
}
left += vecsToPartition * vtype::numlanes;
/* We will now have at least 3*num_unroll registers worth of data to
 * process. Load vtype::numlanes*num_unroll values from each end into
 * registers to make space for the in-place partition. The vec_left and
 * vec_right registers are partitioned at the end */
reg_t vec_left[num_unroll], vec_right[num_unroll];
X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
vec_left[ii] = vtype::loadu(arr + left + vtype::numlanes * ii);
vec_right[ii] =
vtype::loadu(arr + (right - vtype::numlanes * (num_unroll - ii)));
}
/* indices for loading the elements */
left += num_unroll * vtype::numlanes;
right -= num_unroll * vtype::numlanes;
while (right - left != 0) {
reg_t curr_vec[num_unroll];
/*
 * if fewer elements are stored on the right side of the array,
 * the next elements are loaded from the right side,
 * otherwise from the left side
 */
if ((l_store + unpartitioned + vtype::numlanes) - right <
left - l_store) {
right -= num_unroll * vtype::numlanes;
X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
curr_vec[ii] = vtype::loadu(arr + right + ii * vtype::numlanes);
/*
* error: '_mm_prefetch' needs target feature mmx on clang-cl
*/
#if !(defined(_MSC_VER) && defined(__clang__))
_mm_prefetch((char *)(arr + right + ii * vtype::numlanes -
num_unroll * vtype::numlanes),
_MM_HINT_T0);
#endif
}
} else {
X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
curr_vec[ii] = vtype::loadu(arr + left + ii * vtype::numlanes);
/*
* error: '_mm_prefetch' needs target feature mmx on clang-cl
*/
#if !(defined(_MSC_VER) && defined(__clang__))
_mm_prefetch((char *)(arr + left + ii * vtype::numlanes +
num_unroll * vtype::numlanes),
_MM_HINT_T0);
#endif
}
left += num_unroll * vtype::numlanes;
}
/* partition the current vector and save it on both sides of the array */
X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
arrsize_t amount_ge_pivot = partition_vec<vtype>(
arr + l_store, arr + l_store + unpartitioned, curr_vec[ii],
pivot_vec, min_vec, max_vec, use_gt);
l_store += (vtype::numlanes - amount_ge_pivot);
unpartitioned -= vtype::numlanes;
}
}
/* partition and save vec_left[num_unroll] and vec_right[num_unroll] */
X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
arrsize_t amount_ge_pivot =
partition_vec<vtype>(arr + l_store, arr + l_store + unpartitioned,
vec_left[ii], pivot_vec, min_vec, max_vec, use_gt);
l_store += (vtype::numlanes - amount_ge_pivot);
unpartitioned -= vtype::numlanes;
}
X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
arrsize_t amount_ge_pivot =
partition_vec<vtype>(arr + l_store, arr + l_store + unpartitioned,
vec_right[ii], pivot_vec, min_vec, max_vec, use_gt);
l_store += (vtype::numlanes - amount_ge_pivot);
unpartitioned -= vtype::numlanes;
}
/* partition and save vec_align[vecsToPartition] */
X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < vecsToPartition; ++ii) {
arrsize_t amount_ge_pivot =
partition_vec<vtype>(arr + l_store, arr + l_store + unpartitioned,
vec_align[ii], pivot_vec, min_vec, max_vec, use_gt);
l_store += (vtype::numlanes - amount_ge_pivot);
unpartitioned -= vtype::numlanes;
}
*smallest = vtype::reducemin(min_vec);
*biggest = vtype::reducemax(max_vec);
return l_store;
}
template <typename vtype, int maxN>
void sort_n(typename vtype::type_t *arr, int N);
template <typename vtype, typename type_t>
static void qsort_(type_t *arr, arrsize_t left, arrsize_t right,
arrsize_t max_iters) {
/*
 * Resort to std::sort if quicksort isn't making any progress
 */
if (max_iters <= 0) {
std::sort(arr + left, arr + right + 1, comparison_func_ge<vtype>);
return;
}
/*
* Base case: use bitonic networks to sort arrays <=
* vtype::network_sort_threshold
*/
if (right + 1 - left <= vtype::network_sort_threshold) {
sort_n<vtype, vtype::network_sort_threshold>(
arr + left, (int32_t)(right + 1 - left));
return;
}
type_t pivot = get_pivot_blocks<vtype, type_t>(arr, left, right);
type_t smallest = vtype::type_max();
type_t biggest = vtype::type_min();
arrsize_t pivot_index =
partition_avx512_unrolled<vtype, vtype::partition_unroll_factor>(
arr, left, right + 1, pivot, &smallest, &biggest, false);
if (pivot != smallest)
qsort_<vtype>(arr, left, pivot_index - 1, max_iters - 1);
if (pivot != biggest) qsort_<vtype>(arr, pivot_index, right, max_iters - 1);
}
// Hooks for OpenJDK sort
// to_index (exclusive)
template <typename vtype, typename type_t>
static int64_t vectorized_partition(type_t *arr, int64_t from_index, int64_t to_index, type_t pivot, bool use_gt) {
type_t smallest = vtype::type_max();
type_t biggest = vtype::type_min();
int64_t pivot_index = partition_avx512_unrolled<vtype, 2>(
arr, from_index, to_index, pivot, &smallest, &biggest, use_gt);
return pivot_index;
}
// partitioning functions
template <typename vtype, typename T>
X86_SIMD_SORT_INLINE void simd_dual_pivot_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot1, int64_t index_pivot2){
const T pivot1 = arr[index_pivot1];
const T pivot2 = arr[index_pivot2];
const int64_t low = from_index;
const int64_t high = to_index;
const int64_t start = low + 1;
const int64_t end = high - 1;
std::swap(arr[index_pivot1], arr[low]);
std::swap(arr[index_pivot2], arr[end]);
const int64_t pivot_index2 = vectorized_partition<vtype, T>(arr, start, end, pivot2, true); // use_gt = true
std::swap(arr[end], arr[pivot_index2]);
int64_t upper = pivot_index2;
// if all other elements are greater than pivot2 (and pivot1), no need to do further partitioning
if (upper == start) {
pivot_indices[0] = low;
pivot_indices[1] = upper;
return;
}
const int64_t pivot_index1 = vectorized_partition<vtype, T>(arr, start, upper, pivot1, false); // use_ge (use_gt = false)
int64_t lower = pivot_index1 - 1;
std::swap(arr[low], arr[lower]);
pivot_indices[0] = lower;
pivot_indices[1] = upper;
}
template <typename vtype, typename T>
X86_SIMD_SORT_INLINE void simd_single_pivot_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot) {
const T pivot = arr[index_pivot];
const int64_t low = from_index;
const int64_t high = to_index;
const int64_t end = high - 1;
const int64_t pivot_index1 = vectorized_partition<vtype, T>(arr, low, high, pivot, false); // use_gt = false (use_ge)
int64_t lower = pivot_index1;
const int64_t pivot_index2 = vectorized_partition<vtype, T>(arr, pivot_index1, high, pivot, true); // use_gt = true
int64_t upper = pivot_index2;
pivot_indices[0] = lower;
pivot_indices[1] = upper;
}
template <typename vtype, typename T>
X86_SIMD_SORT_INLINE void simd_fast_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot1, int64_t index_pivot2) {
if (index_pivot1 != index_pivot2) {
simd_dual_pivot_partition<vtype, T>(arr, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
}
else {
simd_single_pivot_partition<vtype, T>(arr, from_index, to_index, pivot_indices, index_pivot1);
}
}
template <typename T>
X86_SIMD_SORT_INLINE void insertion_sort(T *arr, int32_t from_index, int32_t to_index) {
for (int i, k = from_index; ++k < to_index; ) {
T ai = arr[i = k];
if (ai < arr[i - 1]) {
while (--i >= from_index && ai < arr[i]) {
arr[i + 1] = arr[i];
}
arr[i + 1] = ai;
}
}
}
template <typename vtype, typename T>
X86_SIMD_SORT_INLINE void simd_fast_sort(T *arr, arrsize_t from_index, arrsize_t to_index, const arrsize_t INS_SORT_THRESHOLD)
{
arrsize_t arrsize = to_index - from_index;
if (arrsize <= INS_SORT_THRESHOLD) {
insertion_sort<T>(arr, from_index, to_index);
} else {
qsort_<vtype, T>(arr, from_index, to_index - 1, 2 * (arrsize_t)log2(arrsize));
}
}
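/*
 * Usage sketch (the vector type here is an assumption): sorting 1000 ints
 * with an insertion-sort threshold of 20 via
 *
 *   simd_fast_sort<zmm_vector<int>, int>(arr, 0, 1000, 20);
 *
 * skips insertion sort and calls qsort_ with a depth budget of
 * 2 * log2(1000), i.e. about 19 levels of recursion before falling back to
 * std::sort.
 */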
#define DEFINE_METHODS(ISA, VTYPE) \
template <typename T> \
X86_SIMD_SORT_INLINE void ISA##_fast_sort( \
T *arr, arrsize_t from_index, arrsize_t to_index, const arrsize_t INS_SORT_THRESHOLD) \
{ \
simd_fast_sort<VTYPE, T>(arr, from_index, to_index, INS_SORT_THRESHOLD); \
} \
template <typename T> \
X86_SIMD_SORT_INLINE void ISA##_fast_partition( \
T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot1, int64_t index_pivot2) \
{ \
simd_fast_partition<VTYPE, T>(arr, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); \
}
DEFINE_METHODS(avx2, avx2_vector<T>)
DEFINE_METHODS(avx512, zmm_vector<T>)
#endif // XSS_COMMON_QSORT


@@ -0,0 +1,209 @@
/*
* Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
* Copyright (c) 2021 Serge Sans Paille. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
// This implementation is based on x86-simd-sort (https://github.com/intel/x86-simd-sort)
#ifndef XSS_NETWORK_QSORT
#define XSS_NETWORK_QSORT
#include "xss-common-qsort.h"
#include "xss-optimal-networks.hpp"
template <typename vtype, int numVecs, typename reg_t = typename vtype::reg_t>
X86_SIMD_SORT_FINLINE void bitonic_sort_n_vec(reg_t *regs) {
if constexpr (numVecs == 1) {
UNUSED(regs);
return;
} else if constexpr (numVecs == 2) {
COEX<vtype>(regs[0], regs[1]);
} else if constexpr (numVecs == 4) {
optimal_sort_4<vtype>(regs);
} else if constexpr (numVecs == 8) {
optimal_sort_8<vtype>(regs);
} else if constexpr (numVecs == 16) {
optimal_sort_16<vtype>(regs);
} else if constexpr (numVecs == 32) {
optimal_sort_32<vtype>(regs);
} else {
static_assert(numVecs == -1, "should not reach here");
}
}
/*
 * Swizzle ops explained:
 * swap_n<scale>: swap neighbouring blocks of size <scale/2> within a block
 * of size <scale>
 *     reg i = [7,6,5,4,3,2,1,0]
 *     swap_n<2>: = [[6,7],[4,5],[2,3],[0,1]]
 *     swap_n<4>: = [[5,4,7,6],[1,0,3,2]]
 *     swap_n<8>: = [[3,2,1,0,7,6,5,4]]
 * reverse_n<scale>: reverse elements within a block of size <scale>
 *     reg i = [7,6,5,4,3,2,1,0]
 *     rev_n<2>: = [[6,7],[4,5],[2,3],[0,1]]
 *     rev_n<4>: = [[4,5,6,7],[0,1,2,3]]
 *     rev_n<8>: = [[0,1,2,3,4,5,6,7]]
 * merge_n<scale>: merge blocks of <scale/2> elements from two regs
 *     reg b,a = [a,a,a,a,a,a,a,a], [b,b,b,b,b,b,b,b]
 *     merge_n<2> = [a,b,a,b,a,b,a,b]
 *     merge_n<4> = [a,a,b,b,a,a,b,b]
 *     merge_n<8> = [a,a,a,a,b,b,b,b]
 */
template <typename vtype, int numVecs, int scale, bool first = true>
X86_SIMD_SORT_FINLINE void internal_merge_n_vec(typename vtype::reg_t *reg) {
using reg_t = typename vtype::reg_t;
using swizzle = typename vtype::swizzle_ops;
if constexpr (scale <= 1) {
UNUSED(reg);
return;
} else {
if constexpr (first) {
// Use reverse then merge
X86_SIMD_SORT_UNROLL_LOOP(64)
for (int i = 0; i < numVecs; i++) {
reg_t &v = reg[i];
reg_t rev = swizzle::template reverse_n<vtype, scale>(v);
COEX<vtype>(rev, v);
v = swizzle::template merge_n<vtype, scale>(v, rev);
}
} else {
// Use swap then merge
X86_SIMD_SORT_UNROLL_LOOP(64)
for (int i = 0; i < numVecs; i++) {
reg_t &v = reg[i];
reg_t swap = swizzle::template swap_n<vtype, scale>(v);
COEX<vtype>(swap, v);
v = swizzle::template merge_n<vtype, scale>(v, swap);
}
}
internal_merge_n_vec<vtype, numVecs, scale / 2, false>(reg);
}
}
template <typename vtype, int numVecs, int scale,
typename reg_t = typename vtype::reg_t>
X86_SIMD_SORT_FINLINE void merge_substep_n_vec(reg_t *regs) {
using swizzle = typename vtype::swizzle_ops;
if constexpr (numVecs <= 1) {
UNUSED(regs);
return;
}
// Reverse upper half of vectors
X86_SIMD_SORT_UNROLL_LOOP(64)
for (int i = numVecs / 2; i < numVecs; i++) {
regs[i] = swizzle::template reverse_n<vtype, scale>(regs[i]);
}
// Do compare exchanges
X86_SIMD_SORT_UNROLL_LOOP(64)
for (int i = 0; i < numVecs / 2; i++) {
COEX<vtype>(regs[i], regs[numVecs - 1 - i]);
}
merge_substep_n_vec<vtype, numVecs / 2, scale>(regs);
merge_substep_n_vec<vtype, numVecs / 2, scale>(regs + numVecs / 2);
}
template <typename vtype, int numVecs, int scale,
typename reg_t = typename vtype::reg_t>
X86_SIMD_SORT_FINLINE void merge_step_n_vec(reg_t *regs) {
// Do cross vector merges
merge_substep_n_vec<vtype, numVecs, scale>(regs);
// Do internal vector merges
internal_merge_n_vec<vtype, numVecs, scale>(regs);
}
template <typename vtype, int numVecs, int numPer = 2,
typename reg_t = typename vtype::reg_t>
X86_SIMD_SORT_FINLINE void merge_n_vec(reg_t *regs) {
if constexpr (numPer > vtype::numlanes) {
UNUSED(regs);
return;
} else {
merge_step_n_vec<vtype, numVecs, numPer>(regs);
merge_n_vec<vtype, numVecs, numPer * 2>(regs);
}
}
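/*
 * Sketch of the merge schedule above, assuming 16-lane vectors: merge_n_vec
 * instantiates merge_step_n_vec with numPer = 2, 4, 8, 16, doubling the
 * bitonic-merge scale each step; the numPer = 32 instantiation trips the
 * numPer > vtype::numlanes guard and ends the recursion.
 */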
template <typename vtype, int numVecs, typename reg_t = typename vtype::reg_t>
X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int N) {
static_assert(numVecs > 0, "numVecs should be > 0");
if constexpr (numVecs > 1) {
if (N * 2 <= numVecs * vtype::numlanes) {
sort_n_vec<vtype, numVecs / 2>(arr, N);
return;
}
}
reg_t vecs[numVecs];
// Generate masks for loading and storing
typename vtype::opmask_t ioMasks[numVecs - numVecs / 2];
X86_SIMD_SORT_UNROLL_LOOP(64)
for (int i = numVecs / 2, j = 0; i < numVecs; i++, j++) {
uint64_t num_to_read =
std::min((uint64_t)std::max(0, N - i * vtype::numlanes),
(uint64_t)vtype::numlanes);
ioMasks[j] = vtype::get_partial_loadmask(num_to_read);
}
// Unmasked part of the load
X86_SIMD_SORT_UNROLL_LOOP(64)
for (int i = 0; i < numVecs / 2; i++) {
vecs[i] = vtype::loadu(arr + i * vtype::numlanes);
}
// Masked part of the load
X86_SIMD_SORT_UNROLL_LOOP(64)
for (int i = numVecs / 2, j = 0; i < numVecs; i++, j++) {
vecs[i] = vtype::mask_loadu(vtype::zmm_max(), ioMasks[j],
arr + i * vtype::numlanes);
}
/* Run the initial sorting network to sort the columns of the [numVecs x
* num_lanes] matrix
*/
bitonic_sort_n_vec<vtype, numVecs>(vecs);
// Merge the vectors using bitonic merging networks
merge_n_vec<vtype, numVecs>(vecs);
// Unmasked part of the store
X86_SIMD_SORT_UNROLL_LOOP(64)
for (int i = 0; i < numVecs / 2; i++) {
vtype::storeu(arr + i * vtype::numlanes, vecs[i]);
}
// Masked part of the store
X86_SIMD_SORT_UNROLL_LOOP(64)
for (int i = numVecs / 2, j = 0; i < numVecs; i++, j++) {
vtype::mask_storeu(arr + i * vtype::numlanes, ioMasks[j], vecs[i]);
}
}
template <typename vtype, int maxN>
X86_SIMD_SORT_INLINE void sort_n(typename vtype::type_t *arr, int N) {
constexpr int numVecs = maxN / vtype::numlanes;
constexpr bool isMultiple = (maxN == (vtype::numlanes * numVecs));
constexpr bool powerOfTwo = (numVecs != 0 && !(numVecs & (numVecs - 1)));
static_assert(powerOfTwo == true && isMultiple == true,
"maxN must be vtype::numlanes times a power of 2");
sort_n_vec<vtype, numVecs>(arr, N);
}
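/*
 * Example of the dispatch above, assuming a 16-lane vector type: sort_n<vtype,
 * 256> yields numVecs == 16, a power of two and an exact multiple of the lane
 * count, so the static_assert holds and sort_n_vec<vtype, 16> runs; at runtime
 * sort_n_vec keeps halving numVecs while N fits into half as many vectors.
 */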
#endif


@@ -0,0 +1,342 @@
/*
* Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
* Copyright (c) 2021 Serge Sans Paille. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
// This implementation is based on x86-simd-sort (https://github.com/intel/x86-simd-sort).
// All of these source files are generated from the optimal networks described in
// https://bertdobbelaere.github.io/sorting_networks.html
template <typename vtype, typename reg_t = typename vtype::reg_t>
X86_SIMD_SORT_FINLINE void optimal_sort_4(reg_t *vecs) {
COEX<vtype>(vecs[0], vecs[2]);
COEX<vtype>(vecs[1], vecs[3]);
COEX<vtype>(vecs[0], vecs[1]);
COEX<vtype>(vecs[2], vecs[3]);
COEX<vtype>(vecs[1], vecs[2]);
}
template <typename vtype, typename reg_t = typename vtype::reg_t>
X86_SIMD_SORT_FINLINE void optimal_sort_8(reg_t *vecs) {
COEX<vtype>(vecs[0], vecs[2]);
COEX<vtype>(vecs[1], vecs[3]);
COEX<vtype>(vecs[4], vecs[6]);
COEX<vtype>(vecs[5], vecs[7]);
COEX<vtype>(vecs[0], vecs[4]);
COEX<vtype>(vecs[1], vecs[5]);
COEX<vtype>(vecs[2], vecs[6]);
COEX<vtype>(vecs[3], vecs[7]);
COEX<vtype>(vecs[0], vecs[1]);
COEX<vtype>(vecs[2], vecs[3]);
COEX<vtype>(vecs[4], vecs[5]);
COEX<vtype>(vecs[6], vecs[7]);
COEX<vtype>(vecs[2], vecs[4]);
COEX<vtype>(vecs[3], vecs[5]);
COEX<vtype>(vecs[1], vecs[4]);
COEX<vtype>(vecs[3], vecs[6]);
COEX<vtype>(vecs[1], vecs[2]);
COEX<vtype>(vecs[3], vecs[4]);
COEX<vtype>(vecs[5], vecs[6]);
}
template <typename vtype, typename reg_t = typename vtype::reg_t>
X86_SIMD_SORT_FINLINE void optimal_sort_16(reg_t *vecs) {
COEX<vtype>(vecs[0], vecs[13]);
COEX<vtype>(vecs[1], vecs[12]);
COEX<vtype>(vecs[2], vecs[15]);
COEX<vtype>(vecs[3], vecs[14]);
COEX<vtype>(vecs[4], vecs[8]);
COEX<vtype>(vecs[5], vecs[6]);
COEX<vtype>(vecs[7], vecs[11]);
COEX<vtype>(vecs[9], vecs[10]);
COEX<vtype>(vecs[0], vecs[5]);
COEX<vtype>(vecs[1], vecs[7]);
COEX<vtype>(vecs[2], vecs[9]);
COEX<vtype>(vecs[3], vecs[4]);
COEX<vtype>(vecs[6], vecs[13]);
COEX<vtype>(vecs[8], vecs[14]);
COEX<vtype>(vecs[10], vecs[15]);
COEX<vtype>(vecs[11], vecs[12]);
COEX<vtype>(vecs[0], vecs[1]);
COEX<vtype>(vecs[2], vecs[3]);
COEX<vtype>(vecs[4], vecs[5]);
COEX<vtype>(vecs[6], vecs[8]);
COEX<vtype>(vecs[7], vecs[9]);
COEX<vtype>(vecs[10], vecs[11]);
COEX<vtype>(vecs[12], vecs[13]);
COEX<vtype>(vecs[14], vecs[15]);
COEX<vtype>(vecs[0], vecs[2]);
COEX<vtype>(vecs[1], vecs[3]);
COEX<vtype>(vecs[4], vecs[10]);
COEX<vtype>(vecs[5], vecs[11]);
COEX<vtype>(vecs[6], vecs[7]);
COEX<vtype>(vecs[8], vecs[9]);
COEX<vtype>(vecs[12], vecs[14]);
COEX<vtype>(vecs[13], vecs[15]);
COEX<vtype>(vecs[1], vecs[2]);
COEX<vtype>(vecs[3], vecs[12]);
COEX<vtype>(vecs[4], vecs[6]);
COEX<vtype>(vecs[5], vecs[7]);
COEX<vtype>(vecs[8], vecs[10]);
COEX<vtype>(vecs[9], vecs[11]);
COEX<vtype>(vecs[13], vecs[14]);
COEX<vtype>(vecs[1], vecs[4]);
COEX<vtype>(vecs[2], vecs[6]);
COEX<vtype>(vecs[5], vecs[8]);
COEX<vtype>(vecs[7], vecs[10]);
COEX<vtype>(vecs[9], vecs[13]);
COEX<vtype>(vecs[11], vecs[14]);
COEX<vtype>(vecs[2], vecs[4]);
COEX<vtype>(vecs[3], vecs[6]);
COEX<vtype>(vecs[9], vecs[12]);
COEX<vtype>(vecs[11], vecs[13]);
COEX<vtype>(vecs[3], vecs[5]);
COEX<vtype>(vecs[6], vecs[8]);
COEX<vtype>(vecs[7], vecs[9]);
COEX<vtype>(vecs[10], vecs[12]);
COEX<vtype>(vecs[3], vecs[4]);
COEX<vtype>(vecs[5], vecs[6]);
COEX<vtype>(vecs[7], vecs[8]);
COEX<vtype>(vecs[9], vecs[10]);
COEX<vtype>(vecs[11], vecs[12]);
COEX<vtype>(vecs[6], vecs[7]);
COEX<vtype>(vecs[8], vecs[9]);
}
template <typename vtype, typename reg_t = typename vtype::reg_t>
X86_SIMD_SORT_FINLINE void optimal_sort_32(reg_t *vecs) {
COEX<vtype>(vecs[0], vecs[1]);
COEX<vtype>(vecs[2], vecs[3]);
COEX<vtype>(vecs[4], vecs[5]);
COEX<vtype>(vecs[6], vecs[7]);
COEX<vtype>(vecs[8], vecs[9]);
COEX<vtype>(vecs[10], vecs[11]);
COEX<vtype>(vecs[12], vecs[13]);
COEX<vtype>(vecs[14], vecs[15]);
COEX<vtype>(vecs[16], vecs[17]);
COEX<vtype>(vecs[18], vecs[19]);
COEX<vtype>(vecs[20], vecs[21]);
COEX<vtype>(vecs[22], vecs[23]);
COEX<vtype>(vecs[24], vecs[25]);
COEX<vtype>(vecs[26], vecs[27]);
COEX<vtype>(vecs[28], vecs[29]);
COEX<vtype>(vecs[30], vecs[31]);
COEX<vtype>(vecs[0], vecs[2]);
COEX<vtype>(vecs[1], vecs[3]);
COEX<vtype>(vecs[4], vecs[6]);
COEX<vtype>(vecs[5], vecs[7]);
COEX<vtype>(vecs[8], vecs[10]);
COEX<vtype>(vecs[9], vecs[11]);
COEX<vtype>(vecs[12], vecs[14]);
COEX<vtype>(vecs[13], vecs[15]);
COEX<vtype>(vecs[16], vecs[18]);
COEX<vtype>(vecs[17], vecs[19]);
COEX<vtype>(vecs[20], vecs[22]);
COEX<vtype>(vecs[21], vecs[23]);
COEX<vtype>(vecs[24], vecs[26]);
COEX<vtype>(vecs[25], vecs[27]);
COEX<vtype>(vecs[28], vecs[30]);
COEX<vtype>(vecs[29], vecs[31]);
COEX<vtype>(vecs[0], vecs[4]);
COEX<vtype>(vecs[1], vecs[5]);
COEX<vtype>(vecs[2], vecs[6]);
COEX<vtype>(vecs[3], vecs[7]);
COEX<vtype>(vecs[8], vecs[12]);
COEX<vtype>(vecs[9], vecs[13]);
COEX<vtype>(vecs[10], vecs[14]);
COEX<vtype>(vecs[11], vecs[15]);
COEX<vtype>(vecs[16], vecs[20]);
COEX<vtype>(vecs[17], vecs[21]);
COEX<vtype>(vecs[18], vecs[22]);
COEX<vtype>(vecs[19], vecs[23]);
COEX<vtype>(vecs[24], vecs[28]);
COEX<vtype>(vecs[25], vecs[29]);
COEX<vtype>(vecs[26], vecs[30]);
COEX<vtype>(vecs[27], vecs[31]);
COEX<vtype>(vecs[0], vecs[8]);
COEX<vtype>(vecs[1], vecs[9]);
COEX<vtype>(vecs[2], vecs[10]);
COEX<vtype>(vecs[3], vecs[11]);
COEX<vtype>(vecs[4], vecs[12]);
COEX<vtype>(vecs[5], vecs[13]);
COEX<vtype>(vecs[6], vecs[14]);
COEX<vtype>(vecs[7], vecs[15]);
COEX<vtype>(vecs[16], vecs[24]);
COEX<vtype>(vecs[17], vecs[25]);
COEX<vtype>(vecs[18], vecs[26]);
COEX<vtype>(vecs[19], vecs[27]);
COEX<vtype>(vecs[20], vecs[28]);
COEX<vtype>(vecs[21], vecs[29]);
COEX<vtype>(vecs[22], vecs[30]);
COEX<vtype>(vecs[23], vecs[31]);
COEX<vtype>(vecs[0], vecs[16]);
COEX<vtype>(vecs[1], vecs[8]);
COEX<vtype>(vecs[2], vecs[4]);
COEX<vtype>(vecs[3], vecs[12]);
COEX<vtype>(vecs[5], vecs[10]);
COEX<vtype>(vecs[6], vecs[9]);
COEX<vtype>(vecs[7], vecs[14]);
COEX<vtype>(vecs[11], vecs[13]);
COEX<vtype>(vecs[15], vecs[31]);
COEX<vtype>(vecs[17], vecs[24]);
COEX<vtype>(vecs[18], vecs[20]);
COEX<vtype>(vecs[19], vecs[28]);
COEX<vtype>(vecs[21], vecs[26]);
COEX<vtype>(vecs[22], vecs[25]);
COEX<vtype>(vecs[23], vecs[30]);
COEX<vtype>(vecs[27], vecs[29]);
COEX<vtype>(vecs[1], vecs[2]);
COEX<vtype>(vecs[3], vecs[5]);
COEX<vtype>(vecs[4], vecs[8]);
COEX<vtype>(vecs[6], vecs[22]);
COEX<vtype>(vecs[7], vecs[11]);
COEX<vtype>(vecs[9], vecs[25]);
COEX<vtype>(vecs[10], vecs[12]);
COEX<vtype>(vecs[13], vecs[14]);
COEX<vtype>(vecs[17], vecs[18]);
COEX<vtype>(vecs[19], vecs[21]);
COEX<vtype>(vecs[20], vecs[24]);
COEX<vtype>(vecs[23], vecs[27]);
COEX<vtype>(vecs[26], vecs[28]);
COEX<vtype>(vecs[29], vecs[30]);
COEX<vtype>(vecs[1], vecs[17]);
COEX<vtype>(vecs[2], vecs[18]);
COEX<vtype>(vecs[3], vecs[19]);
COEX<vtype>(vecs[4], vecs[20]);
COEX<vtype>(vecs[5], vecs[10]);
COEX<vtype>(vecs[7], vecs[23]);
COEX<vtype>(vecs[8], vecs[24]);
COEX<vtype>(vecs[11], vecs[27]);
COEX<vtype>(vecs[12], vecs[28]);
COEX<vtype>(vecs[13], vecs[29]);
COEX<vtype>(vecs[14], vecs[30]);
COEX<vtype>(vecs[21], vecs[26]);
COEX<vtype>(vecs[3], vecs[17]);
COEX<vtype>(vecs[4], vecs[16]);
COEX<vtype>(vecs[5], vecs[21]);
COEX<vtype>(vecs[6], vecs[18]);
COEX<vtype>(vecs[7], vecs[9]);
COEX<vtype>(vecs[8], vecs[20]);
COEX<vtype>(vecs[10], vecs[26]);
COEX<vtype>(vecs[11], vecs[23]);
COEX<vtype>(vecs[13], vecs[25]);
COEX<vtype>(vecs[14], vecs[28]);
COEX<vtype>(vecs[15], vecs[27]);
COEX<vtype>(vecs[22], vecs[24]);
COEX<vtype>(vecs[1], vecs[4]);
COEX<vtype>(vecs[3], vecs[8]);
COEX<vtype>(vecs[5], vecs[16]);
COEX<vtype>(vecs[7], vecs[17]);
COEX<vtype>(vecs[9], vecs[21]);
COEX<vtype>(vecs[10], vecs[22]);
COEX<vtype>(vecs[11], vecs[19]);
COEX<vtype>(vecs[12], vecs[20]);
COEX<vtype>(vecs[14], vecs[24]);
COEX<vtype>(vecs[15], vecs[26]);
COEX<vtype>(vecs[23], vecs[28]);
COEX<vtype>(vecs[27], vecs[30]);
COEX<vtype>(vecs[2], vecs[5]);
COEX<vtype>(vecs[7], vecs[8]);
COEX<vtype>(vecs[9], vecs[18]);
COEX<vtype>(vecs[11], vecs[17]);
COEX<vtype>(vecs[12], vecs[16]);
COEX<vtype>(vecs[13], vecs[22]);
COEX<vtype>(vecs[14], vecs[20]);
COEX<vtype>(vecs[15], vecs[19]);
COEX<vtype>(vecs[23], vecs[24]);
COEX<vtype>(vecs[26], vecs[29]);
COEX<vtype>(vecs[2], vecs[4]);
COEX<vtype>(vecs[6], vecs[12]);
COEX<vtype>(vecs[9], vecs[16]);
COEX<vtype>(vecs[10], vecs[11]);
COEX<vtype>(vecs[13], vecs[17]);
COEX<vtype>(vecs[14], vecs[18]);
COEX<vtype>(vecs[15], vecs[22]);
COEX<vtype>(vecs[19], vecs[25]);
COEX<vtype>(vecs[20], vecs[21]);
COEX<vtype>(vecs[27], vecs[29]);
COEX<vtype>(vecs[5], vecs[6]);
COEX<vtype>(vecs[8], vecs[12]);
COEX<vtype>(vecs[9], vecs[10]);
COEX<vtype>(vecs[11], vecs[13]);
COEX<vtype>(vecs[14], vecs[16]);
COEX<vtype>(vecs[15], vecs[17]);
COEX<vtype>(vecs[18], vecs[20]);
COEX<vtype>(vecs[19], vecs[23]);
COEX<vtype>(vecs[21], vecs[22]);
COEX<vtype>(vecs[25], vecs[26]);
COEX<vtype>(vecs[3], vecs[5]);
COEX<vtype>(vecs[6], vecs[7]);
COEX<vtype>(vecs[8], vecs[9]);
COEX<vtype>(vecs[10], vecs[12]);
COEX<vtype>(vecs[11], vecs[14]);
COEX<vtype>(vecs[13], vecs[16]);
COEX<vtype>(vecs[15], vecs[18]);
COEX<vtype>(vecs[17], vecs[20]);
COEX<vtype>(vecs[19], vecs[21]);
COEX<vtype>(vecs[22], vecs[23]);
COEX<vtype>(vecs[24], vecs[25]);
COEX<vtype>(vecs[26], vecs[28]);
COEX<vtype>(vecs[3], vecs[4]);
COEX<vtype>(vecs[5], vecs[6]);
COEX<vtype>(vecs[7], vecs[8]);
COEX<vtype>(vecs[9], vecs[10]);
COEX<vtype>(vecs[11], vecs[12]);
COEX<vtype>(vecs[13], vecs[14]);
COEX<vtype>(vecs[15], vecs[16]);
COEX<vtype>(vecs[17], vecs[18]);
COEX<vtype>(vecs[19], vecs[20]);
COEX<vtype>(vecs[21], vecs[22]);
COEX<vtype>(vecs[23], vecs[24]);
COEX<vtype>(vecs[25], vecs[26]);
COEX<vtype>(vecs[27], vecs[28]);
}


@@ -0,0 +1,88 @@
/*
* Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
* Copyright (c) 2021 Serge Sans Paille. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
// This implementation is based on x86-simd-sort (https://github.com/intel/x86-simd-sort)
template <typename vtype, typename mm_t>
X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b);
template <typename vtype, typename type_t>
X86_SIMD_SORT_INLINE type_t get_pivot(type_t *arr, const arrsize_t left,
const arrsize_t right) {
using reg_t = typename vtype::reg_t;
type_t samples[vtype::numlanes];
arrsize_t delta = (right - left) / vtype::numlanes;
for (int i = 0; i < vtype::numlanes; i++) {
samples[i] = arr[left + i * delta];
}
reg_t rand_vec = vtype::loadu(samples);
reg_t sort = vtype::sort_vec(rand_vec);
return ((type_t *)&sort)[vtype::numlanes / 2];
}
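/*
 * Illustrative example of the sampling above, assuming 8 lanes: with
 * left == 0 and right == 800, delta == 100, so lanes are filled from indices
 * 0, 100, ..., 700; the sampled vector is sorted in-register and lane 4 (the
 * upper median) is returned as the pivot.
 */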
template <typename vtype, typename type_t>
X86_SIMD_SORT_INLINE type_t get_pivot_blocks(type_t *arr, const arrsize_t left,
const arrsize_t right) {
if (right - left <= 1024) {
return get_pivot<vtype>(arr, left, right);
}
using reg_t = typename vtype::reg_t;
constexpr int numVecs = 5;
arrsize_t width = (right - vtype::numlanes) - left;
arrsize_t delta = width / numVecs;
reg_t vecs[numVecs];
// Load data
for (int i = 0; i < numVecs; i++) {
vecs[i] = vtype::loadu(arr + left + delta * i);
}
// Implement sorting network (from
// https://bertdobbelaere.github.io/sorting_networks.html)
COEX<vtype>(vecs[0], vecs[3]);
COEX<vtype>(vecs[1], vecs[4]);
COEX<vtype>(vecs[0], vecs[2]);
COEX<vtype>(vecs[1], vecs[3]);
COEX<vtype>(vecs[0], vecs[1]);
COEX<vtype>(vecs[2], vecs[4]);
COEX<vtype>(vecs[1], vecs[2]);
COEX<vtype>(vecs[3], vecs[4]);
COEX<vtype>(vecs[2], vecs[3]);
// Calculate median of the middle vector
reg_t &vec = vecs[numVecs / 2];
vec = vtype::sort_vec(vec);
type_t data[vtype::numlanes];
vtype::storeu(data, vec);
return data[vtype::numlanes / 2];
}


@@ -42,15 +42,10 @@ import static java.lang.String.LATIN1;
final class StringUTF16 {
// Return a new byte array for a UTF16-coded string for len chars
// Throw an exception if out of range
public static byte[] newBytesFor(int len) {
if (len < 0) {
throw new NegativeArraySizeException();
}
if (len > MAX_LENGTH) {
throw new OutOfMemoryError("UTF16 String size is " + len +
", should be less than " + MAX_LENGTH);
}
return new byte[len << 1];
return new byte[newBytesLength(len)];
}
// Check the size of a UTF16-coded string
@@ -59,7 +54,7 @@ final class StringUTF16 {
if (len < 0) {
throw new NegativeArraySizeException();
}
if (len > MAX_LENGTH) {
if (len >= MAX_LENGTH) {
throw new OutOfMemoryError("UTF16 String size is " + len +
", should be less than " + MAX_LENGTH);
}


@@ -1647,7 +1647,7 @@ public class Thread implements Runnable {
* interrupt the wait.
* For more information, see
* <a href="{@docRoot}/java.base/java/lang/doc-files/threadPrimitiveDeprecation.html">Why
* are Thread.stop, Thread.suspend and Thread.resume Deprecated?</a>.
* is Thread.stop deprecated and the ability to stop a thread removed?</a>.
*/
@Deprecated(since="1.2", forRemoval=true)
public final void stop() {
@@ -1788,44 +1788,6 @@ public class Thread implements Runnable {
return eetop != 0;
}
/**
* Throws {@code UnsupportedOperationException}.
*
* @throws UnsupportedOperationException always
*
* @deprecated This method was originally specified to suspend a thread.
* It was inherently deadlock-prone. If the target thread held a lock on
* a monitor protecting a critical system resource when it was suspended,
* no thread could access the resource until the target thread was resumed.
* If the thread intending to resume the target thread attempted to lock
* the monitor prior to calling {@code resume}, deadlock would result.
* Such deadlocks typically manifested themselves as "frozen" processes.
* For more information, see
* <a href="{@docRoot}/java.base/java/lang/doc-files/threadPrimitiveDeprecation.html">Why
* are Thread.stop, Thread.suspend and Thread.resume Deprecated?</a>.
*/
@Deprecated(since="1.2", forRemoval=true)
public final void suspend() {
throw new UnsupportedOperationException();
}
/**
* Throws {@code UnsupportedOperationException}.
*
* @throws UnsupportedOperationException always
*
* @deprecated This method was originally specified to resume a thread
* suspended with {@link #suspend()}. Suspending a thread was
* inherently deadlock-prone.
* For more information, see
* <a href="{@docRoot}/java.base/java/lang/doc-files/threadPrimitiveDeprecation.html">Why
* are Thread.stop, Thread.suspend and Thread.resume Deprecated?</a>.
*/
@Deprecated(since="1.2", forRemoval=true)
public final void resume() {
throw new UnsupportedOperationException();
}
/**
* Changes the priority of this thread.
*


@@ -1,5 +1,5 @@
/*
* Copyright (c) 1995, 2022, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1995, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -554,17 +554,6 @@ public class ThreadGroup implements Thread.UncaughtExceptionHandler {
return i;
}
/**
* Throws {@code UnsupportedOperationException}.
*
* @deprecated This method was originally specified to stop all threads in
* the thread group. It was inherently unsafe.
*/
@Deprecated(since="1.2", forRemoval=true)
public final void stop() {
throw new UnsupportedOperationException();
}
/**
* Interrupts all {@linkplain Thread#isAlive() live} platform threads in
* this thread group and its subgroups.
@@ -587,28 +576,6 @@ public class ThreadGroup implements Thread.UncaughtExceptionHandler {
}
}
/**
* Throws {@code UnsupportedOperationException}.
*
* @deprecated This method was originally specified to suspend all threads
* in the thread group.
*/
@Deprecated(since="1.2", forRemoval=true)
public final void suspend() {
throw new UnsupportedOperationException();
}
/**
* Throws {@code UnsupportedOperationException}.
*
* @deprecated This method was originally specified to resume all threads
* in the thread group.
*/
@Deprecated(since="1.2", forRemoval=true)
public final void resume() {
throw new UnsupportedOperationException();
}
/**
* Does nothing.
*


@@ -1475,6 +1475,9 @@ public sealed interface ClassFile
/** The class major version of JAVA_22. */
int JAVA_22_VERSION = 66;
/** 67 */
int JAVA_23_VERSION = 67;
/**
* A minor version number indicating a class uses preview features
* of a Java SE version since 12, for major versions {@value
@@ -1486,7 +1489,7 @@ public sealed interface ClassFile
* {@return the latest major Java version}
*/
static int latestMajorVersion() {
return JAVA_22_VERSION;
return JAVA_23_VERSION;
}
/**


@@ -202,7 +202,7 @@ public sealed interface ModuleAttribute
}
/**
* Sets the module flags
* Sets the module version
* @param version the module version
* @return this builder
*/


@@ -1,6 +1,6 @@
<!doctype html>
<!--
Copyright (c) 2005, 2022, Oracle and/or its affiliates. All rights reserved.
Copyright (c) 2005, 2023, Oracle and/or its affiliates. All rights reserved.
DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
This code is free software; you can redistribute it and/or modify it
@@ -158,173 +158,5 @@ wouldn't respond to <code>Thread.stop</code> either.</em> Such
cases include deliberate denial-of-service attacks, and I/O
operations for which thread.stop and thread.interrupt do not work
properly.</p>
<hr>
<h2>Why are <code>Thread.suspend</code> and
<code>Thread.resume</code> deprecated and the ability to suspend or
resume a thread removed?</h2>
<p><code>Thread.suspend</code> was inherently deadlock-prone. If the
target thread held a lock on a monitor protecting a critical
system resource when it is suspended, no thread could access the
resource until the target thread was resumed. If the thread intending
to resume the target thread attempted to lock the monitor prior
to calling <code>resume</code>, deadlock resulted. Such deadlocks
typically manifest themselves as "frozen" processes.</p>
<hr>
<h2>What should I use instead of <code>Thread.suspend</code> and
<code>Thread.resume</code>?</h2>
<p>As with <code>Thread.stop</code>, the prudent approach is to
have the "target thread" poll a variable indicating the desired
state of the thread (active or suspended). When the desired state
is suspended, the thread waits using <code>Object.wait</code>. When
the thread is resumed, the target thread is notified using
<code>Object.notify</code>.</p>
<p>For example, suppose your applet contains the following
mousePressed event handler, which toggles the state of a thread
called <code>blinker</code>:</p>
<pre>
private boolean threadSuspended;
Public void mousePressed(MouseEvent e) {
e.consume();
if (threadSuspended)
blinker.resume();
else
blinker.suspend(); // DEADLOCK-PRONE!
threadSuspended = !threadSuspended;
}
</pre>
You can avoid the use of <code>Thread.suspend</code> and
<code>Thread.resume</code> by replacing the event handler above
with:
<pre>
public synchronized void mousePressed(MouseEvent e) {
e.consume();
threadSuspended = !threadSuspended;
if (!threadSuspended)
notify();
}
</pre>
and adding the following code to the "run loop":
<pre>
synchronized(this) {
while (threadSuspended)
wait();
}
</pre>
The <code>wait</code> method throws the
<code>InterruptedException</code>, so it must be inside a <code>try
... catch</code> clause. It's fine to put it in the same clause as
the <code>sleep</code>. The check should follow (rather than
precede) the <code>sleep</code> so the window is immediately
repainted when the thread is "resumed." The resulting
<code>run</code> method follows:
<pre>
public void run() {
while (true) {
try {
Thread.sleep(interval);
synchronized(this) {
while (threadSuspended)
wait();
}
} catch (InterruptedException e){
}
repaint();
}
}
</pre>
Note that the <code>notify</code> in the <code>mousePressed</code>
method and the <code>wait</code> in the <code>run</code> method are
inside <code>synchronized</code> blocks. This is required by the
language, and ensures that <code>wait</code> and
<code>notify</code> are properly serialized. In practical terms,
this eliminates race conditions that could cause the "suspended"
thread to miss a <code>notify</code> and remain suspended
indefinitely.
<p>While the cost of synchronization in Java is decreasing as the
platform matures, it will never be free. A simple trick can be used
to remove the synchronization that we've added to each iteration of
the "run loop." The synchronized block that was added is replaced
by a slightly more complex piece of code that enters a synchronized
block only if the thread has actually been suspended:</p>
<pre>
if (threadSuspended) {
synchronized(this) {
while (threadSuspended)
wait();
}
}
</pre>
<p>In the absence of explicit synchronization,
<code>threadSuspended</code> must be made <code>volatile</code> to ensure
prompt communication of the suspend-request.</p>
The resulting <code>run</code> method is:
<pre>
private volatile boolean threadSuspended;
public void run() {
while (true) {
try {
Thread.sleep(interval);
if (threadSuspended) {
synchronized(this) {
while (threadSuspended)
wait();
}
}
} catch (InterruptedException e){
}
repaint();
}
}
</pre>
<hr>
<h2>Can I combine the two techniques to produce a thread that may
be safely "stopped" or "suspended"?</h2>
Yes, it's reasonably straightforward. The one subtlety is that the
target thread may already be suspended at the time that another
thread tries to stop it. If the <code>stop</code> method merely sets
the state variable (<code>blinker</code>) to null, the target thread
will remain suspended (waiting on the monitor), rather than exiting
gracefully as it should. If the applet is restarted, multiple
threads could end up waiting on the monitor at the same time,
resulting in erratic behavior.
<p>To rectify this situation, the <code>stop</code> method must ensure
that the target thread resumes immediately if it is suspended. Once
the target thread resumes, it must recognize immediately that it
has been stopped, and exit gracefully. Here's how the resulting
<code>run</code> and <code>stop</code> methods look:</p>
<pre>
public void run() {
Thread thisThread = Thread.currentThread();
while (blinker == thisThread) {
try {
Thread.sleep(interval);
synchronized(this) {
while (threadSuspended &amp;&amp; blinker==thisThread)
wait();
}
} catch (InterruptedException e){
}
repaint();
}
}
public synchronized void stop() {
blinker = null;
notify();
}
</pre>
If the <code>stop</code> method calls <code>Thread.interrupt</code>, as
described above, it needn't call <code>notify</code> as well, but it
still must be synchronized. This ensures that the target thread
won't miss an interrupt due to a race condition.
</body>
</html>


@@ -631,6 +631,9 @@ public sealed interface MemoryLayout
* <li>The accessed memory segment must be
* {@link MemorySegment#isAccessibleBy(Thread) accessible} from the thread
* performing the access operation, or a {@link WrongThreadException} is thrown.</li>
* <li>For write operations, the accessed memory segment must not be
* {@link MemorySegment#isReadOnly() read only}, or an
* {@link IllegalArgumentException} is thrown.</li>
* <li>The {@linkplain MemorySegment#scope() scope} associated with the accessed
* segment must be {@linkplain MemorySegment.Scope#isAlive() alive}, or an
* {@link IllegalStateException} is thrown.</li>


@@ -869,7 +869,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* this segment is not {@linkplain Scope#isAlive() alive}
* @throws WrongThreadException if this method is called from a thread {@code T},
* such that {@code isAccessibleBy(T) == false}
* @throws UnsupportedOperationException if this segment is
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
*/
MemorySegment fill(byte value);
@@ -894,7 +894,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* {@code src} is not {@linkplain Scope#isAlive() alive}
* @throws WrongThreadException if this method is called from a thread {@code T},
* such that {@code src.isAccessibleBy(T) == false}
* @throws UnsupportedOperationException if this segment is
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
* @return this segment
*/
@@ -1269,6 +1269,8 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* this segment is not {@linkplain Scope#isAlive() alive}
* @throws WrongThreadException if this method is called from a thread {@code T},
* such that {@code isAccessibleBy(T) == false}
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
*/
void setString(long offset, String str);
@@ -1306,6 +1308,8 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* such that {@code isAccessibleBy(T) == false}
* @throws IllegalArgumentException if {@code charset} is not a
* {@linkplain StandardCharsets standard charset}
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
*/
void setString(long offset, String str, Charset charset);
@@ -1493,7 +1497,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* @throws IndexOutOfBoundsException if {@code dstOffset > dstSegment.byteSize() - bytes}
* @throws IndexOutOfBoundsException if either {@code srcOffset},
* {@code dstOffset} or {@code bytes} are {@code < 0}
* @throws UnsupportedOperationException if {@code dstSegment} is
* @throws IllegalArgumentException if {@code dstSegment} is
* {@linkplain #isReadOnly() read-only}
*/
@ForceInline
@@ -1552,7 +1556,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* {@code dstSegment} is not {@linkplain Scope#isAlive() alive}
* @throws WrongThreadException if this method is called from a thread {@code T},
* such that {@code dstSegment.isAccessibleBy(T) == false}
* @throws UnsupportedOperationException if {@code dstSegment} is {@linkplain #isReadOnly() read-only}
* @throws IllegalArgumentException if {@code dstSegment} is {@linkplain #isReadOnly() read-only}
* @throws IndexOutOfBoundsException if {@code elementCount * srcLayout.byteSize()} overflows
* @throws IndexOutOfBoundsException if {@code elementCount * dtsLayout.byteSize()} overflows
* @throws IndexOutOfBoundsException if {@code srcOffset > srcSegment.byteSize() - (elementCount * srcLayout.byteSize())}
@@ -1605,7 +1609,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* <a href="MemorySegment.html#segment-alignment">incompatible with the alignment constraint</a>
* in the provided layout
* @throws IndexOutOfBoundsException if {@code offset > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
*/
void set(ValueLayout.OfByte layout, long offset, byte value);
@@ -1643,7 +1647,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* <a href="MemorySegment.html#segment-alignment">incompatible with the alignment constraint</a>
* in the provided layout
* @throws IndexOutOfBoundsException if {@code offset > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
*/
void set(ValueLayout.OfBoolean layout, long offset, boolean value);
@@ -1681,7 +1685,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* <a href="MemorySegment.html#segment-alignment">incompatible with the alignment constraint</a>
* in the provided layout
* @throws IndexOutOfBoundsException if {@code offset > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
*/
void set(ValueLayout.OfChar layout, long offset, char value);
@@ -1719,7 +1723,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* <a href="MemorySegment.html#segment-alignment">incompatible with the alignment constraint</a>
* in the provided layout
* @throws IndexOutOfBoundsException if {@code offset > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
*/
void set(ValueLayout.OfShort layout, long offset, short value);
@@ -1757,7 +1761,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* <a href="MemorySegment.html#segment-alignment">incompatible with the alignment constraint</a>
* in the provided layout
* @throws IndexOutOfBoundsException if {@code offset > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
*/
void set(ValueLayout.OfInt layout, long offset, int value);
@@ -1795,7 +1799,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* <a href="MemorySegment.html#segment-alignment">incompatible with the alignment constraint</a>
* in the provided layout
* @throws IndexOutOfBoundsException if {@code offset > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
*/
void set(ValueLayout.OfFloat layout, long offset, float value);
@@ -1833,7 +1837,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* <a href="MemorySegment.html#segment-alignment">incompatible with the alignment constraint</a>
* in the provided layout
* @throws IndexOutOfBoundsException if {@code offset > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
*/
void set(ValueLayout.OfLong layout, long offset, long value);
@@ -1871,7 +1875,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* <a href="MemorySegment.html#segment-alignment">incompatible with the alignment constraint</a>
* in the provided layout
* @throws IndexOutOfBoundsException if {@code offset > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
*/
void set(ValueLayout.OfDouble layout, long offset, double value);
@@ -1921,8 +1925,10 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* @throws IndexOutOfBoundsException if {@code offset > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is
* {@linkplain #isReadOnly() read-only}
* @throws UnsupportedOperationException if {@code value} is not a
* @throws IllegalArgumentException if {@code value} is not a
* {@linkplain #isNative() native} segment
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
*/
void set(AddressLayout layout, long offset, MemorySegment value);
@@ -2055,7 +2061,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* @throws IllegalArgumentException if {@code layout.byteAlignment() > layout.byteSize()}
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize()} overflows
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize() > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is {@linkplain #isReadOnly() read-only}
* @throws IllegalArgumentException if this segment is {@linkplain #isReadOnly() read-only}
*/
void setAtIndex(ValueLayout.OfByte layout, long index, byte value);
@@ -2078,7 +2084,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* @throws IllegalArgumentException if {@code layout.byteAlignment() > layout.byteSize()}
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize()} overflows
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize() > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is {@linkplain #isReadOnly() read-only}
* @throws IllegalArgumentException if this segment is {@linkplain #isReadOnly() read-only}
*/
void setAtIndex(ValueLayout.OfBoolean layout, long index, boolean value);
@@ -2101,7 +2107,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* @throws IllegalArgumentException if {@code layout.byteAlignment() > layout.byteSize()}
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize()} overflows
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize() > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is {@linkplain #isReadOnly() read-only}
* @throws IllegalArgumentException if this segment is {@linkplain #isReadOnly() read-only}
*/
void setAtIndex(ValueLayout.OfShort layout, long index, short value);
@@ -2146,7 +2152,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* @throws IllegalArgumentException if {@code layout.byteAlignment() > layout.byteSize()}
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize()} overflows
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize() > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is {@linkplain #isReadOnly() read-only}
* @throws IllegalArgumentException if this segment is {@linkplain #isReadOnly() read-only}
*/
void setAtIndex(ValueLayout.OfInt layout, long index, int value);
@@ -2191,7 +2197,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* @throws IllegalArgumentException if {@code layout.byteAlignment() > layout.byteSize()}
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize()} overflows
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize() > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is {@linkplain #isReadOnly() read-only}
* @throws IllegalArgumentException if this segment is {@linkplain #isReadOnly() read-only}
*/
void setAtIndex(ValueLayout.OfFloat layout, long index, float value);
@@ -2236,7 +2242,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* @throws IllegalArgumentException if {@code layout.byteAlignment() > layout.byteSize()}
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize()} overflows
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize() > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is {@linkplain #isReadOnly() read-only}
* @throws IllegalArgumentException if this segment is {@linkplain #isReadOnly() read-only}
*/
void setAtIndex(ValueLayout.OfLong layout, long index, long value);
@@ -2281,7 +2287,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* @throws IllegalArgumentException if {@code layout.byteAlignment() > layout.byteSize()}
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize()} overflows
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize() > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is {@linkplain #isReadOnly() read-only}
* @throws IllegalArgumentException if this segment is {@linkplain #isReadOnly() read-only}
*/
void setAtIndex(ValueLayout.OfDouble layout, long index, double value);
@@ -2336,7 +2342,9 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize()} overflows
* @throws IndexOutOfBoundsException if {@code index * layout.byteSize() > byteSize() - layout.byteSize()}
* @throws UnsupportedOperationException if this segment is {@linkplain #isReadOnly() read-only}
* @throws UnsupportedOperationException if {@code value} is not a {@linkplain #isNative() native} segment
* @throws IllegalArgumentException if {@code value} is not a {@linkplain #isNative() native} segment
* @throws IllegalArgumentException if this segment is
* {@linkplain #isReadOnly() read-only}
*/
void setAtIndex(AddressLayout layout, long index, MemorySegment value);
@@ -2460,7 +2468,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
* <a href="MemorySegment.html#segment-alignment">incompatible with the alignment constraint</a>
* in the source element layout
* @throws IllegalArgumentException if {@code dstLayout.byteAlignment() > dstLayout.byteSize()}
* @throws UnsupportedOperationException if {@code dstSegment} is {@linkplain #isReadOnly() read-only}
* @throws IllegalArgumentException if {@code dstSegment} is {@linkplain #isReadOnly() read-only}
* @throws IndexOutOfBoundsException if {@code elementCount * dstLayout.byteSize()} overflows
* @throws IndexOutOfBoundsException if {@code dstOffset > dstSegment.byteSize() - (elementCount * dstLayout.byteSize())}
* @throws IndexOutOfBoundsException if {@code srcIndex > srcArray.length - elementCount}
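
In each adjacent pair of @throws clauses above, the UnsupportedOperationException variant reads as the line being removed and the IllegalArgumentException variant as its replacement: writing through a read-only segment, or storing a non-native segment via an AddressLayout, is now documented to throw IllegalArgumentException. A minimal, self-contained sketch of the read-only case (not part of the patch; hypothetical class name, assumes a JDK with the finalized FFM API and this change applied):

import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;

class ReadOnlyWriteDemo {
    public static void main(String[] args) {
        try (Arena arena = Arena.ofConfined()) {
            // A read-only view over four ints; any write through it must fail.
            MemorySegment seg = arena.allocate(ValueLayout.JAVA_INT, 4).asReadOnly();
            try {
                seg.setAtIndex(ValueLayout.JAVA_INT, 0, 42);
            } catch (IllegalArgumentException e) {
                // Per the revised @throws clauses above; previously documented
                // as UnsupportedOperationException.
                System.out.println("write rejected: " + e.getMessage());
            }
        }
    }
}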


@@ -350,7 +350,7 @@ public interface SegmentAllocator {
*
* @param layout the layout of the block of memory to be allocated
* @param value the value to be set in the newly allocated memory segment
* @throws UnsupportedOperationException if {@code value} is not
* @throws IllegalArgumentException if {@code value} is not
* a {@linkplain MemorySegment#isNative() native} segment
*/
default MemorySegment allocateFrom(AddressLayout layout, MemorySegment value) {
@@ -670,9 +670,11 @@ public interface SegmentAllocator {
*
* @param segment the segment from which the returned allocator should slice from
* @return a new slicing allocator
* @throws IllegalArgumentException if the {@code segment} is
* {@linkplain MemorySegment#isReadOnly() read-only}
*/
static SegmentAllocator slicingAllocator(MemorySegment segment) {
Objects.requireNonNull(segment);
assertWritable(segment);
return new SlicingAllocator(segment);
}
@@ -700,9 +702,19 @@ public interface SegmentAllocator {
* @param segment the memory segment to be recycled by the returned allocator
* @return an allocator that recycles an existing segment upon each new
* allocation request
* @throws IllegalArgumentException if the {@code segment} is
* {@linkplain MemorySegment#isReadOnly() read-only}
*/
static SegmentAllocator prefixAllocator(MemorySegment segment) {
return (AbstractMemorySegmentImpl)Objects.requireNonNull(segment);
assertWritable(segment);
return (AbstractMemorySegmentImpl)segment;
}
private static void assertWritable(MemorySegment segment) {
// Implicit null check
if (segment.isReadOnly()) {
throw new IllegalArgumentException("read-only segment");
}
}
@ForceInline
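
In the prefixAllocator hunk above, the single return of (AbstractMemorySegmentImpl)Objects.requireNonNull(segment) reads as the removed body; the new body routes through assertWritable (which also performs the implicit null check) before the cast. Together with the allocateFrom and slicingAllocator changes, a read-only or non-native argument now surfaces as IllegalArgumentException. A minimal sketch (not part of the patch; hypothetical class name, assumes a JDK with this change applied):

import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.SegmentAllocator;
import java.lang.foreign.ValueLayout;

class AllocatorArgumentChecks {
    public static void main(String[] args) {
        try (Arena arena = Arena.ofConfined()) {
            // A heap segment is not native, so it cannot be stored via an AddressLayout;
            // documented (after this change) to throw IllegalArgumentException.
            MemorySegment heap = MemorySegment.ofArray(new byte[8]);
            try {
                arena.allocateFrom(ValueLayout.ADDRESS, heap);
            } catch (IllegalArgumentException e) {
                System.out.println("non-native value rejected: " + e.getMessage());
            }
            // A read-only segment can no longer back a slicing (or prefix) allocator.
            MemorySegment readOnly = arena.allocate(64).asReadOnly();
            try {
                SegmentAllocator.slicingAllocator(readOnly);
            } catch (IllegalArgumentException e) {
                System.out.println("read-only segment rejected: " + e.getMessage());
            }
        }
    }
}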


@@ -1841,7 +1841,7 @@ public class MethodHandles {
* <a href="MethodHandles.Lookup.html#secmgr">refuses access</a>
* @throws NullPointerException if {@code bytes} is {@code null}
* @since 9
* @see Lookup#privateLookupIn
* @see MethodHandles#privateLookupIn
* @see Lookup#dropLookupMode
* @see ClassLoader#defineClass(String,byte[],int,int,ProtectionDomain)
*/
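
The corrected cross-reference points at MethodHandles.privateLookupIn, the static factory method; there is no privateLookupIn member on Lookup itself. A minimal sketch of that target in use (not part of the patch; hypothetical class name):

import java.lang.invoke.MethodHandles;

class PrivateLookupTarget {
    public static void main(String[] args) throws IllegalAccessException {
        // The @see target after this change: a static method on MethodHandles,
        // not a member of MethodHandles.Lookup.
        MethodHandles.Lookup lookup =
                MethodHandles.privateLookupIn(PrivateLookupTarget.class, MethodHandles.lookup());
        System.out.println(lookup);
    }
}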


@@ -294,6 +294,18 @@ public enum ClassFileFormatVersion {
* <cite>The Java Virtual Machine Specification, Java SE 22 Edition</cite></a>
*/
RELEASE_22(66),
/**
* The version introduced by the Java Platform, Standard Edition
* 23.
*
* @since 23
*
* @see <a
* href="https://docs.oracle.com/javase/specs/jvms/se23/html/index.html">
* <cite>The Java Virtual Machine Specification, Java SE 23 Edition</cite></a>
*/
RELEASE_23(67),
; // Reduce code churn when appending new constants
// Note to maintainers: when adding constants for newer releases,
@@ -309,7 +321,7 @@ public enum ClassFileFormatVersion {
* {@return the latest class file format version}
*/
public static ClassFileFormatVersion latest() {
return RELEASE_22;
return RELEASE_23;
}
/**
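
With RELEASE_23 (class-file major version 67) added, latest() is bumped from RELEASE_22 to RELEASE_23. A minimal sketch of the observable result (not part of the patch; hypothetical class name, assumes a JDK build containing this change):

import java.lang.reflect.ClassFileFormatVersion;

class LatestClassFileVersion {
    public static void main(String[] args) {
        ClassFileFormatVersion latest = ClassFileFormatVersion.latest();
        // On a build with this change: prints "RELEASE_23 -> major 67".
        System.out.println(latest + " -> major " + latest.major());
    }
}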


@@ -29,6 +29,7 @@ package java.nio;
import java.lang.foreign.MemorySegment;
import java.util.Objects;
import jdk.internal.util.ArraysSupport;
/**
#if[rw]
@@ -705,6 +706,9 @@ class Heap$Type$Buffer$RW$
addr, segment)));
}
public int hashCode() {
return ArraysSupport.vectorizedHashCode(hb, ix(position()), remaining(), 1, ArraysSupport.T_BYTE);
}
#end[byte]
@@ -733,6 +737,9 @@ class Heap$Type$Buffer$RW$
offset, segment);
}
public int hashCode() {
return ArraysSupport.vectorizedHashCode(hb, ix(position()), remaining(), 1, ArraysSupport.T_CHAR);
}
#end[char]
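
The template change adds a hashCode() override that delegates to ArraysSupport.vectorizedHashCode; the user-visible contract is unchanged, i.e. buffers that are equal (same remaining elements) still hash alike. A minimal sketch of that contract (not part of the patch; hypothetical class name):

import java.nio.ByteBuffer;

class HeapBufferHashDemo {
    public static void main(String[] args) {
        ByteBuffer a = ByteBuffer.wrap(new byte[] {1, 2, 3, 4});
        ByteBuffer b = ByteBuffer.wrap(new byte[] {9, 1, 2, 3, 4});
        b.position(1);                                    // same remaining sequence as a
        System.out.println(a.equals(b));                  // true: equality uses remaining elements
        System.out.println(a.hashCode() == b.hashCode()); // true: hash codes must agree with equals
    }
}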


@@ -88,7 +88,6 @@ import java.util.Arrays;
* <p>
* Below is an example of constructing a ChoiceFormat with arrays to format
* and parse values:
* <blockquote>
* {@snippet lang=java :
* double[] limits = {1,2,3,4,5,6,7};
* String[] dayOfWeekNames = {"Sun","Mon","Tue","Wed","Thur","Fri","Sat"};
@@ -100,34 +99,27 @@ import java.util.Arrays;
* + form.parse(form.format(i),status));
* }
* }
* </blockquote>
*
* <p>
* For more sophisticated patterns, {@code ChoiceFormat} can be used with
* {@link MessageFormat} to produce accurate forms for singular and plural:
* <blockquote>
* {@snippet lang=java :
* double[] filelimits = {0,1,2};
* String[] filepart = {"are no files","is one file","are {2} files"};
* ChoiceFormat fileform = new ChoiceFormat(filelimits, filepart);
* Format[] testFormats = {fileform, null, NumberFormat.getInstance()};
* MessageFormat pattform = new MessageFormat("There {0} on {1}");
* pattform.setFormats(testFormats);
* Object[] testArgs = {null, "ADisk", null};
* for (int i = 0; i < 4; ++i) {
* testArgs[0] = Integer.valueOf(i);
* testArgs[2] = testArgs[0];
* System.out.println(pattform.format(testArgs));
* MessageFormat msgFmt = new MessageFormat("The disk \"{0}\" contains {1}.");
* double[] fileLimits = {0,1,2};
* String[] filePart = {"no files","one file","{1,number} files"};
* ChoiceFormat fileChoices = new ChoiceFormat(fileLimits, filePart);
* msgFmt.setFormatByArgumentIndex(1, fileChoices);
* Object[] args = {"MyDisk", 1273};
* System.out.println(msgFmt.format(args));
* }
* }
* </blockquote>
* Would output the following:
* <blockquote>
* <pre>{@code
* There are no files on ADisk
* There is one file on ADisk
* There are 2 files on ADisk
* There are 3 files on ADisk
* }</pre>
* </blockquote>
* The output with different values for {@code fileCount}:
* <blockquote><pre>
* The disk "MyDisk" contains no files.
* The disk "MyDisk" contains one file.
* The disk "MyDisk" contains 1,273 files.
* </pre></blockquote>
* See {@link MessageFormat##pattern_caveats MessageFormat} for caveats regarding
* {@code MessageFormat} patterns within a {@code ChoiceFormat} pattern.
*
* <h2><a id="patterns">Patterns</a></h2>
* A {@code ChoiceFormat} pattern has the following syntax:
@@ -194,7 +186,6 @@ import java.util.Arrays;
* {@code new ChoiceFormat("1# ''one'' ").format(1)} returns {@code " 'one' "}.
*
* <p>Below is an example of constructing a ChoiceFormat with a pattern:
* <blockquote>
* {@snippet lang=java :
* ChoiceFormat fmt = new ChoiceFormat(
* "-1#is negative| 0#is zero or fraction | 1#is one |1.0<is 1+ |2#is two |2<is more than 2.");
@@ -210,7 +201,6 @@ import java.util.Arrays;
* System.out.println(fmt.format(Double.NaN)); // outputs "is negative"
* System.out.println(fmt.format(Double.POSITIVE_INFINITY)); // outputs "is more than 2."
* }
* </blockquote>
*
* <h2><a id="synchronization">Synchronization</a></h2>
*
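
A runnable version of the updated class-level example above (not part of the patch; hypothetical class name, output assumes an English locale), looping over a few values of fileCount:

import java.text.ChoiceFormat;
import java.text.MessageFormat;

class ChoiceFormatExample {
    public static void main(String[] args) {
        MessageFormat msgFmt = new MessageFormat("The disk \"{0}\" contains {1}.");
        double[] fileLimits = {0, 1, 2};
        String[] filePart = {"no files", "one file", "{1,number} files"};
        ChoiceFormat fileChoices = new ChoiceFormat(fileLimits, filePart);
        msgFmt.setFormatByArgumentIndex(1, fileChoices);
        for (int fileCount : new int[] {0, 1, 1273}) {
            System.out.println(msgFmt.format(new Object[] {"MyDisk", fileCount}));
        }
        // The disk "MyDisk" contains no files.
        // The disk "MyDisk" contains one file.
        // The disk "MyDisk" contains 1,273 files.
    }
}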

Some files were not shown because too many files have changed in this diff.