mirror of
https://github.com/JetBrains/JetBrainsRuntime.git
synced 2025-12-06 09:29:38 +01:00
8329816: Add SLEEF version 3.6.1
Reviewed-by: erikj, mli, luhenry
This commit is contained in:
@@ -568,6 +568,10 @@ $(eval $(call SetupTarget, update-build-docs, \
|
||||
MAKEFILE := UpdateBuildDocs, \
|
||||
))
|
||||
|
||||
$(eval $(call SetupTarget, update-sleef-source, \
|
||||
MAKEFILE := UpdateSleefSource, \
|
||||
))
|
||||
|
||||
$(eval $(call SetupTarget, update-x11wrappers, \
|
||||
MAKEFILE := UpdateX11Wrappers, \
|
||||
DEPS := java.base-copy buildtools-jdk, \
|
||||
|
||||
153
make/UpdateSleefSource.gmk
Normal file
153
make/UpdateSleefSource.gmk
Normal file
@@ -0,0 +1,153 @@
|
||||
#
|
||||
# Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
|
||||
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
#
|
||||
# This code is free software; you can redistribute it and/or modify it
|
||||
# under the terms of the GNU General Public License version 2 only, as
|
||||
# published by the Free Software Foundation. Oracle designates this
|
||||
# particular file as subject to the "Classpath" exception as provided
|
||||
# by Oracle in the LICENSE file that accompanied this code.
|
||||
#
|
||||
# This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
# version 2 for more details (a copy is included in the LICENSE file that
|
||||
# accompanied this code).
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License version
|
||||
# 2 along with this work; if not, write to the Free Software Foundation,
|
||||
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
#
|
||||
# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
# or visit www.oracle.com if you need additional information or have any
|
||||
# questions.
|
||||
#
|
||||
|
||||
################################################################################
|
||||
|
||||
default: all
|
||||
|
||||
include $(SPEC)
|
||||
include MakeBase.gmk
|
||||
|
||||
include CopyFiles.gmk
|
||||
include Execute.gmk
|
||||
|
||||
################################################################################
|
||||
# This file is responsible for updating the generated sleef source code files
|
||||
# that are checked in to the JDK repo, and that are actually used when building.
|
||||
# This target needs to be re-run every time the source code of libsleef is
|
||||
# updated from upstream.
|
||||
################################################################################
|
||||
|
||||
ifneq ($(COMPILE_TYPE), cross)
|
||||
$(error Only cross-compilation of libsleef is currently supported)
|
||||
endif
|
||||
|
||||
ifeq ($(CMAKE), )
|
||||
$(error CMake not found. Please install cmake and rerun configure)
|
||||
endif
|
||||
|
||||
ifneq ($(OPENJDK_BUILD_OS), linux)
|
||||
$(error This target is only supported on linux)
|
||||
endif
|
||||
|
||||
SLEEF_SUPPORT_DIR := $(MAKESUPPORT_OUTPUTDIR)/sleef
|
||||
SLEEF_SOURCE_BASE_DIR := $(TOPDIR)/src/jdk.incubator.vector/linux/native/libsleef
|
||||
SLEEF_SOURCE_DIR := $(SLEEF_SOURCE_BASE_DIR)/upstream
|
||||
SLEEF_TARGET_DIR := $(SLEEF_SOURCE_BASE_DIR)/generated
|
||||
SLEEF_NATIVE_BUILD_DIR := $(SLEEF_SUPPORT_DIR)/native
|
||||
SLEEF_CROSS_BUILD_DIR := $(SLEEF_SUPPORT_DIR)/cross
|
||||
|
||||
ifeq ($(OPENJDK_TARGET_CPU), aarch64)
|
||||
CROSS_COMPILATION_FILENAMES := sleefinline_advsimd.h sleefinline_sve.h
|
||||
EXTRA_CROSS_OPTIONS := -DSLEEF_ENFORCE_SVE=TRUE
|
||||
else ifeq ($(OPENJDK_TARGET_CPU), riscv64)
|
||||
CROSS_COMPILATION_FILENAMES := sleefinline_rvvm1.h
|
||||
EXTRA_CROSS_OPTIONS := -DSLEEF_ENFORCE_RVVM1=TRUE
|
||||
else
|
||||
$(error Unsupported platform)
|
||||
endif
|
||||
CROSS_COMPILATION_SRC_FILES := $(addprefix $(SLEEF_CROSS_BUILD_DIR)/include/, \
|
||||
$(CROSS_COMPILATION_FILENAMES))
|
||||
|
||||
ifeq ($(TOOLCHAIN_TYPE), clang)
|
||||
SLEEF_TOOLCHAIN_TYPE := llvm
|
||||
else
|
||||
SLEEF_TOOLCHAIN_TYPE := $(TOOLCHAIN_TYPE)
|
||||
endif
|
||||
|
||||
SLEEF_CMAKE_FILE := toolchains/$(OPENJDK_TARGET_CPU)-$(SLEEF_TOOLCHAIN_TYPE).cmake
|
||||
|
||||
# We need to run CMake twice, first using it to configure the build, and then
|
||||
# to actually build; and we need to do this twice, once for a native build
|
||||
# and once for the cross-compilation build.
|
||||
|
||||
$(eval $(call SetupExecute, sleef_native_config, \
|
||||
INFO := Configuring native sleef build, \
|
||||
OUTPUT_DIR := $(SLEEF_NATIVE_BUILD_DIR), \
|
||||
COMMAND := cd $(SLEEF_SOURCE_DIR) && $(CMAKE) -S . -B \
|
||||
$(SLEEF_NATIVE_BUILD_DIR), \
|
||||
))
|
||||
|
||||
TARGETS := $(sleef_native_config)
|
||||
|
||||
$(eval $(call SetupExecute, sleef_native_build, \
|
||||
INFO := Building native sleef, \
|
||||
DEPS := $(sleef_native_config), \
|
||||
OUTPUT_DIR := $(SLEEF_NATIVE_BUILD_DIR), \
|
||||
COMMAND := cd $(SLEEF_SOURCE_DIR) && $(CMAKE) --build \
|
||||
$(SLEEF_NATIVE_BUILD_DIR) -j, \
|
||||
))
|
||||
|
||||
TARGETS := $(sleef_native_build)
|
||||
|
||||
$(eval $(call SetupExecute, sleef_cross_config, \
|
||||
INFO := Configuring cross-compiling sleef build, \
|
||||
DEPS := $(sleef_native_build), \
|
||||
OUTPUT_DIR := $(SLEEF_CROSS_BUILD_DIR), \
|
||||
COMMAND := cd $(SLEEF_SOURCE_DIR) && $(CMAKE) -S . -B \
|
||||
$(SLEEF_CROSS_BUILD_DIR) \
|
||||
-DCMAKE_C_COMPILER=$(CC) \
|
||||
-DCMAKE_TOOLCHAIN_FILE=$(SLEEF_CMAKE_FILE) \
|
||||
-DNATIVE_BUILD_DIR=$(SLEEF_NATIVE_BUILD_DIR) \
|
||||
-DSLEEF_BUILD_INLINE_HEADERS=TRUE \
|
||||
$(EXTRA_CROSS_OPTIONS), \
|
||||
))
|
||||
|
||||
TARGETS := $(sleef_cross_config)
|
||||
|
||||
$(eval $(call SetupExecute, sleef_cross_build, \
|
||||
INFO := Building cross-compiling sleef, \
|
||||
DEPS := $(sleef_cross_config), \
|
||||
OUTPUT_DIR := $(SLEEF_NATIVE_BUILD_DIR), \
|
||||
COMMAND := cd $(SLEEF_SOURCE_DIR) && $(CMAKE) --build \
|
||||
$(SLEEF_CROSS_BUILD_DIR) -j, \
|
||||
))
|
||||
|
||||
TARGETS := $(sleef_cross_build)
|
||||
|
||||
$(CROSS_COMPILATION_SRC_FILES): $(sleef_cross_build)
|
||||
|
||||
# Finally, copy the generated files (and one needed static file) into our
|
||||
# target directory.
|
||||
|
||||
$(eval $(call SetupCopyFiles, copy_static_sleef_source, \
|
||||
FILES := $(SLEEF_SOURCE_DIR)/src/common/misc.h, \
|
||||
DEST := $(SLEEF_TARGET_DIR), \
|
||||
))
|
||||
|
||||
TARGETS := $(copy_static_sleef_source)
|
||||
|
||||
$(eval $(call SetupCopyFiles, copy_generated_sleef_source, \
|
||||
FILES := $(CROSS_COMPILATION_SRC_FILES), \
|
||||
DEST := $(SLEEF_TARGET_DIR), \
|
||||
))
|
||||
|
||||
TARGETS := $(copy_generated_sleef_source)
|
||||
|
||||
################################################################################
|
||||
|
||||
all: $(TARGETS)
|
||||
|
||||
.PHONY: all default
|
||||
@@ -99,6 +99,7 @@ AC_DEFUN_ONCE([BASIC_SETUP_TOOLS],
|
||||
UTIL_REQUIRE_SPECIAL(FGREP, [AC_PROG_FGREP])
|
||||
|
||||
# Optional tools, we can do without them
|
||||
UTIL_LOOKUP_PROGS(CMAKE, cmake)
|
||||
UTIL_LOOKUP_PROGS(DF, df)
|
||||
UTIL_LOOKUP_PROGS(GIT, git)
|
||||
UTIL_LOOKUP_PROGS(NICE, nice)
|
||||
|
||||
@@ -719,6 +719,7 @@ CCACHE := @CCACHE@
|
||||
# CD is going away, but remains to cater for legacy makefiles.
|
||||
CD := cd
|
||||
CHMOD := @CHMOD@
|
||||
CMAKE := @CMAKE@
|
||||
CODESIGN := @CODESIGN@
|
||||
CP := @CP@
|
||||
CUT := @CUT@
|
||||
|
||||
439
src/jdk.incubator.vector/linux/legal/sleef.md
Normal file
439
src/jdk.incubator.vector/linux/legal/sleef.md
Normal file
@@ -0,0 +1,439 @@
|
||||
## SLEEF v3.6.1
|
||||
|
||||
### Notice
|
||||
```
|
||||
Copyright © 2010-2024 SLEEF Project, Naoki Shibata and contributors
|
||||
|
||||
-------
|
||||
src/arch/helpersve.h has the following copyright:
|
||||
Copyright ARM Ltd. 2010 - 2024.
|
||||
-------
|
||||
src/gencoef/{dp.h, gencoef.c, ld.h, qp.h, simplexfr.c, sp.h} have no copyright but has the following license text:
|
||||
// The code is distributed under the Creative Commons Attribution 4.0 International License.
|
||||
// https://creativecommons.org/licenses/by/4.0/
|
||||
Attribution 4.0 International
|
||||
```
|
||||
|
||||
### LICENSE Boost v1.0
|
||||
```
|
||||
|
||||
Boost Software License - Version 1.0 - August 17th, 2003
|
||||
|
||||
Permission is hereby granted, free of charge, to any person or organization
|
||||
obtaining a copy of the software and accompanying documentation covered by
|
||||
this license (the "Software") to use, reproduce, display, distribute,
|
||||
execute, and transmit the Software, and to prepare derivative works of the
|
||||
Software, and to permit third-parties to whom the Software is furnished to
|
||||
do so, all subject to the following:
|
||||
|
||||
The copyright notices in the Software and this entire statement, including
|
||||
the above license grant, this restriction and the following disclaimer,
|
||||
must be included in all copies of the Software, in whole or in part, and
|
||||
all derivative works of the Software, unless such copies or derivative
|
||||
works are solely in the form of machine-executable object code generated by
|
||||
a source language processor.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
|
||||
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
|
||||
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
|
||||
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
```
|
||||
|
||||
### LICENSE Creative Commons Attribution 4.0 International License
|
||||
|
||||
```
|
||||
Creative Commons Corporation ("Creative Commons") is not a law firm and
|
||||
does not provide legal services or legal advice. Distribution of
|
||||
Creative Commons public licenses does not create a lawyer-client or
|
||||
other relationship. Creative Commons makes its licenses and related
|
||||
information available on an "as-is" basis. Creative Commons gives no
|
||||
warranties regarding its licenses, any material licensed under their
|
||||
terms and conditions, or any related information. Creative Commons
|
||||
disclaims all liability for damages resulting from their use to the
|
||||
fullest extent possible.
|
||||
|
||||
Using Creative Commons Public Licenses
|
||||
|
||||
Creative Commons public licenses provide a standard set of terms and
|
||||
conditions that creators and other rights holders may use to share
|
||||
original works of authorship and other material subject to copyright
|
||||
and certain other rights specified in the public license below. The
|
||||
following considerations are for informational purposes only, are not
|
||||
exhaustive, and do not form part of our licenses.
|
||||
|
||||
Considerations for licensors: Our public licenses are
|
||||
intended for use by those authorized to give the public
|
||||
permission to use material in ways otherwise restricted by
|
||||
copyright and certain other rights. Our licenses are
|
||||
irrevocable. Licensors should read and understand the terms
|
||||
and conditions of the license they choose before applying it.
|
||||
Licensors should also secure all rights necessary before
|
||||
applying our licenses so that the public can reuse the
|
||||
material as expected. Licensors should clearly mark any
|
||||
material not subject to the license. This includes other CC-
|
||||
licensed material, or material used under an exception or
|
||||
limitation to copyright. More considerations for licensors:
|
||||
wiki.creativecommons.org/Considerations_for_licensors
|
||||
|
||||
Considerations for the public: By using one of our public
|
||||
licenses, a licensor grants the public permission to use the
|
||||
licensed material under specified terms and conditions. If
|
||||
the licensor's permission is not necessary for any reason--for
|
||||
example, because of any applicable exception or limitation to
|
||||
copyright--then that use is not regulated by the license. Our
|
||||
licenses grant only permissions under copyright and certain
|
||||
other rights that a licensor has authority to grant. Use of
|
||||
the licensed material may still be restricted for other
|
||||
reasons, including because others have copyright or other
|
||||
rights in the material. A licensor may make special requests,
|
||||
such as asking that all changes be marked or described.
|
||||
Although not required by our licenses, you are encouraged to
|
||||
respect those requests where reasonable. More considerations
|
||||
for the public:
|
||||
wiki.creativecommons.org/Considerations_for_licensees
|
||||
|
||||
=======================================================================
|
||||
|
||||
Creative Commons Attribution 4.0 International Public License
|
||||
|
||||
By exercising the Licensed Rights (defined below), You accept and agree
|
||||
to be bound by the terms and conditions of this Creative Commons
|
||||
Attribution 4.0 International Public License ("Public License"). To the
|
||||
extent this Public License may be interpreted as a contract, You are
|
||||
granted the Licensed Rights in consideration of Your acceptance of
|
||||
these terms and conditions, and the Licensor grants You such rights in
|
||||
consideration of benefits the Licensor receives from making the
|
||||
Licensed Material available under these terms and conditions.
|
||||
|
||||
|
||||
Section 1 -- Definitions.
|
||||
|
||||
a. Adapted Material means material subject to Copyright and Similar
|
||||
Rights that is derived from or based upon the Licensed Material
|
||||
and in which the Licensed Material is translated, altered,
|
||||
arranged, transformed, or otherwise modified in a manner requiring
|
||||
permission under the Copyright and Similar Rights held by the
|
||||
Licensor. For purposes of this Public License, where the Licensed
|
||||
Material is a musical work, performance, or sound recording,
|
||||
Adapted Material is always produced where the Licensed Material is
|
||||
synched in timed relation with a moving image.
|
||||
|
||||
b. Adapter's License means the license You apply to Your Copyright
|
||||
and Similar Rights in Your contributions to Adapted Material in
|
||||
accordance with the terms and conditions of this Public License.
|
||||
|
||||
c. Copyright and Similar Rights means copyright and/or similar rights
|
||||
closely related to copyright including, without limitation,
|
||||
performance, broadcast, sound recording, and Sui Generis Database
|
||||
Rights, without regard to how the rights are labeled or
|
||||
categorized. For purposes of this Public License, the rights
|
||||
specified in Section 2(b)(1)-(2) are not Copyright and Similar
|
||||
Rights.
|
||||
|
||||
d. Effective Technological Measures means those measures that, in the
|
||||
absence of proper authority, may not be circumvented under laws
|
||||
fulfilling obligations under Article 11 of the WIPO Copyright
|
||||
Treaty adopted on December 20, 1996, and/or similar international
|
||||
agreements.
|
||||
|
||||
e. Exceptions and Limitations means fair use, fair dealing, and/or
|
||||
any other exception or limitation to Copyright and Similar Rights
|
||||
that applies to Your use of the Licensed Material.
|
||||
|
||||
f. Licensed Material means the artistic or literary work, database,
|
||||
or other material to which the Licensor applied this Public
|
||||
License.
|
||||
|
||||
g. Licensed Rights means the rights granted to You subject to the
|
||||
terms and conditions of this Public License, which are limited to
|
||||
all Copyright and Similar Rights that apply to Your use of the
|
||||
Licensed Material and that the Licensor has authority to license.
|
||||
|
||||
h. Licensor means the individual(s) or entity(ies) granting rights
|
||||
under this Public License.
|
||||
|
||||
i. Share means to provide material to the public by any means or
|
||||
process that requires permission under the Licensed Rights, such
|
||||
as reproduction, public display, public performance, distribution,
|
||||
dissemination, communication, or importation, and to make material
|
||||
available to the public including in ways that members of the
|
||||
public may access the material from a place and at a time
|
||||
individually chosen by them.
|
||||
|
||||
j. Sui Generis Database Rights means rights other than copyright
|
||||
resulting from Directive 96/9/EC of the European Parliament and of
|
||||
the Council of 11 March 1996 on the legal protection of databases,
|
||||
as amended and/or succeeded, as well as other essentially
|
||||
equivalent rights anywhere in the world.
|
||||
|
||||
k. You means the individual or entity exercising the Licensed Rights
|
||||
under this Public License. Your has a corresponding meaning.
|
||||
|
||||
|
||||
Section 2 -- Scope.
|
||||
|
||||
a. License grant.
|
||||
|
||||
1. Subject to the terms and conditions of this Public License,
|
||||
the Licensor hereby grants You a worldwide, royalty-free,
|
||||
non-sublicensable, non-exclusive, irrevocable license to
|
||||
exercise the Licensed Rights in the Licensed Material to:
|
||||
|
||||
a. reproduce and Share the Licensed Material, in whole or
|
||||
in part; and
|
||||
|
||||
b. produce, reproduce, and Share Adapted Material.
|
||||
|
||||
2. Exceptions and Limitations. For the avoidance of doubt, where
|
||||
Exceptions and Limitations apply to Your use, this Public
|
||||
License does not apply, and You do not need to comply with
|
||||
its terms and conditions.
|
||||
|
||||
3. Term. The term of this Public License is specified in Section
|
||||
6(a).
|
||||
|
||||
4. Media and formats; technical modifications allowed. The
|
||||
Licensor authorizes You to exercise the Licensed Rights in
|
||||
all media and formats whether now known or hereafter created,
|
||||
and to make technical modifications necessary to do so. The
|
||||
Licensor waives and/or agrees not to assert any right or
|
||||
authority to forbid You from making technical modifications
|
||||
necessary to exercise the Licensed Rights, including
|
||||
technical modifications necessary to circumvent Effective
|
||||
Technological Measures. For purposes of this Public License,
|
||||
simply making modifications authorized by this Section 2(a)
|
||||
(4) never produces Adapted Material.
|
||||
|
||||
5. Downstream recipients.
|
||||
|
||||
a. Offer from the Licensor -- Licensed Material. Every
|
||||
recipient of the Licensed Material automatically
|
||||
receives an offer from the Licensor to exercise the
|
||||
Licensed Rights under the terms and conditions of this
|
||||
Public License.
|
||||
|
||||
b. No downstream restrictions. You may not offer or impose
|
||||
any additional or different terms or conditions on, or
|
||||
apply any Effective Technological Measures to, the
|
||||
Licensed Material if doing so restricts exercise of the
|
||||
Licensed Rights by any recipient of the Licensed
|
||||
Material.
|
||||
|
||||
6. No endorsement. Nothing in this Public License constitutes or
|
||||
may be construed as permission to assert or imply that You
|
||||
are, or that Your use of the Licensed Material is, connected
|
||||
with, or sponsored, endorsed, or granted official status by,
|
||||
the Licensor or others designated to receive attribution as
|
||||
provided in Section 3(a)(1)(A)(i).
|
||||
|
||||
b. Other rights.
|
||||
|
||||
1. Moral rights, such as the right of integrity, are not
|
||||
licensed under this Public License, nor are publicity,
|
||||
privacy, and/or other similar personality rights; however, to
|
||||
the extent possible, the Licensor waives and/or agrees not to
|
||||
assert any such rights held by the Licensor to the limited
|
||||
extent necessary to allow You to exercise the Licensed
|
||||
Rights, but not otherwise.
|
||||
|
||||
2. Patent and trademark rights are not licensed under this
|
||||
Public License.
|
||||
|
||||
3. To the extent possible, the Licensor waives any right to
|
||||
collect royalties from You for the exercise of the Licensed
|
||||
Rights, whether directly or through a collecting society
|
||||
under any voluntary or waivable statutory or compulsory
|
||||
licensing scheme. In all other cases the Licensor expressly
|
||||
reserves any right to collect such royalties.
|
||||
|
||||
|
||||
Section 3 -- License Conditions.
|
||||
|
||||
Your exercise of the Licensed Rights is expressly made subject to the
|
||||
following conditions.
|
||||
|
||||
a. Attribution.
|
||||
|
||||
1. If You Share the Licensed Material (including in modified
|
||||
form), You must:
|
||||
|
||||
a. retain the following if it is supplied by the Licensor
|
||||
with the Licensed Material:
|
||||
|
||||
i. identification of the creator(s) of the Licensed
|
||||
Material and any others designated to receive
|
||||
attribution, in any reasonable manner requested by
|
||||
the Licensor (including by pseudonym if
|
||||
designated);
|
||||
|
||||
ii. a copyright notice;
|
||||
|
||||
iii. a notice that refers to this Public License;
|
||||
|
||||
iv. a notice that refers to the disclaimer of
|
||||
warranties;
|
||||
|
||||
v. a URI or hyperlink to the Licensed Material to the
|
||||
extent reasonably practicable;
|
||||
|
||||
b. indicate if You modified the Licensed Material and
|
||||
retain an indication of any previous modifications; and
|
||||
|
||||
c. indicate the Licensed Material is licensed under this
|
||||
Public License, and include the text of, or the URI or
|
||||
hyperlink to, this Public License.
|
||||
|
||||
2. You may satisfy the conditions in Section 3(a)(1) in any
|
||||
reasonable manner based on the medium, means, and context in
|
||||
which You Share the Licensed Material. For example, it may be
|
||||
reasonable to satisfy the conditions by providing a URI or
|
||||
hyperlink to a resource that includes the required
|
||||
information.
|
||||
|
||||
3. If requested by the Licensor, You must remove any of the
|
||||
information required by Section 3(a)(1)(A) to the extent
|
||||
reasonably practicable.
|
||||
|
||||
4. If You Share Adapted Material You produce, the Adapter's
|
||||
License You apply must not prevent recipients of the Adapted
|
||||
Material from complying with this Public License.
|
||||
|
||||
|
||||
Section 4 -- Sui Generis Database Rights.
|
||||
|
||||
Where the Licensed Rights include Sui Generis Database Rights that
|
||||
apply to Your use of the Licensed Material:
|
||||
|
||||
a. for the avoidance of doubt, Section 2(a)(1) grants You the right
|
||||
to extract, reuse, reproduce, and Share all or a substantial
|
||||
portion of the contents of the database;
|
||||
|
||||
b. if You include all or a substantial portion of the database
|
||||
contents in a database in which You have Sui Generis Database
|
||||
Rights, then the database in which You have Sui Generis Database
|
||||
Rights (but not its individual contents) is Adapted Material; and
|
||||
|
||||
c. You must comply with the conditions in Section 3(a) if You Share
|
||||
all or a substantial portion of the contents of the database.
|
||||
|
||||
For the avoidance of doubt, this Section 4 supplements and does not
|
||||
replace Your obligations under this Public License where the Licensed
|
||||
Rights include other Copyright and Similar Rights.
|
||||
|
||||
|
||||
Section 5 -- Disclaimer of Warranties and Limitation of Liability.
|
||||
|
||||
a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
|
||||
EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
|
||||
AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
|
||||
ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
|
||||
IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
|
||||
WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
|
||||
PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
|
||||
ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
|
||||
KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
|
||||
ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
|
||||
|
||||
b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
|
||||
TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
|
||||
NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
|
||||
INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
|
||||
COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
|
||||
USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
|
||||
ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
|
||||
DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
|
||||
IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
|
||||
|
||||
c. The disclaimer of warranties and limitation of liability provided
|
||||
above shall be interpreted in a manner that, to the extent
|
||||
possible, most closely approximates an absolute disclaimer and
|
||||
waiver of all liability.
|
||||
|
||||
|
||||
Section 6 -- Term and Termination.
|
||||
|
||||
a. This Public License applies for the term of the Copyright and
|
||||
Similar Rights licensed here. However, if You fail to comply with
|
||||
this Public License, then Your rights under this Public License
|
||||
terminate automatically.
|
||||
|
||||
b. Where Your right to use the Licensed Material has terminated under
|
||||
Section 6(a), it reinstates:
|
||||
|
||||
1. automatically as of the date the violation is cured, provided
|
||||
it is cured within 30 days of Your discovery of the
|
||||
violation; or
|
||||
|
||||
2. upon express reinstatement by the Licensor.
|
||||
|
||||
For the avoidance of doubt, this Section 6(b) does not affect any
|
||||
right the Licensor may have to seek remedies for Your violations
|
||||
of this Public License.
|
||||
|
||||
c. For the avoidance of doubt, the Licensor may also offer the
|
||||
Licensed Material under separate terms or conditions or stop
|
||||
distributing the Licensed Material at any time; however, doing so
|
||||
will not terminate this Public License.
|
||||
|
||||
d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
|
||||
License.
|
||||
|
||||
|
||||
Section 7 -- Other Terms and Conditions.
|
||||
|
||||
a. The Licensor shall not be bound by any additional or different
|
||||
terms or conditions communicated by You unless expressly agreed.
|
||||
|
||||
b. Any arrangements, understandings, or agreements regarding the
|
||||
Licensed Material not stated herein are separate from and
|
||||
independent of the terms and conditions of this Public License.
|
||||
|
||||
|
||||
Section 8 -- Interpretation.
|
||||
|
||||
a. For the avoidance of doubt, this Public License does not, and
|
||||
shall not be interpreted to, reduce, limit, restrict, or impose
|
||||
conditions on any use of the Licensed Material that could lawfully
|
||||
be made without permission under this Public License.
|
||||
|
||||
b. To the extent possible, if any provision of this Public License is
|
||||
deemed unenforceable, it shall be automatically reformed to the
|
||||
minimum extent necessary to make it enforceable. If the provision
|
||||
cannot be reformed, it shall be severed from this Public License
|
||||
without affecting the enforceability of the remaining terms and
|
||||
conditions.
|
||||
|
||||
c. No term or condition of this Public License will be waived and no
|
||||
failure to comply consented to unless expressly agreed to by the
|
||||
Licensor.
|
||||
|
||||
d. Nothing in this Public License constitutes or may be interpreted
|
||||
as a limitation upon, or waiver of, any privileges and immunities
|
||||
that apply to the Licensor or You, including from the legal
|
||||
processes of any jurisdiction or authority.
|
||||
|
||||
|
||||
=======================================================================
|
||||
|
||||
Creative Commons is not a party to its public
|
||||
licenses. Notwithstanding, Creative Commons may elect to apply one of
|
||||
its public licenses to material it publishes and in those instances
|
||||
will be considered the “Licensor.” The text of the Creative Commons
|
||||
public licenses is dedicated to the public domain under the CC0 Public
|
||||
Domain Dedication. Except for the limited purpose of indicating that
|
||||
material is shared under a Creative Commons public license or as
|
||||
otherwise permitted by the Creative Commons policies published at
|
||||
creativecommons.org/policies, Creative Commons does not authorize the
|
||||
use of the trademark "Creative Commons" or any other trademark or logo
|
||||
of Creative Commons without its prior written consent including,
|
||||
without limitation, in connection with any unauthorized modifications
|
||||
to any of its public licenses or any other arrangements,
|
||||
understandings, or agreements concerning use of licensed material. For
|
||||
the avoidance of doubt, this paragraph does not form part of the
|
||||
public licenses.
|
||||
|
||||
Creative Commons may be contacted at creativecommons.org.
|
||||
```
|
||||
54
src/jdk.incubator.vector/linux/native/libsleef/README.md
Normal file
54
src/jdk.incubator.vector/linux/native/libsleef/README.md
Normal file
@@ -0,0 +1,54 @@
|
||||
# About SLEEF
|
||||
|
||||
This directory contains the source code for the SLEEF library, the
|
||||
**SIMD Library for Evaluating Elementary Functions**. For more information on
|
||||
SLEEF, see https://sleef.org/.
|
||||
|
||||
The currently imported libsleef sources is version 3.6.1, which has
|
||||
git tag `3.6.1` and git commit hash `6ee14bcae5fe92c2ff8b000d5a01102dab08d774`.
|
||||
|
||||
# About the libsleef integration in the JDK
|
||||
|
||||
The upstream original source code is available in
|
||||
`src/jdk.incubator.vector/linux/native/libsleef/upstream`. However, this code is
|
||||
not directly usable in the JDK build system, but is instead used as the base for
|
||||
the generation of additional souce code files. This generation is done by
|
||||
the libsleef CMake files. If this should have been done at build time, it would
|
||||
have meant adding CMake as a required dependency to build the JDK.
|
||||
|
||||
Instead, we create these generated files only once, when we import a new
|
||||
version of the libsleef source code, and check in the generated files into
|
||||
the JDK source tree. The generated files reside in
|
||||
`src/jdk.incubator.vector/linux/native/libsleef/generated`.
|
||||
|
||||
# Import instructions
|
||||
|
||||
To update the version of libsleef that is used in the JDK, clone
|
||||
`https://github.com/shibatch/sleef.git`, and copy all files, except the `docs`,
|
||||
`.github` and `.git` directories, into
|
||||
`src/jdk.incubator.vector/linux/native/libsleef/upstream`.
|
||||
|
||||
The libsleef source code does not follow the JDK whitespace rules as enforced by
|
||||
jcheck. You will need to remove trailing whitespace, and expand tabs to 8
|
||||
spaces in the imported source code.
|
||||
|
||||
Update the note above with information about what version you import.
|
||||
|
||||
You will need to repeat the process below for each of the platforms in the JDK
|
||||
that uses libsleef; currently this is aarch64 and riscv64. The rest of this
|
||||
instruction assumes you are doing this on linux/x64; at this point, any other
|
||||
setup is not supported. Also, make sure you have CMake installed.
|
||||
|
||||
First, run configure for cross-compiling to your selected target platform
|
||||
(e.g. aarch64).
|
||||
|
||||
Run `make update-sleef-source` to process the upstream source code and
|
||||
store the generated files in the `generated` directory.
|
||||
|
||||
Now, you can repeat this for the next platform. For instance, you can
|
||||
create a separate configuration using `configure --with-conf-name=riscv64` and
|
||||
then generate the updated libsleef source code by
|
||||
`make update-sleef-source CONF=riscv64`.
|
||||
|
||||
Finally, verify with git that the local changes made to the files in
|
||||
`src/jdk.incubator.vector/linux/native/libsleef/generated` look okay.
|
||||
332
src/jdk.incubator.vector/linux/native/libsleef/generated/misc.h
Normal file
332
src/jdk.incubator.vector/linux/native/libsleef/generated/misc.h
Normal file
@@ -0,0 +1,332 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2024.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
//
|
||||
|
||||
#ifndef __MISC_H__
|
||||
#define __MISC_H__
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#ifndef M_PI
|
||||
#define M_PI 3.141592653589793238462643383279502884
|
||||
#endif
|
||||
|
||||
#ifndef M_PIl
|
||||
#define M_PIl 3.141592653589793238462643383279502884L
|
||||
#endif
|
||||
|
||||
#ifndef M_1_PI
|
||||
#define M_1_PI 0.318309886183790671537767526745028724
|
||||
#endif
|
||||
|
||||
#ifndef M_1_PIl
|
||||
#define M_1_PIl 0.318309886183790671537767526745028724L
|
||||
#endif
|
||||
|
||||
#ifndef M_2_PI
|
||||
#define M_2_PI 0.636619772367581343075535053490057448
|
||||
#endif
|
||||
|
||||
#ifndef M_2_PIl
|
||||
#define M_2_PIl 0.636619772367581343075535053490057448L
|
||||
#endif
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
|
||||
#ifndef SLEEF_FP_ILOGB0
|
||||
#define SLEEF_FP_ILOGB0 ((int)0x80000000)
|
||||
#endif
|
||||
|
||||
#ifndef SLEEF_FP_ILOGBNAN
|
||||
#define SLEEF_FP_ILOGBNAN ((int)2147483647)
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#define SLEEF_SNAN (((union { long long int i; double d; }) { .i = INT64_C(0x7ff0000000000001) }).d)
|
||||
#define SLEEF_SNANf (((union { long int i; float f; }) { .i = 0xff800001 }).f)
|
||||
|
||||
#define SLEEF_FLT_MIN 0x1p-126
|
||||
#define SLEEF_DBL_MIN 0x1p-1022
|
||||
#define SLEEF_INT_MAX 2147483647
|
||||
#define SLEEF_DBL_DENORM_MIN 4.9406564584124654e-324
|
||||
#define SLEEF_FLT_DENORM_MIN 1.40129846e-45F
|
||||
|
||||
//
|
||||
|
||||
/*
|
||||
PI_A to PI_D are constants that satisfy the following two conditions.
|
||||
|
||||
* For PI_A, PI_B and PI_C, the last 28 bits are zero.
|
||||
* PI_A + PI_B + PI_C + PI_D is close to PI as much as possible.
|
||||
|
||||
The argument of a trig function is multiplied by 1/PI, and the
|
||||
integral part is divided into two parts, each has at most 28
|
||||
bits. So, the maximum argument that could be correctly reduced
|
||||
should be 2^(28*2-1) PI = 1.1e+17. However, due to internal
|
||||
double precision calculation, the actual maximum argument that can
|
||||
be correctly reduced is around 2^47.
|
||||
*/
|
||||
|
||||
#define PI_A 3.1415926218032836914
|
||||
#define PI_B 3.1786509424591713469e-08
|
||||
#define PI_C 1.2246467864107188502e-16
|
||||
#define PI_D 1.2736634327021899816e-24
|
||||
#define TRIGRANGEMAX 1e+14
|
||||
|
||||
/*
|
||||
PI_A2 and PI_B2 are constants that satisfy the following two conditions.
|
||||
|
||||
* The last 3 bits of PI_A2 are zero.
|
||||
* PI_A2 + PI_B2 is close to PI as much as possible.
|
||||
|
||||
The argument of a trig function is multiplied by 1/PI, and the
|
||||
integral part is multiplied by PI_A2. So, the maximum argument that
|
||||
could be correctly reduced should be 2^(3-1) PI = 12.6. By testing,
|
||||
we confirmed that it correctly reduces the argument up to around 15.
|
||||
*/
|
||||
|
||||
#define PI_A2 3.141592653589793116
|
||||
#define PI_B2 1.2246467991473532072e-16
|
||||
#define TRIGRANGEMAX2 15
|
||||
|
||||
#define M_2_PI_H 0.63661977236758138243
|
||||
#define M_2_PI_L -3.9357353350364971764e-17
|
||||
|
||||
#define SQRT_DBL_MAX 1.3407807929942596355e+154
|
||||
|
||||
#define TRIGRANGEMAX3 1e+9
|
||||
|
||||
#define M_4_PI 1.273239544735162542821171882678754627704620361328125
|
||||
|
||||
#define L2U .69314718055966295651160180568695068359375
|
||||
#define L2L .28235290563031577122588448175013436025525412068e-12
|
||||
#define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931
|
||||
|
||||
#define L10U 0.30102999566383914498 // log 2 / log 10
|
||||
#define L10L 1.4205023227266099418e-13
|
||||
#define LOG10_2 3.3219280948873623478703194294893901758648313930
|
||||
|
||||
#define L10Uf 0.3010253906f
|
||||
#define L10Lf 4.605038981e-06f
|
||||
|
||||
//
|
||||
|
||||
#define PI_Af 3.140625f
|
||||
#define PI_Bf 0.0009670257568359375f
|
||||
#define PI_Cf 6.2771141529083251953e-07f
|
||||
#define PI_Df 1.2154201256553420762e-10f
|
||||
#define TRIGRANGEMAXf 39000
|
||||
|
||||
#define PI_A2f 3.1414794921875f
|
||||
#define PI_B2f 0.00011315941810607910156f
|
||||
#define PI_C2f 1.9841872589410058936e-09f
|
||||
#define TRIGRANGEMAX2f 125.0f
|
||||
|
||||
#define TRIGRANGEMAX4f 8e+6f
|
||||
|
||||
#define SQRT_FLT_MAX 18446743523953729536.0
|
||||
|
||||
#define L2Uf 0.693145751953125f
|
||||
#define L2Lf 1.428606765330187045e-06f
|
||||
|
||||
#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f
|
||||
#ifndef M_PIf
|
||||
# define M_PIf ((float)M_PI)
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
#ifndef MIN
|
||||
#define MIN(x, y) ((x) < (y) ? (x) : (y))
|
||||
#endif
|
||||
|
||||
#ifndef MAX
|
||||
#define MAX(x, y) ((x) > (y) ? (x) : (y))
|
||||
#endif
|
||||
|
||||
#ifndef ABS
|
||||
#define ABS(x) ((x) < 0 ? -(x) : (x))
|
||||
#endif
|
||||
|
||||
#define stringify(s) stringify_(s)
|
||||
#define stringify_(s) #s
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
typedef long double longdouble;
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_double2_DEFINED) && !defined(SLEEF_GENHEADER)
|
||||
#define Sleef_double2_DEFINED
|
||||
typedef struct {
|
||||
double x, y;
|
||||
} Sleef_double2;
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_float2_DEFINED) && !defined(SLEEF_GENHEADER)
|
||||
#define Sleef_float2_DEFINED
|
||||
typedef struct {
|
||||
float x, y;
|
||||
} Sleef_float2;
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_longdouble2_DEFINED) && !defined(SLEEF_GENHEADER)
|
||||
#define Sleef_longdouble2_DEFINED
|
||||
typedef struct {
|
||||
long double x, y;
|
||||
} Sleef_longdouble2;
|
||||
#endif
|
||||
|
||||
#if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
|
||||
|
||||
#define LIKELY(condition) __builtin_expect(!!(condition), 1)
|
||||
#define UNLIKELY(condition) __builtin_expect(!!(condition), 0)
|
||||
#define RESTRICT __restrict__
|
||||
|
||||
#ifndef __arm__
|
||||
#define ALIGNED(x) __attribute__((aligned(x)))
|
||||
#else
|
||||
#define ALIGNED(x)
|
||||
#endif
|
||||
|
||||
#if defined(SLEEF_GENHEADER)
|
||||
|
||||
#define INLINE SLEEF_ALWAYS_INLINE
|
||||
#define EXPORT SLEEF_INLINE
|
||||
#define CONST SLEEF_CONST
|
||||
#define NOEXPORT
|
||||
|
||||
#else // #if defined(SLEEF_GENHEADER)
|
||||
|
||||
#define CONST __attribute__((const))
|
||||
#define INLINE __attribute__((always_inline))
|
||||
|
||||
#if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)
|
||||
#ifndef SLEEF_STATIC_LIBS
|
||||
#define EXPORT __stdcall __declspec(dllexport)
|
||||
#define NOEXPORT
|
||||
#else // #ifndef SLEEF_STATIC_LIBS
|
||||
#define EXPORT
|
||||
#define NOEXPORT
|
||||
#endif // #ifndef SLEEF_STATIC_LIBS
|
||||
#else // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)
|
||||
#define EXPORT __attribute__((visibility("default")))
|
||||
#define NOEXPORT __attribute__ ((visibility ("hidden")))
|
||||
#endif // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)
|
||||
|
||||
#endif // #if defined(SLEEF_GENHEADER)
|
||||
|
||||
#define SLEEF_NAN __builtin_nan("")
|
||||
#define SLEEF_NANf __builtin_nanf("")
|
||||
#define SLEEF_NANl __builtin_nanl("")
|
||||
#define SLEEF_INFINITY __builtin_inf()
|
||||
#define SLEEF_INFINITYf __builtin_inff()
|
||||
#define SLEEF_INFINITYl __builtin_infl()
|
||||
|
||||
#if defined(__INTEL_COMPILER) || defined (__clang__)
|
||||
#define SLEEF_INFINITYq __builtin_inf()
|
||||
#define SLEEF_NANq __builtin_nan("")
|
||||
#else
|
||||
#define SLEEF_INFINITYq __builtin_infq()
|
||||
#define SLEEF_NANq (SLEEF_INFINITYq - SLEEF_INFINITYq)
|
||||
#endif
|
||||
|
||||
#elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
|
||||
|
||||
#if defined(SLEEF_GENHEADER)
|
||||
|
||||
#define INLINE SLEEF_ALWAYS_INLINE
|
||||
#define CONST SLEEF_CONST
|
||||
#define EXPORT SLEEF_INLINE
|
||||
#define NOEXPORT
|
||||
|
||||
#else // #if defined(SLEEF_GENHEADER)
|
||||
|
||||
#define INLINE __forceinline
|
||||
#define CONST
|
||||
#ifndef SLEEF_STATIC_LIBS
|
||||
#define EXPORT __declspec(dllexport)
|
||||
#define NOEXPORT
|
||||
#else
|
||||
#define EXPORT
|
||||
#define NOEXPORT
|
||||
#endif
|
||||
|
||||
#endif // #if defined(SLEEF_GENHEADER)
|
||||
|
||||
#define RESTRICT
|
||||
#define ALIGNED(x)
|
||||
#define LIKELY(condition) (condition)
|
||||
#define UNLIKELY(condition) (condition)
|
||||
|
||||
#if (defined(__GNUC__) || defined(__CLANG__)) && (defined(__i386__) || defined(__x86_64__)) && !defined(SLEEF_GENHEADER)
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
|
||||
#define SLEEF_INFINITY (1e+300 * 1e+300)
|
||||
#define SLEEF_NAN (SLEEF_INFINITY - SLEEF_INFINITY)
|
||||
#define SLEEF_INFINITYf ((float)SLEEF_INFINITY)
|
||||
#define SLEEF_NANf ((float)SLEEF_NAN)
|
||||
#define SLEEF_INFINITYl ((long double)SLEEF_INFINITY)
|
||||
#define SLEEF_NANl ((long double)SLEEF_NAN)
|
||||
|
||||
#if (defined(_M_AMD64) || defined(_M_X64))
|
||||
#ifndef __SSE2__
|
||||
#define __SSE2__
|
||||
#define __SSE3__
|
||||
#define __SSE4_1__
|
||||
#endif
|
||||
#elif _M_IX86_FP == 2
|
||||
#ifndef __SSE2__
|
||||
#define __SSE2__
|
||||
#define __SSE3__
|
||||
#define __SSE4_1__
|
||||
#endif
|
||||
#elif _M_IX86_FP == 1
|
||||
#ifndef __SSE__
|
||||
#define __SSE__
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif // #elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
|
||||
|
||||
#if !defined(__linux__)
|
||||
#define isinff(x) ((x) == SLEEF_INFINITYf || (x) == -SLEEF_INFINITYf)
|
||||
#define isinfl(x) ((x) == SLEEF_INFINITYl || (x) == -SLEEF_INFINITYl)
|
||||
#define isnanf(x) ((x) != (x))
|
||||
#define isnanl(x) ((x) != (x))
|
||||
#endif
|
||||
|
||||
#endif // #ifndef __MISC_H__
|
||||
|
||||
#ifdef ENABLE_AAVPCS
|
||||
#define VECTOR_CC __attribute__((aarch64_vector_pcs))
|
||||
#else
|
||||
#define VECTOR_CC
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
#if defined (__GNUC__) && !defined(__INTEL_COMPILER)
|
||||
#pragma GCC diagnostic ignored "-Wpragmas"
|
||||
#pragma GCC diagnostic ignored "-Wunknown-pragmas"
|
||||
#if !defined (__clang__)
|
||||
#pragma GCC diagnostic ignored "-Wattribute-alias"
|
||||
#pragma GCC diagnostic ignored "-Wlto-type-mismatch"
|
||||
#pragma GCC diagnostic ignored "-Wstringop-overflow"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable:4101) // warning C4101: 'v': unreferenced local variable
|
||||
#pragma warning(disable:4116) // warning C4116: unnamed type definition in parentheses
|
||||
#pragma warning(disable:4244) // warning C4244: 'function': conversion from 'vopmask' to '__mmask8', possible loss of data
|
||||
#pragma warning(disable:4267) // warning C4267: 'initializing': conversion from 'size_t' to 'const int', possible loss of data
|
||||
#pragma warning(disable:4305) // warning C4305: 'function': truncation from 'double' to 'float'
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,255 @@
|
||||
## 3.6.1 - 2024-06-10
|
||||
|
||||
This patch release provides important bug fixes, including a fix
|
||||
for API compatibility with 3.5 (#534).
|
||||
The support and test for some features is still limited, as
|
||||
documented in [README](./README.md), however significant progress
|
||||
was made in order to test on Linux, macOS and Windows.
|
||||
|
||||
### Added
|
||||
- Add support for RISC-V in DFT, QUAD and inline headers (#503,
|
||||
#522).
|
||||
- Add GHA workflow to run CI tests on Windows x86 (#540) and macOS
|
||||
x86/aarch64 (#543). And update test matrix.
|
||||
- Add GHA workflows to run examples in CI (#550).
|
||||
|
||||
### Changed
|
||||
- Cleanup/Improve support for RISC-V in LIBM (#520, #521).
|
||||
- Update supported environment in documentation (#529, #549),
|
||||
including website and test matrix from README.
|
||||
|
||||
### Fixed
|
||||
- Major fix and cleanup of CMakeLists.txt (#531).
|
||||
- Fix compatibility issue after removal of quad and long double
|
||||
sincospi (#545). Restores functions that are missing in 3.6.
|
||||
- Various bug fixes (#528, #533, #536, #537).
|
||||
|
||||
## 3.6 - 2024-02-14
|
||||
|
||||
This release follows a long period of inactivity. The library is now
|
||||
being actively maintained. However, the support and test for some
|
||||
features is currently limited, as documented in [README](./README.md).
|
||||
|
||||
### Added
|
||||
- Add documentation for the quad precision math library
|
||||
- Enable generation of inline header file for CUDA (PR #337)
|
||||
- Add support for System/390 z15 support (PR #343)
|
||||
- Add support for POWER 9 (PR #360)
|
||||
- Add quad-precision functions (PR #375, #377, #380, #381, #382, #383,
|
||||
#385, #386, #387)
|
||||
- Add preliminary support for iOS and Android (PR #388, #389)
|
||||
- Add OpenMP pragmas to the function declarations in sleef.h to enable
|
||||
auto-vectorization by GCC (PR #404, #406)
|
||||
- Add new public CI test infrastructure using GitHub Actions (PR #476)
|
||||
- Add support for RISC-V in libm (PR #477)
|
||||
|
||||
### Removed
|
||||
- Remove old CI scripts based on Travis/Jenkins/Appveyor (PR #502)
|
||||
|
||||
### Changed
|
||||
- Optimise error functions (PR #370)
|
||||
- Update CMake package config (PR #412)
|
||||
- Update documentation and move doc/website to main repository (PR #504,
|
||||
#513)
|
||||
- Add SLEEF_ prefix to user-facing CMake options (PR #509)
|
||||
- Disable SVE on Darwin (PR #512)
|
||||
|
||||
### Fixed
|
||||
- Fix parallel builds with GNU make (PR #491)
|
||||
- Various bug fixes (PR #492, #499, #508)
|
||||
|
||||
## 3.5.1 - 2020-09-15
|
||||
### Changed
|
||||
- Fixed a bug in handling compiler options
|
||||
|
||||
## 3.5 - 2020-09-01
|
||||
- IBM System/390 support is added.
|
||||
- The library can be built with Clang on Windows.
|
||||
- Static libraries with LTO can be generated.
|
||||
- Alternative division and sqrt methods can be chosen with AArch64.
|
||||
- Header files for inlining the whole SLEEF functions can be generated.
|
||||
- IEEE remainder function is added.
|
||||
- GCC-10 can now build SLEEF with SVE support.
|
||||
|
||||
## 3.4.1 - 2019-10-01
|
||||
### Changed
|
||||
- Fixed accuracy problem with tan_u35, atan_u10, log2f_u35 and exp10f_u10.
|
||||
https://github.com/shibatch/sleef/pull/260
|
||||
https://github.com/shibatch/sleef/pull/265
|
||||
https://github.com/shibatch/sleef/pull/267
|
||||
- SVE intrinsics that are not supported in newer ACLE are replaced.
|
||||
https://github.com/shibatch/sleef/pull/268
|
||||
- FMA4 detection problem is fixed.
|
||||
https://github.com/shibatch/sleef/pull/262
|
||||
- Compilation problem under Windows with MinGW is fixed.
|
||||
https://github.com/shibatch/sleef/pull/266
|
||||
|
||||
## 3.4 - 2019-04-28
|
||||
### Added
|
||||
- Faster and low precision functions are added.
|
||||
https://github.com/shibatch/sleef/pull/229
|
||||
- Functions that return consistent results across platforms are
|
||||
added
|
||||
https://github.com/shibatch/sleef/pull/216
|
||||
https://github.com/shibatch/sleef/pull/224
|
||||
- Quad precision math library(libsleefquad) is added
|
||||
https://github.com/shibatch/sleef/pull/235
|
||||
https://github.com/shibatch/sleef/pull/237
|
||||
https://github.com/shibatch/sleef/pull/240
|
||||
- AArch64 Vector Procedure Call Standard (AAVPCS) support.
|
||||
### Changed
|
||||
- Many functions are now faster
|
||||
- Testers are now faster
|
||||
|
||||
## 3.3.1 - 2018-08-20
|
||||
### Added
|
||||
- FreeBSD support is added
|
||||
### Changed
|
||||
- i386 build problem is fixed
|
||||
- Trigonometric functions now evaluate correctly with full FP
|
||||
domain.
|
||||
https://github.com/shibatch/sleef/pull/210
|
||||
|
||||
## 3.3 - 2018-07-06
|
||||
### Added
|
||||
- SVE target support is added to libsleef.
|
||||
https://github.com/shibatch/sleef/pull/180
|
||||
- SVE target support is added to DFT. With this patch, DFT operations
|
||||
can be carried out using 256, 512, 1024 and 2048-bit wide vectors
|
||||
according to runtime availability of vector registers and operators.
|
||||
https://github.com/shibatch/sleef/pull/182
|
||||
- 3.5-ULP versions of sinh, cosh, tanh, sinhf, coshf, tanhf, and the
|
||||
corresponding testing functionalities are added.
|
||||
https://github.com/shibatch/sleef/pull/192
|
||||
- Power VSX target support is added to libsleef.
|
||||
https://github.com/shibatch/sleef/pull/195
|
||||
- Payne-Hanek like argument reduction is added to libsleef.
|
||||
https://github.com/shibatch/sleef/pull/197
|
||||
|
||||
## 3.2 - 2018-02-26
|
||||
### Added
|
||||
- The whole build system of the project migrated from makefiles to
|
||||
cmake. In particualr this includes `libsleef`, `libsleefgnuabi`,
|
||||
`libdft` and all the tests.
|
||||
- Benchmarks that compare `libsleef` vs `SVML` on X86 Linux are
|
||||
available in the project tree under src/libm-benchmarks directory.
|
||||
- Extensive upstream testing via Travis CI and Appveyor, on the
|
||||
following systems:
|
||||
* OS: Windows / Linux / OSX.
|
||||
* Compilers: gcc / clang / MSVC.
|
||||
* Targets: X86 (SSE/AVX/AVX2/AVX512F), AArch64 (Advanced SIMD), ARM
|
||||
(NEON). Emulators like QEMU or SDE can be used to run the tests.
|
||||
- Added the following new vector functions (with relative testing):
|
||||
* `log2`
|
||||
- New compatibility tests have been added to check that
|
||||
`libsleefgnuabi` exports the GNUABI symbols correctly.
|
||||
- The library can be compiled to an LLVM bitcode object.
|
||||
- Added masked interface to the library to support AVX512F masked
|
||||
vectorization.
|
||||
|
||||
### Changed
|
||||
- Use native instructions if available for `sqrt`.
|
||||
- Fixed fmax and fmin behavior on AArch64:
|
||||
https://github.com/shibatch/sleef/pull/140
|
||||
- Speed improvements for `asin`, `acos`, `fmod` and `log`. Computation
|
||||
speed of other functions are also improved by general optimization.
|
||||
https://github.com/shibatch/sleef/pull/97
|
||||
- Removed `libm` dependency.
|
||||
|
||||
### Removed
|
||||
- Makefile build system
|
||||
|
||||
## 3.1 - 2017-07-19
|
||||
- Added AArch64 support
|
||||
- Implemented the remaining C99 math functions : lgamma, tgamma,
|
||||
erf, erfc, fabs, copysign, fmax, fmin, fdim, trunc, floor, ceil,
|
||||
round, rint, modf, ldexp, nextafter, frexp, hypot, and fmod.
|
||||
- Added dispatcher for x86 functions
|
||||
- Improved reduction of trigonometric functions
|
||||
- Added support for 32-bit x86, Cygwin, etc.
|
||||
- Improved tester
|
||||
|
||||
## 3.0 - 2017-02-07
|
||||
- New API is defined
|
||||
- Functions for DFT are added
|
||||
- sincospi functions are added
|
||||
- gencoef now supports single, extended and quad precision in addition to double precision
|
||||
- Linux, Windows and Mac OS X are supported
|
||||
- GCC, Clang, Intel Compiler, Microsoft Visual C++ are supported
|
||||
- The library can be compiled as DLLs
|
||||
- Files needed for creating a debian package are now included
|
||||
|
||||
## 2.120 - 2017-01-30
|
||||
- Relicensed to Boost Software License Version 1.0
|
||||
|
||||
## 2.110 - 2016-12-11
|
||||
- The valid range of argument is extended for trig functions
|
||||
- Specification of each functions regarding to the domain and accuracy is added
|
||||
- A coefficient generation tool is added
|
||||
- New testing tools are introduced
|
||||
- Following functions returned incorrect values when the argument is very large or small : exp, pow, asinh, acosh
|
||||
- SIMD xsin and xcos returned values more than 1 when FMA is enabled
|
||||
- Pure C cbrt returned incorrect values when the argument is negative
|
||||
- tan_u1 returned values with more than 1 ulp of error on rare occasions
|
||||
- Removed support for Java language(because no one seems using this)
|
||||
|
||||
## 2.100 - 2016-12-04
|
||||
- Added support for AVX-512F and Clang Extended Vectors.
|
||||
|
||||
## 2.90 - 2016-11-27
|
||||
- Added ilogbf. All the reported bugs(listed below) are fixed.
|
||||
- Log function returned incorrect values when the argument is very small.
|
||||
- Signs of returned values were incorrect when the argument is signed zero.
|
||||
- Tester incorrectly counted ULP in some cases.
|
||||
- ilogb function returned incorrect values in some cases.
|
||||
|
||||
## 2.80 - 2013-05-18
|
||||
- Added support for ARM NEON. Added higher accuracy single
|
||||
precision functions : sinf_u1, cosf_u1, sincosf_u1, tanf_u1, asinf_u1,
|
||||
acosf_u1, atanf_u1, atan2f_u1, logf_u1, and cbrtf_u1.
|
||||
|
||||
## 2.70 - 2013-04-30
|
||||
- Added higher accuracy functions : sin_u1, cos_u1, sincos_u1,
|
||||
tan_u1, asin_u1, acos_u1, atan_u1, atan2_u1, log_u1, and
|
||||
cbrt_u1. These functions evaluate the corresponding function with at
|
||||
most 1 ulp of error.
|
||||
|
||||
## 2.60 - 2013-03-26
|
||||
- Added the remaining single precision functions : powf, sinhf,
|
||||
coshf, tanhf, exp2f, exp10f, log10f, log1pf. Added support for FMA4
|
||||
(for AMD Bulldozer). Added more test cases. Fixed minor bugs (which
|
||||
degraded accuracy in some rare cases).
|
||||
|
||||
## 2.50 - 2013-03-12
|
||||
- Added support for AVX2. SLEEF now compiles with ICC.
|
||||
|
||||
## 2.40 - 2013-03-07
|
||||
- Fixed incorrect denormal/nonnumber handling in ldexp, ldexpf,
|
||||
sinf and cosf. Removed support for Go language.
|
||||
|
||||
## 2.31 - 2012-07-05
|
||||
- Added sincosf.
|
||||
|
||||
## 2.30 - 2012-01-20
|
||||
- Added single precision functions : sinf, cosf, tanf, asinf,
|
||||
acosf, atanf, logf, expf, atan2f and cbrtf.
|
||||
|
||||
## 2.20 - 2012-01-09
|
||||
- Added exp2, exp10, expm1, log10, log1p, and cbrt.
|
||||
|
||||
## 2.10 - 2012-01-05
|
||||
- asin() and acos() are back.
|
||||
- Added ilogb() and ldexp().
|
||||
- Added hyperbolic functions.
|
||||
- Eliminated dependency on frexp, ldexp, fabs, isnan and isinf.
|
||||
|
||||
## 2.00 - 2011-12-30
|
||||
- All of the algorithm has been updated.
|
||||
- Both accuracy and speed are improved since version 1.10.
|
||||
- Denormal number handling is also improved.
|
||||
|
||||
## 1.10 - 2010-06-22
|
||||
- AVX support is added. Accuracy tester is added.
|
||||
|
||||
## 1.00 - 2010-05-15
|
||||
- Initial release
|
||||
@@ -0,0 +1,339 @@
|
||||
cmake_minimum_required(VERSION 3.18)
|
||||
project(SLEEF VERSION 3.6.1 LANGUAGES C)
|
||||
|
||||
set(SLEEF_SOVERSION ${SLEEF_VERSION_MAJOR})
|
||||
|
||||
# Options
|
||||
|
||||
option(SLEEF_BUILD_STATIC_TEST_BINS "Build statically linked test executables" OFF)
|
||||
option(SLEEF_ENABLE_LTO "Enable LTO on GCC or ThinLTO on clang" OFF)
|
||||
option(SLEEF_BUILD_LIBM "libsleef will be built." ON)
|
||||
option(SLEEF_BUILD_DFT "libsleefdft will be built." OFF)
|
||||
option(SLEEF_BUILD_QUAD "libsleefquad will be built." OFF)
|
||||
option(SLEEF_BUILD_GNUABI_LIBS "libsleefgnuabi will be built." ON)
|
||||
option(SLEEF_BUILD_SCALAR_LIB "libsleefscalar will be built." OFF)
|
||||
option(SLEEF_BUILD_TESTS "Tests will be built." ON)
|
||||
option(SLEEF_BUILD_INLINE_HEADERS "Build header for inlining whole SLEEF functions" OFF)
|
||||
|
||||
option(SLEEF_TEST_ALL_IUT "Perform tests on implementations with all vector extensions" OFF)
|
||||
option(SLEEF_SHOW_CONFIG "Show SLEEF configuration status messages." ON)
|
||||
option(SLEEF_SHOW_ERROR_LOG "Show cmake error log." OFF)
|
||||
option(SLEEF_ASAN "Enable address sanitizing on all targets." OFF)
|
||||
|
||||
option(SLEEF_ENFORCE_TESTER "Build fails if tester is not available" OFF)
|
||||
option(SLEEF_ENFORCE_TESTER3 "Build fails if tester3 is not built" OFF)
|
||||
|
||||
option(SLEEF_ENABLE_ALTDIV "Enable alternative division method (aarch64 only)" OFF)
|
||||
option(SLEEF_ENABLE_ALTSQRT "Enable alternative sqrt method (aarch64 only)" OFF)
|
||||
|
||||
option(SLEEF_DISABLE_FFTW "Disable testing the DFT library with FFTW" OFF)
|
||||
option(SLEEF_DISABLE_MPFR "Disable testing with the MPFR library" OFF)
|
||||
option(SLEEF_DISABLE_SSL "Disable testing with the SSL library" OFF)
|
||||
|
||||
option(SLEEF_ENABLE_CUDA "Enable CUDA" OFF)
|
||||
option(SLEEF_ENABLE_CXX "Enable C++" OFF)
|
||||
|
||||
#
|
||||
|
||||
if (DEFINED SLEEF_BUILD_SHARED_LIBS)
|
||||
set(BUILD_SHARED_LIBS ${SLEEF_BUILD_SHARED_LIBS})
|
||||
endif ()
|
||||
|
||||
if (SLEEF_SHOW_CONFIG)
|
||||
# Normalize the value of BUILD_SHARED_LIBS so that it displays nicely
|
||||
# in the configuration display
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set(BUILD_SHARED_LIBS ON)
|
||||
else ()
|
||||
set(BUILD_SHARED_LIBS OFF)
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
# Function used to generate safe command arguments for add_custom_command
|
||||
function(command_arguments PROPNAME)
|
||||
set(quoted_args "")
|
||||
foreach(arg ${ARGN})
|
||||
list(APPEND quoted_args "\"${arg}\"" )
|
||||
endforeach()
|
||||
set(${PROPNAME} ${quoted_args} PARENT_SCOPE)
|
||||
endfunction()
|
||||
|
||||
# Helper function for concatenating several files
|
||||
function(sleef_concat_files)
|
||||
cmake_parse_arguments(concat_required "" "OUTPUT" "SOURCES" ${ARGN})
|
||||
if("${concat_required_OUTPUT}" STREQUAL "")
|
||||
message(FATAL_ERROR "Must pass OUTPUT to sleef_concat_files")
|
||||
endif()
|
||||
|
||||
if(NOT concat_required_SOURCES)
|
||||
message(FATAL_ERROR "sleef_concat_files not passed any SOURCES")
|
||||
endif()
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT ${concat_required_OUTPUT}
|
||||
COMMAND ${CMAKE_COMMAND} -E cat ${concat_required_SOURCES} > ${concat_required_OUTPUT}
|
||||
DEPENDS ${concat_required_SOURCES}
|
||||
COMMAND_EXPAND_LISTS)
|
||||
endfunction()
|
||||
|
||||
# Settings
|
||||
|
||||
set(SLEEF_ALL_SUPPORTED_EXTENSIONS
|
||||
AVX512FNOFMA AVX512F AVX2 AVX2128 FMA4 AVX SSE4 SSE2 # x86
|
||||
SVENOFMA SVE ADVSIMDNOFMA ADVSIMD # Aarch64
|
||||
NEON32 NEON32VFPV4 # Aarch32
|
||||
VSX VSXNOFMA VSX3 VSX3NOFMA # PPC64
|
||||
VXE VXENOFMA VXE2 VXE2NOFMA # IBM Z
|
||||
RVVM1NOFMA RVVM1 RVVM2NOFMA RVVM2 # RISC-V Vectors
|
||||
PUREC_SCALAR PURECFMA_SCALAR # Generic type
|
||||
CACHE STRING "List of SIMD architectures supported by libsleef."
|
||||
)
|
||||
|
||||
set(SLEEF_SUPPORTED_LIBM_EXTENSIONS
|
||||
AVX512FNOFMA AVX512F AVX2 AVX2128 FMA4 AVX SSE4 SSE2 # x86
|
||||
SVENOFMA SVE ADVSIMDNOFMA ADVSIMD # Aarch64
|
||||
NEON32 NEON32VFPV4 # Aarch32
|
||||
VSX VSXNOFMA VSX3 VSX3NOFMA # PPC64
|
||||
VXE VXENOFMA VXE2 VXE2NOFMA # IBM Z
|
||||
RVVM1NOFMA RVVM1 RVVM2NOFMA RVVM2 # RISC-V Vectors
|
||||
PUREC_SCALAR PURECFMA_SCALAR # Generic type
|
||||
CACHE STRING "List of SIMD architectures supported by libsleef."
|
||||
)
|
||||
set(SLEEF_SUPPORTED_GNUABI_EXTENSIONS
|
||||
SSE2 AVX AVX2 AVX512F ADVSIMD SVE
|
||||
CACHE STRING "List of SIMD architectures supported by libsleef for GNU ABI."
|
||||
)
|
||||
|
||||
set(SLEEF_SUPPORTED_QUAD_EXTENSIONS
|
||||
PUREC_SCALAR PURECFMA_SCALAR SSE2 AVX2128 AVX2 AVX512F ADVSIMD SVE VSX VSX3 VXE VXE2 RVVM1 RVVM2)
|
||||
|
||||
# MKMASKED_PARAMS
|
||||
|
||||
command_arguments(MKMASKED_PARAMS_GNUABI_AVX512F_dp avx512f e 8)
|
||||
command_arguments(MKMASKED_PARAMS_GNUABI_AVX512F_sp avx512f e -16)
|
||||
|
||||
command_arguments(MKMASKED_PARAMS_GNUABI_SVE_dp sve s 2)
|
||||
command_arguments(MKMASKED_PARAMS_GNUABI_SVE_sp sve s -4)
|
||||
|
||||
#
|
||||
|
||||
set(COSTOVERRIDE_AVX512F 10)
|
||||
set(COSTOVERRIDE_AVX512FNOFMA 10)
|
||||
set(COSTOVERRIDE_AVX2 2)
|
||||
set(COSTOVERRIDE_AVX 2)
|
||||
set(COSTOVERRIDE_NEON32 2)
|
||||
set(COSTOVERRIDE_NEON32VFPV4 2)
|
||||
set(COSTOVERRIDE_SVE 10)
|
||||
set(COSTOVERRIDE_SVENOFMA 10)
|
||||
set(COSTOVERRIDE_RVVM1 10)
|
||||
set(COSTOVERRIDE_RVVM1NOFMA 10)
|
||||
set(COSTOVERRIDE_RVVM2 20)
|
||||
set(COSTOVERRIDE_RVVM2NOFMA 20)
|
||||
|
||||
#
|
||||
|
||||
enable_testing()
|
||||
|
||||
if (SLEEF_ENABLE_CXX)
|
||||
enable_language(CXX)
|
||||
endif()
|
||||
|
||||
if (SLEEF_ENABLE_CUDA)
|
||||
enable_language(CUDA)
|
||||
endif()
|
||||
|
||||
# For specifying installation directories
|
||||
include(GNUInstallDirs)
|
||||
|
||||
if(NOT DEFINED sleef_SOURCE_DIR)
|
||||
set(sleef_SOURCE_DIR ${CMAKE_SOURCE_DIR})
|
||||
endif()
|
||||
|
||||
if(NOT DEFINED sleef_BINARY_DIR)
|
||||
set(sleef_BINARY_DIR ${CMAKE_BINARY_DIR})
|
||||
endif()
|
||||
|
||||
# Sanity check for in-source builds which we do not want to happen
|
||||
if(sleef_SOURCE_DIR STREQUAL sleef_BINARY_DIR)
|
||||
message(FATAL_ERROR "SLEEF does not allow in-source builds.
|
||||
You can refer to docs/build-with-cmake.md for instructions on how provide a \
|
||||
separate build directory. Note: Please remove autogenerated file \
|
||||
`CMakeCache.txt` and directory `CMakeFiles` in the current directory.")
|
||||
endif()
|
||||
|
||||
if(SLEEF_ENABLE_LTO AND BUILD_SHARED_LIBS)
|
||||
message(FATAL_ERROR "SLEEF_ENABLE_LTO and BUILD_SHARED_LIBS cannot be specified at the same time")
|
||||
endif(SLEEF_ENABLE_LTO AND BUILD_SHARED_LIBS)
|
||||
|
||||
if(SLEEF_ENABLE_LTO)
|
||||
include(CheckIPOSupported)
|
||||
check_ipo_supported(RESULT supported OUTPUT error)
|
||||
endif()
|
||||
|
||||
# Set output directories for the library files
|
||||
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
||||
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin)
|
||||
|
||||
foreach(CONFIG ${CMAKE_CONFIGURATION_TYPES})
|
||||
string(TOUPPER ${CONFIG} CONFIG)
|
||||
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_${CONFIG} ${PROJECT_BINARY_DIR}/lib)
|
||||
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY_${CONFIG} ${PROJECT_BINARY_DIR}/lib)
|
||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_${CONFIG} ${PROJECT_BINARY_DIR}/bin)
|
||||
endforeach(CONFIG CMAKE_CONFIGURATION_TYPES)
|
||||
|
||||
# Path for finding cmake modules
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules)
set(SLEEF_SCRIPT_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Scripts CACHE PATH
  "Path for finding sleef specific cmake scripts")

# Clang on Windows reports an MSVC "simulate id"; remember that so later
# flag handling can account for MSVC-style headers and options.
if (CMAKE_C_COMPILER_ID MATCHES "Clang" AND "x${CMAKE_C_SIMULATE_ID}" STREQUAL "xMSVC")
  message(STATUS "Building with Clang on Windows")
  set(SLEEF_CLANG_ON_WINDOWS TRUE)
endif()

# sleef-config.h.in passes cmake settings to the source code
include(Configure.cmake)
configure_file(
  ${PROJECT_SOURCE_DIR}/sleef-config.h.in
  ${PROJECT_BINARY_DIR}/include/sleef-config.h @ONLY)

# We like to have a documented index of all targets in the project. The
# variables listed below carry the names of the targets defined throughout
# the project.

# Generates object file (shared library) `libsleef`
# Defined in src/libm/CMakeLists.txt via command add_library
set(TARGET_LIBSLEEF "sleef")
set(TARGET_LIBSLEEFGNUABI "sleefgnuabi")

# Generates the sleef.h headers and all the rename headers
# Defined in src/libm/CMakeLists.txt via custom commands and a custom target
set(TARGET_HEADERS "headers")
set(TARGET_INLINE_HEADERS "inline_headers")
set(TARGET_QINLINE_HEADERS "quad_inline_headers")
set(TARGET_LIBINLINE "sleefinline")

# Generates executable files for running the test suite
# Defined in src/libm-tester/CMakeLists.txt via command add_executable
set(TARGET_TESTER "tester")
set(TARGET_IUT "iut")

# The target to generate LLVM bitcode only, available when SLEEF_ENABLE_LLVM_BITCODE is passed to cmake
set(TARGET_LLVM_BITCODE "llvm-bitcode")

# Generates the helper executable file mkrename needed to write the sleef header
set(TARGET_MKRENAME "mkrename")
set(TARGET_MKRENAME_GNUABI "mkrename_gnuabi")
set(TARGET_MKMASKED_GNUABI "mkmasked_gnuabi")

# Generates the helper executable file mkdisp needed to write the sleef header
set(TARGET_MKDISP "mkdisp")
set(TARGET_MKALIAS "mkalias")

# Generates static library common
# Defined in src/common/CMakeLists.txt via command add_library
set(TARGET_LIBCOMMON_OBJ "common")
set(TARGET_LIBARRAYMAP_OBJ "arraymap")
|
||||
|
||||
# Adds an executable that must run on the build host. When cross-compiling,
# the binary cannot be produced by this build, so it is imported from a
# previously completed native build rooted at NATIVE_BUILD_DIR instead.
function(add_host_executable TARGETNAME)
  if (CMAKE_CROSSCOMPILING)
    add_executable(${TARGETNAME} IMPORTED GLOBAL)
    set_property(TARGET ${TARGETNAME} PROPERTY IMPORTED_LOCATION ${NATIVE_BUILD_DIR}/bin/${TARGETNAME})
  else()
    add_executable(${TARGETNAME} ${ARGN})
    # Ensure that Darwin host executable is built as universal binary
    if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "^(x86_64|arm64)$")
      target_compile_options(${TARGETNAME} PRIVATE -arch "${CMAKE_HOST_SYSTEM_PROCESSOR}")
      target_link_options(${TARGETNAME} PRIVATE -arch "${CMAKE_HOST_SYSTEM_PROCESSOR}")
    endif()
  endif()
endfunction()
|
||||
|
||||
# Defines ENABLE_AAVPCS=1 on a host-built target. Imported targets (the
# cross-compiling case in add_host_executable) cannot take compile
# definitions, so this is intentionally a no-op when cross-compiling.
function(host_target_AAVPCS_definitions TARGETNAME)
  if (NOT CMAKE_CROSSCOMPILING)
    target_compile_definitions(${TARGETNAME} PRIVATE ENABLE_AAVPCS=1)
  endif()
endfunction()
|
||||
|
||||
# Generates object file (shared library) `libsleefdft`
# Defined in src/dft/CMakeLists.txt via command add_library
set(TARGET_LIBDFT "sleefdft")

# Check subdirectories
add_subdirectory("src")

# Install the CMake package config
include(CMakePackageConfigHelpers)

write_basic_package_version_file(
  sleefConfigVersion.cmake
  COMPATIBILITY SameMajorVersion
)

# Where the package config files land, relative to the install prefix;
# user-overridable (e.g. lib64 layouts).
set(
  SLEEF_INSTALL_CMAKEDIR "${CMAKE_INSTALL_LIBDIR}/cmake/sleef"
  CACHE STRING "CMake package config location relative to the install prefix"
)

mark_as_advanced(SLEEF_INSTALL_CMAKEDIR)

install(
  FILES
  "${PROJECT_SOURCE_DIR}/sleefConfig.cmake"
  "${PROJECT_BINARY_DIR}/sleefConfigVersion.cmake"
  DESTINATION "${SLEEF_INSTALL_CMAKEDIR}"
  COMPONENT sleef_Development
)

install(
  EXPORT sleefTargets
  NAMESPACE sleef::
  DESTINATION "${SLEEF_INSTALL_CMAKEDIR}"
  COMPONENT sleef_Development
)
|
||||
|
||||
# Extra messages at configuration time. By default is active, it can be
# turned off by invoking cmake with "-DSLEEF_SHOW_CONFIG=OFF".
if(SLEEF_SHOW_CONFIG)
  message(STATUS "Configuring build for ${PROJECT_NAME}-v${SLEEF_VERSION}")
  message(" Target system: ${CMAKE_SYSTEM}")
  # On Darwin the effective target architecture is carried by
  # CMAKE_OSX_ARCHITECTURES rather than CMAKE_SYSTEM_PROCESSOR.
  if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "^(x86_64|arm64)$")
    message(" Target processor: ${CMAKE_OSX_ARCHITECTURES}")
  else()
    message(" Target processor: ${CMAKE_SYSTEM_PROCESSOR}")
  endif()
  message(" Host system: ${CMAKE_HOST_SYSTEM}")
  message(" Host processor: ${CMAKE_HOST_SYSTEM_PROCESSOR}")
  message(" Detected C compiler: ${CMAKE_C_COMPILER_ID} @ ${CMAKE_C_COMPILER}")
  message(" CMake: ${CMAKE_VERSION}")
  message(" Make program: ${CMAKE_MAKE_PROGRAM}")
  if(CMAKE_CROSSCOMPILING)
    message(" Crosscompiling SLEEF.")
    message(" Native build dir: ${NATIVE_BUILD_DIR}")
  endif()
  message(STATUS "Using option `${SLEEF_C_FLAGS}` to compile libsleef")
  message(STATUS "Building shared libs : " ${BUILD_SHARED_LIBS})
  message(STATUS "Building static test bins: " ${SLEEF_BUILD_STATIC_TEST_BINS})
  message(STATUS "MPFR : " ${LIB_MPFR})
  if (MPFR_INCLUDE_DIR)
    message(STATUS "MPFR header file in " ${MPFR_INCLUDE_DIR})
  endif()
  message(STATUS "GMP : " ${LIBGMP})
  message(STATUS "RT : " ${LIBRT})
  message(STATUS "FFTW3 : " ${LIBFFTW3})
  message(STATUS "OPENSSL : " ${OPENSSL_VERSION})
  message(STATUS "SDE : " ${SDE_COMMAND})
  if (SLEEF_BUILD_INLINE_HEADERS)
    message(STATUS "SED : " ${SED_COMMAND})
  endif()
  message(STATUS "COMPILER_SUPPORTS_OPENMP : " ${COMPILER_SUPPORTS_OPENMP})
  if(ENABLE_GNUABI)
    message(STATUS "A version of SLEEF compatible with libm and libmvec in GNU libc will be produced (${TARGET_LIBSLEEFGNUABI}.so)")
  endif()
  if (COMPILER_SUPPORTS_SVE)
    message(STATUS "Building SLEEF with VLA SVE support")
    if (ARMIE_COMMAND)
      message(STATUS "Arm Instruction Emulator found at ${ARMIE_COMMAND}")
      message(STATUS "SVE testing is done with ${SVE_VECTOR_BITS}-bits vectors.")
    endif()
  endif()
  if(FORCE_AAVPCS)
    message(STATUS "Building SLEEF with AArch64 Vector PCS support")
  endif()
endif()
|
||||
@@ -0,0 +1,27 @@
|
||||
# List of contributors
|
||||
|
||||
These lists are not exhaustive and only provide most relevant contact information.
|
||||
For an exhaustive list of contributors please refer to the
|
||||
[GitHub contributors section for SLEEF](https://github.com/shibatch/sleef/graphs/contributors).
|
||||
|
||||
## Maintainers
|
||||
|
||||
| Name | Affiliation | Github profile |
|
||||
| -------------------- | ----------------------- | ---------------------------------- |
|
||||
| Pierre Blanchard | Arm Ltd. | https://github.com/blapie |
|
||||
| Joana Cruz | Arm Ltd. | https://github.com/joanaxcruz |
|
||||
| Joe Ramsay | Arm Ltd. | https://github.com/joeramsay |
|
||||
| Naoki Shibata | Nara Institute of Science and Technology | https://github.com/shibatch |
|
||||
|
||||
## Contributors
|
||||
|
||||
| Name | Affiliation | Github profile |
|
||||
| -------------------- | ----------------------- | ---------------------------------- |
|
||||
| Anonymous | | https://github.com/friendlyanon |
|
||||
| Diana Bite | Former Arm Ltd. | https://github.com/diaena |
|
||||
| Ludovic Henry | Rivos Inc. | https://github.com/luhenry |
|
||||
| Martin Krastev | Chaos Group | https://github.com/blu |
|
||||
| Jilayne Lovejoy | Former Arm Inc. | https://github.com/jlovejoy |
|
||||
| Kerry McLaughlin | Arm Ltd. | https://github.com/kmclaughlin-arm |
|
||||
| Alexandre Mutel | Unity Technologies | https://github.com/xoofx |
|
||||
| Francesco Petrogalli | Former Arm Ltd. | https://github.com/fpetrogalli-arm |
|
||||
@@ -0,0 +1,860 @@
|
||||
include(CheckCCompilerFlag)
include(CheckCSourceCompiles)
include(CheckTypeSize)
include(CheckLanguage)

# Static test binaries: restrict library search to archives and link -static.
if (SLEEF_BUILD_STATIC_TEST_BINS)
  set(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
  set(BUILD_SHARED_LIBS OFF)
  set(CMAKE_EXE_LINKER_FLAGS "-static")
endif()

set(OPENSSL_EXTRA_LIBRARIES "" CACHE STRING "Extra libraries for openssl")
if (NOT CMAKE_CROSSCOMPILING AND NOT SLEEF_FORCE_FIND_PACKAGE_SSL)
  if (SLEEF_BUILD_STATIC_TEST_BINS)
    set(OPENSSL_USE_STATIC_LIBS TRUE)
  endif()
  find_package(OpenSSL)
  if (OPENSSL_FOUND)
    set(SLEEF_OPENSSL_FOUND TRUE)
    set(SLEEF_OPENSSL_LIBRARIES ${OPENSSL_LIBRARIES})
    # Work around for tester3 sig segv, when linking versions of openssl (1.1.1) statically.
    # This is a known issue https://github.com/openssl/openssl/issues/13872.
    if (SLEEF_BUILD_STATIC_TEST_BINS)
      string(REGEX REPLACE
        "-lpthread" "-Wl,--whole-archive -lpthread -Wl,--no-whole-archive"
        SLEEF_OPENSSL_LIBRARIES "${OPENSSL_LIBRARIES}")
    endif()
    set(SLEEF_OPENSSL_VERSION ${OPENSSL_VERSION})
    set(SLEEF_OPENSSL_LIBRARIES ${SLEEF_OPENSSL_LIBRARIES} ${OPENSSL_EXTRA_LIBRARIES})
    set(SLEEF_OPENSSL_INCLUDE_DIR ${OPENSSL_INCLUDE_DIR})
  endif()
else()
  # find_package cannot find OpenSSL when cross-compiling
  find_library(LIBSSL ssl)
  find_library(LIBCRYPTO crypto)
  if (LIBSSL AND LIBCRYPTO)
    set(SLEEF_OPENSSL_FOUND TRUE)
    set(SLEEF_OPENSSL_LIBRARIES ${LIBSSL} ${LIBCRYPTO} ${OPENSSL_EXTRA_LIBRARIES})
    set(SLEEF_OPENSSL_VERSION ${LIBSSL})
  endif()
endif()

if (SLEEF_ENFORCE_TESTER3 AND NOT SLEEF_OPENSSL_FOUND)
  message(FATAL_ERROR "SLEEF_ENFORCE_TESTER3 is specified and OpenSSL not found")
endif()
|
||||
|
||||
# Some toolchains require explicit linking of the libraries following.
find_library(LIB_MPFR mpfr)
find_library(LIBM m)
find_library(LIBGMP gmp)
find_library(LIBRT rt)
find_library(LIBFFTW3 fftw3)

if (LIB_MPFR)
  find_path(MPFR_INCLUDE_DIR
    NAMES mpfr.h
    ONLY_CMAKE_FIND_ROOT_PATH)
endif()

if (LIBFFTW3)
  find_path(FFTW3_INCLUDE_DIR
    NAMES fftw3.h
    ONLY_CMAKE_FIND_ROOT_PATH)
endif()

# Normalise "NOTFOUND" results to empty strings so these variables can be
# dropped straight into link lines.
if (NOT LIBM)
  set(LIBM "")
endif()

if (NOT LIBRT)
  set(LIBRT "")
endif()

# Explicit opt-outs override whatever was found above.
if (SLEEF_DISABLE_MPFR)
  set(LIB_MPFR "")
endif()

if (SLEEF_DISABLE_SSL)
  set(SLEEF_OPENSSL_FOUND FALSE)
endif()
|
||||
|
||||
# Force set default build type if none was specified
# Note: some sleef code requires the optimisation flags turned on
if(NOT CMAKE_BUILD_TYPE)
  message(STATUS "Setting build type to 'Release' (required for full support).")
  set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
    "Debug" "Release" "RelWithDebInfo" "MinSizeRel")
endif()

# Sanitizers
if(SLEEF_ASAN)
  # Add address sanitizing to all targets
  add_compile_options(-fno-omit-frame-pointer -fsanitize=address)
  add_link_options(-fno-omit-frame-pointer -fsanitize=address)
endif()

# TARGET PROCESSOR DETECTION
set(SLEEF_TARGET_PROCESSOR "${CMAKE_SYSTEM_PROCESSOR}")
if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "^(x86_64|arm64)$")
  set(SLEEF_TARGET_PROCESSOR "${CMAKE_OSX_ARCHITECTURES}")
endif()

# PLATFORM DETECTION
if(CMAKE_SIZEOF_VOID_P EQUAL 4)
  set(SLEEF_ARCH_32BIT ON CACHE INTERNAL "True for 32-bit architecture.")
endif()

if(SLEEF_TARGET_PROCESSOR MATCHES "(x86|AMD64|amd64|^i.86$)")
  set(SLEEF_ARCH_X86 ON CACHE INTERNAL "True for x86 architecture.")

  set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-mavx2;-mfma")
elseif(SLEEF_TARGET_PROCESSOR MATCHES "aarch64|arm64")
  set(SLEEF_ARCH_AARCH64 ON CACHE INTERNAL "True for Aarch64 architecture.")
  # Aarch64 requires support for advsimdfma4
  set(COMPILER_SUPPORTS_ADVSIMD 1)
  set(COMPILER_SUPPORTS_ADVSIMDNOFMA 1)

elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
  set(SLEEF_ARCH_AARCH32 ON CACHE INTERNAL "True for Aarch32 architecture.")
  set(COMPILER_SUPPORTS_NEON32 1)
  set(COMPILER_SUPPORTS_NEON32VFPV4 1)

  set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-mfpu=vfpv4")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64")
  set(SLEEF_ARCH_PPC64 ON CACHE INTERNAL "True for PPC64 architecture.")

  set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-mvsx")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x")
  set(SLEEF_ARCH_S390X ON CACHE INTERNAL "True for IBM Z architecture.")

  set(CLANG_FLAGS_ENABLE_PUREC_SCALAR "-march=z14;-mzvector")
  set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-march=z14;-mzvector")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
  set(SLEEF_ARCH_RISCV64 ON CACHE INTERNAL "True for RISCV64 architecture.")
endif()

# The portable pure-C scalar kernels are available on every platform.
set(COMPILER_SUPPORTS_PUREC_SCALAR 1)
set(COMPILER_SUPPORTS_PURECFMA_SCALAR 1)
|
||||
|
||||
# Compiler feature detection

# Detect CLANG executable path (on both Windows and Linux/OSX)
if(NOT CLANG_EXE_PATH)
  # If the current compiler used by CMAKE is already clang, use this one directly
  if(CMAKE_C_COMPILER MATCHES "clang")
    set(CLANG_EXE_PATH ${CMAKE_C_COMPILER})
  else()
    # Else we may find clang on the path?
    find_program(CLANG_EXE_PATH NAMES clang "clang-11" "clang-10" "clang-9" "clang-8" "clang-7" "clang-6.0" "clang-5.0" "clang-4.0" "clang-3.9")
  endif()
endif()

# Allow to define the Gcc/Clang here
# As we might compile the lib with MSVC, but generates bitcode with CLANG
# Intel vector extensions.
set(CLANG_FLAGS_ENABLE_SSE2 "-msse2")
set(CLANG_FLAGS_ENABLE_SSE4 "-msse4.1")
set(CLANG_FLAGS_ENABLE_AVX "-mavx")
set(CLANG_FLAGS_ENABLE_FMA4 "-mfma4")
set(CLANG_FLAGS_ENABLE_AVX2 "-mavx2;-mfma")
set(CLANG_FLAGS_ENABLE_AVX2128 "-mavx2;-mfma")
set(CLANG_FLAGS_ENABLE_AVX512F "-mavx512f")
set(CLANG_FLAGS_ENABLE_AVX512FNOFMA "-mavx512f")
set(CLANG_FLAGS_ENABLE_NEON32 "--target=arm-linux-gnueabihf;-mcpu=cortex-a8")
set(CLANG_FLAGS_ENABLE_NEON32VFPV4 "-march=armv7-a;-mfpu=neon-vfpv4")
# Arm AArch64 vector extensions.
set(CLANG_FLAGS_ENABLE_SVE "-march=armv8-a+sve")
set(CLANG_FLAGS_ENABLE_SVENOFMA "-march=armv8-a+sve")
# PPC64
set(CLANG_FLAGS_ENABLE_VSX "-mcpu=power8")
set(CLANG_FLAGS_ENABLE_VSXNOFMA "-mcpu=power8")
set(CLANG_FLAGS_ENABLE_VSX3 "-mcpu=power9")
set(CLANG_FLAGS_ENABLE_VSX3NOFMA "-mcpu=power9")
# IBM z
set(CLANG_FLAGS_ENABLE_VXE "-march=z14;-mzvector")
set(CLANG_FLAGS_ENABLE_VXENOFMA "-march=z14;-mzvector")
set(CLANG_FLAGS_ENABLE_VXE2 "-march=z15;-mzvector")
set(CLANG_FLAGS_ENABLE_VXE2NOFMA "-march=z15;-mzvector")
# RISC-V
set(CLANG_FLAGS_ENABLE_RVVM1 "-march=rv64gcv_zba_zbb_zbs")
set(CLANG_FLAGS_ENABLE_RVVM1NOFMA "-march=rv64gcv_zba_zbb_zbs")
set(CLANG_FLAGS_ENABLE_RVVM2 "-march=rv64gcv_zba_zbb_zbs")
set(CLANG_FLAGS_ENABLE_RVVM2NOFMA "-march=rv64gcv_zba_zbb_zbs")

set(FLAGS_OTHERS "")
|
||||
|
||||
# All variables storing compiler flags should be prefixed with FLAGS_
if(CMAKE_C_COMPILER_ID MATCHES "(GNU|Clang)")
  # Always compile sleef with -ffp-contract.
  set(FLAGS_STRICTMATH "-ffp-contract=off")
  set(FLAGS_FASTMATH "-ffast-math")
  set(FLAGS_NOSTRICTALIASING "-fno-strict-aliasing")

  if (SLEEF_ARCH_X86 AND SLEEF_ARCH_32BIT)
    string(CONCAT FLAGS_STRICTMATH ${FLAGS_STRICTMATH} " -msse2 -mfpmath=sse")
    string(CONCAT FLAGS_FASTMATH ${FLAGS_FASTMATH} " -msse2 -mfpmath=sse")
  endif()

  # Without the options below, gcc generates calls to libm
  string(CONCAT FLAGS_OTHERS "-fno-math-errno -fno-trapping-math")

  # Intel vector extensions.
  foreach(SIMD ${SLEEF_ALL_SUPPORTED_EXTENSIONS})
    set(FLAGS_ENABLE_${SIMD} ${CLANG_FLAGS_ENABLE_${SIMD}})
  endforeach()

  # Warning flags.
  set(FLAGS_WALL "-Wall -Wno-unused-function -Wno-attributes -Wno-unused-result")
  if(CMAKE_C_COMPILER_ID MATCHES "GNU")
    # The following compiler option is needed to suppress the warning
    # "AVX vector return without AVX enabled changes the ABI" at
    # src/arch/helpervecext.h:88
    string(CONCAT FLAGS_WALL ${FLAGS_WALL} " -Wno-psabi")
    set(FLAGS_ENABLE_NEON32 "-mfpu=neon")
  endif()

  if(CMAKE_C_COMPILER_ID MATCHES "Clang" AND SLEEF_ENABLE_LTO)
    if (NOT SLEEF_LLVM_AR_COMMAND)
      find_program(SLEEF_LLVM_AR_COMMAND "llvm-ar")
    endif()
    if (SLEEF_LLVM_AR_COMMAND)
      set(CMAKE_AR ${SLEEF_LLVM_AR_COMMAND})
      set(CMAKE_C_ARCHIVE_CREATE "<CMAKE_AR> rcs <TARGET> <LINK_FLAGS> <OBJECTS>")
      set(CMAKE_C_ARCHIVE_FINISH "true")
    endif()
    string(CONCAT FLAGS_OTHERS "-flto=thin")
  endif()

  # Flags for generating inline headers
  set(FLAG_PREPROCESS "-E")
  set(FLAG_PRESERVE_COMMENTS "-C")
  set(FLAG_INCLUDE "-I")
  set(FLAG_DEFINE "-D")

  if (SLEEF_CLANG_ON_WINDOWS)
    # The following line is required to prevent clang from displaying
    # many warnings. Clang on Windows references MSVC header files,
    # which have deprecation and security attributes for many
    # functions.

    string(CONCAT FLAGS_WALL ${FLAGS_WALL} " -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_DEPRECATE -Wno-deprecated-declarations")
  endif()
elseif(MSVC)
  # Intel vector extensions.
  if (CMAKE_CL_64)
    set(FLAGS_ENABLE_SSE2 /D__SSE2__)
    set(FLAGS_ENABLE_SSE4 /D__SSE2__ /D__SSE3__ /D__SSE4_1__)
  else()
    set(FLAGS_ENABLE_SSE2 /D__SSE2__ /arch:SSE2)
    set(FLAGS_ENABLE_SSE4 /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /arch:SSE2)
  endif()
  set(FLAGS_ENABLE_AVX /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /arch:AVX)
  set(FLAGS_ENABLE_FMA4 /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /D__AVX2__ /D__FMA4__ /arch:AVX2)
  set(FLAGS_ENABLE_AVX2 /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /D__AVX2__ /arch:AVX2)
  set(FLAGS_ENABLE_AVX2128 /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /D__AVX2__ /arch:AVX2)
  set(FLAGS_ENABLE_AVX512F /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /D__AVX2__ /D__AVX512F__ /arch:AVX2)
  set(FLAGS_ENABLE_AVX512FNOFMA /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /D__AVX2__ /D__AVX512F__ /arch:AVX2)
  set(FLAGS_ENABLE_PURECFMA_SCALAR /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /D__AVX2__ /arch:AVX2)
  set(FLAGS_WALL "/D_CRT_SECURE_NO_WARNINGS /D_CRT_NONSTDC_NO_DEPRECATE")

  set(FLAGS_NO_ERRNO "")

  set(FLAG_PREPROCESS "/E")
  set(FLAG_PRESERVE_COMMENTS "/C")
  set(FLAG_INCLUDE "/I")
  set(FLAG_DEFINE "/D")
elseif(CMAKE_C_COMPILER_ID MATCHES "Intel")
  set(FLAGS_ENABLE_SSE2 "-msse2")
  set(FLAGS_ENABLE_SSE4 "-msse4.1")
  set(FLAGS_ENABLE_AVX "-mavx")
  set(FLAGS_ENABLE_AVX2 "-march=core-avx2")
  set(FLAGS_ENABLE_AVX2128 "-march=core-avx2")
  set(FLAGS_ENABLE_AVX512F "-xCOMMON-AVX512")
  set(FLAGS_ENABLE_AVX512FNOFMA "-xCOMMON-AVX512")
  set(FLAGS_ENABLE_PURECFMA_SCALAR "-march=core-avx2;-fno-strict-aliasing")
  set(FLAGS_ENABLE_FMA4 "-msse2") # This is a dummy flag
  if(CMAKE_C_COMPILER_ID MATCHES "IntelLLVM")
    set(FLAGS_STRICTMATH "-fp-model strict -Qoption,cpp,--extended_float_types")
    set(FLAGS_FASTMATH "-fp-model fast -Qoption,cpp,--extended_float_types")
  else()
    set(FLAGS_STRICTMATH "-fp-model strict -Qoption,cpp,--extended_float_type")
    set(FLAGS_FASTMATH "-fp-model fast=2 -Qoption,cpp,--extended_float_type")
  endif()
  set(FLAGS_NOSTRICTALIASING "-fno-strict-aliasing")
  set(FLAGS_WALL "-fmax-errors=3 -Wall -Wno-unused -Wno-attributes")

  set(FLAGS_NO_ERRNO "")

  set(FLAG_PREPROCESS "-E")
  set(FLAG_PRESERVE_COMMENTS "-C")
  set(FLAG_INCLUDE "-I")
  set(FLAG_DEFINE "-D")
endif()
|
||||
|
||||
set(SLEEF_C_FLAGS "${FLAGS_WALL} ${FLAGS_STRICTMATH} ${FLAGS_OTHERS}")
# gcc 7 and newer miscompiles the DFT under -ffast-math, so drop it there.
if(CMAKE_C_COMPILER_ID MATCHES "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 6.99)
  set(DFT_C_FLAGS "${FLAGS_WALL} ${FLAGS_NOSTRICTALIASING} ${FLAGS_OTHERS}")
else()
  set(DFT_C_FLAGS "${FLAGS_WALL} ${FLAGS_NOSTRICTALIASING} ${FLAGS_FASTMATH} ${FLAGS_OTHERS}")
endif()

if(CMAKE_C_COMPILER_ID MATCHES "GNU")
  set(FLAGS_ENABLE_SVE "${FLAGS_ENABLE_SVE};-fno-tree-vrp")
endif()

# 32-bit x86 needs SSE math to get predictable floating-point results.
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^i.86$" AND CMAKE_C_COMPILER_ID MATCHES "GNU")
  set(SLEEF_C_FLAGS "${SLEEF_C_FLAGS} -msse2 -mfpmath=sse")
  set(DFT_C_FLAGS "${DFT_C_FLAGS} -msse2 -mfpmath=sse -m128bit-long-double")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^i.86$" AND CMAKE_C_COMPILER_ID MATCHES "Clang")
  set(SLEEF_C_FLAGS "${SLEEF_C_FLAGS} -msse2 -mfpmath=sse")
  set(DFT_C_FLAGS "${DFT_C_FLAGS} -msse2 -mfpmath=sse")
endif()

if(CYGWIN OR MINGW)
  set(SLEEF_C_FLAGS "${SLEEF_C_FLAGS} -fno-asynchronous-unwind-tables")
  set(DFT_C_FLAGS "${DFT_C_FLAGS} -fno-asynchronous-unwind-tables")
endif()

# Work around aarch64 code-generation issues in gcc (9.3, 10.2).
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" AND CMAKE_C_COMPILER_ID MATCHES "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 9.3 AND CMAKE_C_COMPILER_VERSION VERSION_LESS 10.2)
  set(SLEEF_C_FLAGS "${SLEEF_C_FLAGS} -fno-shrink-wrap -fno-tree-vrp")
  set(DFT_C_FLAGS "${DFT_C_FLAGS} -fno-shrink-wrap -fno-tree-vrp")
endif()
|
||||
|
||||
# FEATURE DETECTION

# Long double

option(SLEEF_DISABLE_LONG_DOUBLE "Disable long double" OFF)
option(SLEEF_ENFORCE_LONG_DOUBLE "Build fails if long double is not supported by the compiler" OFF)

if(NOT SLEEF_DISABLE_LONG_DOUBLE)
  CHECK_TYPE_SIZE("long double" LD_SIZE)
  if(LD_SIZE GREATER "9")
    # This is needed to check since internal compiler error occurs with gcc 4.x
    CHECK_C_SOURCE_COMPILES("
  typedef long double vlongdouble __attribute__((vector_size(sizeof(long double)*2)));
  vlongdouble vcast_vl_l(long double d) { return (vlongdouble) { d, d }; }
  int main() { vlongdouble vld = vcast_vl_l(0);
  }" COMPILER_SUPPORTS_LONG_DOUBLE)
  endif()
else()
  message(STATUS "Support for long double disabled by CMake option")
endif()

if (SLEEF_ENFORCE_LONG_DOUBLE AND NOT COMPILER_SUPPORTS_LONG_DOUBLE)
  message(FATAL_ERROR "SLEEF_ENFORCE_LONG_DOUBLE is specified and that feature is disabled or not supported by the compiler")
endif()

# float128

option(SLEEF_DISABLE_FLOAT128 "Disable float128" OFF)
option(SLEEF_ENFORCE_FLOAT128 "Build fails if float128 is not supported by the compiler" OFF)

if(NOT SLEEF_DISABLE_FLOAT128)
  CHECK_C_SOURCE_COMPILES("
  int main() { __float128 r = 1;
  }" COMPILER_SUPPORTS_FLOAT128)
else()
  message(STATUS "Support for float128 disabled by CMake option")
endif()

if (SLEEF_ENFORCE_FLOAT128 AND NOT COMPILER_SUPPORTS_FLOAT128)
  message(FATAL_ERROR "SLEEF_ENFORCE_FLOAT128 is specified and that feature is disabled or not supported by the compiler")
endif()

# quadmath.h availability is checked separately from bare __float128 support.
if(COMPILER_SUPPORTS_FLOAT128)
  CHECK_C_SOURCE_COMPILES("
  #include <quadmath.h>
  int main() { __float128 r = 1;
  }" COMPILER_SUPPORTS_QUADMATH)
endif()
|
||||
|
||||
# SSE2

option(SLEEF_DISABLE_SSE2 "Disable SSE2" OFF)
option(SLEEF_ENFORCE_SSE2 "Build fails if SSE2 is not supported by the compiler" OFF)

if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_SSE2)
  string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_SSE2}")
  CHECK_C_SOURCE_COMPILES("
  #if defined(_MSC_VER)
  #include <intrin.h>
  #else
  #include <x86intrin.h>
  #endif
  int main() {
    __m128d r = _mm_mul_pd(_mm_set1_pd(1), _mm_set1_pd(2)); }"
    COMPILER_SUPPORTS_SSE2)
endif()

if (SLEEF_ENFORCE_SSE2 AND NOT COMPILER_SUPPORTS_SSE2)
  message(FATAL_ERROR "SLEEF_ENFORCE_SSE2 is specified and that feature is disabled or not supported by the compiler")
endif()

# SSE 4.1

option(SLEEF_DISABLE_SSE4 "Disable SSE4" OFF)
option(SLEEF_ENFORCE_SSE4 "Build fails if SSE4 is not supported by the compiler" OFF)

if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_SSE4)
  string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_SSE4}")
  CHECK_C_SOURCE_COMPILES("
  #if defined(_MSC_VER)
  #include <intrin.h>
  #else
  #include <x86intrin.h>
  #endif
  int main() {
    __m128d r = _mm_floor_sd(_mm_set1_pd(1), _mm_set1_pd(2)); }"
    COMPILER_SUPPORTS_SSE4)
endif()

if (SLEEF_ENFORCE_SSE4 AND NOT COMPILER_SUPPORTS_SSE4)
  message(FATAL_ERROR "SLEEF_ENFORCE_SSE4 is specified and that feature is disabled or not supported by the compiler")
endif()
|
||||
|
||||
# AVX

# Fix: the first option below was previously declared as SLEEF_ENFORCE_AVX
# with the "Disable AVX" help string, duplicating the ENFORCE option and
# leaving SLEEF_DISABLE_AVX (tested in the guard below, and matching the
# DISABLE/ENFORCE pair used by every other extension) undeclared.
option(SLEEF_DISABLE_AVX "Disable AVX" OFF)
option(SLEEF_ENFORCE_AVX "Build fails if AVX is not supported by the compiler" OFF)

if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_AVX)
  string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_AVX}")
  CHECK_C_SOURCE_COMPILES("
  #if defined(_MSC_VER)
  #include <intrin.h>
  #else
  #include <x86intrin.h>
  #endif
  int main() {
    __m256d r = _mm256_add_pd(_mm256_set1_pd(1), _mm256_set1_pd(2));
  }" COMPILER_SUPPORTS_AVX)
endif()

if (SLEEF_ENFORCE_AVX AND NOT COMPILER_SUPPORTS_AVX)
  message(FATAL_ERROR "SLEEF_ENFORCE_AVX is specified and that feature is disabled or not supported by the compiler")
endif()
|
||||
|
||||
# FMA4

option(SLEEF_DISABLE_FMA4 "Disable FMA4" OFF)
option(SLEEF_ENFORCE_FMA4 "Build fails if FMA4 is not supported by the compiler" OFF)

if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_FMA4)
  string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_FMA4}")
  CHECK_C_SOURCE_COMPILES("
  #if defined(_MSC_VER)
  #include <intrin.h>
  #else
  #include <x86intrin.h>
  #endif
  int main() {
    __m256d r = _mm256_macc_pd(_mm256_set1_pd(1), _mm256_set1_pd(2), _mm256_set1_pd(3)); }"
    COMPILER_SUPPORTS_FMA4)
endif()

if (SLEEF_ENFORCE_FMA4 AND NOT COMPILER_SUPPORTS_FMA4)
  message(FATAL_ERROR "SLEEF_ENFORCE_FMA4 is specified and that feature is disabled or not supported by the compiler")
endif()

# AVX2

option(SLEEF_DISABLE_AVX2 "Disable AVX2" OFF)
option(SLEEF_ENFORCE_AVX2 "Build fails if AVX2 is not supported by the compiler" OFF)

if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_AVX2)
  string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_AVX2}")
  CHECK_C_SOURCE_COMPILES("
  #if defined(_MSC_VER)
  #include <intrin.h>
  #else
  #include <x86intrin.h>
  #endif
  int main() {
    __m256i r = _mm256_abs_epi32(_mm256_set1_epi32(1)); }"
    COMPILER_SUPPORTS_AVX2)

  # AVX2 implies AVX2128
  if(COMPILER_SUPPORTS_AVX2)
    set(COMPILER_SUPPORTS_AVX2128 1)
  endif()
endif()

if (SLEEF_ENFORCE_AVX2 AND NOT COMPILER_SUPPORTS_AVX2)
  message(FATAL_ERROR "SLEEF_ENFORCE_AVX2 is specified and that feature is disabled or not supported by the compiler")
endif()
|
||||
|
||||
# AVX512F

option(SLEEF_DISABLE_AVX512F "Disable AVX512F" OFF)
option(SLEEF_ENFORCE_AVX512F "Build fails if AVX512F is not supported by the compiler" OFF)

if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_AVX512F)
  string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_AVX512F}")
  CHECK_C_SOURCE_COMPILES("
  #if defined(_MSC_VER)
  #include <intrin.h>
  #else
  #include <x86intrin.h>
  #endif
  __m512 addConstant(__m512 arg) {
    return _mm512_add_ps(arg, _mm512_set1_ps(1.f));
  }
  int main() {
    __m512i a = _mm512_set1_epi32(1);
    __m256i ymm = _mm512_extracti64x4_epi64(a, 0);
    __mmask16 m = _mm512_cmp_epi32_mask(a, a, _MM_CMPINT_EQ);
    __m512i r = _mm512_andnot_si512(a, a); }"
    COMPILER_SUPPORTS_AVX512F)

  if (COMPILER_SUPPORTS_AVX512F)
    set(COMPILER_SUPPORTS_AVX512FNOFMA 1)
  endif()
endif()

if (SLEEF_ENFORCE_AVX512F AND NOT COMPILER_SUPPORTS_AVX512F)
  message(FATAL_ERROR "SLEEF_ENFORCE_AVX512F is specified and that feature is disabled or not supported by the compiler")
endif()

# SVE

option(SLEEF_DISABLE_SVE "Disable SVE" OFF)
option(SLEEF_ENFORCE_SVE "Build fails if SVE is not supported by the compiler" OFF)

# Darwin does not support SVE yet (see issue #474),
# therefore we disable SVE on Darwin systems.
if(SLEEF_ARCH_AARCH64 AND NOT SLEEF_DISABLE_SVE AND NOT CMAKE_SYSTEM_NAME STREQUAL "Darwin")
  string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_SVE}")
  CHECK_C_SOURCE_COMPILES("
  #include <arm_sve.h>
  int main() {
    svint32_t r = svdup_n_s32(1); }"
    COMPILER_SUPPORTS_SVE)

  if(COMPILER_SUPPORTS_SVE)
    set(COMPILER_SUPPORTS_SVENOFMA 1)
  endif()
endif()

if (SLEEF_ENFORCE_SVE AND NOT COMPILER_SUPPORTS_SVE)
  message(FATAL_ERROR "SLEEF_ENFORCE_SVE is specified and that feature is disabled or not supported by the compiler")
endif()
|
||||
|
||||
# VSX

option(SLEEF_DISABLE_VSX "Disable VSX" OFF)
option(SLEEF_ENFORCE_VSX "Build fails if VSX is not supported by the compiler" OFF)

if(SLEEF_ARCH_PPC64 AND NOT SLEEF_DISABLE_VSX)
  string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_VSX}")
  CHECK_C_SOURCE_COMPILES("
  #include <altivec.h>
  #ifndef __LITTLE_ENDIAN__
  #error \"Only VSX(ISA2.07) little-endian mode is supported \"
  #endif
  int main() {
    vector double d;
    vector unsigned char p = {
      4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11
    };
    d = vec_perm(d, d, p);
  }"
    COMPILER_SUPPORTS_VSX)

  if (COMPILER_SUPPORTS_VSX)
    set(COMPILER_SUPPORTS_VSXNOFMA 1)
  endif()
endif()

if (SLEEF_ENFORCE_VSX AND NOT COMPILER_SUPPORTS_VSX)
  message(FATAL_ERROR "SLEEF_ENFORCE_VSX is specified and that feature is disabled or not supported by the compiler")
endif()

# VSX3

option(SLEEF_DISABLE_VSX3 "Disable VSX3" OFF)
option(SLEEF_ENFORCE_VSX3 "Build fails if VSX3 is not supported by the compiler" OFF)

if(SLEEF_ARCH_PPC64 AND NOT SLEEF_DISABLE_VSX3)
  string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_VSX3}")
  CHECK_C_SOURCE_COMPILES("
  #include <altivec.h>
  #ifndef __LITTLE_ENDIAN__
  #error \"Only VSX3 little-endian mode is supported \"
  #endif
  int main() {
    static vector double d;
    static vector unsigned long long a, b;

    d = vec_insert_exp(a, b);
  }"
    COMPILER_SUPPORTS_VSX3)

  if (COMPILER_SUPPORTS_VSX3)
    set(COMPILER_SUPPORTS_VSX3NOFMA 1)
  endif()
endif()

if (SLEEF_ENFORCE_VSX3 AND NOT COMPILER_SUPPORTS_VSX3)
  message(FATAL_ERROR "SLEEF_ENFORCE_VSX3 is specified and that feature is disabled or not supported by the compiler")
endif()
|
||||
|
||||
# IBM Z

option(SLEEF_DISABLE_VXE "Disable VXE" OFF)
option(SLEEF_ENFORCE_VXE "Build fails if VXE is not supported by the compiler" OFF)

if(SLEEF_ARCH_S390X AND NOT SLEEF_DISABLE_VXE)
  string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_VXE}")
  CHECK_C_SOURCE_COMPILES("
  #include <vecintrin.h>
  int main() {
    __vector float d;
    d = vec_sqrt(d);
  }"
    COMPILER_SUPPORTS_VXE)

  if(COMPILER_SUPPORTS_VXE)
    set(COMPILER_SUPPORTS_VXENOFMA 1)
  endif()
endif()

if (SLEEF_ENFORCE_VXE AND NOT COMPILER_SUPPORTS_VXE)
  message(FATAL_ERROR "SLEEF_ENFORCE_VXE is specified and that feature is disabled or not supported by the compiler")
endif()

# VXE2

option(SLEEF_DISABLE_VXE2 "Disable VXE2" OFF)
option(SLEEF_ENFORCE_VXE2 "Build fails if VXE2 is not supported by the compiler" OFF)

if(SLEEF_ARCH_S390X AND NOT SLEEF_DISABLE_VXE2)
  string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_VXE2}")
  CHECK_C_SOURCE_COMPILES("
  #include <vecintrin.h>
  int main() {
    __vector float d;
    d = vec_sqrt(d);
  }"
    COMPILER_SUPPORTS_VXE2)

  if(COMPILER_SUPPORTS_VXE2)
    set(COMPILER_SUPPORTS_VXE2NOFMA 1)
  endif()
endif()

if (SLEEF_ENFORCE_VXE2 AND NOT COMPILER_SUPPORTS_VXE2)
  message(FATAL_ERROR "SLEEF_ENFORCE_VXE2 is specified and that feature is disabled or not supported by the compiler")
endif()
|
||||
|
||||
# RVVM1
|
||||
|
||||
option(SLEEF_DISABLE_RVVM1 "Disable RVVM1" OFF)
|
||||
option(SLEEF_ENFORCE_RVVM1 "Build fails if RVVM1 is not supported by the compiler" OFF)
|
||||
|
||||
if(SLEEF_ARCH_RISCV64 AND NOT SLEEF_DISABLE_RVVM1)
|
||||
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_RVVM1}")
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
#include <riscv_vector.h>
|
||||
int main() {
|
||||
vint32m1_t r = __riscv_vmv_v_x_i32m1(1, __riscv_vlenb() * 8 / 32); }"
|
||||
COMPILER_SUPPORTS_RVVM1)
|
||||
|
||||
if(COMPILER_SUPPORTS_RVVM1)
|
||||
set(COMPILER_SUPPORTS_RVVM1NOFMA 1)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (SLEEF_ENFORCE_RVVM1 AND NOT COMPILER_SUPPORTS_RVVM1)
|
||||
message(FATAL_ERROR "SLEEF_ENFORCE_RVVM1 is specified and that feature is disabled or not supported by the compiler")
|
||||
endif()
|
||||
|
||||
# RVVM2
|
||||
|
||||
option(SLEEF_DISABLE_RVVM2 "Disable RVVM2" OFF)
|
||||
option(SLEEF_ENFORCE_RVVM2 "Build fails if RVVM2 is not supported by the compiler" OFF)
|
||||
|
||||
if(SLEEF_ARCH_RISCV64 AND NOT SLEEF_DISABLE_RVVM2)
|
||||
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_RVVM2}")
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
#include <riscv_vector.h>
|
||||
int main() {
|
||||
vint32m2_t r = __riscv_vmv_v_x_i32m2(1, 2 * __riscv_vlenb() * 8 / 32); }"
|
||||
COMPILER_SUPPORTS_RVVM2)
|
||||
|
||||
if(COMPILER_SUPPORTS_RVVM2)
|
||||
set(COMPILER_SUPPORTS_RVVM2NOFMA 1)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (SLEEF_ENFORCE_RVVM2 AND NOT COMPILER_SUPPORTS_RVVM2)
|
||||
message(FATAL_ERROR "SLEEF_ENFORCE_RVVM2 is specified and that feature is disabled or not supported by the compiler")
|
||||
endif()
|
||||
|
||||
# CUDA
|
||||
|
||||
option(SLEEF_ENFORCE_CUDA "Build fails if CUDA is not supported" OFF)
|
||||
|
||||
if (SLEEF_ENFORCE_CUDA AND NOT CMAKE_CUDA_COMPILER)
|
||||
message(FATAL_ERROR "SLEEF_ENFORCE_CUDA is specified and that feature is disabled or not supported by the compiler")
|
||||
endif()
|
||||
|
||||
# OpenMP
|
||||
|
||||
option(SLEEF_DISABLE_OPENMP "Disable OPENMP" OFF)
|
||||
option(SLEEF_ENFORCE_OPENMP "Build fails if OPENMP is not supported by the compiler" OFF)
|
||||
|
||||
if(NOT SLEEF_DISABLE_OPENMP)
|
||||
find_package(OpenMP)
|
||||
# Check if compilation with OpenMP really succeeds
|
||||
# It might not succeed even though find_package(OpenMP) succeeds.
|
||||
if(OPENMP_FOUND)
|
||||
set (CMAKE_REQUIRED_FLAGS "${OpenMP_C_FLAGS}")
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
#include <stdio.h>
|
||||
int main() {
|
||||
int i;
|
||||
#pragma omp parallel for
|
||||
for(i=0;i < 10;i++) { putchar(0); }
|
||||
}"
|
||||
COMPILER_SUPPORTS_OPENMP)
|
||||
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
#pragma omp declare simd notinbranch
|
||||
double func(double x) { return x + 1; }
|
||||
double a[1024];
|
||||
int main() {
|
||||
#pragma omp parallel for simd
|
||||
for (int i = 0; i < 1024; i++) a[i] = func(a[i]);
|
||||
}
|
||||
"
|
||||
COMPILER_SUPPORTS_OMP_SIMD)
|
||||
endif(OPENMP_FOUND)
|
||||
else()
|
||||
message(STATUS "Support for OpenMP disabled by CMake option")
|
||||
endif()
|
||||
|
||||
if (SLEEF_ENFORCE_OPENMP AND NOT COMPILER_SUPPORTS_OPENMP)
|
||||
message(FATAL_ERROR "SLEEF_ENFORCE_OPENMP is specified and that feature is disabled or not supported by the compiler")
|
||||
endif()
|
||||
|
||||
# Weak aliases
|
||||
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
#if defined(__CYGWIN__)
|
||||
#define EXPORT __stdcall __declspec(dllexport)
|
||||
#else
|
||||
#define EXPORT
|
||||
#endif
|
||||
EXPORT int f(int a) {
|
||||
return a + 2;
|
||||
}
|
||||
EXPORT int g(int a) __attribute__((weak, alias(\"f\")));
|
||||
int main(void) {
|
||||
return g(2);
|
||||
}"
|
||||
COMPILER_SUPPORTS_WEAK_ALIASES)
|
||||
if (COMPILER_SUPPORTS_WEAK_ALIASES AND
|
||||
NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm" AND
|
||||
NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64" AND
|
||||
NOT SLEEF_CLANG_ON_WINDOWS AND
|
||||
NOT MINGW AND SLEEF_BUILD_GNUABI_LIBS)
|
||||
set(ENABLE_GNUABI ${COMPILER_SUPPORTS_WEAK_ALIASES})
|
||||
endif()
|
||||
|
||||
# Built-in math functions
|
||||
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
int main(void) {
|
||||
double a = __builtin_sqrt (2);
|
||||
float b = __builtin_sqrtf(2);
|
||||
}"
|
||||
COMPILER_SUPPORTS_BUILTIN_MATH)
|
||||
|
||||
# SYS_getrandom
|
||||
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
#define _GNU_SOURCE
|
||||
#include <unistd.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <linux/random.h>
|
||||
int main(void) {
|
||||
int i;
|
||||
syscall(SYS_getrandom, &i, sizeof(i), 0);
|
||||
}"
|
||||
COMPILER_SUPPORTS_SYS_GETRANDOM)
|
||||
|
||||
#
|
||||
|
||||
# Reset used flags
|
||||
set(CMAKE_REQUIRED_FLAGS)
|
||||
set(CMAKE_REQUIRED_LIBRARIES)
|
||||
|
||||
# Save the default C flags
|
||||
set(ORG_CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
|
||||
|
||||
##
|
||||
|
||||
# Check if sde64 command is available
|
||||
|
||||
find_program(SDE_COMMAND sde64)
|
||||
if (NOT SDE_COMMAND)
|
||||
find_program(SDE_COMMAND sde)
|
||||
endif()
|
||||
|
||||
# Check if armie command is available
|
||||
|
||||
find_program(ARMIE_COMMAND armie)
|
||||
if (NOT SVE_VECTOR_BITS)
|
||||
set(SVE_VECTOR_BITS 128)
|
||||
endif()
|
||||
|
||||
#
|
||||
|
||||
find_program(FILECHECK_COMMAND NAMES FileCheck FileCheck-11 FileCheck-10 FileCheck-9)
|
||||
|
||||
#
|
||||
|
||||
find_program(SED_COMMAND sed)
|
||||
|
||||
##
|
||||
|
||||
if(SLEEF_SHOW_ERROR_LOG)
|
||||
if (EXISTS ${PROJECT_BINARY_DIR}/CMakeFiles/CMakeError.log)
|
||||
file(READ ${PROJECT_BINARY_DIR}/CMakeFiles/CMakeError.log FILE_CONTENT)
|
||||
message("")
|
||||
message("")
|
||||
message("====== Content of CMakeError.log ======")
|
||||
message("")
|
||||
message("${FILE_CONTENT}")
|
||||
message("")
|
||||
message("======== End of CMakeError.log ========")
|
||||
message("")
|
||||
message("")
|
||||
endif()
|
||||
endif(SLEEF_SHOW_ERROR_LOG)
|
||||
|
||||
if (MSVC OR SLEEF_CLANG_ON_WINDOWS)
|
||||
set(COMPILER_SUPPORTS_OPENMP FALSE) # At this time, OpenMP is not supported on MSVC
|
||||
endif()
|
||||
|
||||
##
|
||||
|
||||
# Set common definitions
|
||||
|
||||
if (NOT BUILD_SHARED_LIBS)
|
||||
set(COMMON_TARGET_DEFINITIONS SLEEF_STATIC_LIBS=1)
|
||||
set(SLEEF_STATIC_LIBS 1)
|
||||
endif()
|
||||
|
||||
if (COMPILER_SUPPORTS_WEAK_ALIASES)
|
||||
set(COMMON_TARGET_DEFINITIONS ${COMMON_TARGET_DEFINITIONS} ENABLE_ALIAS=1)
|
||||
endif()
|
||||
|
||||
if (COMPILER_SUPPORTS_SYS_GETRANDOM)
|
||||
set(COMMON_TARGET_DEFINITIONS ${COMMON_TARGET_DEFINITIONS} ENABLE_SYS_getrandom=1)
|
||||
endif()
|
||||
@@ -0,0 +1,23 @@
|
||||
Boost Software License - Version 1.0 - August 17th, 2003
|
||||
|
||||
Permission is hereby granted, free of charge, to any person or organization
|
||||
obtaining a copy of the software and accompanying documentation covered by
|
||||
this license (the "Software") to use, reproduce, display, distribute,
|
||||
execute, and transmit the Software, and to prepare derivative works of the
|
||||
Software, and to permit third-parties to whom the Software is furnished to
|
||||
do so, all subject to the following:
|
||||
|
||||
The copyright notices in the Software and this entire statement, including
|
||||
the above license grant, this restriction and the following disclaimer,
|
||||
must be included in all copies of the Software, in whole or in part, and
|
||||
all derivative works of the Software, unless such copies or derivative
|
||||
works are solely in the form of machine-executable object code generated by
|
||||
a source language processor.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
|
||||
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
|
||||
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
|
||||
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
@@ -0,0 +1,221 @@
|
||||
# SLEEF
|
||||
|
||||

|
||||
[](https://ieeexplore.ieee.org/document/8936472)
|
||||
[](https://www.boost.org/LICENSE_1_0.txt)
|
||||

|
||||
[](https://spack.readthedocs.io/en/v0.16.2/package_list.html#sleef)
|
||||
[](https://sourceforge.net/projects/sleef/)
|
||||
|
||||
SLEEF is a library that implements vectorized versions of C standard math functions. This library also includes DFT subroutines.
|
||||
|
||||
- **Web Page:** [https://sleef.org/][webpage_url]
|
||||
- **Sources:** [https://github.com/shibatch/sleef][repo_url]
|
||||
|
||||
## Supported environment
|
||||
|
||||
### Test matrix
|
||||
|
||||
The following table summarises currently supported vector extensions, compilers and OS-es.
|
||||
|
||||
:green_circle: : Tested extensively in CI.
|
||||
|
||||
:yellow_circle: : Tested partially in CI.
|
||||
|
||||
:x: : Currently failing some tests in CI.
|
||||
|
||||
:white_circle: : Not tested in CI. Might have passed tests in previous CI framework.
|
||||
|
||||
[This issue](https://github.com/shibatch/sleef/issues/481) tracks progress on improving test coverage.
|
||||
Compilation of SLEEF on previously supported environments might still be safe, we just cannot verify it yet.
|
||||
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th colspan="2" rowspan="2"></th>
|
||||
<th colspan="9">OS/Compiler</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<th colspan="3">Linux</th>
|
||||
<th colspan="2">macOS</th>
|
||||
<th colspan="4">Windows</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Arch.</th>
|
||||
<th>Vector Extensions</th>
|
||||
<th>gcc</th><th>llvm</th><th>icc</th>
|
||||
<th>gcc</th><th>llvm</th>
|
||||
<th>gcc</th><th>llvm-gnu</th><th>llvm-msvc</th><th>msvc</th>
|
||||
</tr>
|
||||
<tr align="center"><th>x86_64</th><th>SSE2, SSE4,<br>AVX, AVX2, AVX512F</th>
|
||||
<td>:green_circle:</td><td>:green_circle:</td><td>:white_circle:</td>
|
||||
<td>:white_circle:</td><td>:green_circle:</td>
|
||||
<td>:white_circle:</td><td>:yellow_circle:</td><td>:white_circle:</td><td>:white_circle:</td>
|
||||
</tr>
|
||||
<tr align="center"><th>x86 32bit<br>(i386)</th><th>SSE</th>
|
||||
<td>:white_circle:</td><td>:white_circle:</td><td>:white_circle:</td>
|
||||
<td colspan="2">N/A</td>
|
||||
<td>:white_circle:</td><td>:white_circle:</td><td>:white_circle:</td><td>:white_circle:</td>
|
||||
</tr>
|
||||
<tr align="center"><th>AArch64<br>(arm)</th><th>Neon, SVE</th>
|
||||
<td>:green_circle:</td><td>:green_circle:</td><td>N/A</td>
|
||||
<td colspan="1">N/A</td><td>:green_circle:</td>
|
||||
<td colspan="1">N/A</td><td>:white_circle:</td><td>:white_circle:</td><td>:white_circle:</td>
|
||||
</tr>
|
||||
<tr align="center"><th>AArch32<br>(armhf)</th><th>NEON</th>
|
||||
<td>:green_circle:</td><td>:green_circle:</td><td>N/A</td>
|
||||
<td colspan="2">N/A</td>
|
||||
<td colspan="4">N/A</td>
|
||||
</tr>
|
||||
<tr align="center"><th>PowerPC<br>(ppc64el)</th><th>VSX, VSX3</th>
|
||||
<td>:green_circle:</td><td>:green_circle:</td><td>N/A</td>
|
||||
<td colspan="2">N/A</td>
|
||||
<td colspan="4">N/A</td>
|
||||
</tr>
|
||||
<tr align="center"><th>IBM/Z<br>(s390x)</th><th>VXE, VXE2</th>
|
||||
<td>:green_circle:</td><td>:green_circle:</td><td>N/A</td>
|
||||
<td colspan="2">N/A</td>
|
||||
<td colspan="4">N/A</td>
|
||||
</tr>
|
||||
<tr align="center"><th>RISC-V<br>(riscv64)</th><th>RVV1, RVV2</th>
|
||||
<td>N/A (14+)</td><td>:green_circle:</td><td>N/A</td>
|
||||
<td colspan="2">N/A</td>
|
||||
<td colspan="4">N/A</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
### Component support
|
||||
|
||||
The above table is valid for libm in single, double and quadruple precision, as well as fast Discrete Fourier Transform (DFT).
|
||||
|
||||
Generation of inline headers is also supported for most vector extensions.
|
||||
|
||||
LTO is not tested in CI yet, except on Windows.
|
||||
|
||||
### Compiler support
|
||||
|
||||
Results are displayed for gcc 11 and llvm 17, the compiler versions used in CI tests with GitHub Actions.
|
||||
|
||||
Older versions should be supported too, while newer ones are either not tested or have known issues.
|
||||
|
||||
Some compiler versions simply do not support certain vector extensions, for instance SVE is only supported for gcc version 9 onwards.
|
||||
|
||||
Similarly, the RISC-V interface in SLEEF is based on version 1.0 of the intrinsics, which is only supported from llvm version 17 and gcc version 14 onwards.
|
||||
|
||||
Toolchain files provide some information on supported compiler versions.
|
||||
|
||||
### OS support
|
||||
|
||||
Only Linux distributions and macOS are fully tested in CI and thus officially supported.
|
||||
|
||||
Building SLEEF for Windows on x86 machines was officially supported ( :white_circle: ), as of 3.5.1,
|
||||
however it is only partially tested due to [known limitations of the test suite with MinGW or MSYS2](https://github.com/shibatch/sleef/issues/544).
|
||||
As a result tests for Windows on x86 only include DFT for now (other tests are disabled in build system),
|
||||
but all components are built.
|
||||
|
||||
Support for iOS and Android is only preliminary on AArch64.
|
||||
|
||||
SVE is not supported on Darwin-based system and therefore automatically disabled by SLEEF on Darwin.
|
||||
|
||||
### More on supported environment
|
||||
|
||||
Refer to our web page for [more on supported environment][supported_env_url].
|
||||
|
||||
## Install SLEEF dependencies
|
||||
|
||||
The library itself does not have any additional dependency.
|
||||
|
||||
However some tests require:
|
||||
|
||||
- libssl and libcrypto, that can be provided by installing openssl.
|
||||
- libm, libgmp and libmpfr
|
||||
- libfftw.
|
||||
|
||||
These tests can be disabled if necessary.
|
||||
|
||||
## How to build SLEEF
|
||||
|
||||
We recommend relying on CMake as much as possible in the build process to ensure portability.
|
||||
**CMake 3.18+** is the minimum required.
|
||||
|
||||
1. Check out the source code from our GitHub repository
|
||||
|
||||
```
|
||||
git clone https://github.com/shibatch/sleef
|
||||
```
|
||||
|
||||
2. Make a separate directory to create an out-of-source build
|
||||
|
||||
```
|
||||
cd sleef && mkdir build
|
||||
```
|
||||
|
||||
3. Run cmake to configure the project
|
||||
|
||||
```
|
||||
cmake -S . -B build
|
||||
```
|
||||
|
||||
By default this will generate shared libraries. In order to generate static libraries, pass option `-DBUILD_SHARED_LIBS=OFF`.
|
||||
|
||||
For more verbose output add option `-DSLEEF_SHOW_CONFIG=ON`.
|
||||
|
||||
4. Run make to build the project
|
||||
|
||||
```
|
||||
cmake --build build -j --clean-first
|
||||
```
|
||||
|
||||
5. Run tests using ctests
|
||||
|
||||
```
|
||||
ctest --test-dir build -j
|
||||
```
|
||||
|
||||
For more detailed build instructions please refer to the [dedicated section on CMake](./docs/build-with-cmake.md) or to [our web page][build_info_url].
|
||||
|
||||
## Install SLEEF
|
||||
|
||||
### From source
|
||||
|
||||
Assuming following instructions were followed.
|
||||
|
||||
6. Install to specified directory `<prefix>`
|
||||
|
||||
```
|
||||
cmake --install build --prefix=<prefix>
|
||||
```
|
||||
|
||||
### Using Spack
|
||||
|
||||
SLEEF can also be directly installed using Spack.
|
||||
|
||||
```
|
||||
spack install sleef@master
|
||||
```
|
||||
|
||||
### Uninstall
|
||||
|
||||
In order to uninstall SLEEF library and headers run
|
||||
|
||||
```
|
||||
sudo xargs rm -v < build/install_manifest.txt
|
||||
```
|
||||
|
||||
## License
|
||||
|
||||
The software is distributed under the Boost Software License, Version 1.0.
|
||||
See accompanying file [LICENSE.txt](./LICENSE.txt) or copy at [http://www.boost.org/LICENSE_1_0.txt][license_url].
|
||||
Contributions to this project are accepted under the same license.
|
||||
|
||||
Copyright © 2010-2024 SLEEF Project, Naoki Shibata and contributors.<br/>
|
||||
|
||||
|
||||
<!-- Repository links -->
|
||||
|
||||
[webpage_url]: https://sleef.org/
|
||||
[build_info_url]: https://sleef.org/compile.xhtml
|
||||
[supported_env_url]: https://sleef.org/index.xhtml#environment
|
||||
[repo_url]: https://github.com/shibatch/sleef
|
||||
[repo_license_url]: https://github.com/shibatch/sleef/blob/main/LICENSE.txt
|
||||
[license_url]: http://www.boost.org/LICENSE_1_0.txt
|
||||
@@ -0,0 +1,71 @@
|
||||
#ifndef __SLEEFDFT_H__
|
||||
#define __SLEEFDFT_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"
|
||||
{
|
||||
#endif
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#define SLEEF_MODE_FORWARD (0 << 0)
|
||||
#define SLEEF_MODE_BACKWARD (1 << 0)
|
||||
|
||||
#define SLEEF_MODE_COMPLEX (0 << 1)
|
||||
#define SLEEF_MODE_REAL (1 << 1)
|
||||
|
||||
#define SLEEF_MODE_ALT (1 << 2)
|
||||
#define SLEEF_MODE_FFTWCOMPAT (1 << 3)
|
||||
|
||||
#define SLEEF_MODE_DEBUG (1 << 10)
|
||||
#define SLEEF_MODE_VERBOSE (1 << 11)
|
||||
#define SLEEF_MODE_NO_MT (1 << 12)
|
||||
|
||||
#define SLEEF_MODE_ESTIMATE (1 << 20)
|
||||
#define SLEEF_MODE_MEASURE (2 << 20)
|
||||
|
||||
#if (defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) || defined(_MSC_VER)) && !defined(SLEEF_STATIC_LIBS)
|
||||
#ifdef IMPORT_IS_EXPORT
|
||||
#define IMPORT __declspec(dllexport)
|
||||
#else // #ifdef IMPORT_IS_EXPORT
|
||||
#define IMPORT __declspec(dllimport)
|
||||
#if (defined(_MSC_VER))
|
||||
#pragma comment(lib,"sleefdft.lib")
|
||||
#endif // #if (defined(_MSC_VER))
|
||||
#endif // #ifdef IMPORT_IS_EXPORT
|
||||
#else // #if (defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) || defined(_MSC_VER)) && !defined(SLEEF_STATIC_LIBS)
|
||||
#define IMPORT
|
||||
#endif // #if (defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) || defined(_MSC_VER)) && !defined(SLEEF_STATIC_LIBS)
|
||||
|
||||
IMPORT struct SleefDFT *SleefDFT_double_init1d(uint32_t n, const double *in, double *out, uint64_t mode);
|
||||
IMPORT struct SleefDFT *SleefDFT_double_init2d(uint32_t n, uint32_t m, const double *in, double *out, uint64_t mode);
|
||||
IMPORT void SleefDFT_double_execute(struct SleefDFT *ptr, const double *in, double *out);
|
||||
|
||||
IMPORT struct SleefDFT *SleefDFT_float_init1d(uint32_t n, const float *in, float *out, uint64_t mode);
|
||||
IMPORT struct SleefDFT *SleefDFT_float_init2d(uint32_t n, uint32_t m, const float *in, float *out, uint64_t mode);
|
||||
IMPORT void SleefDFT_float_execute(struct SleefDFT *ptr, const float *in, float *out);
|
||||
|
||||
IMPORT void SleefDFT_dispose(struct SleefDFT *ptr);
|
||||
|
||||
IMPORT void SleefDFT_setPath(struct SleefDFT *ptr, char *pathStr);
|
||||
|
||||
//
|
||||
|
||||
IMPORT void SleefDFT_setPlanFilePath(const char *path, const char *arch, uint64_t mode);
|
||||
|
||||
#define SLEEF_PLAN_AUTOMATIC 0
|
||||
#define SLEEF_PLAN_READONLY (1 << 0)
|
||||
#define SLEEF_PLAN_RESET (1 << 1)
|
||||
#define SLEEF_PLAN_BUILDALLPLAN (1 << 2)
|
||||
#define SLEEF_PLAN_NOLOCK (1 << 3)
|
||||
#define SLEEF_PLAN_MEASURE (1 << 29)
|
||||
#define SLEEF_PLAN_REFERTOENVVAR (1 << 30)
|
||||
|
||||
#undef IMPORT
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // #ifndef __SLEEFDFT_H__
|
||||
@@ -0,0 +1,11 @@
|
||||
// Configuration of @PROJECT_NAME@ /////////////////////////////////////////////
|
||||
|
||||
#ifndef SLEEF_CONFIG_H
|
||||
#define SLEEF_CONFIG_H
|
||||
|
||||
#define SLEEF_VERSION_MAJOR @SLEEF_VERSION_MAJOR@
|
||||
#define SLEEF_VERSION_MINOR @SLEEF_VERSION_MINOR@
|
||||
|
||||
#cmakedefine SLEEF_STATIC_LIBS
|
||||
|
||||
#endif // SLEEF_CONFIG_H
|
||||
@@ -0,0 +1 @@
|
||||
include("${CMAKE_CURRENT_LIST_DIR}/sleefTargets.cmake")
|
||||
@@ -0,0 +1,22 @@
|
||||
include_directories("common")
|
||||
include_directories("arch")
|
||||
|
||||
add_subdirectory("libm")
|
||||
if (SLEEF_BUILD_TESTS AND NOT MINGW)
|
||||
add_subdirectory("libm-tester")
|
||||
endif()
|
||||
add_subdirectory("common")
|
||||
|
||||
if (SLEEF_BUILD_DFT)
|
||||
add_subdirectory("dft")
|
||||
if (SLEEF_BUILD_TESTS)
|
||||
add_subdirectory("dft-tester")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (SLEEF_BUILD_QUAD)
|
||||
add_subdirectory("quad")
|
||||
if (SLEEF_BUILD_TESTS AND NOT MINGW)
|
||||
add_subdirectory("quad-tester")
|
||||
endif()
|
||||
endif()
|
||||
@@ -0,0 +1,837 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright ARM Ltd. 2010 - 2024. */
|
||||
/* Distributed under the Boost Software License, Version 1.0. */
|
||||
/* (See accompanying file LICENSE.txt or copy at */
|
||||
/* http://www.boost.org/LICENSE_1_0.txt) */
|
||||
/*********************************************************************/
|
||||
|
||||
#if !defined(__ARM_NEON) && !defined(SLEEF_GENHEADER)
|
||||
#error Please specify advsimd flags.
|
||||
#endif
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
#include <arm_neon.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "misc.h"
|
||||
#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
#define ENABLE_DP
|
||||
//@#define ENABLE_DP
|
||||
#define LOG2VECTLENDP 1
|
||||
//@#define LOG2VECTLENDP 1
|
||||
#define VECTLENDP (1 << LOG2VECTLENDP)
|
||||
//@#define VECTLENDP (1 << LOG2VECTLENDP)
|
||||
|
||||
#define ENABLE_SP
|
||||
//@#define ENABLE_SP
|
||||
#define LOG2VECTLENSP 2
|
||||
//@#define LOG2VECTLENSP 2
|
||||
#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
//@#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
|
||||
#if CONFIG == 1
|
||||
#define ENABLE_FMA_DP
|
||||
//@#define ENABLE_FMA_DP
|
||||
#define ENABLE_FMA_SP
|
||||
//@#define ENABLE_FMA_SP
|
||||
#endif
|
||||
|
||||
#define FULL_FP_ROUNDING
|
||||
//@#define FULL_FP_ROUNDING
|
||||
#define ACCURATE_SQRT
|
||||
//@#define ACCURATE_SQRT
|
||||
|
||||
#define ISANAME "AArch64 AdvSIMD"
|
||||
|
||||
// Mask definition
|
||||
typedef uint32x4_t vmask;
|
||||
typedef uint32x4_t vopmask;
|
||||
|
||||
// Single precision definitions
|
||||
typedef float32x4_t vfloat;
|
||||
typedef int32x4_t vint2;
|
||||
|
||||
// Double precision definitions
|
||||
typedef float64x2_t vdouble;
|
||||
typedef int32x2_t vint;
|
||||
|
||||
typedef int64x2_t vint64;
|
||||
typedef uint64x2_t vuint64;
|
||||
|
||||
typedef struct {
|
||||
vmask x, y;
|
||||
} vquad;
|
||||
|
||||
typedef vquad vargquad;
|
||||
|
||||
#define DFTPRIORITY 10
|
||||
|
||||
static INLINE int vavailability_i(int name) { return 3; }
|
||||
static INLINE void vprefetch_v_p(const void *ptr) { }
|
||||
|
||||
static INLINE VECTOR_CC int vtestallones_i_vo32(vopmask g) {
|
||||
uint32x2_t x0 = vand_u32(vget_low_u32(g), vget_high_u32(g));
|
||||
uint32x2_t x1 = vpmin_u32(x0, x0);
|
||||
return vget_lane_u32(x1, 0);
|
||||
}
|
||||
|
||||
static INLINE VECTOR_CC int vtestallones_i_vo64(vopmask g) {
|
||||
uint32x2_t x0 = vand_u32(vget_low_u32(g), vget_high_u32(g));
|
||||
uint32x2_t x1 = vpmin_u32(x0, x0);
|
||||
return vget_lane_u32(x1, 0);
|
||||
}
|
||||
|
||||
// Vector load / store
|
||||
static INLINE VECTOR_CC vdouble vload_vd_p(const double *ptr) { return vld1q_f64(ptr); }
|
||||
static INLINE VECTOR_CC vdouble vloadu_vd_p(const double *ptr) { return vld1q_f64(ptr); }
|
||||
static INLINE VECTOR_CC void vstore_v_p_vd(double *ptr, vdouble v) { vst1q_f64(ptr, v); }
|
||||
static INLINE VECTOR_CC void vstoreu_v_p_vd(double *ptr, vdouble v) { vst1q_f64(ptr, v); }
|
||||
static INLINE VECTOR_CC vfloat vload_vf_p(const float *ptr) { return vld1q_f32(ptr); }
|
||||
static INLINE VECTOR_CC vfloat vloadu_vf_p(const float *ptr) { return vld1q_f32(ptr); }
|
||||
static INLINE VECTOR_CC void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }
|
||||
static INLINE VECTOR_CC void vstoreu_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }
|
||||
static INLINE VECTOR_CC vint2 vloadu_vi2_p(int32_t *p) { return vld1q_s32(p); }
|
||||
static INLINE VECTOR_CC void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
|
||||
static INLINE VECTOR_CC vint vloadu_vi_p(int32_t *p) { return vld1_s32(p); }
|
||||
static INLINE VECTOR_CC void vstoreu_v_p_vi(int32_t *p, vint v) { vst1_s32(p, v); }
|
||||
|
||||
static INLINE VECTOR_CC vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
|
||||
return ((vdouble) { ptr[vget_lane_s32(vi, 0)], ptr[vget_lane_s32(vi, 1)]} );
|
||||
}
|
||||
|
||||
static INLINE VECTOR_CC vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
|
||||
return ((vfloat) {
|
||||
ptr[vgetq_lane_s32(vi2, 0)],
|
||||
ptr[vgetq_lane_s32(vi2, 1)],
|
||||
ptr[vgetq_lane_s32(vi2, 2)],
|
||||
ptr[vgetq_lane_s32(vi2, 3)]
|
||||
});
|
||||
}
|
||||
|
||||
// Basic logical operations for mask
|
||||
static INLINE VECTOR_CC vmask vand_vm_vm_vm(vmask x, vmask y) { return vandq_u32(x, y); }
|
||||
static INLINE VECTOR_CC vmask vandnot_vm_vm_vm(vmask x, vmask y) {
|
||||
return vbicq_u32(y, x);
|
||||
}
|
||||
static INLINE VECTOR_CC vmask vor_vm_vm_vm(vmask x, vmask y) { return vorrq_u32(x, y); }
|
||||
static INLINE VECTOR_CC vmask vxor_vm_vm_vm(vmask x, vmask y) { return veorq_u32(x, y); }
|
||||
|
||||
// Mask <--> single precision reinterpret
|
||||
static INLINE VECTOR_CC vmask vreinterpret_vm_vf(vfloat vf) {
|
||||
return vreinterpretq_u32_f32(vf);
|
||||
}
|
||||
static INLINE VECTOR_CC vfloat vreinterpret_vf_vm(vmask vm) {
|
||||
return vreinterpretq_f32_u32(vm);
|
||||
}
|
||||
static INLINE VECTOR_CC vint2 vcast_vi2_vm(vmask vm) { return vreinterpretq_s32_u32(vm); }
|
||||
static INLINE VECTOR_CC vmask vcast_vm_vi2(vint2 vi) { return vreinterpretq_u32_s32(vi); }
|
||||
|
||||
// Mask <--> double precision reinterpret
|
||||
static INLINE VECTOR_CC vmask vreinterpret_vm_vd(vdouble vd) {
|
||||
return vreinterpretq_u32_f64(vd);
|
||||
}
|
||||
static INLINE VECTOR_CC vdouble vreinterpret_vd_vm(vmask vm) {
|
||||
return vreinterpretq_f64_u32(vm);
|
||||
}
|
||||
static INLINE VECTOR_CC vfloat vreinterpret_vf_vi2(vint2 vm) {
|
||||
return vreinterpretq_f32_s32(vm);
|
||||
}
|
||||
static INLINE VECTOR_CC vint2 vreinterpret_vi2_vf(vfloat vf) {
|
||||
return vreinterpretq_s32_f32(vf);
|
||||
}
|
||||
|
||||
/****************************************/
|
||||
/* Single precision FP operations */
|
||||
/****************************************/
|
||||
// Broadcast
|
||||
static INLINE VECTOR_CC vfloat vcast_vf_f(float f) { return vdupq_n_f32(f); }
|
||||
|
||||
// Add, Sub, Mul
|
||||
static INLINE VECTOR_CC vfloat vadd_vf_vf_vf(vfloat x, vfloat y) {
|
||||
return vaddq_f32(x, y);
|
||||
}
|
||||
static INLINE VECTOR_CC vfloat vsub_vf_vf_vf(vfloat x, vfloat y) {
|
||||
return vsubq_f32(x, y);
|
||||
}
|
||||
static INLINE VECTOR_CC vfloat vmul_vf_vf_vf(vfloat x, vfloat y) {
|
||||
return vmulq_f32(x, y);
|
||||
}
|
||||
|
||||
// |x|, -x
|
||||
static INLINE VECTOR_CC vfloat vabs_vf_vf(vfloat f) { return vabsq_f32(f); }
|
||||
static INLINE VECTOR_CC vfloat vneg_vf_vf(vfloat f) { return vnegq_f32(f); }
|
||||
|
||||
#if CONFIG == 1
|
||||
// Multiply accumulate: z = z + x * y
|
||||
static INLINE VECTOR_CC vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {
|
||||
return vfmaq_f32(z, x, y);
|
||||
}
|
||||
// Multiply subtract: z = z - x * y
|
||||
static INLINE VECTOR_CC vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {
|
||||
return vfmsq_f32(z, x, y);
|
||||
}
|
||||
// Multiply subtract: z = x * y - z
|
||||
static INLINE VECTOR_CC vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {
|
||||
return vneg_vf_vf(vfmsq_f32(z, x, y));
|
||||
}
|
||||
#else
|
||||
static INLINE VECTOR_CC vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
|
||||
static INLINE VECTOR_CC vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }
|
||||
static INLINE VECTOR_CC vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
|
||||
#endif
|
||||
|
||||
static INLINE VECTOR_CC vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z + x * y
|
||||
return vfmaq_f32(z, x, y);
|
||||
}
|
||||
|
||||
static INLINE VECTOR_CC vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z - x * y
|
||||
return vfmsq_f32(z, x, y);
|
||||
}
|
||||
|
||||
static INLINE VECTOR_CC vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // x * y - z
|
||||
return vfma_vf_vf_vf_vf(x, y, vneg_vf_vf(z));
|
||||
}
|
||||
|
||||
// Reciprocal 1/x, Division, Square root
|
||||
static INLINE VECTOR_CC vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) {
|
||||
#ifndef SLEEF_ENABLE_ALTDIV
|
||||
return vdivq_f32(n, d);
|
||||
#else
|
||||
// Finite numbers (including denormal) only, gives mostly correctly rounded result
|
||||
float32x4_t t, u, x, y;
|
||||
uint32x4_t i0, i1;
|
||||
i0 = vandq_u32(vreinterpretq_u32_f32(n), vdupq_n_u32(0x7c000000));
|
||||
i1 = vandq_u32(vreinterpretq_u32_f32(d), vdupq_n_u32(0x7c000000));
|
||||
i0 = vsubq_u32(vdupq_n_u32(0x7d000000), vshrq_n_u32(vaddq_u32(i0, i1), 1));
|
||||
t = vreinterpretq_f32_u32(i0);
|
||||
y = vmulq_f32(d, t);
|
||||
x = vmulq_f32(n, t);
|
||||
t = vrecpeq_f32(y);
|
||||
t = vmulq_f32(t, vrecpsq_f32(y, t));
|
||||
t = vmulq_f32(t, vrecpsq_f32(y, t));
|
||||
u = vmulq_f32(x, t);
|
||||
u = vfmaq_f32(u, vfmsq_f32(x, y, u), t);
|
||||
return u;
|
||||
#endif
|
||||
}
|
||||
static INLINE VECTOR_CC vfloat vrec_vf_vf(vfloat d) {
|
||||
#ifndef SLEEF_ENABLE_ALTDIV
|
||||
return vdiv_vf_vf_vf(vcast_vf_f(1.0f), d);
|
||||
#else
|
||||
return vbslq_f32(vceqq_f32(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)),
|
||||
vcast_vf_f(0), vdiv_vf_vf_vf(vcast_vf_f(1.0f), d));
|
||||
#endif
|
||||
}
|
||||
|
||||
static INLINE VECTOR_CC vfloat vsqrt_vf_vf(vfloat d) {
|
||||
#ifndef SLEEF_ENABLE_ALTSQRT
|
||||
return vsqrtq_f32(d);
|
||||
#else
|
||||
// Gives correctly rounded result for all input range
|
||||
vfloat w, x, y, z;
|
||||
|
||||
y = vrsqrteq_f32(d);
|
||||
x = vmul_vf_vf_vf(d, y); w = vmul_vf_vf_vf(vcast_vf_f(0.5), y);
|
||||
y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(0.5));
|
||||
x = vfma_vf_vf_vf_vf(x, y, x); w = vfma_vf_vf_vf_vf(w, y, w);
|
||||
|
||||
y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(1.5)); w = vadd_vf_vf_vf(w, w);
|
||||
w = vmul_vf_vf_vf(w, y);
|
||||
x = vmul_vf_vf_vf(w, d);
|
||||
y = vfmapn_vf_vf_vf_vf(w, d, x); z = vfmanp_vf_vf_vf_vf(w, x, vcast_vf_f(1));
|
||||
z = vfmanp_vf_vf_vf_vf(w, y, z); w = vmul_vf_vf_vf(vcast_vf_f(0.5), x);
|
||||
w = vfma_vf_vf_vf_vf(w, z, y);
|
||||
w = vadd_vf_vf_vf(w, x);
|
||||
|
||||
return vbslq_f32(vorrq_u32(vceqq_f32(d, vcast_vf_f(0)),
|
||||
vceqq_f32(d, vcast_vf_f(SLEEF_INFINITYf))), d, w);
|
||||
#endif
|
||||
}
|
||||
|
||||
// max, min
static INLINE VECTOR_CC vfloat vmax_vf_vf_vf(vfloat x, vfloat y) {
  return vmaxq_f32(x, y);
}
static INLINE VECTOR_CC vfloat vmin_vf_vf_vf(vfloat x, vfloat y) {
  return vminq_f32(x, y);
}

// Comparisons
// Each returns a per-lane all-ones/all-zeros 32-bit mask.
static INLINE VECTOR_CC vmask veq_vm_vf_vf(vfloat x, vfloat y) { return vceqq_f32(x, y); }
static INLINE VECTOR_CC vmask vneq_vm_vf_vf(vfloat x, vfloat y) {
  // NEON has no "compare not equal"; invert the equality mask.
  return vmvnq_u32(vceqq_f32(x, y));
}
static INLINE VECTOR_CC vmask vlt_vm_vf_vf(vfloat x, vfloat y) { return vcltq_f32(x, y); }
static INLINE VECTOR_CC vmask vle_vm_vf_vf(vfloat x, vfloat y) { return vcleq_f32(x, y); }
static INLINE VECTOR_CC vmask vgt_vm_vf_vf(vfloat x, vfloat y) { return vcgtq_f32(x, y); }
static INLINE VECTOR_CC vmask vge_vm_vf_vf(vfloat x, vfloat y) { return vcgeq_f32(x, y); }

// Conditional select
// Per-bit select: result bit comes from x where mask bit is 1, else from y.
static INLINE VECTOR_CC vfloat vsel_vf_vm_vf_vf(vmask mask, vfloat x, vfloat y) {
  return vbslq_f32(mask, x, y);
}

// int <--> float conversions
// Truncation rounds toward zero.
static INLINE VECTOR_CC vint2 vtruncate_vi2_vf(vfloat vf) { return vcvtq_s32_f32(vf); }
static INLINE VECTOR_CC vfloat vcast_vf_vi2(vint2 vi) { return vcvtq_f32_s32(vi); }
static INLINE VECTOR_CC vint2 vcast_vi2_i(int i) { return vdupq_n_s32(i); }
static INLINE VECTOR_CC vint2 vrint_vi2_vf(vfloat d) {
  // Round to nearest (ties to even) first, then convert.
  return vcvtq_s32_f32(vrndnq_f32(d));
}
|
||||
|
||||
/***************************************/
|
||||
/* Single precision integer operations */
|
||||
/***************************************/
|
||||
|
||||
// Add, Sub, Neg (-x)
static INLINE VECTOR_CC vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) {
  return vaddq_s32(x, y);
}
static INLINE VECTOR_CC vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) {
  return vsubq_s32(x, y);
}
static INLINE VECTOR_CC vint2 vneg_vi2_vi2(vint2 e) { return vnegq_s32(e); }

// Logical operations
static INLINE VECTOR_CC vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) {
  return vandq_s32(x, y);
}
// "andnot" is (~x) & y; note vbicq computes first-arg & ~second-arg,
// hence the swapped operand order.
static INLINE VECTOR_CC vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) {
  return vbicq_s32(y, x);
}
static INLINE VECTOR_CC vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) {
  return vorrq_s32(x, y);
}
static INLINE VECTOR_CC vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) {
  return veorq_s32(x, y);
}

// Shifts
// These must be macros: the NEON shift-by-immediate intrinsics require a
// compile-time constant count. The "//@#" copies are consumed by SLEEF's
// header generator and must mirror the real definitions exactly.
#define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c)
//@#define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c)
// Logical (zero-fill) right shift: go through unsigned.
#define vsrl_vi2_vi2_i(x, c) \
  vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c))
//@#define vsrl_vi2_vi2_i(x, c) vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c))

// Arithmetic (sign-extending) right shift.
#define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c)
//@#define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c)
#define vsra_vi_vi_i(x, c) vshr_n_s32(x, c)
//@#define vsra_vi_vi_i(x, c) vshr_n_s32(x, c)
#define vsll_vi_vi_i(x, c) vshl_n_s32(x, c)
//@#define vsll_vi_vi_i(x, c) vshl_n_s32(x, c)
#define vsrl_vi_vi_i(x, c) \
  vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(x), c))
//@#define vsrl_vi_vi_i(x, c) vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(x), c))

// Comparison returning masks
static INLINE VECTOR_CC vmask veq_vm_vi2_vi2(vint2 x, vint2 y) { return vceqq_s32(x, y); }
// NOTE(review): despite the "gt" name this uses vcgeq_s32 (>=), not
// vcgtq_s32 (>). This matches upstream SLEEF 3.6.1 — confirm against
// upstream before relying on strict greater-than semantics here.
static INLINE VECTOR_CC vmask vgt_vm_vi2_vi2(vint2 x, vint2 y) { return vcgeq_s32(x, y); }

// Comparison returning integers
static INLINE VECTOR_CC vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) {
  return vreinterpretq_s32_u32(vcgtq_s32(x, y));
}
static INLINE VECTOR_CC vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
  return vreinterpretq_s32_u32(vceqq_s32(x, y));
}

// Conditional select
static INLINE VECTOR_CC vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) {
  return vbslq_s32(m, x, y);
}
|
||||
|
||||
/* -------------------------------------------------------------------------- */
|
||||
/* -------------------------------------------------------------------------- */
|
||||
/* -------------------------------------------------------------------------- */
|
||||
/* -------------------------------------------------------------------------- */
|
||||
|
||||
/****************************************/
/* Double precision FP operations       */
/****************************************/
// Broadcast a scalar double to all lanes.
static INLINE VECTOR_CC vdouble vcast_vd_d(double f) { return vdupq_n_f64(f); }

// Add, Sub, Mul
static INLINE VECTOR_CC vdouble vadd_vd_vd_vd(vdouble x, vdouble y) {
  return vaddq_f64(x, y);
}
static INLINE VECTOR_CC vdouble vsub_vd_vd_vd(vdouble x, vdouble y) {
  return vsubq_f64(x, y);
}
static INLINE VECTOR_CC vdouble vmul_vd_vd_vd(vdouble x, vdouble y) {
  return vmulq_f64(x, y);
}

// |x|, -x
static INLINE VECTOR_CC vdouble vabs_vd_vd(vdouble f) { return vabsq_f64(f); }
static INLINE VECTOR_CC vdouble vneg_vd_vd(vdouble f) { return vnegq_f64(f); }

// max, min
static INLINE VECTOR_CC vdouble vmax_vd_vd_vd(vdouble x, vdouble y) {
  return vmaxq_f64(x, y);
}
static INLINE VECTOR_CC vdouble vmin_vd_vd_vd(vdouble x, vdouble y) {
  return vminq_f64(x, y);
}
|
||||
|
||||
#if CONFIG == 1
// CONFIG == 1 maps the "mla" family onto true fused multiply-add, so these
// are computed with a single rounding.
// Multiply accumulate: z = z + x * y
static INLINE VECTOR_CC vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  return vfmaq_f64(z, x, y);
}

// z - x * y
static INLINE VECTOR_CC vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  return vfmsq_f64(z, x, y);
}

//[z = x * y - z]
static INLINE VECTOR_CC vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  return vneg_vd_vd(vfmsq_f64(z, x, y));
}
#else
// Non-FMA configuration: separate multiply and add (two roundings).
// NOTE(review): no vmlanp variant is defined in this branch — presumably it
// is unused when CONFIG != 1; matches upstream.
static INLINE VECTOR_CC vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE VECTOR_CC vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
#endif

// The vfma* family always uses true FMA regardless of CONFIG.
static INLINE VECTOR_CC vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // z + x * y
  return vfmaq_f64(z, x, y);
}

static INLINE VECTOR_CC vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // z - x * y
  return vfmsq_f64(z, x, y);
}

static INLINE VECTOR_CC vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // x * y - z
  return vfma_vd_vd_vd_vd(x, y, vneg_vd_vd(z));
}
|
||||
|
||||
// Reciprocal 1/x, Division, Square root
// Lane-wise n / d. Default path uses the hardware divide; with
// SLEEF_ENABLE_ALTDIV it uses a software reciprocal iteration.
static INLINE VECTOR_CC vdouble vdiv_vd_vd_vd(vdouble n, vdouble d) {
#ifndef SLEEF_ENABLE_ALTDIV
  return vdivq_f64(n, d);
#else
  // Finite numbers (including denormal) only, gives mostly correctly rounded result
  float64x2_t t, u, x, y;
  uint64x2_t i0, i1;
  // Build a power-of-two-ish scale factor t from the exponent bits of n and d
  // so that the scaled operands stay in a safe range for the iteration.
  i0 = vandq_u64(vreinterpretq_u64_f64(n), vdupq_n_u64(0x7fc0000000000000L));
  i1 = vandq_u64(vreinterpretq_u64_f64(d), vdupq_n_u64(0x7fc0000000000000L));
  i0 = vsubq_u64(vdupq_n_u64(0x7fd0000000000000L), vshrq_n_u64(vaddq_u64(i0, i1), 1));
  t = vreinterpretq_f64_u64(i0);
  y = vmulq_f64(d, t);
  x = vmulq_f64(n, t);
  // Three Newton-Raphson refinements of the reciprocal estimate of y.
  t = vrecpeq_f64(y);
  t = vmulq_f64(t, vrecpsq_f64(y, t));
  t = vmulq_f64(t, vrecpsq_f64(y, t));
  t = vmulq_f64(t, vrecpsq_f64(y, t));
  u = vmulq_f64(x, t);
  // Correct u by the FMA residual x - y*u to tighten the rounding.
  u = vfmaq_f64(u, vfmsq_f64(x, y, u), t);
  return u;
#endif
}
|
||||
// Lane-wise reciprocal 1/d.
// (The 1.0f literals promote exactly to double 1.0, so they are harmless.)
static INLINE VECTOR_CC vdouble vrec_vd_vd(vdouble d) {
#ifndef SLEEF_ENABLE_ALTDIV
  return vdiv_vd_vd_vd(vcast_vd_d(1.0f), d);
#else
  // The ALTDIV path only handles finite inputs, so map |d| == inf to 0
  // explicitly here.
  return vbslq_f64(vceqq_f64(vabs_vd_vd(d), vcast_vd_d(SLEEF_INFINITY)),
                   vcast_vd_d(0), vdiv_vd_vd_vd(vcast_vd_d(1.0f), d));
#endif
}
|
||||
|
||||
// Lane-wise double-precision square root.
// Default path uses the hardware instruction; the SLEEF_ENABLE_ALTSQRT
// variant refines a reciprocal-sqrt estimate by Newton-Raphson.
static INLINE VECTOR_CC vdouble vsqrt_vd_vd(vdouble d) {
#ifndef SLEEF_ENABLE_ALTSQRT
  return vsqrtq_f64(d);
#else
  // Gives correctly rounded result for all input range
  vdouble w, x, y, z;

  y = vrsqrteq_f64(d);  // initial 1/sqrt(d) estimate
  // x approximates sqrt(d), w approximates 0.5/sqrt(d).
  x = vmul_vd_vd_vd(d, y); w = vmul_vd_vd_vd(vcast_vd_d(0.5), y);
  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
  x = vfma_vd_vd_vd_vd(x, y, x); w = vfma_vd_vd_vd_vd(w, y, w);
  // Second, identical Newton-Raphson step: double precision needs one more
  // iteration than the float version. The repetition is intentional.
  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
  x = vfma_vd_vd_vd_vd(x, y, x); w = vfma_vd_vd_vd_vd(w, y, w);

  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(1.5)); w = vadd_vd_vd_vd(w, w);
  w = vmul_vd_vd_vd(w, y);
  x = vmul_vd_vd_vd(w, d);
  // Final correction from exact FMA residuals for correct rounding.
  y = vfmapn_vd_vd_vd_vd(w, d, x); z = vfmanp_vd_vd_vd_vd(w, x, vcast_vd_d(1));
  z = vfmanp_vd_vd_vd_vd(w, y, z); w = vmul_vd_vd_vd(vcast_vd_d(0.5), x);
  w = vfma_vd_vd_vd_vd(w, z, y);
  w = vadd_vd_vd_vd(w, x);

  // Pass 0 and +inf through unchanged.
  return vbslq_f64(vorrq_u64(vceqq_f64(d, vcast_vd_d(0)),
                             vceqq_f64(d, vcast_vd_d(SLEEF_INFINITY))), d, w);
#endif
}
|
||||
|
||||
/* Comparisons */
// vopmask is a 32-bit-lane mask type; 64-bit compare results are
// reinterpreted so each double lane occupies two identical 32-bit mask lanes.
static INLINE VECTOR_CC vopmask veq_vo_vd_vd(vdouble x, vdouble y) {
  return vreinterpretq_u32_u64(vceqq_f64(x, y));
}
static INLINE VECTOR_CC vopmask vneq_vo_vd_vd(vdouble x, vdouble y) {
  // No "compare not equal" on NEON; invert the equality mask.
  return vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(x, y)));
}
static INLINE VECTOR_CC vopmask vlt_vo_vd_vd(vdouble x, vdouble y) {
  return vreinterpretq_u32_u64(vcltq_f64(x, y));
}
static INLINE VECTOR_CC vopmask vgt_vo_vd_vd(vdouble x, vdouble y) {
  return vreinterpretq_u32_u64(vcgtq_f64(x, y));
}
static INLINE VECTOR_CC vopmask vle_vo_vd_vd(vdouble x, vdouble y) {
  return vreinterpretq_u32_u64(vcleq_f64(x, y));
}
static INLINE VECTOR_CC vopmask vge_vo_vd_vd(vdouble x, vdouble y) {
  return vreinterpretq_u32_u64(vcgeq_f64(x, y));
}

// Conditional select
static INLINE VECTOR_CC vdouble vsel_vd_vo_vd_vd(vopmask mask, vdouble x, vdouble y) {
  return vbslq_f64(vreinterpretq_u64_u32(mask), x, y);
}
|
||||
|
||||
#if 1
// Scalar-constant selects built on vsel_vd_vo_vd_vd: pick among broadcast
// doubles according to one, two or three cascaded condition masks.
static INLINE CONST VECTOR_CC vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
  return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
}

// o0 ? d0 : (o1 ? d1 : d2)
static INLINE VECTOR_CC vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}

// o0 ? d0 : (o1 ? d1 : (o2 ? d2 : d3))
static INLINE VECTOR_CC vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}
#else
// Disabled alternative based on byte-table lookup (vqtbl*q_u8).
// This implementation is slower on the current CPU models (as of May 2017.)
// I(Naoki Shibata) expect that on future CPU models with hardware similar to Super Shuffle Engine, this implementation will be faster.
static INLINE CONST VECTOR_CC vdouble vsel_vd_vo_d_d(vopmask o, double d0, double d1) {
  // Build per-byte indices selecting the low or high 8 bytes of the table.
  uint8x16_t idx = vbslq_u8(vreinterpretq_u8_u32(o), (uint8x16_t) { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 },
                            (uint8x16_t) { 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15 });

  uint8x16_t tab = (uint8x16_t) (float64x2_t) { d0, d1 };
  return (vdouble) vqtbl1q_u8(tab, idx);
}

static INLINE VECTOR_CC vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
  // Cascade the three masks into per-byte indices over a 32-byte table.
  uint8x16_t idx = vbslq_u8(vreinterpretq_u8_u32(o0), (uint8x16_t) { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 },
                            vbslq_u8(vreinterpretq_u8_u32(o1), (uint8x16_t) { 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15 },
                                     vbslq_u8(vreinterpretq_u8_u32(o2), (uint8x16_t) { 16, 17, 18, 19, 20, 21, 22, 23, 16, 17, 18, 19, 20, 21, 22, 23 },
                                              (uint8x16_t) { 24, 25, 26, 27, 28, 29, 30, 31, 24, 25, 26, 27, 28, 29, 30, 31 })));

  uint8x16x2_t tab = { { (uint8x16_t) (float64x2_t) { d0, d1 }, (uint8x16_t) (float64x2_t) { d2, d3 } } };
  return (vdouble) vqtbl2q_u8(tab, idx);
}

static INLINE VECTOR_CC vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
  return vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o1, d0, d1, d2, d2);
}
#endif
|
||||
|
||||
// Round to nearest integer-valued FP, ties to even.
static INLINE VECTOR_CC vdouble vrint_vd_vd(vdouble d) { return vrndnq_f64(d); }
static INLINE VECTOR_CC vfloat vrint_vf_vf(vfloat d) { return vrndnq_f32(d); }

/****************************************/
/* int <--> float conversions           */
/****************************************/
// Convert 2 doubles to 2 int32 (truncating toward zero), narrowed into the
// low half of a 64-bit vint.
// NOTE(review): this uses vmovn (non-saturating narrow) while vrint_vi_vd
// below uses vqmovn (saturating) — matches upstream; confirm overflow
// behavior is not relied upon here.
static INLINE VECTOR_CC vint vtruncate_vi_vd(vdouble vf) {
  return vmovn_s64(vcvtq_s64_f64(vf));
}
static INLINE VECTOR_CC vdouble vcast_vd_vi(vint vi) {
  return vcvtq_f64_s64(vmovl_s32(vi));
}
static INLINE VECTOR_CC vint vcast_vi_i(int i) { return vdup_n_s32(i); }
// Round to nearest, then convert with saturating narrow.
static INLINE VECTOR_CC vint vrint_vi_vd(vdouble d) {
  return vqmovn_s64(vcvtq_s64_f64(vrndnq_f64(d)));
}
|
||||
|
||||
/***************************************/
/* Integer operations                  */
/***************************************/
// vint is a 64-bit (2-lane int32) vector; these use the non-q NEON forms.

// Add, Sub, Neg (-x)
static INLINE VECTOR_CC vint vadd_vi_vi_vi(vint x, vint y) { return vadd_s32(x, y); }
static INLINE VECTOR_CC vint vsub_vi_vi_vi(vint x, vint y) { return vsub_s32(x, y); }
static INLINE VECTOR_CC vint vneg_vi_vi(vint e) { return vneg_s32(e); }

// Logical operations
static INLINE VECTOR_CC vint vand_vi_vi_vi(vint x, vint y) { return vand_s32(x, y); }
// (~x) & y — vbic's operand order is (kept, cleared-by).
static INLINE VECTOR_CC vint vandnot_vi_vi_vi(vint x, vint y) { return vbic_s32(y, x); }
static INLINE VECTOR_CC vint vor_vi_vi_vi(vint x, vint y) { return vorr_s32(x, y); }
static INLINE VECTOR_CC vint vxor_vi_vi_vi(vint x, vint y) { return veor_s32(x, y); }

// Comparison returning masks
// Widen the 64-bit compare result to a full vopmask; the upper half is zero.
static INLINE VECTOR_CC vopmask veq_vo_vi_vi(vint x, vint y) {
  return vcombine_u32(vceq_s32(x, y), vdup_n_u32(0));
}

// Conditional select
// Only the low half of the 128-bit mask is relevant for the 64-bit vint.
static INLINE VECTOR_CC vint vsel_vi_vm_vi_vi(vmask m, vint x, vint y) {
  return vbsl_s32(vget_low_u32(m), x, y);
}
|
||||
|
||||
/***************************************/
/* Predicates                          */
/***************************************/
// |d| == inf (either sign)?
static INLINE VECTOR_CC vopmask visinf_vo_vd(vdouble d) {
  const float64x2_t inf = vdupq_n_f64(SLEEF_INFINITY);
  const float64x2_t neg_inf = vdupq_n_f64(-SLEEF_INFINITY);
  uint64x2_t cmp = vorrq_u64(vceqq_f64(d, inf), vceqq_f64(d, neg_inf));
  return vreinterpretq_u32_u64(cmp);
}

// NaN test: a NaN is the only value not equal to itself.
static INLINE VECTOR_CC vopmask visnan_vo_vd(vdouble d) {
  return vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(d, d)));
}

// d == +inf?
static INLINE VECTOR_CC vopmask vispinf_vo_vd(vdouble d) {
  return vreinterpretq_u32_u64(vceqq_f64(d, vdupq_n_f64(SLEEF_INFINITY)));
}

// d == -inf?
static INLINE VECTOR_CC vopmask visminf_vo_vd(vdouble d) {
  return vreinterpretq_u32_u64(vceqq_f64(d, vdupq_n_f64(-SLEEF_INFINITY)));
}
|
||||
|
||||
// Per-bit select on floats using a vopmask.
static INLINE VECTOR_CC vfloat vsel_vf_vo_vf_vf(vopmask mask, vfloat x, vfloat y) {
  return vbslq_f32(mask, x, y);
}

// o ? v1 : v0, with scalar constants broadcast per lane.
static INLINE CONST VECTOR_CC vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
  return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}

// o0 ? d0 : (o1 ? d1 : d2)
static INLINE VECTOR_CC vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}

// o0 ? d0 : (o1 ? d1 : (o2 ? d2 : d3))
static INLINE VECTOR_CC vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}

// Float comparisons returning vopmask (32-bit lane masks, no reinterpret
// needed since float lanes are already 32 bits).
static INLINE VECTOR_CC vopmask veq_vo_vf_vf(vfloat x, vfloat y) {
  return vceqq_f32(x, y);
}
static INLINE VECTOR_CC vopmask vneq_vo_vf_vf(vfloat x, vfloat y) {
  return vmvnq_u32(vceqq_f32(x, y));
}
static INLINE VECTOR_CC vopmask vlt_vo_vf_vf(vfloat x, vfloat y) {
  return vcltq_f32(x, y);
}
static INLINE VECTOR_CC vopmask vle_vo_vf_vf(vfloat x, vfloat y) {
  return vcleq_f32(x, y);
}
static INLINE VECTOR_CC vopmask vgt_vo_vf_vf(vfloat x, vfloat y) {
  return vcgtq_f32(x, y);
}
static INLINE VECTOR_CC vopmask vge_vo_vf_vf(vfloat x, vfloat y) {
  return vcgeq_f32(x, y);
}
|
||||
|
||||
// Integer comparisons returning vopmask.
static INLINE VECTOR_CC vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) {
  return vceqq_s32(x, y);
}
static INLINE VECTOR_CC vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) {
  return vcgtq_s32(x, y);
}
// 64-bit vint compare widened to a full vopmask (upper half zero).
static INLINE VECTOR_CC vopmask vgt_vo_vi_vi(vint x, vint y) {
  return vcombine_u32(vcgt_s32(x, y), vdup_n_u32(0));
}
// Single-precision infinity / NaN predicates, built on the vo comparisons.
static INLINE VECTOR_CC vopmask visinf_vo_vf(vfloat d) {
  return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf));
}
static INLINE VECTOR_CC vopmask vispinf_vo_vf(vfloat d) {
  return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf));
}
static INLINE VECTOR_CC vopmask visminf_vo_vf(vfloat d) {
  return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf));
}
// NaN is the only value not equal to itself.
static INLINE VECTOR_CC vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }

// Narrow a 64-bit-lane mask to 32-bit lanes (take every other 32-bit lane).
static INLINE VECTOR_CC vopmask vcast_vo32_vo64(vopmask m) {
  return vuzpq_u32(m, m).val[0];
}
// Widen a 32-bit-lane mask to 64-bit lanes (duplicate each 32-bit lane).
static INLINE VECTOR_CC vopmask vcast_vo64_vo32(vopmask m) {
  return vzipq_u32(m, m).val[0];
}
// Broadcast a C truth value to an all-ones / all-zeros mask.
static INLINE VECTOR_CC vopmask vcast_vo_i(int i) {
  return vreinterpretq_u32_u64(vdupq_n_u64((uint64_t)(i ? -1 : 0)));
}
|
||||
|
||||
// Bitwise logic on opmasks.
static INLINE VECTOR_CC vopmask vand_vo_vo_vo(vopmask x, vopmask y) {
  return vandq_u32(x, y);
}
// (~x) & y — vbic's operand order is (kept, cleared-by).
static INLINE VECTOR_CC vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) {
  return vbicq_u32(y, x);
}
static INLINE VECTOR_CC vopmask vor_vo_vo_vo(vopmask x, vopmask y) {
  return vorrq_u32(x, y);
}
static INLINE VECTOR_CC vopmask vxor_vo_vo_vo(vopmask x, vopmask y) {
  return veorq_u32(x, y);
}

// Select / mask-combine between integers and opmasks.
static INLINE VECTOR_CC vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
  return vbslq_s32(m, x, y);
}
static INLINE VECTOR_CC vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) {
  return vandq_s32(vreinterpretq_s32_u32(x), y);
}
static INLINE VECTOR_CC vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) {
  return vbicq_s32(y, vreinterpretq_s32_u32(x))
;
}
static INLINE VECTOR_CC vint vandnot_vi_vo_vi(vopmask x, vint y) {
  // Only the low half of the 128-bit opmask applies to the 64-bit vint.
  return vbic_s32(y, vget_low_s32(vreinterpretq_s32_u32(x)));
}
// vmask and vopmask share the same representation here, so the vo32/vo64
// variants are plain bitwise ops.
static INLINE VECTOR_CC vmask vand_vm_vo32_vm(vopmask x, vmask y) {
  return vandq_u32(x, y);
}
|
||||
// vmask/vopmask bitwise combinations; vo32 and vo64 variants are identical
// here because both types are plain 128-bit registers on AdvSIMD.
static INLINE VECTOR_CC vmask vand_vm_vo64_vm(vopmask x, vmask y) {
  return vandq_u32(x, y);
}
// (~x) & y
static INLINE VECTOR_CC vmask vandnot_vm_vo32_vm(vopmask x, vmask y) {
  return vbicq_u32(y, x);
}
static INLINE VECTOR_CC vmask vandnot_vm_vo64_vm(vopmask x, vmask y) {
  return vbicq_u32(y, x);
}
static INLINE VECTOR_CC vmask vor_vm_vo32_vm(vopmask x, vmask y) {
  return vorrq_u32(x, y);
}
static INLINE VECTOR_CC vmask vor_vm_vo64_vm(vopmask x, vmask y) {
  return vorrq_u32(x, y);
}
static INLINE VECTOR_CC vmask vxor_vm_vo32_vm(vopmask x, vmask y) {
  return veorq_u32(x, y);
}

// Round toward zero to integer-valued FP.
static INLINE VECTOR_CC vfloat vtruncate_vf_vf(vfloat vd) { return vrndq_f32(vd); }

// Build a mask whose 64-bit lanes are (i0 << 32) | i1.
static INLINE VECTOR_CC vmask vcast_vm_i_i(int i0, int i1) {
  return vreinterpretq_u32_u64(vdupq_n_u64((0xffffffff & (uint64_t)i1) | (((uint64_t)i0) << 32)));
}
|
||||
|
||||
// Broadcast a 64-bit integer into each 64-bit lane of a vmask.
static INLINE vmask vcast_vm_i64(int64_t i) {
  return vreinterpretq_u32_u64(vdupq_n_u64((uint64_t)i));
}
static INLINE vmask vcast_vm_u64(uint64_t i) {
  return vreinterpretq_u32_u64(vdupq_n_u64(i));
}

// 64-bit lane-wise equality between masks.
static INLINE VECTOR_CC vopmask veq64_vo_vm_vm(vmask x, vmask y) {
  return vreinterpretq_u32_u64(vceqq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)))
;
}

// 64-bit lane-wise integer addition of masks.
static INLINE VECTOR_CC vmask vadd64_vm_vm_vm(vmask x, vmask y) {
  return vreinterpretq_u32_s64(vaddq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));
}

// Select on a 64-bit vint using the low half of the opmask.
static INLINE VECTOR_CC vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
  return vbsl_s32(vget_low_u32(m), x, y);
}
|
||||
|
||||
// Logical operations
// AND a 64-bit vint with the low half of an opmask.
static INLINE VECTOR_CC vint vand_vi_vo_vi(vopmask x, vint y) {
  return vand_s32(vreinterpret_s32_u32(vget_low_u32(x)), y);
}

// Place each 32-bit element of vi into the UPPER 32 bits of a 64-bit lane
// (widen then swap 32-bit halves).
static INLINE VECTOR_CC vmask vcastu_vm_vi(vint vi) {
  return vrev64q_u32(vreinterpretq_u32_u64(vmovl_u32(vreinterpret_u32_s32(vi))));
}
// Inverse of vcastu_vm_vi: extract the upper 32 bits of each 64-bit lane.
static INLINE VECTOR_CC vint vcastu_vi_vm(vmask vi2) {
  return vreinterpret_s32_u32(vmovn_u64(vreinterpretq_u64_u32(vrev64q_u32(vi2))));
}
// Round toward zero to integer-valued FP.
static INLINE VECTOR_CC vdouble vtruncate_vd_vd(vdouble vd) { return vrndq_f64(vd); }
|
||||
|
||||
//

// Sign-flip masks for alternating-sign operations: XORing with the sign bit
// of -0.0 negates the corresponding lane.
#define PNMASK ((vdouble) { +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0 })
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })

// Negate odd lanes (posneg) or even lanes (negpos).
static INLINE VECTOR_CC vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
static INLINE VECTOR_CC vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
static INLINE VECTOR_CC vfloat vposneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)PNMASKf); }
static INLINE VECTOR_CC vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)NPMASKf); }

// Alternating subtract/add (even lanes subtract, odd lanes add).
static INLINE VECTOR_CC vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
static INLINE VECTOR_CC vfloat vsubadd_vf_vf_vf(vfloat d0, vfloat d1) { return vadd_vf_vf_vf(d0, vnegpos_vf_vf(d1)); }
static INLINE VECTOR_CC vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE VECTOR_CC vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }

// Swap the two 64-bit halves (pair-wise reverse); reva2 on a 2-lane vector
// is the identity.
static INLINE VECTOR_CC vdouble vrev21_vd_vd(vdouble d0) { return (float64x2_t)vcombine_u64(vget_high_u64((uint64x2_t)d0), vget_low_u64((uint64x2_t)d0)); }
static INLINE VECTOR_CC vdouble vreva2_vd_vd(vdouble vd) { return vd; }

// No non-temporal store on AdvSIMD: stream falls back to a normal store.
static INLINE VECTOR_CC void vstream_v_p_vd(double *ptr, vdouble v) { vstore_v_p_vd(ptr, v); }
// With a 2-lane vector a strided pair-scatter degenerates to one store.
static INLINE VECTOR_CC void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
static INLINE VECTOR_CC void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }

// Swap adjacent float pairs / swap the two 64-bit halves.
static INLINE VECTOR_CC vfloat vrev21_vf_vf(vfloat d0) { return vrev64q_f32(d0); }
static INLINE VECTOR_CC vfloat vreva2_vf_vf(vfloat d0) { return vcombine_f32(vget_high_f32(d0), vget_low_f32(d0)); }

static INLINE VECTOR_CC void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); }

// Scatter adjacent float pairs with a lane stride of `step` pairs.
static INLINE VECTOR_CC void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
  vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));
  vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));
}

static INLINE VECTOR_CC void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
  vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));
  vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));
}
|
||||
|
||||
//

// Load a vquad from unaligned memory (VECTLENDP * 16 bytes).
static vquad loadu_vq_p(void *p) {
  vquad vq;
  memcpy(&vq, p, VECTLENDP * 16);
  return vq;
}

// Bit-for-bit conversions between the argument-passing quad type and the
// internal vquad representation.
static INLINE vquad cast_vq_aq(vargquad aq) {
  vquad vq;
  memcpy(&vq, &aq, VECTLENDP * 16);
  return vq;
}

static INLINE vargquad cast_aq_vq(vquad vq) {
  vargquad aq;
  memcpy(&aq, &vq, VECTLENDP * 16);
  return aq;
}
|
||||
|
||||
// Nonzero iff every bit of the mask is zero.
// Assumes each lane of g is all-ones or all-zeros (as compares produce):
// the OR/pairwise-max reduce to 0 or 0xffffffff, and ~lane is then
// nonzero (true) only for the all-zeros case.
static INLINE int vtestallzeros_i_vo64(vopmask g) {
  uint32x2_t x0 = vorr_u32(vget_low_u32(g), vget_high_u32(g));
  uint32x2_t x1 = vpmax_u32(x0, x0);
  return ~vget_lane_u32(x1, 0);
}
|
||||
|
||||
// Per-bit select between two masks.
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask m, vmask x, vmask y) { return vbslq_u32(m, x, y); }

// 64-bit lane-wise subtraction of masks.
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) {
  return vreinterpretq_u32_s64(vsubq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));
}

// 64-bit lane-wise negation.
static INLINE vmask vneg64_vm_vm(vmask x) {
  return vreinterpretq_u32_s64(vnegq_s64(vreinterpretq_s64_u32(x)));
}

// Signed 64-bit lane-wise x > y.
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
  return vreinterpretq_u32_u64(vcgtq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));
}

// 64-bit lane shifts; macros because the shift count must be a
// compile-time constant. "//@#" copies feed SLEEF's header generator.
#define vsll64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshlq_n_u64(vreinterpretq_u64_u32(x), c))
//@#define vsll64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshlq_n_u64(vreinterpretq_u64_u32(x), c))
#define vsrl64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(x), c))
//@#define vsrl64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(x), c))

// Sign-extend each int32 lane of vi into a 64-bit mask lane: zero-extend,
// then OR the sign bits (from the vi < 0 compare) into the upper halves.
static INLINE vmask vcast_vm_vi(vint vi) {
  vmask m = vreinterpretq_u32_u64(vmovl_u32(vreinterpret_u32_s32(vi)));
  return vor_vm_vm_vm(vcastu_vm_vi(vreinterpret_s32_u32(vget_low_u32(vgt_vo_vi_vi(vcast_vi_i(0), vi)))), m);
}
// Narrow each 64-bit mask lane back to int32 (low halves).
static INLINE vint vcast_vi_vm(vmask vm) { return vreinterpret_s32_u32(vmovn_u64(vreinterpretq_u64_u32(vm))); }

// Bit-pattern reinterpretation between masks and 64-bit integer vectors.
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return vreinterpretq_u32_s64(v); }
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return vreinterpretq_s64_u32(m); }
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return vreinterpretq_u32_u64(v); }
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return vreinterpretq_u64_u32(m); }
|
||||
@@ -0,0 +1,638 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
// Configuration check: CONFIG selects the ISA variant this helper targets.
// CONFIG == 1 is plain AVX; CONFIG == 4 is AVX + AMD FMA4. SLEEF_GENHEADER
// builds skip the compiler-flag checks (the generator has no target flags).
#if CONFIG == 1

#if !defined(__AVX__) && !defined(SLEEF_GENHEADER)
#error Please specify -mavx.
#endif

#elif CONFIG == 4

#if (!defined(__AVX__) || !defined(__FMA4__)) && !defined(SLEEF_GENHEADER)
#error Please specify -mavx and -mfma4.
#endif

#else
#error CONFIG macro invalid or not defined
#endif

// Vector geometry: 4 doubles / 8 floats per 256-bit register.
// The "//@#" copies are consumed by SLEEF's header generator.
#define ENABLE_DP
//@#define ENABLE_DP
#define LOG2VECTLENDP 2
//@#define LOG2VECTLENDP 2
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)

#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)

#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT

#if !defined(SLEEF_GENHEADER)
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif

#include <stdint.h>
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)

// Vector type aliases for the AVX (256-bit) target.
typedef __m256i vmask;
typedef __m256i vopmask;

typedef __m256d vdouble;
typedef __m128i vint;

typedef __m256 vfloat;
// AVX1 has no 256-bit integer arithmetic, so a wide int vector is a pair
// of 128-bit halves.
typedef struct { __m128i x, y; } vint2;

typedef __m256i vint64;
typedef __m256i vuint64;

// Quad-precision value carried as two mask-typed halves.
typedef struct {
  vmask x, y;
} vquad;

typedef vquad vargquad;
|
||||
|
||||
//

#if !defined(SLEEF_GENHEADER)

#ifndef __SLEEF_H__
// CPUID wrapper implemented elsewhere in SLEEF.
void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx);
#endif

// CPUID.(EAX=1):ECX bit 28 indicates AVX support.
static INLINE int cpuSupportsAVX() {
  int32_t reg[4];
  Sleef_x86CpuID(reg, 1, 0);
  return (reg[2] & (1 << 28)) != 0;
}

// CPUID.(EAX=0x80000001):ECX bit 16 indicates AMD FMA4 support.
static INLINE int cpuSupportsFMA4() {
  int32_t reg[4];
  Sleef_x86CpuID(reg, 0x80000001, 0);
  return (reg[2] & (1 << 16)) != 0;
}

#if CONFIG == 4 && defined(__AVX__) && defined(__FMA4__)
// Runtime availability of this ISA variant: 3 = available, 0 = not.
static INLINE int vavailability_i(int name) {
  int d = cpuSupportsAVX() && cpuSupportsFMA4();
  return d ? 3 : 0;
}

#define ENABLE_FMA_DP
#define ENABLE_FMA_SP

#define ISANAME "AVX + AMD FMA4"
#define DFTPRIORITY 21
#else
static INLINE int vavailability_i(int name) {
  int d = cpuSupportsAVX();
  return d ? 3 : 0;
}

#define ISANAME "AVX"
#define DFTPRIORITY 20
#endif

#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
// Prefetch into all cache levels.
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }

// Nonzero iff every bit of the 256-bit mask is set; AND the two 128-bit
// halves together and test that the result is all ones.
static INLINE int vtestallones_i_vo32(vopmask g) {
  return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1)));
}

static INLINE int vtestallones_i_vo64(vopmask g) {
  return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1)));
}
|
||||
|
||||
//

// Broadcast and zero-cost bit-pattern reinterpretation.
static INLINE vdouble vcast_vd_d(double d) { return _mm256_set1_pd(d); }
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm256_castpd_si256(vd); }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm256_castsi256_pd(vm); }

//

// Unaligned load/store of the split vint2 (two 128-bit halves).
static vint2 vloadu_vi2_p(int32_t *p) {
  vint2 r;
  r.x = _mm_loadu_si128((__m128i *) p     );
  r.y = _mm_loadu_si128((__m128i *)(p + 4));
  return r;
}

static void vstoreu_v_p_vi2(int32_t *p, vint2 v) {
  _mm_storeu_si128((__m128i *) p     , v.x);
  _mm_storeu_si128((__m128i *)(p + 4), v.y);
}

// Unaligned load/store of a 128-bit vint.
static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }
static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); }
|
||||
|
||||
//

// AVX1 has no 256-bit integer logic, so all mask/opmask bitwise ops are
// routed through the 256-bit *pd* (double) bitwise instructions via
// reinterpret casts. "andnot" computes (~x) & y, matching _mm256_andnot_pd.
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }

static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }

static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }

static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }

// Narrow a 64-bit-lane opmask to 32-bit lanes: convert each all-ones lane
// (read back as -1.0 after masking) to int32 -1.
static INLINE vopmask vcast_vo32_vo64(vopmask o) {
  return _mm256_castsi128_si256(_mm256_cvtpd_epi32(_mm256_and_pd(vreinterpret_vd_vm(o), _mm256_set1_pd(-1.0))));
}

// Widen a 32-bit-lane opmask to 64-bit lanes via the inverse conversion.
static INLINE vopmask vcast_vo64_vo32(vopmask o) {
  return vreinterpret_vm_vd(_mm256_cmp_pd(_mm256_cvtepi32_pd(_mm256_castsi256_si128(o)), _mm256_set1_pd(-1.0), _CMP_EQ_OQ));
}

// Broadcast a C truth value to an all-ones / all-zeros mask.
static INLINE vopmask vcast_vo_i(int i) { return _mm256_set1_epi64x(i ? -1 : 0); }
|
||||
|
||||
// ---- rounding, truncation and int <-> double/mask lane conversion ----

static INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); }       // round to nearest, narrow to int32
static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); }  // truncate toward zero, narrow to int32
static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm256_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm256_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); }
static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); }

// Spread 4 x i32 into the HIGH 32 bits of the four 64-bit mask lanes
// (low 32 bits cleared), built per 128-bit half with shuffles and masks.
static INLINE vmask vcastu_vm_vi(vint vi) {
  __m256i m = _mm256_castsi128_si256(_mm_and_si128(_mm_shuffle_epi32(vi, 0x40), _mm_set_epi32(-1, 0, -1, 0)));
  return _mm256_insertf128_si256(m, _mm_and_si128(_mm_shuffle_epi32(vi, 0xc8), _mm_set_epi32(-1, 0, -1, 0)), 1);
}

// Inverse: gather the HIGH 32 bits of each 64-bit lane into a 4 x i32 vector.
static INLINE vint vcastu_vi_vm(vmask vi) {
  return _mm_or_si128(_mm_and_si128(_mm_shuffle_epi32(_mm256_castsi256_si128(vi)     , 0x0d), _mm_set_epi32( 0,  0, -1, -1)),
		      _mm_and_si128(_mm_shuffle_epi32(_mm256_extractf128_si256(vi, 1), 0xd0), _mm_set_epi32(-1, -1,  0,  0)));
}

// Build a mask whose every 64-bit lane is the pair (i0 high, i1 low).
static INLINE vmask vcast_vm_i_i(int i0, int i1) {
  return _mm256_set_epi32(i0, i1, i0, i1, i0, i1, i0, i1);
}

// 64-bit lane equality without AVX2 integer compares: x == y iff x ^ y == 0,
// so XOR in the bit pattern of 1.0 and compare the result to 1.0 as doubles.
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
  return vreinterpret_vm_vd(_mm256_cmp_pd(vreinterpret_vd_vm(vxor_vm_vm_vm(vxor_vm_vm_vm(x, y), vreinterpret_vm_vd(_mm256_set1_pd(1.0)))), _mm256_set1_pd(1.0), _CMP_EQ_OQ));
}

static INLINE vmask vcast_vm_i64(int64_t i) { return _mm256_set1_epi64x(i); }
static INLINE vmask vcast_vm_u64(uint64_t i) { return _mm256_set1_epi64x((uint64_t)i); }
|
||||
|
||||
// ---- double-precision arithmetic ----

static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm256_div_pd(x, y); }
// Full-precision reciprocal via division (not the approximate rcp instruction).
static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm256_set1_pd(1), x); }
static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); }
// abs / neg operate on the sign bit only, using the bit pattern of -0.0.
static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm256_andnot_pd(_mm256_set1_pd(-0.0), d); }
static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm256_xor_pd(_mm256_set1_pd(-0.0), d); }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm256_max_pd(x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm256_min_pd(x, y); }

#if CONFIG == 1
// CONFIG == 1: no FMA available — multiply-add is an unfused mul then add/sub.
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(z, vmul_vd_vd_vd(x, y)); }
#else
// Fused variants (FMA4-style intrinsics):
//   macc = x*y + z, msub = x*y - z, nmacc = -(x*y) + z, nmsub = -(x*y) - z.
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_msub_pd(x, y, z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmacc_pd(x, y, z); }
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); }
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); }
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_msub_pd(x, y, z); }
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmacc_pd(x, y, z); }
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmsub_pd(x, y, z); }
#endif

// ---- double-precision comparisons ----
// All are ordered-quiet (_OQ) except neq, which is unordered (_UQ): it is
// true when either operand is NaN.
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_EQ_OQ)); }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_NEQ_UQ)); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LT_OQ)); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LE_OQ)); }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GT_OQ)); }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GE_OQ)); }
|
||||
|
||||
// ---- operations on vint (4 x int32 in one 128-bit register) ----

static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); }
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); }
static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }   // 0 - e

static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); }
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); }
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); }
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); }

// vopmask is 256-bit; only its low 128 bits are meaningful for vint lanes.
static INLINE vint vandnot_vi_vo_vi(vopmask m, vint y) { return _mm_andnot_si128(_mm256_castsi256_si128(m), y); }
static INLINE vint vand_vi_vo_vi(vopmask m, vint y) { return _mm_and_si128(_mm256_castsi256_si128(m), y); }

static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); }  // logical left
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); }  // logical right
static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); }  // arithmetic right

static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }

// Same compares but widened into the 256-bit opmask container (high half undefined/zeroed by cast).
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpeq_epi32(x, y)); }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpgt_epi32(x, y)); }

// Per-lane select: x where o is set, else y.
static INLINE vint vsel_vi_vo_vi_vi(vopmask o, vint x, vint y) { return _mm_blendv_epi8(y, x, _mm256_castsi256_si128(o)); }
|
||||
|
||||
// Per-lane select for doubles: x where o is set, else y.
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return _mm256_blendv_pd(y, x, _mm256_castsi256_pd(o)); }

// Select between two scalar constants per lane.
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
  return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
}

// Cascaded selects: first matching mask wins, last constant is the default.
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}

static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}

// |d| == +inf (either sign of infinity).
static INLINE vopmask visinf_vo_vd(vdouble d) {
  return vreinterpret_vm_vd(_mm256_cmp_pd(vabs_vd_vd(d), _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));
}

static INLINE vopmask vispinf_vo_vd(vdouble d) {
  return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));
}

static INLINE vopmask visminf_vo_vd(vdouble d) {
  return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(-SLEEF_INFINITY), _CMP_EQ_OQ));
}

// NaN is the only value unordered with itself (d != d).
static INLINE vopmask visnan_vo_vd(vdouble d) {
  return vreinterpret_vm_vd(_mm256_cmp_pd(d, d, _CMP_NEQ_UQ));
}
|
||||
|
||||
// Aligned / unaligned double loads and stores.
static INLINE vdouble vload_vd_p(const double *ptr) { return _mm256_load_pd(ptr); }
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(ptr); }

static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }

// Gather emulation: AVX(1) has no gather instruction, so the indices are
// spilled to a stack array and the lanes assembled with scalar loads.
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
  int a[VECTLENDP];
  vstoreu_v_p_vi(a, vi);
  return _mm256_set_pd(ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);
}

#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
// Extracts lane 0 so the debugger can display a scalar value.
static INLINE double vcast_d_vd(vdouble v) {
  double a[VECTLENDP];
  vstoreu_v_p_vd(a, v);
  return a[0];
}
#endif
|
||||
|
||||
//
|
||||
|
||||
static INLINE vint2 vcast_vi2_vm(vmask vm) {
|
||||
vint2 r;
|
||||
r.x = _mm256_castsi256_si128(vm);
|
||||
r.y = _mm256_extractf128_si256(vm, 1);
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vmask vcast_vm_vi2(vint2 vi) {
|
||||
vmask m = _mm256_castsi128_si256(vi.x);
|
||||
m = _mm256_insertf128_si256(m, vi.y, 1);
|
||||
return m;
|
||||
}
|
||||
|
||||
// ---- single-precision casts and reinterprets ----

static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvtps_epi32(vf)); }      // round to nearest
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvttps_epi32(vf)); } // truncate toward zero
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm256_cvtepi32_ps(vcast_vm_vi2(vi)); }
static INLINE vfloat vcast_vf_f(float f) { return _mm256_set1_ps(f); }
static INLINE vint2 vcast_vi2_i(int i) { vint2 r; r.x = r.y = _mm_set1_epi32(i); return r; }
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm256_castps_si256(vf); }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm256_castsi256_ps(vm); }

static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return vreinterpret_vf_vm(vcast_vm_vi2(vi)); }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return vcast_vi2_vm(vreinterpret_vm_vf(vf)); }

// ---- single-precision arithmetic ----

static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); }
// Full-precision reciprocal via division (not the approximate rcp instruction).
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); }
// abs / neg operate on the sign bit only, using the bit pattern of -0.0f.
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); }

#if CONFIG == 1
// CONFIG == 1: no FMA available — multiply-add is an unfused mul then add/sub.
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
#else
// Fused variants: macc = x*y + z, msub = x*y - z, nmacc = -(x*y) + z, nmsub = -(x*y) - z.
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmacc_ps(x, y, z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_msub_ps(x, y, z); }
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); }
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); }
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_msub_ps(x, y, z); }
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmacc_ps(x, y, z); }
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmsub_ps(x, y, z); }
#endif

// ---- single-precision comparisons (ordered-quiet; neq is unordered) ----
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_EQ_OQ)); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_NEQ_UQ)); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LT_OQ)); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LE_OQ)); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GT_OQ)); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GE_OQ)); }
|
||||
|
||||
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) {
|
||||
vint2 vi = { _mm_add_epi32(x.x, y.x), _mm_add_epi32(x.y, y.y) };
|
||||
return vi;
|
||||
}
|
||||
|
||||
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) {
|
||||
vint2 vi = { _mm_sub_epi32(x.x, y.x), _mm_sub_epi32(x.y, y.y) };
|
||||
return vi;
|
||||
}
|
||||
|
||||
static INLINE vint2 vneg_vi2_vi2(vint2 e) {
|
||||
vint2 vi = { _mm_sub_epi32(_mm_set1_epi32(0), e.x), _mm_sub_epi32(_mm_set1_epi32(0), e.y) };
|
||||
return vi;
|
||||
}
|
||||
|
||||
// ---- 8 x int32 logic, shifts, compares and select ----
// Every operation is performed independently on the two 128-bit halves,
// since AVX(1) lacks 256-bit integer instructions.

static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) {
  vint2 vi = { _mm_and_si128(x.x, y.x), _mm_and_si128(x.y, y.y) };
  return vi;
}

static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) {
  vint2 vi = { _mm_andnot_si128(x.x, y.x), _mm_andnot_si128(x.y, y.y) };
  return vi;
}

static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) {
  vint2 vi = { _mm_or_si128(x.x, y.x), _mm_or_si128(x.y, y.y) };
  return vi;
}

static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) {
  vint2 vi = { _mm_xor_si128(x.x, y.x), _mm_xor_si128(x.y, y.y) };
  return vi;
}

// Mask variants: the 256-bit opmask is first split into vint2 halves.
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi2_vi2_vi2(vcast_vi2_vm(x), y); }

static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) {          // logical left shift
  vint2 vi = { _mm_slli_epi32(x.x, c), _mm_slli_epi32(x.y, c) };
  return vi;
}

static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) {          // logical right shift
  vint2 vi = { _mm_srli_epi32(x.x, c), _mm_srli_epi32(x.y, c) };
  return vi;
}

static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) {          // arithmetic right shift
  vint2 vi = { _mm_srai_epi32(x.x, c), _mm_srai_epi32(x.y, c) };
  return vi;
}

// Compares returning a 256-bit opmask.
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) {
  vint2 r;
  r.x = _mm_cmpeq_epi32(x.x, y.x);
  r.y = _mm_cmpeq_epi32(x.y, y.y);
  return vcast_vm_vi2(r);
}

static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) {
  vint2 r;
  r.x = _mm_cmpgt_epi32(x.x, y.x);
  r.y = _mm_cmpgt_epi32(x.y, y.y);
  return vcast_vm_vi2(r);
}

// Same compares but returning the result as a vint2.
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
  vint2 r;
  r.x = _mm_cmpeq_epi32(x.x, y.x);
  r.y = _mm_cmpeq_epi32(x.y, y.y);
  return r;
}

static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) {
  vint2 r;
  r.x = _mm_cmpgt_epi32(x.x, y.x);
  r.y = _mm_cmpgt_epi32(x.y, y.y);
  return r;
}

// Per-lane select: x where m is set, else y.
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
  vint2 n = vcast_vi2_vm(m);
  vint2 r = { _mm_blendv_epi8(y.x, x.x, n.x), _mm_blendv_epi8(y.y, x.y, n.y) };
  return r;
}

// 64-bit lane addition, again emulated per 128-bit half.
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) {
  vint2 ix = vcast_vi2_vm(x), iy = vcast_vi2_vm(y), iz;
  iz.x = _mm_add_epi64(ix.x, iy.x);
  iz.y = _mm_add_epi64(ix.y, iy.y);
  return vcast_vm_vi2(iz);
}
|
||||
|
||||
// Per-lane select for floats: x where o is set, else y.
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return _mm256_blendv_ps(y, x, _mm256_castsi256_ps(o)); }

// Select between two scalar constants per lane.
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
  return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}

// Cascaded selects: first matching mask wins, last constant is the default.
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}

static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}

// Infinity / NaN classification (NaN is the only value with d != d).
static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
|
||||
|
||||
// ---- single-precision loads, stores and gather ----

static INLINE vfloat vload_vf_p(const float *ptr) { return _mm256_load_ps(ptr); }
static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr); }

static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }

// Gather emulation: indices are spilled to a stack array and the lanes
// assembled with scalar loads (no gather instruction in AVX(1)).
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
  int a[VECTLENSP];
  vstoreu_v_p_vi2(a, vi2);
  return _mm256_set_ps(ptr[a[7]], ptr[a[6]], ptr[a[5]], ptr[a[4]],
		       ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);
}

#ifdef _MSC_VER
// This function is needed when debugging on MSVC.
// Extracts lane 0 so the debugger can display a scalar value.
static INLINE float vcast_f_vf(vfloat v) {
  float a[VECTLENSP];
  vstoreu_v_p_vf(a, v);
  return a[0];
}
#endif
|
||||
// ---- alternating-sign helpers ----
// PNMASK/NPMASK hold sign-bit patterns for alternating lanes; XORing with
// them flips the sign of the odd-indexed (PN) or even-indexed (NP) lanes.

#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0 })
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })

static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }

// addsub: even lanes are subtracted, odd lanes are added.
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_addsub_pd(x, y); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_addsub_ps(x, y); }

#if CONFIG == 1
// No FMA: multiply then alternating subtract/add.
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
#else
// With FMA: fold the alternating sign into z via vnegpos, then fused mla.
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }
#endif
|
||||
|
||||
|
||||
static INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm256_shuffle_pd(d0, d0, (0 << 3) | (1 << 2) | (0 << 1) | (1 << 0)); }
|
||||
static INLINE vdouble vreva2_vd_vd(vdouble d0) { d0 = _mm256_permute2f128_pd(d0, d0, 1); return _mm256_shuffle_pd(d0, d0, (1 << 3) | (0 << 2) | (1 << 1) | (0 << 0)); }
|
||||
|
||||
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm256_stream_pd(ptr, v); }
|
||||
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
|
||||
_mm_store_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));
|
||||
_mm_store_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
|
||||
_mm_stream_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));
|
||||
_mm_stream_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm256_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); }
|
||||
static INLINE vfloat vreva2_vf_vf(vfloat d0) { d0 = _mm256_permute2f128_ps(d0, d0, 1); return _mm256_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); }
|
||||
|
||||
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm256_stream_ps(ptr, v); }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
|
||||
_mm_storel_pd((double *)(ptr+(offset + step * 0)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));
|
||||
_mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));
|
||||
_mm_storel_pd((double *)(ptr+(offset + step * 2)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));
|
||||
_mm_storeh_pd((double *)(ptr+(offset + step * 3)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
|
||||
|
||||
//
|
||||
|
||||
static vquad loadu_vq_p(void *p) {
|
||||
vquad vq;
|
||||
memcpy(&vq, p, VECTLENDP * 16);
|
||||
return vq;
|
||||
}
|
||||
|
||||
// Bit-exact conversions between the argument quad type (vargquad) and the
// internal vquad representation; both are VECTLENDP * 16 bytes.
static INLINE vquad cast_vq_aq(vargquad aq) {
  vquad vq;
  memcpy(&vq, &aq, VECTLENDP * 16);
  return vq;
}

static INLINE vargquad cast_aq_vq(vquad vq) {
  vargquad aq;
  memcpy(&aq, &vq, VECTLENDP * 16);
  return aq;
}
|
||||
|
||||
// True iff every bit of the 256-bit mask is zero: OR the halves, then check
// the byte-wise sign movemask is empty.
static INLINE int vtestallzeros_i_vo64(vopmask g) {
  return _mm_movemask_epi8(_mm_or_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))) == 0;
}

// Per-lane select on masks, routed through the FP blend.
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) {
  return vreinterpret_vm_vd(_mm256_blendv_pd(vreinterpret_vd_vm(y), vreinterpret_vd_vm(x), vreinterpret_vd_vm(o)));
}

// 64-bit lane subtraction, emulated per 128-bit half.
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) {
  __m128i xh = _mm256_extractf128_si256(x, 1), xl = _mm256_extractf128_si256(x, 0);
  __m128i yh = _mm256_extractf128_si256(y, 1), yl = _mm256_extractf128_si256(y, 0);
  vmask r = _mm256_castsi128_si256(_mm_sub_epi64(xl, yl));
  return _mm256_insertf128_si256(r, _mm_sub_epi64(xh, yh), 1);
}

static INLINE vmask vneg64_vm_vm(vmask x) { return vsub64_vm_vm_vm(vcast_vm_i_i(0, 0), x); }

// Signed 64-bit greater-than, emulated per 128-bit half.
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
  __m128i xh = _mm256_extractf128_si256(x, 1), xl = _mm256_extractf128_si256(x, 0);
  __m128i yh = _mm256_extractf128_si256(y, 1), yl = _mm256_extractf128_si256(y, 0);
  vmask r = _mm256_castsi128_si256(_mm_cmpgt_epi64(xl, yl));
  return _mm256_insertf128_si256(r, _mm_cmpgt_epi64(xh, yh), 1);
}
|
||||
|
||||
// 64-bit lane shifts, emulated per 128-bit half (no 256-bit integer shifts
// in AVX(1)). Macros rather than functions so the shift count stays an
// immediate. The //@# single-line copies below appear to be markers consumed
// by SLEEF's header-generation tooling — verify against the generator.
#define vsll64_vm_vm_i(x, c) \
  _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi64(_mm256_extractf128_si256(x, 0), c)), \
			  _mm_slli_epi64(_mm256_extractf128_si256(x, 1), c), 1)
#define vsrl64_vm_vm_i(x, c) \
  _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_srli_epi64(_mm256_extractf128_si256(x, 0), c)), \
			  _mm_srli_epi64(_mm256_extractf128_si256(x, 1), c), 1)

//@#define vsll64_vm_vm_i(x, c) _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi64(_mm256_extractf128_si256(x, 0), c)), _mm_slli_epi64(_mm256_extractf128_si256(x, 1), c), 1)
//@#define vsrl64_vm_vm_i(x, c) _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_srli_epi64(_mm256_extractf128_si256(x, 0), c)), _mm_srli_epi64(_mm256_extractf128_si256(x, 1), c), 1)
|
||||
|
||||
// Sign-extend 4 x i32 to 4 x i64: shuffle the 32-bit words into the low half
// of each 64-bit lane, then OR in all-ones high words for negative inputs
// (vgt(0, vi) masks the lanes where vi < 0).
static INLINE vmask vcast_vm_vi(vint vi) {
  vint vi0 = _mm_and_si128(_mm_shuffle_epi32(vi, (1 << 4) | (1 << 6)), _mm_set_epi32(0, -1, 0, -1));
  vint vi1 = _mm_and_si128(_mm_shuffle_epi32(vi, (2 << 0) | (2 << 2) | (3 << 4) | (3 << 6)), _mm_set_epi32(0, -1, 0, -1));
  vmask m = _mm256_insertf128_si256(_mm256_castsi128_si256(vi0), vi1, 1);
  return vor_vm_vm_vm(vcastu_vm_vi(vand_vi_vo_vi(vgt_vo_vi_vi(vcast_vi_i(0), vi), vcast_vi_i(-1))), m);
}
// Narrow 4 x i64 to 4 x i32 by taking the LOW 32 bits of each lane, gathered
// from the two 128-bit halves with float-domain shuffles.
static INLINE vint vcast_vi_vm(vmask vm) {
  return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vm)), _mm_set1_ps(0), 0x08)),
		      _mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vm, 1)), 0x80)));
}

// vint64/vuint64 share the vmask representation; these are identity casts.
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; }
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return m; }
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return v; }
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return m; }
|
||||
@@ -0,0 +1,485 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#if CONFIG == 1
|
||||
|
||||
#if !defined(__AVX2__) && !defined(SLEEF_GENHEADER)
|
||||
#error Please specify -mavx2.
|
||||
#endif
|
||||
|
||||
#else
|
||||
#error CONFIG macro invalid or not defined
|
||||
#endif
|
||||
|
||||
#define ENABLE_DP
|
||||
//@#define ENABLE_DP
|
||||
#define LOG2VECTLENDP 2
|
||||
//@#define LOG2VECTLENDP 2
|
||||
#define VECTLENDP (1 << LOG2VECTLENDP)
|
||||
//@#define VECTLENDP (1 << LOG2VECTLENDP)
|
||||
#define ENABLE_FMA_DP
|
||||
//@#define ENABLE_FMA_DP
|
||||
|
||||
#define ENABLE_SP
|
||||
//@#define ENABLE_SP
|
||||
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
|
||||
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
|
||||
#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
//@#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
#define ENABLE_FMA_SP
|
||||
//@#define ENABLE_FMA_SP
|
||||
|
||||
#define FULL_FP_ROUNDING
|
||||
//@#define FULL_FP_ROUNDING
|
||||
#define ACCURATE_SQRT
|
||||
//@#define ACCURATE_SQRT
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
#if defined(_MSC_VER)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include "misc.h"
|
||||
#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
typedef __m256i vmask;
|
||||
typedef __m256i vopmask;
|
||||
|
||||
typedef __m256d vdouble;
|
||||
typedef __m128i vint;
|
||||
|
||||
typedef __m256 vfloat;
|
||||
typedef __m256i vint2;
|
||||
|
||||
typedef __m256i vint64;
|
||||
typedef __m256i vuint64;
|
||||
|
||||
typedef struct {
|
||||
vmask x, y;
|
||||
} vquad;
|
||||
|
||||
typedef vquad vargquad;
|
||||
|
||||
//
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
|
||||
#ifndef __SLEEF_H__
|
||||
void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx);
|
||||
#endif
|
||||
|
||||
// CPUID leaf 7, subleaf 0: reg[1] is EBX; bit 5 is the AVX2 feature flag.
static INLINE int cpuSupportsAVX2() {
  int32_t reg[4];
  Sleef_x86CpuID(reg, 7, 0);
  return (reg[1] & (1 << 5)) != 0;
}

// CPUID leaf 1: reg[2] is ECX; bit 12 is the FMA feature flag.
static INLINE int cpuSupportsFMA() {
  int32_t reg[4];
  Sleef_x86CpuID(reg, 1, 0);
  return (reg[2] & (1 << 12)) != 0;
}
|
||||
|
||||
#if CONFIG == 1 && defined(__AVX2__)
|
||||
// Runtime availability check for this ISA extension: returns 3 when the CPU
// supports both AVX2 and FMA, 0 otherwise. The `name` parameter is part of
// the common helper interface but unused here.
static INLINE int vavailability_i(int name) {
  (void)name; // unused; availability is uniform for the whole extension
  int d = cpuSupportsAVX2() && cpuSupportsFMA();
  return d ? 3 : 0;
}
|
||||
#define ISANAME "AVX2"
|
||||
#define DFTPRIORITY 25
|
||||
#endif
|
||||
|
||||
#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
// Prefetch into all cache levels.
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }

// True iff every bit of the 256-bit mask is one: AND the two 128-bit halves,
// then test-all-ones. Identical for 32- and 64-bit lane interpretations.
static INLINE int vtestallones_i_vo32(vopmask g) {
  return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1)));
}

static INLINE int vtestallones_i_vo64(vopmask g) {
  return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1)));
}
|
||||
|
||||
// ---- scalar broadcast and reinterpretation (AVX2 header) ----

static INLINE vdouble vcast_vd_d(double d) { return _mm256_set1_pd(d); }
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm256_castpd_si256(vd); }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm256_castsi256_pd(vm); }

// ---- unaligned integer loads/stores ----
// In this header vint2 is a single __m256i, so one 256-bit move suffices.

static vint2 vloadu_vi2_p(int32_t *p) { return _mm256_loadu_si256((__m256i const *)p); }
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm256_storeu_si256((__m256i *)p, v); }
static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }
static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
|
||||
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
|
||||
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
|
||||
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
|
||||
static INLINE vopmask vcast_vo32_vo64(vopmask o) {
|
||||
return _mm256_permutevar8x32_epi32(o, _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0));
|
||||
}
|
||||
|
||||
static INLINE vopmask vcast_vo64_vo32(vopmask o) {
|
||||
return _mm256_permutevar8x32_epi32(o, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0));
|
||||
}
|
||||
|
||||
static INLINE vopmask vcast_vo_i(int i) { return _mm256_set1_epi64x(i ? -1 : 0); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); }
|
||||
static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); }
|
||||
static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
|
||||
static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm256_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
|
||||
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
|
||||
static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm256_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
|
||||
static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); }
|
||||
static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); }
|
||||
|
||||
static INLINE vmask vcastu_vm_vi(vint vi) {
|
||||
return _mm256_slli_epi64(_mm256_cvtepi32_epi64(vi), 32);
|
||||
}
|
||||
|
||||
static INLINE vint vcastu_vi_vm(vmask vi) {
|
||||
return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vi)), _mm_set1_ps(0), 0x0d)),
|
||||
_mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vi, 1)), 0xd0)));
|
||||
}
|
||||
|
||||
static INLINE vmask vcast_vm_i_i(int i0, int i1) {
|
||||
return _mm256_set_epi32(i0, i1, i0, i1, i0, i1, i0, i1);
|
||||
}
|
||||
|
||||
static INLINE vmask vcast_vm_i64(int64_t i) { return _mm256_set1_epi64x(i); }
|
||||
static INLINE vmask vcast_vm_u64(uint64_t i) { return _mm256_set1_epi64x((uint64_t)i); }
|
||||
|
||||
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm256_cmpeq_epi64(x, y); }
|
||||
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm256_add_epi64(x, y); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); }
|
||||
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); }
|
||||
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); }
|
||||
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm256_div_pd(x, y); }
|
||||
static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm256_set1_pd(1), x); }
|
||||
static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); }
|
||||
static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm256_andnot_pd(_mm256_set1_pd(-0.0), d); }
|
||||
static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm256_xor_pd(_mm256_set1_pd(-0.0), d); }
|
||||
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); }
|
||||
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmsub_pd(x, y, z); }
|
||||
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmadd_pd(x, y, z); }
|
||||
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm256_max_pd(x, y); }
|
||||
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm256_min_pd(x, y); }
|
||||
|
||||
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); }
|
||||
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); }
|
||||
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmsub_pd(x, y, z); }
|
||||
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmadd_pd(x, y, z); }
|
||||
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmsub_pd(x, y, z); }
|
||||
|
||||
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_EQ_OQ)); }
|
||||
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_NEQ_UQ)); }
|
||||
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LT_OQ)); }
|
||||
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LE_OQ)); }
|
||||
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GT_OQ)); }
|
||||
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GE_OQ)); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); }
|
||||
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); }
|
||||
static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }
|
||||
|
||||
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); }
|
||||
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); }
|
||||
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); }
|
||||
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); }
|
||||
|
||||
static INLINE vint vandnot_vi_vo_vi(vopmask m, vint y) { return _mm_andnot_si128(_mm256_castsi256_si128(m), y); }
|
||||
static INLINE vint vand_vi_vo_vi(vopmask m, vint y) { return _mm_and_si128(_mm256_castsi256_si128(m), y); }
|
||||
|
||||
static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); }
|
||||
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); }
|
||||
static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); }
|
||||
|
||||
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
|
||||
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }
|
||||
|
||||
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpeq_epi32(x, y)); }
|
||||
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpgt_epi32(x, y)); }
|
||||
|
||||
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return _mm_blendv_epi8(y, x, _mm256_castsi256_si128(m)); }
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return _mm256_blendv_pd(y, x, _mm256_castsi256_pd(o)); }
|
||||
static INLINE vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) { return _mm256_permutevar_pd(_mm256_set_pd(v1, v0, v1, v0), o); }
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
|
||||
__m256i v = _mm256_castpd_si256(vsel_vd_vo_vd_vd(o0, _mm256_castsi256_pd(_mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0)),
|
||||
vsel_vd_vo_vd_vd(o1, _mm256_castsi256_pd(_mm256_set_epi32(3, 2, 3, 2, 3, 2, 3, 2)),
|
||||
vsel_vd_vo_vd_vd(o2, _mm256_castsi256_pd(_mm256_set_epi32(5, 4, 5, 4, 5, 4, 5, 4)),
|
||||
_mm256_castsi256_pd(_mm256_set_epi32(7, 6, 7, 6, 7, 6, 7, 6))))));
|
||||
return _mm256_castsi256_pd(_mm256_permutevar8x32_epi32(_mm256_castpd_si256(_mm256_set_pd(d3, d2, d1, d0)), v));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
|
||||
return vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o1, d0, d1, d2, d2);
|
||||
}
|
||||
|
||||
static INLINE vopmask visinf_vo_vd(vdouble d) {
|
||||
return vreinterpret_vm_vd(_mm256_cmp_pd(vabs_vd_vd(d), _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));
|
||||
}
|
||||
|
||||
static INLINE vopmask vispinf_vo_vd(vdouble d) {
|
||||
return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));
|
||||
}
|
||||
|
||||
static INLINE vopmask visminf_vo_vd(vdouble d) {
|
||||
return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(-SLEEF_INFINITY), _CMP_EQ_OQ));
|
||||
}
|
||||
|
||||
static INLINE vopmask visnan_vo_vd(vdouble d) {
|
||||
return vreinterpret_vm_vd(_mm256_cmp_pd(d, d, _CMP_NEQ_UQ));
|
||||
}
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// This function is needed when debugging on MSVC.
|
||||
static INLINE double vcast_d_vd(vdouble v) {
|
||||
double s[4];
|
||||
_mm256_storeu_pd(s, v);
|
||||
return s[0];
|
||||
}
|
||||
#endif
|
||||
|
||||
static INLINE vdouble vload_vd_p(const double *ptr) { return _mm256_load_pd(ptr); }
|
||||
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(ptr); }
|
||||
|
||||
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); }
|
||||
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }
|
||||
|
||||
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm256_i32gather_pd(ptr, vi, 8); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
|
||||
static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }
|
||||
|
||||
static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvtps_epi32(vf)); }
|
||||
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvttps_epi32(vf)); }
|
||||
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm256_cvtepi32_ps(vcast_vm_vi2(vi)); }
|
||||
static INLINE vfloat vcast_vf_f(float f) { return _mm256_set1_ps(f); }
|
||||
static INLINE vint2 vcast_vi2_i(int i) { return _mm256_set1_epi32(i); }
|
||||
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm256_castps_si256(vf); }
|
||||
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm256_castsi256_ps(vm); }
|
||||
|
||||
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return vreinterpret_vf_vm(vcast_vm_vi2(vi)); }
|
||||
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return vcast_vi2_vm(vreinterpret_vm_vf(vf)); }
|
||||
|
||||
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); }
|
||||
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); }
|
||||
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); }
|
||||
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); }
|
||||
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }
|
||||
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); }
|
||||
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
|
||||
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); }
|
||||
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmsub_ps(x, y, z); }
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmadd_ps(x, y, z); }
|
||||
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); }
|
||||
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); }
|
||||
|
||||
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); }
|
||||
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); }
|
||||
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmsub_ps(x, y, z); }
|
||||
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmadd_ps(x, y, z); }
|
||||
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmsub_ps(x, y, z); }
|
||||
|
||||
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_EQ_OQ)); }
|
||||
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_NEQ_UQ)); }
|
||||
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LT_OQ)); }
|
||||
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LE_OQ)); }
|
||||
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GT_OQ)); }
|
||||
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GE_OQ)); }
|
||||
|
||||
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_add_epi32(x, y); }
|
||||
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_sub_epi32(x, y); }
|
||||
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); }
|
||||
|
||||
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_and_si256(x, y); }
|
||||
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_andnot_si256(x, y); }
|
||||
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_or_si256(x, y); }
|
||||
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_xor_si256(x, y); }
|
||||
|
||||
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
|
||||
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
|
||||
|
||||
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return _mm256_slli_epi32(x, c); }
|
||||
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return _mm256_srli_epi32(x, c); }
|
||||
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return _mm256_srai_epi32(x, c); }
|
||||
|
||||
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpeq_epi32(x, y); }
|
||||
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpgt_epi32(x, y); }
|
||||
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpeq_epi32(x, y); }
|
||||
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpgt_epi32(x, y); }
|
||||
|
||||
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
|
||||
return _mm256_blendv_epi8(y, x, m);
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return _mm256_blendv_ps(y, x, _mm256_castsi256_ps(o)); }
|
||||
|
||||
// At this point, the following three functions are implemented in a generic way,
|
||||
// but I will try target-specific optimization later on.
|
||||
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
|
||||
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
|
||||
|
||||
#ifdef _MSC_VER
|
||||
// This function is needed when debugging on MSVC.
|
||||
static INLINE float vcast_f_vf(vfloat v) {
|
||||
float s[8];
|
||||
_mm256_storeu_ps(s, v);
|
||||
return s[0];
|
||||
}
|
||||
#endif
|
||||
|
||||
static INLINE vfloat vload_vf_p(const float *ptr) { return _mm256_load_ps(ptr); }
|
||||
static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr); }
|
||||
|
||||
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); }
|
||||
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }
|
||||
|
||||
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm256_i32gather_ps(ptr, vi2, 4); }
|
||||
|
||||
//
|
||||
|
||||
#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 })
|
||||
#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0 })
|
||||
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })
|
||||
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })
|
||||
|
||||
static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
|
||||
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
|
||||
static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
|
||||
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }
|
||||
|
||||
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_addsub_pd(x, y); }
|
||||
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_addsub_ps(x, y); }
|
||||
|
||||
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
|
||||
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }
|
||||
|
||||
static INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm256_shuffle_pd(d0, d0, (0 << 3) | (1 << 2) | (0 << 1) | (1 << 0)); }
|
||||
static INLINE vdouble vreva2_vd_vd(vdouble d0) { d0 = _mm256_permute2f128_pd(d0, d0, 1); return _mm256_shuffle_pd(d0, d0, (1 << 3) | (0 << 2) | (1 << 1) | (0 << 0)); }
|
||||
|
||||
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm256_stream_pd(ptr, v); }
|
||||
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
|
||||
_mm_store_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));
|
||||
_mm_store_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
|
||||
_mm_stream_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));
|
||||
_mm_stream_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm256_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); }
|
||||
static INLINE vfloat vreva2_vf_vf(vfloat d0) { d0 = _mm256_permute2f128_ps(d0, d0, 1); return _mm256_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); }
|
||||
|
||||
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm256_stream_ps(ptr, v); }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
|
||||
_mm_storel_pd((double *)(ptr+(offset + step * 0)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));
|
||||
_mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));
|
||||
_mm_storel_pd((double *)(ptr+(offset + step * 2)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));
|
||||
_mm_storeh_pd((double *)(ptr+(offset + step * 3)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
|
||||
|
||||
//
|
||||
|
||||
static vquad loadu_vq_p(void *p) {
|
||||
vquad vq;
|
||||
memcpy(&vq, p, VECTLENDP * 16);
|
||||
return vq;
|
||||
}
|
||||
|
||||
static INLINE vquad cast_vq_aq(vargquad aq) {
|
||||
vquad vq;
|
||||
memcpy(&vq, &aq, VECTLENDP * 16);
|
||||
return vq;
|
||||
}
|
||||
|
||||
static INLINE vargquad cast_aq_vq(vquad vq) {
|
||||
vargquad aq;
|
||||
memcpy(&aq, &vq, VECTLENDP * 16);
|
||||
return aq;
|
||||
}
|
||||
|
||||
static INLINE int vtestallzeros_i_vo64(vopmask g) {
|
||||
return _mm_movemask_epi8(_mm_or_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))) == 0;
|
||||
}
|
||||
|
||||
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) { return _mm256_blendv_epi8(y, x, o); }
|
||||
|
||||
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return _mm256_sub_epi64(x, y); }
|
||||
static INLINE vmask vneg64_vm_vm(vmask x) { return _mm256_sub_epi64(vcast_vm_i_i(0, 0), x); }
|
||||
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return _mm256_cmpgt_epi64(x, y); } // signed compare
|
||||
|
||||
#define vsll64_vm_vm_i(x, c) _mm256_slli_epi64(x, c)
|
||||
#define vsrl64_vm_vm_i(x, c) _mm256_srli_epi64(x, c)
|
||||
//@#define vsll64_vm_vm_i(x, c) _mm256_slli_epi64(x, c)
|
||||
//@#define vsrl64_vm_vm_i(x, c) _mm256_srli_epi64(x, c)
|
||||
|
||||
static INLINE vmask vcast_vm_vi(vint vi) { return _mm256_cvtepi32_epi64(vi); } // signed 32-bit => 64-bit
|
||||
static INLINE vint vcast_vi_vm(vmask vm) { // signed 32-bit <= 64-bit
|
||||
return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vm)), _mm_set1_ps(0), 0x08)),
|
||||
_mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vm, 1)), 0x80)));
|
||||
}
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; }
|
||||
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return m; }
|
||||
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return v; }
|
||||
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return m; }
|
||||
@@ -0,0 +1,463 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#if CONFIG == 1

// This helper requires AVX2 code generation (except when only generating headers).
#if !defined(__AVX2__) && !defined(SLEEF_GENHEADER)
#error Please specify -mavx2.
#endif

#else
#error CONFIG macro invalid or not defined
#endif
|
||||
|
||||
// Feature configuration for this (128-bit) helper: 2 doubles / 4 floats
// per vector, with FMA available for both precisions.  The //@# lines are
// mirrored into the generated public header by the SLEEF_GENHEADER pass.
#define ENABLE_DP
//@#define ENABLE_DP
#define LOG2VECTLENDP 1
//@#define LOG2VECTLENDP 1
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)
#define ENABLE_FMA_DP
//@#define ENABLE_FMA_DP

#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP

#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
// MSVC ships the intrinsics in <intrin.h>; other compilers use <x86intrin.h>.
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif

#include <stdint.h>
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
// 128-bit vector type aliases used by this helper.
typedef __m128i vmask;    // generic bit-mask vector
typedef __m128i vopmask;  // comparison-result mask (all-ones / all-zeros lanes)

typedef __m128d vdouble;  // 2 x double
typedef __m128i vint;     // int32 lanes paired with the doubles

typedef __m128 vfloat;    // 4 x float
typedef __m128i vint2;    // 4 x int32, paired with the 4 float lanes

typedef __m128i vint64;   // 2 x int64
typedef __m128i vuint64;  // 2 x uint64

// Pair of masks used to carry quad-precision payloads.
typedef struct {
  vmask x, y;
} vquad;

typedef vquad vargquad;
|
||||
|
||||
//
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
|
||||
#ifndef __SLEEF_H__
|
||||
void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx);
|
||||
#endif
|
||||
|
||||
static INLINE int cpuSupportsAVX2() {
|
||||
int32_t reg[4];
|
||||
Sleef_x86CpuID(reg, 7, 0);
|
||||
return (reg[1] & (1 << 5)) != 0;
|
||||
}
|
||||
|
||||
static INLINE int cpuSupportsFMA() {
|
||||
int32_t reg[4];
|
||||
Sleef_x86CpuID(reg, 1, 0);
|
||||
return (reg[2] & (1 << 12)) != 0;
|
||||
}
|
||||
|
||||
#if CONFIG == 1 && defined(__AVX2__)
|
||||
static INLINE int vavailability_i(int name) {
|
||||
int d = cpuSupportsAVX2() && cpuSupportsFMA();
|
||||
return d ? 3 : 0;
|
||||
}
|
||||
#define ISANAME "AVX2"
|
||||
#define DFTPRIORITY 25
|
||||
#endif
|
||||
|
||||
#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }
|
||||
|
||||
static INLINE int vtestallones_i_vo32(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; }
|
||||
static INLINE int vtestallones_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vdouble vcast_vd_d(double d) { return _mm_set1_pd(d); }
|
||||
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm_castpd_si128(vd); }
|
||||
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm_castsi128_pd(vm); }
|
||||
|
||||
//
|
||||
|
||||
static vint2 vloadu_vi2_p(int32_t *p) { return _mm_loadu_si128((__m128i const *)p); }
|
||||
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm_storeu_si128((__m128i *)p, v); }
|
||||
static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }
|
||||
static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
|
||||
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
|
||||
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
|
||||
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
|
||||
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return _mm_shuffle_epi32(m, 0x08); }
|
||||
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return _mm_shuffle_epi32(m, 0x50); }
|
||||
|
||||
static INLINE vopmask vcast_vo_i(int i) { return _mm_set1_epi64x(i ? -1 : 0); }
|
||||
|
||||
// Conversions, rounding, and 64-bit integer operations.

// Round/truncate doubles to packed 32-bit ints (results in the low half).
static INLINE vint vrint_vi_vd(vdouble vd) { return _mm_cvtpd_epi32(vd); }
static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm_cvttpd_epi32(vd); }

// Round-to-nearest / truncate without leaving the FP domain (SSE4.1 round).
static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }

static INLINE vdouble vcast_vd_vi(vint vi) { return _mm_cvtepi32_pd(vi); }
static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); }

// Move the two 32-bit ints into the upper halves of the 64-bit lanes
// (lower halves zeroed), and extract them back from the upper halves.
static INLINE vmask vcastu_vm_vi(vint vi) { return _mm_and_si128(_mm_shuffle_epi32(vi, 0x73), _mm_set_epi32(-1, 0, -1, 0)); }
static INLINE vint vcastu_vi_vm(vmask vi) { return _mm_shuffle_epi32(vi, 0x0d); }

// Build a mask whose 64-bit lanes are the pair (i0:i1), i0 in the high dword.
static INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm_set_epi32(i0, i1, i0, i1); }

static INLINE vmask vcast_vm_i64(int64_t i) { return _mm_set1_epi64x(i); }
static INLINE vmask vcast_vm_u64(uint64_t i) { return _mm_set1_epi64x((uint64_t)i); }

// 64-bit lane compare/add (SSE4.1 cmpeq_epi64).
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm_cmpeq_epi64(x, y); }
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm_add_epi64(x, y); }
|
||||
|
||||
// Double-precision arithmetic, FMA, and comparisons.

static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_add_pd(x, y); }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm_sub_pd(x, y); }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm_mul_pd(x, y); }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm_div_pd(x, y); }
static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm_div_pd(_mm_set1_pd(1), x); } // exact reciprocal via divide
static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm_sqrt_pd(x); }
// Clear / flip the sign bit via masking with -0.0.
static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm_andnot_pd(_mm_set1_pd(-0.0), d); }
static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm_xor_pd(_mm_set1_pd(-0.0), d); }
// mla = x*y + z, mlapn = x*y - z, mlanp = -(x*y) + z (fused on this config).
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fmadd_pd(x, y, z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fmsub_pd(x, y, z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fnmadd_pd(x, y, z); }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm_max_pd(x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm_min_pd(x, y); }

// Explicit FMA family: pp = x*y+z, pn = x*y-z, np = -(x*y)+z, nn = -(x*y)-z.
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fmadd_pd(x, y, z); }
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fmadd_pd(x, y, z); }
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fmsub_pd(x, y, z); }
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fnmadd_pd(x, y, z); }
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fnmsub_pd(x, y, z); }

// Comparisons: ordered-quiet except NEQ, which is unordered so that a NaN
// operand compares as "not equal".
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_EQ_OQ)); }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_NEQ_UQ)); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_LT_OQ)); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_LE_OQ)); }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_GT_OQ)); }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_GE_OQ)); }
|
||||
|
||||
// 32-bit integer (vint) operations.

static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); }
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); }
static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }

static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); }
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); } // (~x) & y
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); }
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); }

// Masking a vint with a comparison mask is plain bitwise on this ISA.
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return _mm_and_si128(x, y); }
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return _mm_andnot_si128(x, y); }

// Shifts by an immediate count: logical left/right, arithmetic right.
static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); }
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); }
static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); }

// Compare results returned either as a vint or as a vopmask (same bits).
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }

static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }

// Lane-wise select: m ? x : y (blendv picks the 2nd source where mask set).
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return _mm_blendv_epi8(y, x, m); }
|
||||
|
||||
// Lane-wise select for doubles: o ? x : y.
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return _mm_blendv_pd(y, x, _mm_castsi128_pd(o)); }

// Scalar-constant selects built on top of vsel_vd_vo_vd_vd:
// one, two, and three cascaded conditions.
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
  return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
}

static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}

static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}

// Classification predicates (per lane).
static INLINE vopmask visinf_vo_vd(vdouble d) {
  return vreinterpret_vm_vd(_mm_cmp_pd(vabs_vd_vd(d), _mm_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));
}

static INLINE vopmask vispinf_vo_vd(vdouble d) {
  return vreinterpret_vm_vd(_mm_cmp_pd(d, _mm_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));
}

static INLINE vopmask visminf_vo_vd(vdouble d) {
  return vreinterpret_vm_vd(_mm_cmp_pd(d, _mm_set1_pd(-SLEEF_INFINITY), _CMP_EQ_OQ));
}

// NaN is the only value that compares unequal to itself.
static INLINE vopmask visnan_vo_vd(vdouble d) {
  return vreinterpret_vm_vd(_mm_cmp_pd(d, d, _CMP_NEQ_UQ));
}

// Aligned / unaligned loads and stores.
static INLINE vdouble vload_vd_p(const double *ptr) { return _mm_load_pd(ptr); }
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm_loadu_pd(ptr); }

static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm_storeu_pd(ptr, v); }

// Gather doubles at ptr[vi[k]] (scale 8 = sizeof(double)).
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm_i32gather_pd(ptr, vi, 8); }

#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
// Extracts lane 0 through memory so the debugger can display it.
static INLINE double vcast_d_vd(vdouble v) {
  double a[VECTLENDP];
  vstoreu_v_p_vd(a, v);
  return a[0];
}
#endif
|
||||
|
||||
// Single-precision / vint2 casts and reinterprets.
// On this ISA vint2 and vmask are the same 128-bit register type, so the
// casts below are identity functions kept for interface uniformity.

static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }

static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm_cvtps_epi32(vf)); }
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm_cvttps_epi32(vf)); }
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm_cvtepi32_ps(vcast_vm_vi2(vi)); }
static INLINE vfloat vcast_vf_f(float f) { return _mm_set1_ps(f); }
static INLINE vint2 vcast_vi2_i(int i) { return _mm_set1_epi32(i); }
// Bit-pattern reinterprets (no value conversion).
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm_castps_si128(vf); }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm_castsi128_ps(vm); }

static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return vreinterpret_vf_vm(vcast_vm_vi2(vi)); }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return vcast_vi2_vm(vreinterpret_vm_vf(vf)); }
|
||||
|
||||
// Single-precision arithmetic, FMA, and comparisons
// (structure mirrors the double-precision section above).

static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_add_ps(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm_sub_ps(x, y); }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm_mul_ps(x, y); }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm_div_ps(x, y); }
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); } // exact reciprocal via divide
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm_sqrt_ps(x); }
// Clear / flip the sign bit via masking with -0.0f.
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
// mla = x*y + z, mlapn = x*y - z, mlanp = -(x*y) + z (fused on this config).
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fmadd_ps(x, y, z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fmsub_ps(x, y, z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fnmadd_ps(x, y, z); }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm_max_ps(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm_min_ps(x, y); }

// Explicit FMA family: pp = x*y+z, pn = x*y-z, np = -(x*y)+z, nn = -(x*y)-z.
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fmadd_ps(x, y, z); }
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fmadd_ps(x, y, z); }
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fmsub_ps(x, y, z); }
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fnmadd_ps(x, y, z); }
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fnmsub_ps(x, y, z); }

// Comparisons: ordered-quiet except NEQ (unordered, true for NaN operands).
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_EQ_OQ)); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_NEQ_UQ)); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_LT_OQ)); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_LE_OQ)); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_GT_OQ)); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_GE_OQ)); }
|
||||
|
||||
// 32-bit integer (vint2) operations — the single-precision counterpart of
// the vint section above.

static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_add_epi32(x, y); }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_sub_epi32(x, y); }
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); }

static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_and_si128(x, y); }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_andnot_si128(x, y); } // (~x) & y
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_or_si128(x, y); }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_xor_si128(x, y); }

static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi2_vi2_vi2(vcast_vi2_vm(x), y); }

// Shifts by an immediate count: logical left/right, arithmetic right.
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return _mm_slli_epi32(x, c); }
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return _mm_srli_epi32(x, c); }
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return _mm_srai_epi32(x, c); }

// Compare results returned either as a vopmask or as a vint2 (same bits).
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); }
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); }

// Lane-wise select: m ? x : y.
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
  return _mm_blendv_epi8(y, x, m);
}
|
||||
|
||||
// Lane-wise select for floats: o ? x : y.
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return _mm_blendv_ps(y, x, _mm_castsi128_ps(o)); }

// Scalar-constant selects: one, two, and three cascaded conditions.
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
  return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}

static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}

static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}

// Classification predicates; NaN detected via self-inequality.
static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }

// Aligned / unaligned loads and stores.
static INLINE vfloat vload_vf_p(const float *ptr) { return _mm_load_ps(ptr); }
static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm_loadu_ps(ptr); }

static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm_storeu_ps(ptr, v); }

// Gather floats at ptr[vi2[k]] (scale 4 = sizeof(float)).
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm_i32gather_ps(ptr, vi2, 4); }

#ifdef _MSC_VER
// This function is needed when debugging on MSVC.
// Extracts lane 0 through memory so the debugger can display it.
static INLINE float vcast_f_vf(vfloat v) {
  float a[VECTLENSP];
  vstoreu_v_p_vf(a, v);
  return a[0];
}
#endif
|
||||
|
||||
// Sign-alternation, add/sub interleave, lane reversal, and scatter helpers.

// Sign masks that alternate per lane: PN = (+,-), NP = (-,+).
#define PNMASK ((vdouble) { +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0 })
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })

// XOR with the alternating sign masks: posneg negates odd lanes,
// negpos negates even lanes.
static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }

// addsub: even lanes subtract, odd lanes add.
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_addsub_pd(x, y); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_addsub_ps(x, y); }

// x*y with z alternately subtracted/added, built from mla + negpos.
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }

// Swap the two double lanes; reversing a 1-pair vector is the identity.
static INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm_shuffle_pd(d0, d0, 1); }
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vd; }

// Non-temporal store and 2-element scatter.  With 2-lane vectors only one
// pair is written, so the step argument is unused here.
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm_stream_pd(ptr, v); }
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { _mm_stream_pd((double *)(&ptr[2*offset]), v); }

// Single-precision lane rearrangements: swap within pairs / swap the pairs.
static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); }
static INLINE vfloat vreva2_vf_vf(vfloat d0) { return _mm_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); }

static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm_stream_ps(ptr, v); }

// Scatter float pairs: each 64-bit half of the vector (a pair of floats) is
// stored as one double-sized unit at ptr[(offset + step*k)*2].
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
  _mm_storel_pd((double *)(ptr+(offset + step * 0)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
  _mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
}

// Streaming variant; same addressing as vscatter2_v_p_i_i_vf (the stores
// themselves are not non-temporal here).
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
  _mm_storel_pd((double *)(ptr+(offset + step * 0)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
  _mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
}
|
||||
|
||||
// Quad-precision carrier type (vquad = two vmask halves) helpers.

// Unaligned load of a vquad: the two vmask halves are read back to back.
static vquad loadu_vq_p(void *p) {
  vquad vq = {
    vloadu_vi2_p((int32_t *)p),
    vloadu_vi2_p((int32_t *)((uint8_t *)p + sizeof(vmask)))
  };
  return vq;
}

// Type-pun between vargquad (argument-passing layout) and vquad via memcpy;
// both are VECTLENDP * 16 bytes.
static INLINE vquad cast_vq_aq(vargquad aq) {
  vquad vq;
  memcpy(&vq, &aq, VECTLENDP * 16);
  return vq;
}

static INLINE vargquad cast_aq_vq(vquad vq) {
  vargquad aq;
  memcpy(&aq, &vq, VECTLENDP * 16);
  return aq;
}

// Unaligned store of a vquad, mirroring loadu_vq_p.
static void vstoreu_v_p_vq(void *p, vquad vq) {
  vstoreu_v_p_vi2((int32_t *)p, vcast_vi2_vm(vq.x));
  vstoreu_v_p_vi2((int32_t *)((uint8_t *)p + sizeof(vmask)), vcast_vi2_vm(vq.y));
}
|
||||
|
||||
// 64-bit mask arithmetic and vint <-> vmask conversions.

// True (1) iff every byte of the mask is zero.
static INLINE int vtestallzeros_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0; }

// Lane-wise select on raw masks: o ? x : y.
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) { return _mm_blendv_epi8(y, x, o); }

static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return _mm_sub_epi64(x, y); }
static INLINE vmask vneg64_vm_vm(vmask x) { return _mm_sub_epi64(vcast_vm_i_i(0, 0), x); }
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return _mm_cmpgt_epi64(x, y); } // signed compare

// 64-bit shifts as macros because the count must be an immediate.
// (//@ lines appear to be markers for the SLEEF header generator.)
#define vsll64_vm_vm_i(x, c) _mm_slli_epi64(x, c)
#define vsrl64_vm_vm_i(x, c) _mm_srli_epi64(x, c)
//@#define vsll64_vm_vm_i(x, c) _mm_slli_epi64(x, c)
//@#define vsrl64_vm_vm_i(x, c) _mm_srli_epi64(x, c)

// Sign-extend the two 32-bit ints into 64-bit lanes: m holds the
// zero-extended values, then the high halves are OR-ed with the sign mask
// computed from (0 > vi).
static INLINE vmask vcast_vm_vi(vint vi) {
  vmask m = _mm_and_si128(_mm_shuffle_epi32(vi, (0 << 6) | (1 << 4) | (0 << 2) | (0 << 0)), _mm_set_epi32(0, -1, 0, -1));
  return vor_vm_vm_vm(vcastu_vm_vi(vgt_vo_vi_vi(vcast_vi_i(0), vi)), m);
}
// Truncate 64-bit lanes back to 32-bit ints (0x08 selects the low dwords).
static INLINE vint vcast_vi_vm(vmask vm) { return _mm_shuffle_epi32(vm, 0x08); }

// vint64/vuint64 share the vmask register type; reinterprets are identity.
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; }
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return m; }
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return v; }
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return m; }
|
||||
@@ -0,0 +1,600 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
// Configuration for the AVX512F helper.
// CONFIG == 1 enables hardware FMA; CONFIG == 2 is the no-FMA variant.
#if CONFIG == 1 || CONFIG == 2

#if !defined(__AVX512F__) && !defined(SLEEF_GENHEADER)
#error Please specify -mavx512f.
#endif

#else
#error CONFIG macro invalid or not defined
#endif

// NOTE(review): the //@ copies of each #define appear to be markers consumed
// by the SLEEF header generator (SLEEF_GENHEADER builds); keep them in sync
// with the plain #define directly above each one.

// Double precision: 2^3 = 8 lanes per 512-bit vector.
#define ENABLE_DP
//@#define ENABLE_DP
#define LOG2VECTLENDP 3
//@#define LOG2VECTLENDP 3
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)

// Single precision: twice as many lanes (16).
#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)

#if CONFIG == 1
#define ENABLE_FMA_DP
//@#define ENABLE_FMA_DP
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP
#endif

#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT
|
||||
|
||||
// Intrinsics headers are skipped when generating the public header.
#if !defined(SLEEF_GENHEADER)
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif

#include <stdint.h>
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)

// Vector type aliases for the AVX512F target.
typedef __m512i vmask;     // 512-bit raw bit mask
typedef __mmask16 vopmask; // k-register comparison mask (1 bit per lane)

typedef __m512d vdouble;   // 8 doubles
typedef __m256i vint;      // 8 x int32 (matches the 8 double lanes)

typedef __m512 vfloat;     // 16 floats
typedef __m512i vint2;     // 16 x int32 (matches the 16 float lanes)

typedef __m512i vint64;    // 8 x int64
typedef __m512i vuint64;   // 8 x uint64

// Quad-precision carrier: two 512-bit halves.
typedef struct {
  vmask x, y;
} vquad;

// Argument-passing alias for vquad.
typedef vquad vargquad;
|
||||
|
||||
// Runtime ISA detection (omitted from generated headers).

#if !defined(SLEEF_GENHEADER)

#ifndef __SLEEF_H__
void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx);
#endif

// CPUID leaf 7, EBX bit 16 is the AVX-512 Foundation feature flag.
static INLINE int cpuSupportsAVX512F() {
  int32_t reg[4];
  Sleef_x86CpuID(reg, 7, 0);
  return (reg[1] & (1 << 16)) != 0;
}

// vavailability_i returns 3 when the ISA is usable, 0 otherwise.
#if CONFIG == 1 && defined(__AVX512F__)
static INLINE int vavailability_i(int name) {
  int d = cpuSupportsAVX512F();
  return d ? 3 : 0;
}
#define ISANAME "AVX512F"
#define DFTPRIORITY 30
#endif

#if CONFIG == 2 && defined(__AVX512F__)
static INLINE int vavailability_i(int name) {
  int d = cpuSupportsAVX512F();
  return d ? 3 : 0;
}
#define ISANAME "AVX512FNOFMA"
#define DFTPRIORITY 0   // no-FMA variant has lowest dispatch priority
#endif

#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
// Prefetch into all cache levels.
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }

// Test whether every lane bit of a k-mask is set: 8 bits for 64-bit lanes,
// 16 bits for 32-bit lanes.  ICC needs _mm512_mask2int to read the mask.
#ifdef __INTEL_COMPILER
static INLINE int vtestallones_i_vo64(vopmask g) { return _mm512_mask2int(g) == 0xff; }
static INLINE int vtestallones_i_vo32(vopmask g) { return _mm512_mask2int(g) == 0xffff; }
#else
static INLINE int vtestallones_i_vo64(vopmask g) { return g == 0xff; }
static INLINE int vtestallones_i_vo32(vopmask g) { return g == 0xffff; }
#endif
|
||||
|
||||
// Unaligned loads/stores for the 512-bit and 256-bit integer vectors.

static vint2 vloadu_vi2_p(int32_t *p) { return _mm512_loadu_si512((__m512i const *)p); }
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm512_storeu_si512((__m512i *)p, v); }
static vint vloadu_vi_p(int32_t *p) { return _mm256_loadu_si256((__m256i const *)p); }
static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm256_storeu_si256((__m256i *)p, v); }
|
||||
|
||||
// Logical operations.

// Raw 512-bit bit masks; vandnot computes (~x) & y.
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return _mm512_and_si512(x, y); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return _mm512_andnot_si512(x, y); }
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return _mm512_or_si512(x, y); }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return _mm512_xor_si512(x, y); }

// k-register (vopmask) logic; kandn computes (~x) & y.
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kand(x, y); }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kandn(x, y); }
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kor(x, y); }
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kxor(x, y); }

// Masked combines over 64-bit lanes, expressed with masked and/or:
//   vand    -> o ? m  : 0
//   vandnot -> o ? 0  : m
//   vor     -> o ? ~0 : m
static INLINE vmask vand_vm_vo64_vm(vopmask o, vmask m) { return _mm512_mask_and_epi64(_mm512_set1_epi32(0), o, m, m); }
static INLINE vmask vandnot_vm_vo64_vm(vopmask o, vmask m) { return _mm512_mask_and_epi64(m, o, _mm512_set1_epi32(0), _mm512_set1_epi32(0)); }
static INLINE vmask vor_vm_vo64_vm(vopmask o, vmask m) { return _mm512_mask_or_epi64(m, o, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1)); }

// Same combines over 32-bit lanes.
static INLINE vmask vand_vm_vo32_vm(vopmask o, vmask m) { return _mm512_mask_and_epi32(_mm512_set1_epi32(0), o, m, m); }
static INLINE vmask vandnot_vm_vo32_vm(vopmask o, vmask m) { return _mm512_mask_and_epi32(m, o, _mm512_set1_epi32(0), _mm512_set1_epi32(0)); }
static INLINE vmask vor_vm_vo32_vm(vopmask o, vmask m) { return _mm512_mask_or_epi32(m, o, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1)); }

// k-masks carry one bit per lane regardless of lane width, so these casts
// are identity functions.
static INLINE vopmask vcast_vo32_vo64(vopmask o) { return o; }
static INLINE vopmask vcast_vo64_vo32(vopmask o) { return o; }

// All-ones mask when i is nonzero, all-zeros otherwise.
static INLINE vopmask vcast_vo_i(int i) { return i ? -1 : 0; }
|
||||
|
||||
// Conversions and rounding.

// Round/truncate 8 doubles to 8 x int32 with explicit rounding control.
static INLINE vint vrint_vi_vd(vdouble vd) {
  return _mm512_cvt_roundpd_epi32(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC);
}

static INLINE vint vtruncate_vi_vd(vdouble vd) {
  return _mm512_cvt_roundpd_epi32(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC);
}

static INLINE vdouble vcast_vd_vi(vint vi) { return _mm512_cvtepi32_pd(vi); }
static INLINE vint vcast_vi_i(int i) { return _mm256_set1_epi32(i); }

// Rounding without leaving the FP domain (roundscale).
static INLINE vdouble vtruncate_vd_vd(vdouble vd) {
  return _mm512_roundscale_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC);
}

static INLINE vdouble vrint_vd_vd(vdouble vd) {
  return _mm512_roundscale_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC);
}

// Spread the 8 int32 values into the upper dwords of the 64-bit lanes
// (mask 0xaaaa keeps only odd dword positions; lower dwords are zeroed).
static INLINE vmask vcastu_vm_vi(vint vi) {
  return _mm512_maskz_permutexvar_epi32(0xaaaa, _mm512_set_epi32(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), _mm512_castsi256_si512(vi));
}

// Inverse: gather the upper dwords of the 64-bit lanes into a 256-bit vint.
static INLINE vint vcastu_vi_vm(vmask vi) {
  return _mm512_castsi512_si256(_mm512_maskz_permutexvar_epi32(0x00ff, _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 7, 5, 3, 1), vi));
}

// Fill every 64-bit lane with the pair (i0:i1), i0 in the high dword.
static INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm512_set_epi32(i0, i1, i0, i1, i0, i1, i0, i1, i0, i1, i0, i1, i0, i1, i0, i1); }

static INLINE vmask vcast_vm_i64(int64_t i) { return _mm512_set1_epi64(i); }
static INLINE vmask vcast_vm_u64(uint64_t i) { return _mm512_set1_epi64((uint64_t)i); }

static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_EQ); }
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm512_add_epi64(x, y); }
|
||||
|
||||
// Double-precision arithmetic, FMA, and comparisons.

static INLINE vdouble vcast_vd_d(double d) { return _mm512_set1_pd(d); }
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm512_castpd_si512(vd); }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm512_castsi512_pd(vm); }

static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm512_add_pd(x, y); }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm512_sub_pd(x, y); }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm512_mul_pd(x, y); }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm512_div_pd(x, y); }
static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm512_div_pd(_mm512_set1_pd(1), x); } // exact reciprocal via divide
static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm512_sqrt_pd(x); }
// Clear / flip the sign bit via masking with -0.0.
static INLINE vdouble vabs_vd_vd(vdouble d) { return vreinterpret_vd_vm(_mm512_andnot_si512(vreinterpret_vm_vd(_mm512_set1_pd(-0.0)), vreinterpret_vm_vd(d))); }
static INLINE vdouble vneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(_mm512_xor_si512(vreinterpret_vm_vd(_mm512_set1_pd(-0.0)), vreinterpret_vm_vd(d))); }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm512_max_pd(x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm512_min_pd(x, y); }

// mla family: fused when CONFIG == 1, separate mul/add otherwise.
// NOTE(review): the non-FMA branch defines no vmlanp_vd_vd_vd_vd here --
// presumably provided elsewhere for CONFIG != 1; confirm against upstream.
#if CONFIG == 1
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmadd_pd(x, y, z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmsub_pd(x, y, z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fnmadd_pd(x, y, z); }
#else
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
#endif

// Explicit FMA family: pp = x*y+z, pn = x*y-z, np = -(x*y)+z, nn = -(x*y)-z.
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmadd_pd(x, y, z); }
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmadd_pd(x, y, z); }
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmsub_pd(x, y, z); }
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fnmadd_pd(x, y, z); }
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fnmsub_pd(x, y, z); }

// Comparisons produce k-masks; ordered-quiet except NEQ (true for NaN).
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_EQ_OQ); }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_NEQ_UQ); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_LT_OQ); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_LE_OQ); }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_GT_OQ); }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_GE_OQ); }
|
||||
|
||||
// 32-bit integer (vint, 256-bit) operations.

static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm256_add_epi32(x, y); }
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm256_sub_epi32(x, y); }
static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }

static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm256_and_si256(x, y); }
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm256_andnot_si256(x, y); } // (~x) & y

// k-mask / vint combines: the 256-bit vint is widened to 512 bits so the
// masked AVX-512 intrinsics can be used, then narrowed back.
//   vandnot -> o ? 0 : y,  vand -> o ? y : 0
static INLINE vint vandnot_vi_vo_vi(vopmask o, vint y) {
  return _mm512_castsi512_si256(_mm512_mask_and_epi32(_mm512_castsi256_si512(y), o, _mm512_set1_epi32(0), _mm512_set1_epi32(0)));
}
static INLINE vint vand_vi_vo_vi(vopmask o, vint y) {
  return _mm512_castsi512_si256(_mm512_mask_and_epi32(_mm512_set1_epi32(0), o, _mm512_castsi256_si512(y), _mm512_castsi256_si512(y)));
}

static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm256_or_si256(x, y); }
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm256_xor_si256(x, y); }
// Shifts as macros because the count must be an immediate.
// (//@ lines appear to be markers for the SLEEF header generator.)
#define vsll_vi_vi_i(x, c) _mm256_slli_epi32(x, c)
#define vsrl_vi_vi_i(x, c) _mm256_srli_epi32(x, c)
#define vsra_vi_vi_i(x, c) _mm256_srai_epi32(x, c)
//@#define vsll_vi_vi_i(x, c) _mm256_slli_epi32(x, c)
//@#define vsrl_vi_vi_i(x, c) _mm256_srli_epi32(x, c)
//@#define vsra_vi_vi_i(x, c) _mm256_srai_epi32(x, c)

// Compare results as vint lane masks.
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm256_cmpeq_epi32(x, y); }
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm256_cmpgt_epi32(x, y); }

// Compare results as k-masks; x > y is expressed as y < x.
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) {
  return _mm512_cmp_epi32_mask(_mm512_castsi256_si512(x), _mm512_castsi256_si512(y), _MM_CMPINT_EQ);
}
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) {
  return _mm512_cmp_epi32_mask(_mm512_castsi256_si512(y), _mm512_castsi256_si512(x), _MM_CMPINT_LT);
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask mask, vdouble x, vdouble y) {
|
||||
return _mm512_mask_blend_pd(mask, y, x);
|
||||
}
|
||||
|
||||
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
|
||||
return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
|
||||
}
|
||||
|
||||
#if 1
|
||||
// Probably this is faster
|
||||
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
|
||||
__m512i v = _mm512_castpd_si512(vsel_vd_vo_vd_vd(o0, _mm512_castsi512_pd(_mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, 0)),
|
||||
vsel_vd_vo_vd_vd(o1, _mm512_castsi512_pd(_mm512_set_epi64(1, 1, 1, 1, 1, 1, 1, 1)),
|
||||
vsel_vd_vo_vd_vd(o2, _mm512_castsi512_pd(_mm512_set_epi64(2, 2, 2, 2, 2, 2, 2, 2)),
|
||||
_mm512_castsi512_pd(_mm512_set_epi64(3, 3, 3, 3, 3, 3, 3, 3))))));
|
||||
return _mm512_permutexvar_pd(v, _mm512_castpd256_pd512(_mm256_set_pd(d3, d2, d1, d0)));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
|
||||
return vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o1, d0, d1, d2, d2);
|
||||
}
|
||||
#else
|
||||
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
|
||||
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
|
||||
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
|
||||
}
|
||||
#endif
|
||||
|
||||
static INLINE vopmask visinf_vo_vd(vdouble d) {
|
||||
return _mm512_cmp_pd_mask(vabs_vd_vd(d), _mm512_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ);
|
||||
}
|
||||
|
||||
static INLINE vopmask vispinf_vo_vd(vdouble d) {
|
||||
return _mm512_cmp_pd_mask(d, _mm512_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ);
|
||||
}
|
||||
|
||||
static INLINE vopmask visminf_vo_vd(vdouble d) {
|
||||
return _mm512_cmp_pd_mask(d, _mm512_set1_pd(-SLEEF_INFINITY), _CMP_EQ_OQ);
|
||||
}
|
||||
|
||||
static INLINE vopmask visnan_vo_vd(vdouble d) {
|
||||
return _mm512_cmp_pd_mask(d, d, _CMP_NEQ_UQ);
|
||||
}
|
||||
|
||||
static INLINE vint vilogbk_vi_vd(vdouble d) { return vrint_vi_vd(_mm512_getexp_pd(d)); }
|
||||
|
||||
// vilogb2k_vi_vd is similar to vilogbk_vi_vd, but the argument has to
|
||||
// be a normalized FP value.
|
||||
static INLINE vint vilogb2k_vi_vd(vdouble d) { return vrint_vi_vd(_mm512_getexp_pd(d)); }
|
||||
|
||||
static INLINE vdouble vgetexp_vd_vd(vdouble d) { return _mm512_getexp_pd(d); }
|
||||
static INLINE vfloat vgetexp_vf_vf(vfloat d) { return _mm512_getexp_ps(d); }
|
||||
|
||||
static INLINE vdouble vgetmant_vd_vd(vdouble d) { return _mm512_getmant_pd(d, _MM_MANT_NORM_p75_1p5, _MM_MANT_SIGN_nan); }
|
||||
static INLINE vfloat vgetmant_vf_vf(vfloat d) { return _mm512_getmant_ps(d, _MM_MANT_NORM_p75_1p5, _MM_MANT_SIGN_nan); }
|
||||
|
||||
#define vfixup_vd_vd_vd_vi2_i(a, b, c, imm) _mm512_fixupimm_pd((a), (b), (c), (imm))
|
||||
#define vfixup_vf_vf_vf_vi2_i(a, b, c, imm) _mm512_fixupimm_ps((a), (b), (c), (imm))
|
||||
//@#define vfixup_vd_vd_vd_vi2_i(a, b, c, imm) _mm512_fixupimm_pd((a), (b), (c), (imm))
|
||||
//@#define vfixup_vf_vf_vf_vi2_i(a, b, c, imm) _mm512_fixupimm_ps((a), (b), (c), (imm))
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// This function is needed when debugging on MSVC.
|
||||
static INLINE double vcast_d_vd(vdouble v) {
|
||||
double s[VECTLENDP];
|
||||
_mm512_storeu_pd(s, v);
|
||||
return s[0];
|
||||
}
|
||||
#endif
|
||||
|
||||
static INLINE vdouble vload_vd_p(const double *ptr) { return _mm512_load_pd(ptr); }
|
||||
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm512_loadu_pd(ptr); }
|
||||
|
||||
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm512_store_pd(ptr, v); }
|
||||
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm512_storeu_pd(ptr, v); }
|
||||
|
||||
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm512_i32gather_pd(vi, ptr, 8); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
|
||||
return _mm512_castsi512_si256(_mm512_mask_blend_epi32(m, _mm512_castsi256_si512(y), _mm512_castsi256_si512(x)));
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm512_castps_si512(vf); }
|
||||
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm512_castsi512_ps(vm); }
|
||||
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return _mm512_castsi512_ps(vi); }
|
||||
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return _mm512_castps_si512(vf); }
|
||||
|
||||
static INLINE vdouble vreinterpret_vd_vf(vfloat vf) { return _mm512_castps_pd(vf); }
|
||||
static INLINE vfloat vreinterpret_vf_vd(vdouble vd) { return _mm512_castpd_ps(vd); }
|
||||
|
||||
static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
|
||||
static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }
|
||||
|
||||
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm512_cvtepi32_ps(vcast_vm_vi2(vi)); }
|
||||
static INLINE vfloat vcast_vf_f(float f) { return _mm512_set1_ps(f); }
|
||||
static INLINE vint2 vcast_vi2_i(int i) { return _mm512_set1_epi32(i); }
|
||||
static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm512_cvtps_epi32(vf)); }
|
||||
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm512_cvttps_epi32(vf)); }
|
||||
|
||||
static INLINE vfloat vtruncate_vf_vf(vfloat vd) {
|
||||
return _mm512_roundscale_ps(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC);
|
||||
}
|
||||
|
||||
static INLINE vfloat vrint_vf_vf(vfloat vd) {
|
||||
return _mm512_roundscale_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC);
|
||||
}
|
||||
|
||||
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm512_add_ps(x, y); }
|
||||
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm512_sub_ps(x, y); }
|
||||
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm512_mul_ps(x, y); }
|
||||
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm512_div_ps(x, y); }
|
||||
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }
|
||||
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm512_sqrt_ps(x); }
|
||||
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
|
||||
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
|
||||
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm512_max_ps(x, y); }
|
||||
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm512_min_ps(x, y); }
|
||||
|
||||
#if CONFIG == 1
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmadd_ps(x, y, z); }
|
||||
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmsub_ps(x, y, z); }
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fnmadd_ps(x, y, z); }
|
||||
#else
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }
|
||||
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
|
||||
#endif
|
||||
|
||||
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmadd_ps(x, y, z); }
|
||||
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmadd_ps(x, y, z); }
|
||||
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmsub_ps(x, y, z); }
|
||||
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fnmadd_ps(x, y, z); }
|
||||
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fnmsub_ps(x, y, z); }
|
||||
|
||||
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_EQ_OQ); }
|
||||
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_NEQ_UQ); }
|
||||
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_LT_OQ); }
|
||||
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_LE_OQ); }
|
||||
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_GT_OQ); }
|
||||
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ); }
|
||||
|
||||
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_add_epi32(x, y); }
|
||||
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_sub_epi32(x, y); }
|
||||
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); }
|
||||
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_and_si512(x, y); }
|
||||
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_andnot_si512(x, y); }
|
||||
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_or_si512(x, y); }
|
||||
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_xor_si512(x, y); }
|
||||
|
||||
static INLINE vint2 vand_vi2_vo_vi2(vopmask o, vint2 m) {
|
||||
return _mm512_mask_and_epi32(_mm512_set1_epi32(0), o, m, m);
|
||||
}
|
||||
|
||||
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask o, vint2 m) {
|
||||
return _mm512_mask_and_epi32(m, o, _mm512_set1_epi32(0), _mm512_set1_epi32(0));
|
||||
}
|
||||
|
||||
#define vsll_vi2_vi2_i(x, c) _mm512_slli_epi32(x, c)
|
||||
#define vsrl_vi2_vi2_i(x, c) _mm512_srli_epi32(x, c)
|
||||
#define vsra_vi2_vi2_i(x, c) _mm512_srai_epi32(x, c)
|
||||
//@#define vsll_vi2_vi2_i(x, c) _mm512_slli_epi32(x, c)
|
||||
//@#define vsrl_vi2_vi2_i(x, c) _mm512_srli_epi32(x, c)
|
||||
//@#define vsra_vi2_vi2_i(x, c) _mm512_srai_epi32(x, c)
|
||||
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm512_cmpeq_epi32_mask(x, y); }
|
||||
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm512_cmpgt_epi32_mask(x, y); }
|
||||
|
||||
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
|
||||
__mmask16 m = _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_EQ);
|
||||
return _mm512_mask_and_epi32(_mm512_set1_epi32(0), m, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1));
|
||||
}
|
||||
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) {
|
||||
__mmask16 m = _mm512_cmp_epi32_mask(y, x, _MM_CMPINT_LT);
|
||||
return _mm512_mask_and_epi32(_mm512_set1_epi32(0), m, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1));
|
||||
}
|
||||
|
||||
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
|
||||
return _mm512_mask_blend_epi32(m, y, x);
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask m, vfloat x, vfloat y) {
|
||||
return _mm512_mask_blend_ps(m, y, x);
|
||||
}
|
||||
|
||||
// At this point, the following three functions are implemented in a generic way,
|
||||
// but I will try target-specific optimization later on.
|
||||
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
|
||||
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
|
||||
|
||||
static INLINE vint2 vilogbk_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm512_getexp_ps(d)); }
|
||||
static INLINE vint2 vilogb2k_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm512_getexp_ps(d)); }
|
||||
|
||||
#ifdef _MSC_VER
|
||||
// This function is needed when debugging on MSVC.
|
||||
static INLINE float vcast_f_vf(vfloat v) {
|
||||
float s[VECTLENSP];
|
||||
_mm512_storeu_ps(s, v);
|
||||
return s[0];
|
||||
}
|
||||
#endif
|
||||
|
||||
static INLINE vfloat vload_vf_p(const float *ptr) { return _mm512_load_ps(ptr); }
|
||||
static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm512_loadu_ps(ptr); }
|
||||
|
||||
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm512_store_ps(ptr, v); }
|
||||
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm512_storeu_ps(ptr, v); }
|
||||
|
||||
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm512_i32gather_ps(vi2, ptr, 4); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vdouble vposneg_vd_vd(vdouble d) {
|
||||
return vreinterpret_vd_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vd(d), 0xcccc, vreinterpret_vm_vd(d), vreinterpret_vm_vd(_mm512_set1_pd(-0.0))));
|
||||
}
|
||||
static INLINE vdouble vnegpos_vd_vd(vdouble d) {
|
||||
return vreinterpret_vd_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vd(d), 0x3333, vreinterpret_vm_vd(d), vreinterpret_vm_vd(_mm512_set1_pd(-0.0))));
|
||||
}
|
||||
static INLINE vfloat vposneg_vf_vf(vfloat d) {
|
||||
return vreinterpret_vf_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vf(d), 0xaaaa, vreinterpret_vm_vf(d), vreinterpret_vm_vf(_mm512_set1_ps(-0.0f))));
|
||||
}
|
||||
static INLINE vfloat vnegpos_vf_vf(vfloat d) {
|
||||
return vreinterpret_vf_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vf(d), 0x5555, vreinterpret_vm_vf(d), vreinterpret_vm_vf(_mm512_set1_ps(-0.0f))));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
|
||||
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vadd_vf_vf_vf(x, vnegpos_vf_vf(y)); }
|
||||
|
||||
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmaddsub_pd(x, y, z); }
|
||||
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmaddsub_ps(x, y, z); }
|
||||
|
||||
static INLINE vdouble vrev21_vd_vd(vdouble vd) { return _mm512_permute_pd(vd, 0x55); }
|
||||
|
||||
static INLINE vdouble vreva2_vd_vd(vdouble vd) {
|
||||
return vreinterpret_vd_vm(_mm512_permutexvar_epi32(_mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12), vreinterpret_vm_vd(vd)));
|
||||
}
|
||||
|
||||
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm512_stream_pd(ptr, v); }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
|
||||
_mm_store_pd(&ptr[(offset + step * 0)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 0)));
|
||||
_mm_store_pd(&ptr[(offset + step * 1)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 1)));
|
||||
_mm_store_pd(&ptr[(offset + step * 2)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 2)));
|
||||
_mm_store_pd(&ptr[(offset + step * 3)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 3)));
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
|
||||
_mm_stream_pd(&ptr[(offset + step * 0)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 0)));
|
||||
_mm_stream_pd(&ptr[(offset + step * 1)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 1)));
|
||||
_mm_stream_pd(&ptr[(offset + step * 2)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 2)));
|
||||
_mm_stream_pd(&ptr[(offset + step * 3)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 3)));
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static INLINE vfloat vrev21_vf_vf(vfloat vf) { return _mm512_permute_ps(vf, 0xb1); }
|
||||
|
||||
static INLINE vfloat vreva2_vf_vf(vfloat vf) {
|
||||
return vreinterpret_vf_vm(_mm512_permutexvar_epi32(_mm512_set_epi32(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14), vreinterpret_vm_vf(vf)));
|
||||
}
|
||||
|
||||
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm512_stream_ps(ptr, v); }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
|
||||
_mm_storel_pd((double *)(ptr+(offset + step * 0)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 0)));
|
||||
_mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 0)));
|
||||
_mm_storel_pd((double *)(ptr+(offset + step * 2)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 1)));
|
||||
_mm_storeh_pd((double *)(ptr+(offset + step * 3)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 1)));
|
||||
_mm_storel_pd((double *)(ptr+(offset + step * 4)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 2)));
|
||||
_mm_storeh_pd((double *)(ptr+(offset + step * 5)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 2)));
|
||||
_mm_storel_pd((double *)(ptr+(offset + step * 6)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 3)));
|
||||
_mm_storeh_pd((double *)(ptr+(offset + step * 7)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 3)));
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
|
||||
|
||||
//
|
||||
|
||||
static vquad loadu_vq_p(void *p) {
|
||||
vquad vq;
|
||||
memcpy(&vq, p, VECTLENDP * 16);
|
||||
return vq;
|
||||
}
|
||||
|
||||
static INLINE vquad cast_vq_aq(vargquad aq) {
|
||||
vquad vq;
|
||||
memcpy(&vq, &aq, VECTLENDP * 16);
|
||||
return vq;
|
||||
}
|
||||
|
||||
static INLINE vargquad cast_aq_vq(vquad vq) {
|
||||
vargquad aq;
|
||||
memcpy(&aq, &vq, VECTLENDP * 16);
|
||||
return aq;
|
||||
}
|
||||
|
||||
#ifdef __INTEL_COMPILER
|
||||
static INLINE int vtestallzeros_i_vo64(vopmask g) { return _mm512_mask2int(g) == 0; }
|
||||
#else
|
||||
static INLINE int vtestallzeros_i_vo64(vopmask g) { return g == 0; }
|
||||
#endif
|
||||
|
||||
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask m, vmask x, vmask y) { return _mm512_mask_blend_epi64(m, y, x); }
|
||||
|
||||
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return _mm512_sub_epi64(x, y); }
|
||||
static INLINE vmask vneg64_vm_vm(vmask x) { return _mm512_sub_epi64(vcast_vm_i_i(0, 0), x); }
|
||||
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return _mm512_cmp_epi64_mask(y, x, _MM_CMPINT_LT); } // signed compare
|
||||
|
||||
#define vsll64_vm_vm_i(x, c) _mm512_slli_epi64(x, c)
|
||||
#define vsrl64_vm_vm_i(x, c) _mm512_srli_epi64(x, c)
|
||||
//@#define vsll64_vm_vm_i(x, c) _mm512_slli_epi64(x, c)
|
||||
//@#define vsrl64_vm_vm_i(x, c) _mm512_srli_epi64(x, c)
|
||||
|
||||
static INLINE vmask vcast_vm_vi(vint vi) {
|
||||
return _mm512_cvtepi32_epi64(vi);
|
||||
}
|
||||
static INLINE vint vcast_vi_vm(vmask vm) {
|
||||
return _mm512_cvtepi64_epi32(vm);
|
||||
}
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; }
|
||||
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return m; }
|
||||
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return v; }
|
||||
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return m; }
|
||||
@@ -0,0 +1,297 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#if !defined(__ARM_NEON) && !defined(SLEEF_GENHEADER)
|
||||
#error Please specify -mfpu=neon.
|
||||
#endif
|
||||
|
||||
#ifdef __aarch64__
|
||||
#warning This implementation is for AARCH32.
|
||||
#endif
|
||||
|
||||
#define ENABLE_SP
|
||||
//@#define ENABLE_SP
|
||||
#define LOG2VECTLENSP 2
|
||||
//@#define LOG2VECTLENSP 2
|
||||
#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
//@#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
|
||||
#if CONFIG == 4
|
||||
#define ISANAME "AARCH32 NEON-VFPV4"
|
||||
#define ENABLE_FMA_SP
|
||||
//@#define ENABLE_FMA_SP
|
||||
#else
|
||||
#define ISANAME "AARCH32 NEON"
|
||||
#endif
|
||||
#define DFTPRIORITY 10
|
||||
|
||||
#define ENABLE_RECSQRT_SP
|
||||
//@#define ENABLE_RECSQRT_SP
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "misc.h"
|
||||
|
||||
typedef uint32x4_t vmask;
|
||||
typedef uint32x4_t vopmask;
|
||||
|
||||
//typedef int32x4_t vint;
|
||||
|
||||
typedef float32x4_t vfloat;
|
||||
typedef int32x4_t vint2;
|
||||
|
||||
//
|
||||
|
||||
static INLINE void vprefetch_v_p(const void *ptr) { }
|
||||
|
||||
static INLINE int vtestallones_i_vo32(vopmask g) {
|
||||
uint32x2_t x0 = vand_u32(vget_low_u32(g), vget_high_u32(g));
|
||||
uint32x2_t x1 = vpmin_u32(x0, x0);
|
||||
return vget_lane_u32(x1, 0);
|
||||
}
|
||||
|
||||
static vfloat vloaduf(float *p) { return vld1q_f32(p); }
|
||||
static void vstoreuf(float *p, vfloat v) { vst1q_f32(p, v); }
|
||||
|
||||
static vint2 vloadu_vi2_p(int32_t *p) { return vld1q_s32(p); }
|
||||
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vandq_u32(x, y); }
|
||||
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vbicq_u32(y, x); }
|
||||
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vorrq_u32(x, y); }
|
||||
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return veorq_u32(x, y); }
|
||||
|
||||
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vandq_u32(x, y); }
|
||||
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vbicq_u32(y, x); }
|
||||
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vorrq_u32(x, y); }
|
||||
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return veorq_u32(x, y); }
|
||||
|
||||
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vandq_u32(x, y); }
|
||||
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vbicq_u32(y, x); }
|
||||
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vorrq_u32(x, y); }
|
||||
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return veorq_u32(x, y); }
|
||||
|
||||
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vandq_u32(x, y); }
|
||||
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vbicq_u32(y, x); }
|
||||
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vorrq_u32(x, y); }
|
||||
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return veorq_u32(x, y); }
|
||||
|
||||
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return vuzpq_u32(m, m).val[0]; }
|
||||
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return vzipq_u32(m, m).val[0]; }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vmask vcast_vm_i_i(int i0, int i1) { return (vmask)vdupq_n_u64((uint64_t)i0 | (((uint64_t)i1) << 32)); }
|
||||
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
|
||||
uint32x4_t t = vceqq_u32(x, y);
|
||||
return vandq_u32(t, vrev64q_u32(t));
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static INLINE vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; }
|
||||
static INLINE vmask vcast_vm_vi2(vint2 vi) { return (vmask)vi; }
|
||||
static INLINE vint2 vrint_vi2_vf(vfloat d) {
|
||||
return vcvtq_s32_f32(vaddq_f32(d, (float32x4_t)vorrq_u32(vandq_u32((uint32x4_t)d, (uint32x4_t)vdupq_n_f32(-0.0f)), (uint32x4_t)vdupq_n_f32(0.5f))));
|
||||
}
|
||||
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcvtq_s32_f32(vf); }
|
||||
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return vcvtq_f32_s32(vi); }
|
||||
|
||||
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
|
||||
static INLINE vfloat vrint_vf_vf(vfloat vd) { return vcast_vf_vi2(vrint_vi2_vf(vd)); }
|
||||
|
||||
static INLINE vfloat vcast_vf_f(float f) { return vdupq_n_f32(f); }
|
||||
static INLINE vint2 vcast_vi2_i(int i) { return vdupq_n_s32(i); }
|
||||
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; }
|
||||
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (vfloat)vm; }
|
||||
static INLINE vfloat vreinterpret_vf_vi2(vint2 vm) { return (vfloat)vm; }
|
||||
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return (vint2)vf; }
|
||||
|
||||
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return vaddq_f32(x, y); }
|
||||
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return vsubq_f32(x, y); }
|
||||
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return vmulq_f32(x, y); }
|
||||
|
||||
static INLINE vfloat vabs_vf_vf(vfloat f) { return vabsq_f32(f); }
|
||||
static INLINE vfloat vneg_vf_vf(vfloat f) { return vnegq_f32(f); }
|
||||
#if CONFIG == 4
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { return vfmaq_f32(z, x, y); }
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vfmsq_f32(z, x, y); }
|
||||
static INLINE vfloat vfma_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { return vfmaq_f32(z, x, y); }
|
||||
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vfmsq_f32(z, x, y); }
|
||||
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vneg_vf_vf(vfmanp_vf_vf_vf_vf(x, y, z)); }
|
||||
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vneg_vf_vf(vfmanp_vf_vf_vf_vf(x, y, z)); }
|
||||
|
||||
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) {
|
||||
float32x4_t t = vrecpeq_f32(y), u;
|
||||
t = vmulq_f32(t, vrecpsq_f32(y, t));
|
||||
t = vfmaq_f32(t, vfmsq_f32(vdupq_n_f32(1.0f), y, t), t);
|
||||
u = vmulq_f32(x, t);
|
||||
return vfmaq_f32(u, vfmsq_f32(x, y, u), t);
|
||||
}
|
||||
|
||||
static INLINE vfloat vsqrt_vf_vf(vfloat d) {
|
||||
float32x4_t x = vrsqrteq_f32(d);
|
||||
x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));
|
||||
x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));
|
||||
float32x4_t u = vmulq_f32(x, d);
|
||||
u = vfmaq_f32(u, vfmsq_f32(d, u, u), vmulq_f32(x, vdupq_n_f32(0.5)));
|
||||
return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(u), vceqq_f32(d, vdupq_n_f32(0.0f))));
|
||||
}
|
||||
|
||||
static INLINE vfloat vrec_vf_vf(vfloat y) {
|
||||
float32x4_t t = vrecpeq_f32(y);
|
||||
t = vmulq_f32(t, vrecpsq_f32(y, t));
|
||||
t = vfmaq_f32(t, vfmsq_f32(vdupq_n_f32(1.0f), y, t), t);
|
||||
return vfmaq_f32(t, vfmsq_f32(vdupq_n_f32(1.0f), y, t), t);
|
||||
}
|
||||
|
||||
static INLINE vfloat vrecsqrt_vf_vf(vfloat d) {
|
||||
float32x4_t x = vrsqrteq_f32(d);
|
||||
x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));
|
||||
return vfmaq_f32(x, vfmsq_f32(vdupq_n_f32(1), x, vmulq_f32(x, d)), vmulq_f32(x, vdupq_n_f32(0.5)));
|
||||
}
|
||||
#else // #if CONFIG == 4
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmlaq_f32(z, x, y); }
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmlsq_f32(z, x, y); }
|
||||
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vneg_vf_vf(vmlsq_f32(z, x, y)); }
|
||||
|
||||
static INLINE vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) {
|
||||
float32x4_t x = vrecpeq_f32(d);
|
||||
x = vmulq_f32(x, vrecpsq_f32(d, x));
|
||||
float32x4_t t = vmulq_f32(n, x);
|
||||
return vmlsq_f32(vaddq_f32(t, t), vmulq_f32(t, x), d);
|
||||
}
|
||||
|
||||
static INLINE vfloat vsqrt_vf_vf(vfloat d) {
|
||||
float32x4_t x = vrsqrteq_f32(d);
|
||||
x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));
|
||||
float32x4_t u = vmulq_f32(x, d);
|
||||
u = vmlaq_f32(u, vmlsq_f32(d, u, u), vmulq_f32(x, vdupq_n_f32(0.5)));
|
||||
return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(u), vceqq_f32(d, vdupq_n_f32(0.0f))));
|
||||
}
|
||||
|
||||
static INLINE vfloat vrec_vf_vf(vfloat d) {
|
||||
float32x4_t x = vrecpeq_f32(d);
|
||||
x = vmulq_f32(x, vrecpsq_f32(d, x));
|
||||
return vmlsq_f32(vaddq_f32(x, x), vmulq_f32(x, x), d);
|
||||
}
|
||||
|
||||
static INLINE vfloat vrecsqrt_vf_vf(vfloat d) {
|
||||
float32x4_t x = vrsqrteq_f32(d);
|
||||
x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));
|
||||
return vmlaq_f32(x, vmlsq_f32(vdupq_n_f32(1), x, vmulq_f32(x, d)), vmulq_f32(x, vdupq_n_f32(0.5)));
|
||||
}
|
||||
#endif // #if CONFIG == 4
|
||||
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return vmaxq_f32(x, y); }
|
||||
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return vminq_f32(x, y); }
|
||||
|
||||
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vceqq_f32(x, y); }
|
||||
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vmvnq_u32(vceqq_f32(x, y)); }
|
||||
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vcltq_f32(x, y); }
|
||||
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vcleq_f32(x, y); }
|
||||
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vcgtq_f32(x, y); }
|
||||
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vcgeq_f32(x, y); }
|
||||
|
||||
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return vaddq_s32(x, y); }
|
||||
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return vsubq_s32(x, y); }
|
||||
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vnegq_s32(e); }
|
||||
|
||||
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return vandq_s32(x, y); }
|
||||
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return vbicq_s32(y, x); }
|
||||
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return vorrq_s32(x, y); }
|
||||
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return veorq_s32(x, y); }
|
||||
|
||||
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)vandq_u32(x, (vopmask)y); }
|
||||
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)vbicq_u32((vopmask)y, x); }
|
||||
|
||||
#define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c)
|
||||
#define vsrl_vi2_vi2_i(x, c) vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c))
|
||||
#define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c)
|
||||
//@#define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c)
|
||||
//@#define vsrl_vi2_vi2_i(x, c) vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c))
|
||||
//@#define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c)
|
||||
|
||||
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return vceqq_s32(x, y); }
|
||||
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return vcgtq_s32(x, y); }
|
||||
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return (vint2)vceqq_s32(x, y); }
|
||||
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return (vint2)vcgtq_s32(x, y); }
|
||||
|
||||
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { return (vint2)vbslq_u32(m, (vmask)x, (vmask)y); }
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask mask, vfloat x, vfloat y) {
|
||||
return (vfloat)vbslq_u32(mask, (vmask)x, (vmask)y);
|
||||
}
|
||||
|
||||
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
|
||||
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
|
||||
|
||||
// This function is needed when debugging on MSVC.
|
||||
static INLINE float vcast_f_vf(vfloat v) {
|
||||
float p[4];
|
||||
vst1q_f32 (p, v);
|
||||
return p[0];
|
||||
}
|
||||
|
||||
static INLINE int vavailability_i(int name) {
|
||||
if (name != 2) return 0;
|
||||
return vcast_f_vf(vadd_vf_vf_vf(vcast_vf_f(name), vcast_vf_f(name))) != 0.0;
|
||||
}
|
||||
|
||||
|
||||
static INLINE vfloat vload_vf_p(const float *ptr) { return vld1q_f32(__builtin_assume_aligned(ptr, 16)); }
|
||||
static INLINE vfloat vloadu_vf_p(const float *ptr) { return vld1q_f32(ptr); }
|
||||
|
||||
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32(__builtin_assume_aligned(ptr, 16), v); }
|
||||
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }
|
||||
|
||||
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
|
||||
return ((vfloat) {
|
||||
ptr[vgetq_lane_s32(vi2, 0)],
|
||||
ptr[vgetq_lane_s32(vi2, 1)],
|
||||
ptr[vgetq_lane_s32(vi2, 2)],
|
||||
ptr[vgetq_lane_s32(vi2, 3)]
|
||||
});
|
||||
}
|
||||
|
||||
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
|
||||
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
|
||||
|
||||
static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)PNMASKf); }
|
||||
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)NPMASKf); }
|
||||
|
||||
static INLINE vfloat vsubadd_vf_vf_vf(vfloat d0, vfloat d1) { return vadd_vf_vf_vf(d0, vnegpos_vf_vf(d1)); }
|
||||
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
|
||||
|
||||
static INLINE vfloat vrev21_vf_vf(vfloat d0) { return vrev64q_f32(d0); }
|
||||
static INLINE vfloat vreva2_vf_vf(vfloat d0) { return vcombine_f32(vget_high_f32(d0), vget_low_f32(d0)); }
|
||||
|
||||
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
|
||||
vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));
|
||||
vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
|
||||
vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));
|
||||
vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));
|
||||
}
|
||||
@@ -0,0 +1,873 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#if CONFIG == 1 || CONFIG == 2 || CONFIG == 3 || CONFIG == 4
|
||||
|
||||
#ifndef __VSX__
|
||||
#error Please specify -mcpu=power8 or -mcpu=power9
|
||||
#endif
|
||||
|
||||
#else
|
||||
#error CONFIG macro invalid or not defined
|
||||
#endif
|
||||
|
||||
#define ENABLE_DP
|
||||
//@#define ENABLE_DP
|
||||
#define LOG2VECTLENDP 1
|
||||
//@#define LOG2VECTLENDP 1
|
||||
#define VECTLENDP (1 << LOG2VECTLENDP)
|
||||
//@#define VECTLENDP (1 << LOG2VECTLENDP)
|
||||
|
||||
#define ENABLE_SP
|
||||
//@#define ENABLE_SP
|
||||
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
|
||||
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
|
||||
#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
//@#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
|
||||
#if CONFIG == 1 || CONFIG == 3
|
||||
#define ENABLE_FMA_DP
|
||||
//@#define ENABLE_FMA_DP
|
||||
#define ENABLE_FMA_SP
|
||||
//@#define ENABLE_FMA_SP
|
||||
#endif
|
||||
|
||||
#define ACCURATE_SQRT
|
||||
//@#define ACCURATE_SQRT
|
||||
#define FULL_FP_ROUNDING
|
||||
//@#define FULL_FP_ROUNDING
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
#include <altivec.h>
|
||||
// undef altivec types since CPP and C99 use them as compiler tokens
|
||||
// use __vector and __bool instead
|
||||
#undef vector
|
||||
#undef bool
|
||||
|
||||
#include <stdint.h>
|
||||
#include "misc.h"
|
||||
#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
#if CONFIG == 1 || CONFIG == 2
|
||||
#define ISANAME "VSX"
|
||||
#else
|
||||
#define ISANAME "VSX-3"
|
||||
#endif
|
||||
|
||||
#define DFTPRIORITY 25
|
||||
|
||||
static INLINE int vavailability_i(int name) { return 3; }
|
||||
static INLINE void vprefetch_v_p(const void *ptr) { }
|
||||
|
||||
/**********************************************
|
||||
** Types
|
||||
***********************************************/
|
||||
typedef __vector unsigned int vmask;
|
||||
// using __bool with typedef may cause ambiguous errors
|
||||
#define vopmask __vector __bool int
|
||||
//@#define vopmask __vector __bool int
|
||||
typedef __vector signed int vint;
|
||||
typedef __vector signed int vint2;
|
||||
typedef __vector float vfloat;
|
||||
typedef __vector double vdouble;
|
||||
|
||||
// internal use types
|
||||
typedef __vector unsigned int v__u32;
|
||||
typedef __vector unsigned char v__u8;
|
||||
typedef __vector signed long long v__i64;
|
||||
typedef __vector unsigned long long v__u64;
|
||||
#define v__b64 __vector __bool long long
|
||||
|
||||
typedef __vector long long vint64;
|
||||
typedef __vector unsigned long long vuint64;
|
||||
|
||||
typedef struct {
|
||||
vmask x, y;
|
||||
} vquad;
|
||||
|
||||
typedef vquad vargquad;
|
||||
|
||||
/**********************************************
|
||||
** Utilities
|
||||
***********************************************/
|
||||
#define vset__vi(v0, v1) ((vint) {v0, v1, v0, v1})
|
||||
#define vset__vi2(...) ((vint2) {__VA_ARGS__})
|
||||
#define vset__vm(...) ((vmask) {__VA_ARGS__})
|
||||
#define vset__vo(...) ((vopmask) {__VA_ARGS__})
|
||||
#define vset__vf(...) ((vfloat) {__VA_ARGS__})
|
||||
#define vset__vd(...) ((vdouble) {__VA_ARGS__})
|
||||
#define vset__u8(...) ((v__u8) {__VA_ARGS__})
|
||||
#define vset__u32(...) ((v__u32) {__VA_ARGS__})
|
||||
#define vset__s64(...) ((v__i64) {__VA_ARGS__})
|
||||
#define vset__u64(...) ((v__u64) {__VA_ARGS__})
|
||||
|
||||
#define vsetall__vi(v) vset__vi(v, v)
|
||||
#define vsetall__vi2(v) vset__vi2(v, v, v, v)
|
||||
#define vsetall__vm(v) vset__vm(v, v, v, v)
|
||||
#define vsetall__vo(v) vset__vo(v, v, v, v)
|
||||
#define vsetall__vf(v) vset__vf(v, v, v, v)
|
||||
#define vsetall__vd(v) vset__vd(v, v)
|
||||
#define vsetall__u8(v) vset__u8(v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v)
|
||||
#define vsetall__u32(v) vset__u32(v, v, v, v)
|
||||
#define vsetall__s64(v) vset__s64(v, v)
|
||||
#define vsetall__u64(v) vset__u64(v, v)
|
||||
|
||||
#define vzero__vi() vsetall__vi(0)
|
||||
#define vzero__vi2() vsetall__vi2(0)
|
||||
#define vzero__vm() vsetall__vm(0)
|
||||
#define vzero__vo() vsetall__vo(0)
|
||||
#define vzero__vf() vsetall__vf(0)
|
||||
#define vzero__vd() vsetall__vd(0)
|
||||
#define vzero__u8() vsetall__u8(0)
|
||||
#define vzero__u32() vsetall__u32(0)
|
||||
#define vzero__s64() vsetall__s64(0)
|
||||
#define vzero__u64() vsetall__u64(0)
|
||||
|
||||
//// Swap doubleword elements
|
||||
#if defined(__clang__) || __GNUC__ >= 7
|
||||
static INLINE v__u64 v__swapd_u64(v__u64 v)
|
||||
{ return vec_xxpermdi(v, v, 2); }
|
||||
#else
|
||||
static INLINE v__u64 v__swapd_u64(v__u64 v)
|
||||
{
|
||||
__asm__ __volatile__("xxswapd %x0,%x1" : "=wa" (v) : "wa" (v));
|
||||
return v;
|
||||
}
|
||||
#endif
|
||||
|
||||
/**********************************************
|
||||
** Memory
|
||||
***********************************************/
|
||||
|
||||
////////////// Unaligned memory access //////////////
|
||||
/**
|
||||
* It's not safe to use vector assignment via (cast & dereference) for unaligned memory access
|
||||
* with almost all clang versions and gcc8 when VSX3 isn't enabled,
|
||||
* these compilers tends to generate instructions 'lvx/stvx' instead of 'lxvd2x/lxvw4x/stxvd2x/stxvw4x'
|
||||
* for more information check https://github.com/seiko2plus/vsx_mem_test
|
||||
*
|
||||
* TODO: check GCC(9, 10)
|
||||
*/
|
||||
//// load
|
||||
#if defined(__POWER9_VECTOR__) || (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8)
|
||||
static vint vloadu_vi_p(const int32_t *ptr)
|
||||
{ return *((vint*)ptr); }
|
||||
static INLINE vint2 vloadu_vi2_p(const int32_t *ptr)
|
||||
{ return *((vint2*)ptr); }
|
||||
static INLINE vfloat vloadu_vf_p(const float *ptr)
|
||||
{ return *((vfloat*)ptr); }
|
||||
static INLINE vdouble vloadu_vd_p(const double *ptr)
|
||||
{ return *((vdouble*)ptr); }
|
||||
#else
|
||||
static vint vloadu_vi_p(const int32_t *ptr)
|
||||
{ return vec_vsx_ld(0, ptr); }
|
||||
static INLINE vint2 vloadu_vi2_p(const int32_t *ptr)
|
||||
{ return vec_vsx_ld(0, ptr); }
|
||||
static INLINE vfloat vloadu_vf_p(const float *ptr)
|
||||
{ return vec_vsx_ld(0, ptr); }
|
||||
static INLINE vdouble vloadu_vd_p(const double *ptr)
|
||||
{ return vec_vsx_ld(0, ptr); }
|
||||
#endif
|
||||
|
||||
//// store
|
||||
#if defined(__POWER9_VECTOR__) || (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8)
|
||||
static void vstoreu_v_p_vi(int32_t *ptr, vint v)
|
||||
{ *((vint*)ptr) = v; }
|
||||
static void vstoreu_v_p_vi2(int32_t *ptr, vint2 v)
|
||||
{ *((vint2*)ptr) = v; }
|
||||
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v)
|
||||
{ *((vfloat*)ptr) = v; }
|
||||
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v)
|
||||
{ *((vdouble*)ptr) = v; }
|
||||
#else
|
||||
static void vstoreu_v_p_vi(int32_t *ptr, vint v)
|
||||
{ vec_vsx_st(v, 0, ptr); }
|
||||
static void vstoreu_v_p_vi2(int32_t *ptr, vint2 v)
|
||||
{ vec_vsx_st(v, 0, ptr); }
|
||||
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v)
|
||||
{ vec_vsx_st(v, 0, ptr); }
|
||||
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v)
|
||||
{ vec_vsx_st(v, 0, ptr); }
|
||||
#endif
|
||||
|
||||
////////////// aligned memory access //////////////
|
||||
//// load
|
||||
static INLINE vfloat vload_vf_p(const float *ptr)
|
||||
{ return vec_ld(0, ptr); }
|
||||
static INLINE vdouble vload_vd_p(const double *ptr)
|
||||
{ return *((vdouble*)ptr); }
|
||||
|
||||
//// store
|
||||
static INLINE void vstore_v_p_vf(float *ptr, vfloat v)
|
||||
{ vec_st(v, 0, ptr); }
|
||||
static INLINE void vstore_v_p_vd(double *ptr, vdouble v)
|
||||
{ *((vdouble*)ptr) = v; }
|
||||
|
||||
////////////// non-temporal memory access //////////////
|
||||
//// store
|
||||
static INLINE void vstream_v_p_vf(float *ptr, vfloat v)
|
||||
{ vstore_v_p_vf(ptr, v); }
|
||||
static INLINE void vstream_v_p_vd(double *ptr, vdouble v)
|
||||
{ vstore_v_p_vd(ptr, v); }
|
||||
|
||||
////////////// LUT //////////////
|
||||
//// load
|
||||
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi)
|
||||
{ return vset__vd(ptr[vec_extract(vi, 0)], ptr[vec_extract(vi, 1)]); }
|
||||
|
||||
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2)
|
||||
{
|
||||
return vset__vf(
|
||||
ptr[vec_extract(vi2, 0)], ptr[vec_extract(vi2, 1)],
|
||||
ptr[vec_extract(vi2, 2)], ptr[vec_extract(vi2, 3)]
|
||||
);
|
||||
}
|
||||
|
||||
//// store
|
||||
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v)
|
||||
{
|
||||
const v__u64 vll = (v__u64)v;
|
||||
float *ptr_low = ptr + offset*2;
|
||||
float *ptr_high = ptr + (offset + step)*2;
|
||||
*((uint64_t*)ptr_low) = vec_extract(vll, 0);
|
||||
*((uint64_t*)ptr_high) = vec_extract(vll, 1);
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v)
|
||||
{ vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v)
|
||||
{ vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v)
|
||||
{ vscatter2_v_p_i_i_vd(ptr, offset, step, v); }
|
||||
|
||||
/**********************************************
|
||||
** Misc
|
||||
**********************************************/
|
||||
|
||||
// vector with a specific value set to all lanes (Vector Splat)
|
||||
static INLINE vint vcast_vi_i(int i)
|
||||
{ return vsetall__vi(i); }
|
||||
static INLINE vint2 vcast_vi2_i(int i)
|
||||
{ return vsetall__vi2(i); }
|
||||
static INLINE vfloat vcast_vf_f(float f)
|
||||
{ return vsetall__vf(f); }
|
||||
static INLINE vdouble vcast_vd_d(double d)
|
||||
{ return vsetall__vd(d); }
|
||||
// cast
|
||||
static INLINE vint2 vcast_vi2_vm(vmask vm)
|
||||
{ return (vint2)vm; }
|
||||
static INLINE vmask vcast_vm_vi2(vint2 vi)
|
||||
{ return (vmask)vi; }
|
||||
// get the first element
|
||||
static INLINE float vcast_f_vf(vfloat v)
|
||||
{ return vec_extract(v, 0); }
|
||||
static INLINE double vcast_d_vd(vdouble v)
|
||||
{ return vec_extract(v, 0); }
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vd(vdouble vd)
|
||||
{ return (vmask)vd; }
|
||||
static INLINE vdouble vreinterpret_vd_vm(vmask vm)
|
||||
{ return (vdouble)vm; }
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vf(vfloat vf)
|
||||
{ return (vmask)vf; }
|
||||
static INLINE vfloat vreinterpret_vf_vm(vmask vm)
|
||||
{ return (vfloat)vm; }
|
||||
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi)
|
||||
{ return (vfloat)vi; }
|
||||
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf)
|
||||
{ return (vint2)vf; }
|
||||
|
||||
// per element select via mask (blend)
|
||||
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y)
|
||||
{ return vec_sel(y, x, (v__b64)o); }
|
||||
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y)
|
||||
{ return vec_sel(y, x, o); }
|
||||
|
||||
static INLINE vint vsel_vi_vo_vi_vi(vopmask o, vint x, vint y)
|
||||
{ return vec_sel(y, x, o); }
|
||||
|
||||
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y)
|
||||
{ return vec_sel(y, x, o); }
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0)
|
||||
{
|
||||
return vsel_vf_vo_vf_vf(o, vsetall__vf(v1), vsetall__vf(v0));
|
||||
}
|
||||
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2)
|
||||
{
|
||||
return vsel_vf_vo_vf_vf(o0, vsetall__vf(d0), vsel_vf_vo_f_f(o1, d1, d2));
|
||||
}
|
||||
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3)
|
||||
{
|
||||
return vsel_vf_vo_vf_vf(o0, vsetall__vf(d0), vsel_vf_vo_vf_vf(o1, vsetall__vf(d1), vsel_vf_vo_f_f(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0)
|
||||
{
|
||||
return vsel_vd_vo_vd_vd(o, vsetall__vd(v1), vsetall__vd(v0));
|
||||
}
|
||||
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2)
|
||||
{
|
||||
return vsel_vd_vo_vd_vd(o0, vsetall__vd(d0), vsel_vd_vo_d_d(o1, d1, d2));
|
||||
}
|
||||
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3)
|
||||
{
|
||||
return vsel_vd_vo_vd_vd(o0, vsetall__vd(d0), vsel_vd_vo_vd_vd(o1, vsetall__vd(d1), vsel_vd_vo_d_d(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE int vtestallones_i_vo32(vopmask g)
|
||||
{ return vec_all_ne((vint2)g, vzero__vi2()); }
|
||||
static INLINE int vtestallones_i_vo64(vopmask g)
|
||||
{ return vec_all_ne((v__i64)g, vzero__s64()); }
|
||||
|
||||
/**********************************************
|
||||
** Conversions
|
||||
**********************************************/
|
||||
|
||||
////////////// Numeric //////////////
|
||||
// pack 64-bit mask to 32-bit
|
||||
static INLINE vopmask vcast_vo32_vo64(vopmask m)
|
||||
{ return (vopmask)vec_pack((v__u64)m, (v__u64)m); }
|
||||
// clip 64-bit lanes to lower 32-bit
|
||||
static INLINE vint vcastu_vi_vi2(vint2 vi2)
|
||||
{ return vec_mergeo(vi2, vec_splat(vi2, 3)); }
|
||||
static INLINE vint vcastu_vi_vm(vmask vi2)
|
||||
{ return vec_mergeo((vint2)vi2, vec_splat((vint2)vi2, 3)); }
|
||||
|
||||
|
||||
// expand lower 32-bit mask
|
||||
static INLINE vopmask vcast_vo64_vo32(vopmask m)
|
||||
{ return vec_mergeh(m, m); }
|
||||
// unsigned expand lower 32-bit integer
|
||||
static INLINE vint2 vcastu_vi2_vi(vint vi)
|
||||
{ return vec_mergeh(vzero__vi(), vi); }
|
||||
static INLINE vmask vcastu_vm_vi(vint vi)
|
||||
{ return (vmask)vec_mergeh(vzero__vi(), vi); }
|
||||
|
||||
static INLINE vopmask vcast_vo_i(int i) {
|
||||
i = i ? -1 : 0;
|
||||
return (vopmask) { i, i, i, i };
|
||||
}
|
||||
|
||||
// signed int to single-precision
|
||||
static INLINE vfloat vcast_vf_vi2(vint2 vi)
|
||||
{
|
||||
vfloat ret;
|
||||
#if defined(__clang__) || __GNUC__ >= 9
|
||||
ret = __builtin_convertvector(vi, vfloat);
|
||||
#else
|
||||
__asm__ __volatile__("xvcvsxwsp %x0,%x1" : "=wa" (ret) : "wa" (vi));
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
||||
// lower signed int to double-precision
|
||||
static INLINE vdouble vcast_vd_vi(vint vi)
|
||||
{
|
||||
vdouble ret;
|
||||
vint swap = vec_mergeh(vi, vi);
|
||||
#if defined(__clang__) || __GNUC__ >= 7
|
||||
ret = __builtin_vsx_xvcvsxwdp(swap);
|
||||
#else
|
||||
__asm__ __volatile__("xvcvsxwdp %x0,%x1" : "=wa" (ret) : "wa" (swap));
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
||||
// zip two scalars
|
||||
static INLINE vmask vcast_vm_i_i(int l, int h)
|
||||
{ return (vmask)vec_mergeh(vsetall__vi2(h), vsetall__vi2(l)); }
|
||||
|
||||
static INLINE vmask vcast_vm_i64(int64_t i) {
|
||||
return (vmask)vsetall__s64(i);
|
||||
}
|
||||
static INLINE vmask vcast_vm_u64(uint64_t i) {
|
||||
return (vmask)vsetall__u64(i);
|
||||
}
|
||||
|
||||
////////////// Truncation //////////////
|
||||
|
||||
static INLINE vint2 vtruncate_vi2_vf(vfloat vf)
|
||||
{
|
||||
vint2 ret;
|
||||
#if defined(__clang__) || __GNUC__ >= 9
|
||||
ret = __builtin_convertvector(vf, vint2);
|
||||
#else
|
||||
__asm__ __volatile__("xvcvspsxws %x0,%x1" : "=wa" (ret) : "wa" (vf));
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vint vtruncate_vi_vd(vdouble vd)
|
||||
{
|
||||
vint ret;
|
||||
#if defined(__clang__) || __GNUC__ >= 7
|
||||
ret = __builtin_vsx_xvcvdpsxws(vd);
|
||||
#else
|
||||
__asm__ __volatile__("xvcvdpsxws %x0,%x1" : "=wa" (ret) : "wa" (vd));
|
||||
#endif
|
||||
return vec_mergeo(ret, vec_splat(ret, 3));
|
||||
}
|
||||
|
||||
static INLINE vdouble vtruncate_vd_vd(vdouble vd)
|
||||
{ return vec_trunc(vd); }
|
||||
static INLINE vfloat vtruncate_vf_vf(vfloat vf)
|
||||
{ return vec_trunc(vf); }
|
||||
|
||||
////////////// Rounding //////////////
|
||||
|
||||
// towards the nearest even
|
||||
static INLINE vint vrint_vi_vd(vdouble vd)
|
||||
{ return vtruncate_vi_vd(vec_rint(vd)); }
|
||||
static INLINE vint2 vrint_vi2_vf(vfloat vf)
|
||||
{ return vtruncate_vi2_vf(vec_rint(vf)); }
|
||||
static INLINE vdouble vrint_vd_vd(vdouble vd)
|
||||
{ return vec_rint(vd); }
|
||||
static INLINE vfloat vrint_vf_vf(vfloat vf)
|
||||
{ return vec_rint(vf); }
|
||||
|
||||
/**********************************************
|
||||
** Logical
|
||||
**********************************************/
|
||||
|
||||
////////////// And //////////////
|
||||
static INLINE vint vand_vi_vi_vi(vint x, vint y)
|
||||
{ return vec_and(x, y); }
|
||||
static INLINE vint vand_vi_vo_vi(vopmask x, vint y)
|
||||
{ return vec_and((vint)x, y); }
|
||||
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y)
|
||||
{ return vec_and(x, y); }
|
||||
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y)
|
||||
{ return (vint2)vec_and((vint2)x, y); }
|
||||
|
||||
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y)
|
||||
{ return vec_and(x, y); }
|
||||
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y)
|
||||
{ return vec_and((vmask)x, y); }
|
||||
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y)
|
||||
{ return vec_and((vmask)x, y); }
|
||||
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y)
|
||||
{ return vec_and(x, y); }
|
||||
|
||||
////////////// Or //////////////
|
||||
static INLINE vint vor_vi_vi_vi(vint x, vint y)
|
||||
{ return vec_or(x, y); }
|
||||
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y)
|
||||
{ return vec_or(x, y); }
|
||||
|
||||
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y)
|
||||
{ return vec_or(x, y); }
|
||||
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y)
|
||||
{ return vec_or((vmask)x, y); }
|
||||
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y)
|
||||
{ return vec_or((vmask)x, y); }
|
||||
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y)
|
||||
{ return vec_or(x, y); }
|
||||
|
||||
////////////// Xor //////////////
|
||||
static INLINE vint vxor_vi_vi_vi(vint x, vint y)
|
||||
{ return vec_xor(x, y); }
|
||||
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y)
|
||||
{ return vec_xor(x, y); }
|
||||
|
||||
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y)
|
||||
{ return vec_xor(x, y); }
|
||||
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y)
|
||||
{ return vec_xor((vmask)x, y); }
|
||||
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y)
|
||||
{ return vec_xor((vmask)x, y); }
|
||||
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y)
|
||||
{ return vec_xor(x, y); }
|
||||
|
||||
////////////// Not //////////////
|
||||
static INLINE vopmask vnot_vo_vo(vopmask o)
|
||||
{ return vec_nor(o, o); }
|
||||
|
||||
////////////// And Not ((~x) & y) //////////////
|
||||
static INLINE vint vandnot_vi_vi_vi(vint x, vint y)
|
||||
{ return vec_andc(y, x); }
|
||||
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y)
|
||||
{ return vec_andc(y, (vint)x); }
|
||||
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y)
|
||||
{ return vec_andc(y, x); }
|
||||
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y)
|
||||
{ return vec_andc(y, x); }
|
||||
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y)
|
||||
{ return vec_andc(y, x); }
|
||||
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y)
|
||||
{ return vec_andc(y, x); }
|
||||
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y)
|
||||
{ return vec_andc(y, x); }
|
||||
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y)
|
||||
{ return vec_andc(y, (vint2)x); }
|
||||
|
||||
/**********************************************
|
||||
** Comparison
|
||||
**********************************************/
|
||||
|
||||
////////////// Equal //////////////
|
||||
static INLINE vint veq_vi_vi_vi(vint x, vint y)
|
||||
{ return (vint)vec_cmpeq(x, y); }
|
||||
static INLINE vopmask veq_vo_vi_vi(vint x, vint y)
|
||||
{ return vec_cmpeq(x, y); }
|
||||
|
||||
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y)
|
||||
{ return vec_cmpeq(x, y); }
|
||||
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y)
|
||||
{ return (vint2)vec_cmpeq(x, y); }
|
||||
|
||||
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y)
|
||||
{ return (vopmask)vec_cmpeq((v__u64)x, (v__u64)y); }
|
||||
|
||||
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y)
|
||||
{ return vec_cmpeq(x, y); }
|
||||
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y)
|
||||
{ return (vopmask)vec_cmpeq(x, y); }
|
||||
|
||||
////////////// Not Equal //////////////
|
||||
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y)
|
||||
{ return vnot_vo_vo(vec_cmpeq(x, y)); }
|
||||
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y)
|
||||
{ return vnot_vo_vo((vopmask)vec_cmpeq(x, y)); }
|
||||
|
||||
////////////// Less Than //////////////
|
||||
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y)
|
||||
{ return vec_cmplt(x, y); }
|
||||
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y)
|
||||
{ return (vopmask)vec_cmplt(x, y); }
|
||||
|
||||
////////////// Greater Than //////////////
|
||||
static INLINE vint vgt_vi_vi_vi(vint x, vint y)
|
||||
{ return (vint)vec_cmpgt(x, y); }
|
||||
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y)
|
||||
{ return vec_cmpgt(x, y);}
|
||||
|
||||
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y)
|
||||
{ return (vint2)vec_cmpgt(x, y); }
|
||||
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y)
|
||||
{ return vec_cmpgt(x, y); }
|
||||
|
||||
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y)
|
||||
{ return vec_cmpgt(x, y); }
|
||||
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y)
|
||||
{ return (vopmask)vec_cmpgt(x, y); }
|
||||
|
||||
////////////// Less Than Or Equal //////////////
|
||||
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y)
|
||||
{ return vec_cmple(x, y); }
|
||||
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y)
|
||||
{ return (vopmask)vec_cmple(x, y); }
|
||||
|
||||
////////////// Greater Than Or Equal //////////////
|
||||
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y)
|
||||
{ return vec_cmpge(x, y); }
|
||||
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y)
|
||||
{ return (vopmask)vec_cmpge(x, y); }
|
||||
|
||||
////////////// Special Cases //////////////
|
||||
static INLINE vopmask visinf_vo_vf(vfloat d)
|
||||
{ return vec_cmpeq(vec_abs(d), vsetall__vf(SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask visinf_vo_vd(vdouble d)
|
||||
{ return (vopmask)vec_cmpeq(vec_abs(d), vsetall__vd(SLEEF_INFINITY)); }
|
||||
|
||||
static INLINE vopmask vispinf_vo_vf(vfloat d)
|
||||
{ return vec_cmpeq(d, vsetall__vf(SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask vispinf_vo_vd(vdouble d)
|
||||
{ return (vopmask)vec_cmpeq(d, vsetall__vd(SLEEF_INFINITY)); }
|
||||
|
||||
static INLINE vopmask visminf_vo_vf(vfloat d)
|
||||
{ return vec_cmpeq(d, vsetall__vf(-SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask visminf_vo_vd(vdouble d)
|
||||
{ return (vopmask)vec_cmpeq(d, vsetall__vd(-SLEEF_INFINITY)); }
|
||||
|
||||
static INLINE vopmask visnan_vo_vf(vfloat d)
|
||||
{ return vnot_vo_vo(vec_cmpeq(d, d)); }
|
||||
static INLINE vopmask visnan_vo_vd(vdouble d)
|
||||
{ return vnot_vo_vo((vopmask)vec_cmpeq(d, d)); }
|
||||
|
||||
/**********************************************
|
||||
** Shift
|
||||
**********************************************/
|
||||
////////////// Left //////////////
|
||||
static INLINE vint vsll_vi_vi_i(vint x, int c)
|
||||
{ return vec_sl (x, vsetall__u32(c)); }
|
||||
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c)
|
||||
{ return vec_sl(x, vsetall__u32(c)); }
|
||||
|
||||
////////////// Right //////////////
|
||||
static INLINE vint vsrl_vi_vi_i(vint x, int c)
|
||||
{ return vec_sr(x, vsetall__u32(c)); }
|
||||
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c)
|
||||
{ return vec_sr(x, vsetall__u32(c)); }
|
||||
|
||||
////////////// Algebraic Right //////////////
|
||||
static INLINE vint vsra_vi_vi_i(vint x, int c)
|
||||
{ return vec_sra(x, vsetall__u32(c)); }
|
||||
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c)
|
||||
{ return vec_sra(x, vsetall__u32(c)); }
|
||||
|
||||
/**********************************************
|
||||
** Reorder
|
||||
**********************************************/
|
||||
|
||||
////////////// Reverse //////////////
|
||||
// Reverse elements order inside the lower and higher parts
|
||||
static INLINE vint2 vrev21_vi2_vi2(vint2 vi)
|
||||
{ return vec_mergee(vec_mergeo(vi, vi), vi); }
|
||||
static INLINE vfloat vrev21_vf_vf(vfloat vf)
|
||||
{ return (vfloat)vrev21_vi2_vi2((vint2)vf); }
|
||||
|
||||
// Swap the lower and higher parts
|
||||
static INLINE vfloat vreva2_vf_vf(vfloat vf)
|
||||
{ return (vfloat)v__swapd_u64((v__u64)vf); }
|
||||
static INLINE vdouble vrev21_vd_vd(vdouble vd)
|
||||
{ return (vdouble)v__swapd_u64((v__u64)vd); }
|
||||
static INLINE vdouble vreva2_vd_vd(vdouble vd)
|
||||
{ return vd; }
|
||||
|
||||
/**********************************************
|
||||
** Arithmetic
|
||||
**********************************************/
|
||||
|
||||
////////////// Negation //////////////
|
||||
static INLINE vint vneg_vi_vi(vint e) {
|
||||
#if defined(__clang__) || __GNUC__ >= 9
|
||||
return vec_neg(e);
|
||||
#else
|
||||
return vec_sub(vzero__vi(), e);
|
||||
#endif
|
||||
}
|
||||
static INLINE vint2 vneg_vi2_vi2(vint2 e)
|
||||
{ return vneg_vi_vi(e); }
|
||||
|
||||
static INLINE vfloat vneg_vf_vf(vfloat d)
|
||||
{
|
||||
vfloat ret;
|
||||
#if defined(__clang__) || __GNUC__ >= 9
|
||||
ret = vec_neg(d);
|
||||
#else
|
||||
__asm__ __volatile__("xvnegsp %x0,%x1" : "=wa" (ret) : "wa" (d));
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vdouble vneg_vd_vd(vdouble d)
|
||||
{
|
||||
vdouble ret;
|
||||
#if defined(__clang__) || __GNUC__ >= 9
|
||||
ret = vec_neg(d);
|
||||
#else
|
||||
__asm__ __volatile__("xvnegdp %x0,%x1" : "=wa" (ret) : "wa" (d));
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vfloat vposneg_vf_vf(vfloat d)
|
||||
{ return vec_xor(d, vset__vf(+0.0f, -0.0f, +0.0f, -0.0f)); }
|
||||
static INLINE vdouble vposneg_vd_vd(vdouble d)
|
||||
{ return vec_xor(d, vset__vd(+0.0, -0.0)); }
|
||||
|
||||
static INLINE vfloat vnegpos_vf_vf(vfloat d)
|
||||
{ return vec_xor(d, vset__vf(-0.0f, +0.0f, -0.0f, +0.0f)); }
|
||||
static INLINE vdouble vnegpos_vd_vd(vdouble d)
|
||||
{ return vec_xor(d, vset__vd(-0.0, +0.0)); }
|
||||
|
||||
////////////// Addition //////////////
|
||||
static INLINE vint vadd_vi_vi_vi(vint x, vint y)
|
||||
{ return vec_add(x, y); }
|
||||
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y)
|
||||
{ return vec_add(x, y); }
|
||||
|
||||
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y)
|
||||
{ return vec_add(x, y); }
|
||||
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y)
|
||||
{ return vec_add(x, y); }
|
||||
|
||||
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y)
|
||||
{ return (vmask)vec_add((v__i64)x, (v__i64)y); }
|
||||
|
||||
////////////// Subtraction //////////////
|
||||
static INLINE vint vsub_vi_vi_vi(vint x, vint y)
|
||||
{ return vec_sub(x, y); }
|
||||
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y)
|
||||
{ return vec_sub(x, y); }
|
||||
|
||||
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y)
|
||||
{ return vec_sub(x, y); }
|
||||
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y)
|
||||
{ return vec_sub(x, y); }
|
||||
|
||||
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y)
|
||||
{ return vec_add(x, vnegpos_vd_vd(y)); }
|
||||
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y)
|
||||
{ return vec_add(x, vnegpos_vf_vf(y)); }
|
||||
|
||||
////////////// Multiplication //////////////
|
||||
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y)
|
||||
{ return vec_mul(x, y); }
|
||||
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y)
|
||||
{ return vec_mul(x, y); }
|
||||
|
||||
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y)
|
||||
{ return vec_div(x, y); }
|
||||
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y)
|
||||
{ return vec_div(x, y); }
|
||||
|
||||
static INLINE vfloat vrec_vf_vf(vfloat x)
|
||||
{ return vec_div(vsetall__vf(1.0f), x); }
|
||||
static INLINE vdouble vrec_vd_vd(vdouble x)
|
||||
{ return vec_div(vsetall__vd(1.0), x); }
|
||||
|
||||
/**********************************************
|
||||
** Math
|
||||
**********************************************/
|
||||
|
||||
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y)
|
||||
{ return vec_max(x, y); }
|
||||
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y)
|
||||
{ return vec_max(x, y); }
|
||||
|
||||
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y)
|
||||
{ return vec_min(x, y); }
|
||||
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y)
|
||||
{ return vec_min(x, y); }
|
||||
|
||||
static INLINE vfloat vabs_vf_vf(vfloat f)
|
||||
{ return vec_abs(f); }
|
||||
static INLINE vdouble vabs_vd_vd(vdouble d)
|
||||
{ return vec_abs(d); }
|
||||
|
||||
static INLINE vfloat vsqrt_vf_vf(vfloat f)
|
||||
{ return vec_sqrt(f); }
|
||||
static INLINE vdouble vsqrt_vd_vd(vdouble d)
|
||||
{ return vec_sqrt(d); }
|
||||
|
||||
|
||||
/**********************************************
|
||||
** FMA3
|
||||
**********************************************/
|
||||
#if CONFIG == 1 || CONFIG == 3
|
||||
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
|
||||
{ return vec_madd(x, y, z); }
|
||||
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
|
||||
{ return vec_madd(x, y, z); }
|
||||
|
||||
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
|
||||
{ return vec_msub(x, y, z); }
|
||||
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
|
||||
{ return vec_msub(x, y, z); }
|
||||
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
|
||||
{ return vec_nmsub(x, y, z); }
|
||||
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
|
||||
{ return vec_nmsub(x, y, z); }
|
||||
|
||||
#else
|
||||
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
|
||||
{ return vec_add(vec_mul(x, y), z); }
|
||||
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
|
||||
{ return vec_add(vec_mul(x, y), z); }
|
||||
|
||||
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
|
||||
{ return vec_sub(vec_mul(x, y), z); }
|
||||
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
|
||||
{ return vec_sub(vec_mul(x, y), z); }
|
||||
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
|
||||
{ return vec_sub(z, vec_mul(x, y)); }
|
||||
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
|
||||
{ return vec_sub(z, vec_mul(x, y)); }
|
||||
|
||||
#endif
|
||||
|
||||
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
|
||||
{ return vec_madd(x, y, z); }
|
||||
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
|
||||
{ return vec_madd(x, y, z); }
|
||||
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
|
||||
{ return vec_madd(x, y, z); }
|
||||
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
|
||||
{ return vec_madd(x, y, z); }
|
||||
|
||||
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
|
||||
{ return vec_msub(x, y, z); }
|
||||
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
|
||||
{ return vec_msub(x, y, z); }
|
||||
|
||||
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
|
||||
{ return vec_nmsub(x, y, z); }
|
||||
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
|
||||
{ return vec_nmsub(x, y, z); }
|
||||
|
||||
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
|
||||
{ return vec_nmadd(x, y, z); }
|
||||
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
|
||||
{ return vec_nmadd(x, y, z); }
|
||||
|
||||
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
|
||||
{ return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }
|
||||
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
|
||||
{ return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
|
||||
|
||||
//
|
||||
|
||||
static vquad loadu_vq_p(void *p) {
|
||||
vquad vq;
|
||||
memcpy(&vq, p, VECTLENDP * 16);
|
||||
return vq;
|
||||
}
|
||||
|
||||
static INLINE vquad cast_vq_aq(vargquad aq) {
|
||||
vquad vq;
|
||||
memcpy(&vq, &aq, VECTLENDP * 16);
|
||||
return vq;
|
||||
}
|
||||
|
||||
static INLINE vargquad cast_aq_vq(vquad vq) {
|
||||
vargquad aq;
|
||||
memcpy(&aq, &vq, VECTLENDP * 16);
|
||||
return aq;
|
||||
}
|
||||
|
||||
static INLINE int vtestallzeros_i_vo64(vopmask g) {
|
||||
return vec_all_eq((__vector signed long long)g, vzero__s64());
|
||||
}
|
||||
|
||||
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) {
|
||||
return (vmask)vec_sel((__vector signed long long)y, (__vector signed long long)x, (v__b64)o);
|
||||
}
|
||||
|
||||
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) {
|
||||
return (vmask)vec_sub((__vector signed long long)x, (__vector signed long long)y);
|
||||
}
|
||||
|
||||
static INLINE vmask vneg64_vm_vm(vmask x) {
|
||||
return (vmask)vec_sub((__vector signed long long) {0, 0}, (__vector signed long long)x);
|
||||
}
|
||||
|
||||
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
|
||||
return (vopmask)vec_cmpgt((__vector signed long long)x, (__vector signed long long)y);
|
||||
}
|
||||
|
||||
#define vsll64_vm_vm_i(x, c) ((vmask)vec_sl((__vector signed long long)x, (__vector unsigned long long)vsetall__vm(c)))
|
||||
#define vsrl64_vm_vm_i(x, c) ((vmask)vec_sr((__vector signed long long)x, (__vector unsigned long long)vsetall__vm(c)))
|
||||
|
||||
static INLINE vint vcast_vi_vm(vmask vm) {
|
||||
return (vint) { vm[0], vm[2] };
|
||||
}
|
||||
|
||||
static INLINE vmask vcast_vm_vi(vint vi) {
|
||||
return (vmask) (__vector signed long long) { vi[0], vi[1] };
|
||||
}
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return (vmask)v; }
|
||||
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return (vint64)m; }
|
||||
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return (vmask)v; }
|
||||
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return (vuint64)m; }
|
||||
@@ -0,0 +1,561 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdint.h>
#include <math.h>
#include "misc.h"

// CONFIG selects the (log2) vector length and must come from the build.
#ifndef CONFIG
#error CONFIG macro not defined
#endif

// Both double- and single-precision kernels are built from this helper.
// NOTE(review): the "//@#define" duplicates appear to be consumed by
// SLEEF's own source-processing tooling -- keep them in sync with the
// real #defines and do not remove them.
#define ENABLE_DP
//@#define ENABLE_DP
#define ENABLE_SP
//@#define ENABLE_SP

// VECTLENDP doubles per vector; single precision packs twice as many.
#define LOG2VECTLENDP CONFIG
//@#define LOG2VECTLENDP CONFIG
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)

#define ACCURATE_SQRT
//@#define ACCURATE_SQRT

#define DFTPRIORITY LOG2VECTLENDP
#define ISANAME "Pure C Array"
typedef union {
|
||||
uint32_t u[VECTLENDP*2];
|
||||
uint64_t x[VECTLENDP];
|
||||
double d[VECTLENDP];
|
||||
float f[VECTLENDP*2];
|
||||
int32_t i[VECTLENDP*2];
|
||||
} versatileVector;
|
||||
|
||||
typedef versatileVector vmask;
|
||||
typedef versatileVector vopmask;
|
||||
typedef versatileVector vdouble;
|
||||
typedef versatileVector vint;
|
||||
typedef versatileVector vfloat;
|
||||
typedef versatileVector vint2;
|
||||
|
||||
typedef union {
|
||||
uint8_t u[sizeof(long double)*VECTLENDP];
|
||||
long double ld[VECTLENDP];
|
||||
} longdoubleVector;
|
||||
|
||||
typedef longdoubleVector vmaskl;
|
||||
typedef longdoubleVector vlongdouble;
|
||||
|
||||
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
|
||||
typedef union {
|
||||
uint8_t u[sizeof(Sleef_quad)*VECTLENDP];
|
||||
Sleef_quad q[VECTLENDP];
|
||||
} quadVector;
|
||||
|
||||
typedef quadVector vmaskq;
|
||||
typedef quadVector vquad;
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
static INLINE int vavailability_i(int name) { return -1; }
|
||||
static INLINE void vprefetch_v_p(const void *ptr) { }
|
||||
|
||||
static INLINE int vtestallones_i_vo64(vopmask g) {
|
||||
int ret = 1; for(int i=0;i<VECTLENDP;i++) ret = ret && g.x[i]; return ret;
|
||||
}
|
||||
|
||||
static INLINE int vtestallones_i_vo32(vopmask g) {
|
||||
int ret = 1; for(int i=0;i<VECTLENSP;i++) ret = ret && g.u[i]; return ret;
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static vint2 vloadu_vi2_p(int32_t *p) {
|
||||
vint2 vi;
|
||||
for(int i=0;i<VECTLENSP;i++) vi.i[i] = p[i];
|
||||
return vi;
|
||||
}
|
||||
|
||||
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) {
|
||||
for(int i=0;i<VECTLENSP;i++) p[i] = v.i[i];
|
||||
}
|
||||
|
||||
static vint vloadu_vi_p(int32_t *p) {
|
||||
vint vi;
|
||||
for(int i=0;i<VECTLENDP;i++) vi.i[i] = p[i];
|
||||
return vi;
|
||||
}
|
||||
|
||||
static void vstoreu_v_p_vi(int32_t *p, vint v) {
|
||||
for(int i=0;i<VECTLENDP;i++) p[i] = v.i[i];
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static INLINE vopmask vcast_vo32_vo64(vopmask m) {
|
||||
vopmask ret;
|
||||
for(int i=0;i<VECTLENDP;i++) ret.u[i] = m.u[i*2+1];
|
||||
for(int i=VECTLENDP;i<VECTLENDP*2;i++) ret.u[i] = 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vopmask vcast_vo64_vo32(vopmask m) {
|
||||
vopmask ret;
|
||||
for(int i=0;i<VECTLENDP;i++) ret.u[i*2] = ret.u[i*2+1] = m.u[i];
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vmask vcast_vm_i_i(int h, int l) {
|
||||
vmask ret;
|
||||
for(int i=0;i<VECTLENDP;i++) {
|
||||
ret.u[i*2+0] = l;
|
||||
ret.u[i*2+1] = h;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vint2 vcastu_vi2_vi(vint vi) {
|
||||
vint2 ret;
|
||||
for(int i=0;i<VECTLENDP;i++) {
|
||||
ret.i[i*2+0] = 0;
|
||||
ret.i[i*2+1] = vi.i[i];
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vint vcastu_vi_vi2(vint2 vi2) {
|
||||
vint ret;
|
||||
for(int i=0;i<VECTLENDP;i++) ret.i[i] = vi2.i[i*2+1];
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) {
|
||||
vint ret;
|
||||
for(int i=0;i<VECTLENDP;i++) ret.i[i] = vi2.i[i];
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) {
|
||||
vint2 ret;
|
||||
for(int i=0;i<VECTLENDP;i++) ret.i[i] = vi.i[i];
|
||||
for(int i=VECTLENDP;i<VECTLENDP*2;i++) ret.i[i] = 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vdouble vrev21_vd_vd(vdouble d0) {
|
||||
vdouble r;
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
r.d[i*2+0] = d0.d[i*2+1];
|
||||
r.d[i*2+1] = d0.d[i*2+0];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vdouble vreva2_vd_vd(vdouble d0) {
|
||||
vdouble r;
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
r.d[i*2+0] = d0.d[(VECTLENDP/2-1-i)*2+0];
|
||||
r.d[i*2+1] = d0.d[(VECTLENDP/2-1-i)*2+1];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vfloat vrev21_vf_vf(vfloat d0) {
|
||||
vfloat r;
|
||||
for(int i=0;i<VECTLENSP/2;i++) {
|
||||
r.f[i*2+0] = d0.f[i*2+1];
|
||||
r.f[i*2+1] = d0.f[i*2+0];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vfloat vreva2_vf_vf(vfloat d0) {
|
||||
vfloat r;
|
||||
for(int i=0;i<VECTLENSP/2;i++) {
|
||||
r.f[i*2+0] = d0.f[(VECTLENSP/2-1-i)*2+0];
|
||||
r.f[i*2+1] = d0.f[(VECTLENSP/2-1-i)*2+1];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vdouble vcast_vd_d(double d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = d; return ret; }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vopmask vand_vo_vo_vo (vopmask x, vopmask y) { vopmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] & y.u[i]; return ret; }
|
||||
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { vopmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = y.u[i] & ~x.u[i]; return ret; }
|
||||
static INLINE vopmask vor_vo_vo_vo (vopmask x, vopmask y) { vopmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] | y.u[i]; return ret; }
|
||||
static INLINE vopmask vxor_vo_vo_vo (vopmask x, vopmask y) { vopmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] ^ y.u[i]; return ret; }
|
||||
|
||||
static INLINE vmask vand_vm_vm_vm (vmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] & y.u[i]; return ret; }
|
||||
static INLINE vmask vandnot_vm_vm_vm (vmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = y.u[i] & ~x.u[i]; return ret; }
|
||||
static INLINE vmask vor_vm_vm_vm (vmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] | y.u[i]; return ret; }
|
||||
static INLINE vmask vxor_vm_vm_vm (vmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] ^ y.u[i]; return ret; }
|
||||
|
||||
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] & y.u[i]; return ret; }
|
||||
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = y.u[i] & ~x.u[i]; return ret; }
|
||||
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] | y.u[i]; return ret; }
|
||||
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] ^ y.u[i]; return ret; }
|
||||
|
||||
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] & y.u[i]; return ret; }
|
||||
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = y.u[i] & ~x.u[i]; return ret; }
|
||||
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] | y.u[i]; return ret; }
|
||||
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] ^ y.u[i]; return ret; }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vd_vd (vopmask o, vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = (o.u[i] & x.u[i]) | (y.u[i] & ~o.u[i]); return ret; }
|
||||
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y) { vint2 ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = (o.u[i] & x.u[i]) | (y.u[i] & ~o.u[i]); return ret; }
|
||||
|
||||
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
|
||||
return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
|
||||
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
|
||||
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE vdouble vcast_vd_vi(vint vi) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = vi.i[i]; return ret; }
|
||||
static INLINE vint vtruncate_vi_vd(vdouble vd) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = (int)vd.d[i]; return ret; }
|
||||
static INLINE vint vrint_vi_vd(vdouble vd) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = vd.d[i] > 0 ? (int)(vd.d[i] + 0.5) : (int)(vd.d[i] - 0.5); return ret; }
|
||||
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); }
|
||||
static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); }
|
||||
static INLINE vint vcast_vi_i(int j) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = j; return ret; }
|
||||
|
||||
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.x[i] == y.x[i] ? -1 : 0; return ret; }
|
||||
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.x[i] + y.x[i]; return ret; }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { union { vdouble vd; vmask vm; } cnv; cnv.vd = vd; return cnv.vm; }
|
||||
static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { union { vdouble vd; vint2 vi2; } cnv; cnv.vd = vd; return cnv.vi2; }
|
||||
static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { union { vint2 vi2; vdouble vd; } cnv; cnv.vi2 = vi; return cnv.vd; }
|
||||
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { union { vmask vm; vdouble vd; } cnv; cnv.vm = vm; return cnv.vd; }
|
||||
|
||||
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] + y.d[i]; return ret; }
|
||||
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] - y.d[i]; return ret; }
|
||||
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] * y.d[i]; return ret; }
|
||||
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] / y.d[i]; return ret; }
|
||||
static INLINE vdouble vrec_vd_vd(vdouble x) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = 1.0 / x.d[i]; return ret; }
|
||||
|
||||
static INLINE vdouble vabs_vd_vd(vdouble d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = d.x[i] & 0x7fffffffffffffffULL; return ret; }
|
||||
static INLINE vdouble vneg_vd_vd(vdouble d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = -d.d[i]; return ret; }
|
||||
static INLINE vdouble vmla_vd_vd_vd_vd (vdouble x, vdouble y, vdouble z) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] * y.d[i] + z.d[i]; return ret; }
|
||||
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] * y.d[i] - z.d[i]; return ret; }
|
||||
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] > y.d[i] ? x.d[i] : y.d[i]; return ret; }
|
||||
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] < y.d[i] ? x.d[i] : y.d[i]; return ret; }
|
||||
|
||||
static INLINE vdouble vposneg_vd_vd(vdouble d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = (i & 1) == 0 ? d.d[i] : -d.d[i]; return ret; }
|
||||
static INLINE vdouble vnegpos_vd_vd(vdouble d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = (i & 1) == 0 ? -d.d[i] : d.d[i]; return ret; }
|
||||
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = (i & 1) == 0 ? x.d[i] - y.d[i] : x.d[i] + y.d[i]; return ret; }
|
||||
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
|
||||
|
||||
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] == y.d[i] ? -1 : 0; return ret; }
|
||||
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] != y.d[i] ? -1 : 0; return ret; }
|
||||
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] < y.d[i] ? -1 : 0; return ret; }
|
||||
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] <= y.d[i] ? -1 : 0; return ret; }
|
||||
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] > y.d[i] ? -1 : 0; return ret; }
|
||||
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] >= y.d[i] ? -1 : 0; return ret; }
|
||||
|
||||
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] + y.i[i]; return ret; }
|
||||
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] - y.i[i]; return ret; }
|
||||
static INLINE vint vneg_vi_vi (vint x) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = -x.i[i]; return ret; }
|
||||
|
||||
static INLINE vint vand_vi_vi_vi(vint x, vint y) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] & y.i[i]; return ret; }
|
||||
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = y.i[i] & ~x.i[i]; return ret; }
|
||||
static INLINE vint vor_vi_vi_vi(vint x, vint y) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] | y.i[i]; return ret; }
|
||||
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] ^ y.i[i]; return ret; }
|
||||
|
||||
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return vand_vi_vi_vi(vreinterpretFirstHalf_vi_vi2(x), y); }
|
||||
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return vandnot_vi_vi_vi(vreinterpretFirstHalf_vi_vi2(x), y); }
|
||||
|
||||
static INLINE vint vsll_vi_vi_i(vint x, int c) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] << c; return ret; }
|
||||
static INLINE vint vsrl_vi_vi_i(vint x, int c) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = ((uint32_t)x.i[i]) >> c; return ret; }
|
||||
static INLINE vint vsra_vi_vi_i(vint x, int c) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] >> c; return ret; }
|
||||
|
||||
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.u[i] = x.i[i] == y.i[i] ? -1 : 0; return ret; }
|
||||
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.u[i] = x.i[i] > y.i[i] ? -1 : 0; return ret; }
|
||||
|
||||
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
|
||||
union { vopmask vo; vint2 vi2; } cnv;
|
||||
cnv.vo = m;
|
||||
return vor_vi_vi_vi(vand_vi_vi_vi(vreinterpretFirstHalf_vi_vi2(cnv.vi2), x),
|
||||
vandnot_vi_vi_vi(vreinterpretFirstHalf_vi_vi2(cnv.vi2), y));
|
||||
}
|
||||
|
||||
static INLINE vopmask visinf_vo_vd(vdouble d) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = (d.d[i] == SLEEF_INFINITY || d.d[i] == -SLEEF_INFINITY) ? -1 : 0; return ret; }
|
||||
static INLINE vopmask vispinf_vo_vd(vdouble d) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = d.d[i] == SLEEF_INFINITY ? -1 : 0; return ret; }
|
||||
static INLINE vopmask visminf_vo_vd(vdouble d) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = d.d[i] == -SLEEF_INFINITY ? -1 : 0; return ret; }
|
||||
static INLINE vopmask visnan_vo_vd(vdouble d) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = d.d[i] != d.d[i] ? -1 : 0; return ret; }
|
||||
|
||||
static INLINE vdouble vsqrt_vd_vd(vdouble d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = sqrt(d.d[i]); return ret; }
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// This function is needed when debugging on MSVC.
|
||||
static INLINE double vcast_d_vd(vdouble v) { return v.d[0]; }
|
||||
#endif
|
||||
|
||||
static INLINE vdouble vload_vd_p(const double *ptr) { return *(vdouble *)ptr; }
|
||||
static INLINE vdouble vloadu_vd_p(const double *ptr) { vdouble vd; for(int i=0;i<VECTLENDP;i++) vd.d[i] = ptr[i]; return vd; }
|
||||
|
||||
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
|
||||
vdouble vd;
|
||||
for(int i=0;i<VECTLENDP;i++) vd.d[i] = ptr[vi.i[i]];
|
||||
return vd;
|
||||
}
|
||||
|
||||
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
|
||||
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { for(int i=0;i<VECTLENDP;i++) ptr[i] = v.d[i]; }
|
||||
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
*(ptr+(offset + step * i)*2 + 0) = v.d[i*2+0];
|
||||
*(ptr+(offset + step * i)*2 + 1) = v.d[i*2+1];
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vscatter2_v_p_i_i_vd(ptr, offset, step, v); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vint2 vcast_vi2_vm(vmask vm) { union { vint2 vi2; vmask vm; } cnv; cnv.vm = vm; return cnv.vi2; }
|
||||
static INLINE vmask vcast_vm_vi2(vint2 vi) { union { vint2 vi2; vmask vm; } cnv; cnv.vi2 = vi; return cnv.vm; }
|
||||
|
||||
static INLINE vfloat vcast_vf_vi2(vint2 vi) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = vi.i[i]; return ret; }
|
||||
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = (int)vf.f[i]; return ret; }
|
||||
static INLINE vint2 vrint_vi2_vf(vfloat vf) { vint ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = vf.f[i] > 0 ? (int)(vf.f[i] + 0.5) : (int)(vf.f[i] - 0.5); return ret; }
|
||||
static INLINE vint2 vcast_vi2_i(int j) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = j; return ret; }
|
||||
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
|
||||
static INLINE vfloat vrint_vf_vf(vfloat vd) { return vcast_vf_vi2(vrint_vi2_vf(vd)); }
|
||||
|
||||
static INLINE vfloat vcast_vf_f(float f) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = f; return ret; }
|
||||
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { union { vfloat vf; vmask vm; } cnv; cnv.vf = vf; return cnv.vm; }
|
||||
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { union { vfloat vf; vmask vm; } cnv; cnv.vm = vm; return cnv.vf; }
|
||||
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { union { vfloat vf; vint2 vi2; } cnv; cnv.vi2 = vi; return cnv.vf; }
|
||||
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { union { vfloat vf; vint2 vi2; } cnv; cnv.vf = vf; return cnv.vi2; }
|
||||
|
||||
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] + y.f[i]; return ret; }
|
||||
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] - y.f[i]; return ret; }
|
||||
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] * y.f[i]; return ret; }
|
||||
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] / y.f[i]; return ret; }
|
||||
static INLINE vfloat vrec_vf_vf (vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = 1.0 / x.f[i]; return ret; }
|
||||
|
||||
static INLINE vfloat vabs_vf_vf(vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] & 0x7fffffff; return ret; }
|
||||
static INLINE vfloat vneg_vf_vf(vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = -x.f[i]; return ret; }
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] * y.f[i] + z.f[i]; return ret; }
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] * y.f[i] - z.f[i]; return ret; }
|
||||
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] > y.f[i] ? x.f[i] : y.f[i]; return ret; }
|
||||
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] < y.f[i] ? x.f[i] : y.f[i]; return ret; }
|
||||
|
||||
static INLINE vfloat vposneg_vf_vf(vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = (i & 1) == 0 ? x.f[i] : -x.f[i]; return ret; }
|
||||
static INLINE vfloat vnegpos_vf_vf(vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = (i & 1) == 0 ? -x.f[i] : x.f[i]; return ret; }
|
||||
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = (i & 1) == 0 ? x.f[i] - y.f[i] : x.f[i] + y.f[i]; return ret; }
|
||||
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
|
||||
|
||||
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] == y.f[i]) ? -1 : 0); return ret; }
|
||||
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] != y.f[i]) ? -1 : 0); return ret; }
|
||||
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] < y.f[i]) ? -1 : 0); return ret; }
|
||||
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] <= y.f[i]) ? -1 : 0); return ret; }
|
||||
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] > y.f[i]) ? -1 : 0); return ret; }
|
||||
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] >= y.f[i]) ? -1 : 0); return ret; }
|
||||
|
||||
static INLINE vint vadd_vi2_vi2_vi2(vint x, vint y) { vint ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] + y.i[i]; return ret; }
|
||||
static INLINE vint vsub_vi2_vi2_vi2(vint x, vint y) { vint ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] - y.i[i]; return ret; }
|
||||
static INLINE vint vneg_vi2_vi2(vint x) { vint ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = -x.i[i]; return ret; }
|
||||
|
||||
static INLINE vint vand_vi2_vi2_vi2(vint x, vint y) { vint ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] & y.i[i]; return ret; }
|
||||
static INLINE vint vandnot_vi2_vi2_vi2(vint x, vint y) { vint ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = y.i[i] & ~x.i[i]; return ret; }
|
||||
static INLINE vint vor_vi2_vi2_vi2(vint x, vint y) { vint ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] | y.i[i]; return ret; }
|
||||
static INLINE vint vxor_vi2_vi2_vi2(vint x, vint y) { vint ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] ^ y.i[i]; return ret; }
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = (o.u[i] & x.u[i]) | (y.u[i] & ~o.u[i]); return ret; }
|
||||
|
||||
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
|
||||
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) {
|
||||
union { vopmask vo; vint2 vi2; } cnv;
|
||||
cnv.vo = x;
|
||||
return vand_vi2_vi2_vi2(cnv.vi2, y);
|
||||
}
|
||||
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi2_vi2_vi2(x, y); }
|
||||
|
||||
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] << c; return ret; }
|
||||
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = ((uint32_t)x.i[i]) >> c; return ret; }
|
||||
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] >> c; return ret; }
|
||||
|
||||
static INLINE vopmask visinf_vo_vf (vfloat d) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = (d.f[i] == SLEEF_INFINITYf || d.f[i] == -SLEEF_INFINITYf) ? -1 : 0; return ret; }
|
||||
static INLINE vopmask vispinf_vo_vf(vfloat d) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = d.f[i] == SLEEF_INFINITYf ? -1 : 0; return ret; }
|
||||
static INLINE vopmask visminf_vo_vf(vfloat d) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = d.f[i] == -SLEEF_INFINITYf ? -1 : 0; return ret; }
|
||||
static INLINE vopmask visnan_vo_vf (vfloat d) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = d.f[i] != d.f[i] ? -1 : 0; return ret; }
|
||||
|
||||
static INLINE vopmask veq_vo_vi2_vi2 (vint2 x, vint2 y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = x.i[i] == y.i[i] ? -1 : 0; return ret; }
|
||||
static INLINE vopmask vgt_vo_vi2_vi2 (vint2 x, vint2 y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = x.i[i] > y.i[i] ? -1 : 0; return ret; }
|
||||
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] == y.i[i] ? -1 : 0; return ret; }
|
||||
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] > y.i[i] ? -1 : 0; return ret; }
|
||||
|
||||
static INLINE vfloat vsqrt_vf_vf(vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = sqrtf(x.f[i]); return ret; }
|
||||
|
||||
#ifdef _MSC_VER
|
||||
// This function is needed when debugging on MSVC.
|
||||
static INLINE float vcast_f_vf(vfloat v) { return v.f[0]; }
|
||||
#endif
|
||||
|
||||
static INLINE vfloat vload_vf_p(const float *ptr) { return *(vfloat *)ptr; }
|
||||
static INLINE vfloat vloadu_vf_p(const float *ptr) {
|
||||
vfloat vf;
|
||||
for(int i=0;i<VECTLENSP;i++) vf.f[i] = ptr[i];
|
||||
return vf;
|
||||
}
|
||||
|
||||
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
|
||||
vfloat vf;
|
||||
for(int i=0;i<VECTLENSP;i++) vf.f[i] = ptr[vi2.i[i]];
|
||||
return vf;
|
||||
}
|
||||
|
||||
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
|
||||
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) {
|
||||
for(int i=0;i<VECTLENSP;i++) ptr[i] = v.f[i];
|
||||
}
|
||||
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
|
||||
for(int i=0;i<VECTLENSP/2;i++) {
|
||||
*(ptr+(offset + step * i)*2 + 0) = v.f[i*2+0];
|
||||
*(ptr+(offset + step * i)*2 + 1) = v.f[i*2+1];
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vlongdouble vcast_vl_l(long double d) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = d; return ret; }
|
||||
|
||||
static INLINE vlongdouble vrev21_vl_vl(vlongdouble d0) {
|
||||
vlongdouble r;
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
r.ld[i*2+0] = d0.ld[i*2+1];
|
||||
r.ld[i*2+1] = d0.ld[i*2+0];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vlongdouble vreva2_vl_vl(vlongdouble d0) {
|
||||
vlongdouble r;
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
r.ld[i*2+0] = d0.ld[(VECTLENDP/2-1-i)*2+0];
|
||||
r.ld[i*2+1] = d0.ld[(VECTLENDP/2-1-i)*2+1];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vlongdouble vadd_vl_vl_vl(vlongdouble x, vlongdouble y) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = x.ld[i] + y.ld[i]; return ret; }
|
||||
static INLINE vlongdouble vsub_vl_vl_vl(vlongdouble x, vlongdouble y) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = x.ld[i] - y.ld[i]; return ret; }
|
||||
static INLINE vlongdouble vmul_vl_vl_vl(vlongdouble x, vlongdouble y) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = x.ld[i] * y.ld[i]; return ret; }
|
||||
|
||||
static INLINE vlongdouble vneg_vl_vl(vlongdouble x) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = -x.ld[i]; return ret; }
|
||||
static INLINE vlongdouble vsubadd_vl_vl_vl(vlongdouble x, vlongdouble y) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = (i & 1) == 0 ? x.ld[i] - y.ld[i] : x.ld[i] + y.ld[i]; return ret; }
|
||||
static INLINE vlongdouble vmlsubadd_vl_vl_vl_vl(vlongdouble x, vlongdouble y, vlongdouble z) { return vsubadd_vl_vl_vl(vmul_vl_vl_vl(x, y), z); }
|
||||
static INLINE vlongdouble vposneg_vl_vl(vlongdouble x) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = (i & 1) == 0 ? x.ld[i] : -x.ld[i]; return ret; }
|
||||
static INLINE vlongdouble vnegpos_vl_vl(vlongdouble x) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = (i & 1) == 0 ? -x.ld[i] : x.ld[i]; return ret; }
|
||||
|
||||
static INLINE vlongdouble vload_vl_p(const long double *ptr) { return *(vlongdouble *)ptr; }
|
||||
static INLINE vlongdouble vloadu_vl_p(const long double *ptr) {
|
||||
vlongdouble vd;
|
||||
for(int i=0;i<VECTLENDP;i++) vd.ld[i] = ptr[i];
|
||||
return vd;
|
||||
}
|
||||
|
||||
static INLINE void vstore_v_p_vl(long double *ptr, vlongdouble v) { *(vlongdouble *)ptr = v; }
|
||||
static INLINE void vstoreu_v_p_vl(long double *ptr, vlongdouble v) {
|
||||
for(int i=0;i<VECTLENDP;i++) ptr[i] = v.ld[i];
|
||||
}
|
||||
static INLINE void vstream_v_p_vl(long double *ptr, vlongdouble v) { *(vlongdouble *)ptr = v; }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vl(long double *ptr, int offset, int step, vlongdouble v) {
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
*(ptr+(offset + step * i)*2 + 0) = v.ld[i*2+0];
|
||||
*(ptr+(offset + step * i)*2 + 1) = v.ld[i*2+1];
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vl(long double *ptr, int offset, int step, vlongdouble v) { vscatter2_v_p_i_i_vl(ptr, offset, step, v); }
|
||||
|
||||
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
|
||||
static INLINE vquad vcast_vq_q(Sleef_quad d) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = d; return ret; }
|
||||
|
||||
static INLINE vquad vrev21_vq_vq(vquad d0) {
|
||||
vquad r;
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
r.q[i*2+0] = d0.q[i*2+1];
|
||||
r.q[i*2+1] = d0.q[i*2+0];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vquad vreva2_vq_vq(vquad d0) {
|
||||
vquad r;
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
r.q[i*2+0] = d0.q[(VECTLENDP/2-1-i)*2+0];
|
||||
r.q[i*2+1] = d0.q[(VECTLENDP/2-1-i)*2+1];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vquad vadd_vq_vq_vq(vquad x, vquad y) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = x.q[i] + y.q[i]; return ret; }
|
||||
static INLINE vquad vsub_vq_vq_vq(vquad x, vquad y) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = x.q[i] - y.q[i]; return ret; }
|
||||
static INLINE vquad vmul_vq_vq_vq(vquad x, vquad y) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = x.q[i] * y.q[i]; return ret; }
|
||||
|
||||
static INLINE vquad vneg_vq_vq(vquad x) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = -x.q[i]; return ret; }
|
||||
static INLINE vquad vsubadd_vq_vq_vq(vquad x, vquad y) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = (i & 1) == 0 ? x.q[i] - y.q[i] : x.q[i] + y.q[i]; return ret; }
|
||||
static INLINE vquad vmlsubadd_vq_vq_vq_vq(vquad x, vquad y, vquad z) { return vsubadd_vq_vq_vq(vmul_vq_vq_vq(x, y), z); }
|
||||
static INLINE vquad vposneg_vq_vq(vquad x) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = (i & 1) == 0 ? x.q[i] : -x.q[i]; return ret; }
|
||||
static INLINE vquad vnegpos_vq_vq(vquad x) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = (i & 1) == 0 ? -x.q[i] : x.q[i]; return ret; }
|
||||
|
||||
static INLINE vquad vload_vq_p(const Sleef_quad *ptr) { return *(vquad *)ptr; }
|
||||
static INLINE vquad vloadu_vq_p(const Sleef_quad *ptr) {
|
||||
vquad vd;
|
||||
for(int i=0;i<VECTLENDP;i++) vd.q[i] = ptr[i];
|
||||
return vd;
|
||||
}
|
||||
|
||||
static INLINE void vstore_v_p_vq(Sleef_quad *ptr, vquad v) { *(vquad *)ptr = v; }
|
||||
static INLINE void vstoreu_v_p_vq(Sleef_quad *ptr, vquad v) {
|
||||
for(int i=0;i<VECTLENDP;i++) ptr[i] = v.q[i];
|
||||
}
|
||||
static INLINE void vstream_v_p_vq(Sleef_quad *ptr, vquad v) { *(vquad *)ptr = v; }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vq(Sleef_quad *ptr, int offset, int step, vquad v) {
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
*(ptr+(offset + step * i)*2 + 0) = v.q[i*2+0];
|
||||
*(ptr+(offset + step * i)*2 + 1) = v.q[i*2+1];
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vq(Sleef_quad *ptr, int offset, int step, vquad v) { vscatter2_v_p_i_i_vq(ptr, offset, step, v); }
|
||||
#endif
|
||||
@@ -0,0 +1,487 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2023.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
|
||||
#ifndef ENABLE_BUILTIN_MATH
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
#include <math.h>
|
||||
#endif
|
||||
|
||||
#define SQRT sqrt
|
||||
#define SQRTF sqrtf
|
||||
#define FMA fma
|
||||
#define FMAF fmaf
|
||||
#define RINT rint
|
||||
#define RINTF rintf
|
||||
#define TRUNC trunc
|
||||
#define TRUNCF truncf
|
||||
|
||||
#else
|
||||
|
||||
#define SQRT __builtin_sqrt
|
||||
#define SQRTF __builtin_sqrtf
|
||||
#define FMA __builtin_fma
|
||||
#define FMAF __builtin_fmaf
|
||||
#define RINT __builtin_rint
|
||||
#define RINTF __builtin_rintf
|
||||
#define TRUNC __builtin_trunc
|
||||
#define TRUNCF __builtin_truncf
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
#include "misc.h"
|
||||
#endif
|
||||
|
||||
#ifndef CONFIG
|
||||
#error CONFIG macro not defined
|
||||
#endif
|
||||
|
||||
#define ENABLE_DP
|
||||
//@#define ENABLE_DP
|
||||
#define ENABLE_SP
|
||||
//@#define ENABLE_SP
|
||||
|
||||
#if CONFIG == 2 || CONFIG == 3
|
||||
#define ENABLE_FMA_DP
|
||||
//@#define ENABLE_FMA_DP
|
||||
#define ENABLE_FMA_SP
|
||||
//@#define ENABLE_FMA_SP
|
||||
|
||||
#if defined(__AVX2__) || defined(__aarch64__) || defined(__arm__) || defined(__powerpc64__) || defined(__zarch__) || defined(__riscv) || CONFIG == 3
|
||||
#ifndef FP_FAST_FMA
|
||||
//@#ifndef FP_FAST_FMA
|
||||
#define FP_FAST_FMA
|
||||
//@#define FP_FAST_FMA
|
||||
#endif
|
||||
//@#endif
|
||||
#ifndef FP_FAST_FMAF
|
||||
//@#ifndef FP_FAST_FMAF
|
||||
#define FP_FAST_FMAF
|
||||
//@#define FP_FAST_FMAF
|
||||
#endif
|
||||
//@#endif
|
||||
#endif
|
||||
|
||||
#if (!defined(FP_FAST_FMA) || !defined(FP_FAST_FMAF)) && !defined(SLEEF_GENHEADER)
|
||||
#error FP_FAST_FMA or FP_FAST_FMAF not defined
|
||||
#endif
|
||||
|
||||
#define ISANAME "Pure C scalar with FMA"
|
||||
|
||||
#else // #if CONFIG == 2 || CONFIG == 3
|
||||
#define ISANAME "Pure C scalar"
|
||||
#endif // #if CONFIG == 2 || CONFIG == 3
|
||||
|
||||
#define LOG2VECTLENDP 0
|
||||
//@#define LOG2VECTLENDP 0
|
||||
#define VECTLENDP (1 << LOG2VECTLENDP)
|
||||
//@#define VECTLENDP (1 << LOG2VECTLENDP)
|
||||
#define LOG2VECTLENSP 0
|
||||
//@#define LOG2VECTLENSP 0
|
||||
#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
//@#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
|
||||
#define ACCURATE_SQRT
|
||||
//@#define ACCURATE_SQRT
|
||||
|
||||
#if defined(__SSE4_1__) || defined(__aarch64__) || CONFIG == 3
|
||||
#define FULL_FP_ROUNDING
|
||||
//@#define FULL_FP_ROUNDING
|
||||
#endif
|
||||
|
||||
#define DFTPRIORITY LOG2VECTLENDP
|
||||
|
||||
typedef uint64_t vmask;
|
||||
typedef uint32_t vopmask;
|
||||
typedef double vdouble;
|
||||
typedef int32_t vint;
|
||||
typedef float vfloat;
|
||||
typedef int32_t vint2;
|
||||
|
||||
typedef int64_t vint64;
|
||||
typedef uint64_t vuint64;
|
||||
|
||||
typedef Sleef_uint64_2t vquad;
|
||||
|
||||
#if CONFIG != 3
|
||||
typedef Sleef_quad vargquad;
|
||||
#else
|
||||
typedef Sleef_uint64_2t vargquad;
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
static INLINE int vavailability_i(int name) { return -1; }
|
||||
static INLINE void vprefetch_v_p(const void *ptr) {}
|
||||
|
||||
static INLINE int vtestallones_i_vo64(vopmask g) { return g; }
|
||||
static INLINE int vtestallones_i_vo32(vopmask g) { return g; }
|
||||
|
||||
//
|
||||
|
||||
static vint2 vloadu_vi2_p(int32_t *p) { return *p; }
|
||||
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { *p = v; }
|
||||
static vint vloadu_vi_p(int32_t *p) { return *p; }
|
||||
static void vstoreu_v_p_vi(int32_t *p, vint v) { *p = v; }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return m; }
|
||||
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return m; }
|
||||
static INLINE vopmask vcast_vo_i(int i) { return i ? -1 : 0; }
|
||||
static INLINE vmask vcast_vm_i_i(int h, int l) { return (((uint64_t)h) << 32) | (uint32_t)l; }
|
||||
|
||||
static INLINE vmask vcast_vm_i64(int64_t i) { return (int64_t)i; }
|
||||
static INLINE vmask vcast_vm_u64(uint64_t i) { return i; }
|
||||
|
||||
static INLINE vmask vcastu_vm_vi(vint vi) { return ((uint64_t)vi) << 32; }
|
||||
static INLINE vint vcastu_vi_vm(vmask vm) { return (int32_t)(vm >> 32); }
|
||||
|
||||
static INLINE vdouble vcast_vd_d(double d) { return d; }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vopmask vand_vo_vo_vo (vopmask x, vopmask y) { return x & y; }
|
||||
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return y & ~x; }
|
||||
static INLINE vopmask vor_vo_vo_vo (vopmask x, vopmask y) { return x | y; }
|
||||
static INLINE vopmask vxor_vo_vo_vo (vopmask x, vopmask y) { return x ^ y; }
|
||||
|
||||
static INLINE vmask vand_vm_vm_vm (vmask x, vmask y) { return x & y; }
|
||||
static INLINE vmask vandnot_vm_vm_vm (vmask x, vmask y) { return y & ~x; }
|
||||
static INLINE vmask vor_vm_vm_vm (vmask x, vmask y) { return x | y; }
|
||||
static INLINE vmask vxor_vm_vm_vm (vmask x, vmask y) { return x ^ y; }
|
||||
|
||||
static INLINE vmask vcast_vm_vo(vopmask o) { return (vmask)o | (((vmask)o) << 32); }
|
||||
|
||||
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vcast_vm_vo(x) & y; }
|
||||
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return y & ~vcast_vm_vo(x); }
|
||||
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vcast_vm_vo(x) | y; }
|
||||
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vcast_vm_vo(x) ^ y; }
|
||||
|
||||
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vcast_vm_vo(x) & y; }
|
||||
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return y & ~vcast_vm_vo(x); }
|
||||
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vcast_vm_vo(x) | y; }
|
||||
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vcast_vm_vo(x) ^ y; }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vd_vd (vopmask o, vdouble x, vdouble y) { return o ? x : y; }
|
||||
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y) { return o ? x : y; }
|
||||
|
||||
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) { return o ? v1 : v0; }
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
|
||||
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
|
||||
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE vdouble vcast_vd_vi(vint vi) { return vi; }
|
||||
static INLINE vint vcast_vi_i(int j) { return j; }
|
||||
|
||||
#ifdef FULL_FP_ROUNDING
|
||||
static INLINE vint vrint_vi_vd(vdouble d) { return (int32_t)RINT(d); }
|
||||
static INLINE vdouble vrint_vd_vd(vdouble vd) { return RINT(vd); }
|
||||
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return TRUNC(vd); }
|
||||
static INLINE vint vtruncate_vi_vd(vdouble vd) { return (int32_t)TRUNC(vd); }
|
||||
#else
|
||||
static INLINE vint vrint_vi_vd(vdouble a) {
|
||||
a += a > 0 ? 0.5 : -0.5;
|
||||
uint64_t vx;
|
||||
memcpy(&vx, &a, sizeof(vx));
|
||||
vx -= 1 & (int)a;
|
||||
memcpy(&a, &vx, sizeof(a));
|
||||
return a;
|
||||
}
|
||||
static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); }
|
||||
static INLINE vint vtruncate_vi_vd(vdouble vd) { return vd; }
|
||||
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); }
|
||||
#endif
|
||||
|
||||
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return x == y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return x + y; }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { vmask vm; memcpy(&vm, &vd, sizeof(vm)); return vm; }
|
||||
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { vdouble vd; memcpy(&vd, &vm, sizeof(vd)); return vd; }
|
||||
|
||||
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return x + y; }
|
||||
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return x - y; }
|
||||
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return x * y; }
|
||||
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return x / y; }
|
||||
static INLINE vdouble vrec_vd_vd(vdouble x) { return 1 / x; }
|
||||
|
||||
static INLINE vdouble vabs_vd_vd(vdouble d) {
|
||||
uint64_t vx;
|
||||
memcpy(&vx, &d, sizeof(vx));
|
||||
vx &= UINT64_C(0x7fffffffffffffff);
|
||||
memcpy(&d, &vx, sizeof(d));
|
||||
return d;
|
||||
}
|
||||
static INLINE vdouble vneg_vd_vd(vdouble d) { return -d; }
|
||||
|
||||
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return x > y ? x : y; }
|
||||
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return x < y ? x : y; }
|
||||
|
||||
#ifndef ENABLE_FMA_DP
|
||||
static INLINE vdouble vmla_vd_vd_vd_vd (vdouble x, vdouble y, vdouble z) { return x * y + z; }
|
||||
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return x * y - z; }
|
||||
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return -x * y + z; }
|
||||
#else
|
||||
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(x, y, z); }
|
||||
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(x, y, -z); }
|
||||
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(-x, y, z); }
|
||||
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(x, y, z); }
|
||||
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(x, y, z); }
|
||||
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(x, y, -z); }
|
||||
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(-x, y, z); }
|
||||
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(-x, y, -z); }
|
||||
#endif
|
||||
|
||||
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return x == y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return x != y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return x < y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return x <= y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return x > y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return x >= y ? ~(uint32_t)0 : 0; }
|
||||
|
||||
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return x + y; }
|
||||
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return x - y; }
|
||||
static INLINE vint vneg_vi_vi (vint x) { return - x; }
|
||||
|
||||
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return x & y; }
|
||||
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return y & ~x; }
|
||||
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return x | y; }
|
||||
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return x ^ y; }
|
||||
|
||||
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return x & y; }
|
||||
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return y & ~x; }
|
||||
|
||||
static INLINE vint vsll_vi_vi_i(vint x, int c) { return (uint32_t)x << c; }
|
||||
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return (uint32_t)x >> c; }
|
||||
static INLINE vint vsra_vi_vi_i(vint x, int c) { return x >> c; }
|
||||
|
||||
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return x == y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return x > y ? ~(uint32_t)0 : 0; }
|
||||
|
||||
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return m ? x : y; }
|
||||
|
||||
static INLINE vopmask visinf_vo_vd(vdouble d) { return (d == SLEEF_INFINITY || d == -SLEEF_INFINITY) ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vispinf_vo_vd(vdouble d) { return d == SLEEF_INFINITY ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask visminf_vo_vd(vdouble d) { return d == -SLEEF_INFINITY ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask visnan_vo_vd(vdouble d) { return d != d ? ~(uint32_t)0 : 0; }
|
||||
|
||||
static INLINE vdouble vsqrt_vd_vd(vdouble d) { return SQRT(d); }
|
||||
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return SQRTF(x); }
|
||||
|
||||
static INLINE double vcast_d_vd(vdouble v) { return v; }
|
||||
|
||||
static INLINE vdouble vload_vd_p(const double *ptr) { return *ptr; }
|
||||
static INLINE vdouble vloadu_vd_p(const double *ptr) { return *ptr; }
|
||||
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return ptr[vi]; }
|
||||
|
||||
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { *ptr = v; }
|
||||
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { *ptr = v; }
|
||||
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { *ptr = v; }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vint2 vcast_vi2_vm(vmask vm) { return (int32_t)vm; }
|
||||
static INLINE vmask vcast_vm_vi2(vint2 vi) { return (uint32_t)vi; }
|
||||
|
||||
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return (int32_t)vi; }
|
||||
static INLINE vint2 vcast_vi2_i(int j) { return j; }
|
||||
|
||||
#ifdef FULL_FP_ROUNDING
|
||||
static INLINE vint2 vrint_vi2_vf(vfloat d) { return (int)RINTF(d); }
|
||||
static INLINE vfloat vrint_vf_vf(vfloat vd) { return RINTF(vd); }
|
||||
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return TRUNCF(vd); }
|
||||
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return (int32_t)TRUNCF(vf); }
|
||||
#else
|
||||
static INLINE vint2 vrint_vi2_vf(vfloat a) {
|
||||
a += a > 0 ? 0.5f : -0.5f;
|
||||
uint32_t vu[1];
|
||||
memcpy(vu, &a, sizeof(vu));
|
||||
vu[0] -= 1 & (int)a;
|
||||
memcpy(&a, vu, sizeof(a));
|
||||
return (int32_t)a;
|
||||
}
|
||||
static INLINE vfloat vrint_vf_vf(vfloat vd) { return vcast_vf_vi2(vrint_vi2_vf(vd)); }
|
||||
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vf; }
|
||||
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
|
||||
#endif
|
||||
|
||||
static INLINE vfloat vcast_vf_f(float f) { return f; }
|
||||
static INLINE vmask vreinterpret_vm_vf(vfloat f) { vfloat vf[2] = { f, 0 }; vmask vm; memcpy(&vm, &vf, sizeof(vm)); return vm; }
|
||||
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { vfloat vf[2]; memcpy(&vf, &vm, sizeof(vf)); return vf[0]; }
|
||||
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { vfloat vf; memcpy(&vf, &vi, sizeof(vf)); return vf; }
|
||||
static INLINE vint2 vreinterpret_vi2_vf(vfloat f) { vint2 vi2; memcpy(&vi2, &f, sizeof(vi2)); return vi2; }
|
||||
|
||||
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return x + y; }
|
||||
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return x - y; }
|
||||
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return x * y; }
|
||||
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return x / y; }
|
||||
static INLINE vfloat vrec_vf_vf (vfloat x) { return 1 / x; }
|
||||
|
||||
static INLINE vfloat vabs_vf_vf(vfloat x) {
|
||||
int32_t vi[1];
|
||||
memcpy(vi, &x, sizeof(vi));
|
||||
vi[0] &= 0x7fffffff;
|
||||
memcpy(&x, vi, sizeof(x));
|
||||
return x;
|
||||
}
|
||||
static INLINE vfloat vneg_vf_vf(vfloat x) { return -x; }
|
||||
|
||||
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return x > y ? x : y; }
|
||||
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return x < y ? x : y; }
|
||||
|
||||
#ifndef ENABLE_FMA_SP
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { return x * y + z; }
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return - x * y + z; }
|
||||
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return x * y - z; }
|
||||
#else
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(x, y, z); }
|
||||
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(x, y, -z); }
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(-x, y, z); }
|
||||
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(x, y, z); }
|
||||
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(x, y, z); }
|
||||
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(x, y, -z); }
|
||||
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(-x, y, z); }
|
||||
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(-x, y, -z); }
|
||||
#endif
|
||||
|
||||
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return x == y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return x != y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return x < y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return x <= y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return x > y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return x >= y ? ~(uint32_t)0 : 0; }
|
||||
|
||||
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return x + y; }
|
||||
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return x - y; }
|
||||
static INLINE vint2 vneg_vi2_vi2(vint2 x) { return -x; }
|
||||
|
||||
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return x & y; }
|
||||
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return y & ~x; }
|
||||
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return x | y; }
|
||||
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return x ^ y; }
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return o ? x : y; }
|
||||
static INLINE vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) { return o ? v1 : v0; }
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vcast_vm_vo(x) & y; }
|
||||
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return y & ~vcast_vm_vo(x); }
|
||||
|
||||
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) {
|
||||
return x << c;
|
||||
}
|
||||
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) {
|
||||
return ((uint32_t)x) >> c;
|
||||
}
|
||||
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) {
|
||||
return x >> c;
|
||||
}
|
||||
|
||||
static INLINE vopmask visinf_vo_vf (vfloat d) { return (d == SLEEF_INFINITYf || d == -SLEEF_INFINITYf) ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vispinf_vo_vf(vfloat d) { return d == SLEEF_INFINITYf ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask visminf_vo_vf(vfloat d) { return d == -SLEEF_INFINITYf ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask visnan_vo_vf (vfloat d) { return d != d ? ~(uint32_t)0 : 0; }
|
||||
|
||||
static INLINE vopmask veq_vo_vi2_vi2 (vint2 x, vint2 y) { return (int32_t)x == (int32_t)y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vgt_vo_vi2_vi2 (vint2 x, vint2 y) { return (int32_t)x > (int32_t)y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return (int32_t)x == (int32_t)y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return (int32_t)x > (int32_t)y ? ~(uint32_t)0 : 0; }
|
||||
|
||||
static INLINE float vcast_f_vf(vfloat v) { return v; }
|
||||
|
||||
static INLINE vfloat vload_vf_p(const float *ptr) { return *ptr; }
|
||||
static INLINE vfloat vloadu_vf_p(const float *ptr) { return *ptr; }
|
||||
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi) { return ptr[vi]; }
|
||||
|
||||
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { *ptr = v; }
|
||||
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { *ptr = v; }
|
||||
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { *ptr = v; }
|
||||
|
||||
//
|
||||
|
||||
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
static vquad loadu_vq_p(void *p) {
|
||||
vquad vq;
|
||||
memcpy(8 + (char *)&vq, p, 8);
|
||||
memcpy((char *)&vq, 8 + p, 8);
|
||||
return vq;
|
||||
}
|
||||
|
||||
static INLINE vquad cast_vq_aq(vargquad aq) {
|
||||
vquad vq;
|
||||
memcpy(8 + (char *)&vq, (char *)&aq, 8);
|
||||
memcpy((char *)&vq, 8 + (char *)&aq, 8);
|
||||
return vq;
|
||||
}
|
||||
|
||||
static INLINE vargquad cast_aq_vq(vquad vq) {
|
||||
vargquad aq;
|
||||
memcpy(8 + (char *)&aq, (char *)&vq, 8);
|
||||
memcpy((char *)&aq, 8 + (char *)&vq, 8);
|
||||
return aq;
|
||||
}
|
||||
#else
|
||||
static vquad loadu_vq_p(void *p) {
|
||||
vquad vq;
|
||||
memcpy(&vq, p, sizeof(vq));
|
||||
return vq;
|
||||
}
|
||||
|
||||
static INLINE vquad cast_vq_aq(vargquad aq) {
|
||||
vquad vq;
|
||||
memcpy(&vq, &aq, sizeof(vq));
|
||||
return vq;
|
||||
}
|
||||
|
||||
static INLINE vargquad cast_aq_vq(vquad vq) {
|
||||
vargquad aq;
|
||||
memcpy(&aq, &vq, sizeof(aq));
|
||||
return aq;
|
||||
}
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
static INLINE int vtestallzeros_i_vo64(vopmask g) { return !g ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) { return o ? x : y; }
|
||||
|
||||
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return (int64_t)x - (int64_t)y; }
|
||||
static INLINE vmask vneg64_vm_vm(vmask x) { return -(int64_t)x; }
|
||||
|
||||
#define vsll64_vm_vm_i(x, c) ((uint64_t)(x) << (c))
|
||||
#define vsrl64_vm_vm_i(x, c) ((uint64_t)(x) >> (c))
|
||||
//@#define vsll64_vm_vm_i(x, c) ((uint64_t)(x) << (c))
|
||||
//@#define vsrl64_vm_vm_i(x, c) ((uint64_t)(x) >> (c))
|
||||
|
||||
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return (int64_t)x > (int64_t)y ? ~(uint32_t)0 : 0; }
|
||||
|
||||
static INLINE vmask vcast_vm_vi(vint vi) { return vi; }
|
||||
static INLINE vint vcast_vi_vm(vmask vm) { return vm; }
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; }
|
||||
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return m; }
|
||||
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return v; }
|
||||
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return m; }
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,462 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#if CONFIG == 140 || CONFIG == 141 || CONFIG == 150 || CONFIG == 151
|
||||
|
||||
#if !defined(__VX__) && !defined(SLEEF_GENHEADER)
|
||||
#error This helper is for IBM s390x.
|
||||
#endif
|
||||
|
||||
#if __ARCH__ < 12 && !defined(SLEEF_GENHEADER)
|
||||
#error Please specify -march=z14 or higher.
|
||||
#endif
|
||||
|
||||
#else
|
||||
#error CONFIG macro invalid or not defined
|
||||
#endif
|
||||
|
||||
#define ENABLE_DP
|
||||
//@#define ENABLE_DP
|
||||
#define LOG2VECTLENDP 1
|
||||
//@#define LOG2VECTLENDP 1
|
||||
#define VECTLENDP (1 << LOG2VECTLENDP)
|
||||
//@#define VECTLENDP (1 << LOG2VECTLENDP)
|
||||
|
||||
#define ENABLE_SP
|
||||
//@#define ENABLE_SP
|
||||
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
|
||||
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
|
||||
#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
//@#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
|
||||
#if CONFIG == 140 || CONFIG == 150
|
||||
#define ENABLE_FMA_DP
|
||||
//@#define ENABLE_FMA_DP
|
||||
#define ENABLE_FMA_SP
|
||||
//@#define ENABLE_FMA_SP
|
||||
#endif
|
||||
|
||||
#define ACCURATE_SQRT
|
||||
//@#define ACCURATE_SQRT
|
||||
#define FULL_FP_ROUNDING
|
||||
//@#define FULL_FP_ROUNDING
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
#ifndef SLEEF_VECINTRIN_H_INCLUDED
|
||||
#include <vecintrin.h>
|
||||
#define SLEEF_VECINTRIN_H_INCLUDED
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <math.h>
|
||||
#include "misc.h"
|
||||
#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
typedef __vector unsigned long long vmask;
|
||||
typedef __vector unsigned long long vopmask;
|
||||
|
||||
typedef __vector double vdouble;
|
||||
typedef __vector int vint;
|
||||
|
||||
typedef __vector float vfloat;
|
||||
typedef __vector int vint2;
|
||||
|
||||
typedef __vector long long vint64;
|
||||
typedef __vector unsigned long long vuint64;
|
||||
|
||||
typedef struct {
|
||||
vmask x, y;
|
||||
} vquad;
|
||||
|
||||
typedef vquad vargquad;
|
||||
|
||||
//
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
|
||||
static INLINE int vavailability_i(int n) {
|
||||
if (n == 1 || n == 2) {
|
||||
return vec_max((vdouble) {n, n}, (vdouble) {n, n})[0] != 0;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
#if CONFIG == 140 || CONFIG == 141
|
||||
#define ISANAME "VXE"
|
||||
#else
|
||||
#define ISANAME "VXE2"
|
||||
#endif
|
||||
|
||||
#define DFTPRIORITY 14
|
||||
|
||||
#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
static INLINE void vprefetch_v_p(const void *ptr) { }
|
||||
|
||||
static vint2 vloadu_vi2_p(int32_t *p) { return (vint2) { p[0], p[1], p[2], p[3] }; }
|
||||
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { p[0] = v[0]; p[1] = v[1]; p[2] = v[2]; p[3] = v[3]; }
|
||||
static vint vloadu_vi_p(int32_t *p) { return (vint) { p[0], p[1] }; }
|
||||
static void vstoreu_v_p_vi(int32_t *p, vint v) { p[0] = v[0]; p[1] = v[1]; }
|
||||
|
||||
static INLINE vdouble vload_vd_p(const double *p) { return (vdouble) { p[0], p[1] }; }
|
||||
static INLINE void vstore_v_p_vd(double *p, vdouble v) { p[0] = v[0]; p[1] = v[1]; }
|
||||
static INLINE vdouble vloadu_vd_p(const double *p) { return (vdouble) { p[0], p[1] }; }
|
||||
static INLINE void vstoreu_v_p_vd(double *p, vdouble v) { p[0] = v[0]; p[1] = v[1]; }
|
||||
|
||||
static INLINE vfloat vload_vf_p(const float *p) { return (vfloat) { p[0], p[1], p[2], p[3] }; }
|
||||
static INLINE void vstore_v_p_vf(float *p, vfloat v) { p[0] = v[0]; p[1] = v[1]; p[2] = v[2]; p[3] = v[3]; }
|
||||
static INLINE void vscatter2_v_p_i_i_vf(float *p, int offset, int step, vfloat v) {
|
||||
*(p+(offset + step * 0)*2 + 0) = v[0];
|
||||
*(p+(offset + step * 0)*2 + 1) = v[1];
|
||||
*(p+(offset + step * 1)*2 + 0) = v[2];
|
||||
*(p+(offset + step * 1)*2 + 1) = v[3];
|
||||
}
|
||||
|
||||
static INLINE vfloat vloadu_vf_p(const float *p) { return (vfloat) { p[0], p[1], p[2], p[3] }; }
|
||||
static INLINE void vstoreu_v_p_vf(float *p, vfloat v) { p[0] = v[0]; p[1] = v[1]; p[2] = v[2]; p[3] = v[3]; }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vd(double *p, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&p[2*offset]), v); }
|
||||
|
||||
static INLINE vdouble vgather_vd_p_vi(const double *p, vint vi) {
|
||||
return ((vdouble) { p[vi[0]], p[vi[1]] });
|
||||
}
|
||||
|
||||
static INLINE vfloat vgather_vf_p_vi2(const float *p, vint2 vi2) {
|
||||
return ((vfloat) { p[vi2[0]], p[vi2[1]], p[vi2[2]], p[vi2[3]] });
|
||||
}
|
||||
|
||||
static INLINE vopmask vcast_vo_i(int i) { return (vopmask) { i ? (long long)-1 : 0, i ? (long long)-1 : 0 }; }
|
||||
static INLINE vint vcast_vi_i(int i) { return (vint) { i, i }; }
|
||||
static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i }; }
|
||||
static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f }; }
|
||||
static INLINE vdouble vcast_vd_d(double d) { return (vdouble) { d, d }; }
|
||||
|
||||
static INLINE vdouble vcast_vd_vi(vint vi) { return (vdouble) { vi[0], vi[1] }; }
|
||||
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return (vfloat) { vi[0], vi[1], vi[2], vi[3] }; }
|
||||
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return __builtin_s390_vfidb(vd, 4, 5); }
|
||||
static INLINE vdouble vrint_vd_vd(vdouble vd) { return __builtin_s390_vfidb(vd, 4, 4); }
|
||||
|
||||
static INLINE vint vrint_vi_vd(vdouble vd) {
|
||||
vd = vrint_vd_vd(vd);
|
||||
return (vint) { vd[0], vd[1] };
|
||||
}
|
||||
static INLINE vint vtruncate_vi_vd(vdouble vd) { return (vint) { vd[0], vd[1] }; }
|
||||
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return (vint) { vf[0], vf[1], vf[2], vf[3] }; }
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return (vmask)vd; }
|
||||
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return (vdouble)vm; }
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; }
|
||||
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (vfloat)vm; }
|
||||
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return (vfloat)vi; }
|
||||
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return (vint2)vf; }
|
||||
|
||||
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return x + y; }
|
||||
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return x - y; }
|
||||
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return x * y; }
|
||||
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return x / y; }
|
||||
static INLINE vdouble vrec_vd_vd(vdouble x) { return 1 / x; }
|
||||
static INLINE vdouble vneg_vd_vd(vdouble d) { return -d; }
|
||||
|
||||
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return x + y; }
|
||||
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return x - y; }
|
||||
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return x * y; }
|
||||
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return x / y; }
|
||||
static INLINE vfloat vrec_vf_vf(vfloat x) { return 1 / x; }
|
||||
static INLINE vfloat vneg_vf_vf(vfloat d) { return -d; }
|
||||
|
||||
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return x & y; }
|
||||
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return y & ~x; }
|
||||
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return x | y; }
|
||||
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return x ^ y; }
|
||||
|
||||
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return x & y; }
|
||||
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return y & ~x; }
|
||||
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return x | y; }
|
||||
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return x ^ y; }
|
||||
|
||||
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return x & y; }
|
||||
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return y & ~x; }
|
||||
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return x | y; }
|
||||
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return x ^ y; }
|
||||
|
||||
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return x & y; }
|
||||
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return y & ~x; }
|
||||
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return x | y; }
|
||||
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return x ^ y; }
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return vec_sel(y, x, o); }
|
||||
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return vec_sel(y, x, (__vector unsigned int)o); }
|
||||
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y) { return vec_sel(y, x, (__vector unsigned int)o); }
|
||||
|
||||
static INLINE int vtestallones_i_vo32(vopmask g) { return vec_all_ne((vint2)g, (vint2 ) { 0, 0, 0, 0 }); }
|
||||
static INLINE int vtestallones_i_vo64(vopmask g) { return vec_all_ne((__vector unsigned long long)g, (__vector unsigned long long) { 0, 0 }); }
|
||||
|
||||
static INLINE vopmask vcast_vo32_vo64(vopmask g) { return (vopmask)(vint) { g[0] != 0 ? -1 : 0, g[1] != 0 ? -1 : 0, 0, 0 }; }
|
||||
static INLINE vopmask vcast_vo64_vo32(vopmask g) { return (vopmask) { ((vint)g)[0] != 0 ? 0xffffffffffffffffLL : 0, ((vint)g)[1] != 0 ? 0xffffffffffffffffLL : 0 }; }
|
||||
|
||||
static INLINE vmask vcast_vm_i_i(int h, int l) { return (vmask)(vint){ h, l, h, l }; }
|
||||
static INLINE vmask vcast_vm_i64(int64_t i) { return (vmask)(vint64){ i, i }; }
|
||||
static INLINE vmask vcast_vm_u64(uint64_t i) { return (vmask)(vuint64){ i, i }; }
|
||||
|
||||
static INLINE vmask vcastu_vm_vi(vint vi) { return (vmask)(vint2){ vi[0], 0, vi[1], 0 }; }
|
||||
static INLINE vint vcastu_vi_vm(vmask vi2) { return (vint){ vi2[0] >> 32, vi2[1] >> 32 }; }
|
||||
|
||||
static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) { return (vint){ vi2[0], vi2[1] }; }
|
||||
static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) { return (vint2){ vi[0], vi[1], 0, 0 }; }
|
||||
|
||||
static INLINE vdouble vrev21_vd_vd(vdouble vd) { return (vdouble) { vd[1], vd[0] }; }
|
||||
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vd; }
|
||||
static INLINE vfloat vrev21_vf_vf(vfloat vd) { return (vfloat) { vd[1], vd[0], vd[3], vd[2] }; }
|
||||
static INLINE vfloat vreva2_vf_vf(vfloat vd) { return (vfloat) { vd[2], vd[3], vd[0], vd[1] }; }
|
||||
|
||||
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
|
||||
return (vopmask) { x[0] == y[0] ? 0xffffffffffffffffLL : 0, x[1] == y[1] ? 0xffffffffffffffffLL : 0 };
|
||||
}
|
||||
|
||||
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) {
|
||||
return (vmask)((__vector long long)x + (__vector long long)y);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
#define PNMASK ((vdouble) { +0.0, -0.0 })
|
||||
#define NPMASK ((vdouble) { -0.0, +0.0 })
|
||||
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
|
||||
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
|
||||
|
||||
static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
|
||||
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
|
||||
static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
|
||||
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vdouble vabs_vd_vd(vdouble d) { return vec_abs(d); }
|
||||
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
|
||||
|
||||
#if CONFIG == 140 || CONFIG == 150
|
||||
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_madd(x, y, z); }
|
||||
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_msub(x, y, z); }
|
||||
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_nmsub(x, y, z); }
|
||||
#else
|
||||
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
|
||||
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
|
||||
#endif
|
||||
|
||||
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
|
||||
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_madd(x, y, z); }
|
||||
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_madd(x, y, z); }
|
||||
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_msub(x, y, z); }
|
||||
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_nmsub(x, y, z); }
|
||||
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_nmadd(x, y, z); }
|
||||
|
||||
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vadd_vf_vf_vf(x, vnegpos_vf_vf(y)); }
|
||||
|
||||
#if CONFIG == 140 || CONFIG == 150
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { return __builtin_s390_vfmasb(x, y, z); }
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_nmsub(x, y, z); }
|
||||
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return __builtin_s390_vfmssb(x, y, z); }
|
||||
static INLINE vfloat vfma_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { return __builtin_s390_vfmasb(x, y, z); }
|
||||
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return __builtin_s390_vfmasb(x, y, z); }
|
||||
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return __builtin_s390_vfmssb(x, y, z); }
|
||||
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_nmsub(x, y, z); }
|
||||
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_nmadd(x, y, z); }
|
||||
#else
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }
|
||||
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
|
||||
#endif
|
||||
|
||||
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
|
||||
return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
|
||||
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
|
||||
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static INLINE vopmask vnot_vo_vo(vopmask o) { return ~o; }
|
||||
|
||||
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmpeq(x, y); }
|
||||
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vnot_vo_vo(vec_cmpeq(x, y)); }
|
||||
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmplt(x, y); }
|
||||
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmple(x, y); }
|
||||
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmpgt(x, y); }
|
||||
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmpge(x, y); }
|
||||
|
||||
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return x + y; }
|
||||
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return x - y; }
|
||||
static INLINE vint vneg_vi_vi(vint e) { return -e; }
|
||||
|
||||
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return x & y; }
|
||||
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return y & ~x; }
|
||||
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return x | y; }
|
||||
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return x ^ y; }
|
||||
|
||||
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return vreinterpretFirstHalf_vi_vi2((vint2)x) & y; }
|
||||
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return vec_andc(y, vreinterpretFirstHalf_vi_vi2((vint2)x)); }
|
||||
|
||||
static INLINE vint vsll_vi_vi_i(vint x, int c) { return (vint)(((__vector unsigned int)x) << (__vector unsigned int){c, c, c, c}); }
|
||||
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return (vint)(((__vector unsigned int)x) >> (__vector unsigned int){c, c, c, c}); }
|
||||
static INLINE vint vsra_vi_vi_i(vint x, int c) { return x >> (__vector int){c, c, c, c}; }
|
||||
|
||||
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return vec_cmpeq(x, y); }
|
||||
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return vec_cmpgt(x, y); }
|
||||
|
||||
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return (vopmask)vreinterpretFirstHalf_vi2_vi(vec_cmpeq(x, y)); }
|
||||
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return (vopmask)vreinterpretFirstHalf_vi2_vi(vec_cmpgt(x, y));}
|
||||
|
||||
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
|
||||
return vor_vi_vi_vi(vand_vi_vi_vi(vreinterpretFirstHalf_vi_vi2((vint2)m), x),
|
||||
vandnot_vi_vi_vi(vreinterpretFirstHalf_vi_vi2((vint2)m), y));
|
||||
}
|
||||
|
||||
static INLINE vopmask visinf_vo_vd(vdouble d) { return (vopmask)(vec_cmpeq(vabs_vd_vd(d), vcast_vd_d(SLEEF_INFINITY))); }
|
||||
static INLINE vopmask vispinf_vo_vd(vdouble d) { return (vopmask)(vec_cmpeq(d, vcast_vd_d(SLEEF_INFINITY))); }
|
||||
static INLINE vopmask visminf_vo_vd(vdouble d) { return (vopmask)(vec_cmpeq(d, vcast_vd_d(-SLEEF_INFINITY))); }
|
||||
static INLINE vopmask visnan_vo_vd(vdouble d) { return (vopmask)(vnot_vo_vo(vec_cmpeq(d, d))); }
|
||||
|
||||
static INLINE double vcast_d_vd(vdouble v) { return v[0]; }
|
||||
static INLINE float vcast_f_vf(vfloat v) { return v[0]; }
|
||||
|
||||
static INLINE void vstream_v_p_vd(double *p, vdouble v) { vstore_v_p_vd(p, v); }
|
||||
static INLINE void vsscatter2_v_p_i_i_vd(double *p, int offset, int step, vdouble v) { vscatter2_v_p_i_i_vd(p, offset, step, v); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
|
||||
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; }
|
||||
static INLINE vmask vcast_vm_vi2(vint2 vi) { return (vmask)vi; }
|
||||
|
||||
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return x + y; }
|
||||
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return x - y; }
|
||||
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return -e; }
|
||||
|
||||
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return x & y; }
|
||||
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return y & ~x; }
|
||||
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return x | y; }
|
||||
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return x ^ y; }
|
||||
|
||||
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)x & y; }
|
||||
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return y & ~(vint2)x; }
|
||||
|
||||
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return (vint2)(((__vector unsigned int)x) << (__vector unsigned int){c, c, c, c}); }
|
||||
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return (vint2)(((__vector unsigned int)x) >> (__vector unsigned int){c, c, c, c}); }
|
||||
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return x >> (__vector int){c, c, c, c}; }
|
||||
|
||||
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return (vopmask)vec_cmpeq(x, y); }
|
||||
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return (vopmask)vec_cmpgt(x, y); }
|
||||
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return vec_cmpeq(x, y); }
|
||||
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return vec_cmpgt(x, y); }
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vf(float *p, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(p, offset, step, v); }
|
||||
static INLINE void vstream_v_p_vf(float *p, vfloat v) { vstore_v_p_vf(p, v); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vdouble vsqrt_vd_vd(vdouble d) { return vec_sqrt(d); }
|
||||
|
||||
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return vec_max(x, y); }
|
||||
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return vec_min(x, y); }
|
||||
|
||||
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)vec_cmpeq(x, y); }
|
||||
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)vnot_vo_vo(vec_cmpeq(x, y)); }
|
||||
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)vec_cmplt(x, y); }
|
||||
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)vec_cmple(x, y); }
|
||||
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)vec_cmpgt(x, y); }
|
||||
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)vec_cmpge(x, y); }
|
||||
|
||||
static INLINE vfloat vabs_vf_vf(vfloat f) { return vec_abs(f); }
|
||||
static INLINE vfloat vrint_vf_vf(vfloat vf) { return __builtin_s390_vfisb(vf, 4, 4); }
|
||||
static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return __builtin_s390_vfisb(vf, 4, 5); }
|
||||
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return vec_max(x, y); }
|
||||
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return vec_min(x, y); }
|
||||
|
||||
static INLINE vfloat vsqrt_vf_vf(vfloat d) { return vec_sqrt(d); }
|
||||
|
||||
static INLINE vopmask visinf_vo_vf (vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask visnan_vo_vf (vfloat d) { return vneq_vo_vf_vf(d, d); }
|
||||
|
||||
static INLINE vint2 vrint_vi2_vf(vfloat vf) {
|
||||
vf = vrint_vf_vf(vf);
|
||||
return (vint) { vf[0], vf[1], vf[2], vf[3] };
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static vquad loadu_vq_p(void *p) {
|
||||
vquad vq;
|
||||
memcpy(&vq, p, VECTLENDP * 16);
|
||||
return vq;
|
||||
}
|
||||
|
||||
static INLINE vquad cast_vq_aq(vargquad aq) {
|
||||
vquad m = { aq.y, aq.x };
|
||||
return m;
|
||||
}
|
||||
static INLINE vargquad cast_aq_vq(vquad vq) {
|
||||
vargquad a = { vq.y, vq.x };
|
||||
return a;
|
||||
}
|
||||
|
||||
static INLINE int vtestallzeros_i_vo64(vopmask g) {
|
||||
return vec_all_eq((__vector signed long long)g, (__vector signed long long){ 0, 0 });
|
||||
}
|
||||
|
||||
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) {
|
||||
return (vmask)vec_sel((__vector signed long long)y, (__vector signed long long)x, (__vector __bool long long)o);
|
||||
}
|
||||
|
||||
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) {
|
||||
return (vmask)((__vector signed long long)x - (__vector signed long long)y);
|
||||
}
|
||||
|
||||
static INLINE vmask vneg64_vm_vm(vmask x) {
|
||||
return (vmask)((__vector signed long long) {0, 0} - (__vector signed long long)x);
|
||||
}
|
||||
|
||||
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
|
||||
return (vopmask)vec_cmpgt((__vector signed long long)x, (__vector signed long long)y);
|
||||
}
|
||||
|
||||
#define vsll64_vm_vm_i(x, c) ((vmask)((__vector unsigned long long)x << (__vector unsigned long long) { c, c }))
|
||||
#define vsrl64_vm_vm_i(x, c) ((vmask)((__vector unsigned long long)x >> (__vector unsigned long long) { c, c }))
|
||||
|
||||
static INLINE vint vcast_vi_vm(vmask vm) {
|
||||
return (vint) { vm[0], vm[1] };
|
||||
}
|
||||
|
||||
static INLINE vmask vcast_vm_vi(vint vi) {
|
||||
return (vmask) (__vector signed long long) { vi[0], vi[1] };
|
||||
}
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return (vmask)v; }
|
||||
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return (vint64)m; }
|
||||
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return (vmask)v; }
|
||||
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return (vuint64)m; }
|
||||
@@ -0,0 +1,517 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#if CONFIG == 2
|
||||
|
||||
#if !defined(__SSE2__) && !defined(SLEEF_GENHEADER)
|
||||
#error Please specify -msse2.
|
||||
#endif
|
||||
|
||||
#elif CONFIG == 3
|
||||
|
||||
#if (!defined(__SSE2__) || !defined(__SSE3__)) && !defined(SLEEF_GENHEADER)
|
||||
#error Please specify -msse2 and -msse3
|
||||
#endif
|
||||
|
||||
#elif CONFIG == 4
|
||||
|
||||
#if (!defined(__SSE2__) || !defined(__SSE3__) || !defined(__SSE4_1__)) && !defined(SLEEF_GENHEADER)
|
||||
#error Please specify -msse2, -msse3 and -msse4.1
|
||||
#endif
|
||||
|
||||
#else
|
||||
#error CONFIG macro invalid or not defined
|
||||
#endif
|
||||
|
||||
#define ENABLE_DP
|
||||
//@#define ENABLE_DP
|
||||
#define LOG2VECTLENDP 1
|
||||
//@#define LOG2VECTLENDP 1
|
||||
#define VECTLENDP (1 << LOG2VECTLENDP)
|
||||
//@#define VECTLENDP (1 << LOG2VECTLENDP)
|
||||
|
||||
#define ENABLE_SP
|
||||
//@#define ENABLE_SP
|
||||
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
|
||||
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
|
||||
#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
//@#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
|
||||
#define ACCURATE_SQRT
|
||||
//@#define ACCURATE_SQRT
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
#if defined(_MSC_VER)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include "misc.h"
|
||||
#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
typedef __m128i vmask;
|
||||
typedef __m128i vopmask;
|
||||
|
||||
typedef __m128d vdouble;
|
||||
typedef __m128i vint;
|
||||
|
||||
typedef __m128 vfloat;
|
||||
typedef __m128i vint2;
|
||||
|
||||
typedef __m128i vint64;
|
||||
typedef __m128i vuint64;
|
||||
|
||||
typedef struct {
|
||||
vmask x, y;
|
||||
} vquad;
|
||||
|
||||
typedef vquad vargquad;
|
||||
|
||||
//
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
|
||||
#ifndef __SLEEF_H__
|
||||
void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx);
|
||||
#endif
|
||||
|
||||
static INLINE int cpuSupportsSSE2() {
|
||||
int32_t reg[4];
|
||||
Sleef_x86CpuID(reg, 1, 0);
|
||||
return (reg[3] & (1 << 26)) != 0;
|
||||
}
|
||||
|
||||
static INLINE int cpuSupportsSSE3() {
|
||||
int32_t reg[4];
|
||||
Sleef_x86CpuID(reg, 1, 0);
|
||||
return (reg[2] & (1 << 0)) != 0;
|
||||
}
|
||||
|
||||
static INLINE int cpuSupportsSSE4_1() {
|
||||
int32_t reg[4];
|
||||
Sleef_x86CpuID(reg, 1, 0);
|
||||
return (reg[2] & (1 << 19)) != 0;
|
||||
}
|
||||
|
||||
#if defined(__SSE2__) && defined(__SSE3__) && defined(__SSE4_1__)
|
||||
static INLINE int vavailability_i(int name) {
|
||||
//int d = __builtin_cpu_supports("sse2") && __builtin_cpu_supports("sse3") && __builtin_cpu_supports("sse4.1");
|
||||
int d = cpuSupportsSSE2() && cpuSupportsSSE3() && cpuSupportsSSE4_1();
|
||||
return d ? 3 : 0;
|
||||
}
|
||||
#define ISANAME "SSE4.1"
|
||||
#define DFTPRIORITY 12
|
||||
#elif defined(__SSE2__) && defined(__SSE3__)
|
||||
static INLINE int vavailability_i(int name) {
|
||||
//int d = __builtin_cpu_supports("sse2") && __builtin_cpu_supports("sse3");
|
||||
int d = cpuSupportsSSE2() && cpuSupportsSSE3();
|
||||
return d ? 3 : 0;
|
||||
}
|
||||
#define ISANAME "SSE3"
|
||||
#define DFTPRIORITY 11
|
||||
#else
|
||||
static INLINE int vavailability_i(int name) {
|
||||
int d = cpuSupportsSSE2();
|
||||
return d ? 3 : 0;
|
||||
}
|
||||
#define ISANAME "SSE2"
|
||||
#define DFTPRIORITY 10
|
||||
#endif
|
||||
|
||||
#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }
|
||||
|
||||
static INLINE int vtestallones_i_vo32(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; }
|
||||
static INLINE int vtestallones_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; }
|
||||
|
||||
//
|
||||
|
||||
static vint2 vloadu_vi2_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }
|
||||
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm_storeu_si128((__m128i *)p, v); }
|
||||
|
||||
static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }
|
||||
static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return _mm_and_si128(x, y); }
|
||||
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return _mm_andnot_si128(x, y); }
|
||||
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return _mm_or_si128(x, y); }
|
||||
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return _mm_xor_si128(x, y); }
|
||||
|
||||
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return _mm_and_si128(x, y); }
|
||||
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return _mm_andnot_si128(x, y); }
|
||||
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return _mm_or_si128(x, y); }
|
||||
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return _mm_xor_si128(x, y); }
|
||||
|
||||
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return _mm_and_si128(x, y); }
|
||||
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return _mm_or_si128(x, y); }
|
||||
static INLINE vmask vandnot_vm_vo64_vm(vmask x, vmask y) { return _mm_andnot_si128(x, y); }
|
||||
static INLINE vmask vxor_vm_vo64_vm(vmask x, vmask y) { return _mm_xor_si128(x, y); }
|
||||
|
||||
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return _mm_and_si128(x, y); }
|
||||
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return _mm_or_si128(x, y); }
|
||||
static INLINE vmask vandnot_vm_vo32_vm(vmask x, vmask y) { return _mm_andnot_si128(x, y); }
|
||||
static INLINE vmask vxor_vm_vo32_vm(vmask x, vmask y) { return _mm_xor_si128(x, y); }
|
||||
|
||||
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return _mm_shuffle_epi32(m, 0x08); }
|
||||
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return _mm_shuffle_epi32(m, 0x50); }
|
||||
|
||||
static INLINE vopmask vcast_vo_i(int i) { return _mm_set1_epi64x(i ? -1 : 0); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vint vrint_vi_vd(vdouble vd) { return _mm_cvtpd_epi32(vd); }
|
||||
static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm_cvttpd_epi32(vd); }
|
||||
static INLINE vdouble vcast_vd_vi(vint vi) { return _mm_cvtepi32_pd(vi); }
|
||||
static INLINE vint vcast_vi_i(int i) { return _mm_set_epi32(0, 0, i, i); }
|
||||
static INLINE vint2 vcastu_vm_vi(vint vi) { return _mm_and_si128(_mm_shuffle_epi32(vi, 0x73), _mm_set_epi32(-1, 0, -1, 0)); }
|
||||
static INLINE vint vcastu_vi_vm(vint2 vi) { return _mm_shuffle_epi32(vi, 0x0d); }
|
||||
|
||||
#if CONFIG == 4
|
||||
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
|
||||
static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
|
||||
static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
|
||||
static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
|
||||
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm_cmpeq_epi64(x, y); }
|
||||
#define FULL_FP_ROUNDING
|
||||
//@#define FULL_FP_ROUNDING
|
||||
#else
|
||||
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); }
|
||||
static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); }
|
||||
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
|
||||
vmask t = _mm_cmpeq_epi32(x, y);
|
||||
return vand_vm_vm_vm(t, _mm_shuffle_epi32(t, 0xb1));
|
||||
}
|
||||
#endif
|
||||
|
||||
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm_add_epi64(x, y); }
|
||||
|
||||
static INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm_set_epi32(i0, i1, i0, i1); }
|
||||
|
||||
static INLINE vmask vcast_vm_i64(int64_t i) { return _mm_set1_epi64x(i); }
|
||||
static INLINE vmask vcast_vm_u64(uint64_t i) { return _mm_set1_epi64x((uint64_t)i); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vdouble vcast_vd_d(double d) { return _mm_set1_pd(d); }
|
||||
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm_castpd_si128(vd); }
|
||||
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm_castsi128_pd(vm); }
|
||||
|
||||
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_add_pd(x, y); }
|
||||
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm_sub_pd(x, y); }
|
||||
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm_mul_pd(x, y); }
|
||||
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm_div_pd(x, y); }
|
||||
static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm_div_pd(_mm_set1_pd(1), x); }
|
||||
static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm_sqrt_pd(x); }
|
||||
static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm_andnot_pd(_mm_set1_pd(-0.0), d); }
|
||||
static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm_xor_pd(_mm_set1_pd(-0.0), d); }
|
||||
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
|
||||
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
|
||||
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(z, vmul_vd_vd_vd(x, y)); }
|
||||
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm_max_pd(x, y); }
|
||||
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm_min_pd(x, y); }
|
||||
|
||||
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpeq_pd(x, y)); }
|
||||
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpneq_pd(x, y)); }
|
||||
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmplt_pd(x, y)); }
|
||||
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmple_pd(x, y)); }
|
||||
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpgt_pd(x, y)); }
|
||||
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpge_pd(x, y)); }
|
||||
|
||||
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); }
|
||||
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); }
|
||||
static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }
|
||||
|
||||
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); }
|
||||
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); }
|
||||
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); }
|
||||
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); }
|
||||
|
||||
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return _mm_and_si128(x, y); }
|
||||
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return _mm_andnot_si128(x, y); }
|
||||
|
||||
static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); }
|
||||
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); }
|
||||
static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); }
|
||||
|
||||
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
|
||||
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }
|
||||
|
||||
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
|
||||
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }
|
||||
|
||||
#if CONFIG == 4
|
||||
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return _mm_blendv_epi8(y, x, m); }
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask m, vdouble x, vdouble y) { return _mm_blendv_pd(y, x, _mm_castsi128_pd(m)); }
|
||||
#else
|
||||
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return vor_vm_vm_vm(vand_vm_vm_vm(m, x), vandnot_vm_vm_vm(m, y)); }
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask opmask, vdouble x, vdouble y) {
|
||||
return _mm_or_pd(_mm_and_pd(_mm_castsi128_pd(opmask), x), _mm_andnot_pd(_mm_castsi128_pd(opmask), y));
|
||||
}
|
||||
#endif
|
||||
|
||||
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
|
||||
return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
|
||||
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
|
||||
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE vopmask visinf_vo_vd(vdouble d) {
|
||||
return vreinterpret_vm_vd(_mm_cmpeq_pd(vabs_vd_vd(d), _mm_set1_pd(SLEEF_INFINITY)));
|
||||
}
|
||||
|
||||
static INLINE vopmask vispinf_vo_vd(vdouble d) {
|
||||
return vreinterpret_vm_vd(_mm_cmpeq_pd(d, _mm_set1_pd(SLEEF_INFINITY)));
|
||||
}
|
||||
|
||||
static INLINE vopmask visminf_vo_vd(vdouble d) {
|
||||
return vreinterpret_vm_vd(_mm_cmpeq_pd(d, _mm_set1_pd(-SLEEF_INFINITY)));
|
||||
}
|
||||
|
||||
static INLINE vopmask visnan_vo_vd(vdouble d) {
|
||||
return vreinterpret_vm_vd(_mm_cmpneq_pd(d, d));
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static INLINE vdouble vload_vd_p(const double *ptr) { return _mm_load_pd(ptr); }
|
||||
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm_loadu_pd(ptr); }
|
||||
|
||||
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm_store_pd(ptr, v); }
|
||||
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm_storeu_pd(ptr, v); }
|
||||
|
||||
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
|
||||
int a[sizeof(vint)/sizeof(int)];
|
||||
vstoreu_v_p_vi(a, vi);
|
||||
return _mm_set_pd(ptr[a[1]], ptr[a[0]]);
|
||||
}
|
||||
|
||||
// This function is for debugging
|
||||
static INLINE double vcast_d_vd(vdouble v) {
|
||||
double a[VECTLENDP];
|
||||
vstoreu_v_p_vd(a, v);
|
||||
return a[0];
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
|
||||
static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }
|
||||
static INLINE vint2 vrint_vi2_vf(vfloat vf) { return _mm_cvtps_epi32(vf); }
|
||||
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return _mm_cvttps_epi32(vf); }
|
||||
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm_cvtepi32_ps(vcast_vm_vi2(vi)); }
|
||||
static INLINE vfloat vcast_vf_f(float f) { return _mm_set1_ps(f); }
|
||||
static INLINE vint2 vcast_vi2_i(int i) { return _mm_set1_epi32(i); }
|
||||
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm_castps_si128(vf); }
|
||||
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm_castsi128_ps(vm); }
|
||||
static INLINE vfloat vreinterpret_vf_vi2(vint2 vm) { return _mm_castsi128_ps(vm); }
|
||||
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return _mm_castps_si128(vf); }
|
||||
|
||||
#if CONFIG != 4
|
||||
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
|
||||
static INLINE vfloat vrint_vf_vf(vfloat vf) { return vcast_vf_vi2(vrint_vi2_vf(vf)); }
|
||||
#endif
|
||||
|
||||
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_add_ps(x, y); }
|
||||
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm_sub_ps(x, y); }
|
||||
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm_mul_ps(x, y); }
|
||||
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm_div_ps(x, y); }
|
||||
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }
|
||||
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm_sqrt_ps(x); }
|
||||
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
|
||||
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
|
||||
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }
|
||||
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm_max_ps(x, y); }
|
||||
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm_min_ps(x, y); }
|
||||
|
||||
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpeq_ps(x, y)); }
|
||||
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpneq_ps(x, y)); }
|
||||
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmplt_ps(x, y)); }
|
||||
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmple_ps(x, y)); }
|
||||
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpgt_ps(x, y)); }
|
||||
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpge_ps(x, y)); }
|
||||
|
||||
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return vadd_vi_vi_vi(x, y); }
|
||||
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return vsub_vi_vi_vi(x, y); }
|
||||
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); }
|
||||
|
||||
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return vand_vi_vi_vi(x, y); }
|
||||
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return vandnot_vi_vi_vi(x, y); }
|
||||
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return vor_vi_vi_vi(x, y); }
|
||||
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return vxor_vi_vi_vi(x, y); }
|
||||
|
||||
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi_vo_vi(x, y); }
|
||||
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi_vo_vi(x, y); }
|
||||
|
||||
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return vsll_vi_vi_i(x, c); }
|
||||
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return vsrl_vi_vi_i(x, c); }
|
||||
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return vsra_vi_vi_i(x, c); }
|
||||
|
||||
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }
|
||||
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); }
|
||||
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }
|
||||
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); }
|
||||
|
||||
#if CONFIG == 4
|
||||
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { return _mm_blendv_epi8(y, x, m); }
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask m, vfloat x, vfloat y) { return _mm_blendv_ps(y, x, _mm_castsi128_ps(m)); }
|
||||
#else
|
||||
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
|
||||
return vor_vi2_vi2_vi2(vand_vi2_vi2_vi2(m, x), vandnot_vi2_vi2_vi2(m, y));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask opmask, vfloat x, vfloat y) {
|
||||
return _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(opmask), x), _mm_andnot_ps(_mm_castsi128_ps(opmask), y));
|
||||
}
|
||||
#endif
|
||||
|
||||
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
|
||||
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
|
||||
|
||||
static INLINE vfloat vload_vf_p(const float *ptr) { return _mm_load_ps(ptr); }
|
||||
static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm_loadu_ps(ptr); }
|
||||
|
||||
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm_store_ps(ptr, v); }
|
||||
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm_storeu_ps(ptr, v); }
|
||||
|
||||
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi) {
|
||||
int a[VECTLENSP];
|
||||
vstoreu_v_p_vi2(a, vi);
|
||||
return _mm_set_ps(ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);
|
||||
}
|
||||
|
||||
// This function is for debugging
|
||||
static INLINE float vcast_f_vf(vfloat v) {
|
||||
float a[VECTLENSP];
|
||||
vstoreu_v_p_vf(a, v);
|
||||
return a[0];
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
#define PNMASK ((vdouble) { +0.0, -0.0 })
|
||||
#define NPMASK ((vdouble) { -0.0, +0.0 })
|
||||
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
|
||||
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
|
||||
|
||||
static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
|
||||
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
|
||||
static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
|
||||
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }
|
||||
|
||||
#if CONFIG >= 3
|
||||
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_addsub_pd(x, y); }
|
||||
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_addsub_ps(x, y); }
|
||||
#else
|
||||
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
|
||||
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vadd_vf_vf_vf(x, vnegpos_vf_vf(y)); }
|
||||
#endif
|
||||
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
|
||||
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
|
||||
|
||||
static INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm_shuffle_pd(d0, d0, 1); }
|
||||
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vd; }
|
||||
|
||||
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm_stream_pd(ptr, v); }
|
||||
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
|
||||
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { _mm_stream_pd((double *)(&ptr[2*offset]), v); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); }
|
||||
static INLINE vfloat vreva2_vf_vf(vfloat d0) { return _mm_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); }
|
||||
|
||||
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm_stream_ps(ptr, v); }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
|
||||
_mm_storel_pd((double *)(ptr+(offset + step * 0)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
|
||||
_mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
|
||||
_mm_storel_pd((double *)(ptr+(offset + step * 0)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
|
||||
_mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static vquad loadu_vq_p(void *p) {
|
||||
vquad vq;
|
||||
memcpy(&vq, p, VECTLENDP * 16);
|
||||
return vq;
|
||||
}
|
||||
|
||||
static INLINE vquad cast_vq_aq(vargquad aq) {
|
||||
vquad vq;
|
||||
memcpy(&vq, &aq, VECTLENDP * 16);
|
||||
return vq;
|
||||
}
|
||||
|
||||
static INLINE vargquad cast_aq_vq(vquad vq) {
|
||||
vargquad aq;
|
||||
memcpy(&aq, &vq, VECTLENDP * 16);
|
||||
return aq;
|
||||
}
|
||||
|
||||
static INLINE int vtestallzeros_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0; }
|
||||
|
||||
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) {
|
||||
return vor_vm_vm_vm(vand_vm_vm_vm(o, x), vandnot_vm_vm_vm(o, y));
|
||||
}
|
||||
|
||||
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return _mm_sub_epi64(x, y); }
|
||||
static INLINE vmask vneg64_vm_vm(vmask x) { return _mm_sub_epi64(vcast_vm_i_i(0, 0), x); }
|
||||
|
||||
#define vsll64_vm_vm_i(x, c) _mm_slli_epi64(x, c)
|
||||
#define vsrl64_vm_vm_i(x, c) _mm_srli_epi64(x, c)
|
||||
//@#define vsll64_vm_vm_i(x, c) _mm_slli_epi64(x, c)
|
||||
//@#define vsrl64_vm_vm_i(x, c) _mm_srli_epi64(x, c)
|
||||
|
||||
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
|
||||
int64_t ax[2], ay[2];
|
||||
_mm_storeu_si128((__m128i *)ax, x);
|
||||
_mm_storeu_si128((__m128i *)ay, y);
|
||||
return _mm_set_epi64x(ax[1] > ay[1] ? -1 : 0, ax[0] > ay[0] ? -1 : 0);
|
||||
}
|
||||
|
||||
static INLINE vmask vcast_vm_vi(vint vi) {
|
||||
vmask m = _mm_and_si128(_mm_shuffle_epi32(vi, (0 << 6) | (1 << 4) | (0 << 2) | (0 << 0)), _mm_set_epi32(0, -1, 0, -1));
|
||||
return vor_vm_vm_vm(vcastu_vm_vi(vgt_vo_vi_vi(vcast_vi_i(0), vi)), m);
|
||||
}
|
||||
static INLINE vint vcast_vi_vm(vmask vm) { return _mm_shuffle_epi32(vm, 0x08); }
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; }
|
||||
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return m; }
|
||||
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return v; }
|
||||
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return m; }
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,871 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdint.h>
|
||||
#include "misc.h"
|
||||
|
||||
#ifndef CONFIG
|
||||
#error CONFIG macro not defined
|
||||
#endif
|
||||
|
||||
#define ENABLE_DP
|
||||
#define ENABLE_SP
|
||||
|
||||
#define LOG2VECTLENDP CONFIG
|
||||
#define VECTLENDP (1 << LOG2VECTLENDP)
|
||||
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
|
||||
#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
|
||||
#define DFTPRIORITY LOG2VECTLENDP
|
||||
|
||||
#if defined(__clang__)
|
||||
#define ISANAME "Clang Vector Extension"
|
||||
|
||||
typedef uint32_t vmask __attribute__((ext_vector_type(VECTLENDP*2)));
|
||||
typedef uint32_t vopmask __attribute__((ext_vector_type(VECTLENDP*2)));
|
||||
|
||||
typedef double vdouble __attribute__((ext_vector_type(VECTLENDP)));
|
||||
typedef int32_t vint __attribute__((ext_vector_type(VECTLENDP)));
|
||||
|
||||
typedef float vfloat __attribute__((ext_vector_type(VECTLENDP*2)));
|
||||
typedef int32_t vint2 __attribute__((ext_vector_type(VECTLENDP*2)));
|
||||
|
||||
#ifdef ENABLE_LONGDOUBLE
|
||||
typedef uint8_t vmaskl __attribute__((ext_vector_type(sizeof(long double)*VECTLENDP)));
|
||||
typedef long double vlongdouble __attribute__((ext_vector_type(VECTLENDP)));
|
||||
#endif
|
||||
|
||||
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
|
||||
typedef uint8_t vmaskq __attribute__((ext_vector_type(sizeof(Sleef_quad)*VECTLENDP)));
|
||||
#ifdef ENABLE_LONGDOUBLE
|
||||
typedef Sleef_quad vquad __attribute__((ext_vector_type(VECTLENDP)));
|
||||
#endif
|
||||
#endif
|
||||
#elif defined(__GNUC__)
|
||||
#define ISANAME "GCC Vector Extension"
|
||||
|
||||
typedef uint32_t vmask __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP*2)));
|
||||
typedef uint32_t vopmask __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP*2)));
|
||||
|
||||
typedef double vdouble __attribute__((vector_size(sizeof(double)*VECTLENDP)));
|
||||
typedef int32_t vint __attribute__((vector_size(sizeof(int32_t)*VECTLENDP)));
|
||||
|
||||
typedef float vfloat __attribute__((vector_size(sizeof(float)*VECTLENDP*2)));
|
||||
typedef int32_t vint2 __attribute__((vector_size(sizeof(int32_t)*VECTLENDP*2)));
|
||||
|
||||
#ifdef ENABLE_LONGDOUBLE
|
||||
typedef uint8_t vmaskl __attribute__((vector_size(sizeof(long double)*VECTLENDP)));
|
||||
typedef long double vlongdouble __attribute__((vector_size(sizeof(long double)*VECTLENDP)));
|
||||
#endif
|
||||
|
||||
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
|
||||
typedef uint8_t vmaskq __attribute__((vector_size(sizeof(Sleef_quad)*VECTLENDP)));
|
||||
typedef Sleef_quad vquad __attribute__((vector_size(sizeof(Sleef_quad)*VECTLENDP)));
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
#if VECTLENDP == 2
|
||||
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return (vopmask){ m[1], m[3], 0, 0 }; }
|
||||
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return (vopmask){ m[0], m[0], m[1], m[1] }; }
|
||||
|
||||
static INLINE vint vcast_vi_i(int i) { return (vint) { i, i }; }
|
||||
static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i }; }
|
||||
static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f }; }
|
||||
static INLINE vdouble vcast_vd_d(double d) { return (vdouble) { d, d }; }
|
||||
#ifdef ENABLE_LONGDOUBLE
|
||||
static INLINE vlongdouble vcast_vl_l(long double d) { return (vlongdouble) { d, d }; }
|
||||
#endif
|
||||
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
|
||||
static INLINE vquad vcast_vq_q(Sleef_quad d) { return (vquad) { d, d }; }
|
||||
#endif
|
||||
|
||||
static INLINE vmask vcast_vm_i_i(int h, int l) { return (vmask){ l, h, l, h }; }
|
||||
static INLINE vint2 vcastu_vi2_vi(vint vi) { return (vint2){ 0, vi[0], 0, vi[1] }; }
|
||||
static INLINE vint vcastu_vi_vi2(vint2 vi2) { return (vint){ vi2[1], vi2[3] }; }
|
||||
|
||||
static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) { return (vint){ vi2[0], vi2[1] }; }
|
||||
static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) { return (vint2){ vi[0], vi[1], 0, 0 }; }
|
||||
|
||||
static INLINE vdouble vrev21_vd_vd(vdouble vd) { return (vdouble) { vd[1], vd[0] }; }
|
||||
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vd; }
|
||||
static INLINE vfloat vrev21_vf_vf(vfloat vd) { return (vfloat) { vd[1], vd[0], vd[3], vd[2] }; }
|
||||
static INLINE vfloat vreva2_vf_vf(vfloat vd) { return (vfloat) { vd[2], vd[3], vd[0], vd[1] }; }
|
||||
#ifdef ENABLE_LONGDOUBLE
|
||||
static INLINE vlongdouble vrev21_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[1], vd[0] }; }
|
||||
static INLINE vlongdouble vreva2_vl_vl(vlongdouble vd) { return vd; }
|
||||
static INLINE vlongdouble vposneg_vl_vl(vlongdouble vd) { return (vlongdouble) { +vd[0], -vd[1] }; }
|
||||
static INLINE vlongdouble vnegpos_vl_vl(vlongdouble vd) { return (vlongdouble) { -vd[0], +vd[1] }; }
|
||||
#endif
|
||||
|
||||
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
|
||||
static INLINE vquad vrev21_vq_vq(vquad vd) { return (vquad) { vd[1], vd[0] }; }
|
||||
static INLINE vquad vreva2_vq_vq(vquad vd) { return vd; }
|
||||
static INLINE vquad vposneg_vq_vq(vquad vd) { return (vquad) { +vd[0], -vd[1] }; }
|
||||
static INLINE vquad vnegpos_vq_vq(vquad vd) { return (vquad) { -vd[0], +vd[1] }; }
|
||||
#endif
|
||||
|
||||
#define PNMASK ((vdouble) { +0.0, -0.0 })
|
||||
#define NPMASK ((vdouble) { -0.0, +0.0 })
|
||||
static INLINE vdouble vposneg_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)PNMASK); }
|
||||
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)NPMASK); }
|
||||
|
||||
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
|
||||
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
|
||||
static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)PNMASKf); }
|
||||
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)NPMASKf); }
|
||||
#elif VECTLENDP == 4
|
||||
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return (vopmask){ m[1], m[3], m[5], m[7], 0, 0, 0, 0 }; }
|
||||
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return (vopmask){ m[0], m[0], m[1], m[1], m[2], m[2], m[3], m[3] }; }
|
||||
|
||||
static INLINE vint vcast_vi_i(int i) { return (vint) { i, i, i, i }; }
|
||||
static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i, i, i, i, i }; }
|
||||
static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f, f, f, f, f }; }
|
||||
static INLINE vdouble vcast_vd_d(double d) { return (vdouble) { d, d, d, d }; }
|
||||
#ifdef ENABLE_LONGDOUBLE
|
||||
static INLINE vlongdouble vcast_vl_l(long double d) { return (vlongdouble) { d, d, d, d }; }
|
||||
#endif
|
||||
|
||||
static INLINE vmask vcast_vm_i_i(int h, int l) { return (vmask){ l, h, l, h, l, h, l, h }; }
|
||||
static INLINE vint2 vcastu_vi2_vi(vint vi) { return (vint2){ 0, vi[0], 0, vi[1], 0, vi[2], 0, vi[3] }; }
|
||||
static INLINE vint vcastu_vi_vi2(vint2 vi2) { return (vint){ vi2[1], vi2[3], vi2[5], vi2[7] }; }
|
||||
|
||||
static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) { return (vint){ vi2[0], vi2[1], vi2[2], vi2[3] }; }
|
||||
static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) { return (vint2){ vi[0], vi[1], vi[2], vi[3], 0, 0, 0, 0 }; }
|
||||
|
||||
#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 })
|
||||
#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0 })
|
||||
static INLINE vdouble vposneg_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)PNMASK); }
|
||||
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)NPMASK); }
|
||||
|
||||
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })
|
||||
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })
|
||||
static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)PNMASKf); }
|
||||
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)NPMASKf); }
|
||||
|
||||
static INLINE vdouble vrev21_vd_vd(vdouble vd) { return (vdouble) { vd[1], vd[0], vd[3], vd[2] }; }
|
||||
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return (vdouble) { vd[2], vd[3], vd[0], vd[1] }; }
|
||||
static INLINE vfloat vrev21_vf_vf(vfloat vd) { return (vfloat) { vd[1], vd[0], vd[3], vd[2], vd[5], vd[4], vd[7], vd[6] }; }
|
||||
static INLINE vfloat vreva2_vf_vf(vfloat vd) { return (vfloat) { vd[6], vd[7], vd[4], vd[5], vd[2], vd[3], vd[0], vd[1] }; }
|
||||
#ifdef ENABLE_LONGDOUBLE
|
||||
static INLINE vlongdouble vrev21_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[1], vd[0], vd[3], vd[2] }; }
|
||||
static INLINE vlongdouble vreva2_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[2], vd[3], vd[0], vd[1] }; }
|
||||
static INLINE vlongdouble vposneg_vl_vl(vlongdouble vd) { return (vlongdouble) { +vd[0], -vd[1], +vd[2], -vd[3] }; }
|
||||
static INLINE vlongdouble vnegpos_vl_vl(vlongdouble vd) { return (vlongdouble) { -vd[0], +vd[1], -vd[2], +vd[3] }; }
|
||||
#endif
|
||||
#elif VECTLENDP == 8
|
||||
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return (vopmask){ m[1], m[3], m[5], m[7], m[9], m[11], m[13], m[15], 0, 0, 0, 0, 0, 0, 0, 0 }; }
|
||||
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return (vopmask){ m[0], m[0], m[1], m[1], m[2], m[2], m[3], m[3], m[4], m[4], m[5], m[5], m[6], m[6], m[7], m[7] }; }
|
||||
|
||||
static INLINE vint vcast_vi_i(int i) { return (vint) { i, i, i, i, i, i, i, i }; }
|
||||
static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i }; }
|
||||
static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f, f, f, f, f, f, f, f, f, f, f, f, f }; }
|
||||
static INLINE vdouble vcast_vd_d(double d) { return (vdouble) { d, d, d, d, d, d, d, d }; }
|
||||
#ifdef ENABLE_LONGDOUBLE
|
||||
static INLINE vlongdouble vcast_vl_l(long double d) { return (vlongdouble) { d, d, d, d, d, d, d, d }; }
|
||||
#endif
|
||||
|
||||
static INLINE vmask vcast_vm_i_i(int h, int l) { return (vmask){ l, h, l, h, l, h, l, h, l, h, l, h, l, h, l, h }; }
|
||||
static INLINE vint2 vcastu_vi2_vi(vint vi) { return (vint2){ 0, vi[0], 0, vi[1], 0, vi[2], 0, vi[3], 0, vi[4], 0, vi[5], 0, vi[6], 0, vi[7] }; }
|
||||
static INLINE vint vcastu_vi_vi2(vint2 vi2) { return (vint){ vi2[1], vi2[3], vi2[5], vi2[7], vi2[9], vi2[11], vi2[13], vi2[15] }; }
|
||||
|
||||
static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) { return (vint){ vi2[0], vi2[1], vi2[2], vi2[3], vi2[4], vi2[5], vi2[6], vi2[7] }; }
|
||||
static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) { return (vint2){ vi[0], vi[1], vi[2], vi[3], vi[4], vi[5], vi[6], vi[7], 0, 0, 0, 0, 0, 0, 0, 0 }; }
|
||||
|
||||
#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0, +0.0, -0.0, +0.0, -0.0 })
|
||||
#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0, -0.0, +0.0, -0.0, +0.0 })
|
||||
static INLINE vdouble vposneg_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)PNMASK); }
|
||||
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)NPMASK); }
|
||||
|
||||
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })
|
||||
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })
|
||||
static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)PNMASKf); }
|
||||
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)NPMASKf); }
|
||||
|
||||
static INLINE vdouble vrev21_vd_vd(vdouble vd) { return (vdouble) { vd[1], vd[0], vd[3], vd[2], vd[5], vd[4], vd[7], vd[6] }; }
|
||||
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return (vdouble) { vd[6], vd[7], vd[4], vd[5], vd[2], vd[3], vd[0], vd[1] }; }
|
||||
static INLINE vfloat vrev21_vf_vf(vfloat vd) {
|
||||
return (vfloat) {
|
||||
vd[1], vd[0], vd[3], vd[2], vd[5], vd[4], vd[7], vd[6],
|
||||
vd[9], vd[8], vd[11], vd[10], vd[13], vd[12], vd[15], vd[14] };
|
||||
}
|
||||
static INLINE vfloat vreva2_vf_vf(vfloat vd) {
|
||||
return (vfloat) {
|
||||
vd[14], vd[15], vd[12], vd[13], vd[10], vd[11], vd[8], vd[9],
|
||||
vd[6], vd[7], vd[4], vd[5], vd[2], vd[3], vd[0], vd[1]};
|
||||
}
|
||||
#ifdef ENABLE_LONGDOUBLE
|
||||
static INLINE vlongdouble vrev21_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[1], vd[0], vd[3], vd[2], vd[5], vd[4], vd[7], vd[6] }; }
|
||||
static INLINE vlongdouble vreva2_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[6], vd[7], vd[4], vd[5], vd[2], vd[3], vd[0], vd[1] }; }
|
||||
static INLINE vlongdouble vposneg_vl_vl(vlongdouble vd) { return (vlongdouble) { +vd[0], -vd[1], +vd[2], -vd[3], +vd[4], -vd[5], +vd[6], -vd[7] }; }
|
||||
static INLINE vlongdouble vnegpos_vl_vl(vlongdouble vd) { return (vlongdouble) { -vd[0], +vd[1], -vd[2], +vd[3], -vd[4], +vd[5], -vd[6], +vd[7] }; }
|
||||
#endif
|
||||
#else
|
||||
static INLINE vint vcast_vi_i(int k) {
|
||||
vint ret;
|
||||
for(int i=0;i<VECTLENDP;i++) ret[i] = k;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vint2 vcast_vi2_i(int k) {
|
||||
vint2 ret;
|
||||
for(int i=0;i<VECTLENSP;i++) ret[i] = k;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vdouble vcast_vd_d(double d) {
|
||||
vdouble ret;
|
||||
for(int i=0;i<VECTLENDP;i++) ret[i] = d;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vfloat vcast_vf_f(float f) {
|
||||
vfloat ret;
|
||||
for(int i=0;i<VECTLENSP;i++) ret[i] = f;
|
||||
return ret;
|
||||
}
|
||||
|
||||
#ifdef ENABLE_LONGDOUBLE
|
||||
static INLINE vlongdouble vcast_vl_l(long double d) {
|
||||
vlongdouble ret;
|
||||
for(int i=0;i<VECTLENDP;i++) ret[i] = d;
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Narrow a 64-bit-lane mask to a 32-bit-lane mask: the upper 32-bit half of
// each 64-bit lane is packed into the low VECTLENDP slots; the remaining
// slots are cleared.
static INLINE vopmask vcast_vo32_vo64(vopmask m) {
  vopmask result;
  int slot = 0;
  while (slot < VECTLENDP) {
    result[slot] = m[2 * slot + 1];
    slot++;
  }
  while (slot < 2 * VECTLENDP) {
    result[slot] = 0;
    slot++;
  }
  return result;
}
|
||||
|
||||
static INLINE vopmask vcast_vo64_vo32(vopmask m) {
|
||||
vopmask ret;
|
||||
for(int i=0;i<VECTLENDP;i++) ret[i*2] = ret[i*2+1] = m[i];
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Broadcast a 64-bit pattern across the mask: every even 32-bit slot
// receives the low word l, every odd slot the high word h.
static INLINE vmask vcast_vm_i_i(int h, int l) {
  vmask m;
  for (int slot = 0; slot < 2 * VECTLENDP; slot++) {
    m[slot] = (slot & 1) ? h : l;
  }
  return m;
}
|
||||
|
||||
static INLINE vint2 vcastu_vi2_vi(vint vi) {
|
||||
vint2 ret;
|
||||
for(int i=0;i<VECTLENDP;i++) {
|
||||
ret[i*2+0] = 0;
|
||||
ret[i*2+1] = vi[i];
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vint vcastu_vi_vi2(vint2 vi2) {
|
||||
vint ret;
|
||||
for(int i=0;i<VECTLENDP;i++) ret[i] = vi2[i*2+1];
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) {
|
||||
vint ret;
|
||||
for(int i=0;i<VECTLENDP;i++) ret[i] = vi2[i];
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) {
|
||||
vint2 ret;
|
||||
for(int i=0;i<VECTLENDP;i++) ret[i] = vi[i];
|
||||
for(int i=VECTLENDP;i<VECTLENDP*2;i++) ret[i] = 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vdouble vrev21_vd_vd(vdouble d0) {
|
||||
vdouble r;
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
r[i*2+0] = d0[i*2+1];
|
||||
r[i*2+1] = d0[i*2+0];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
// Reverse the order of the 2-element pairs of the vector while keeping each
// pair's internal element order intact.
static INLINE vdouble vreva2_vd_vd(vdouble d0) {
  vdouble out;
  const int npairs = VECTLENDP / 2;
  for (int p = 0; p < npairs; p++) {
    const int src = npairs - 1 - p;
    out[2 * p + 0] = d0[2 * src + 0];
    out[2 * p + 1] = d0[2 * src + 1];
  }
  return out;
}
|
||||
|
||||
static INLINE vfloat vrev21_vf_vf(vfloat d0) {
|
||||
vfloat r;
|
||||
for(int i=0;i<VECTLENSP/2;i++) {
|
||||
r[i*2+0] = d0[i*2+1];
|
||||
r[i*2+1] = d0[i*2+0];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vfloat vreva2_vf_vf(vfloat d0) {
|
||||
vfloat r;
|
||||
for(int i=0;i<VECTLENSP/2;i++) {
|
||||
r[i*2+0] = d0[(VECTLENSP/2-1-i)*2+0];
|
||||
r[i*2+1] = d0[(VECTLENSP/2-1-i)*2+1];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
#ifdef ENABLE_LONGDOUBLE
|
||||
static INLINE vlongdouble vrev21_vl_vl(vlongdouble d0) {
|
||||
vlongdouble r;
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
r[i*2+0] = d0[i*2+1];
|
||||
r[i*2+1] = d0[i*2+0];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vlongdouble vreva2_vl_vl(vlongdouble d0) {
|
||||
vlongdouble r;
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
r[i*2+0] = d0[(VECTLENDP/2-1-i)*2+0];
|
||||
r[i*2+1] = d0[(VECTLENDP/2-1-i)*2+1];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
#endif
|
||||
|
||||
static INLINE vdouble vposneg_vd_vd(vdouble d0) {
|
||||
vdouble r;
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
r[i*2+0] = +d0[i*2+0];
|
||||
r[i*2+1] = -d0[i*2+1];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vdouble vnegpos_vd_vd(vdouble d0) {
|
||||
vdouble r;
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
r[i*2+0] = -d0[i*2+0];
|
||||
r[i*2+1] = +d0[i*2+1];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vfloat vposneg_vf_vf(vfloat d0) {
|
||||
vfloat r;
|
||||
for(int i=0;i<VECTLENSP/2;i++) {
|
||||
r[i*2+0] = +d0[i*2+0];
|
||||
r[i*2+1] = -d0[i*2+1];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vfloat vnegpos_vf_vf(vfloat d0) {
|
||||
vfloat r;
|
||||
for(int i=0;i<VECTLENSP/2;i++) {
|
||||
r[i*2+0] = -d0[i*2+0];
|
||||
r[i*2+1] = +d0[i*2+1];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
#ifdef ENABLE_LONGDOUBLE
|
||||
static INLINE vlongdouble vposneg_vl_vl(vlongdouble d0) {
|
||||
vlongdouble r;
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
r[i*2+0] = +d0[i*2+0];
|
||||
r[i*2+1] = -d0[i*2+1];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vlongdouble vnegpos_vl_vl(vlongdouble d0) {
|
||||
vlongdouble r;
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
r[i*2+0] = -d0[i*2+0];
|
||||
r[i*2+1] = +d0[i*2+1];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
static INLINE int vavailability_i(int name) { return -1; }
|
||||
static INLINE void vprefetch_v_p(const void *ptr) { }
|
||||
|
||||
static INLINE int vtestallones_i_vo64(vopmask g) {
|
||||
int ret = 1; for(int i=0;i<VECTLENDP*2;i++) ret = ret && g[i]; return ret;
|
||||
}
|
||||
|
||||
static INLINE int vtestallones_i_vo32(vopmask g) {
|
||||
int ret = 1; for(int i=0;i<VECTLENDP*2;i++) ret = ret && g[i]; return ret;
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static vint2 vloadu_vi2_p(int32_t *p) {
|
||||
vint2 vi;
|
||||
for(int i=0;i<VECTLENSP;i++) vi[i] = p[i];
|
||||
return vi;
|
||||
}
|
||||
|
||||
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) {
|
||||
for(int i=0;i<VECTLENSP;i++) p[i] = v[i];
|
||||
}
|
||||
|
||||
static vint vloadu_vi_p(int32_t *p) {
|
||||
vint vi;
|
||||
for(int i=0;i<VECTLENDP;i++) vi[i] = p[i];
|
||||
return vi;
|
||||
}
|
||||
|
||||
static void vstoreu_v_p_vi(int32_t *p, vint v) {
|
||||
for(int i=0;i<VECTLENDP;i++) p[i] = v[i];
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return x & y; }
|
||||
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return y & ~x; }
|
||||
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return x | y; }
|
||||
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return x ^ y; }
|
||||
|
||||
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return x & y; }
|
||||
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return y & ~x; }
|
||||
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return x | y; }
|
||||
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return x ^ y; }
|
||||
|
||||
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return x & y; }
|
||||
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return y & ~x; }
|
||||
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return x | y; }
|
||||
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return x ^ y; }
|
||||
|
||||
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return x & y; }
|
||||
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return y & ~x; }
|
||||
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return x | y; }
|
||||
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return x ^ y; }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return (vdouble)(((vmask)o & (vmask)x) | ((vmask)y & ~(vmask)o)); }
|
||||
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y) { return (vint2)(((vmask)o & (vmask)x) | ((vmask)y & ~(vmask)o)); }
|
||||
|
||||
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
|
||||
return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
|
||||
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
|
||||
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE vdouble vcast_vd_vi(vint vi) {
|
||||
#if defined(__clang__)
|
||||
return __builtin_convertvector(vi, vdouble);
|
||||
#else
|
||||
vdouble vd;
|
||||
for(int i=0;i<VECTLENDP;i++) vd[i] = vi[i];
|
||||
return vd;
|
||||
#endif
|
||||
}
|
||||
static INLINE vint vtruncate_vi_vd(vdouble vd) {
|
||||
#if defined(__clang__)
|
||||
return __builtin_convertvector(vd, vint);
|
||||
#else
|
||||
vint vi;
|
||||
for(int i=0;i<VECTLENDP;i++) vi[i] = vd[i];
|
||||
return vi;
|
||||
#endif
|
||||
}
|
||||
static INLINE vint vrint_vi_vd(vdouble vd) { return vtruncate_vi_vd(vsel_vd_vo_vd_vd((vopmask)(vd < 0.0), vd - 0.5, vd + 0.5)); }
|
||||
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); }
|
||||
static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); }
|
||||
|
||||
// Lane-wise 64-bit equality of two masks. The operands are reinterpreted as
// vectors of int64_t (VECTLENDP lanes) before comparing, so each comparison
// result covers a full 64-bit lane (all-ones on equal, all-zeros otherwise).
// NOTE(review): this presumes vmask has narrower (32-bit) elements, hence the
// reinterpret — confirm against the vmask typedef earlier in this header.
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
#if defined(__clang__)
  // Clang spells fixed-width vectors with ext_vector_type ...
  typedef int64_t vi64 __attribute__((ext_vector_type(VECTLENDP)));
#else
  // ... while GCC uses vector_size in bytes.
  typedef int64_t vi64 __attribute__((vector_size(sizeof(int64_t)*VECTLENDP)));
#endif
  return (vopmask)((vi64)x == (vi64)y);
}
|
||||
|
||||
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) {
|
||||
#if defined(__clang__)
|
||||
typedef int64_t vi64 __attribute__((ext_vector_type(VECTLENDP)));
|
||||
#else
|
||||
typedef int64_t vi64 __attribute__((vector_size(sizeof(int64_t)*VECTLENDP)));
|
||||
#endif
|
||||
return (vmask)((vi64)x + (vi64)y);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return (vmask)vd; }
|
||||
static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { return (vint2)vd; }
|
||||
static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { return (vdouble)vi; }
|
||||
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return (vdouble)vm; }
|
||||
|
||||
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return x + y; }
|
||||
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return x - y; }
|
||||
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return x * y; }
|
||||
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return x / y; }
|
||||
static INLINE vdouble vrec_vd_vd(vdouble x) { return 1.0 / x; }
|
||||
|
||||
static INLINE vdouble vabs_vd_vd(vdouble d) { return (vdouble)((vmask)d & ~(vmask)vcast_vd_d(-0.0)); }
|
||||
static INLINE vdouble vneg_vd_vd(vdouble d) { return -d; }
|
||||
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return x * y + z; }
|
||||
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return x * y - z; }
|
||||
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return vsel_vd_vo_vd_vd((vopmask)(x > y), x, y); }
|
||||
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return vsel_vd_vo_vd_vd((vopmask)(x < y), x, y); }
|
||||
|
||||
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
|
||||
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
|
||||
|
||||
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x == y); }
|
||||
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x != y); }
|
||||
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x < y); }
|
||||
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x <= y); }
|
||||
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x > y); }
|
||||
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x >= y); }
|
||||
|
||||
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return x + y; }
|
||||
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return x - y; }
|
||||
static INLINE vint vneg_vi_vi(vint e) { return -e; }
|
||||
|
||||
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return x & y; }
|
||||
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return y & ~x; }
|
||||
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return x | y; }
|
||||
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return x ^ y; }
|
||||
|
||||
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return vreinterpretFirstHalf_vi_vi2((vint2)x) & y; }
|
||||
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return y & ~vreinterpretFirstHalf_vi_vi2((vint2)x); }
|
||||
|
||||
static INLINE vint vsll_vi_vi_i(vint x, int c) {
|
||||
#if defined(__clang__)
|
||||
typedef uint32_t vu __attribute__((ext_vector_type(VECTLENDP)));
|
||||
#else
|
||||
typedef uint32_t vu __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP)));
|
||||
#endif
|
||||
return (vint)(((vu)x) << c);
|
||||
}
|
||||
|
||||
static INLINE vint vsrl_vi_vi_i(vint x, int c) {
|
||||
#if defined(__clang__)
|
||||
typedef uint32_t vu __attribute__((ext_vector_type(VECTLENDP)));
|
||||
#else
|
||||
typedef uint32_t vu __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP)));
|
||||
#endif
|
||||
return (vint)(((vu)x) >> c);
|
||||
}
|
||||
|
||||
static INLINE vint vsra_vi_vi_i(vint x, int c) { return x >> c; }
|
||||
|
||||
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return x == y; }
|
||||
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return x > y; }
|
||||
|
||||
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return (vopmask)vreinterpretFirstHalf_vi2_vi(x == y); }
|
||||
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return (vopmask)vreinterpretFirstHalf_vi2_vi(x > y);}
|
||||
|
||||
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
|
||||
return vor_vi_vi_vi(vand_vi_vi_vi(vreinterpretFirstHalf_vi_vi2((vint2)m), x),
|
||||
vandnot_vi_vi_vi(vreinterpretFirstHalf_vi_vi2((vint2)m), y));
|
||||
}
|
||||
|
||||
static INLINE vopmask visinf_vo_vd(vdouble d) { return (vopmask)(vabs_vd_vd(d) == SLEEF_INFINITY); }
|
||||
static INLINE vopmask vispinf_vo_vd(vdouble d) { return (vopmask)(d == SLEEF_INFINITY); }
|
||||
static INLINE vopmask visminf_vo_vd(vdouble d) { return (vopmask)(d == -SLEEF_INFINITY); }
|
||||
static INLINE vopmask visnan_vo_vd(vdouble d) { return (vopmask)(d != d); }
|
||||
|
||||
// Vector square root in portable C (no hardware sqrt): classic integer
// bit-hack reciprocal-square-root estimate followed by Newton-style
// refinement, with pre-scaling so tiny (denormal-range) inputs stay accurate.
static INLINE vdouble vsqrt_vd_vd(vdouble d) {
#if defined(__clang__)
  typedef int64_t vi64 __attribute__((ext_vector_type(VECTLENDP)));
#else
  typedef int64_t vi64 __attribute__((vector_size(sizeof(int64_t)*VECTLENDP)));
#endif

  vdouble q = vcast_vd_d(1);

  // Inputs below ~8.64e-78 are scaled up by ~1.16e77 so the estimate below
  // works in a safe range; q records the compensating factor (~2.94e-39,
  // i.e. roughly sqrt of the scale) applied to the result at the end.
  vopmask o = (vopmask)(d < 8.636168555094445E-78);
  d = (vdouble)((o & (vmask)(d * 1.157920892373162E77)) | (~o & (vmask)d));

  q = (vdouble)((o & (vmask)vcast_vd_d(2.9387358770557188E-39)) | (~o & (vmask)vcast_vd_d(1)));

  // Negative inputs: OR all comparison-mask bits into q so the final product
  // becomes NaN.
  q = (vdouble)vor_vm_vm_vm(vlt_vo_vd_vd(d, vcast_vd_d(0)), (vmask)q);

  // Initial 1/sqrt(d) estimate via the magic-constant bit trick; the tiny
  // +1e-320 bias keeps the exponent field nonzero for d == 0.
  vdouble x = (vdouble)(0x5fe6ec85e7de30daLL - ((vi64)(d + 1e-320) >> 1));
  // Three refinement steps with the halving constants folded into 3/12/768;
  // the combined scale is removed by the 1/2^13 factor below.
  x = x * ( 3 - d * x * x);
  x = x * ( 12 - d * x * x);
  x = x * (768 - d * x * x);
  x *= 1.0 / (1 << 13);
  // Final step: with y = d*x ~ sqrt(d), correct by (d - y^2) * x/2.
  x = (d - (d * x) * (d * x)) * (x * 0.5) + d * x;

  // Reapply the denormal compensation (or the NaN pattern for negatives).
  return x * q;
}
|
||||
|
||||
static INLINE double vcast_d_vd(vdouble v) { return v[0]; }
|
||||
static INLINE float vcast_f_vf(vfloat v) { return v[0]; }
|
||||
|
||||
static INLINE vdouble vload_vd_p(const double *ptr) { return *(vdouble *)ptr; }
|
||||
static INLINE vdouble vloadu_vd_p(const double *ptr) {
|
||||
vdouble vd;
|
||||
for(int i=0;i<VECTLENDP;i++) vd[i] = ptr[i];
|
||||
return vd;
|
||||
}
|
||||
|
||||
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
|
||||
vdouble vd;
|
||||
for(int i=0;i<VECTLENDP;i++) vd[i] = ptr[vi[i]];
|
||||
return vd;
|
||||
}
|
||||
|
||||
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
|
||||
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) {
|
||||
for(int i=0;i<VECTLENDP;i++) ptr[i] = v[i];
|
||||
}
|
||||
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
*(ptr+(offset + step * i)*2 + 0) = v[i*2+0];
|
||||
*(ptr+(offset + step * i)*2 + 1) = v[i*2+1];
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vscatter2_v_p_i_i_vd(ptr, offset, step, v); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return (vfloat)(((vmask)o & (vmask)x) | (~(vmask)o & (vmask)y)); }
|
||||
|
||||
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
|
||||
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; }
|
||||
static INLINE vmask vcast_vm_vi2(vint2 vi) { return (vmask)vi; }
|
||||
|
||||
static INLINE vfloat vcast_vf_vi2(vint2 vi) {
|
||||
#if defined(__clang__)
|
||||
return __builtin_convertvector(vi, vfloat);
|
||||
#else
|
||||
vfloat vf;
|
||||
for(int i=0;i<VECTLENDP*2;i++) vf[i] = vi[i];
|
||||
return vf;
|
||||
#endif
|
||||
}
|
||||
|
||||
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) {
|
||||
#if defined(__clang__)
|
||||
return __builtin_convertvector(vf, vint2);
|
||||
#else
|
||||
vint2 vi;
|
||||
for(int i=0;i<VECTLENDP*2;i++) vi[i] = vf[i];
|
||||
return vi;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Round-to-nearest (ties away from zero) float vector -> int vector:
// offset by +/-0.5 depending on sign, then truncate toward zero.
// Fix: use the float literal 0.5f on both branches — the original mixed
// 0.5f (negative branch) with the double literal 0.5 (positive branch),
// relying on implicit conversion of the scalar to the vector element type.
static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vtruncate_vi2_vf(vsel_vf_vo_vf_vf((vopmask)(vf < 0), vf - 0.5f, vf + 0.5f)); }
|
||||
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
|
||||
static INLINE vfloat vrint_vf_vf(vfloat vd) { return vcast_vf_vi2(vrint_vi2_vf(vd)); }
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; }
|
||||
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (vfloat)vm; }
|
||||
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return (vfloat)vi; }
|
||||
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return (vint2)vf; }
|
||||
|
||||
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return x + y; }
|
||||
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return x - y; }
|
||||
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return x * y; }
|
||||
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return x / y; }
|
||||
static INLINE vfloat vrec_vf_vf(vfloat x) { return 1.0f / x; }
|
||||
|
||||
static INLINE vfloat vabs_vf_vf(vfloat f) { return (vfloat)vandnot_vm_vm_vm((vmask)vcast_vf_f(-0.0f), (vmask)f); }
|
||||
static INLINE vfloat vneg_vf_vf(vfloat d) { return -d; }
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return x*y+z; }
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return z-x*y; }
|
||||
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return vsel_vf_vo_vf_vf((vopmask)(x > y), x, y); }
|
||||
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return vsel_vf_vo_vf_vf((vopmask)(x < y), x, y); }
|
||||
|
||||
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vadd_vf_vf_vf(x, vnegpos_vf_vf(y)); }
|
||||
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
|
||||
|
||||
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x == y); }
|
||||
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x != y); }
|
||||
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x < y); }
|
||||
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x <= y); }
|
||||
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x > y); }
|
||||
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x >= y); }
|
||||
|
||||
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return x + y; }
|
||||
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return x - y; }
|
||||
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return -e; }
|
||||
|
||||
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return x & y; }
|
||||
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return y & ~x; }
|
||||
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return x | y; }
|
||||
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return x ^ y; }
|
||||
|
||||
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)x & y; }
|
||||
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return y & ~(vint2)x; }
|
||||
|
||||
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) {
|
||||
#if defined(__clang__)
|
||||
typedef uint32_t vu __attribute__((ext_vector_type(VECTLENDP*2)));
|
||||
#else
|
||||
typedef uint32_t vu __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP*2)));
|
||||
#endif
|
||||
return (vint2)(((vu)x) << c);
|
||||
}
|
||||
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) {
|
||||
#if defined(__clang__)
|
||||
typedef uint32_t vu __attribute__((ext_vector_type(VECTLENDP*2)));
|
||||
#else
|
||||
typedef uint32_t vu __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP*2)));
|
||||
#endif
|
||||
return (vint2)(((vu)x) >> c);
|
||||
}
|
||||
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return x >> c; }
|
||||
|
||||
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return (vopmask)(x == y); }
|
||||
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return (vopmask)(x > y); }
|
||||
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return x == y; }
|
||||
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return x > y; }
|
||||
|
||||
static INLINE vopmask visinf_vo_vf(vfloat d) { return (vopmask)(vabs_vf_vf(d) == SLEEF_INFINITYf); }
|
||||
static INLINE vopmask vispinf_vo_vf(vfloat d) { return (vopmask)(d == SLEEF_INFINITYf); }
|
||||
static INLINE vopmask visminf_vo_vf(vfloat d) { return (vopmask)(d == -SLEEF_INFINITYf); }
|
||||
static INLINE vopmask visnan_vo_vf(vfloat d) { return (vopmask)(d != d); }
|
||||
|
||||
// Single-precision vector square root in portable C: bit-hack 1/sqrt
// estimate plus Newton-style refinement, with pre-scaling for tiny inputs.
// Fix: the final correction step used the double literal 0.5 in an
// otherwise all-float computation; changed to 0.5f for consistency with
// the rest of the function (avoids an implicit double-precision constant).
static INLINE vfloat vsqrt_vf_vf(vfloat d) {
  vfloat q = vcast_vf_f(1);

  // Inputs below 2^-64 are scaled up by 2^64; q records the compensating
  // factor 2^-32 multiplied back into the result at the end.
  vopmask o = (vopmask)(d < 5.4210108624275221700372640043497e-20f); // 2^-64
  d = (vfloat)((o & (vmask)(d * vcast_vf_f(18446744073709551616.0f))) | (~o & (vmask)d)); // 2^64
  q = (vfloat)((o & (vmask)vcast_vf_f(0.00000000023283064365386962890625f)) | (~o & (vmask)vcast_vf_f(1))); // 2^-32
  // Negative inputs: OR all mask bits into q so the final product is NaN.
  q = (vfloat)vor_vm_vm_vm(vlt_vo_vf_vf(d, vcast_vf_f(0)), (vmask)q);

  // Initial 1/sqrt(d) estimate via the magic-constant bit trick.
  vfloat x = (vfloat)(0x5f330de2 - (((vint2)d) >> 1));
  // Two refinement steps with the halving constants folded into 3/12;
  // the combined scale is removed by the 1/16 factor below.
  x = x * ( 3.0f - d * x * x);
  x = x * (12.0f - d * x * x);
  x *= 0.0625f;
  // Final step: with y = d*x ~ sqrt(d), correct by (d - y^2) * x/2.
  x = (d - (d * x) * (d * x)) * (x * 0.5f) + d * x;

  // Reapply the denormal compensation (or the NaN pattern for negatives).
  return x * q;
}
|
||||
|
||||
static INLINE vfloat vload_vf_p(const float *ptr) { return *(vfloat *)ptr; }
|
||||
static INLINE vfloat vloadu_vf_p(const float *ptr) {
|
||||
vfloat vf;
|
||||
for(int i=0;i<VECTLENSP;i++) vf[i] = ptr[i];
|
||||
return vf;
|
||||
}
|
||||
|
||||
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
|
||||
vfloat vf;
|
||||
for(int i=0;i<VECTLENSP;i++) vf[i] = ptr[vi2[i]];
|
||||
return vf;
|
||||
}
|
||||
|
||||
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
|
||||
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) {
|
||||
for(int i=0;i<VECTLENSP;i++) ptr[i] = v[i];
|
||||
}
|
||||
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
|
||||
for(int i=0;i<VECTLENSP/2;i++) {
|
||||
*(ptr+(offset + step * i)*2 + 0) = v[i*2+0];
|
||||
*(ptr+(offset + step * i)*2 + 1) = v[i*2+1];
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
|
||||
|
||||
//
|
||||
|
||||
#ifdef ENABLE_LONGDOUBLE
|
||||
static INLINE vlongdouble vadd_vl_vl_vl(vlongdouble x, vlongdouble y) { return x + y; }
|
||||
static INLINE vlongdouble vsub_vl_vl_vl(vlongdouble x, vlongdouble y) { return x - y; }
|
||||
static INLINE vlongdouble vmul_vl_vl_vl(vlongdouble x, vlongdouble y) { return x * y; }
|
||||
|
||||
static INLINE vlongdouble vneg_vl_vl(vlongdouble d) { return -d; }
|
||||
static INLINE vlongdouble vsubadd_vl_vl_vl(vlongdouble x, vlongdouble y) { return vadd_vl_vl_vl(x, vnegpos_vl_vl(y)); }
|
||||
static INLINE vlongdouble vmlsubadd_vl_vl_vl_vl(vlongdouble x, vlongdouble y, vlongdouble z) { return vsubadd_vl_vl_vl(vmul_vl_vl_vl(x, y), z); }
|
||||
|
||||
static INLINE vlongdouble vload_vl_p(const long double *ptr) { return *(vlongdouble *)ptr; }
|
||||
static INLINE vlongdouble vloadu_vl_p(const long double *ptr) {
|
||||
vlongdouble vd;
|
||||
for(int i=0;i<VECTLENDP;i++) vd[i] = ptr[i];
|
||||
return vd;
|
||||
}
|
||||
|
||||
static INLINE void vstore_v_p_vl(long double *ptr, vlongdouble v) { *(vlongdouble *)ptr = v; }
|
||||
static INLINE void vstoreu_v_p_vl(long double *ptr, vlongdouble v) {
|
||||
for(int i=0;i<VECTLENDP;i++) ptr[i] = v[i];
|
||||
}
|
||||
static INLINE void vstream_v_p_vl(long double *ptr, vlongdouble v) { *(vlongdouble *)ptr = v; }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vl(long double *ptr, int offset, int step, vlongdouble v) {
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
*(ptr+(offset + step * i)*2 + 0) = v[i*2+0];
|
||||
*(ptr+(offset + step * i)*2 + 1) = v[i*2+1];
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vl(long double *ptr, int offset, int step, vlongdouble v) { vscatter2_v_p_i_i_vl(ptr, offset, step, v); }
|
||||
#endif
|
||||
|
||||
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
// Elementwise float128 vector arithmetic (GCC vector-extension operators).
static INLINE vquad vadd_vq_vq_vq(vquad x, vquad y) { return x + y; }
static INLINE vquad vsub_vq_vq_vq(vquad x, vquad y) { return x - y; }
static INLINE vquad vmul_vq_vq_vq(vquad x, vquad y) { return x * y; }

static INLINE vquad vneg_vq_vq(vquad d) { return -d; }
// x + (y with alternating signs flipped), and the fused mul-subadd built on it.
static INLINE vquad vsubadd_vq_vq_vq(vquad x, vquad y) { return vadd_vq_vq_vq(x, vnegpos_vq_vq(y)); }
static INLINE vquad vmlsubadd_vq_vq_vq_vq(vquad x, vquad y, vquad z) { return vsubadd_vq_vq_vq(vmul_vq_vq_vq(x, y), z); }

// Aligned load, and an element-by-element unaligned load.
static INLINE vquad vload_vq_p(const Sleef_quad *ptr) { return *(vquad *)ptr; }
static INLINE vquad vloadu_vq_p(const Sleef_quad *ptr) {
  vquad result;
  for(int lane = 0; lane < VECTLENDP; lane++) result[lane] = ptr[lane];
  return result;
}

// Aligned store, element-by-element unaligned store, and the "streaming"
// store (no non-temporal form for float128, so it is a plain store).
static INLINE void vstore_v_p_vq(Sleef_quad *ptr, vquad v) { *(vquad *)ptr = v; }
static INLINE void vstoreu_v_p_vq(Sleef_quad *ptr, vquad v) {
  for(int lane = 0; lane < VECTLENDP; lane++) ptr[lane] = v[lane];
}
static INLINE void vstream_v_p_vq(Sleef_quad *ptr, vquad v) { *(vquad *)ptr = v; }

// Scatter the vector as VECTLENDP/2 adjacent pairs, strided by `step`.
static INLINE void vscatter2_v_p_i_i_vq(Sleef_quad *ptr, int offset, int step, vquad v) {
  for(int pair = 0; pair < VECTLENDP/2; pair++) {
    Sleef_quad *dst = ptr + (offset + step * pair) * 2;
    dst[0] = v[pair*2 + 0];
    dst[1] = v[pair*2 + 1];
  }
}

// Streaming scatter; forwards to the ordinary scatter.
static INLINE void vsscatter2_v_p_i_i_vq(Sleef_quad *ptr, int offset, int step, vquad v) { vscatter2_v_p_i_i_vq(ptr, offset, step, v); }
#endif
|
||||
@@ -0,0 +1,25 @@
|
||||
# Compiler properties

# Properties applied to every target in this directory.
set(COMMON_TARGET_PROPERTIES
  C_STANDARD 99 # -std=gnu99
)

# Object libraries only need PIC when they end up inside a shared library.
if (BUILD_SHARED_LIBS)
  list(APPEND COMMON_TARGET_PROPERTIES POSITION_INDEPENDENT_CODE ON) # -fPIC
endif()

# This is a workaround of appveyor bug
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SLEEF_C_FLAGS}")

# Target TARGET_LIBCOMMON_OBJ
# Object library with the shared runtime helpers (common.c).

add_library(${TARGET_LIBCOMMON_OBJ} OBJECT common.c)
set_target_properties(${TARGET_LIBCOMMON_OBJ} PROPERTIES ${COMMON_TARGET_PROPERTIES})

# Target TARGET_LIBARRAYMAP_OBJ
# Object library with the uint64->pointer map used by the dispatcher.

add_library(${TARGET_LIBARRAYMAP_OBJ} OBJECT arraymap.c)
set_target_properties(${TARGET_LIBARRAYMAP_OBJ} PROPERTIES ${COMMON_TARGET_PROPERTIES})

# addSuffix is a build-time tool, so it is compiled for the host even when
# cross-compiling.
add_host_executable("addSuffix" addSuffix.c)
set_target_properties("addSuffix" PROPERTIES C_STANDARD 99)
||||
@@ -0,0 +1,234 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#define N 1000
|
||||
|
||||
// fopen() wrapper tolerant of Cygwin-style paths when built with MinGW:
// if the path cannot be opened directly, it is converted with `cygpath -m`
// and reopened. On every other platform it is plain fopen().
// Returns NULL on failure, like fopen().
FILE *cygopen(const char *path, const char *mode) {
#if defined(__MINGW64__) || defined(__MINGW32__)
  FILE *fp = fopen(path, mode);
  if (fp != NULL) return fp;

  // Buffer holds first the cygpath command line, then the converted path.
  size_t bufsize = strlen(path) + N + 1;
  char *buf = malloc(bufsize);
  if (buf == NULL) return NULL;          // fix: malloc result was unchecked
  // fix: pass the real buffer size (was strlen(path) + N, one short)
  snprintf(buf, bufsize, "cygpath -m '%s'", path);

  FILE *pfp = popen(buf, "r");

  if (pfp == NULL || fgets(buf, N, pfp) == NULL) {
    if (pfp != NULL) pclose(pfp);
    free(buf);
    return NULL;
  }

  pclose(pfp);

  // Strip the trailing newline cygpath prints.
  int len = (int)strlen(buf);
  if (0 < len && len < N && buf[len-1] == '\n') buf[len-1] = '\0';

  fp = fopen(buf, mode);

  free(buf);

  return fp;
#else
  return fopen(path, mode);
#endif
}
|
||||
|
||||
// Keyword table and the suffix to emit after each keyword; filled by main().
int nkeywords = 0, nalloc = 0;
char **keywords = NULL, *suffix = NULL;

// Identifiers that must never receive the suffix.
int nIgnore = 0;
char **ignore = NULL;

// If `buf` is a registered keyword and not on the ignore list, emit the
// suffix on stdout (the identifier itself has already been copied through).
void insert(char *buf) {
  int k;

  for(k = 0; k < nIgnore; k++) {
    if (strcmp(ignore[k], buf) == 0) return;
  }

  for(k = 0; k < nkeywords; k++) {
    if (strcmp(keywords[k], buf) == 0) printf("%s", suffix);
  }
}
|
||||
|
||||
void doit(FILE *fp) {
|
||||
int state = 0;
|
||||
bool nl = true;
|
||||
char buf[N+10], *p = buf;
|
||||
|
||||
for(;;) {
|
||||
int c = getc(fp);
|
||||
if (c == EOF) break;
|
||||
switch(state) {
|
||||
case 0:
|
||||
if (isalnum(c) || c == '_') {
|
||||
ungetc(c, fp);
|
||||
p = buf;
|
||||
state = 1;
|
||||
break;
|
||||
}
|
||||
if (c == '/') {
|
||||
int c2 = getc(fp);
|
||||
if (c2 == '*') {
|
||||
putc(c, stdout);
|
||||
putc(c2, stdout);
|
||||
state = 4;
|
||||
break;
|
||||
} else if (c2 == '/') {
|
||||
putc(c, stdout);
|
||||
putc(c2, stdout);
|
||||
do {
|
||||
c = getc(fp);
|
||||
putc(c, stdout);
|
||||
} while(c != '\n');
|
||||
break;
|
||||
}
|
||||
ungetc(c2, fp);
|
||||
}
|
||||
if (nl && c == '#') {
|
||||
putc(c, stdout);
|
||||
do {
|
||||
c = getc(fp);
|
||||
putc(c, stdout);
|
||||
} while(c != '\n');
|
||||
break;
|
||||
}
|
||||
putc(c, stdout);
|
||||
if (!isspace(c)) nl = false;
|
||||
if (c == '\n') nl = true;
|
||||
if (c == '\"') state = 2;
|
||||
if (c == '\'') state = 3;
|
||||
break;
|
||||
|
||||
case 1: // Identifier
|
||||
if (isalnum(c) || c == '_') {
|
||||
if (p - buf < N) { *p++ = c; *p = '\0'; }
|
||||
putc(c, stdout);
|
||||
} else if (c == '\"') {
|
||||
insert(buf);
|
||||
putc(c, stdout);
|
||||
state = 2;
|
||||
} else if (c == '\'') {
|
||||
insert(buf);
|
||||
putc(c, stdout);
|
||||
state = 3;
|
||||
} else {
|
||||
insert(buf);
|
||||
putc(c, stdout);
|
||||
state = 0;
|
||||
}
|
||||
break;
|
||||
|
||||
case 2: // String
|
||||
if (c == '\\') {
|
||||
putc(c, stdout);
|
||||
putc(getc(fp), stdout);
|
||||
} else if (c == '\"') {
|
||||
putc(c, stdout);
|
||||
state = 0;
|
||||
} else {
|
||||
putc(c, stdout);
|
||||
}
|
||||
break;
|
||||
|
||||
case 3: // Character
|
||||
if (c == '\\') {
|
||||
putc(c, stdout);
|
||||
putc(getc(fp), stdout);
|
||||
} else if (c == '\'') {
|
||||
putc(c, stdout);
|
||||
state = 0;
|
||||
} else {
|
||||
putc(c, stdout);
|
||||
}
|
||||
break;
|
||||
|
||||
case 4: // Comment
|
||||
if (c == '*') {
|
||||
int c2 = getc(fp);
|
||||
if (c2 == '/') {
|
||||
putc(c, stdout);
|
||||
putc(c2, stdout);
|
||||
state = 0;
|
||||
break;
|
||||
}
|
||||
ungetc(c2, fp);
|
||||
}
|
||||
putc(c, stdout);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
nalloc = 1;
|
||||
keywords = malloc(sizeof(char *) * nalloc);
|
||||
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "%s <input file>\n", argv[0]);
|
||||
fprintf(stderr, "Print the file on the standard output\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s <input file> <keywords file> <suffix> [<keywords to ignore> ... ]\n", argv[0]);
|
||||
fprintf(stderr, "Add the suffix to keywords\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
char buf[N];
|
||||
|
||||
if (argc == 2) {
|
||||
FILE *fp = cygopen(argv[1], "r");
|
||||
if (fp == NULL) {
|
||||
fprintf(stderr, "Cannot open %s\n", argv[1]);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
while(fgets(buf, N, fp) != NULL) {
|
||||
fputs(buf, stdout);
|
||||
}
|
||||
fclose(fp);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
FILE *fp = cygopen(argv[2], "r");
|
||||
if (fp == NULL) {
|
||||
fprintf(stderr, "Cannot open %s\n", argv[2]);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
while(fgets(buf, N, fp) != NULL) {
|
||||
if (strlen(buf) >= 1) buf[strlen(buf)-1] = '\0';
|
||||
keywords[nkeywords] = malloc(sizeof(char) * (strlen(buf) + 1));
|
||||
strcpy(keywords[nkeywords], buf);
|
||||
nkeywords++;
|
||||
if (nkeywords >= nalloc) {
|
||||
nalloc *= 2;
|
||||
keywords = realloc(keywords, sizeof(char *) * nalloc);
|
||||
}
|
||||
}
|
||||
|
||||
fclose(fp);
|
||||
|
||||
nIgnore = argc - 4;
|
||||
ignore = argv + 4;
|
||||
|
||||
suffix = argv[3];
|
||||
|
||||
fp = cygopen(argv[1], "r");
|
||||
if (fp == NULL) {
|
||||
fprintf(stderr, "Cannot open %s\n", argv[1]);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
doit(fp);
|
||||
|
||||
fclose(fp);
|
||||
|
||||
exit(0);
|
||||
}
|
||||
|
||||
// cat sleef*inline*.h | egrep -o '[a-zA-Z_][0-9a-zA-Z_]*' | sort | uniq > cand.txt
|
||||
@@ -0,0 +1,347 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
#include <inttypes.h>
|
||||
#include <assert.h>
|
||||
|
||||
//
|
||||
|
||||
#if !(defined(__MINGW32__) || defined(__MINGW64__) || defined(_MSC_VER))
|
||||
#include <unistd.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/file.h>
|
||||
|
||||
static void FLOCK(FILE *fp) { flock(fileno(fp), LOCK_EX); }
|
||||
static void FUNLOCK(FILE *fp) { flock(fileno(fp), LOCK_UN); }
|
||||
static void FTRUNCATE(FILE *fp, off_t z) {
|
||||
if (ftruncate(fileno(fp), z))
|
||||
;
|
||||
}
|
||||
static FILE *OPENTMPFILE() { return tmpfile(); }
|
||||
static void CLOSETMPFILE(FILE *fp) { fclose(fp); }
|
||||
#else
|
||||
#include <windows.h>
|
||||
#include <io.h>
|
||||
|
||||
static void FLOCK(FILE *fp) { }
|
||||
static void FUNLOCK(FILE *fp) { }
|
||||
static void FTRUNCATE(FILE *fp, long z) {
|
||||
fseek(fp, 0, SEEK_SET);
|
||||
SetEndOfFile((HANDLE)_get_osfhandle(_fileno(fp)));
|
||||
}
|
||||
static FILE *OPENTMPFILE() { return fopen("tmpfile.txt", "w+"); }
|
||||
static void CLOSETMPFILE(FILE *fp) {
|
||||
fclose(fp);
|
||||
remove("tmpfile.txt");
|
||||
}
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
// Magic tags stored in live objects to detect corruption and use-after-free.
#define MAGIC_ARRAYMAPNODE 0xf73130fa
#define MAGIC_ARRAYMAP 0x8693bd21
// The map is split into NBUCKETS (256) independent arrays.
#define LOGNBUCKETS 8
#define NBUCKETS (1 << LOGNBUCKETS)

// Bucket index for a key: XOR-fold four byte-shifted copies of the key,
// then keep the low LOGNBUCKETS bits.
static int hash(uint64_t key) {
  uint64_t folded = key;
  folded ^= key >> LOGNBUCKETS;
  folded ^= key >> (LOGNBUCKETS*2);
  folded ^= key >> (LOGNBUCKETS*3);
  return (int)(folded & (NBUCKETS-1));
}
|
||||
|
||||
// Strip leading and trailing whitespace from `str` in place.
static void String_trim(char *str) {
  char *out = str;
  const char *in = str;
  char *endmark = str;   // one past the last non-space byte written

  // Skip leading whitespace.
  while (*in != '\0' && isspace((int)*in)) in++;

  // Shift the remainder down, tracking where the last non-space ended.
  while (*in != '\0') {
    *out++ = *in;
    if (!isspace((int)*in)) endmark = out;
    in++;
  }

  // Cut the string right after the last non-space character.
  *endmark = '\0';
}
|
||||
|
||||
typedef struct ArrayMapNode {
|
||||
uint32_t magic;
|
||||
uint64_t key;
|
||||
void *value;
|
||||
} ArrayMapNode;
|
||||
|
||||
typedef struct ArrayMap {
|
||||
uint32_t magic;
|
||||
ArrayMapNode *array[NBUCKETS];
|
||||
int size[NBUCKETS], capacity[NBUCKETS], totalSize;
|
||||
} ArrayMap;
|
||||
|
||||
ArrayMap *initArrayMap() {
|
||||
ArrayMap *thiz = (ArrayMap *)calloc(1, sizeof(ArrayMap));
|
||||
thiz->magic = MAGIC_ARRAYMAP;
|
||||
|
||||
for(int i=0;i<NBUCKETS;i++) {
|
||||
thiz->capacity[i] = 8;
|
||||
thiz->array[i] = (ArrayMapNode *)malloc(thiz->capacity[i] * sizeof(ArrayMapNode));
|
||||
thiz->size[i] = 0;
|
||||
}
|
||||
|
||||
thiz->totalSize = 0;
|
||||
return thiz;
|
||||
}
|
||||
|
||||
void ArrayMap_dispose(ArrayMap *thiz) {
|
||||
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
|
||||
|
||||
for(int j=0;j<NBUCKETS;j++) {
|
||||
for(int i=0;i<thiz->size[j];i++) {
|
||||
assert(thiz->array[j][i].magic == MAGIC_ARRAYMAPNODE);
|
||||
thiz->array[j][i].magic = 0;
|
||||
}
|
||||
free(thiz->array[j]);
|
||||
}
|
||||
|
||||
thiz->magic = 0;
|
||||
free(thiz);
|
||||
}
|
||||
|
||||
int ArrayMap_size(ArrayMap *thiz) {
|
||||
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
|
||||
return thiz->totalSize;
|
||||
}
|
||||
|
||||
uint64_t *ArrayMap_keyArray(ArrayMap *thiz) {
|
||||
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
|
||||
uint64_t *a = (uint64_t *)malloc(sizeof(uint64_t) * thiz->totalSize);
|
||||
int p = 0;
|
||||
for(int j=0;j<NBUCKETS;j++) {
|
||||
for(int i=0;i<thiz->size[j];i++) {
|
||||
assert(thiz->array[j][i].magic == MAGIC_ARRAYMAPNODE);
|
||||
a[p++] = thiz->array[j][i].key;
|
||||
}
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
void **ArrayMap_valueArray(ArrayMap *thiz) {
|
||||
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
|
||||
void **a = (void **)malloc(sizeof(void *) * thiz->totalSize);
|
||||
int p = 0;
|
||||
for(int j=0;j<NBUCKETS;j++) {
|
||||
for(int i=0;i<thiz->size[j];i++) {
|
||||
assert(thiz->array[j][i].magic == MAGIC_ARRAYMAPNODE);
|
||||
a[p++] = thiz->array[j][i].value;
|
||||
}
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
void *ArrayMap_remove(ArrayMap *thiz, uint64_t key) {
|
||||
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
|
||||
|
||||
int h = hash(key);
|
||||
for(int i=0;i<thiz->size[h];i++) {
|
||||
assert(thiz->array[h][i].magic == MAGIC_ARRAYMAPNODE);
|
||||
if (thiz->array[h][i].key == key) {
|
||||
void *old = thiz->array[h][i].value;
|
||||
thiz->array[h][i].key = thiz->array[h][thiz->size[h]-1].key;
|
||||
thiz->array[h][i].value = thiz->array[h][thiz->size[h]-1].value;
|
||||
thiz->array[h][thiz->size[h]-1].magic = 0;
|
||||
thiz->size[h]--;
|
||||
thiz->totalSize--;
|
||||
return old;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void *ArrayMap_put(ArrayMap *thiz, uint64_t key, void *value) {
|
||||
if (value == NULL) return ArrayMap_remove(thiz, key);
|
||||
|
||||
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
|
||||
|
||||
int h = hash(key);
|
||||
for(int i=0;i<thiz->size[h];i++) {
|
||||
assert(thiz->array[h][i].magic == MAGIC_ARRAYMAPNODE);
|
||||
if (thiz->array[h][i].key == key) {
|
||||
void *old = thiz->array[h][i].value;
|
||||
thiz->array[h][i].value = value;
|
||||
return old;
|
||||
}
|
||||
}
|
||||
|
||||
if (thiz->size[h] >= thiz->capacity[h]) {
|
||||
thiz->capacity[h] *= 2;
|
||||
thiz->array[h] = (ArrayMapNode *)realloc(thiz->array[h], thiz->capacity[h] * sizeof(ArrayMapNode));
|
||||
}
|
||||
|
||||
ArrayMapNode *n = &(thiz->array[h][thiz->size[h]++]);
|
||||
n->magic = MAGIC_ARRAYMAPNODE;
|
||||
n->key = key;
|
||||
n->value = value;
|
||||
|
||||
thiz->totalSize++;
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void *ArrayMap_get(ArrayMap *thiz, uint64_t key) {
|
||||
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
|
||||
|
||||
int h = hash(key);
|
||||
for(int i=0;i<thiz->size[h];i++) {
|
||||
assert(thiz->array[h][i].magic == MAGIC_ARRAYMAPNODE);
|
||||
if (thiz->array[h][i].key == key) {
|
||||
return thiz->array[h][i].value;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#define LINELEN (1024*1024)
|
||||
|
||||
ArrayMap *ArrayMap_load(const char *fn, const char *prefix, const char *idstr, int doLock) {
|
||||
const int idstrlen = (int)strlen(idstr);
|
||||
int prefixLen = (int)strlen(prefix) + 3;
|
||||
|
||||
if (prefixLen >= LINELEN-10 || idstrlen >= LINELEN-10) return NULL;
|
||||
|
||||
FILE *fp = fopen(fn, "r");
|
||||
if (fp == NULL) return NULL;
|
||||
|
||||
if (doLock) FLOCK(fp);
|
||||
|
||||
ArrayMap *thiz = initArrayMap();
|
||||
|
||||
char *prefix2 = malloc(prefixLen+10);
|
||||
strcpy(prefix2, prefix);
|
||||
String_trim(prefix2);
|
||||
for(char *p = prefix2;*p != '\0';p++) {
|
||||
if (*p == ':') *p = ';';
|
||||
if (*p == ' ') *p = '_';
|
||||
}
|
||||
strcat(prefix2, " : ");
|
||||
prefixLen = (int)strlen(prefix2);
|
||||
|
||||
char *line = malloc(sizeof(char) * (LINELEN+10));
|
||||
line[idstrlen] = '\0';
|
||||
|
||||
if (fread(line, sizeof(char), idstrlen, fp) != idstrlen ||
|
||||
strcmp(idstr, line) != 0) {
|
||||
if (doLock) FUNLOCK(fp);
|
||||
fclose(fp);
|
||||
free(prefix2);
|
||||
free(line);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
for(;;) {
|
||||
line[LINELEN] = '\0';
|
||||
if (fgets(line, LINELEN, fp) == NULL) break;
|
||||
if (strncmp(line, prefix2, prefixLen) != 0) continue;
|
||||
|
||||
uint64_t key;
|
||||
char *value = malloc(sizeof(char) * LINELEN);
|
||||
|
||||
if (sscanf(line + prefixLen, "%" SCNx64 " : %s\n", &key, value) == 2) {
|
||||
ArrayMap_put(thiz, (uint64_t)key, (void *)value);
|
||||
} else {
|
||||
free(value);
|
||||
}
|
||||
}
|
||||
|
||||
if (doLock) FUNLOCK(fp);
|
||||
fclose(fp);
|
||||
|
||||
free(prefix2);
|
||||
free(line);
|
||||
|
||||
return thiz;
|
||||
}
|
||||
|
||||
// Persist the map into `fn` under the normalized `prefix`, keeping lines
// belonging to other prefixes intact. The whole operation happens under
// FLOCK: existing foreign lines are staged in a temp file, the original file
// is truncated and rewritten with `idstr` + staged lines + this map's
// entries. Returns 0 on success, -1 on failure.
int ArrayMap_save(ArrayMap *thiz, const char *fn, const char *prefix, const char *idstr) {
  assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);

  const int idstrlen = (int)strlen(idstr);
  int prefixLen = (int)strlen(prefix) + 3;

  if (prefixLen >= LINELEN-10 || idstrlen >= LINELEN-10) return -1;

  // Generate prefix2: trimmed, with ':' and ' ' replaced (they would collide
  // with the field separators), then the " : " separator appended.

  char *prefix2 = malloc(prefixLen+10);
  strcpy(prefix2, prefix);
  String_trim(prefix2);
  for(char *p = prefix2;*p != '\0';p++) {
    if (*p == ':') *p = ';';
    if (*p == ' ') *p = '_';
  }
  strcat(prefix2, " : ");
  prefixLen = (int)strlen(prefix2);

  //

  FILE *fp = fopen(fn, "a+");
  if (fp == NULL) {
    free(prefix2);   // fix: prefix2 was leaked on this path
    return -1;
  }

  FLOCK(fp);
  fseek(fp, 0, SEEK_SET);

  // Copy the file specified by fn to tmpfile

  FILE *tmpfp = OPENTMPFILE();
  if (tmpfp == NULL) {
    FUNLOCK(fp);
    fclose(fp);
    free(prefix2);   // fix: prefix2 was leaked on this path
    return -1;
  }

  char *line = malloc(sizeof(char) * (LINELEN+10));
  line[idstrlen] = '\0';

  // Keep only lines that do NOT belong to our prefix (they will be written
  // back, followed by our fresh entries). A file with a wrong/missing header
  // is discarded wholesale.
  if (fread(line, sizeof(char), idstrlen, fp) == idstrlen && strcmp(idstr, line) == 0) {
    for(;;) {
      line[LINELEN] = '\0';
      if (fgets(line, LINELEN, fp) == NULL) break;
      if (strncmp(line, prefix2, prefixLen) != 0) fputs(line, tmpfp);
    }
  }

  // Write the contents in the map into tmpfile

  uint64_t *keys = ArrayMap_keyArray(thiz);
  int nkeys = ArrayMap_size(thiz);

  for(int i = 0; i < nkeys; i++) {
    char *value = ArrayMap_get(thiz, keys[i]);
    if (strlen(value) + prefixLen >= LINELEN-10) continue;  // skip oversize rows
    fprintf(tmpfp, "%s %" PRIx64 " : %s\n", prefix2, keys[i], value);
  }

  free(keys);

  // Rewrite fn from scratch: header first, then the staged temp file.
  fseek(fp, 0, SEEK_SET);
  FTRUNCATE(fp, 0);
  fwrite(idstr, sizeof(char), strlen(idstr), fp);

  fseek(tmpfp, 0, SEEK_SET);

  for(;;) {
    // fix: renamed from `s`, which shadowed the entry count above
    size_t nread = fread(line, 1, LINELEN, tmpfp);
    if (nread == 0) break;
    fwrite(line, 1, nread, fp);
  }

  FUNLOCK(fp);
  fclose(fp);

  CLOSETMPFILE(tmpfp);
  free(prefix2);
  free(line);
  return 0;
}
|
||||
@@ -0,0 +1,21 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#ifndef __ARRAYMAP_H__
#define __ARRAYMAP_H__

#include <stdint.h>  // fix: uint64_t is used below, but the header did not
                     // include <stdint.h>, forcing every includer to do so

// Opaque hash map from uint64_t keys to void * values (see arraymap.c).
typedef struct ArrayMap ArrayMap;

// Lifecycle and basic operations. Putting a NULL value removes the key;
// get/remove return NULL when the key is absent.
ArrayMap *initArrayMap();
void ArrayMap_dispose(ArrayMap *thiz);
int ArrayMap_size(ArrayMap *thiz);
void *ArrayMap_remove(ArrayMap *thiz, uint64_t key);
void *ArrayMap_put(ArrayMap *thiz, uint64_t key, void *value);
void *ArrayMap_get(ArrayMap *thiz, uint64_t key);

// Snapshots (caller frees) and text-file persistence.
uint64_t *ArrayMap_keyArray(ArrayMap *thiz);
void **ArrayMap_valueArray(ArrayMap *thiz);
int ArrayMap_save(ArrayMap *thiz, const char *fn, const char *prefix, const char *idstr);
ArrayMap *ArrayMap_load(const char *fn, const char *prefix, const char *idstr, int doLock);
#endif
|
||||
@@ -0,0 +1,98 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "misc.h"
|
||||
|
||||
#if defined(__MINGW32__) || defined(__MINGW64__) || defined(_MSC_VER)
#include <sys/timeb.h>

// Windows: 256-byte-aligned allocation through the CRT aligned heap.
EXPORT void *Sleef_malloc(size_t z) { return _aligned_malloc(z, 256); }
EXPORT void Sleef_free(void *ptr) { _aligned_free(ptr); }

// Wall-clock microseconds (millisecond resolution from _ftime64).
EXPORT uint64_t Sleef_currentTimeMicros() {
  struct __timeb64 t;
  _ftime64(&t);
  return t.time * INT64_C(1000000) + t.millitm*1000;
}
#elif defined(__APPLE__)
#include <sys/time.h>

// macOS: 256-byte alignment via posix_memalign. NOTE(review): the
// posix_memalign return value is ignored; on failure ptr stays NULL and
// NULL is returned, like malloc.
EXPORT void *Sleef_malloc(size_t z) { void *ptr = NULL; posix_memalign(&ptr, 256, z); return ptr; }
EXPORT void Sleef_free(void *ptr) { free(ptr); }

// Wall-clock microseconds via gettimeofday.
EXPORT uint64_t Sleef_currentTimeMicros() {
  struct timeval time;
  gettimeofday(&time, NULL);
  return (uint64_t)((time.tv_sec * INT64_C(1000000)) + time.tv_usec);
}
#else // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(_MSC_VER)
#include <time.h>
#include <unistd.h>
#if defined(__FreeBSD__) || defined(__OpenBSD__)
#include <stdlib.h>
#else
#include <malloc.h>
#endif

// Other POSIX systems: page-size (4096-byte) alignment.
EXPORT void *Sleef_malloc(size_t z) { void *ptr = NULL; posix_memalign(&ptr, 4096, z); return ptr; }
EXPORT void Sleef_free(void *ptr) { free(ptr); }

// CLOCK_MONOTONIC microseconds — unlike the Windows/macOS branches above,
// this is not wall-clock time.
EXPORT uint64_t Sleef_currentTimeMicros() {
  struct timespec tp;
  clock_gettime(CLOCK_MONOTONIC, &tp);
  return (uint64_t)tp.tv_sec * INT64_C(1000000) + ((uint64_t)tp.tv_nsec/1000);
}
#endif // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(_MSC_VER)
|
||||
|
||||
#ifdef _MSC_VER
#include <intrin.h>
// CPUID via the MSVC intrinsic; out receives EAX, EBX, ECX, EDX.
EXPORT void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) {
  __cpuidex(out, eax, ecx);
}
#else
#if defined(__x86_64__) || defined(__i386__)
// CPUID via inline assembly on GCC/Clang x86 targets; out receives
// EAX, EBX, ECX, EDX for leaf `eax`, subleaf `ecx`.
EXPORT void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) {
  uint32_t a, b, c, d;
  __asm__ __volatile__ ("cpuid" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "a" (eax), "c"(ecx));
  out[0] = a; out[1] = b; out[2] = c; out[3] = d;
}
#endif
#endif
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER)
|
||||
static char x86BrandString[256];
|
||||
|
||||
EXPORT char *Sleef_getCpuIdString() {
|
||||
union {
|
||||
int32_t info[4];
|
||||
uint8_t str[16];
|
||||
} u;
|
||||
int i,j;
|
||||
char *p;
|
||||
|
||||
p = x86BrandString;
|
||||
|
||||
for(i=0;i<3;i++) {
|
||||
Sleef_x86CpuID(u.info, i + 0x80000002, 0);
|
||||
|
||||
for(j=0;j<16;j++) {
|
||||
*p++ = u.str[j];
|
||||
}
|
||||
}
|
||||
|
||||
*p++ = '\n';
|
||||
|
||||
return x86BrandString;
|
||||
}
|
||||
#else
|
||||
EXPORT char *Sleef_getCpuIdString() {
|
||||
return "Unknown architecture";
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,9 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#ifndef __COMMON_H__
#define __COMMON_H__
// Returns a human-readable CPU identification string (static buffer on x86,
// a fixed placeholder elsewhere). Defined in common.c.
char *Sleef_getCpuIdString();
#endif
|
||||
@@ -0,0 +1,438 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2023.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
// On SVE/RVV these aggregate types and accessors come from the target
// headers instead (sizeless vector types cannot be struct members there).
#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA))
// Triple of double vectors (x, y, z) — used elsewhere as a triple-double
// number; getters/setters below keep that usage uniform across targets.
typedef struct {
  vdouble x, y, z;
} vdouble3;

static INLINE CONST VECTOR_CC vdouble vd3getx_vd_vd3(vdouble3 v) { return v.x; }
static INLINE CONST VECTOR_CC vdouble vd3gety_vd_vd3(vdouble3 v) { return v.y; }
static INLINE CONST VECTOR_CC vdouble vd3getz_vd_vd3(vdouble3 v) { return v.z; }
static INLINE CONST VECTOR_CC vdouble3 vd3setxyz_vd3_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  vdouble3 v = { x, y, z };
  return v;
}
static INLINE CONST VECTOR_CC vdouble3 vd3setx_vd3_vd3_vd(vdouble3 v, vdouble d) { v.x = d; return v; }
static INLINE CONST VECTOR_CC vdouble3 vd3sety_vd3_vd3_vd(vdouble3 v, vdouble d) { v.y = d; return v; }
static INLINE CONST VECTOR_CC vdouble3 vd3setz_vd3_vd3_vd(vdouble3 v, vdouble d) { v.z = d; return v; }

//

// Pair of double-double values (a, b).
typedef struct {
  vdouble2 a, b;
} dd2;

static dd2 dd2setab_dd2_vd2_vd2(vdouble2 a, vdouble2 b) {
  dd2 r = { a, b };
  return r;
}
static vdouble2 dd2geta_vd2_dd2(dd2 d) { return d.a; }
static vdouble2 dd2getb_vd2_dd2(dd2 d) { return d.b; }

//

// Extended value: a mask `e` alongside a vdouble3 `d3` — presumably a
// separate exponent plus triple-double significand; confirm against users.
typedef struct {
  vmask e;
  vdouble3 d3;
} tdx;

static INLINE CONST VECTOR_CC vmask tdxgete_vm_tdx(tdx t) { return t.e; }
static INLINE CONST VECTOR_CC vdouble3 tdxgetd3_vd3_tdx(tdx t) { return t.d3; }
static INLINE CONST VECTOR_CC vdouble tdxgetd3x_vd_tdx(tdx t) { return t.d3.x; }
static INLINE CONST VECTOR_CC vdouble tdxgetd3y_vd_tdx(tdx t) { return t.d3.y; }
static INLINE CONST VECTOR_CC vdouble tdxgetd3z_vd_tdx(tdx t) { return t.d3.z; }
static INLINE CONST VECTOR_CC tdx tdxsete_tdx_tdx_vm(tdx t, vmask e) { t.e = e; return t; }
static INLINE CONST VECTOR_CC tdx tdxsetd3_tdx_tdx_vd3(tdx t, vdouble3 d3) { t.d3 = d3; return t; }
static INLINE CONST VECTOR_CC tdx tdxsetx_tdx_tdx_vd(tdx t, vdouble x) { t.d3.x = x; return t; }
static INLINE CONST VECTOR_CC tdx tdxsety_tdx_tdx_vd(tdx t, vdouble y) { t.d3.y = y; return t; }
static INLINE CONST VECTOR_CC tdx tdxsetz_tdx_tdx_vd(tdx t, vdouble z) { t.d3.z = z; return t; }
static INLINE CONST VECTOR_CC tdx tdxsetxyz_tdx_tdx_vd_vd_vd(tdx t, vdouble x, vdouble y, vdouble z) {
  t.d3 = (vdouble3) { x, y, z };
  return t;
}

static INLINE CONST VECTOR_CC tdx tdxseted3_tdx_vm_vd3(vmask e, vdouble3 d3) { return (tdx) { e, d3 }; }
static INLINE CONST VECTOR_CC tdx tdxsetexyz_tdx_vm_vd_vd_vd(vmask e, vdouble x, vdouble y, vdouble z) {
  return (tdx) { e, (vdouble3) { x, y, z } };
}

// vquad accessors: a quad-width value carried as two masks x (low), y (high).
static INLINE CONST VECTOR_CC vmask vqgetx_vm_vq(vquad v) { return v.x; }
static INLINE CONST VECTOR_CC vmask vqgety_vm_vq(vquad v) { return v.y; }
static INLINE CONST VECTOR_CC vquad vqsetxy_vq_vm_vm(vmask x, vmask y) { return (vquad) { x, y }; }
static INLINE CONST VECTOR_CC vquad vqsetx_vq_vq_vm(vquad v, vmask x) { v.x = x; return v; }
static INLINE CONST VECTOR_CC vquad vqsety_vq_vq_vm(vquad v, vmask y) { v.y = y; return v; }

//

// Double vector paired with an int vector.
typedef struct {
  vdouble d;
  vint i;
} di_t;

static INLINE CONST VECTOR_CC vdouble digetd_vd_di(di_t d) { return d.d; }
static INLINE CONST VECTOR_CC vint digeti_vi_di(di_t d) { return d.i; }
static INLINE CONST VECTOR_CC di_t disetdi_di_vd_vi(vdouble d, vint i) {
  di_t r = { d, i };
  return r;
}

//

// Double-double vector paired with an int vector.
typedef struct {
  vdouble2 dd;
  vint i;
} ddi_t;

static INLINE CONST VECTOR_CC vdouble2 ddigetdd_vd2_ddi(ddi_t d) { return d.dd; }
static INLINE CONST VECTOR_CC vint ddigeti_vi_ddi(ddi_t d) { return d.i; }
static INLINE CONST VECTOR_CC ddi_t ddisetddi_ddi_vd2_vi(vdouble2 v, vint i) {
  ddi_t r = { v, i };
  return r;
}
static INLINE CONST VECTOR_CC ddi_t ddisetdd_ddi_ddi_vd2(ddi_t ddi, vdouble2 v) {
  ddi.dd = v;
  return ddi;
}

//

// Triple-double vector paired with an int vector.
typedef struct {
  vdouble3 td;
  vint i;
} tdi_t;

static INLINE CONST VECTOR_CC vdouble3 tdigettd_vd3_tdi(tdi_t d) { return d.td; }
static INLINE CONST VECTOR_CC vdouble tdigetx_vd_tdi(tdi_t d) { return d.td.x; }
static INLINE CONST VECTOR_CC vint tdigeti_vi_tdi(tdi_t d) { return d.i; }
static INLINE CONST VECTOR_CC tdi_t tdisettdi_tdi_vd3_vi(vdouble3 v, vint i) {
  tdi_t r = { v, i };
  return r;
}
#endif
|
||||
|
||||
#if defined(ENABLE_MAIN)
|
||||
// Functions for debugging
|
||||
#include <stdio.h>
|
||||
#include <wchar.h>
|
||||
|
||||
static void printvmask(char *mes, vmask g) {
|
||||
uint64_t u[VECTLENDP];
|
||||
vstoreu_v_p_vd((double *)u, vreinterpret_vd_vm(g));
|
||||
printf("%s ", mes);
|
||||
for(int i=0;i<VECTLENDP;i++) printf("%016lx : ", (unsigned long)u[i]);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
#if !defined(ENABLE_SVE)
|
||||
static void printvopmask(char *mes, vopmask g) {
|
||||
union {
|
||||
vopmask g;
|
||||
uint8_t u[sizeof(vopmask)];
|
||||
} cnv = { .g = g };
|
||||
printf("%s ", mes);
|
||||
for(int i=0;i<sizeof(vopmask);i++) printf("%02x", cnv.u[i]);
|
||||
printf("\n");
|
||||
}
|
||||
#else
|
||||
static void printvopmask(char *mes, vopmask g) {
|
||||
vmask m = vand_vm_vo64_vm(g, vcast_vm_i64(-1));
|
||||
printvmask(mes, m);
|
||||
}
|
||||
#endif
|
||||
|
||||
static void printvdouble(char *mes, vdouble vd) {
|
||||
double u[VECTLENDP];
|
||||
vstoreu_v_p_vd((double *)u, vd);
|
||||
printf("%s ", mes);
|
||||
for(int i=0;i<VECTLENDP;i++) printf("%.20g : ", u[i]);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
static void printvint(char *mes, vint vi) {
|
||||
uint32_t u[VECTLENDP];
|
||||
vstoreu_v_p_vi((int32_t *)u, vi);
|
||||
printf("%s ", mes);
|
||||
for(int i=0;i<VECTLENDP;i++) printf("%08x : ", (unsigned)u[i]);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
static void printvint64(char *mes, vint64 vi) {
|
||||
uint64_t u[VECTLENDP*2];
|
||||
vstoreu_v_p_vd((double *)u, vreinterpret_vd_vm(vreinterpret_vm_vi64(vi)));
|
||||
printf("%s ", mes);
|
||||
for(int i=0;i<VECTLENDP;i++) printf("%016lx : ", (unsigned long)u[i]);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
static void printvquad(char *mes, vquad g) {
|
||||
uint64_t u[VECTLENDP*2];
|
||||
vstoreu_v_p_vd((double *)u, vreinterpret_vd_vm(vqgetx_vm_vq(g)));
|
||||
vstoreu_v_p_vd((double *)&u[VECTLENDP], vreinterpret_vd_vm(vqgety_vm_vq(g)));
|
||||
printf("%s ", mes);
|
||||
for(int i=0;i<VECTLENDP*2;i++) printf("%016lx : ", (unsigned long)(u[i]));
|
||||
printf("\n");
|
||||
}
|
||||
#endif // #if defined(ENABLE_MAIN)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// vdouble functions
|
||||
|
||||
// True where d is exactly -0.0 (bitwise compare against the -0.0 pattern).
static INLINE CONST VECTOR_CC vopmask visnegzero_vo_vd(vdouble d) {
  return veq64_vo_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0)));
}

// True where x is finite (not inf, and x == x rules out NaN).
static INLINE CONST VECTOR_CC vopmask visnumber_vo_vd(vdouble x) {
  return vandnot_vo_vo_vo(visinf_vo_vd(x), veq_vo_vd_vd(x, x));
}

// True where x is inf or NaN: the exponent field (0x7ff0...) is all ones.
static INLINE CONST vopmask visnonfinite_vo_vd(vdouble x) {
  return veq64_vo_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vd(x), vcast_vm_i64(INT64_C(0x7ff0000000000000))), vcast_vm_i64(INT64_C(0x7ff0000000000000)));
}

// Isolate the sign bit of d as a mask (-0.0 has only the sign bit set).
static INLINE CONST vmask vsignbit_vm_vd(vdouble d) {
  return vand_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0)));
}

// True where the sign bit of d is set.
static INLINE CONST vopmask vsignbit_vo_vd(vdouble d) {
  return veq64_vo_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vcast_vd_d(-0.0)));
}

// Clear the lowest n bits of d's representation (rounds the mantissa down).
static INLINE CONST vdouble vclearlsb_vd_vd_i(vdouble d, int n) {
  return vreinterpret_vd_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vcast_vm_u64((~UINT64_C(0)) << n)));
}

static INLINE CONST VECTOR_CC vdouble vtoward0_vd_vd(vdouble x) { // returns nextafter(x, 0)
  // Subtracting 1 from the integer representation moves one ulp toward 0;
  // zero must be special-cased (its representation minus 1 would wrap).
  vdouble t = vreinterpret_vd_vm(vadd64_vm_vm_vm(vreinterpret_vm_vd(x), vcast_vm_i64(-1)));
  return vsel_vd_vo_vd_vd(veq_vo_vd_vd(x, vcast_vd_d(0)), vcast_vd_d(0), t);
}

// RVV targets supply their own vmulsign/vorsign/vcopysign.
#if !(defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA))
// x with its sign flipped where y is negative (XOR of y's sign bit).
static INLINE CONST vdouble vmulsign_vd_vd_vd(vdouble x, vdouble y) {
  return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(x), vsignbit_vm_vd(y)));
}
#endif

// +1.0 or -1.0 with the sign of d.
static INLINE CONST VECTOR_CC vdouble vsign_vd_vd(vdouble d) {
  return vmulsign_vd_vd_vd(vcast_vd_d(1.0), d);
}

#if !(defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA))
// x with its sign bit ORed with y's sign bit.
static INLINE CONST VECTOR_CC vdouble vorsign_vd_vd_vd(vdouble x, vdouble y) {
  return vreinterpret_vd_vm(vor_vm_vm_vm(vreinterpret_vm_vd(x), vsignbit_vm_vd(y)));
}

// Magnitude of x combined with the sign of y (bitwise copysign).
static INLINE CONST VECTOR_CC vdouble vcopysign_vd_vd_vd(vdouble x, vdouble y) {
  return vreinterpret_vd_vm(vxor_vm_vm_vm(vandnot_vm_vm_vm(vreinterpret_vm_vd(vcast_vd_d(-0.0)), vreinterpret_vm_vd(x)),
					  vand_vm_vm_vm (vreinterpret_vm_vd(vcast_vd_d(-0.0)), vreinterpret_vm_vd(y))));
}
#endif

// Truncation emulated with int32 conversion: x is split at 2^31 so the
// fractional part fits an int; |x| >= 2^52 (already integral) and infinities
// pass through unchanged, and the sign is restored with copysign so -0.0 is
// preserved.
static INLINE CONST VECTOR_CC vdouble vtruncate2_vd_vd(vdouble x) {
#ifdef FULL_FP_ROUNDING
  return vtruncate_vd_vd(x);
#else
  vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
  return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52))), x, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x));
#endif
}

// floor() by the same 2^31-split scheme: a negative fractional part is
// shifted into [0,1) so subtracting it rounds toward -inf.
static INLINE CONST VECTOR_CC vdouble vfloor2_vd_vd(vdouble x) {
  vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
  fr = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(fr, vcast_vd_d(0)), vadd_vd_vd_vd(fr, vcast_vd_d(1.0)), fr);
  return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52))), x, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x));
}

// ceil() by the same scheme: a positive fractional part is shifted into
// (-1,0] so subtracting it rounds toward +inf.
static INLINE CONST VECTOR_CC vdouble vceil2_vd_vd(vdouble x) {
  vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
  fr = vsel_vd_vo_vd_vd(vle_vo_vd_vd(fr, vcast_vd_d(0)), fr, vsub_vd_vd_vd(fr, vcast_vd_d(1.0)));
  return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52))), x, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x));
}
|
||||
|
||||
static INLINE CONST VECTOR_CC vdouble vround2_vd_vd(vdouble d) {
|
||||
vdouble x = vadd_vd_vd_vd(d, vcast_vd_d(0.5));
|
||||
vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
|
||||
fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
|
||||
x = vsel_vd_vo_vd_vd(vand_vo_vo_vo(vle_vo_vd_vd(x, vcast_vd_d(0)), veq_vo_vd_vd(fr, vcast_vd_d(0))), vsub_vd_vd_vd(x, vcast_vd_d(1.0)), x);
|
||||
fr = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(fr, vcast_vd_d(0)), vadd_vd_vd_vd(fr, vcast_vd_d(1.0)), fr);
|
||||
x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0.49999999999999994449)), vcast_vd_d(0), x);
|
||||
return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(d), vge_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(INT64_C(1) << 52))), d, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), d));
|
||||
}
|
||||
|
||||
static INLINE CONST VECTOR_CC vdouble vrint2_vd_vd(vdouble d) {
|
||||
#ifdef FULL_FP_ROUNDING
|
||||
return vrint_vd_vd(d);
|
||||
#else
|
||||
vdouble c = vmulsign_vd_vd_vd(vcast_vd_d(INT64_C(1) << 52), d);
|
||||
return vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(INT64_C(1) << 52)),
|
||||
d, vorsign_vd_vd_vd(vsub_vd_vd_vd(vadd_vd_vd_vd(d, c), c), d));
|
||||
#endif
|
||||
}
|
||||
|
||||
// True where d is an integer (rounding to nearest leaves it unchanged).
static INLINE CONST VECTOR_CC vopmask visint_vo_vd(vdouble d) {
  return veq_vo_vd_vd(vrint2_vd_vd(d), d);
}

// True where d is an odd integer (d/2 is not an integer). Meaningful only
// for integral d; for non-integers the mask is simply "d/2 not integral".
static INLINE CONST VECTOR_CC vopmask visodd_vo_vd(vdouble d) {
  vdouble x = vmul_vd_vd_vd(d, vcast_vd_d(0.5));
  return vneq_vo_vd_vd(vrint2_vd_vd(x), x);
}
|
||||
|
||||
// ilogb
|
||||
|
||||
#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
// ilogb for normal handling: extracts the unbiased binary exponent.
// Subnormal-range inputs (|d| < 2^-300 ~ 4.909e-91) are first scaled by
// 2^300 ~ 2.037e90 so their exponent field is valid, then 300 is
// subtracted back along with the bias.
static INLINE CONST VECTOR_CC vint vilogbk_vi_vd(vdouble d) {
  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(4.9090934652977266E-91));
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(vcast_vd_d(2.037035976334486E90), d), d);
  vint q = vcastu_vi_vm(vreinterpret_vm_vd(d));      // upper 32 bits of each lane
  q = vand_vi_vi_vi(q, vcast_vi_i((int)(((1U << 12) - 1) << 20)));  // isolate exponent+sign field
  q = vsrl_vi_vi_i(q, 20);
  q = vsub_vi_vi_vi(q, vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vcast_vi_i(300 + 0x3ff), vcast_vi_i(0x3ff)));
  return q;
}

// Fast ilogb: raw biased exponent minus bias; no subnormal pre-scaling.
static INLINE CONST VECTOR_CC vint vilogb2k_vi_vd(vdouble d) {
  vint q = vcastu_vi_vm(vreinterpret_vm_vd(d));
  q = vsrl_vi_vi_i(q, 20);
  q = vand_vi_vi_vi(q, vcast_vi_i(0x7ff));
  q = vsub_vi_vi_vi(q, vcast_vi_i(0x3ff));
  return q;
}
#endif

// Same as vilogb2k but produces the exponent in a 64-bit mask lane.
static INLINE CONST vmask vilogb2k_vm_vd(vdouble d) {
  vmask m = vreinterpret_vm_vd(d);
  m = vsrl64_vm_vm_i(m, 20 + 32);           // move exponent field to the bottom
  m = vand_vm_vm_vm(m, vcast_vm_i64(0x7ff));
  m = vsub64_vm_vm_vm(m, vcast_vm_i64(0x3ff));
  return m;
}

// Raw biased exponent (bias NOT removed), in a 64-bit mask lane.
static INLINE CONST vmask vilogb3k_vm_vd(vdouble d) {
  vmask m = vreinterpret_vm_vd(d);
  m = vsrl64_vm_vm_i(m, 20 + 32);
  m = vand_vm_vm_vm(m, vcast_vm_i64(0x7ff));
  return m;
}
|
||||
|
||||
// ldexp
|
||||
|
||||
// 2^q as a double, built directly in the exponent field. q is assumed to
// be within the normal exponent range; no clamping is performed here.
static INLINE CONST VECTOR_CC vdouble vpow2i_vd_vi(vint q) {
  q = vadd_vi_vi_vi(vcast_vi_i(0x3ff), q);          // apply bias
  vmask r = vcastu_vm_vi(vsll_vi_vi_i(q, 20));      // position into bits 62..52
  return vreinterpret_vd_vm(r);
}

// 2^q with the exponent supplied in 64-bit mask lanes.
static INLINE CONST VECTOR_CC vdouble vpow2i_vd_vm(vmask q) {
  q = vadd64_vm_vm_vm(vcast_vm_i64(0x3ff), q);
  return vreinterpret_vd_vm(vsll64_vm_vm_i(q, 52));
}

// x * 2^q, robust for large |q|: the exponent is applied as five separate
// factors (four equal parts plus a remainder) so no single intermediate
// product can overflow or underflow prematurely. The per-part exponent is
// clamped to the valid biased range [0, 0x7ff].
static INLINE CONST VECTOR_CC vdouble vldexp_vd_vd_vi(vdouble x, vint q) {
  vint m = vsra_vi_vi_i(q, 31);                                        // sign of q
  m = vsll_vi_vi_i(vsub_vi_vi_vi(vsra_vi_vi_i(vadd_vi_vi_vi(m, q), 9), m), 7);  // m ~ (q/512)*128
  q = vsub_vi_vi_vi(q, vsll_vi_vi_i(m, 2));                            // remainder exponent
  m = vadd_vi_vi_vi(vcast_vi_i(0x3ff), m);
  m = vandnot_vi_vo_vi(vgt_vo_vi_vi(vcast_vi_i(0), m), m);             // clamp below at 0
  m = vsel_vi_vo_vi_vi(vgt_vo_vi_vi(m, vcast_vi_i(0x7ff)), vcast_vi_i(0x7ff), m); // clamp above
  vmask r = vcastu_vm_vi(vsll_vi_vi_i(m, 20));
  vdouble y = vreinterpret_vd_vm(r);                                   // y = 2^(m part)
  return vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(x, y), y), y), y), vpow2i_vd_vi(q));
}

// d * 2^e applied as two half-exponent factors; tolerates moderately
// large |e| without clamping.
static INLINE CONST VECTOR_CC vdouble vldexp2_vd_vd_vi(vdouble d, vint e) {
  return vmul_vd_vd_vd(vmul_vd_vd_vd(d, vpow2i_vd_vi(vsra_vi_vi_i(e, 1))), vpow2i_vd_vi(vsub_vi_vi_vi(e, vsra_vi_vi_i(e, 1))));
}

// Fastest ldexp: adds q directly to the exponent field. Only valid when
// the result stays normal (no overflow/underflow handling).
static INLINE CONST VECTOR_CC vdouble vldexp3_vd_vd_vi(vdouble d, vint q) {
  return vreinterpret_vd_vm(vadd64_vm_vm_vm(vreinterpret_vm_vd(d), vcastu_vm_vi(vsll_vi_vi_i(q, 20))));
}

// 64-bit-exponent variants of the above.
// d * 2^e applied as four factors (three quarters plus remainder).
static INLINE CONST vdouble vldexp1_vd_vd_vm(vdouble d, vmask e) {
  vmask m = vsrl64_vm_vm_i(e, 2);                    // quarter of the exponent
  e = vsub64_vm_vm_vm(vsub64_vm_vm_vm(vsub64_vm_vm_vm(e, m), m), m);   // remainder
  d = vmul_vd_vd_vd(d, vpow2i_vd_vm(m));
  d = vmul_vd_vd_vd(d, vpow2i_vd_vm(m));
  d = vmul_vd_vd_vd(d, vpow2i_vd_vm(m));
  d = vmul_vd_vd_vd(d, vpow2i_vd_vm(e));
  return d;
}

// d * 2^e applied as two half-exponent factors.
static INLINE CONST vdouble vldexp2_vd_vd_vm(vdouble d, vmask e) {
  return vmul_vd_vd_vd(vmul_vd_vd_vd(d, vpow2i_vd_vm(vsrl64_vm_vm_i(e, 1))), vpow2i_vd_vm(vsub64_vm_vm_vm(e, vsrl64_vm_vm_i(e, 1))));
}

// Direct exponent-field addition; valid only for normal-range results.
static INLINE CONST vdouble vldexp3_vd_vd_vm(vdouble d, vmask q) {
  return vreinterpret_vd_vm(vadd64_vm_vm_vm(vreinterpret_vm_vd(d), vsll64_vm_vm_i(q, 52)));
}
|
||||
|
||||
// vmask functions
|
||||
|
||||
// Conversions between vmask integer lanes and vdouble.
static INLINE CONST vdouble vcast_vd_vm(vmask m) { return vcast_vd_vi(vcast_vi_vm(m)); } // 32 bit only
static INLINE CONST vmask vtruncate_vm_vd(vdouble d) { return vcast_vm_vi(vtruncate_vi_vd(d)); }

// Signed 64-bit less-than, expressed via the available greater-than.
static INLINE CONST vopmask vlt64_vo_vm_vm(vmask x, vmask y) { return vgt64_vo_vm_vm(y, x); }

// Logical NOT of a 64-bit opmask: XOR against an all-ones mask
// (0 == 0 is true in every lane).
static INLINE CONST vopmask vnot_vo64_vo64(vopmask x) {
  return vxor_vo_vo_vo(x, veq64_vo_vm_vm(vcast_vm_i64(0), vcast_vm_i64(0)));
}

static INLINE CONST vopmask vugt64_vo_vm_vm(vmask x, vmask y) { // unsigned compare
  // Biasing both operands by 2^63 maps unsigned order onto signed order.
  x = vxor_vm_vm_vm(vcast_vm_u64(UINT64_C(0x8000000000000000)), x);
  y = vxor_vm_vm_vm(vcast_vm_u64(UINT64_C(0x8000000000000000)), y);
  return vgt64_vo_vm_vm(x, y);
}

// ilogb with subnormal pre-scaling (cf. vilogbk_vi_vd), result in 64-bit
// mask lanes. Note the compare uses |d|, unlike the vint variant.
static INLINE CONST vmask vilogbk_vm_vd(vdouble d) {
  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(4.9090934652977266E-91)); // |d| < 2^-300
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(vcast_vd_d(2.037035976334486E90), d), d); // scale by 2^300
  vmask q = vreinterpret_vm_vd(d);
  q = vsrl64_vm_vm_i(q, 20 + 32);
  q = vand_vm_vm_vm(q, vcast_vm_i64(0x7ff));
  q = vsub64_vm_vm_vm(q, vsel_vm_vo64_vm_vm(o, vcast_vm_i64(300 + 0x3ff), vcast_vm_i64(0x3ff)));
  return q;
}
|
||||
|
||||
// vquad functions
|
||||
|
||||
// Lane-wise select on a 128-bit pair (x and y halves selected independently
// by the same mask).
static INLINE CONST vquad sel_vq_vo_vq_vq(vopmask o, vquad x, vquad y) {
  return vqsetxy_vq_vm_vm(vsel_vm_vo64_vm_vm(o, vqgetx_vm_vq(x), vqgetx_vm_vq(y)), vsel_vm_vo64_vm_vm(o, vqgety_vm_vq(x), vqgety_vm_vq(y)));
}

// 128-bit add: add both 64-bit halves, then propagate the carry — the low
// half wrapped iff (unsigned) x.low > result.low.
static INLINE CONST vquad add128_vq_vq_vq(vquad x, vquad y) {
  vquad r = vqsetxy_vq_vm_vm(vadd64_vm_vm_vm(vqgetx_vm_vq(x), vqgetx_vm_vq(y)), vadd64_vm_vm_vm(vqgety_vm_vq(x), vqgety_vm_vq(y)));
  r = vqsety_vq_vq_vm(r, vadd64_vm_vm_vm(vqgety_vm_vq(r), vand_vm_vo64_vm(vugt64_vo_vm_vm(vqgetx_vm_vq(x), vqgetx_vm_vq(r)), vcast_vm_i64(1))));
  return r;
}

// Construct a vquad from its two 64-bit halves.
static INLINE CONST vquad imdvq_vq_vm_vm(vmask x, vmask y) { vquad r = vqsetxy_vq_vm_vm(x, y); return r; }

// 128-bit logical right shift. imm must be smaller than 64
#define srl128_vq_vq_i(m, imm) \
  imdvq_vq_vm_vm(vor_vm_vm_vm(vsrl64_vm_vm_i(vqgetx_vm_vq(m), imm), vsll64_vm_vm_i(vqgety_vm_vq(m), 64-imm)), vsrl64_vm_vm_i(vqgety_vm_vq(m), imm))

// This function is equivalent to :
// di_t ret = { x - rint(4 * x) * 0.25, (int32_t)(rint(4 * x) - rint(x) * 4) };
// Used by the Payne-Hanek style argument reduction (rempi).
static INLINE CONST di_t rempisub(vdouble x) {
#ifdef FULL_FP_ROUNDING
  vdouble y = vrint_vd_vd(vmul_vd_vd_vd(x, vcast_vd_d(4)));
  vint vi = vtruncate_vi_vd(vsub_vd_vd_vd(y, vmul_vd_vd_vd(vrint_vd_vd(x), vcast_vd_d(4))));
  return disetdi_di_vd_vi(vsub_vd_vd_vd(x, vmul_vd_vd_vd(y, vcast_vd_d(0.25))), vi);
#else
  // Emulate rint via the (v + c) - c trick with c = copysign(2^52, x);
  // values already >= 2^52 in magnitude are integral and used as-is.
  vdouble c = vmulsign_vd_vd_vd(vcast_vd_d(INT64_C(1) << 52), x);
  vdouble rint4x = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(vmul_vd_vd_vd(vcast_vd_d(4), x)), vcast_vd_d(INT64_C(1) << 52)),
                                    vmul_vd_vd_vd(vcast_vd_d(4), x),
                                    vorsign_vd_vd_vd(vsub_vd_vd_vd(vmla_vd_vd_vd_vd(vcast_vd_d(4), x, c), c), x));
  vdouble rintx  = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52)),
                                    x, vorsign_vd_vd_vd(vsub_vd_vd_vd(vadd_vd_vd_vd(x, c), c), x));
  return disetdi_di_vd_vi(vmla_vd_vd_vd_vd(vcast_vd_d(-0.25), rint4x, x),
                          vtruncate_vi_vd(vmla_vd_vd_vd_vd(vcast_vd_d(-4), rintx, rint4x)));
#endif
}
|
||||
// ==== boundary: a new source file begins here (patch hunk header "@@ -0,0 +1,324 @@") ====
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2024.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
// Double-double type: a value represented as an unevaluated sum x + y,
// where x holds the leading bits and y the trailing error term.
// SVE and RVV targets define vdouble2 in their own headers.
#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA))
#if !defined(SLEEF_ENABLE_CUDA)
typedef struct {
  vdouble x, y;   // x: high part, y: low (error) part
} vdouble2;
#else
typedef double2 vdouble2;   // CUDA's built-in double2 is layout-compatible
#endif

// Accessors/constructors so the same code works with struct and CUDA types.
static INLINE CONST VECTOR_CC vdouble vd2getx_vd_vd2(vdouble2 v) { return v.x; }
static INLINE CONST VECTOR_CC vdouble vd2gety_vd_vd2(vdouble2 v) { return v.y; }
static INLINE CONST VECTOR_CC vdouble2 vd2setxy_vd2_vd_vd(vdouble x, vdouble y) { vdouble2 v; v.x = x; v.y = y; return v; }
static INLINE CONST VECTOR_CC vdouble2 vd2setx_vd2_vd2_vd(vdouble2 v, vdouble d) { v.x = d; return v; }
static INLINE CONST VECTOR_CC vdouble2 vd2sety_vd2_vd2_vd(vdouble2 v, vdouble d) { v.y = d; return v; }
#endif

// Scalar double-double pair (CUDA already provides double2).
#if !defined(SLEEF_ENABLE_CUDA)
typedef struct {
  double x, y;
} double2;
#endif

// Build a scalar double-double from high and low parts.
static INLINE CONST VECTOR_CC double2 dd(double h, double l) {
  double2 ret = { h, l };
  return ret;
}
|
||||
|
||||
// Dekker splitting helper: clears the low 27 bits of the mantissa so that
// products of two "upper" halves are exact in double precision.
static INLINE CONST VECTOR_CC vdouble vupper_vd_vd(vdouble d) {
  return vreinterpret_vd_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vcast_vm_i_i(0xffffffff, 0xf8000000)));
}

// Assemble a vdouble2 from vector high/low parts.
static INLINE CONST VECTOR_CC vdouble2 vcast_vd2_vd_vd(vdouble h, vdouble l) {
  return vd2setxy_vd2_vd_vd(h, l);
}

// Broadcast scalar high/low parts into a vdouble2.
static INLINE CONST VECTOR_CC vdouble2 vcast_vd2_d_d(double h, double l) {
  return vd2setxy_vd2_vd_vd(vcast_vd_d(h), vcast_vd_d(l));
}

// Broadcast a scalar double-double constant into a vdouble2.
static INLINE CONST VECTOR_CC vdouble2 vcast_vd2_d2(double2 dd) {
  return vd2setxy_vd2_vd_vd(vcast_vd_d(dd.x), vcast_vd_d(dd.y));
}

// Lane-wise select between two vdouble2 values (both halves use the same mask).
static INLINE CONST VECTOR_CC vdouble2 vsel_vd2_vo_vd2_vd2(vopmask m, vdouble2 x, vdouble2 y) {
  return vd2setxy_vd2_vd_vd(vsel_vd_vo_vd_vd(m, vd2getx_vd_vd2(x), vd2getx_vd_vd2(y)),
                            vsel_vd_vo_vd_vd(m, vd2gety_vd_vd2(x), vd2gety_vd_vd2(y)));
}

// Lane-wise select between two scalar double-double constants (x1,y1) / (x0,y0).
static INLINE CONST VECTOR_CC vdouble2 vsel_vd2_vo_d_d_d_d(vopmask o, double x1, double y1, double x0, double y0) {
  return vd2setxy_vd2_vd_vd(vsel_vd_vo_d_d(o, x1, x0),
                            vsel_vd_vo_d_d(o, y1, y0));
}
|
||||
|
||||
// N-ary add/sub chains. The association order is fixed left-to-right and
// is relied upon by the double-double routines below: terms are passed in
// decreasing magnitude so each partial sum stays exact enough.
static INLINE CONST VECTOR_CC vdouble vadd_vd_3vd(vdouble v0, vdouble v1, vdouble v2) {
  return vadd_vd_vd_vd(vadd_vd_vd_vd(v0, v1), v2);
}

static INLINE CONST VECTOR_CC vdouble vadd_vd_4vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
  return vadd_vd_3vd(vadd_vd_vd_vd(v0, v1), v2, v3);
}

static INLINE CONST VECTOR_CC vdouble vadd_vd_5vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
  return vadd_vd_4vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4);
}

static INLINE CONST VECTOR_CC vdouble vadd_vd_6vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) {
  return vadd_vd_5vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4, v5);
}

static INLINE CONST VECTOR_CC vdouble vadd_vd_7vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5, vdouble v6) {
  return vadd_vd_6vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4, v5, v6);
}

// ((v0 - v1) - v2) ...
static INLINE CONST VECTOR_CC vdouble vsub_vd_3vd(vdouble v0, vdouble v1, vdouble v2) {
  return vsub_vd_vd_vd(vsub_vd_vd_vd(v0, v1), v2);
}

static INLINE CONST VECTOR_CC vdouble vsub_vd_4vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
  return vsub_vd_3vd(vsub_vd_vd_vd(v0, v1), v2, v3);
}

static INLINE CONST VECTOR_CC vdouble vsub_vd_5vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
  return vsub_vd_4vd(vsub_vd_vd_vd(v0, v1), v2, v3, v4);
}

static INLINE CONST VECTOR_CC vdouble vsub_vd_6vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) {
  return vsub_vd_5vd(vsub_vd_vd_vd(v0, v1), v2, v3, v4, v5);
}
|
||||
|
||||
//
|
||||
|
||||
// Negate a double-double: negate both components.
static INLINE CONST VECTOR_CC vdouble2 ddneg_vd2_vd2(vdouble2 x) {
  return vcast_vd2_vd_vd(vneg_vd_vd(vd2getx_vd_vd2(x)), vneg_vd_vd(vd2gety_vd_vd2(x)));
}

// Absolute value of a double-double: |x.high|, and x.low has its sign
// flipped exactly when x.high was negative (the pair's sign lives in the
// high part).
static INLINE CONST VECTOR_CC vdouble2 ddabs_vd2_vd2(vdouble2 x) {
  return vcast_vd2_vd_vd(vabs_vd_vd(vd2getx_vd_vd2(x)),
                         vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(vd2gety_vd_vd2(x)),
                                                          vand_vm_vm_vm(vreinterpret_vm_vd(vd2getx_vd_vd2(x)),
                                                                        vreinterpret_vm_vd(vcast_vd_d(-0.0))))));
}

// Renormalize so that the high part is the correctly-rounded sum and the
// low part is the Fast2Sum residual (requires the pair to be "close").
static INLINE CONST VECTOR_CC vdouble2 ddnormalize_vd2_vd2(vdouble2 t) {
  vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(t), vd2gety_vd_vd2(t));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(t), s), vd2gety_vd_vd2(t)));
}

// Multiply both components by s. Exact when s is a power of two.
static INLINE CONST VECTOR_CC vdouble2 ddscale_vd2_vd2_vd(vdouble2 d, vdouble s) {
  return vd2setxy_vd2_vd_vd(vmul_vd_vd_vd(vd2getx_vd_vd2(d), s), vmul_vd_vd_vd(vd2gety_vd_vd2(d), s));
}

static INLINE CONST VECTOR_CC vdouble2 ddscale_vd2_vd2_d(vdouble2 d, double s) { return ddscale_vd2_vd2_vd(d, vcast_vd_d(s)); }
|
||||
|
||||
// Double-double addition/subtraction.
// "ddadd"  variants use Fast2Sum (Dekker): cheaper, but require |x| >= |y|.
// "ddadd2" variants use 2Sum (Knuth/Moller): no ordering requirement.
// The exact statement order below realizes these error-free
// transformations and must not be re-associated.

// Fast2Sum of two doubles; caller guarantees |x| >= |y|.
static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd_vd(vdouble x, vdouble y) {
  vdouble s = vadd_vd_vd_vd(x, y);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(x, s), y));
}

// 2Sum of two doubles; any magnitudes.
static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd_vd(vdouble x, vdouble y) {
  vdouble s = vadd_vd_vd_vd(x, y);
  vdouble v = vsub_vd_vd_vd(s, x);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(x, vsub_vd_vd_vd(s, v)), vsub_vd_vd_vd(y, v)));
}

// (double-double) + double, Fast2Sum based; requires |x| >= |y|.
static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd2_vd(vdouble2 x, vdouble y) {
  vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), y);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_3vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), s), y, vd2gety_vd_vd2(x)));
}

// (double-double) - double, Fast2Sum based; requires |x| >= |y|.
static INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd2_vd(vdouble2 x, vdouble y) {
  vdouble s = vsub_vd_vd_vd(vd2getx_vd_vd2(x), y);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), s), y), vd2gety_vd_vd2(x)));
}

// (double-double) + double, 2Sum based; any magnitudes.
static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd2_vd(vdouble2 x, vdouble y) {
  vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), y);
  vdouble v = vsub_vd_vd_vd(s, vd2getx_vd_vd2(x));
  vdouble w = vadd_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), vsub_vd_vd_vd(s, v)), vsub_vd_vd_vd(y, v));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(w, vd2gety_vd_vd2(x)));
}

// double + (double-double), Fast2Sum based; requires |x| >= |y|.
static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd_vd2(vdouble x, vdouble2 y) {
  vdouble s = vadd_vd_vd_vd(x, vd2getx_vd_vd2(y));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_3vd(vsub_vd_vd_vd(x, s), vd2getx_vd_vd2(y), vd2gety_vd_vd2(y)));
}

// double + (double-double), 2Sum based; any magnitudes.
static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd_vd2(vdouble x, vdouble2 y) {
  vdouble s = vadd_vd_vd_vd(x, vd2getx_vd_vd2(y));
  vdouble v = vsub_vd_vd_vd(s, x);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vadd_vd_vd_vd(vsub_vd_vd_vd(x, vsub_vd_vd_vd(s, v)),
                                                           vsub_vd_vd_vd(vd2getx_vd_vd2(y), v)), vd2gety_vd_vd2(y)));
}

// (double-double) + (double-double), Fast2Sum based.
static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
  // |x| >= |y|
  vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_4vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), s), vd2getx_vd_vd2(y), vd2gety_vd_vd2(x), vd2gety_vd_vd2(y)));
}

// (double-double) + (double-double), 2Sum based; any magnitudes.
static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
  vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
  vdouble v = vsub_vd_vd_vd(s, vd2getx_vd_vd2(x));
  vdouble t = vadd_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), vsub_vd_vd_vd(s, v)), vsub_vd_vd_vd(vd2getx_vd_vd2(y), v));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(t, vadd_vd_vd_vd(vd2gety_vd_vd2(x), vd2gety_vd_vd2(y))));
}

// Exact difference of two doubles, Fast2Sum based.
static INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd_vd(vdouble x, vdouble y) {
  // |x| >= |y|
  vdouble s = vsub_vd_vd_vd(x, y);
  return vd2setxy_vd2_vd_vd(s, vsub_vd_vd_vd(vsub_vd_vd_vd(x, s), y));
}

// (double-double) - (double-double), Fast2Sum based.
static INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
  // |x| >= |y|
  vdouble s = vsub_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
  vdouble t = vsub_vd_vd_vd(vd2getx_vd_vd2(x), s);
  t = vsub_vd_vd_vd(t, vd2getx_vd_vd2(y));
  t = vadd_vd_vd_vd(t, vd2gety_vd_vd2(x));
  return vd2setxy_vd2_vd_vd(s, vsub_vd_vd_vd(t, vd2gety_vd_vd2(y)));
}
|
||||
|
||||
// Double-double multiply / divide / reciprocal / square.
// With FMA (ENABLE_FMA_DP) the exact product residual comes from a single
// fused op: vfmapn(a, b, c) computes a*b - c and vfmanp(a, b, c) computes
// c - a*b, each with one rounding. Without FMA, operands are split with
// vupper (Dekker splitting) and the residual is accumulated term by term.
#ifdef ENABLE_FMA_DP
// n / d: s is the first quotient estimate; u is its product residual and
// v the residual of t ~ 1/d.high, combined into one Newton-style correction.
static INLINE CONST VECTOR_CC vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) {
  vdouble t = vrec_vd_vd(vd2getx_vd_vd2(d));
  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(n), t);
  vdouble u = vfmapn_vd_vd_vd_vd(t, vd2getx_vd_vd2(n), s);
  vdouble v = vfmanp_vd_vd_vd_vd(vd2gety_vd_vd2(d), t, vfmanp_vd_vd_vd_vd(vd2getx_vd_vd2(d), t, vcast_vd_d(1)));
  return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(s, v, vfma_vd_vd_vd_vd(vd2gety_vd_vd2(n), t, u)));
}

// Exact product of two doubles (TwoProdFMA).
static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd_vd(vdouble x, vdouble y) {
  vdouble s = vmul_vd_vd_vd(x, y);
  return vd2setxy_vd2_vd_vd(s, vfmapn_vd_vd_vd_vd(x, y, s));
}

// Square of a double-double.
static INLINE CONST VECTOR_CC vdouble2 ddsqu_vd2_vd2(vdouble2 x) {
  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x));
  return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x)), vd2gety_vd_vd2(x), vfmapn_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x), s)));
}

// Product of two double-doubles (x.low*y.low term dropped as negligible).
static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
  return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(y), vfma_vd_vd_vd_vd(vd2gety_vd_vd2(x), vd2getx_vd_vd2(y), vfmapn_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y), s))));
}

// Product of two double-doubles collapsed to a single double.
static INLINE CONST VECTOR_CC vdouble ddmul_vd_vd2_vd2(vdouble2 x, vdouble2 y) {
  return vfma_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y), vfma_vd_vd_vd_vd(vd2gety_vd_vd2(x), vd2getx_vd_vd2(y), vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(y))));
}

// Square of a double-double collapsed to a single double.
static INLINE CONST VECTOR_CC vdouble ddsqu_vd_vd2(vdouble2 x) {
  return vfma_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x), vadd_vd_vd_vd(vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x)), vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x))));
}

// (double-double) * double.
static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd(vdouble2 x, vdouble y) {
  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), y);
  return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(vd2gety_vd_vd2(x), y, vfmapn_vd_vd_vd_vd(vd2getx_vd_vd2(x), y, s)));
}

// 1/d as a double-double; the low part is one Newton correction step.
static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd(vdouble d) {
  vdouble s = vrec_vd_vd(d);
  return vd2setxy_vd2_vd_vd(s, vmul_vd_vd_vd(s, vfmanp_vd_vd_vd_vd(d, s, vcast_vd_d(1))));
}

// 1/(double-double), same scheme with the low divisor term included.
static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd2(vdouble2 d) {
  vdouble s = vrec_vd_vd(vd2getx_vd_vd2(d));
  return vd2setxy_vd2_vd_vd(s, vmul_vd_vd_vd(s, vfmanp_vd_vd_vd_vd(vd2gety_vd_vd2(d), s, vfmanp_vd_vd_vd_vd(vd2getx_vd_vd2(d), s, vcast_vd_d(1)))));
}
#else // #ifdef ENABLE_FMA_DP
// Non-FMA path: exact products via Dekker splitting (vupper zeroes the low
// mantissa bits so the partial products hh, hl, lh, ll are exact).
static INLINE CONST VECTOR_CC vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) {
  vdouble t = vrec_vd_vd(vd2getx_vd_vd2(d));
  vdouble dh  = vupper_vd_vd(vd2getx_vd_vd2(d)), dl  = vsub_vd_vd_vd(vd2getx_vd_vd2(d), dh);
  vdouble th  = vupper_vd_vd(t                ), tl  = vsub_vd_vd_vd(t                , th);
  vdouble nhh = vupper_vd_vd(vd2getx_vd_vd2(n)), nhl = vsub_vd_vd_vd(vd2getx_vd_vd2(n), nhh);

  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(n), t);

  // Residual of n.high*t plus correction for t being only approximately 1/d.high.
  vdouble u = vadd_vd_5vd(vsub_vd_vd_vd(vmul_vd_vd_vd(nhh, th), s), vmul_vd_vd_vd(nhh, tl), vmul_vd_vd_vd(nhl, th), vmul_vd_vd_vd(nhl, tl),
                          vmul_vd_vd_vd(s, vsub_vd_5vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th), vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th), vmul_vd_vd_vd(dl, tl))));

  return vd2setxy_vd2_vd_vd(s, vmla_vd_vd_vd_vd(t, vsub_vd_vd_vd(vd2gety_vd_vd2(n), vmul_vd_vd_vd(s, vd2gety_vd_vd2(d))), u));
}

// Exact product of two doubles (Dekker TwoProduct).
static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd_vd(vdouble x, vdouble y) {
  vdouble xh = vupper_vd_vd(x), xl = vsub_vd_vd_vd(x, xh);
  vdouble yh = vupper_vd_vd(y), yl = vsub_vd_vd_vd(y, yh);

  vdouble s = vmul_vd_vd_vd(x, y);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_5vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(s), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl)));
}

// (double-double) * double.
static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd(vdouble2 x, vdouble y) {
  vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);
  vdouble yh = vupper_vd_vd(y               ),  yl = vsub_vd_vd_vd(y               , yh);

  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), y);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_6vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(s), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl), vmul_vd_vd_vd(vd2gety_vd_vd2(x), y)));
}

// Product of two double-doubles (x.low*y.low term dropped as negligible).
static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
  vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);
  vdouble yh = vupper_vd_vd(vd2getx_vd_vd2(y)), yl = vsub_vd_vd_vd(vd2getx_vd_vd2(y), yh);

  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_7vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(s), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl), vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(y)), vmul_vd_vd_vd(vd2gety_vd_vd2(x), vd2getx_vd_vd2(y))));
}

// Product of two double-doubles collapsed to a single double; summed from
// smallest to largest term.
static INLINE CONST VECTOR_CC vdouble ddmul_vd_vd2_vd2(vdouble2 x, vdouble2 y) {
  vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);
  vdouble yh = vupper_vd_vd(vd2getx_vd_vd2(y)), yl = vsub_vd_vd_vd(vd2getx_vd_vd2(y), yh);

  return vadd_vd_6vd(vmul_vd_vd_vd(vd2gety_vd_vd2(x), yh), vmul_vd_vd_vd(xh, vd2gety_vd_vd2(y)), vmul_vd_vd_vd(xl, yl), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yh));
}

// Square of a double-double.
static INLINE CONST VECTOR_CC vdouble2 ddsqu_vd2_vd2(vdouble2 x) {
  vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);

  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_5vd(vmul_vd_vd_vd(xh, xh), vneg_vd_vd(s), vmul_vd_vd_vd(vadd_vd_vd_vd(xh, xh), xl), vmul_vd_vd_vd(xl, xl), vmul_vd_vd_vd(vd2getx_vd_vd2(x), vadd_vd_vd_vd(vd2gety_vd_vd2(x), vd2gety_vd_vd2(x)))));
}

// Square of a double-double collapsed to a single double.
static INLINE CONST VECTOR_CC vdouble ddsqu_vd_vd2(vdouble2 x) {
  vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);

  return vadd_vd_5vd(vmul_vd_vd_vd(xh, vd2gety_vd_vd2(x)), vmul_vd_vd_vd(xh, vd2gety_vd_vd2(x)), vmul_vd_vd_vd(xl, xl), vadd_vd_vd_vd(vmul_vd_vd_vd(xh, xl), vmul_vd_vd_vd(xh, xl)), vmul_vd_vd_vd(xh, xh));
}

// 1/d as a double-double; residual 1 - d*t accumulated via split products.
static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd(vdouble d) {
  vdouble t = vrec_vd_vd(d);
  vdouble dh = vupper_vd_vd(d), dl = vsub_vd_vd_vd(d, dh);
  vdouble th = vupper_vd_vd(t), tl = vsub_vd_vd_vd(t, th);

  return vd2setxy_vd2_vd_vd(t, vmul_vd_vd_vd(t, vsub_vd_5vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th), vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th), vmul_vd_vd_vd(dl, tl))));
}

// 1/(double-double), with the low divisor term included in the residual.
static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd2(vdouble2 d) {
  vdouble t = vrec_vd_vd(vd2getx_vd_vd2(d));
  vdouble dh = vupper_vd_vd(vd2getx_vd_vd2(d)), dl = vsub_vd_vd_vd(vd2getx_vd_vd2(d), dh);
  vdouble th = vupper_vd_vd(t                ), tl = vsub_vd_vd_vd(t                , th);

  return vd2setxy_vd2_vd_vd(t, vmul_vd_vd_vd(t, vsub_vd_6vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th), vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th), vmul_vd_vd_vd(dl, tl), vmul_vd_vd_vd(vd2gety_vd_vd2(d), t))));
}
#endif // #ifdef ENABLE_FMA_DP
|
||||
|
||||
// sqrt of a double-double: one refinement of the hardware sqrt t via
// (d + t*t) / t * 0.5 evaluated in double-double arithmetic.
static INLINE CONST VECTOR_CC vdouble2 ddsqrt_vd2_vd2(vdouble2 d) {
  vdouble t = vsqrt_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)));
  return ddscale_vd2_vd2_vd(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd2(d, ddmul_vd2_vd_vd(t, t)), ddrec_vd2_vd(t)), vcast_vd_d(0.5));
}

// sqrt of a plain double, returned as a double-double.
static INLINE CONST VECTOR_CC vdouble2 ddsqrt_vd2_vd(vdouble d) {
  vdouble t = vsqrt_vd_vd(d);
  return ddscale_vd2_vd2_vd(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd_vd2(d, ddmul_vd2_vd_vd(t, t)), ddrec_vd2_vd(t)), vcast_vd_d(0.5));
}

// Fused multiply-add in double-double arithmetic: x*y + z.
// Uses the Fast2Sum-based add, so z is expected to dominate x*y in magnitude.
static INLINE CONST VECTOR_CC vdouble2 ddmla_vd2_vd2_vd2_vd2(vdouble2 x, vdouble2 y, vdouble2 z) {
  return ddadd_vd2_vd2_vd2(z, ddmul_vd2_vd2_vd2(x, y));
}
|
||||
// ==== boundary: a new source file begins here (patch hunk header "@@ -0,0 +1,369 @@") ====
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2024.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
// Float-float ("double-float") type: a value represented as an unevaluated
// sum x + y, with x the leading bits and y the trailing error term.
// SVE and RVV targets define vfloat2 in their own headers.
#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA))
#if !defined(SLEEF_ENABLE_CUDA)
typedef struct {
  vfloat x, y;   // x: high part, y: low (error) part
} vfloat2;
#else
typedef float2 vfloat2;   // CUDA's built-in float2 is layout-compatible
#endif

// Accessors/constructors so the same code works with struct and CUDA types.
static INLINE CONST VECTOR_CC vfloat vf2getx_vf_vf2(vfloat2 v) { return v.x; }
static INLINE CONST VECTOR_CC vfloat vf2gety_vf_vf2(vfloat2 v) { return v.y; }
static INLINE CONST VECTOR_CC vfloat2 vf2setxy_vf2_vf_vf(vfloat x, vfloat y) { vfloat2 v; v.x = x; v.y = y; return v; }
static INLINE CONST VECTOR_CC vfloat2 vf2setx_vf2_vf2_vf(vfloat2 v, vfloat d) { v.x = d; return v; }
static INLINE CONST VECTOR_CC vfloat2 vf2sety_vf2_vf2_vf(vfloat2 v, vfloat d) { v.y = d; return v; }
#endif
|
||||
|
||||
// Dekker splitting helper for floats: clears the low 12 mantissa bits so
// products of two "upper" halves are exact in single precision.
static INLINE CONST VECTOR_CC vfloat vupper_vf_vf(vfloat d) {
  return vreinterpret_vf_vi2(vand_vi2_vi2_vi2(vreinterpret_vi2_vf(d), vcast_vi2_i(0xfffff000)));
}

// Assemble a vfloat2 from vector high/low parts.
static INLINE CONST VECTOR_CC vfloat2 vcast_vf2_vf_vf(vfloat h, vfloat l) {
  return vf2setxy_vf2_vf_vf(h, l);
}

// Broadcast scalar high/low parts into a vfloat2.
static INLINE CONST VECTOR_CC vfloat2 vcast_vf2_f_f(float h, float l) {
  return vf2setxy_vf2_vf_vf(vcast_vf_f(h), vcast_vf_f(l));
}

// Split a double constant into head float plus the rounding remainder,
// preserving (most of) its extra precision across the two float parts.
static INLINE CONST VECTOR_CC vfloat2 vcast_vf2_d(double d) {
  return vf2setxy_vf2_vf_vf(vcast_vf_f(d), vcast_vf_f(d - (float)d));
}

// Lane-wise select between two vfloat2 values (both halves use the same mask).
static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_vf2_vf2(vopmask m, vfloat2 x, vfloat2 y) {
  return vf2setxy_vf2_vf_vf(vsel_vf_vo_vf_vf(m, vf2getx_vf_vf2(x), vf2getx_vf_vf2(y)), vsel_vf_vo_vf_vf(m, vf2gety_vf_vf2(x), vf2gety_vf_vf2(y)));
}

// Lane-wise select between two scalar float-float constants (x1,y1) / (x0,y0).
static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_f_f_f_f(vopmask o, float x1, float y1, float x0, float y0) {
  return vf2setxy_vf2_vf_vf(vsel_vf_vo_f_f(o, x1, x0), vsel_vf_vo_f_f(o, y1, y0));
}

// Three-way select of double constants (split to float-float): o0 ? d0 : (o1 ? d1 : d2).
static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
  return vsel_vf2_vo_vf2_vf2(o0, vcast_vf2_d(d0), vsel_vf2_vo_vf2_vf2(o1, vcast_vf2_d(d1), vcast_vf2_d(d2)));
}

// Four-way select of double constants: o0 ? d0 : (o1 ? d1 : (o2 ? d2 : d3)).
static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
  return vsel_vf2_vo_vf2_vf2(o0, vcast_vf2_d(d0), vsel_vf2_vo_vf2_vf2(o1, vcast_vf2_d(d1), vsel_vf2_vo_vf2_vf2(o2, vcast_vf2_d(d2), vcast_vf2_d(d3))));
}
|
||||
|
||||
// Absolute value of a float-float pair: both components have their sign
// flipped exactly where the high part was negative (the pair's sign is
// carried by the high part, so the low part must follow it).
static INLINE CONST VECTOR_CC vfloat2 vabs_vf2_vf2(vfloat2 x) {
  return vcast_vf2_vf_vf(vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0)), vreinterpret_vm_vf(vf2getx_vf_vf2(x))), vreinterpret_vm_vf(vf2getx_vf_vf2(x)))),
                         vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0)), vreinterpret_vm_vf(vf2getx_vf_vf2(x))), vreinterpret_vm_vf(vf2gety_vf_vf2(x)))));
}
|
||||
|
||||
// Multi-operand sums, evaluated strictly left to right:
// ((v0 + v1) + v2) + ...  The association order is significant for
// floating-point rounding and must not be changed.
static INLINE CONST VECTOR_CC vfloat vadd_vf_3vf(vfloat v0, vfloat v1, vfloat v2) {
  return vadd_vf_vf_vf(vadd_vf_vf_vf(v0, v1), v2);
}

static INLINE CONST VECTOR_CC vfloat vadd_vf_4vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3) {
  return vadd_vf_3vf(vadd_vf_vf_vf(v0, v1), v2, v3);
}

static INLINE CONST VECTOR_CC vfloat vadd_vf_5vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4) {
  return vadd_vf_4vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4);
}

static INLINE CONST VECTOR_CC vfloat vadd_vf_6vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4, vfloat v5) {
  return vadd_vf_5vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4, v5);
}

static INLINE CONST VECTOR_CC vfloat vadd_vf_7vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4, vfloat v5, vfloat v6) {
  return vadd_vf_6vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4, v5, v6);
}
|
||||
|
||||
// Multi-operand differences, evaluated strictly left to right:
// ((v0 - v1) - v2) - ...  Association order matters for FP rounding.
static INLINE CONST VECTOR_CC vfloat vsub_vf_3vf(vfloat v0, vfloat v1, vfloat v2) {
  return vsub_vf_vf_vf(vsub_vf_vf_vf(v0, v1), v2);
}

static INLINE CONST VECTOR_CC vfloat vsub_vf_4vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3) {
  return vsub_vf_3vf(vsub_vf_vf_vf(v0, v1), v2, v3);
}

static INLINE CONST VECTOR_CC vfloat vsub_vf_5vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4) {
  return vsub_vf_4vf(vsub_vf_vf_vf(v0, v1), v2, v3, v4);
}
|
||||
|
||||
//
|
||||
|
||||
// Negation of a double-float: both components change sign.
static INLINE CONST VECTOR_CC vfloat2 dfneg_vf2_vf2(vfloat2 x) {
  return vcast_vf2_vf_vf(vneg_vf_vf(vf2getx_vf_vf2(x)), vneg_vf_vf(vf2gety_vf_vf2(x)));
}

// Absolute value of a double-float: the high part gets |x.x|, and the
// sign bit of the original high part is XOR-ed into the low part so the
// low part's orientation relative to the high part is preserved.
static INLINE CONST VECTOR_CC vfloat2 dfabs_vf2_vf2(vfloat2 x) {
  return vcast_vf2_vf_vf(vabs_vf_vf(vf2getx_vf_vf2(x)),
			 vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vf2gety_vf_vf2(x)), vand_vm_vm_vm(vreinterpret_vm_vf(vf2getx_vf_vf2(x)), vreinterpret_vm_vf(vcast_vf_f(-0.0f))))));
}
|
||||
|
||||
// Renormalize a double-float so that the high part is the rounded sum of
// both components and the low part is the remaining error (Fast2Sum on
// the two components).
static INLINE CONST VECTOR_CC vfloat2 dfnormalize_vf2_vf2(vfloat2 t) {
  vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(t), vf2gety_vf_vf2(t));
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(t), s), vf2gety_vf_vf2(t)));
}

// Scale a double-float by a plain float; exact when s is a power of two.
static INLINE CONST VECTOR_CC vfloat2 dfscale_vf2_vf2_vf(vfloat2 d, vfloat s) {
  return vf2setxy_vf2_vf_vf(vmul_vf_vf_vf(vf2getx_vf_vf2(d), s), vmul_vf_vf_vf(vf2gety_vf_vf2(d), s));
}
|
||||
|
||||
// float + float -> double-float, Fast2Sum variant.  Like the other dfadd_
// (non-"2") routines it relies on the "|x| >= |y|" convention noted on the
// sibling dfadd/dfsub helpers below; the caller must ensure the ordering.
static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf_vf(vfloat x, vfloat y) {
  vfloat s = vadd_vf_vf_vf(x, y);
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(x, s), y));
}

// float + float -> double-float, Knuth TwoSum variant: no magnitude
// ordering requirement, at the cost of a few extra operations.
static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf_vf(vfloat x, vfloat y) {
  vfloat s = vadd_vf_vf_vf(x, y);
  vfloat v = vsub_vf_vf_vf(s, x);
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(x, vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(y, v)));
}

// float + double-float, TwoSum on the high parts; y's low part is folded
// into the error term.  No magnitude ordering requirement.
static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf_vf2(vfloat x, vfloat2 y) {
  vfloat s = vadd_vf_vf_vf(x, vf2getx_vf_vf2(y));
  vfloat v = vsub_vf_vf_vf(s, x);
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vadd_vf_vf_vf(vsub_vf_vf_vf(x, vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(vf2getx_vf_vf2(y), v)), vf2gety_vf_vf2(y)));
}
|
||||
|
||||
// double-float + float, Fast2Sum variant ("|x| >= |y|" convention, see
// the commented siblings below).
static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf2_vf(vfloat2 x, vfloat y) {
  vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), y);
  return vf2setxy_vf2_vf_vf(s, vadd_vf_3vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), s), y, vf2gety_vf_vf2(x)));
}

// double-float - float, Fast2Sum-style (same ordering convention).
static INLINE CONST VECTOR_CC vfloat2 dfsub_vf2_vf2_vf(vfloat2 x, vfloat y) {
  vfloat s = vsub_vf_vf_vf(vf2getx_vf_vf2(x), y);
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), s), y), vf2gety_vf_vf2(x)));
}

// double-float + float, TwoSum variant: no ordering requirement.
static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf2_vf(vfloat2 x, vfloat y) {
  vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), y);
  vfloat v = vsub_vf_vf_vf(s, vf2getx_vf_vf2(x));
  vfloat t = vadd_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(y, v));
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(t, vf2gety_vf_vf2(x)));
}

// float + double-float, Fast2Sum variant (same ordering convention).
static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf_vf2(vfloat x, vfloat2 y) {
  vfloat s = vadd_vf_vf_vf(x, vf2getx_vf_vf2(y));
  return vf2setxy_vf2_vf_vf(s, vadd_vf_3vf(vsub_vf_vf_vf(x, s), vf2getx_vf_vf2(y), vf2gety_vf_vf2(y)));
}
|
||||
|
||||
// double-float + double-float, Fast2Sum variant.
static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
  // |x| >= |y| -- precondition for the error term to be exact
  vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));
  return vf2setxy_vf2_vf_vf(s, vadd_vf_4vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), s), vf2getx_vf_vf2(y), vf2gety_vf_vf2(x), vf2gety_vf_vf2(y)));
}

// double-float + double-float, TwoSum variant: no ordering requirement.
static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
  vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));
  vfloat v = vsub_vf_vf_vf(s, vf2getx_vf_vf2(x));
  vfloat t = vadd_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(vf2getx_vf_vf2(y), v));
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(t, vadd_vf_vf_vf(vf2gety_vf_vf2(x), vf2gety_vf_vf2(y))));
}
|
||||
|
||||
// float - float -> double-float (Fast2Sum subtraction).
static INLINE CONST VECTOR_CC vfloat2 dfsub_vf2_vf_vf(vfloat x, vfloat y) {
  // |x| >= |y| -- precondition for the error term to be exact
  vfloat s = vsub_vf_vf_vf(x, y);
  return vf2setxy_vf2_vf_vf(s, vsub_vf_vf_vf(vsub_vf_vf_vf(x, s), y));
}

// double-float - double-float (Fast2Sum-style; sequential accumulation of
// the error term, order significant).
static INLINE CONST VECTOR_CC vfloat2 dfsub_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
  // |x| >= |y| -- precondition for the error term to be exact
  vfloat s = vsub_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));
  vfloat t = vsub_vf_vf_vf(vf2getx_vf_vf2(x), s);
  t = vsub_vf_vf_vf(t, vf2getx_vf_vf2(y));
  t = vadd_vf_vf_vf(t, vf2gety_vf_vf2(x));
  return vf2setxy_vf2_vf_vf(s, vsub_vf_vf_vf(t, vf2gety_vf_vf2(y)));
}
|
||||
|
||||
#ifdef ENABLE_FMA_SP

// FMA implementations of the double-float products/quotients: the rounding
// error of each product is recovered with a single fused multiply-add
// instead of Dekker splitting.
// NOTE(review): the math below is consistent with the SLEEF helper
// convention vfmapn(a,b,c) = a*b - c and vfmanp(a,b,c) = c - a*b --
// confirm against the per-ISA helper headers.

// Double-float division n / d.
static INLINE CONST VECTOR_CC vfloat2 dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) {
  vfloat t = vrec_vf_vf(vf2getx_vf_vf2(d));        // reciprocal estimate
  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(n), t);  // quotient estimate
  vfloat u = vfmapn_vf_vf_vf_vf(t, vf2getx_vf_vf2(n), s);
  vfloat v = vfmanp_vf_vf_vf_vf(vf2gety_vf_vf2(d), t, vfmanp_vf_vf_vf_vf(vf2getx_vf_vf2(d), t, vcast_vf_f(1)));
  return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(s, v, vfma_vf_vf_vf_vf(vf2gety_vf_vf2(n), t, u)));
}

// Exact product of two floats (TwoProdFMA): low part is the FMA-computed
// rounding error of the high part.
static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf_vf(vfloat x, vfloat y) {
  vfloat s = vmul_vf_vf_vf(x, y);
  return vf2setxy_vf2_vf_vf(s, vfmapn_vf_vf_vf_vf(x, y, s));
}

// Square of a double-float.
static INLINE CONST VECTOR_CC vfloat2 dfsqu_vf2_vf2(vfloat2 x) {
  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x));
  return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x)), vf2gety_vf_vf2(x), vfmapn_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x), s)));
}

// Square of a double-float, returning only a single float result.
static INLINE CONST VECTOR_CC vfloat dfsqu_vf_vf2(vfloat2 x) {
  return vfma_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x), vadd_vf_vf_vf(vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x)), vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x))));
}

// Product of two double-floats.
static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));
  return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(y), vfma_vf_vf_vf_vf(vf2gety_vf_vf2(x), vf2getx_vf_vf2(y), vfmapn_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y), s))));
}

// Product of two double-floats, returning only a single float result.
static INLINE CONST VECTOR_CC vfloat dfmul_vf_vf2_vf2(vfloat2 x, vfloat2 y) {
  return vfma_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y), vfma_vf_vf_vf_vf(vf2gety_vf_vf2(x), vf2getx_vf_vf2(y), vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(y))));
}

// Product of a double-float and a float.
static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf(vfloat2 x, vfloat y) {
  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), y);
  return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(vf2gety_vf_vf2(x), y, vfmapn_vf_vf_vf_vf(vf2getx_vf_vf2(x), y, s)));
}

// Reciprocal of a float as a double-float (one correction step).
static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf(vfloat d) {
  vfloat s = vrec_vf_vf(d);
  return vf2setxy_vf2_vf_vf(s, vmul_vf_vf_vf(s, vfmanp_vf_vf_vf_vf(d, s, vcast_vf_f(1))));
}

// Reciprocal of a double-float (one correction step on both components).
static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf2(vfloat2 d) {
  vfloat s = vrec_vf_vf(vf2getx_vf_vf2(d));
  return vf2setxy_vf2_vf_vf(s, vmul_vf_vf_vf(s, vfmanp_vf_vf_vf_vf(vf2gety_vf_vf2(d), s, vfmanp_vf_vf_vf_vf(vf2getx_vf_vf2(d), s, vcast_vf_f(1)))));
}
|
||||
#else

// Non-FMA implementations: operands are split into high/low halves with
// vupper_vf_vf (presumably masking off low mantissa bits, Dekker-style
// splitting -- confirm in the per-ISA helper headers) so that partial
// products of the halves can be accumulated with vmla without losing the
// rounding error.

// Double-float division n / d.
static INLINE CONST VECTOR_CC vfloat2 dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) {
  vfloat t = vrec_vf_vf(vf2getx_vf_vf2(d));  // reciprocal estimate
  vfloat dh  = vupper_vf_vf(vf2getx_vf_vf2(d)), dl  = vsub_vf_vf_vf(vf2getx_vf_vf2(d), dh);
  vfloat th  = vupper_vf_vf(t                ), tl  = vsub_vf_vf_vf(t                , th);
  vfloat nhh = vupper_vf_vf(vf2getx_vf_vf2(n)), nhl = vsub_vf_vf_vf(vf2getx_vf_vf2(n), nhh);

  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(n), t);  // quotient estimate

  vfloat u, w;
  // w = 1 - d.x*t, accumulated from the exact half-products
  w = vcast_vf_f(-1);
  w = vmla_vf_vf_vf_vf(dh, th, w);
  w = vmla_vf_vf_vf_vf(dh, tl, w);
  w = vmla_vf_vf_vf_vf(dl, th, w);
  w = vmla_vf_vf_vf_vf(dl, tl, w);
  w = vneg_vf_vf(w);

  // u = n.x*t - s (rounding error of the quotient estimate), plus s*w
  u = vmla_vf_vf_vf_vf(nhh, th, vneg_vf_vf(s));
  u = vmla_vf_vf_vf_vf(nhh, tl, u);
  u = vmla_vf_vf_vf_vf(nhl, th, u);
  u = vmla_vf_vf_vf_vf(nhl, tl, u);
  u = vmla_vf_vf_vf_vf(s, w, u);

  return vf2setxy_vf2_vf_vf(s, vmla_vf_vf_vf_vf(t, vsub_vf_vf_vf(vf2gety_vf_vf2(n), vmul_vf_vf_vf(s, vf2gety_vf_vf2(d))), u));
}

// Exact product of two floats via Dekker splitting (TwoProd).
static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf_vf(vfloat x, vfloat y) {
  vfloat xh = vupper_vf_vf(x), xl = vsub_vf_vf_vf(x, xh);
  vfloat yh = vupper_vf_vf(y), yl = vsub_vf_vf_vf(y, yh);

  vfloat s = vmul_vf_vf_vf(x, y), t;

  t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(s));
  t = vmla_vf_vf_vf_vf(xl, yh, t);
  t = vmla_vf_vf_vf_vf(xh, yl, t);
  t = vmla_vf_vf_vf_vf(xl, yl, t);

  return vf2setxy_vf2_vf_vf(s, t);
}

// Product of a double-float and a float.
static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf(vfloat2 x, vfloat y) {
  vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);
  vfloat yh = vupper_vf_vf(y                 ), yl = vsub_vf_vf_vf(y                 , yh);

  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), y), t;

  t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(s));
  t = vmla_vf_vf_vf_vf(xl, yh, t);
  t = vmla_vf_vf_vf_vf(xh, yl, t);
  t = vmla_vf_vf_vf_vf(xl, yl, t);
  t = vmla_vf_vf_vf_vf(vf2gety_vf_vf2(x), y, t);  // fold in x's low part

  return vf2setxy_vf2_vf_vf(s, t);
}

// Product of two double-floats.
static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
  vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);
  vfloat yh = vupper_vf_vf(vf2getx_vf_vf2(y)), yl = vsub_vf_vf_vf(vf2getx_vf_vf2(y), yh);

  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y)), t;

  t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(s));
  t = vmla_vf_vf_vf_vf(xl, yh, t);
  t = vmla_vf_vf_vf_vf(xh, yl, t);
  t = vmla_vf_vf_vf_vf(xl, yl, t);
  t = vmla_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(y), t);  // cross terms
  t = vmla_vf_vf_vf_vf(vf2gety_vf_vf2(x), vf2getx_vf_vf2(y), t);

  return vf2setxy_vf2_vf_vf(s, t);
}

// Product of two double-floats, returning only a single float result.
// Partial products are summed smallest-first via vadd_vf_6vf.
static INLINE CONST VECTOR_CC vfloat dfmul_vf_vf2_vf2(vfloat2 x, vfloat2 y) {
  vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);
  vfloat yh = vupper_vf_vf(vf2getx_vf_vf2(y)), yl = vsub_vf_vf_vf(vf2getx_vf_vf2(y), yh);

  return vadd_vf_6vf(vmul_vf_vf_vf(vf2gety_vf_vf2(x), yh), vmul_vf_vf_vf(xh, vf2gety_vf_vf2(y)), vmul_vf_vf_vf(xl, yl), vmul_vf_vf_vf(xh, yl), vmul_vf_vf_vf(xl, yh), vmul_vf_vf_vf(xh, yh));
}

// Square of a double-float.
static INLINE CONST VECTOR_CC vfloat2 dfsqu_vf2_vf2(vfloat2 x) {
  vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);

  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x)), t;

  t = vmla_vf_vf_vf_vf(xh, xh, vneg_vf_vf(s));
  t = vmla_vf_vf_vf_vf(vadd_vf_vf_vf(xh, xh), xl, t);  // 2*xh*xl
  t = vmla_vf_vf_vf_vf(xl, xl, t);
  t = vmla_vf_vf_vf_vf(vf2getx_vf_vf2(x), vadd_vf_vf_vf(vf2gety_vf_vf2(x), vf2gety_vf_vf2(x)), t);  // 2*x.x*x.y

  return vf2setxy_vf2_vf_vf(s, t);
}

// Square of a double-float, returning only a single float result.
static INLINE CONST VECTOR_CC vfloat dfsqu_vf_vf2(vfloat2 x) {
  vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);

  return vadd_vf_5vf(vmul_vf_vf_vf(xh, vf2gety_vf_vf2(x)), vmul_vf_vf_vf(xh, vf2gety_vf_vf2(x)), vmul_vf_vf_vf(xl, xl), vadd_vf_vf_vf(vmul_vf_vf_vf(xh, xl), vmul_vf_vf_vf(xh, xl)), vmul_vf_vf_vf(xh, xh));
}

// Reciprocal of a float as a double-float (one Newton-style correction;
// u accumulates d*t - 1 from exact half-products).
static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf(vfloat d) {
  vfloat t = vrec_vf_vf(d);
  vfloat dh = vupper_vf_vf(d), dl = vsub_vf_vf_vf(d, dh);
  vfloat th = vupper_vf_vf(t), tl = vsub_vf_vf_vf(t, th);

  vfloat u = vcast_vf_f(-1);
  u = vmla_vf_vf_vf_vf(dh, th, u);
  u = vmla_vf_vf_vf_vf(dh, tl, u);
  u = vmla_vf_vf_vf_vf(dl, th, u);
  u = vmla_vf_vf_vf_vf(dl, tl, u);

  return vf2setxy_vf2_vf_vf(t, vmul_vf_vf_vf(vneg_vf_vf(t), u));
}

// Reciprocal of a double-float (same scheme; d's low part is folded in).
static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf2(vfloat2 d) {
  vfloat t = vrec_vf_vf(vf2getx_vf_vf2(d));
  vfloat dh = vupper_vf_vf(vf2getx_vf_vf2(d)), dl = vsub_vf_vf_vf(vf2getx_vf_vf2(d), dh);
  vfloat th = vupper_vf_vf(t                ), tl = vsub_vf_vf_vf(t                , th);

  vfloat u = vcast_vf_f(-1);
  u = vmla_vf_vf_vf_vf(dh, th, u);
  u = vmla_vf_vf_vf_vf(dh, tl, u);
  u = vmla_vf_vf_vf_vf(dl, th, u);
  u = vmla_vf_vf_vf_vf(dl, tl, u);
  u = vmla_vf_vf_vf_vf(vf2gety_vf_vf2(d), t, u);

  return vf2setxy_vf2_vf_vf(t, vmul_vf_vf_vf(vneg_vf_vf(t), u));
}
#endif
|
||||
|
||||
// Square root of a double-float.
static INLINE CONST VECTOR_CC vfloat2 dfsqrt_vf2_vf2(vfloat2 d) {
#ifdef ENABLE_RECSQRT_SP
  // Reciprocal-sqrt path: refine x ~= 1/sqrt(d) and form
  // -0.5 * r * (r*x - 3), where r = d*x.
  vfloat x = vrecsqrt_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)));
  vfloat2 r = dfmul_vf2_vf2_vf(d, x);
  return dfscale_vf2_vf2_vf(dfmul_vf2_vf2_vf2(r, dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf(r, x), vcast_vf_f(-3.0))), vcast_vf_f(-0.5));
#else
  // Heron-style refinement of the hardware sqrt of (d.x + d.y):
  // 0.5 * (d + t*t) / t.
  vfloat t = vsqrt_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)));
  return dfscale_vf2_vf2_vf(dfmul_vf2_vf2_vf2(dfadd2_vf2_vf2_vf2(d, dfmul_vf2_vf_vf(t, t)), dfrec_vf2_vf(t)), vcast_vf_f(0.5));
#endif
}

// Square root of a plain float as a double-float (same Heron refinement).
static INLINE CONST VECTOR_CC vfloat2 dfsqrt_vf2_vf(vfloat d) {
  vfloat t = vsqrt_vf_vf(d);
  return dfscale_vf2_vf2_vf(dfmul_vf2_vf2_vf2(dfadd2_vf2_vf_vf2(d, dfmul_vf2_vf_vf(t, t)), dfrec_vf2_vf(t)), vcast_vf_f(0.5f));
}
|
||||
@@ -0,0 +1,40 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
// These are macros for evaluating polynomials using Estrin's method
|
||||
|
||||
// POLYn(x, ..., c_{n-1}, ..., c0) evaluates the degree-(n-1) polynomial
// c_{n-1}*x^{n-1} + ... + c1*x + c0 with Estrin's scheme.  The caller must
// precompute and pass the powers x2 = x^2, x4 = x^4, x8 = x^8, x16 = x^16
// as required by each macro.  MLA(a, b, c) is the multiply-add a*b + c and
// C2V broadcasts a scalar coefficient to a vector (both defined by the
// including translation unit).  Coefficients are listed highest degree
// first.
#define POLY2(x, c1, c0) MLA(x, C2V(c1), C2V(c0))
#define POLY3(x, x2, c2, c1, c0) MLA(x2, C2V(c2), MLA(x, C2V(c1), C2V(c0)))
#define POLY4(x, x2, c3, c2, c1, c0) MLA(x2, MLA(x, C2V(c3), C2V(c2)), MLA(x, C2V(c1), C2V(c0)))
#define POLY5(x, x2, x4, c4, c3, c2, c1, c0) MLA(x4, C2V(c4), POLY4(x, x2, c3, c2, c1, c0))
#define POLY6(x, x2, x4, c5, c4, c3, c2, c1, c0) MLA(x4, POLY2(x, c5, c4), POLY4(x, x2, c3, c2, c1, c0))
#define POLY7(x, x2, x4, c6, c5, c4, c3, c2, c1, c0) MLA(x4, POLY3(x, x2, c6, c5, c4), POLY4(x, x2, c3, c2, c1, c0))
#define POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0) MLA(x4, POLY4(x, x2, c7, c6, c5, c4), POLY4(x, x2, c3, c2, c1, c0))
#define POLY9(x, x2, x4, x8, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, C2V(c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY10(x, x2, x4, x8, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY2(x, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY11(x, x2, x4, x8, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY3(x, x2, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY12(x, x2, x4, x8, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY4(x, x2, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY13(x, x2, x4, x8, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY5(x, x2, x4, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY14(x, x2, x4, x8, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY6(x, x2, x4, cd, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY15(x, x2, x4, x8, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY7(x, x2, x4, ce, cd, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY8(x, x2, x4, cf, ce, cd, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY17(x, x2, x4, x8, x16, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x16, C2V(d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY18(x, x2, x4, x8, x16, d1, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x16, POLY2(x, d1, d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY19(x, x2, x4, x8, x16, d2, d1, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x16, POLY3(x, x2, d2, d1, d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY20(x, x2, x4, x8, x16, d3, d2, d1, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x16, POLY4(x, x2, d3, d2, d1, d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY21(x, x2, x4, x8, x16, d4, d3, d2, d1, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x16, POLY5(x, x2, x4, d4, d3, d2, d1, d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))
|
||||
@@ -0,0 +1,92 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <quadmath.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
// Convert an MPFR value to __float128 by peeling off three successive
// double approximations of the fraction (d0 + d1 + d2 carries well over
// the 113 bits of a binary128 significand) and rescaling with ldexpq.
// NaN is detected up front via the double conversion.  Note: the 'rnd'
// parameter is accepted for signature symmetry but the conversions below
// all use GMP_RNDN.
static __float128 mpfr_get_f128(mpfr_t m, mpfr_rnd_t rnd) {
  if (isnan(mpfr_get_d(m, GMP_RNDN))) return __builtin_nan("");

  mpfr_t frr, frd;
  mpfr_inits(frr, frd, NULL);

  // Split m into fraction frr in [0.5, 1) and exponent e.
  mpfr_exp_t e;
  mpfr_frexp(&e, frr, m, GMP_RNDN);

  // First double approximation and its residual.
  double d0 = mpfr_get_d(frr, GMP_RNDN);
  mpfr_set_d(frd, d0, GMP_RNDN);
  mpfr_sub(frr, frr, frd, GMP_RNDN);

  // Second double approximation of the residual.
  double d1 = mpfr_get_d(frr, GMP_RNDN);
  mpfr_set_d(frd, d1, GMP_RNDN);
  mpfr_sub(frr, frr, frd, GMP_RNDN);

  // Third (final) residual approximation.
  double d2 = mpfr_get_d(frr, GMP_RNDN);

  mpfr_clears(frr, frd, NULL);
  // Sum smallest-first, then restore the exponent.
  return ldexpq((__float128)d2 + (__float128)d1 + (__float128)d0, e);
}
|
||||
|
||||
// Set an MPFR value from a __float128 by round-tripping through a decimal
// string with 50 significant digits (more than binary128's ~36 decimal
// digits, so the conversion is value-preserving).
static void mpfr_set_f128(mpfr_t frx, __float128 f, mpfr_rnd_t rnd) {
  char s[128];
  quadmath_snprintf(s, 120, "%.50Qg", f);
  mpfr_set_str(frx, s, 10, rnd);
}

// Print a __float128 to stdout with 50 significant digits (no newline).
static void printf128(__float128 f) {
  char s[128];
  quadmath_snprintf(s, 120, "%.50Qg", f);
  printf("%s", s);
}
|
||||
|
||||
// Rotating pool of result buffers: up to 16 strings returned by toBC /
// toBCq may be live at once (e.g. several calls in one printf argument
// list).  Not thread-safe.
static char frstr[16][1000];
static int frstrcnt = 0;

// Render a double exactly as "[-]mantissa*2^exponent" -- presumably for
// feeding an arbitrary-precision calculator such as bc (TODO confirm
// against callers).  Returns a pointer into the rotating frstr pool.
static char *toBC(double d) {
  union {
    double d;
    uint64_t u64;
    int64_t i64;
  } cnv;

  cnv.d = d;

  int64_t l = cnv.i64;
  int e = (int)((l >> 52) & ~(-1L << 11));               // biased 11-bit exponent
  int s = (int)(l >> 63);                                // sign (non-zero if negative)
  // 52-bit fraction with the implicit leading bit restored (0 stays 0).
  l = d == 0 ? 0 : ((l & ~((-1L) << 52)) | (1L << 52));

  char *ptr = frstr[(frstrcnt++) & 15];

  // e - 0x3ff removes the bias; -52 accounts for the fraction width.
  sprintf(ptr, "%s%lld*2^%d", s != 0 ? "-" : "", (long long int)l, (e-0x3ff-52));
  return ptr;
}
|
||||
|
||||
// Render a __float128 exactly as "[-]mantissa*2^exponent" (binary128
// counterpart of toBC).  The 113-bit mantissa does not fit in a uint64_t,
// so it is printed as two decimal chunks: high part and a zero-padded
// 19-digit low part (10^19 base).  Returns a pointer into the rotating
// frstr pool; not thread-safe.
static char *toBCq(__float128 d) {
  union {
    __float128 d;
    __uint128_t u128;
  } cnv;

  cnv.d = d;

  __uint128_t m = cnv.u128;
  int e = (int)((m >> 112) & ~(-1L << 15));  // biased 15-bit exponent
  int s = (int)(m >> 127);                   // sign bit
  // 112-bit fraction with the implicit leading bit restored (0 stays 0).
  m = d == 0 ? 0 : ((m & ((((__uint128_t)1) << 112)-1)) | ((__uint128_t)1 << 112));

  // Split the mantissa into decimal chunks of 10^19.
  uint64_t h = m / UINT64_C(10000000000000000000);
  uint64_t l = m % UINT64_C(10000000000000000000);

  char *ptr = frstr[(frstrcnt++) & 15];

  // e - 0x3fff removes the bias; -112 accounts for the fraction width.
  sprintf(ptr, "%s%" PRIu64 "%019" PRIu64 "*2^%d", s != 0 ? "-" : "", h, l, (e-0x3fff-112));

  return ptr;
}
|
||||
|
||||
// NaN test for Sleef_quad: NaN is the only IEEE value not equal to itself.
static int xisnanq(Sleef_quad x) { return x != x; }
// Infinity test: matches both +inf and -inf.
static int xisinfq(Sleef_quad x) { return x == (Sleef_quad)__builtin_inf() || x == -(Sleef_quad)__builtin_inf(); }
// Finite test: neither NaN nor infinite.  Uses the local xisinfq wrapper
// for consistency (the original called libquadmath's isinfq directly,
// bypassing the Sleef_quad-typed sibling defined just above).
static int xisfiniteq(Sleef_quad x) { return !xisnanq(x) && !xisinfq(x); }
|
||||
@@ -0,0 +1,683 @@
|
||||
double2
|
||||
double3
|
||||
float2
|
||||
atan2k
|
||||
atan2kf
|
||||
atan2kf_u1
|
||||
atan2k_u1
|
||||
cospik
|
||||
cospifk
|
||||
dd
|
||||
dd2
|
||||
dd2geta_vd2_dd2
|
||||
dd2getb_vd2_dd2
|
||||
dd2setab_dd2_vd2_vd2
|
||||
ddabs_vd2_vd2
|
||||
ddadd2_vd2_vd2_vd
|
||||
ddadd2_vd2_vd2_vd2
|
||||
ddadd2_vd2_vd_vd
|
||||
ddadd2_vd2_vd_vd2
|
||||
ddadd_vd2_vd2_vd
|
||||
ddadd_vd2_vd2_vd2
|
||||
ddadd_vd2_vd_vd
|
||||
ddadd_vd2_vd_vd2
|
||||
dddiv_vd2_vd2_vd2
|
||||
ddi
|
||||
ddi_t
|
||||
ddigetdd_vd2_ddi
|
||||
ddigeti_vi_ddi
|
||||
ddisetdd_ddi_ddi_vd2
|
||||
ddisetddi_ddi_vd2_vi
|
||||
ddmla_vd2_vd2_vd2_vd2
|
||||
ddmla_vd2_vd_vd2_vd2
|
||||
ddmul_vd2_vd2_vd
|
||||
ddmul_vd2_vd2_vd2
|
||||
ddmul_vd2_vd_vd
|
||||
ddmul_vd_vd2_vd2
|
||||
ddneg_vd2_vd2
|
||||
ddnormalize_vd2_vd2
|
||||
ddrec_vd2_vd
|
||||
ddrec_vd2_vd2
|
||||
ddscale_vd2_vd2_d
|
||||
ddscale_vd2_vd2_vd
|
||||
ddsqrt_vd2_vd
|
||||
ddsqrt_vd2_vd2
|
||||
ddsqu_vd2_vd2
|
||||
ddsqu_vd_vd2
|
||||
ddsub_vd2_vd2_vd
|
||||
ddsub_vd2_vd2_vd2
|
||||
ddsub_vd2_vd_vd
|
||||
df
|
||||
df2
|
||||
df2geta_vf2_df2
|
||||
df2getb_vf2_df2
|
||||
df2setab_df2_vf2_vf2
|
||||
dfabs_vf2_vf2
|
||||
dfadd2_vf2_vf2_vf
|
||||
dfadd2_vf2_vf2_vf2
|
||||
dfadd2_vf2_vf_vf
|
||||
dfadd2_vf2_vf_vf2
|
||||
dfadd_vf2_vf2_vf
|
||||
dfadd_vf2_vf2_vf2
|
||||
dfadd_vf2_vf_vf
|
||||
dfadd_vf2_vf_vf2
|
||||
dfdiv_vf2_vf2_vf2
|
||||
dfi
|
||||
dfi_t
|
||||
dfigetdf_vf2_dfi
|
||||
dfigeti_vi2_dfi
|
||||
dfisetdf_dfi_dfi_vf2
|
||||
dfisetdfi_dfi_vf2_vi2
|
||||
dfmla_vf2_vf_vf2_vf2
|
||||
dfmul_vf2_vf2_vf
|
||||
dfmul_vf2_vf2_vf2
|
||||
dfmul_vf2_vf_vf
|
||||
dfmul_vf_vf2_vf2
|
||||
dfneg_vf2_vf2
|
||||
dfnormalize_vf2_vf2
|
||||
dfrec_vf2_vf
|
||||
dfrec_vf2_vf2
|
||||
dfscale_vf2_vf2_vf
|
||||
dfsqrt_vf2_vf
|
||||
dfsqrt_vf2_vf2
|
||||
dfsqu_vf2_vf2
|
||||
dfsqu_vf_vf2
|
||||
dfsub_vf2_vf2_vf
|
||||
dfsub_vf2_vf2_vf2
|
||||
dfsub_vf2_vf_vf
|
||||
di_t
|
||||
digetd_vd_di
|
||||
digeti_vi_di
|
||||
disetdi_di_vd_vi
|
||||
expk
|
||||
expk2
|
||||
expk2f
|
||||
expk3f
|
||||
expkf
|
||||
expm1fk
|
||||
expm1k
|
||||
fi_t
|
||||
figetd_vf_di
|
||||
figeti_vi2_di
|
||||
fisetdi_fi_vf_vi2
|
||||
gammafk
|
||||
gammak
|
||||
imdvq_vq_vm_vm
|
||||
logk
|
||||
logk2
|
||||
logk2f
|
||||
logk3f
|
||||
logkf
|
||||
poly2dd
|
||||
poly2dd_b
|
||||
poly2df
|
||||
poly2df_b
|
||||
poly4dd
|
||||
poly4df
|
||||
pragma
|
||||
rempi
|
||||
rempif
|
||||
rempisub
|
||||
rempisubf
|
||||
sinpifk
|
||||
sinpik
|
||||
td
|
||||
tdi_t
|
||||
tdigeti_vi_tdi
|
||||
tdigettd_vd3_tdi
|
||||
tdigetx_vd_tdi
|
||||
tdisettdi_tdi_vd3_vi
|
||||
tdx
|
||||
tdxgetd3_vd3_tdx
|
||||
tdxgetd3x_vd_tdx
|
||||
tdxgetd3y_vd_tdx
|
||||
tdxgetd3z_vd_tdx
|
||||
tdxgete_vm_tdx
|
||||
tdxsetd3_tdx_tdx_vd3
|
||||
tdxsete_tdx_tdx_vm
|
||||
tdxseted3_tdx_vm_vd3
|
||||
tdxsetexyz_tdx_vm_vd_vd_vd
|
||||
tdxsetx_tdx_tdx_vd
|
||||
tdxsetxyz_tdx_tdx_vd_vd_vd
|
||||
tdxsety_tdx_tdx_vd
|
||||
tdxsetz_tdx_tdx_vd
|
||||
vabs_vd_vd
|
||||
vabs_vf2_vf2
|
||||
vabs_vf_vf
|
||||
add128_vq_vq_vq
|
||||
vadd64_vm_vm_vm
|
||||
vadd_vd_3vd
|
||||
vadd_vd_4vd
|
||||
vadd_vd_5vd
|
||||
vadd_vd_6vd
|
||||
vadd_vd_7vd
|
||||
vadd_vd_vd_vd
|
||||
vadd_vf_3vf
|
||||
vadd_vf_4vf
|
||||
vadd_vf_5vf
|
||||
vadd_vf_6vf
|
||||
vadd_vf_7vf
|
||||
vadd_vf_vf_vf
|
||||
vadd_vi2_vi2_vi2
|
||||
vadd_vi_vi_vi
|
||||
vand_vi2_vi2_vi2
|
||||
vand_vi2_vo_vi2
|
||||
vand_vi_vi_vi
|
||||
vand_vi_vo_vi
|
||||
vand_vm_vm_vm
|
||||
vand_vm_vo32_vm
|
||||
vand_vm_vo64_vm
|
||||
vand_vo_vo_vo
|
||||
vandnot_vi2_vi2_vi2
|
||||
vandnot_vi2_vo_vi2
|
||||
vandnot_vi_vi_vi
|
||||
vandnot_vi_vo_vi
|
||||
vandnot_vm_vm_vm
|
||||
vandnot_vm_vo32_vm
|
||||
vandnot_vm_vo64_vm
|
||||
vandnot_vo_vo_vo
|
||||
vargquad
|
||||
vavailability_i
|
||||
cast_aq_vq
|
||||
vcast_d_vd
|
||||
vcast_f_vf
|
||||
vcast_vd2_d2
|
||||
vcast_vd2_d_d
|
||||
vcast_vd2_vd_vd
|
||||
vcast_vd_d
|
||||
vcast_vd_vi
|
||||
vcast_vd_vm
|
||||
vcast_vf2_d
|
||||
vcast_vf2_f_f
|
||||
vcast_vf2_vf_vf
|
||||
vcast_vf_f
|
||||
vcast_vf_vi2
|
||||
vcast_vi2_i
|
||||
vcast_vi2_i_i
|
||||
vcast_vi2_vm
|
||||
vcast_vi_i
|
||||
vcast_vi_vm
|
||||
vcast_vm_i64
|
||||
vcast_vm_i_i
|
||||
vcast_vm_u64
|
||||
vcast_vm_vi
|
||||
vcast_vm_vi2
|
||||
vcast_vm_vo
|
||||
vcast_vo_i
|
||||
vcast_vo32_vo64
|
||||
vcast_vo64_vo32
|
||||
cast_vq_aq
|
||||
vclearlsb_vd_vd_i
|
||||
vcopysign_vd_vd_vd
|
||||
vcopysign_vf_vf_vf
|
||||
vd
|
||||
vd2getx_vd_vd2
|
||||
vd2gety_vd_vd2
|
||||
vd2setx_vd2_vd2_vd
|
||||
vd2setxy_vd2_vd_vd
|
||||
vd2sety_vd2_vd2_vd
|
||||
vd3getx_vd_vd3
|
||||
vd3gety_vd_vd3
|
||||
vd3getz_vd_vd3
|
||||
vd3setx_vd3_vd3_vd
|
||||
vd3setxyz_vd3_vd_vd_vd
|
||||
vd3sety_vd3_vd3_vd
|
||||
vd3setz_vd3_vd3_vd
|
||||
vdiv_vd_vd_vd
|
||||
vdiv_vf_vf_vf
|
||||
vdouble
|
||||
vdouble2
|
||||
vdouble3
|
||||
veq64_vo_vm_vm
|
||||
veq_vi2_vi2_vi2
|
||||
veq_vi_vi_vi
|
||||
veq_vo_vd_vd
|
||||
veq_vo_vf_vf
|
||||
veq_vo_vi2_vi2
|
||||
veq_vo_vi_vi
|
||||
versatileVector
|
||||
vf2getx_vf_vf2
|
||||
vf2gety_vf_vf2
|
||||
vf2setx_vf2_vf2_vf
|
||||
vf2setxy_vf2_vf_vf
|
||||
vf2sety_vf2_vf2_vf
|
||||
vfloat
|
||||
vfloat2
|
||||
vfma_vd_vd_vd_vd
|
||||
vfma_vf_vf_vf_vf
|
||||
vfmann_vd_vd_vd_vd
|
||||
vfmann_vf_vf_vf_vf
|
||||
vfmanp_vd_vd_vd_vd
|
||||
vfmanp_vf_vf_vf_vf
|
||||
vfmapn_vd_vd_vd_vd
|
||||
vfmapn_vf_vf_vf_vf
|
||||
vfmapp_vd_vd_vd_vd
|
||||
vfmapp_vf_vf_vf_vf
|
||||
vgather_vd_p_vi
|
||||
vgather_vf_p_vi2
|
||||
vge_vo_vd_vd
|
||||
vge_vo_vf_vf
|
||||
vgetexp_vd_vd
|
||||
vgetexp_vf_vf
|
||||
vgetmant_vd_vd
|
||||
vgetmant_vf_vf
|
||||
vgt64_vo_vm_vm
|
||||
vgt_vi2_vi2_vi2
|
||||
vgt_vi_vi_vi
|
||||
vgt_vo_vd_vd
|
||||
vgt_vo_vf_vf
|
||||
vgt_vo_vi2_vi2
|
||||
vgt_vo_vi_vi
|
||||
vilogb2k_vi2_vf
|
||||
vilogb2k_vi_vd
|
||||
vilogb2k_vm_vd
|
||||
vilogb3k_vm_vd
|
||||
vilogbk_vi2_vf
|
||||
vilogbk_vi_vd
|
||||
vilogbk_vm_vd
|
||||
vint
|
||||
vint2
|
||||
vint64
|
||||
visinf2_vd_vd_vd
|
||||
visinf2_vf_vf_vf
|
||||
visinf_vo_vd
|
||||
visinf_vo_vf
|
||||
visint_vo_vd
|
||||
visint_vo_vf
|
||||
visminf_vo_vd
|
||||
visminf_vo_vf
|
||||
visnan_vo_vd
|
||||
visnan_vo_vf
|
||||
visnegzero_vo_vd
|
||||
visnegzero_vo_vf
|
||||
visnonfinite_vo_vd
|
||||
visnumber_vo_vd
|
||||
visnumber_vo_vf
|
||||
visodd_vo_vd
|
||||
vispinf_vo_vd
|
||||
vispinf_vo_vf
|
||||
vldexp1_vd_vd_vm
|
||||
vldexp2_vd_vd_vi
|
||||
vldexp2_vd_vd_vm
|
||||
vldexp2_vf_vf_vi2
|
||||
vldexp3_vd_vd_vi
|
||||
vldexp3_vd_vd_vm
|
||||
vldexp3_vf_vf_vi2
|
||||
vldexp_vd_vd_vi
|
||||
vldexp_vf_vf_vi2
|
||||
vle_vo_vd_vd
|
||||
vle_vo_vf_vf
|
||||
vload_vd_p
|
||||
vload_vf_p
|
||||
vloadu_vd_p
|
||||
vloadu_vf_p
|
||||
vloadu_vi2_p
|
||||
vloadu_vi_p
|
||||
loadu_vq_p
|
||||
vlt64_vo_vm_vm
|
||||
vlt_vo_vd_vd
|
||||
vlt_vo_vf_vf
|
||||
vmask
|
||||
vmax_vd_vd_vd
|
||||
vmax_vf_vf_vf
|
||||
vmin_vd_vd_vd
|
||||
vmin_vf_vf_vf
|
||||
vmla_vd_vd_vd_vd
|
||||
vmla_vf_vf_vf_vf
|
||||
vmlanp_vd_vd_vd_vd
|
||||
vmlanp_vf_vf_vf_vf
|
||||
vmlapn_vd_vd_vd_vd
|
||||
vmlapn_vf_vf_vf_vf
|
||||
vmlsubadd_vd_vd_vd_vd
|
||||
vmlsubadd_vf_vf_vf_vf
|
||||
vmul_vd_vd_vd
|
||||
vmul_vf_vf_vf
|
||||
vmulsign_vd_vd_vd
|
||||
vmulsign_vf_vf_vf
|
||||
vneg64_vm_vm
|
||||
vneg_vd_vd
|
||||
vneg_vf_vf
|
||||
vneg_vi2_vi2
|
||||
vneg_vi_vi
|
||||
vnegpos_vd_vd
|
||||
vnegpos_vf_vf
|
||||
vneq_vo_vd_vd
|
||||
vneq_vo_vf_vf
|
||||
vnot_vo32_vo32
|
||||
vnot_vo64_vo64
|
||||
vopmask
|
||||
vor_vi2_vi2_vi2
|
||||
vor_vi_vi_vi
|
||||
vor_vm_vm_vm
|
||||
vor_vm_vo32_vm
|
||||
vor_vm_vo64_vm
|
||||
vor_vo_vo_vo
|
||||
vorsign_vd_vd_vd
|
||||
vorsign_vf_vf_vf
|
||||
vposneg_vd_vd
|
||||
vposneg_vf_vf
|
||||
vpow2i_vd_vi
|
||||
vpow2i_vd_vm
|
||||
vpow2i_vf_vi2
|
||||
vprefetch_v_p
|
||||
vptrunc_vd_vd
|
||||
vptrunc_vf_vf
|
||||
vqgetx_vm_vq
|
||||
vqgety_vm_vq
|
||||
vqsetx_vq_vq_vm
|
||||
vqsetxy_vq_vm_vm
|
||||
vqsety_vq_vq_vm
|
||||
vquad
|
||||
vrec_vd_vd
|
||||
vrec_vf_vf
|
||||
vreinterpret_vd_vf
|
||||
vreinterpret_vd_vm
|
||||
vreinterpret_vf_vd
|
||||
vreinterpret_vf_vi2
|
||||
vreinterpret_vf_vm
|
||||
vreinterpret_vi2_vf
|
||||
vreinterpret_vi64_vm
|
||||
vreinterpret_vm_vd
|
||||
vreinterpret_vm_vf
|
||||
vreinterpret_vm_vi64
|
||||
vreinterpret_vm_vu64
|
||||
vreinterpret_vu64_vm
|
||||
vrev21_vd_vd
|
||||
vrev21_vf_vf
|
||||
vreva2_vd_vd
|
||||
vreva2_vf_vf
|
||||
vrint_vd_vd
|
||||
vrint2_vd_vd
|
||||
vrint_vf_vf
|
||||
vrint_vi2_vf
|
||||
vrint_vi_vd
|
||||
vrintfk2_vf_vf
|
||||
vrintk2_vd_vd
|
||||
vscatter2_v_p_i_i_vd
|
||||
vscatter2_v_p_i_i_vf
|
||||
vsel_vd2_vo_d_d_d_d
|
||||
vsel_vd2_vo_vd2_vd2
|
||||
vsel_vd_vo_d_d
|
||||
vsel_vd_vo_vd_vd
|
||||
vsel_vd_vo_vo_d_d_d
|
||||
vsel_vd_vo_vo_vo_d_d_d_d
|
||||
vsel_vf2_vo_f_f_f_f
|
||||
vsel_vf2_vo_vf2_vf2
|
||||
vsel_vf2_vo_vo_d_d_d
|
||||
vsel_vf2_vo_vo_vo_d_d_d_d
|
||||
vsel_vf_vo_f_f
|
||||
vsel_vf_vo_vf_vf
|
||||
vsel_vf_vo_vo_f_f_f
|
||||
vsel_vf_vo_vo_vo_f_f_f_f
|
||||
vsel_vi2_vf_vf_vi2_vi2
|
||||
vsel_vi2_vf_vi2
|
||||
vsel_vi2_vo_vi2_vi2
|
||||
vsel_vi_vd_vd_vi_vi
|
||||
vsel_vi_vd_vi
|
||||
vsel_vi_vo_vi_vi
|
||||
vsel_vm_vo64_vm_vm
|
||||
sel_vq_vo_vq_vq
|
||||
vsign_vd_vd
|
||||
vsign_vf_vf
|
||||
vsignbit_vm_vd
|
||||
vsignbit_vm_vf
|
||||
vsignbit_vo_vd
|
||||
vsignbit_vo_vf
|
||||
vsll_vi2_vi2_i
|
||||
vsll_vi_vi_i
|
||||
vsqrt_vd_vd
|
||||
vsqrt_vf_vf
|
||||
vsra_vi2_vi2_i
|
||||
vsra_vi_vi_i
|
||||
vsrl_vi2_vi2_i
|
||||
vsrl_vi_vi_i
|
||||
vsscatter2_v_p_i_i_vd
|
||||
vsscatter2_v_p_i_i_vf
|
||||
vstore_v_p_vd
|
||||
vstore_v_p_vf
|
||||
vstoreu_v_p_vd
|
||||
vstoreu_v_p_vf
|
||||
vstoreu_v_p_vi
|
||||
vstoreu_v_p_vi2
|
||||
storeu_v_p_vq
|
||||
vstream_v_p_vd
|
||||
vstream_v_p_vf
|
||||
vsub64_vm_vm_vm
|
||||
vsub_vd_3vd
|
||||
vsub_vd_4vd
|
||||
vsub_vd_5vd
|
||||
vsub_vd_6vd
|
||||
vsub_vd_vd_vd
|
||||
vsub_vf_3vf
|
||||
vsub_vf_4vf
|
||||
vsub_vf_5vf
|
||||
vsub_vf_vf_vf
|
||||
vsub_vi2_vi2_vi2
|
||||
vsub_vi_vi_vi
|
||||
vsubadd_vd_vd_vd
|
||||
vsubadd_vf_vf_vf
|
||||
vtestallones_i_vo32
|
||||
vtestallones_i_vo64
|
||||
vtestallzeros_i_vo64
|
||||
vtoward0_vd_vd
|
||||
vtoward0_vf_vf
|
||||
vtruncate_vd_vd
|
||||
vtruncate2_vd_vd
|
||||
vtruncate_vf_vf
|
||||
vtruncate_vi2_vf
|
||||
vtruncate_vi_vd
|
||||
vtruncate_vm_vd
|
||||
vugt64_vo_vm_vm
|
||||
vuint64
|
||||
vupper_vd_vd
|
||||
vupper_vf_vf
|
||||
vxor_vi2_vi2_vi2
|
||||
vxor_vi_vi_vi
|
||||
vxor_vm_vm_vm
|
||||
vxor_vm_vo32_vm
|
||||
vxor_vm_vo64_vm
|
||||
vxor_vo_vo_vo
|
||||
#
|
||||
abs_tdx_tdx
|
||||
abs_vd3_vd3
|
||||
acos_tdx_tdx
|
||||
acosh_tdx_tdx
|
||||
add2_vd3_vd2_vd3
|
||||
add2_vd3_vd3_vd3
|
||||
add2_vd3_vd_vd3
|
||||
add_tdx_tdx_tdx
|
||||
add_vd3_vd2_vd3
|
||||
add_vd3_vd_vd3
|
||||
asin_tdx_tdx
|
||||
asinh_tdx_tdx
|
||||
atan2_tdx_tdx_tdx
|
||||
atan_tdx_tdx
|
||||
atanh_tdx_tdx
|
||||
cast_tdx_d
|
||||
cast_tdx_d_d_d
|
||||
cast_tdx_vd
|
||||
cast_tdx_vd3
|
||||
cast_tdx_vq
|
||||
cast_vd3_d3
|
||||
cast_vd3_d_d_d
|
||||
cast_vd3_tdx
|
||||
cast_vd3_vd_vd_vd
|
||||
cast_vd_tdx
|
||||
cast_vq_tdx
|
||||
cmp_vm_tdx_tdx
|
||||
cmpcnv_vq_vq
|
||||
cos_tdx_tdx
|
||||
cosh_tdx_tdx
|
||||
div2_vd3_vd3_vd3
|
||||
div_tdx_tdx_tdx
|
||||
div_vd3_vd3_vd3
|
||||
eq_vo_tdx_tdx
|
||||
exp10_tdx_tdx
|
||||
exp10i
|
||||
exp10tab
|
||||
exp2_tdx_tdx
|
||||
exp_tdx_tdx
|
||||
expm1_tdx_tdx
|
||||
fastcast_tdx_vd3
|
||||
fastcast_tdx_vq
|
||||
fastcast_vq_tdx
|
||||
ge_vo_tdx_tdx
|
||||
gt_vo_tdx_tdx
|
||||
ilogb_vm_tdx
|
||||
isinf_vo_vq
|
||||
isint_vo_tdx
|
||||
isminf_vo_vq
|
||||
isnan_vo_tdx
|
||||
isnan_vo_vq
|
||||
isnonfinite_vo_vq
|
||||
isnonfinite_vo_vq_vq
|
||||
isnonfinite_vo_vq_vq_vq
|
||||
isodd_vo_tdx
|
||||
ispinf_vo_vq
|
||||
iszero_vo_tdx
|
||||
iszero_vo_vq
|
||||
le_vo_tdx_tdx
|
||||
log10_tdx_tdx
|
||||
log1p_tdx_tdx
|
||||
log2_tdx_tdx
|
||||
log_tdx_tdx
|
||||
logk_tdx_tdx
|
||||
lt_vo_tdx_tdx
|
||||
mla_vd3_vd3_vd3_vd3
|
||||
modf_tdx_tdx_ptdx
|
||||
mul2_vd3_vd3_vd3
|
||||
mul_tdx_tdx_tdx
|
||||
mul_vd3_vd2_vd2
|
||||
mul_vd3_vd2_vd3
|
||||
mul_vd3_vd3_vd
|
||||
mul_vd3_vd3_vd2
|
||||
mul_vd3_vd3_vd3
|
||||
mulsign_tdx_tdx_vd
|
||||
mulsign_vd3_vd3_vd
|
||||
mulsign_vq_vq_vq
|
||||
neg_tdx_tdx
|
||||
neg_vd3_vd3
|
||||
neq_vo_tdx_tdx
|
||||
normalize_vd3_vd3
|
||||
poly10dd
|
||||
poly10dd_b
|
||||
poly11dd
|
||||
poly11dd_b
|
||||
poly12dd
|
||||
poly12dd_b
|
||||
poly13dd
|
||||
poly13dd_b
|
||||
poly14dd
|
||||
poly14dd_b
|
||||
poly15dd
|
||||
poly15dd_b
|
||||
poly16dd
|
||||
poly16dd_b
|
||||
poly17dd
|
||||
poly17dd_b
|
||||
poly18dd
|
||||
poly18dd_b
|
||||
poly19dd
|
||||
poly19dd_b
|
||||
poly20dd
|
||||
poly20dd_b
|
||||
poly21dd
|
||||
poly21dd_b
|
||||
poly22dd
|
||||
poly22dd_b
|
||||
poly23dd
|
||||
poly23dd_b
|
||||
poly24dd
|
||||
poly24dd_b
|
||||
poly25dd
|
||||
poly25dd_b
|
||||
poly26dd
|
||||
poly26dd_b
|
||||
poly27dd
|
||||
poly27dd_b
|
||||
poly2d
|
||||
poly2td
|
||||
poly2td_b
|
||||
poly3d
|
||||
poly3dd
|
||||
poly3dd_b
|
||||
poly3td
|
||||
poly3td_b
|
||||
poly4d
|
||||
poly4dd_b
|
||||
poly4td
|
||||
poly4td_b
|
||||
poly5d
|
||||
poly5dd
|
||||
poly5dd_b
|
||||
poly5td
|
||||
poly5td_b
|
||||
poly6d
|
||||
poly6dd
|
||||
poly6dd_b
|
||||
poly6td
|
||||
poly6td_b
|
||||
poly7d
|
||||
poly7dd
|
||||
poly7dd_b
|
||||
poly7td
|
||||
poly7td_b
|
||||
poly8d
|
||||
poly8dd
|
||||
poly8dd_b
|
||||
poly8td
|
||||
poly8td_b
|
||||
poly9dd
|
||||
poly9dd_b
|
||||
pow_tdx_tdx_tdx
|
||||
quickrenormalize_vd3_vd3
|
||||
quicktwosum_vd2_vd_vd
|
||||
rec_vd3_vd2
|
||||
rec_vd3_vd3
|
||||
rempio2q
|
||||
scale_vd3_vd3_d
|
||||
scale_vd3_vd3_vd
|
||||
scaleadd2_vd3_vd3_vd3_vd
|
||||
scalesub2_vd3_vd3_vd3_vd
|
||||
sel_tdx_vo_tdx_tdx
|
||||
sel_vd3_vo_vd3_vd3
|
||||
signbit_vo_tdx
|
||||
sin_tdx_tdx
|
||||
sinh_tdx_tdx
|
||||
slowcast_vq_tdx
|
||||
snprintquad
|
||||
snprintquadhex
|
||||
sqrt_tdx_tdx
|
||||
sqrt_vd3_vd3
|
||||
squ_vd3_vd3
|
||||
sub2_vd3_vd3_vd3
|
||||
sub_tdx_tdx_tdx
|
||||
tan_tdx_tdx
|
||||
tanh_tdx_tdx
|
||||
twoprod_vd2_vd_vd
|
||||
twosub_vd2_vd_vd
|
||||
twosubx_vd2_vd_vd_vd
|
||||
twosum_vd2_vd_vd
|
||||
twosumx_vd2_vd_vd_vd
|
||||
vtruncate2_vd_vd
|
||||
vfloor2_vd_vd
|
||||
vceil2_vd_vd
|
||||
vround2_vd_vd
|
||||
isinf_vo_tdx
|
||||
trunc_tdx_tdx
|
||||
rint_tdx_tdx
|
||||
fmod_tdx_tdx_tdx
|
||||
remainder_tdx_tdx_tdx
|
||||
cbrt_tdx_tdx
|
||||
frexp_tdx_tdx_pvi
|
||||
fma_tdx_tdx_tdx_tdx
|
||||
hypot_tdx_tdx_tdx
|
||||
ilogb_vi_tdx
|
||||
ldexp_tdx_tdx_vi
|
||||
Sleef_rempitabsp
|
||||
Sleef_rempitabdp
|
||||
Sleef_rempitabqp
|
||||
vcastu_vm_vi
|
||||
vcastu_vi_vm
|
||||
rvv_sp_vopmask
|
||||
rvv_dp_vopmask
|
||||
@@ -0,0 +1,50 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <signal.h>
|
||||
#include <setjmp.h>
|
||||
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__)
|
||||
static jmp_buf sigjmp;
|
||||
#define SETJMP(x) setjmp(x)
|
||||
#define LONGJMP longjmp
|
||||
#else
|
||||
static sigjmp_buf sigjmp;
|
||||
#define SETJMP(x) sigsetjmp(x, 1)
|
||||
#define LONGJMP siglongjmp
|
||||
#endif
|
||||
|
||||
int main2(int argc, char **argv);
|
||||
int check_feature(double, float);
|
||||
|
||||
static void sighandler(int signum) {
|
||||
LONGJMP(sigjmp, 1);
|
||||
}
|
||||
|
||||
int detectFeature() {
|
||||
signal(SIGILL, sighandler);
|
||||
|
||||
if (SETJMP(sigjmp) == 0) {
|
||||
int r = check_feature(1.0, 1.0f);
|
||||
signal(SIGILL, SIG_DFL);
|
||||
return r;
|
||||
} else {
|
||||
signal(SIGILL, SIG_DFL);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (!detectFeature()) {
|
||||
printf("0\n");
|
||||
fclose(stdout);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
return main2(argc, argv);
|
||||
}
|
||||
@@ -0,0 +1,332 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2024.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
//
|
||||
|
||||
#ifndef __MISC_H__
|
||||
#define __MISC_H__
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#ifndef M_PI
|
||||
#define M_PI 3.141592653589793238462643383279502884
|
||||
#endif
|
||||
|
||||
#ifndef M_PIl
|
||||
#define M_PIl 3.141592653589793238462643383279502884L
|
||||
#endif
|
||||
|
||||
#ifndef M_1_PI
|
||||
#define M_1_PI 0.318309886183790671537767526745028724
|
||||
#endif
|
||||
|
||||
#ifndef M_1_PIl
|
||||
#define M_1_PIl 0.318309886183790671537767526745028724L
|
||||
#endif
|
||||
|
||||
#ifndef M_2_PI
|
||||
#define M_2_PI 0.636619772367581343075535053490057448
|
||||
#endif
|
||||
|
||||
#ifndef M_2_PIl
|
||||
#define M_2_PIl 0.636619772367581343075535053490057448L
|
||||
#endif
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
|
||||
#ifndef SLEEF_FP_ILOGB0
|
||||
#define SLEEF_FP_ILOGB0 ((int)0x80000000)
|
||||
#endif
|
||||
|
||||
#ifndef SLEEF_FP_ILOGBNAN
|
||||
#define SLEEF_FP_ILOGBNAN ((int)2147483647)
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#define SLEEF_SNAN (((union { long long int i; double d; }) { .i = INT64_C(0x7ff0000000000001) }).d)
|
||||
#define SLEEF_SNANf (((union { long int i; float f; }) { .i = 0xff800001 }).f)
|
||||
|
||||
#define SLEEF_FLT_MIN 0x1p-126
|
||||
#define SLEEF_DBL_MIN 0x1p-1022
|
||||
#define SLEEF_INT_MAX 2147483647
|
||||
#define SLEEF_DBL_DENORM_MIN 4.9406564584124654e-324
|
||||
#define SLEEF_FLT_DENORM_MIN 1.40129846e-45F
|
||||
|
||||
//
|
||||
|
||||
/*
|
||||
PI_A to PI_D are constants that satisfy the following two conditions.
|
||||
|
||||
* For PI_A, PI_B and PI_C, the last 28 bits are zero.
|
||||
* PI_A + PI_B + PI_C + PI_D is close to PI as much as possible.
|
||||
|
||||
The argument of a trig function is multiplied by 1/PI, and the
|
||||
integral part is divided into two parts, each has at most 28
|
||||
bits. So, the maximum argument that could be correctly reduced
|
||||
should be 2^(28*2-1) PI = 1.1e+17. However, due to internal
|
||||
double precision calculation, the actual maximum argument that can
|
||||
be correctly reduced is around 2^47.
|
||||
*/
|
||||
|
||||
#define PI_A 3.1415926218032836914
|
||||
#define PI_B 3.1786509424591713469e-08
|
||||
#define PI_C 1.2246467864107188502e-16
|
||||
#define PI_D 1.2736634327021899816e-24
|
||||
#define TRIGRANGEMAX 1e+14
|
||||
|
||||
/*
|
||||
PI_A2 and PI_B2 are constants that satisfy the following two conditions.
|
||||
|
||||
* The last 3 bits of PI_A2 are zero.
|
||||
* PI_A2 + PI_B2 is close to PI as much as possible.
|
||||
|
||||
The argument of a trig function is multiplied by 1/PI, and the
|
||||
integral part is multiplied by PI_A2. So, the maximum argument that
|
||||
could be correctly reduced should be 2^(3-1) PI = 12.6. By testing,
|
||||
we confirmed that it correctly reduces the argument up to around 15.
|
||||
*/
|
||||
|
||||
#define PI_A2 3.141592653589793116
|
||||
#define PI_B2 1.2246467991473532072e-16
|
||||
#define TRIGRANGEMAX2 15
|
||||
|
||||
#define M_2_PI_H 0.63661977236758138243
|
||||
#define M_2_PI_L -3.9357353350364971764e-17
|
||||
|
||||
#define SQRT_DBL_MAX 1.3407807929942596355e+154
|
||||
|
||||
#define TRIGRANGEMAX3 1e+9
|
||||
|
||||
#define M_4_PI 1.273239544735162542821171882678754627704620361328125
|
||||
|
||||
#define L2U .69314718055966295651160180568695068359375
|
||||
#define L2L .28235290563031577122588448175013436025525412068e-12
|
||||
#define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931
|
||||
|
||||
#define L10U 0.30102999566383914498 // log 2 / log 10
|
||||
#define L10L 1.4205023227266099418e-13
|
||||
#define LOG10_2 3.3219280948873623478703194294893901758648313930
|
||||
|
||||
#define L10Uf 0.3010253906f
|
||||
#define L10Lf 4.605038981e-06f
|
||||
|
||||
//
|
||||
|
||||
#define PI_Af 3.140625f
|
||||
#define PI_Bf 0.0009670257568359375f
|
||||
#define PI_Cf 6.2771141529083251953e-07f
|
||||
#define PI_Df 1.2154201256553420762e-10f
|
||||
#define TRIGRANGEMAXf 39000
|
||||
|
||||
#define PI_A2f 3.1414794921875f
|
||||
#define PI_B2f 0.00011315941810607910156f
|
||||
#define PI_C2f 1.9841872589410058936e-09f
|
||||
#define TRIGRANGEMAX2f 125.0f
|
||||
|
||||
#define TRIGRANGEMAX4f 8e+6f
|
||||
|
||||
#define SQRT_FLT_MAX 18446743523953729536.0
|
||||
|
||||
#define L2Uf 0.693145751953125f
|
||||
#define L2Lf 1.428606765330187045e-06f
|
||||
|
||||
#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f
|
||||
#ifndef M_PIf
|
||||
# define M_PIf ((float)M_PI)
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
#ifndef MIN
|
||||
#define MIN(x, y) ((x) < (y) ? (x) : (y))
|
||||
#endif
|
||||
|
||||
#ifndef MAX
|
||||
#define MAX(x, y) ((x) > (y) ? (x) : (y))
|
||||
#endif
|
||||
|
||||
#ifndef ABS
|
||||
#define ABS(x) ((x) < 0 ? -(x) : (x))
|
||||
#endif
|
||||
|
||||
#define stringify(s) stringify_(s)
|
||||
#define stringify_(s) #s
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
typedef long double longdouble;
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_double2_DEFINED) && !defined(SLEEF_GENHEADER)
|
||||
#define Sleef_double2_DEFINED
|
||||
typedef struct {
|
||||
double x, y;
|
||||
} Sleef_double2;
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_float2_DEFINED) && !defined(SLEEF_GENHEADER)
|
||||
#define Sleef_float2_DEFINED
|
||||
typedef struct {
|
||||
float x, y;
|
||||
} Sleef_float2;
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_longdouble2_DEFINED) && !defined(SLEEF_GENHEADER)
|
||||
#define Sleef_longdouble2_DEFINED
|
||||
typedef struct {
|
||||
long double x, y;
|
||||
} Sleef_longdouble2;
|
||||
#endif
|
||||
|
||||
#if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
|
||||
|
||||
#define LIKELY(condition) __builtin_expect(!!(condition), 1)
|
||||
#define UNLIKELY(condition) __builtin_expect(!!(condition), 0)
|
||||
#define RESTRICT __restrict__
|
||||
|
||||
#ifndef __arm__
|
||||
#define ALIGNED(x) __attribute__((aligned(x)))
|
||||
#else
|
||||
#define ALIGNED(x)
|
||||
#endif
|
||||
|
||||
#if defined(SLEEF_GENHEADER)
|
||||
|
||||
#define INLINE SLEEF_ALWAYS_INLINE
|
||||
#define EXPORT SLEEF_INLINE
|
||||
#define CONST SLEEF_CONST
|
||||
#define NOEXPORT
|
||||
|
||||
#else // #if defined(SLEEF_GENHEADER)
|
||||
|
||||
#define CONST __attribute__((const))
|
||||
#define INLINE __attribute__((always_inline))
|
||||
|
||||
#if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)
|
||||
#ifndef SLEEF_STATIC_LIBS
|
||||
#define EXPORT __stdcall __declspec(dllexport)
|
||||
#define NOEXPORT
|
||||
#else // #ifndef SLEEF_STATIC_LIBS
|
||||
#define EXPORT
|
||||
#define NOEXPORT
|
||||
#endif // #ifndef SLEEF_STATIC_LIBS
|
||||
#else // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)
|
||||
#define EXPORT __attribute__((visibility("default")))
|
||||
#define NOEXPORT __attribute__ ((visibility ("hidden")))
|
||||
#endif // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)
|
||||
|
||||
#endif // #if defined(SLEEF_GENHEADER)
|
||||
|
||||
#define SLEEF_NAN __builtin_nan("")
|
||||
#define SLEEF_NANf __builtin_nanf("")
|
||||
#define SLEEF_NANl __builtin_nanl("")
|
||||
#define SLEEF_INFINITY __builtin_inf()
|
||||
#define SLEEF_INFINITYf __builtin_inff()
|
||||
#define SLEEF_INFINITYl __builtin_infl()
|
||||
|
||||
#if defined(__INTEL_COMPILER) || defined (__clang__)
|
||||
#define SLEEF_INFINITYq __builtin_inf()
|
||||
#define SLEEF_NANq __builtin_nan("")
|
||||
#else
|
||||
#define SLEEF_INFINITYq __builtin_infq()
|
||||
#define SLEEF_NANq (SLEEF_INFINITYq - SLEEF_INFINITYq)
|
||||
#endif
|
||||
|
||||
#elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
|
||||
|
||||
#if defined(SLEEF_GENHEADER)
|
||||
|
||||
#define INLINE SLEEF_ALWAYS_INLINE
|
||||
#define CONST SLEEF_CONST
|
||||
#define EXPORT SLEEF_INLINE
|
||||
#define NOEXPORT
|
||||
|
||||
#else // #if defined(SLEEF_GENHEADER)
|
||||
|
||||
#define INLINE __forceinline
|
||||
#define CONST
|
||||
#ifndef SLEEF_STATIC_LIBS
|
||||
#define EXPORT __declspec(dllexport)
|
||||
#define NOEXPORT
|
||||
#else
|
||||
#define EXPORT
|
||||
#define NOEXPORT
|
||||
#endif
|
||||
|
||||
#endif // #if defined(SLEEF_GENHEADER)
|
||||
|
||||
#define RESTRICT
|
||||
#define ALIGNED(x)
|
||||
#define LIKELY(condition) (condition)
|
||||
#define UNLIKELY(condition) (condition)
|
||||
|
||||
#if (defined(__GNUC__) || defined(__CLANG__)) && (defined(__i386__) || defined(__x86_64__)) && !defined(SLEEF_GENHEADER)
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
|
||||
#define SLEEF_INFINITY (1e+300 * 1e+300)
|
||||
#define SLEEF_NAN (SLEEF_INFINITY - SLEEF_INFINITY)
|
||||
#define SLEEF_INFINITYf ((float)SLEEF_INFINITY)
|
||||
#define SLEEF_NANf ((float)SLEEF_NAN)
|
||||
#define SLEEF_INFINITYl ((long double)SLEEF_INFINITY)
|
||||
#define SLEEF_NANl ((long double)SLEEF_NAN)
|
||||
|
||||
#if (defined(_M_AMD64) || defined(_M_X64))
|
||||
#ifndef __SSE2__
|
||||
#define __SSE2__
|
||||
#define __SSE3__
|
||||
#define __SSE4_1__
|
||||
#endif
|
||||
#elif _M_IX86_FP == 2
|
||||
#ifndef __SSE2__
|
||||
#define __SSE2__
|
||||
#define __SSE3__
|
||||
#define __SSE4_1__
|
||||
#endif
|
||||
#elif _M_IX86_FP == 1
|
||||
#ifndef __SSE__
|
||||
#define __SSE__
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif // #elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
|
||||
|
||||
#if !defined(__linux__)
|
||||
#define isinff(x) ((x) == SLEEF_INFINITYf || (x) == -SLEEF_INFINITYf)
|
||||
#define isinfl(x) ((x) == SLEEF_INFINITYl || (x) == -SLEEF_INFINITYl)
|
||||
#define isnanf(x) ((x) != (x))
|
||||
#define isnanl(x) ((x) != (x))
|
||||
#endif
|
||||
|
||||
#endif // #ifndef __MISC_H__
|
||||
|
||||
#ifdef ENABLE_AAVPCS
|
||||
#define VECTOR_CC __attribute__((aarch64_vector_pcs))
|
||||
#else
|
||||
#define VECTOR_CC
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
#if defined (__GNUC__) && !defined(__INTEL_COMPILER)
|
||||
#pragma GCC diagnostic ignored "-Wpragmas"
|
||||
#pragma GCC diagnostic ignored "-Wunknown-pragmas"
|
||||
#if !defined (__clang__)
|
||||
#pragma GCC diagnostic ignored "-Wattribute-alias"
|
||||
#pragma GCC diagnostic ignored "-Wlto-type-mismatch"
|
||||
#pragma GCC diagnostic ignored "-Wstringop-overflow"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable:4101) // warning C4101: 'v': unreferenced local variable
|
||||
#pragma warning(disable:4116) // warning C4116: unnamed type definition in parentheses
|
||||
#pragma warning(disable:4244) // warning C4244: 'function': conversion from 'vopmask' to '__mmask8', possible loss of data
|
||||
#pragma warning(disable:4267) // warning C4267: 'initializing': conversion from 'size_t' to 'const int', possible loss of data
|
||||
#pragma warning(disable:4305) // warning C4305: 'function': truncation from 'double' to 'float'
|
||||
#endif
|
||||
@@ -0,0 +1,99 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
|
||||
#if (defined(__SIZEOF_FLOAT128__) && __SIZEOF_FLOAT128__ == 16) || (defined(__linux__) && defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || (defined(__PPC64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 8)
|
||||
#define SLEEF_FLOAT128_IS_IEEEQP
|
||||
#endif
|
||||
|
||||
#if !defined(SLEEF_FLOAT128_IS_IEEEQP) && defined(__SIZEOF_LONG_DOUBLE__) && __SIZEOF_LONG_DOUBLE__ == 16 && (defined(__aarch64__) || defined(__zarch__))
|
||||
#define SLEEF_LONGDOUBLE_IS_IEEEQP
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_quad_DEFINED)
|
||||
#define Sleef_quad_DEFINED
|
||||
typedef struct { uint64_t x, y; } Sleef_uint64_2t;
|
||||
#if defined(SLEEF_FLOAT128_IS_IEEEQP) || defined(ENABLEFLOAT128)
|
||||
typedef __float128 Sleef_quad;
|
||||
#define SLEEF_QUAD_C(x) (x ## Q)
|
||||
#elif defined(SLEEF_LONGDOUBLE_IS_IEEEQP)
|
||||
typedef long double Sleef_quad;
|
||||
#define SLEEF_QUAD_C(x) (x ## L)
|
||||
#else
|
||||
typedef Sleef_uint64_2t Sleef_quad;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_quad1_DEFINED)
|
||||
#define Sleef_quad1_DEFINED
|
||||
typedef union {
|
||||
struct {
|
||||
Sleef_quad x;
|
||||
};
|
||||
Sleef_quad s[1];
|
||||
} Sleef_quad1;
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_quad2_DEFINED)
|
||||
#define Sleef_quad2_DEFINED
|
||||
typedef union {
|
||||
struct {
|
||||
Sleef_quad x, y;
|
||||
};
|
||||
Sleef_quad s[2];
|
||||
} Sleef_quad2;
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_quad4_DEFINED)
|
||||
#define Sleef_quad4_DEFINED
|
||||
typedef union {
|
||||
struct {
|
||||
Sleef_quad x, y, z, w;
|
||||
};
|
||||
Sleef_quad s[4];
|
||||
} Sleef_quad4;
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_quad8_DEFINED)
|
||||
#define Sleef_quad8_DEFINED
|
||||
typedef union {
|
||||
Sleef_quad s[8];
|
||||
} Sleef_quad8;
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE) && !defined(Sleef_quadx_DEFINED)
|
||||
#define Sleef_quadx_DEFINED
|
||||
typedef union {
|
||||
Sleef_quad s[32];
|
||||
} Sleef_quadx;
|
||||
#endif
|
||||
|
||||
|
||||
#else // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
SLEEFSHARPif !defined(SLEEFXXX__NVCC__) && ((defined(SLEEFXXX__SIZEOF_FLOAT128__) && SLEEFXXX__SIZEOF_FLOAT128__ == 16) || (defined(SLEEFXXX__linux__) && defined(SLEEFXXX__GNUC__) && (defined(SLEEFXXX__i386__) || defined(SLEEFXXX__x86_64__))) || (defined(SLEEFXXX__PPC64__) && defined(SLEEFXXX__GNUC__) && !defined(SLEEFXXX__clang__) && SLEEFXXX__GNUC__ >= 8))
|
||||
SLEEFSHARPdefine SLEEFXXXSLEEF_FLOAT128_IS_IEEEQP
|
||||
SLEEFSHARPendif
|
||||
|
||||
SLEEFSHARPif !defined(SLEEFXXXSLEEF_FLOAT128_IS_IEEEQP) && !defined(SLEEFXXX__NVCC__) && defined(SLEEFXXX__SIZEOF_LONG_DOUBLE__) && SLEEFXXX__SIZEOF_LONG_DOUBLE__ == 16 && (defined(SLEEFXXX__aarch64__) || defined(SLEEFXXX__zarch__))
|
||||
SLEEFSHARPdefine SLEEFXXXSLEEF_LONGDOUBLE_IS_IEEEQP
|
||||
SLEEFSHARPendif
|
||||
|
||||
SLEEFSHARPif !defined(SLEEFXXXSleef_quad_DEFINED)
|
||||
SLEEFSHARPdefine SLEEFXXXSleef_quad_DEFINED
|
||||
typedef struct { uint64_t x, y; } Sleef_uint64_2t;
|
||||
SLEEFSHARPif defined(SLEEFXXXSLEEF_FLOAT128_IS_IEEEQP)
|
||||
typedef __float128 Sleef_quad;
|
||||
SLEEFSHARPdefine SLEEFXXXSLEEF_QUAD_C(x) (x ## Q)
|
||||
SLEEFSHARPelif defined(SLEEFXXXSLEEF_LONGDOUBLE_IS_IEEEQP)
|
||||
typedef long double Sleef_quad;
|
||||
SLEEFSHARPdefine SLEEFXXXSLEEF_QUAD_C(x) (x ## L)
|
||||
SLEEFSHARPelse
|
||||
typedef Sleef_uint64_2t Sleef_quad;
|
||||
SLEEFSHARPendif
|
||||
SLEEFSHARPendif
|
||||
|
||||
#endif // #if !defined(SLEEF_GENHEADER)
|
||||
@@ -0,0 +1,201 @@
|
||||
# Compiler properties
|
||||
|
||||
set(CMAKE_C_FLAGS "${ORG_CMAKE_C_FLAGS} ${DFT_C_FLAGS}")
|
||||
|
||||
set(COMMON_TARGET_PROPERTIES
|
||||
C_STANDARD 99 # -std=gnu99
|
||||
)
|
||||
|
||||
#
|
||||
|
||||
function(add_test_dft TESTNAME)
|
||||
if (ARMIE_COMMAND)
|
||||
add_test(NAME ${TESTNAME} COMMAND ${ARMIE_COMMAND} -msve-vector-bits=${SVE_VECTOR_BITS} ${ARGN})
|
||||
elseif (NOT EMULATOR AND NOT SDE_COMMAND)
|
||||
add_test(NAME ${TESTNAME} COMMAND ${ARGN})
|
||||
elseif(NOT EMULATOR)
|
||||
add_test(NAME ${TESTNAME} COMMAND ${SDE_COMMAND} "--" ${ARGN})
|
||||
else()
|
||||
add_test(NAME ${TESTNAME} COMMAND ${EMULATOR} ${ARGN})
|
||||
endif()
|
||||
set_tests_properties(${TESTNAME} PROPERTIES COST 0.1)
|
||||
endfunction()
|
||||
|
||||
# Include directories
|
||||
|
||||
include_directories(${PROJECT_SOURCE_DIR}/include) # sleefdft.h
|
||||
include_directories(${sleef_BINARY_DIR}/include) # sleef.h
|
||||
if (FFTW3_INCLUDE_DIR)
|
||||
include_directories(${FFTW3_INCLUDE_DIR}) # fftw3.h
|
||||
endif()
|
||||
|
||||
# Link directories
|
||||
|
||||
link_directories(${sleef_BINARY_DIR}/lib) # libsleef, libsleefdft
|
||||
|
||||
# Link libraries
|
||||
|
||||
set(COMMON_LINK_LIBRARIES ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
|
||||
|
||||
if (COMPILER_SUPPORTS_OPENMP)
|
||||
set(COMMON_LINK_LIBRARIES ${COMMON_LINK_LIBRARIES} ${OpenMP_C_FLAGS})
|
||||
endif()
|
||||
|
||||
if((NOT MSVC) AND NOT SLEEF_CLANG_ON_WINDOWS)
|
||||
# Target executable naivetestdp
|
||||
set(TARGET_NAIVETESTDP "naivetestdp")
|
||||
add_executable(${TARGET_NAIVETESTDP} naivetest.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
|
||||
add_dependencies(${TARGET_NAIVETESTDP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
|
||||
target_compile_definitions(${TARGET_NAIVETESTDP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=1)
|
||||
target_link_libraries(${TARGET_NAIVETESTDP} ${COMMON_LINK_LIBRARIES})
|
||||
set_target_properties(${TARGET_NAIVETESTDP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
|
||||
# Target executable naivetestsp
|
||||
set(TARGET_NAIVETESTSP "naivetestsp")
|
||||
add_executable(${TARGET_NAIVETESTSP} naivetest.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
|
||||
add_dependencies(${TARGET_NAIVETESTSP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
|
||||
target_compile_definitions(${TARGET_NAIVETESTSP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=2)
|
||||
target_link_libraries(${TARGET_NAIVETESTSP} ${COMMON_LINK_LIBRARIES})
|
||||
set_target_properties(${TARGET_NAIVETESTSP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
|
||||
# Test naivetestdp
|
||||
add_test_dft(${TARGET_NAIVETESTDP}_1 $<TARGET_FILE:${TARGET_NAIVETESTDP}> 1)
|
||||
add_test_dft(${TARGET_NAIVETESTDP}_2 $<TARGET_FILE:${TARGET_NAIVETESTDP}> 2)
|
||||
add_test_dft(${TARGET_NAIVETESTDP}_3 $<TARGET_FILE:${TARGET_NAIVETESTDP}> 3)
|
||||
add_test_dft(${TARGET_NAIVETESTDP}_4 $<TARGET_FILE:${TARGET_NAIVETESTDP}> 4)
|
||||
add_test_dft(${TARGET_NAIVETESTDP}_5 $<TARGET_FILE:${TARGET_NAIVETESTDP}> 5)
|
||||
add_test_dft(${TARGET_NAIVETESTDP}_10 $<TARGET_FILE:${TARGET_NAIVETESTDP}> 10)
|
||||
|
||||
# Test naivetestsp
|
||||
add_test_dft(${TARGET_NAIVETESTSP}_1 $<TARGET_FILE:${TARGET_NAIVETESTSP}> 1)
|
||||
add_test_dft(${TARGET_NAIVETESTSP}_2 $<TARGET_FILE:${TARGET_NAIVETESTSP}> 2)
|
||||
add_test_dft(${TARGET_NAIVETESTSP}_3 $<TARGET_FILE:${TARGET_NAIVETESTSP}> 3)
|
||||
add_test_dft(${TARGET_NAIVETESTSP}_4 $<TARGET_FILE:${TARGET_NAIVETESTSP}> 4)
|
||||
add_test_dft(${TARGET_NAIVETESTSP}_5 $<TARGET_FILE:${TARGET_NAIVETESTSP}> 5)
|
||||
add_test_dft(${TARGET_NAIVETESTSP}_10 $<TARGET_FILE:${TARGET_NAIVETESTSP}> 10)
|
||||
endif()
|
||||
|
||||
# Target executable roundtriptest1ddp
|
||||
set(TARGET_ROUNDTRIPTEST1DDP "roundtriptest1ddp")
|
||||
add_executable(${TARGET_ROUNDTRIPTEST1DDP} roundtriptest1d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
|
||||
add_dependencies(${TARGET_ROUNDTRIPTEST1DDP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
|
||||
target_compile_definitions(${TARGET_ROUNDTRIPTEST1DDP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=1)
|
||||
target_link_libraries(${TARGET_ROUNDTRIPTEST1DDP} ${COMMON_LINK_LIBRARIES})
|
||||
set_target_properties(${TARGET_ROUNDTRIPTEST1DDP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
|
||||
# Target executable roundtriptest1dsp
|
||||
set(TARGET_ROUNDTRIPTEST1DSP "roundtriptest1dsp")
|
||||
add_executable(${TARGET_ROUNDTRIPTEST1DSP} roundtriptest1d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
|
||||
add_dependencies(${TARGET_ROUNDTRIPTEST1DSP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
|
||||
target_compile_definitions(${TARGET_ROUNDTRIPTEST1DSP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=2)
|
||||
target_link_libraries(${TARGET_ROUNDTRIPTEST1DSP} ${COMMON_LINK_LIBRARIES})
|
||||
set_target_properties(${TARGET_ROUNDTRIPTEST1DSP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
|
||||
# Target executable roundtriptest2ddp
|
||||
set(TARGET_ROUNDTRIPTEST2DDP "roundtriptest2ddp")
|
||||
add_executable(${TARGET_ROUNDTRIPTEST2DDP} roundtriptest2d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
|
||||
add_dependencies(${TARGET_ROUNDTRIPTEST2DDP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
|
||||
target_compile_definitions(${TARGET_ROUNDTRIPTEST2DDP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=1)
|
||||
target_link_libraries(${TARGET_ROUNDTRIPTEST2DDP} ${COMMON_LINK_LIBRARIES})
|
||||
set_target_properties(${TARGET_ROUNDTRIPTEST2DDP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
|
||||
# Target executable roundtriptest2dsp
|
||||
set(TARGET_ROUNDTRIPTEST2DSP "roundtriptest2dsp")
|
||||
add_executable(${TARGET_ROUNDTRIPTEST2DSP} roundtriptest2d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
|
||||
add_dependencies(${TARGET_ROUNDTRIPTEST2DSP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
|
||||
target_compile_definitions(${TARGET_ROUNDTRIPTEST2DSP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=2)
|
||||
target_link_libraries(${TARGET_ROUNDTRIPTEST2DSP} ${COMMON_LINK_LIBRARIES})
|
||||
set_target_properties(${TARGET_ROUNDTRIPTEST2DSP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
|
||||
if (LIBFFTW3 AND NOT SLEEF_DISABLE_FFTW)
|
||||
# Target executable fftwtest1ddp
|
||||
set(TARGET_FFTWTEST1DDP "fftwtest1ddp")
|
||||
add_executable(${TARGET_FFTWTEST1DDP} fftwtest1d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
|
||||
add_dependencies(${TARGET_FFTWTEST1DDP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
|
||||
target_compile_definitions(${TARGET_FFTWTEST1DDP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=1)
|
||||
target_link_libraries(${TARGET_FFTWTEST1DDP} ${COMMON_LINK_LIBRARIES} ${LIBFFTW3})
|
||||
set_target_properties(${TARGET_FFTWTEST1DDP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
|
||||
# Target executable fftwtest1dsp
|
||||
set(TARGET_FFTWTEST1DSP "fftwtest1dsp")
|
||||
add_executable(${TARGET_FFTWTEST1DSP} fftwtest1d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
|
||||
add_dependencies(${TARGET_FFTWTEST1DSP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
|
||||
target_compile_definitions(${TARGET_FFTWTEST1DSP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=2)
|
||||
target_link_libraries(${TARGET_FFTWTEST1DSP} ${COMMON_LINK_LIBRARIES} ${LIBFFTW3})
|
||||
set_target_properties(${TARGET_FFTWTEST1DSP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
|
||||
# Target executable fftwtest2ddp
|
||||
set(TARGET_FFTWTEST2DDP "fftwtest2ddp")
|
||||
add_executable(${TARGET_FFTWTEST2DDP} fftwtest2d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
|
||||
add_dependencies(${TARGET_FFTWTEST2DDP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
|
||||
target_compile_definitions(${TARGET_FFTWTEST2DDP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=1)
|
||||
target_link_libraries(${TARGET_FFTWTEST2DDP} ${COMMON_LINK_LIBRARIES} ${LIBFFTW3})
|
||||
set_target_properties(${TARGET_FFTWTEST2DDP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
|
||||
# Target executable fftwtest2dsp
|
||||
set(TARGET_FFTWTEST2DSP "fftwtest2dsp")
|
||||
add_executable(${TARGET_FFTWTEST2DSP} fftwtest2d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
|
||||
add_dependencies(${TARGET_FFTWTEST2DSP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
|
||||
target_compile_definitions(${TARGET_FFTWTEST2DSP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=2)
|
||||
target_link_libraries(${TARGET_FFTWTEST2DSP} ${COMMON_LINK_LIBRARIES} ${LIBFFTW3})
|
||||
set_target_properties(${TARGET_FFTWTEST2DSP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
|
||||
# Test fftwtest1ddp
|
||||
add_test_dft(${TARGET_FFTWTEST1DDP}_12 $<TARGET_FILE:${TARGET_FFTWTEST1DDP}> 12)
|
||||
add_test_dft(${TARGET_FFTWTEST1DDP}_16 $<TARGET_FILE:${TARGET_FFTWTEST1DDP}> 16)
|
||||
|
||||
# Test fftwtest1dsp
|
||||
add_test_dft(${TARGET_FFTWTEST1DSP}_12 $<TARGET_FILE:${TARGET_FFTWTEST1DSP}> 12)
|
||||
add_test_dft(${TARGET_FFTWTEST1DSP}_16 $<TARGET_FILE:${TARGET_FFTWTEST1DSP}> 16)
|
||||
|
||||
# Test fftwtest2ddp
|
||||
add_test_dft(${TARGET_FFTWTEST2DDP}_2_2 $<TARGET_FILE:${TARGET_FFTWTEST2DDP}> 2 2)
|
||||
add_test_dft(${TARGET_FFTWTEST2DDP}_4_4 $<TARGET_FILE:${TARGET_FFTWTEST2DDP}> 4 4)
|
||||
add_test_dft(${TARGET_FFTWTEST2DDP}_8_8 $<TARGET_FILE:${TARGET_FFTWTEST2DDP}> 8 8)
|
||||
add_test_dft(${TARGET_FFTWTEST2DDP}_10_10 $<TARGET_FILE:${TARGET_FFTWTEST2DDP}> 10 10)
|
||||
add_test_dft(${TARGET_FFTWTEST2DDP}_5_15 $<TARGET_FILE:${TARGET_FFTWTEST2DDP}> 5 15)
|
||||
|
||||
# Test fftwtest2dsp
|
||||
add_test_dft(${TARGET_FFTWTEST2DSP}_2_2 $<TARGET_FILE:${TARGET_FFTWTEST2DSP}> 2 2)
|
||||
add_test_dft(${TARGET_FFTWTEST2DSP}_4_4 $<TARGET_FILE:${TARGET_FFTWTEST2DSP}> 4 4)
|
||||
add_test_dft(${TARGET_FFTWTEST2DSP}_8_8 $<TARGET_FILE:${TARGET_FFTWTEST2DSP}> 8 8)
|
||||
add_test_dft(${TARGET_FFTWTEST2DSP}_10_10 $<TARGET_FILE:${TARGET_FFTWTEST2DSP}> 10 10)
|
||||
add_test_dft(${TARGET_FFTWTEST2DSP}_5_15 $<TARGET_FILE:${TARGET_FFTWTEST2DSP}> 5 15)
|
||||
else(LIBFFTW3 AND NOT SLEEF_DISABLE_FFTW)
|
||||
if(MSVC OR SLEEF_CLANG_ON_WINDOWS)
|
||||
# Test roundtriptestdp
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_1 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 1 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_2 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 2 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_3 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 3 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_4 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 4 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_5 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 5 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_10 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 10 10)
|
||||
|
||||
# Test roundtriptestsp
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_1 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 1 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_2 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 2 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_3 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 3 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_4 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 4 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_5 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 5 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_10 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 10 10)
|
||||
endif()
|
||||
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_12 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 12 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_16 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 16 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_12 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 12 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_16 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 16 10)
|
||||
|
||||
# Test roundtriptest2ddp
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST2DDP}_2_2 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DDP}> 2 2 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST2DDP}_4_4 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DDP}> 4 4 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST2DDP}_8_8 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DDP}> 8 8 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST2DDP}_10_10 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DDP}> 10 10 2)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST2DDP}_5_15 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DDP}> 5 15 2)
|
||||
|
||||
# Test roundtriptest2dsp
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST2DSP}_2_2 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DSP}> 2 2 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST2DSP}_4_4 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DSP}> 4 4 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST2DSP}_8_8 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DSP}> 8 8 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST2DSP}_10_10 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DSP}> 10 10 2)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST2DSP}_5_15 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DSP}> 5 15 2)
|
||||
endif(LIBFFTW3 AND NOT SLEEF_DISABLE_FFTW)
|
||||
@@ -0,0 +1,116 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#define _DEFAULT_SOURCE
|
||||
#define _XOPEN_SOURCE 700
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <complex.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
#ifdef USEFFTW
|
||||
#include <fftw3.h>
|
||||
#include <omp.h>
|
||||
#else
|
||||
#include "sleef.h"
|
||||
#include "sleefdft.h"
|
||||
#endif
|
||||
|
||||
typedef double real;
|
||||
|
||||
static uint64_t gettime() {
|
||||
struct timespec tp;
|
||||
clock_gettime(CLOCK_MONOTONIC, &tp);
|
||||
return (uint64_t)tp.tv_sec * 1000000000 + ((uint64_t)tp.tv_nsec);
|
||||
}
|
||||
|
||||
#define REPEAT 8
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc == 1) {
|
||||
fprintf(stderr, "%s <log2n>\n", argv[0]);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
int backward = 0;
|
||||
|
||||
int log2n = atoi(argv[1]);
|
||||
if (log2n < 0) {
|
||||
backward = 1;
|
||||
log2n = -log2n;
|
||||
}
|
||||
|
||||
const int n = 1 << log2n;
|
||||
const int64_t niter = (int)(100000000000.0 / n / log2n);
|
||||
|
||||
printf("Number of iterations = %lld\n", (long long int)niter);
|
||||
|
||||
#ifdef USEFFTW
|
||||
fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
|
||||
fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
|
||||
|
||||
#if 0
|
||||
int fftw_init_threads(void);
|
||||
fftw_plan_with_nthreads(omp_get_max_threads());
|
||||
#endif
|
||||
|
||||
fftw_plan w = fftw_plan_dft_1d(n, in, out, backward ? FFTW_BACKWARD : FFTW_FORWARD, FFTW_MEASURE);
|
||||
//fftw_plan w = fftw_plan_dft_1d(n, in, out, backward ? FFTW_BACKWARD : FFTW_FORWARD, FFTW_PATIENT);
|
||||
|
||||
for(int i=0;i<n;i++) {
|
||||
in[i] = (2.0 * (rand() / (double)RAND_MAX) - 1) + (2.0 * (rand() / (double)RAND_MAX) - 1) * _Complex_I;
|
||||
}
|
||||
|
||||
for(int64_t i=0;i<niter/2;i++) fftw_execute(w);
|
||||
#else
|
||||
SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET);
|
||||
|
||||
real *in = (real *)Sleef_malloc(n*2 * sizeof(real));
|
||||
real *out = (real *)Sleef_malloc(n*2 * sizeof(real));
|
||||
|
||||
int mode = SLEEF_MODE_MEASURE | SLEEF_MODE_VERBOSE; // | SLEEF_MODE_NO_MT;
|
||||
if (argc >= 3) mode = SLEEF_MODE_VERBOSE | SLEEF_MODE_ESTIMATE;
|
||||
|
||||
if (backward) mode |= SLEEF_MODE_BACKWARD;
|
||||
struct SleefDFT *p = SleefDFT_double_init1d(n, in, out, mode);
|
||||
|
||||
if (argc >= 3) SleefDFT_setPath(p, argv[2]);
|
||||
|
||||
for(int i=0;i<n*2;i++) {
|
||||
in[i] = (2.0 * (rand() / (double)RAND_MAX) - 1);
|
||||
}
|
||||
|
||||
for(int64_t i=0;i<niter/2;i++) SleefDFT_double_execute(p, in, out);
|
||||
#endif
|
||||
|
||||
for(int rep=0;rep<REPEAT;rep++) {
|
||||
uint64_t tm0 = gettime();
|
||||
for(int64_t i=0;i<niter;i++) {
|
||||
#ifdef USEFFTW
|
||||
fftw_execute(w);
|
||||
#else
|
||||
SleefDFT_double_execute(p, in, out);
|
||||
#endif
|
||||
}
|
||||
uint64_t tm1 = gettime();
|
||||
|
||||
printf("Actual time = %g ns\n", (double)(tm1 - tm0) / niter);
|
||||
double timeus = (tm1 - tm0) / ((double)niter * 1000);
|
||||
|
||||
double mflops = 5 * n * log2n / timeus;
|
||||
|
||||
printf("%g Mflops\n", mflops);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
exit(0);
|
||||
}
|
||||
@@ -0,0 +1,230 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <time.h>
|
||||
|
||||
#include <math.h>
|
||||
#include <complex.h>
|
||||
|
||||
#include "sleef.h"
|
||||
#include "sleefdft.h"
|
||||
|
||||
#include <fftw3.h>
|
||||
|
||||
#ifndef MODE
|
||||
#define MODE SLEEF_MODE_DEBUG
|
||||
#endif
|
||||
|
||||
#if BASETYPEID == 1
|
||||
#define THRES 1e-30
|
||||
#define SleefDFT_init1d SleefDFT_double_init1d
|
||||
#define SleefDFT_execute SleefDFT_double_execute
|
||||
typedef double real;
|
||||
#elif BASETYPEID == 2
|
||||
#define THRES 1e-13
|
||||
#define SleefDFT_init1d SleefDFT_float_init1d
|
||||
#define SleefDFT_execute SleefDFT_float_execute
|
||||
typedef float real;
|
||||
#else
|
||||
#error BASETYPEID not set
|
||||
#endif
|
||||
|
||||
// Square of x; helper for accumulating RMS-error terms.
static double squ(double x) {
  double sq = x * x;
  return sq;
}
|
||||
|
||||
// complex forward
|
||||
// Complex forward 1-D transform of size n: run SleefDFT and FFTW on the same
// random input and return the relative RMS error (noise power / signal power).
double check_cf(int n) {
  fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
  fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
  fftw_plan w = fftw_plan_dft_1d(n, in, out, FFTW_FORWARD, FFTW_ESTIMATE);

  // SLEEF side works on interleaved re/im pairs of type `real`
  // (double or float depending on BASETYPEID).
  real *sx = (real *)Sleef_malloc(n*2*sizeof(real));
  real *sy = (real *)Sleef_malloc(n*2*sizeof(real));
  struct SleefDFT *p = SleefDFT_init1d(n, sx, sy, MODE);

  // Same random values feed both libraries.
  for(int i=0;i<n;i++) {
    real re = (2.0 * random() - 1) / (real)RAND_MAX;
    real im = (2.0 * random() - 1) / (real)RAND_MAX;
    sx[(i*2+0)] = re;
    sx[(i*2+1)] = im;
    in[i] = re + im * _Complex_I;
  }

  // NULL in/out: the plan uses the buffers given at init time —
  // TODO confirm against the SleefDFT API.
  SleefDFT_execute(p, NULL, NULL);
  fftw_execute(w);

  double rmsn = 0, rmsd = 0;

  // Accumulate squared error (rmsn) and squared reference magnitude (rmsd).
  for(int i=0;i<n;i++) {
    rmsn += squ(sy[i*2+0] - creal(out[i])) + squ(sy[i*2+1] - cimag(out[i]));
    rmsd += squ(creal(out[i])) + squ(cimag(out[i]));
  }

  fftw_destroy_plan(w);
  fftw_free(in);
  fftw_free(out);

  Sleef_free(sx);
  Sleef_free(sy);
  SleefDFT_dispose(p);

  return rmsn / rmsd;
}
|
||||
|
||||
// complex backward
|
||||
// Complex backward 1-D transform of size n: SleefDFT vs FFTW on identical
// random input; returns the relative RMS error against FFTW's result.
double check_cb(int n) {
  fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
  fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
  fftw_plan w = fftw_plan_dft_1d(n, in, out, FFTW_BACKWARD, FFTW_ESTIMATE);

  real *sx = (real *)Sleef_malloc(n*2*sizeof(real));
  real *sy = (real *)Sleef_malloc(n*2*sizeof(real));
  struct SleefDFT *p = SleefDFT_init1d(n, sx, sy, SLEEF_MODE_BACKWARD | MODE);

  // Identical random data for both libraries (interleaved for SLEEF).
  for(int i=0;i<n;i++) {
    real re = (2.0 * random() - 1) / (real)RAND_MAX;
    real im = (2.0 * random() - 1) / (real)RAND_MAX;
    sx[(i*2+0)] = re;
    sx[(i*2+1)] = im;
    in[i] = re + im * _Complex_I;
  }

  SleefDFT_execute(p, NULL, NULL);
  fftw_execute(w);

  double rmsn = 0, rmsd = 0;

  for(int i=0;i<n;i++) {
    rmsn += squ(sy[i*2+0] - creal(out[i])) + squ(sy[i*2+1] - cimag(out[i]));
    rmsd += squ(creal(out[i])) + squ(cimag(out[i]));
  }

  fftw_destroy_plan(w);
  fftw_free(in);
  fftw_free(out);

  Sleef_free(sx);
  Sleef_free(sy);
  SleefDFT_dispose(p);

  return rmsn / rmsd;
}
|
||||
|
||||
// real forward
|
||||
// Real-to-complex forward transform of size n: SleefDFT (SLEEF_MODE_REAL)
// vs FFTW r2c. The spectrum has n/2+1 complex bins; returns relative RMS error.
double check_rf(int n) {
  double *in = (double *) fftw_malloc(sizeof(double) * n);
  fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * (n/2+1));
  fftw_plan w = fftw_plan_dft_r2c_1d(n, in, out, FFTW_ESTIMATE);

  real *sx = (real *)Sleef_malloc(n*sizeof(real));
  real *sy = (real *)Sleef_malloc((n/2+1)*sizeof(real)*2);
  struct SleefDFT *p = SleefDFT_init1d(n, sx, sy, SLEEF_MODE_REAL | MODE);

  // Same real-valued random input for both libraries.
  for(int i=0;i<n;i++) {
    real re = (2.0 * random() - 1) / (real)RAND_MAX;
    sx[i] = re;
    in[i] = re;
  }

  SleefDFT_execute(p, NULL, NULL);
  fftw_execute(w);

  double rmsn = 0, rmsd = 0;

  // Compare the n/2+1 non-redundant bins of the half-spectrum.
  for(int i=0;i<n/2+1;i++) {
    rmsn += squ(sy[i*2+0] - creal(out[i])) + squ(sy[i*2+1] - cimag(out[i]));
    rmsd += squ(creal(out[i])) + squ(cimag(out[i]));
  }

  fftw_destroy_plan(w);
  fftw_free(in);
  fftw_free(out);

  Sleef_free(sx);
  Sleef_free(sy);
  SleefDFT_dispose(p);

  return rmsn / rmsd;
}
|
||||
|
||||
// real backward
|
||||
// Complex-to-real backward transform of size n: SleefDFT vs FFTW c2r.
// Input is a half-spectrum of n/2+1 bins (bins 0 and n/2 carry only real
// parts, as required for a real time-domain result); returns relative RMS error.
double check_rb(int n) {
  fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * (n/2+1));
  double *out = (double *) fftw_malloc(sizeof(double) * n);
  fftw_plan w = fftw_plan_dft_c2r_1d(n, in, out, FFTW_ESTIMATE);

  real *sx = (real *)Sleef_malloc((n/2+1) * sizeof(real)*2);
  real *sy = (real *)Sleef_malloc(sizeof(real)*n);
  struct SleefDFT *p = SleefDFT_init1d(n, sx, sy, SLEEF_MODE_REAL | SLEEF_MODE_BACKWARD | MODE);

  // Fill bins 1..n/2-1 with random complex values; the DC and Nyquist
  // bins (0 and n/2) get pure real values on the i == 0 iteration.
  for(int i=0;i<n/2;i++) {
    if (i == 0) {
      in[0  ] = (2.0 * (rand() / (real)RAND_MAX) - 1);
      in[n/2] = (2.0 * (rand() / (real)RAND_MAX) - 1);
    } else {
      in[i  ] = (2.0 * (rand() / (real)RAND_MAX) - 1) + (2.0 * (rand() / (real)RAND_MAX) - 1) * _Complex_I;
    }
  }

  // Mirror the half-spectrum into SLEEF's interleaved layout.
  for(int i=0;i<n/2+1;i++) {
    sx[2*i+0] = creal(in[i]);
    sx[2*i+1] = cimag(in[i]);
  }

  SleefDFT_execute(p, NULL, NULL);
  fftw_execute(w);

  double rmsn = 0, rmsd = 0;

  // Both outputs are length-n real signals; compare directly.
  for(int i=0;i<n;i++) {
    rmsn += squ(sy[i] - out[i]);
    rmsd += squ(out[i]);
  }

  fftw_destroy_plan(w);
  fftw_free(in);
  fftw_free(out);

  Sleef_free(sx);
  Sleef_free(sy);
  SleefDFT_dispose(p);

  return rmsn / rmsd;
}
|
||||
|
||||
// Entry point: runs all four SleefDFT-vs-FFTW comparisons for size 2^log2n.
// Each check returns a relative RMS error, compared against the THRES bound
// selected by BASETYPEID. Exits 0 on success, -1 on any failure.
int main(int argc, char **argv) {
  if (argc != 2) {
    fprintf(stderr, "%s <log2n>\n", argv[0]);
    exit(-1);
  }

  const int n = 1 << atoi(argv[1]);

  srand((unsigned int)time(NULL));

  // Discard any stale plan cache; READONLY prevents writing a new one.
  SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET | SLEEF_PLAN_READONLY);

  //

  int success = 1;
  double e;

  e = check_cf(n);
  success = success && e < THRES;
  printf("complex forward : %s (%g)\n", e < THRES ? "OK" : "NG", e);
  e = check_cb(n);
  success = success && e < THRES;
  printf("complex backward : %s (%g)\n", e < THRES ? "OK" : "NG", e);
  e = check_rf(n);
  success = success && e < THRES;
  printf("real forward : %s (%g)\n", e < THRES ? "OK" : "NG", e);
  e = check_rb(n);
  success = success && e < THRES;
  printf("real backward : %s (%g)\n", e < THRES ? "OK" : "NG", e);

  exit(success ? 0 : -1);
}
|
||||
@@ -0,0 +1,143 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <time.h>
|
||||
|
||||
#include <math.h>
|
||||
#include <complex.h>
|
||||
|
||||
#include "sleef.h"
|
||||
#include "sleefdft.h"
|
||||
|
||||
#include <fftw3.h>
|
||||
|
||||
#ifndef MODE
|
||||
#define MODE SLEEF_MODE_DEBUG
|
||||
#endif
|
||||
|
||||
#if BASETYPEID == 1
|
||||
#define THRES 1e-30
|
||||
#define SleefDFT_init2d SleefDFT_double_init2d
|
||||
#define SleefDFT_execute SleefDFT_double_execute
|
||||
typedef double real;
|
||||
#elif BASETYPEID == 2
|
||||
#define THRES 1e-13
|
||||
#define SleefDFT_init2d SleefDFT_float_init2d
|
||||
#define SleefDFT_execute SleefDFT_float_execute
|
||||
typedef float real;
|
||||
#else
|
||||
#error BASETYPEID not set
|
||||
#endif
|
||||
|
||||
// Square of x; helper for accumulating RMS-error terms.
static double squ(double x) { return x * x; }
|
||||
|
||||
// complex forward
|
||||
// Complex forward 2-D transform (n x m): SleefDFT vs FFTW on identical
// random input; returns the relative RMS error against FFTW's result.
double check_cf(int n, int m) {
  fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n * m);
  fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n * m);
  fftw_plan w = fftw_plan_dft_2d(n, m, in, out, FFTW_FORWARD, FFTW_ESTIMATE);

  // SLEEF side: row-major n*m elements, interleaved re/im.
  real *sx = (real *)Sleef_malloc(n*m*2*sizeof(real));
  real *sy = (real *)Sleef_malloc(n*m*2*sizeof(real));
  struct SleefDFT *p = SleefDFT_init2d(n, m, sx, sy, MODE);

  for(int i=0;i<n*m;i++) {
    double re = (2.0 * random() - 1) / (double)RAND_MAX;
    double im = (2.0 * random() - 1) / (double)RAND_MAX;
    sx[(i*2+0)] = re;
    sx[(i*2+1)] = im;
    in[i] = re + im * _Complex_I;
  }

  SleefDFT_execute(p, NULL, NULL);
  fftw_execute(w);

  double rmsn = 0, rmsd = 0;

  for(int i=0;i<n*m;i++) {
    rmsn += squ(sy[i*2+0] - creal(out[i])) + squ(sy[i*2+1] - cimag(out[i]));
    rmsd += squ(creal(out[i])) + squ(cimag(out[i]));
  }

  fftw_destroy_plan(w);
  fftw_free(in);
  fftw_free(out);

  Sleef_free(sx);
  Sleef_free(sy);
  SleefDFT_dispose(p);

  return rmsn / rmsd;
}
|
||||
|
||||
// complex backward
|
||||
// Complex backward 2-D transform (n x m): SleefDFT vs FFTW on identical
// random input; returns the relative RMS error against FFTW's result.
double check_cb(int n, int m) {
  fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n * m);
  fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n * m);
  fftw_plan w = fftw_plan_dft_2d(n, m, in, out, FFTW_BACKWARD, FFTW_ESTIMATE);

  real *sx = (real *)Sleef_malloc(n*m*2*sizeof(real));
  real *sy = (real *)Sleef_malloc(n*m*2*sizeof(real));
  struct SleefDFT *p = SleefDFT_init2d(n, m, sx, sy, SLEEF_MODE_BACKWARD | MODE);

  for(int i=0;i<n*m;i++) {
    double re = (2.0 * random() - 1) / (double)RAND_MAX;
    double im = (2.0 * random() - 1) / (double)RAND_MAX;
    sx[(i*2+0)] = re;
    sx[(i*2+1)] = im;
    in[i] = re + im * _Complex_I;
  }

  SleefDFT_execute(p, NULL, NULL);
  fftw_execute(w);

  double rmsn = 0, rmsd = 0;

  for(int i=0;i<n*m;i++) {
    rmsn += squ(sy[i*2+0] - creal(out[i])) + squ(sy[i*2+1] - cimag(out[i]));
    rmsd += squ(creal(out[i])) + squ(cimag(out[i]));
  }

  fftw_destroy_plan(w);
  fftw_free(in);
  fftw_free(out);

  Sleef_free(sx);
  Sleef_free(sy);
  SleefDFT_dispose(p);

  return rmsn / rmsd;
}
|
||||
|
||||
// Entry point: 2-D SleefDFT-vs-FFTW comparison for a 2^log2n x 2^log2m
// transform. Exits 0 on success, -1 if either direction exceeds THRES.
int main(int argc, char **argv) {
  if (argc != 3) {
    fprintf(stderr, "%s <log2n> <log2m>\n", argv[0]);
    exit(-1);
  }

  const int n = 1 << atoi(argv[1]);
  const int m = 1 << atoi(argv[2]);

  srand((unsigned int)time(NULL));

  // Discard any stale plan cache; READONLY prevents writing a new one.
  SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET | SLEEF_PLAN_READONLY);

  //

  int success = 1;
  double e;

  e = check_cf(n, m);
  success = success && e < THRES;
  printf("complex forward : %s (%g)\n", e < THRES ? "OK" : "NG", e);
  e = check_cb(n, m);
  success = success && e < THRES;
  printf("complex backward : %s (%g)\n", e < THRES ? "OK" : "NG", e);

  exit(success ? 0 : -1);
}
|
||||
@@ -0,0 +1,175 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#define _DEFAULT_SOURCE
|
||||
#define _XOPEN_SOURCE 700
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
#include <math.h>
|
||||
#include <complex.h>
|
||||
|
||||
#include "sleef.h"
|
||||
#include "sleefdft.h"
|
||||
|
||||
// Monotonic timestamp in nanoseconds; used to time benchmark loops.
static uint64_t gettime() {
  struct timespec tp;
  clock_gettime(CLOCK_MONOTONIC, &tp);
  return (uint64_t)tp.tv_sec * 1000000000 + ((uint64_t)tp.tv_nsec);
}
|
||||
|
||||
// The two plan modes benchmarked for every size: single-threaded first,
// then multi-threaded (both with MEASURE planning).
int mode[] = { SLEEF_MODE_MEASURE | SLEEF_MODE_NO_MT, SLEEF_MODE_MEASURE};

// Compile-time benchmark knobs; toggle by (un)commenting.
#define ENABLE_SP        // also benchmark the single-precision path
//#define ROUNDTRIP      // time forward+backward pairs instead of forward only
#define REPEAT 2         // timing repetitions; the best (minimum) is kept
//#define ENABLE_SLEEP   // sleep between phases (thermal settling)
//#define WARMUP         // run half the iterations untimed first
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
int start = 1, end = 18;
|
||||
if (argc > 1) start = atoi(argv[1]);
|
||||
if (argc > 2) end = atoi(argv[2]);
|
||||
|
||||
double *din = (double *)Sleef_malloc((1 << 18)*2 * sizeof(double));
|
||||
double *dout = (double *)Sleef_malloc((1 << 18)*2 * sizeof(double));
|
||||
float *sin = (float *)Sleef_malloc((1 << 18)*2 * sizeof(float));
|
||||
float *sout = (float *)Sleef_malloc((1 << 18)*2 * sizeof(float));
|
||||
|
||||
SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET | SLEEF_PLAN_READONLY);
|
||||
|
||||
for(int log2n=start;log2n<=end;log2n++) {
|
||||
const int n = 1 << log2n;
|
||||
int64_t niter = (int64_t)(1000000000.0 / REPEAT / n / log2n);
|
||||
|
||||
printf("%d ", n);
|
||||
|
||||
for(int m=0;m<2;m++) {
|
||||
#ifdef ENABLE_SLEEP
|
||||
sleep(1);
|
||||
#endif
|
||||
|
||||
struct SleefDFT *pf = SleefDFT_double_init1d(n, NULL, NULL, mode[m]);
|
||||
#ifdef ROUNDTRIP
|
||||
struct SleefDFT *pb = SleefDFT_double_init1d(n, NULL, NULL, mode[m] | SLEEF_MODE_BACKWARD);
|
||||
#endif
|
||||
|
||||
for(int i=0;i<n*2;i++) {
|
||||
din[i] = 0;
|
||||
}
|
||||
|
||||
#ifdef ENABLE_SLEEP
|
||||
sleep(1);
|
||||
#endif
|
||||
|
||||
#ifdef WARMUP
|
||||
for(int64_t i=0;i<niter/2;i++) {
|
||||
SleefDFT_double_execute(pf, din, dout);
|
||||
#ifdef ROUNDTRIP
|
||||
SleefDFT_double_execute(pb, dout, din);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
uint64_t best = 1LL << 62;
|
||||
|
||||
//printf("\n");
|
||||
for(int rep=0;rep<REPEAT;rep++) {
|
||||
uint64_t tm0 = gettime();
|
||||
for(int64_t i=0;i<niter;i++) {
|
||||
SleefDFT_double_execute(pf, din, dout);
|
||||
#ifdef ROUNDTRIP
|
||||
SleefDFT_double_execute(pb, dout, din);
|
||||
#endif
|
||||
}
|
||||
uint64_t tm1 = gettime();
|
||||
if (tm1 - tm0 < best) best = tm1 - tm0;
|
||||
//printf("%g\n", (double)(tm1 - tm0));
|
||||
}
|
||||
|
||||
SleefDFT_dispose(pf);
|
||||
#ifdef ROUNDTRIP
|
||||
SleefDFT_dispose(pb);
|
||||
#endif
|
||||
|
||||
double timeus = best / ((double)niter * 1000);
|
||||
|
||||
#ifdef ROUNDTRIP
|
||||
double mflops = 10 * n * log2n / timeus;
|
||||
#else
|
||||
double mflops = 5 * n * log2n / timeus;
|
||||
#endif
|
||||
|
||||
printf("%g ", mflops);
|
||||
}
|
||||
|
||||
#ifdef ENABLE_SP
|
||||
for(int m=0;m<2;m++) {
|
||||
#ifdef ENABLE_SLEEP
|
||||
sleep(1);
|
||||
#endif
|
||||
|
||||
struct SleefDFT *pf = SleefDFT_float_init1d(n, NULL, NULL, mode[m]);
|
||||
#ifdef ROUNDTRIP
|
||||
struct SleefDFT *pb = SleefDFT_float_init1d(n, NULL, NULL, mode[m] | SLEEF_MODE_BACKWARD);
|
||||
#endif
|
||||
|
||||
for(int i=0;i<n*2;i++) {
|
||||
sin[i] = 0;
|
||||
}
|
||||
|
||||
#ifdef ENABLE_SLEEP
|
||||
sleep(1);
|
||||
#endif
|
||||
|
||||
#ifdef WARMUP
|
||||
for(int64_t i=0;i<niter/2;i++) {
|
||||
SleefDFT_float_execute(pf, sin, sout);
|
||||
#ifdef OUNDTRIP
|
||||
SleefDFT_float_execute(pb, sout, sin);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
uint64_t best = 1LL << 62;
|
||||
|
||||
for(int rep=0;rep<REPEAT;rep++) {
|
||||
uint64_t tm0 = gettime();
|
||||
for(int64_t i=0;i<niter;i++) {
|
||||
SleefDFT_float_execute(pf, sin, sout);
|
||||
#ifdef ROUNDTRIP
|
||||
SleefDFT_float_execute(pb, sout, sin);
|
||||
#endif
|
||||
}
|
||||
uint64_t tm1 = gettime();
|
||||
if (tm1 - tm0 < best) best = tm1 - tm0;
|
||||
}
|
||||
|
||||
SleefDFT_dispose(pf);
|
||||
#ifdef ROUNDTRIP
|
||||
SleefDFT_dispose(pb);
|
||||
#endif
|
||||
|
||||
double timeus = best / ((double)niter * 1000);
|
||||
|
||||
#ifdef ROUNDTRIP
|
||||
double mflops = 10 * n * log2n / timeus;
|
||||
#else
|
||||
double mflops = 5 * n * log2n / timeus;
|
||||
#endif
|
||||
|
||||
printf("%g ", mflops);
|
||||
}
|
||||
#endif
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,484 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <time.h>
|
||||
|
||||
#include <math.h>
|
||||
#include <complex.h>
|
||||
|
||||
#include "sleef.h"
|
||||
#include "sleefdft.h"
|
||||
#include "misc.h"
|
||||
|
||||
#ifndef MODE
|
||||
#define MODE SLEEF_MODE_DEBUG
|
||||
#endif
|
||||
|
||||
#define THRES 1e-4
|
||||
|
||||
// Select the SleefDFT entry points and the element type under test based on
// the BASETYPEID macro supplied by the build. Each branch also defines the
// reference-arithmetic complex type and the twiddle factor omega().
#if BASETYPEID == 1
#define SleefDFT_init SleefDFT_double_init1d
#define SleefDFT_execute SleefDFT_double_execute
typedef double real;

typedef double complex cmpl;

// Twiddle factor exp(-2*pi*i*kn/n); M_PIl is glibc's long-double pi.
cmpl omega(double n, double kn) {
  return cexp((-2 * M_PIl * _Complex_I / n) * kn);
}
#elif BASETYPEID == 2
#define SleefDFT_init SleefDFT_float_init1d
#define SleefDFT_execute SleefDFT_float_execute
typedef float real;

typedef double complex cmpl;

cmpl omega(double n, double kn) {
  return cexp((-2 * M_PIl * _Complex_I / n) * kn);
}
#elif BASETYPEID == 3
#define SleefDFT_init SleefDFT_longdouble_init1d
#define SleefDFT_execute SleefDFT_longdouble_execute
// NOTE(review): 'real' is double here even though the long-double API is
// selected — confirm this is intended upstream.
typedef double real;

typedef double complex cmpl;

cmpl omega(double n, double kn) {
  return cexp((-2 * M_PIl * _Complex_I / n) * kn);
}
#elif BASETYPEID == 4
#include <quadmath.h>

#define SleefDFT_init SleefDFT_quad_init1d
#define SleefDFT_execute SleefDFT_quad_execute
typedef Sleef_quad real;

typedef double complex cmpl;

cmpl omega(double n, double kn) {
  return cexp((-2 * M_PIl * _Complex_I / n) * kn);
}
#else
#error No BASETYPEID specified
#endif
|
||||
|
||||
// Naive O(len^2) reference DFT: time-domain ts -> frequency-domain fs.
void forward(cmpl *ts, cmpl *fs, int len) {
  for(int bin = 0; bin < len; bin++) {
    fs[bin] = 0;
    for(int t = 0; t < len; t++)
      fs[bin] += ts[t] * omega(len, t*bin);
  }
}
|
||||
|
||||
// Naive O(len^2) reference inverse DFT (unnormalized):
// frequency-domain fs -> time-domain ts. The negated length flips the
// sign of the exponent in omega().
void backward(cmpl *fs, cmpl *ts, int len) {
  for(int bin = 0; bin < len; bin++) {
    ts[bin] = 0;
    for(int t = 0; t < len; t++)
      ts[bin] += fs[t] * omega(-len, t*bin);
  }
}
|
||||
|
||||
// complex forward
|
||||
// Complex forward transform of size n compared element-wise against the
// naive reference DFT; returns 1 if every bin is within THRES, else 0.
int check_cf(int n) {
  int i;

  real *sx = (real *)Sleef_malloc(n*2 * sizeof(real));
  real *sy = (real *)Sleef_malloc(n*2 * sizeof(real));

  cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
  cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);

  //

  // Random complex input scaled to [-0.5, 0.5); mirrored into the
  // interleaved SLEEF buffer.
  for(i=0;i<n;i++) {
    ts[i] = 0.5 * ((2.0 * (rand() / (double)RAND_MAX) - 1) + (2.0 * (rand() / (double)RAND_MAX) - 1) * _Complex_I);
    sx[(i*2+0)] = creal(ts[i]);
    sx[(i*2+1)] = cimag(ts[i]);
  }

  //

  forward(ts, fs, n);

  struct SleefDFT *p = SleefDFT_init(n, NULL, NULL, MODE | SLEEF_MODE_VERBOSE);

  // NOTE(review): sx/sy/ts/fs leak on this early-return path (same pattern
  // as the sibling checks).
  if (p == NULL) {
    printf("SleefDFT initialization failed\n");
    return 0;
  }

  SleefDFT_execute(p, sx, sy);

  //

  int success = 1;
  // rmsn/rmsd are accumulated but never reported or returned — dead except
  // as a debugging aid.
  double rmsn = 0, rmsd = 0;

  for(i=0;i<n;i++) {
    if ((fabs(sy[(i*2+0)] - creal(fs[i])) > THRES) ||
        (fabs(sy[(i*2+1)] - cimag(fs[i])) > THRES)) {
      success = 0;
    }

    double t;
    t = (sy[(i*2+0)] - creal(fs[i]));
    rmsn += t*t;
    t = (sy[(i*2+1)] - cimag(fs[i]));
    rmsn += t*t;
    rmsd += creal(fs[i]) * creal(fs[i]) + cimag(fs[i]) * cimag(fs[i]);
  }

  //

  free(fs);
  free(ts);

  Sleef_free(sx);
  Sleef_free(sy);
  SleefDFT_dispose(p);

  //

  return success;
}
|
||||
|
||||
// complex backward
|
||||
// Complex backward transform of size n compared element-wise against the
// naive reference inverse DFT; returns 1 if within THRES everywhere, else 0.
int check_cb(int n) {
  int i;

  real *sx = (real *)Sleef_malloc(sizeof(real)*n*2);
  real *sy = (real *)Sleef_malloc(sizeof(real)*n*2);

  cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
  cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);

  //

  for(i=0;i<n;i++) {
    fs[i] = (2.0 * (rand() / (double)RAND_MAX) - 1) + (2.0 * (rand() / (double)RAND_MAX) - 1) * _Complex_I;
    sx[(i*2+0)] = creal(fs[i]);
    sx[(i*2+1)] = cimag(fs[i]);
  }

  backward(fs, ts, n);

  struct SleefDFT *p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_BACKWARD | MODE);

  // NOTE(review): buffers leak on this early-return path.
  if (p == NULL) {
    printf("SleefDFT initialization failed\n");
    return 0;
  }

  SleefDFT_execute(p, sx, sy);

  //

  int success = 1;

  for(i=0;i<n;i++) {
    if ((fabs(sy[(i*2+0)] - creal(ts[i])) > THRES) ||
        (fabs(sy[(i*2+1)] - cimag(ts[i])) > THRES)) {
      success = 0;
    }
  }

  //

  free(fs);
  free(ts);

  Sleef_free(sx);
  Sleef_free(sy);
  SleefDFT_dispose(p);

  //

  return success;
}
|
||||
|
||||
// real forward
|
||||
// Real forward transform of size n compared against the naive DFT of the
// same real input; checks the n/2+1 non-redundant bins. Returns 1 on pass.
int check_rf(int n) {
  int i;

  real *sx = (real *)Sleef_malloc(n * sizeof(real));
  real *sy = (real *)Sleef_malloc((n/2+1)*sizeof(real)*2);

  cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
  cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);

  //

  for(i=0;i<n;i++) {
    ts[i] = (2.0 * (rand() / (double)RAND_MAX) - 1);
    sx[i] = creal(ts[i]);
  }

  //

  forward(ts, fs, n);

  struct SleefDFT *p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_NO_MT | SLEEF_MODE_REAL | MODE);

  // NOTE(review): buffers leak on this early-return path.
  if (p == NULL) {
    printf("SleefDFT initialization failed\n");
    return 0;
  }

  SleefDFT_execute(p, sx, sy);

  //

  int success = 1;

  for(i=0;i<n/2+1;i++) {
    if (fabs(sy[(2*i+0)] - creal(fs[i])) > THRES) success = 0;
    if (fabs(sy[(2*i+1)] - cimag(fs[i])) > THRES) success = 0;
  }

  //

  free(fs);
  free(ts);

  Sleef_free(sx);
  Sleef_free(sy);
  SleefDFT_dispose(p);

  //

  return success;
}
|
||||
|
||||
// real backward
|
||||
// Real backward transform of size n: build a Hermitian-symmetric spectrum
// (so the reference inverse DFT yields a real signal), run SleefDFT on the
// half-spectrum, and compare. Returns 1 on pass.
int check_rb(int n) {
  int i;

  cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
  cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);

  //

  // DC and Nyquist bins are pure real; every other bin gets its conjugate
  // mirrored to index n-i, enforcing Hermitian symmetry.
  for(i=0;i<n/2;i++) {
    if (i == 0) {
      fs[0  ] = (2.0 * (rand() / (double)RAND_MAX) - 1);
      fs[n/2] = (2.0 * (rand() / (double)RAND_MAX) - 1);
    } else {
      fs[i  ] = (2.0 * (rand() / (double)RAND_MAX) - 1) + (2.0 * (rand() / (double)RAND_MAX) - 1) * _Complex_I;
      fs[n-i] = conj(fs[i]);
    }
  }

  real *sx = (real *)Sleef_malloc((n/2+1) * sizeof(real)*2);
  real *sy = (real *)Sleef_malloc(sizeof(real)*n);

  // SLEEF consumes only the half-spectrum, interleaved.
  for(i=0;i<n/2+1;i++) {
    sx[2*i+0] = creal(fs[i]);
    sx[2*i+1] = cimag(fs[i]);
  }

  //

  backward(fs, ts, n);

  struct SleefDFT *p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_REAL | SLEEF_MODE_BACKWARD | MODE);

  // NOTE(review): buffers leak on this early-return path.
  if (p == NULL) {
    printf("SleefDFT initialization failed\n");
    return 0;
  }

  SleefDFT_execute(p, sx, sy);

  //

  int success = 1;

  for(i=0;i<n;i++) {
    // The reference result must be (numerically) real.
    if (fabs(cimag(ts[i])) > THRES) {
      success = 0;
    }

    if ((fabs(sy[i] - creal(ts[i])) > THRES)) {
      success = 0;
    }
  }

  //

  free(fs);
  free(ts);

  Sleef_free(sx);
  Sleef_free(sy);
  SleefDFT_dispose(p);

  //

  return success;
}
|
||||
|
||||
int check_arf(int n) {
|
||||
int i;
|
||||
|
||||
real *sx = (real *)Sleef_malloc(n * sizeof(real));
|
||||
real *sy = (real *)Sleef_malloc(n * sizeof(real));
|
||||
|
||||
cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
|
||||
cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);
|
||||
|
||||
//
|
||||
|
||||
for(i=0;i<n;i++) {
|
||||
ts[i] = 2 * (rand() / (real)RAND_MAX) - 1;
|
||||
sx[i] = creal(ts[i]);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
backward(ts, fs, n);
|
||||
|
||||
struct SleefDFT *p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_REAL | SLEEF_MODE_ALT | MODE);
|
||||
|
||||
if (p == NULL) {
|
||||
printf("SleefDFT initialization failed\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
SleefDFT_execute(p, sx, sy);
|
||||
|
||||
//
|
||||
|
||||
int success = 1;
|
||||
|
||||
for(i=0;i<n/2;i++) {
|
||||
if (i == 0) {
|
||||
if (fabs(sy[(2*0+0)] - creal(fs[0 ])) > THRES) success = 0;
|
||||
if (fabs(sy[(2*0+1)] - creal(fs[n/2])) > THRES) success = 0;
|
||||
} else {
|
||||
if (fabs(sy[(2*i+0)] - creal(fs[i])) > THRES) success = 0;
|
||||
if (fabs(sy[(2*i+1)] - cimag(fs[i])) > THRES) success = 0;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
Sleef_free(sx);
|
||||
Sleef_free(sy);
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
//
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
// Real backward transform in SLEEF's "alt" packing: build a Hermitian
// spectrum, pack it alt-style (DC and Nyquist in bin 0's two slots), and
// compare SleefDFT's output (times 2) against the naive forward reference.
int check_arb(int n) {
  int i;

  real *sx = (real *)Sleef_malloc(n * sizeof(real));
  real *sy = (real *)Sleef_malloc(n * sizeof(real));

  cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
  cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);

  //

  // Hermitian-symmetric spectrum: real DC/Nyquist, conjugate mirror elsewhere.
  for(i=0;i<n/2;i++) {
    if (i == 0) {
      fs[0  ] = (2.0 * (rand() / (double)RAND_MAX) - 1);
      fs[n/2] = (2.0 * (rand() / (double)RAND_MAX) - 1);
    } else {
      fs[i  ] = (2.0 * (rand() / (double)RAND_MAX) - 1) + (2.0 * (rand() / (double)RAND_MAX) - 1) * _Complex_I;
      fs[n-i] = conj(fs[i]);
    }
  }

  // Alt packing of the half-spectrum into sx.
  for(i=0;i<n/2;i++) {
    if (i == 0) {
      sx[2*0+0] = creal(fs[0  ]);
      sx[2*0+1] = creal(fs[n/2]);
    } else {
      sx[2*i+0] = creal(fs[i]);
      sx[2*i+1] = cimag(fs[i]);
    }
  }

  //

  forward(fs, ts, n);

  struct SleefDFT *p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_REAL | SLEEF_MODE_BACKWARD | SLEEF_MODE_ALT | MODE);

  // NOTE(review): buffers leak on this early-return path.
  if (p == NULL) {
    printf("SleefDFT initialization failed\n");
    return 0;
  }

  SleefDFT_execute(p, sx, sy);

  //

  int success = 1;

  for(i=0;i<n;i++) {
    // Reference result must be (numerically) real.
    if (fabs(cimag(ts[i])) > THRES) {
      success = 0;
    }

    // Factor-of-2 scaling between alt-mode output and the reference —
    // presumably part of the alt convention; confirm against SleefDFT docs.
    if ((fabs(sy[i]*2 - creal(ts[i])) > THRES)) {
      success = 0;
    }
  }

  //

  free(fs);
  free(ts);

  Sleef_free(sx);
  Sleef_free(sy);
  SleefDFT_dispose(p);

  //

  return success;
}
|
||||
|
||||
// Entry point: runs all six naive-reference checks for size 2^log2n and
// exits 0 only if every check passed (note `exit(!success)`).
int main(int argc, char **argv) {
  if (argc != 2) {
    fprintf(stderr, "%s <log2n>\n", argv[0]);
    exit(-1);
  }

  const int n = 1 << atoi(argv[1]);

  srand((unsigned int)time(NULL));

  // Discard any stale plan cache; READONLY prevents writing a new one.
  SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET | SLEEF_PLAN_READONLY);

  //

  int success = 1;

  // `&=` both records and reports each result; a single failure makes the
  // whole run fail.
  printf("complex forward : %s\n", (success &= check_cf(n)) ? "OK" : "NG");
  printf("complex backward : %s\n", (success &= check_cb(n)) ? "OK" : "NG");
  printf("real forward : %s\n", (success &= check_rf(n)) ? "OK" : "NG");
  printf("real backward : %s\n", (success &= check_rb(n)) ? "OK" : "NG");
  printf("real alt forward : %s\n", (success &= check_arf(n)) ? "OK" : "NG");
  printf("real alt backward : %s\n", (success &= check_arb(n)) ? "OK" : "NG");

  exit(!success);
}
|
||||
@@ -0,0 +1,174 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <time.h>
|
||||
|
||||
#include <math.h>
|
||||
#include <complex.h>
|
||||
|
||||
#include "sleef.h"
|
||||
#include "sleefdft.h"
|
||||
|
||||
#ifndef MODE
|
||||
#define MODE (SLEEF_MODE_DEBUG | SLEEF_MODE_VERBOSE)
|
||||
#endif
|
||||
|
||||
#if BASETYPEID == 1
|
||||
#define THRES 1e-30
|
||||
#define SleefDFT_init SleefDFT_double_init1d
|
||||
#define SleefDFT_execute SleefDFT_double_execute
|
||||
typedef double real;
|
||||
#elif BASETYPEID == 2
|
||||
#define THRES 1e-13
|
||||
#define SleefDFT_init SleefDFT_float_init1d
|
||||
#define SleefDFT_execute SleefDFT_float_execute
|
||||
typedef float real;
|
||||
#else
|
||||
#error BASETYPEID not set
|
||||
#endif
|
||||
|
||||
static double squ(double x) { return x * x; }
|
||||
|
||||
// complex transforms
|
||||
double check_c(int n) {
|
||||
struct SleefDFT *p;
|
||||
|
||||
real *sx = (real *)Sleef_malloc(n*2 * sizeof(real));
|
||||
real *sy = (real *)Sleef_malloc(n*2 * sizeof(real));
|
||||
real *sz = (real *)Sleef_malloc(n*2 * sizeof(real));
|
||||
|
||||
for(int i=0;i<n*2;i++) sx[i] = (real)(2.0 * (rand() / (double)RAND_MAX) - 1);
|
||||
|
||||
//
|
||||
|
||||
p = SleefDFT_init(n, NULL, NULL, MODE);
|
||||
|
||||
if (p == NULL) {
|
||||
printf("SleefDFT initialization failed\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
SleefDFT_execute(p, sx, sy);
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
//
|
||||
|
||||
p = SleefDFT_init(n, NULL, NULL, MODE | SLEEF_MODE_BACKWARD);
|
||||
|
||||
if (p == NULL) {
|
||||
printf("SleefDFT initialization failed\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
SleefDFT_execute(p, sy, sz);
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
//
|
||||
|
||||
double rmsn = 0, rmsd = 0, scale = 1 / (double)n;
|
||||
|
||||
for(int i=0;i<n;i++) {
|
||||
rmsn += squ(scale * sz[i*2+0] - sx[i*2+0]) + squ(scale * sz[i*2+1] - sx[i*2+1]);
|
||||
rmsd += squ( sx[i*2+0]) + squ( sx[i*2+1]);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
Sleef_free(sx);
|
||||
Sleef_free(sy);
|
||||
Sleef_free(sz);
|
||||
|
||||
//
|
||||
|
||||
return rmsn / rmsd;
|
||||
}
|
||||
|
||||
// real transforms
|
||||
double check_r(int n) {
|
||||
struct SleefDFT *p;
|
||||
|
||||
real *sx = (real *)Sleef_malloc(n * sizeof(real));
|
||||
real *sy = (real *)Sleef_malloc((n/2+1)*sizeof(real)*2);
|
||||
real *sz = (real *)Sleef_malloc(n * sizeof(real));
|
||||
|
||||
for(int i=0;i<n;i++) sx[i] = (real)(2.0 * (rand() / (double)RAND_MAX) - 1);
|
||||
|
||||
//
|
||||
|
||||
p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_REAL | MODE);
|
||||
|
||||
if (p == NULL) {
|
||||
printf("SleefDFT initialization failed\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
SleefDFT_execute(p, sx, sy);
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
//
|
||||
|
||||
p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_REAL | SLEEF_MODE_BACKWARD | MODE);
|
||||
|
||||
if (p == NULL) {
|
||||
printf("SleefDFT initialization failed\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
SleefDFT_execute(p, sy, sz);
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
//
|
||||
|
||||
double rmsn = 0, rmsd = 0, scale = 1 / (double)n;
|
||||
|
||||
for(int i=0;i<n;i++) {
|
||||
rmsn += squ(scale * sz[i] - sx[i]);
|
||||
rmsd += squ( sx[i]);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
Sleef_free(sx);
|
||||
Sleef_free(sy);
|
||||
Sleef_free(sz);
|
||||
|
||||
//
|
||||
|
||||
return rmsn / rmsd;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "%s <log2n> [<nloop>]\n", argv[0]);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
const int n = 1 << atoi(argv[1]);
|
||||
const int nloop = argc >= 3 ? atoi(argv[2]) : 1;
|
||||
|
||||
srand((unsigned int)time(NULL));
|
||||
|
||||
SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET | SLEEF_PLAN_READONLY);
|
||||
|
||||
//
|
||||
|
||||
int success = 1;
|
||||
double e;
|
||||
|
||||
for(int i=0;(nloop < 0 || i < nloop) && success;i++) {
|
||||
e = check_c(n);
|
||||
success = success && e < THRES;
|
||||
printf("complex : %s (%g)\n", e < THRES ? "OK" : "NG", e);
|
||||
e = check_r(n);
|
||||
success = success && e < THRES;
|
||||
printf("real : %s (%g)\n", e < THRES ? "OK" : "NG", e);
|
||||
}
|
||||
|
||||
exit(!success);
|
||||
}
|
||||
@@ -0,0 +1,118 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <time.h>
|
||||
|
||||
#include <math.h>
|
||||
#include <complex.h>
|
||||
|
||||
#include "sleef.h"
|
||||
#include "sleefdft.h"
|
||||
|
||||
#ifndef MODE
|
||||
#define MODE (SLEEF_MODE_DEBUG | SLEEF_MODE_VERBOSE)
|
||||
#endif
|
||||
|
||||
#if BASETYPEID == 1
|
||||
#define THRES 1e-30
|
||||
#define SleefDFT_init2d SleefDFT_double_init2d
|
||||
#define SleefDFT_execute SleefDFT_double_execute
|
||||
typedef double real;
|
||||
#elif BASETYPEID == 2
|
||||
#define THRES 1e-13
|
||||
#define SleefDFT_init2d SleefDFT_float_init2d
|
||||
#define SleefDFT_execute SleefDFT_float_execute
|
||||
typedef float real;
|
||||
#else
|
||||
#error BASETYPEID not set
|
||||
#endif
|
||||
|
||||
static double squ(double x) { return x * x; }
|
||||
|
||||
// complex transforms
|
||||
double check_c(int n, int m) {
|
||||
struct SleefDFT *p;
|
||||
|
||||
real *sx = (real *)Sleef_malloc(n*m*2 * sizeof(real));
|
||||
real *sy = (real *)Sleef_malloc(n*m*2 * sizeof(real));
|
||||
real *sz = (real *)Sleef_malloc(n*m*2 * sizeof(real));
|
||||
|
||||
for(int i=0;i<n*m*2;i++) sx[i] = (real)(2.0 * (rand() / (double)RAND_MAX) - 1);
|
||||
|
||||
//
|
||||
|
||||
p = SleefDFT_init2d(n, m, NULL, NULL, MODE);
|
||||
|
||||
if (p == NULL) {
|
||||
printf("SleefDFT initialization failed\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
SleefDFT_execute(p, sx, sy);
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
//
|
||||
|
||||
p = SleefDFT_init2d(n, m, NULL, NULL, MODE | SLEEF_MODE_BACKWARD);
|
||||
|
||||
if (p == NULL) {
|
||||
printf("SleefDFT initialization failed\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
SleefDFT_execute(p, sy, sz);
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
//
|
||||
|
||||
double rmsn = 0, rmsd = 0, scale = 1 / (n*(double)m);
|
||||
|
||||
for(int i=0;i<n*m;i++) {
|
||||
rmsn += squ(scale * sz[i*2+0] - sx[i*2+0]) + squ(scale * sz[i*2+1] - sx[i*2+1]);
|
||||
rmsd += squ( sx[i*2+0]) + squ( sx[i*2+1]);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
Sleef_free(sx);
|
||||
Sleef_free(sy);
|
||||
Sleef_free(sz);
|
||||
|
||||
//
|
||||
|
||||
return rmsn / rmsd;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 3) {
|
||||
fprintf(stderr, "%s <log2n> <log2m> [<nloop>]\n", argv[0]);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
const int n = 1 << atoi(argv[1]);
|
||||
const int m = 1 << atoi(argv[2]);
|
||||
const int nloop = argc >= 4 ? atoi(argv[3]) : 1;
|
||||
|
||||
srand((unsigned int)time(NULL));
|
||||
|
||||
SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET | SLEEF_PLAN_READONLY);
|
||||
|
||||
//
|
||||
|
||||
int success = 1;
|
||||
double e;
|
||||
|
||||
for(int i=0;(nloop < 0 || i < nloop) && success;i++) {
|
||||
e = check_c(n, m);
|
||||
success = success && e < THRES;
|
||||
printf("complex : %s (%g)\n", e < THRES ? "OK" : "NG", e);
|
||||
}
|
||||
|
||||
exit(!success);
|
||||
}
|
||||
@@ -0,0 +1,80 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
// gcc tutorial.c -lsleef -lsleefdft -lm
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <math.h>
|
||||
#include <complex.h>
|
||||
|
||||
#include "sleef.h"
|
||||
#include "sleefdft.h"
|
||||
|
||||
#define THRES 1e-4
|
||||
|
||||
typedef double complex cmpl;
|
||||
|
||||
cmpl omega(double n, double kn) {
|
||||
return cexp((-2 * M_PI * _Complex_I / n) * kn);
|
||||
}
|
||||
|
||||
void forward(cmpl *ts, cmpl *fs, int len) {
|
||||
for(int k=0;k<len;k++) {
|
||||
fs[k] = 0;
|
||||
for(int n=0;n<len;n++) fs[k] += ts[n] * omega(len, n*k);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
int n = 256;
|
||||
if (argc == 2) n = 1 << atoi(argv[1]);
|
||||
|
||||
SleefDFT_setPlanFilePath("plan.txt", NULL, SLEEF_PLAN_AUTOMATIC);
|
||||
|
||||
double *sx = (double *)Sleef_malloc(n*2 * sizeof(double));
|
||||
double *sy = (double *)Sleef_malloc(n*2 * sizeof(double));
|
||||
|
||||
struct SleefDFT *p = SleefDFT_double_init1d(n, sx, sy, SLEEF_MODE_FORWARD);
|
||||
|
||||
if (p == NULL) {
|
||||
printf("SleefDFT initialization failed\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
|
||||
cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);
|
||||
|
||||
for(int i=0;i<n;i++) {
|
||||
ts[i] =
|
||||
(2.0 * (rand() / (double)RAND_MAX) - 1) * 1.0 +
|
||||
(2.0 * (rand() / (double)RAND_MAX) - 1) * _Complex_I;
|
||||
|
||||
sx[(i*2+0)] = creal(ts[i]);
|
||||
sx[(i*2+1)] = cimag(ts[i]);
|
||||
}
|
||||
|
||||
forward(ts, fs, n);
|
||||
|
||||
SleefDFT_double_execute(p, NULL, NULL);
|
||||
|
||||
int success = 1;
|
||||
|
||||
for(int i=0;i<n;i++) {
|
||||
if ((fabs(sy[(i*2+0)] - creal(fs[i])) > THRES) ||
|
||||
(fabs(sy[(i*2+1)] - cimag(fs[i])) > THRES)) {
|
||||
success = 0;
|
||||
}
|
||||
}
|
||||
|
||||
printf("%s\n", success ? "OK" : "NG");
|
||||
|
||||
free(fs); free(ts);
|
||||
Sleef_free(sy); Sleef_free(sx);
|
||||
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
exit(success);
|
||||
}
|
||||
@@ -0,0 +1,425 @@
|
||||
|
||||
# Options
|
||||
|
||||
if (COMPILER_SUPPORTS_SVE)
|
||||
set(SLEEFDFT_MAXBUTWIDTH 6 CACHE STRING "Log_2 (Maximum butterfly length) of butterflies")
|
||||
else()
|
||||
set(SLEEFDFT_MAXBUTWIDTH 4 CACHE STRING "Log_2 (Maximum butterfly length) of butterflies")
|
||||
endif()
|
||||
|
||||
if (SLEEFDFT_MAXBUTWIDTH GREATER 7)
|
||||
message(FATAL_ERROR "SLEEFDFT_MAXBUTWIDTH has to be smaller than 8." )
|
||||
endif()
|
||||
|
||||
option(SLEEFDFT_ENABLE_STREAM "Streaming instructions are utilized in DFT." OFF)
|
||||
|
||||
# Settings
|
||||
|
||||
# Constants definition
|
||||
|
||||
set(LISTSHORTTYPENAME "dp" "sp")
|
||||
set(LISTLONGTYPENAME "double" "float")
|
||||
set(LISTTYPEID "1" "2")
|
||||
|
||||
set(MACRODEF_vecextdp BASETYPEID=1 ENABLE_VECEXT CONFIG=1)
|
||||
set(CFLAGS_vecextdp ${FLAGS_ENABLE_VECEXT})
|
||||
set(MACRODEF_vecextsp BASETYPEID=2 ENABLE_VECEXT CONFIG=1)
|
||||
set(CFLAGS_vecextsp ${FLAGS_ENABLE_VECEXT})
|
||||
set(MACRODEF_vecextld BASETYPEID=3 ENABLE_VECEXT CONFIG=1)
|
||||
set(CFLAGS_vecextld ${FLAGS_ENABLE_VECEXT})
|
||||
set(MACRODEF_vecextqp BASETYPEID=4 ENABLE_VECEXT CONFIG=1)
|
||||
set(CFLAGS_vecextqp ${FLAGS_ENABLE_VECEXT})
|
||||
set(MACRODEF_purecdp BASETYPEID=1 ENABLE_PUREC CONFIG=1)
|
||||
set(CFLAGS_purecdp ${FLAGS_ENABLE_PUREC})
|
||||
set(MACRODEF_purecsp BASETYPEID=2 ENABLE_PUREC CONFIG=1)
|
||||
set(CFLAGS_purecsp ${FLAGS_ENABLE_PUREC})
|
||||
set(MACRODEF_purecld BASETYPEID=3 ENABLE_PUREC CONFIG=1)
|
||||
set(CFLAGS_purecld ${FLAGS_ENABLE_PUREC})
|
||||
set(MACRODEF_purecqp BASETYPEID=4 ENABLE_PUREC CONFIG=1)
|
||||
set(CFLAGS_purecqp ${FLAGS_ENABLE_PUREC})
|
||||
set(MACRODEF_sse2dp BASETYPEID=1 ENABLE_SSE2 CONFIG=4)
|
||||
set(CFLAGS_sse2dp ${FLAGS_ENABLE_SSE4})
|
||||
set(MACRODEF_sse2sp BASETYPEID=2 ENABLE_SSE2 CONFIG=4)
|
||||
set(CFLAGS_sse2sp ${FLAGS_ENABLE_SSE4})
|
||||
set(MACRODEF_avxdp BASETYPEID=1 ENABLE_AVX CONFIG=1)
|
||||
set(CFLAGS_avxdp ${FLAGS_ENABLE_AVX})
|
||||
set(MACRODEF_avxsp BASETYPEID=2 ENABLE_AVX CONFIG=1)
|
||||
set(CFLAGS_avxsp ${FLAGS_ENABLE_AVX})
|
||||
set(MACRODEF_avx2dp BASETYPEID=1 ENABLE_AVX2 CONFIG=1)
|
||||
set(CFLAGS_avx2dp ${FLAGS_ENABLE_AVX2})
|
||||
set(MACRODEF_avx2sp BASETYPEID=2 ENABLE_AVX2 CONFIG=1)
|
||||
set(CFLAGS_avx2sp ${FLAGS_ENABLE_AVX2})
|
||||
set(MACRODEF_avx512fdp BASETYPEID=1 ENABLE_AVX512F CONFIG=1)
|
||||
set(CFLAGS_avx512fdp ${FLAGS_ENABLE_AVX512F})
|
||||
set(MACRODEF_avx512fsp BASETYPEID=2 ENABLE_AVX512F CONFIG=1)
|
||||
set(CFLAGS_avx512fsp ${FLAGS_ENABLE_AVX512F})
|
||||
set(MACRODEF_advsimddp BASETYPEID=1 ENABLE_ADVSIMD CONFIG=1)
|
||||
set(CFLAGS_advsimddp ${FLAGS_ENABLE_ADVSIMD})
|
||||
set(MACRODEF_advsimdsp BASETYPEID=2 ENABLE_ADVSIMD CONFIG=1)
|
||||
set(CFLAGS_advsimdsp ${FLAGS_ENABLE_ADVSIMD})
|
||||
set(MACRODEF_neon32sp BASETYPEID=2 ENABLE_NEON32 CONFIG=1)
|
||||
set(CFLAGS_neon32sp ${FLAGS_ENABLE_NEON32})
|
||||
set(MACRODEF_sve256dp BASETYPEID=1 ENABLE_SVE CONFIG=8)
|
||||
set(CFLAGS_sve256dp ${FLAGS_ENABLE_SVE})
|
||||
set(MACRODEF_sve256sp BASETYPEID=2 ENABLE_SVE CONFIG=8)
|
||||
set(CFLAGS_sve256sp ${FLAGS_ENABLE_SVE})
|
||||
set(MACRODEF_sve512dp BASETYPEID=1 ENABLE_SVE CONFIG=9)
|
||||
set(CFLAGS_sve512dp ${FLAGS_ENABLE_SVE})
|
||||
set(MACRODEF_sve512sp BASETYPEID=2 ENABLE_SVE CONFIG=9)
|
||||
set(CFLAGS_sve512sp ${FLAGS_ENABLE_SVE})
|
||||
set(MACRODEF_sve1024dp BASETYPEID=1 ENABLE_SVE CONFIG=10)
|
||||
set(CFLAGS_sve1024dp ${FLAGS_ENABLE_SVE})
|
||||
set(MACRODEF_sve1024sp BASETYPEID=2 ENABLE_SVE CONFIG=10)
|
||||
set(CFLAGS_sve1024sp ${FLAGS_ENABLE_SVE})
|
||||
set(MACRODEF_sve2048dp BASETYPEID=1 ENABLE_SVE CONFIG=11)
|
||||
set(CFLAGS_sve2048dp ${FLAGS_ENABLE_SVE})
|
||||
set(MACRODEF_sve2048sp BASETYPEID=2 ENABLE_SVE CONFIG=11)
|
||||
set(CFLAGS_sve2048sp ${FLAGS_ENABLE_SVE})
|
||||
set(MACRODEF_rvvm1128dp BASETYPEID=1 ENABLE_RVVM1 CONFIG=7)
|
||||
set(CFLAGS_rvvm1128dp ${FLAGS_ENABLE_RVVM1})
|
||||
set(MACRODEF_rvvm1128sp BASETYPEID=2 ENABLE_RVVM1 CONFIG=7)
|
||||
set(CFLAGS_rvvm1128sp ${FLAGS_ENABLE_RVVM1})
|
||||
set(MACRODEF_rvvm1256dp BASETYPEID=1 ENABLE_RVVM1 CONFIG=8)
|
||||
set(CFLAGS_rvvm1256dp ${FLAGS_ENABLE_RVVM1})
|
||||
set(MACRODEF_rvvm1256sp BASETYPEID=2 ENABLE_RVVM1 CONFIG=8)
|
||||
set(CFLAGS_rvvm1256sp ${FLAGS_ENABLE_RVVM1})
|
||||
set(MACRODEF_rvvm1512dp BASETYPEID=1 ENABLE_RVVM1 CONFIG=9)
|
||||
set(CFLAGS_rvvm1512dp ${FLAGS_ENABLE_RVVM1})
|
||||
set(MACRODEF_rvvm1512sp BASETYPEID=2 ENABLE_RVVM1 CONFIG=9)
|
||||
set(CFLAGS_rvvm1512sp ${FLAGS_ENABLE_RVVM1})
|
||||
set(MACRODEF_rvvm11024dp BASETYPEID=1 ENABLE_RVVM1 CONFIG=10)
|
||||
set(CFLAGS_rvvm11024dp ${FLAGS_ENABLE_RVVM1})
|
||||
set(MACRODEF_rvvm11024sp BASETYPEID=2 ENABLE_RVVM1 CONFIG=10)
|
||||
set(CFLAGS_rvvm11024sp ${FLAGS_ENABLE_RVVM1})
|
||||
set(MACRODEF_rvvm12048dp BASETYPEID=1 ENABLE_RVVM1 CONFIG=11)
|
||||
set(CFLAGS_rvvm12048dp ${FLAGS_ENABLE_RVVM1})
|
||||
set(MACRODEF_rvvm12048sp BASETYPEID=2 ENABLE_RVVM1 CONFIG=11)
|
||||
set(CFLAGS_rvvm12048sp ${FLAGS_ENABLE_RVVM1})
|
||||
set(MACRODEF_rvvm2128dp BASETYPEID=1 ENABLE_RVVM2 CONFIG=7)
|
||||
set(CFLAGS_rvvm2128dp ${FLAGS_ENABLE_RVVM2})
|
||||
set(MACRODEF_rvvm2128sp BASETYPEID=2 ENABLE_RVVM2 CONFIG=7)
|
||||
set(CFLAGS_rvvm2128sp ${FLAGS_ENABLE_RVVM2})
|
||||
set(MACRODEF_rvvm2256dp BASETYPEID=1 ENABLE_RVVM2 CONFIG=8)
|
||||
set(CFLAGS_rvvm2256dp ${FLAGS_ENABLE_RVVM2})
|
||||
set(MACRODEF_rvvm2256sp BASETYPEID=2 ENABLE_RVVM2 CONFIG=8)
|
||||
set(CFLAGS_rvvm2256sp ${FLAGS_ENABLE_RVVM2})
|
||||
set(MACRODEF_rvvm2512dp BASETYPEID=1 ENABLE_RVVM2 CONFIG=9)
|
||||
set(CFLAGS_rvvm2512dp ${FLAGS_ENABLE_RVVM2})
|
||||
set(MACRODEF_rvvm2512sp BASETYPEID=2 ENABLE_RVVM2 CONFIG=9)
|
||||
set(CFLAGS_rvvm2512sp ${FLAGS_ENABLE_RVVM2})
|
||||
set(MACRODEF_rvvm21024dp BASETYPEID=1 ENABLE_RVVM2 CONFIG=10)
|
||||
set(CFLAGS_rvvm21024dp ${FLAGS_ENABLE_RVVM2})
|
||||
set(MACRODEF_rvvm21024sp BASETYPEID=2 ENABLE_RVVM2 CONFIG=10)
|
||||
set(CFLAGS_rvvm21024sp ${FLAGS_ENABLE_RVVM2})
|
||||
set(MACRODEF_rvvm22048dp BASETYPEID=1 ENABLE_RVVM2 CONFIG=11)
|
||||
set(CFLAGS_rvvm22048dp ${FLAGS_ENABLE_RVVM2})
|
||||
set(MACRODEF_rvvm22048sp BASETYPEID=2 ENABLE_RVVM2 CONFIG=11)
|
||||
set(CFLAGS_rvvm22048sp ${FLAGS_ENABLE_RVVM2})
|
||||
set(MACRODEF_vsxdp BASETYPEID=1 ENABLE_VSX CONFIG=1)
|
||||
set(CFLAGS_vsxdp ${FLAGS_ENABLE_VSX})
|
||||
set(MACRODEF_vsxsp BASETYPEID=2 ENABLE_VSX CONFIG=1)
|
||||
set(CFLAGS_vsxsp ${FLAGS_ENABLE_VSX})
|
||||
set(MACRODEF_vsx3dp BASETYPEID=1 ENABLE_VSX3 CONFIG=1)
|
||||
set(CFLAGS_vsx3dp ${FLAGS_ENABLE_VSX3})
|
||||
set(MACRODEF_vsx3sp BASETYPEID=2 ENABLE_VSX3 CONFIG=1)
|
||||
set(CFLAGS_vsx3sp ${FLAGS_ENABLE_VSX3})
|
||||
set(MACRODEF_vxedp BASETYPEID=1 ENABLE_VXE CONFIG=140)
|
||||
set(CFLAGS_vxedp ${FLAGS_ENABLE_VXE})
|
||||
set(MACRODEF_vxesp BASETYPEID=2 ENABLE_VXE CONFIG=140)
|
||||
set(CFLAGS_vxesp ${FLAGS_ENABLE_VXE})
|
||||
set(MACRODEF_vxe2dp BASETYPEID=1 ENABLE_VXE2 CONFIG=150)
|
||||
set(CFLAGS_vxe2dp ${FLAGS_ENABLE_VXE2})
|
||||
set(MACRODEF_vxe2sp BASETYPEID=2 ENABLE_VXE2 CONFIG=150)
|
||||
set(CFLAGS_vxe2sp ${FLAGS_ENABLE_VXE2})
|
||||
|
||||
# List all available scalar data types
|
||||
|
||||
set(ISALIST_SP purecsp)
|
||||
set(ISALIST_DP purecdp)
|
||||
|
||||
set(LIST_SUPPORTED_FPTYPE 0 1)
|
||||
if(CMAKE_C_COMPILER_ID MATCHES "(GNU|Clang)")
|
||||
set(ISALIST_SP vecextsp)
|
||||
set(ISALIST_DP vecextdp)
|
||||
endif(CMAKE_C_COMPILER_ID MATCHES "(GNU|Clang)")
|
||||
|
||||
# List all available vector data types
|
||||
|
||||
if (COMPILER_SUPPORTS_SSE4)
|
||||
set(ISALIST_SP ${ISALIST_SP} sse2sp)
|
||||
set(ISALIST_DP ${ISALIST_DP} sse2dp)
|
||||
endif(COMPILER_SUPPORTS_SSE4)
|
||||
|
||||
if (COMPILER_SUPPORTS_AVX)
|
||||
set(ISALIST_SP ${ISALIST_SP} avxsp)
|
||||
set(ISALIST_DP ${ISALIST_DP} avxdp)
|
||||
endif(COMPILER_SUPPORTS_AVX)
|
||||
|
||||
if (COMPILER_SUPPORTS_AVX2)
|
||||
set(ISALIST_SP ${ISALIST_SP} avx2sp)
|
||||
set(ISALIST_DP ${ISALIST_DP} avx2dp)
|
||||
endif(COMPILER_SUPPORTS_AVX2)
|
||||
|
||||
if (COMPILER_SUPPORTS_AVX512F)
|
||||
set(ISALIST_SP ${ISALIST_SP} avx512fsp)
|
||||
set(ISALIST_DP ${ISALIST_DP} avx512fdp)
|
||||
endif(COMPILER_SUPPORTS_AVX512F)
|
||||
|
||||
if (COMPILER_SUPPORTS_ADVSIMD)
|
||||
set(ISALIST_SP ${ISALIST_SP} advsimdsp)
|
||||
set(ISALIST_DP ${ISALIST_DP} advsimddp)
|
||||
endif(COMPILER_SUPPORTS_ADVSIMD)
|
||||
|
||||
if (COMPILER_SUPPORTS_SVE)
|
||||
set(ISALIST_SP ${ISALIST_SP} sve256sp sve512sp sve1024sp sve2048sp)
|
||||
set(ISALIST_DP ${ISALIST_DP} sve256dp sve512dp sve1024dp sve2048dp)
|
||||
endif(COMPILER_SUPPORTS_SVE)
|
||||
|
||||
if (COMPILER_SUPPORTS_NEON32)
|
||||
set(ISALIST_SP ${ISALIST_SP} neon32sp)
|
||||
endif(COMPILER_SUPPORTS_NEON32)
|
||||
|
||||
if (COMPILER_SUPPORTS_RVVM1)
|
||||
set(ISALIST_SP ${ISALIST_SP} rvvm1128sp rvvm1256sp rvvm1512sp rvvm11024sp rvvm12048sp)
|
||||
set(ISALIST_DP ${ISALIST_DP} rvvm1128dp rvvm1256dp rvvm1512dp rvvm11024dp rvvm12048dp)
|
||||
endif(COMPILER_SUPPORTS_RVVM1)
|
||||
|
||||
if (COMPILER_SUPPORTS_RVVM2)
|
||||
set(ISALIST_SP ${ISALIST_SP} rvvm2128sp rvvm2256sp rvvm2512sp rvvm21024sp rvvm22048sp)
|
||||
set(ISALIST_DP ${ISALIST_DP} rvvm2128dp rvvm2256dp rvvm2512dp rvvm21024dp rvvm22048dp)
|
||||
endif(COMPILER_SUPPORTS_RVVM2)
|
||||
|
||||
if (COMPILER_SUPPORTS_VSX)
|
||||
set(ISALIST_SP ${ISALIST_SP} vsxsp)
|
||||
set(ISALIST_DP ${ISALIST_DP} vsxdp)
|
||||
endif(COMPILER_SUPPORTS_VSX)
|
||||
|
||||
if (COMPILER_SUPPORTS_VSX3)
|
||||
set(ISALIST_SP ${ISALIST_SP} vsx3sp)
|
||||
set(ISALIST_DP ${ISALIST_DP} vsx3dp)
|
||||
endif(COMPILER_SUPPORTS_VSX3)
|
||||
|
||||
if (COMPILER_SUPPORTS_VXE)
|
||||
set(ISALIST_SP ${ISALIST_SP} vxesp)
|
||||
set(ISALIST_DP ${ISALIST_DP} vxedp)
|
||||
endif(COMPILER_SUPPORTS_VXE)
|
||||
|
||||
if (COMPILER_SUPPORTS_VXE2)
|
||||
set(ISALIST_SP ${ISALIST_SP} vxe2sp)
|
||||
set(ISALIST_DP ${ISALIST_DP} vxe2dp)
|
||||
endif(COMPILER_SUPPORTS_VXE2)
|
||||
|
||||
if(SLEEFDFT_ENABLE_STREAM)
|
||||
set(NLIST 0 1 2 3)
|
||||
else()
|
||||
set(NLIST 0 2)
|
||||
endif()
|
||||
|
||||
#
|
||||
|
||||
# Compiler properties
|
||||
|
||||
set(CMAKE_C_FLAGS "${ORG_CMAKE_C_FLAGS} ${DFT_C_FLAGS}")
|
||||
set(COMMON_TARGET_PROPERTIES
|
||||
C_STANDARD 99 # -std=gnu99
|
||||
)
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
list(APPEND COMMON_TARGET_PROPERTIES POSITION_INDEPENDENT_CODE ON) # -fPIC
|
||||
endif()
|
||||
|
||||
set(COMMON_TARGET_DEFINITIONS ${COMMON_TARGET_DEFINITIONS} MAXBUTWIDTH=${SLEEFDFT_MAXBUTWIDTH})
|
||||
|
||||
if (SLEEFDFT_ENABLE_STREAM)
|
||||
set(COMMON_TARGET_DEFINITIONS ${COMMON_TARGET_DEFINITIONS} ENABLE_STREAM=1)
|
||||
else()
|
||||
set(COMMON_TARGET_DEFINITIONS ${COMMON_TARGET_DEFINITIONS} ENABLE_STREAM=0)
|
||||
endif()
|
||||
|
||||
if(COMPILER_SUPPORTS_OPENMP)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
|
||||
endif(COMPILER_SUPPORTS_OPENMP)
|
||||
|
||||
|
||||
# Include directories
|
||||
|
||||
include_directories(${PROJECT_SOURCE_DIR}/include)
|
||||
include_directories(${PROJECT_BINARY_DIR}/include)
|
||||
include_directories(${CMAKE_CURRENT_BINARY_DIR})
|
||||
|
||||
# Target mkunroll
|
||||
|
||||
set(TARGET_MKUNROLL "mkunroll")
|
||||
add_host_executable(${TARGET_MKUNROLL} mkunroll.c)
|
||||
set_target_properties(${TARGET_MKUNROLL} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
if (NOT CMAKE_CROSSCOMPILING)
|
||||
target_compile_definitions(${TARGET_MKUNROLL} PRIVATE ${COMMON_TARGET_DEFINITIONS})
|
||||
endif()
|
||||
|
||||
# Target mkdispatch
|
||||
|
||||
set(TARGET_MKDISPATCH "mkdispatch")
|
||||
add_host_executable(${TARGET_MKDISPATCH} mkdispatch.c)
|
||||
set_target_properties(${TARGET_MKDISPATCH} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
if (NOT CMAKE_CROSSCOMPILING)
|
||||
target_compile_definitions(${TARGET_MKDISPATCH} PRIVATE ${COMMON_TARGET_DEFINITIONS})
|
||||
endif()
|
||||
|
||||
# Target dispatchparam.h
|
||||
|
||||
add_custom_command(OUTPUT dispatchparam.h
|
||||
COMMENT "Generating dispatchparam.h"
|
||||
COMMAND $<TARGET_FILE:${TARGET_MKDISPATCH}> paramonly ${SLEEFDFT_MAXBUTWIDTH} ${ISALIST_DP} > ${CMAKE_CURRENT_BINARY_DIR}/dispatchparam.h
|
||||
DEPENDS ${TARGET_MKDISPATCH}
|
||||
)
|
||||
add_custom_target(dispatchparam.h_generated SOURCES ${CMAKE_CURRENT_BINARY_DIR}/dispatchparam.h)
|
||||
|
||||
# Target dispatch*.h
|
||||
|
||||
foreach(T ${LIST_SUPPORTED_FPTYPE})
|
||||
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
|
||||
string(TOUPPER ${ST} CST) # CST is "DP"
|
||||
list(GET LISTLONGTYPENAME ${T} LT) # LT is "double"
|
||||
list(GET LISTTYPEID ${T} ID) # ID is 1
|
||||
|
||||
string(CONCAT S "dispatch" ${ST} ".h") # S is dispatchdp.h
|
||||
add_custom_command(OUTPUT ${S}
|
||||
COMMENT "Generating ${S}"
|
||||
COMMAND $<TARGET_FILE:${TARGET_MKDISPATCH}> ${LT} ${SLEEFDFT_MAXBUTWIDTH} ${ISALIST_${CST}} > ${S}
|
||||
DEPENDS ${TARGET_MKDISPATCH}
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
|
||||
)
|
||||
|
||||
string(CONCAT G ${S} "_generated") # G is dispatchdp.h_generated
|
||||
add_custom_target(${G} SOURCES ${S})
|
||||
endforeach()
|
||||
|
||||
# Target dftcommon.o
|
||||
|
||||
add_library(dftcommon_obj OBJECT dftcommon.c dftcommon.h ${CMAKE_CURRENT_BINARY_DIR}/dispatchparam.h ${sleef_BINARY_DIR}/include/sleef.h)
|
||||
add_dependencies(dftcommon_obj ${TARGET_HEADERS} dispatchparam.h_generated)
|
||||
set_source_files_properties(${sleef_BINARY_DIR}/include/sleef.h PROPERTIES GENERATED TRUE)
|
||||
set_target_properties(dftcommon_obj PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
target_compile_definitions(dftcommon_obj PRIVATE ${COMMON_TARGET_DEFINITIONS})
|
||||
|
||||
# Target dft*.o
|
||||
|
||||
foreach(T ${LIST_SUPPORTED_FPTYPE})
|
||||
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
|
||||
|
||||
string(CONCAT G "dft" ${ST} "_obj") # G is "dftdp_obj"
|
||||
string(CONCAT S "dispatch" ${ST} ".h") # S is "dispatchdp.h"
|
||||
add_library(${G} OBJECT dft.c dftcommon.h ${S})
|
||||
string(CONCAT SG ${S} "_generated") # SG is "dispatchdp.h_generated"
|
||||
add_dependencies(${G} ${SG} ${TARGET_HEADERS})
|
||||
set_target_properties(${G} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
list(GET LISTTYPEID ${T} ID) # ID is 1
|
||||
target_compile_definitions(${G} PRIVATE BASETYPEID=${ID} ${COMMON_TARGET_DEFINITIONS})
|
||||
endforeach()
|
||||
|
||||
# Copy unroll0.org to ${CMAKE_CURRENT_BINARY_DIR}
|
||||
|
||||
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/unroll0.org
|
||||
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/unroll0.org ${CMAKE_CURRENT_BINARY_DIR}
|
||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/unroll0.org)
|
||||
add_custom_target(unroll0.org.copied DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/unroll0.org)
|
||||
|
||||
# Target unroll*.c
|
||||
|
||||
foreach(T ${LIST_SUPPORTED_FPTYPE})
|
||||
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
|
||||
string(TOUPPER ${ST} CST) # CST is "DP"
|
||||
list(GET LISTLONGTYPENAME ${T} LT) # LT is "double"
|
||||
|
||||
foreach(E ${ISALIST_${CST}}) # E is "sse2dp"
|
||||
foreach(N ${NLIST})
|
||||
string(CONCAT UC unroll_ ${N} _ ${E} ".c") # UC is "unroll_0_sse2dp.c"
|
||||
set(UNROLL_TARGET_${CST} ${UNROLL_TARGET_${CST}} ${UC})
|
||||
endforeach()
|
||||
endforeach()
|
||||
message(STATUS "Unroll target for ${CST} : ${UNROLL_TARGET_${CST}}")
|
||||
|
||||
if(UNROLL_TARGET_${CST})
|
||||
add_custom_command(OUTPUT ${UNROLL_TARGET_${CST}}
|
||||
COMMENT "Generating ${UNROLL_TARGET_${CST}}"
|
||||
COMMAND $<TARGET_FILE:${TARGET_MKUNROLL}> ${LT} ${ISALIST_${CST}}
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
|
||||
DEPENDS ${TARGET_MKUNROLL} unroll0.org.copied
|
||||
)
|
||||
add_custom_target(unroll_target_${ST} DEPENDS ${UNROLL_TARGET_${CST}})
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
# Target unroll*.o
|
||||
|
||||
foreach(T ${LIST_SUPPORTED_FPTYPE})
|
||||
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
|
||||
string(TOUPPER ${ST} CST) # CST is "DP"
|
||||
list(GET LISTLONGTYPENAME ${T} LT) # LT is "double"
|
||||
|
||||
foreach(E ${ISALIST_${CST}}) # E is "sse2dp"
|
||||
foreach(N ${NLIST})
|
||||
string(CONCAT U unroll_ ${N} _ ${E}) # U is "unroll_0_sse2dp"
|
||||
string(CONCAT UG ${U} "_obj") # UG is "unroll_0_sse2dp_obj"
|
||||
string(CONCAT UC ${U} ".c") # UC is "unroll_0_sse2dp.c"
|
||||
add_library(${UG} OBJECT ${UC})
|
||||
set_target_properties(${UG} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
target_include_directories(${UG} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
target_compile_definitions(${UG} PRIVATE ${COMMON_TARGET_DEFINITIONS} ${MACRODEF_${E}})
|
||||
target_compile_options(${UG} PRIVATE ${CFLAGS_${E}})
|
||||
add_dependencies(${UG} ${TARGET_HEADERS} unroll_target_${ST})
|
||||
endforeach()
|
||||
endforeach()
|
||||
endforeach()
|
||||
|
||||
# Target libdft
|
||||
|
||||
add_library(${TARGET_LIBDFT} $<TARGET_OBJECTS:dftcommon_obj> $<TARGET_OBJECTS:${TARGET_LIBARRAYMAP_OBJ}>)
|
||||
target_link_libraries(${TARGET_LIBDFT} ${TARGET_LIBSLEEF} ${LIBM})
|
||||
|
||||
foreach(T ${LIST_SUPPORTED_FPTYPE})
|
||||
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
|
||||
|
||||
string(CONCAT G "dft" ${ST} "_obj") # G is "dftdp_obj"
|
||||
target_sources(${TARGET_LIBDFT} PRIVATE $<TARGET_OBJECTS:${G}>)
|
||||
endforeach()
|
||||
|
||||
foreach(T ${LIST_SUPPORTED_FPTYPE})
|
||||
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
|
||||
string(TOUPPER ${ST} CST) # CST is "DP"
|
||||
|
||||
foreach(E ${ISALIST_${CST}}) # E is "sse2dp"
|
||||
foreach(N ${NLIST})
|
||||
string(CONCAT UG unroll_ ${N} _ ${E} "_obj") # U is "unroll_0_sse2dp_obj"
|
||||
target_sources(${TARGET_LIBDFT} PRIVATE $<TARGET_OBJECTS:${UG}>)
|
||||
endforeach()
|
||||
endforeach()
|
||||
endforeach()
|
||||
|
||||
set_target_properties(${TARGET_LIBDFT} PROPERTIES
|
||||
VERSION ${SLEEF_VERSION}
|
||||
SOVERSION ${SLEEF_SOVERSION}
|
||||
PUBLIC_HEADER ${PROJECT_SOURCE_DIR}/include/sleefdft.h
|
||||
${COMMON_TARGET_PROPERTIES}
|
||||
)
|
||||
|
||||
# Install
|
||||
install(
|
||||
TARGETS ${TARGET_LIBDFT}
|
||||
EXPORT sleefTargets
|
||||
PUBLIC_HEADER #
|
||||
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
|
||||
COMPONENT sleef_Development
|
||||
LIBRARY #
|
||||
DESTINATION "${CMAKE_INSTALL_LIBDIR}"
|
||||
COMPONENT sleef_Runtime
|
||||
NAMELINK_COMPONENT sleef_Development
|
||||
ARCHIVE #
|
||||
DESTINATION "${CMAKE_INSTALL_LIBDIR}"
|
||||
COMPONENT sleef_Development
|
||||
RUNTIME #
|
||||
DESTINATION "${CMAKE_INSTALL_BINDIR}"
|
||||
COMPONENT sleef_Runtime
|
||||
INCLUDES #
|
||||
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
|
||||
)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,423 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
#include <inttypes.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#ifdef _OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#include "misc.h"
|
||||
#include "sleef.h"
|
||||
|
||||
#define IMPORT_IS_EXPORT
|
||||
#include "sleefdft.h"
|
||||
#include "dispatchparam.h"
|
||||
#include "dftcommon.h"
|
||||
#include "common.h"
|
||||
#include "arraymap.h"
|
||||
|
||||
#define MAGIC_FLOAT 0x31415926
|
||||
#define MAGIC_DOUBLE 0x27182818
|
||||
|
||||
#define MAGIC2D_FLOAT 0x22360679
|
||||
#define MAGIC2D_DOUBLE 0x17320508
|
||||
|
||||
const char *configStr[] = { "ST", "ST stream", "MT", "MT stream" };
|
||||
|
||||
static int parsePathStr(char *p, int *path, int *config, int pathLenMax, int log2len) {
|
||||
int pathLen = 0, l2l = 0;
|
||||
|
||||
for(;;) {
|
||||
while(*p == ' ') p++;
|
||||
if (*p == '\0') break;
|
||||
if (!isdigit((int)*p)) return -1;
|
||||
|
||||
pathLen++;
|
||||
if (pathLen >= pathLenMax) return -2;
|
||||
|
||||
int n = 0;
|
||||
while(isdigit((int)*p)) n = n * 10 + *p++ - '0';
|
||||
|
||||
if (n > MAXBUTWIDTH) return -6;
|
||||
path[pathLen-1] = n;
|
||||
l2l += n;
|
||||
config[pathLen-1] = 0;
|
||||
|
||||
if (*p != '(') continue;
|
||||
|
||||
int c;
|
||||
for(c=3;c>=0;c--) if (strncmp(p+1, configStr[c], strlen(configStr[c])) == 0) break;
|
||||
if (c == -1) return -3;
|
||||
p += strlen(configStr[c]) + 1;
|
||||
if (*p != ')') return -4;
|
||||
p++;
|
||||
|
||||
config[pathLen-1] = c;
|
||||
}
|
||||
|
||||
if (l2l != log2len) return -5;
|
||||
|
||||
return pathLen;
|
||||
}
|
||||
|
||||
EXPORT void SleefDFT_setPath(SleefDFT *p, char *pathStr) {
|
||||
assert(p != NULL && (p->magic == MAGIC_FLOAT || p->magic == MAGIC_DOUBLE));
|
||||
|
||||
int path[32], config[32];
|
||||
int pathLen = parsePathStr(pathStr, path, config, 31, p->log2len);
|
||||
|
||||
if (pathLen < 0) {
|
||||
if ((p->mode & SLEEF_MODE_VERBOSE) != 0) printf("Error %d in parsing path string : %s\n", pathLen, pathStr);
|
||||
return;
|
||||
}
|
||||
|
||||
for(uint32_t j = 0;j <= p->log2len;j++) p->bestPath[j] = 0;
|
||||
|
||||
for(int level = p->log2len, j=0;level > 0 && j < pathLen;) {
|
||||
p->bestPath[level] = path[j];
|
||||
p->bestPathConfig[level] = config[j];
|
||||
level -= path[j];
|
||||
j++;
|
||||
}
|
||||
|
||||
p->pathLen = 0;
|
||||
for(int j = p->log2len;j >= 0;j--) if (p->bestPath[j] != 0) p->pathLen++;
|
||||
|
||||
if ((p->mode & SLEEF_MODE_VERBOSE) != 0) {
|
||||
printf("Set path : ");
|
||||
for(int j = p->log2len;j >= 0;j--) if (p->bestPath[j] != 0) printf("%d(%s) ", p->bestPath[j], configStr[p->bestPathConfig[j]]);
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
// Release all twiddle tables owned by plan *p: for each butterfly width N,
// every per-level table (levels N..log2len) and then the pointer array.
void freeTables(SleefDFT *p) {
  for(int N=1;N<=MAXBUTWIDTH;N++) {
    for(uint32_t lv=N;lv<=p->log2len;lv++) {
      Sleef_free(p->tbl[N][lv]);
    }
    free(p->tbl[N]);
    p->tbl[N] = NULL;   // avoid dangling pointer on double dispose
  }
}
|
||||
|
||||
// Destroy a plan created by the SleefDFT initializers, releasing all
// buffers it owns.  Tolerates NULL like free() — previously a NULL
// argument tripped the magic-number assertion (or was UB with NDEBUG).
EXPORT void SleefDFT_dispose(SleefDFT *p) {
  if (p == NULL) return;

  if (p->magic == MAGIC2D_FLOAT || p->magic == MAGIC2D_DOUBLE) {
    // 2D plan: release the transpose buffer and the 1D sub-plans.
    Sleef_free(p->tBuf);
    SleefDFT_dispose(p->instH);
    // When the transform is square only one sub-plan is disposed here;
    // presumably instV aliases or is unused in that case — TODO confirm.
    if (p->hlen != p->vlen) SleefDFT_dispose(p->instV);

    p->magic = 0;
    free(p);
    return;
  }

  assert(p->magic == MAGIC_FLOAT || p->magic == MAGIC_DOUBLE);

  if (p->log2len <= 1) {
    // Trivial-size plans allocate no tables or permutations.
    p->magic = 0;
    free(p);
    return;
  }

  if ((p->mode & SLEEF_MODE_REAL) != 0) {
    // Real-transform coefficient tables exist only in SLEEF_MODE_REAL.
    Sleef_free(p->rtCoef1);
    Sleef_free(p->rtCoef0);
    p->rtCoef0 = p->rtCoef1 = NULL;
  }

  // Per-level permutation tables, then the table of pointers itself.
  for(int level = p->log2len;level >= 1;level--) {
    Sleef_free(p->perm[level]);
  }
  free(p->perm);
  p->perm = NULL;

  freeTables(p);

  p->magic = 0;   // poison the tag so stale pointers fail the assert
  free(p);
}
|
||||
|
||||
// floor(log2(q)) for q >= 1; returns (uint32_t)-1 for q == 0, exactly
// like the original table-driven implementation.  Written as a plain
// shift loop — this is only used during plan setup, not in hot paths.
uint32_t ilog2(uint32_t q) {
  if (q == 0) return ~(uint32_t)0;
  uint32_t r = 0;
  while (q >>= 1) r++;
  return r;
}
|
||||
|
||||
//
|
||||
|
||||
// Shared planner state.  dftPlanFilePath/archID identify the on-disk plan
// cache; planMap is the in-memory key/value store mirroring it.
char *dftPlanFilePath = NULL;   // plan-file path; NULL when unset
char *archID = NULL;            // CPU identifier tagging plan entries
uint64_t planMode = SLEEF_PLAN_REFERTOENVVAR;   // SLEEF_PLAN_* flag word
ArrayMap *planMap = NULL;       // lazily created by loadPlanFromFile()
int planFilePathSet = 0, planFileLoaded = 0;    // one-shot latches
#ifdef _OPENMP
omp_lock_t planMapLock;         // guards planMap and the latches above
int planMapLockInitialized = 0; // set once inside an omp critical section
#endif
|
||||
|
||||
// Lazily initialize planMapLock exactly once.  The omp critical section
// serializes concurrent first calls; the flag prevents re-initialization
// afterwards.  A no-op when compiled without OpenMP.
static void initPlanMapLock() {
#ifdef _OPENMP
#pragma omp critical
  {
    if (!planMapLockInitialized) {
      planMapLockInitialized = 1;
      omp_init_lock(&planMapLock);
    }
  }
#endif
}
|
||||
|
||||
static void planMap_clear() {
|
||||
if (planMap != NULL) ArrayMap_dispose(planMap);
|
||||
planMap = NULL;
|
||||
}
|
||||
|
||||
EXPORT void SleefDFT_setPlanFilePath(const char *path, const char *arch, uint64_t mode) {
|
||||
initPlanMapLock();
|
||||
|
||||
if ((mode & SLEEF_PLAN_RESET) != 0) {
|
||||
planMap_clear();
|
||||
planFileLoaded = 0;
|
||||
planFilePathSet = 0;
|
||||
}
|
||||
|
||||
if (dftPlanFilePath != NULL) free(dftPlanFilePath);
|
||||
if (path != NULL) {
|
||||
dftPlanFilePath = malloc(strlen(path)+10);
|
||||
strcpy(dftPlanFilePath, path);
|
||||
} else {
|
||||
dftPlanFilePath = NULL;
|
||||
}
|
||||
|
||||
if (archID != NULL) free(archID);
|
||||
if (arch == NULL) arch = Sleef_getCpuIdString();
|
||||
archID = malloc(strlen(arch)+10);
|
||||
strcpy(archID, arch);
|
||||
|
||||
planMode = mode;
|
||||
planFilePathSet = 1;
|
||||
}
|
||||
|
||||
static void loadPlanFromFile() {
|
||||
if (planFilePathSet == 0 && (planMode & SLEEF_PLAN_REFERTOENVVAR) != 0) {
|
||||
char *s = getenv(ENVVAR);
|
||||
if (s != NULL) SleefDFT_setPlanFilePath(s, NULL, planMode);
|
||||
}
|
||||
|
||||
if (planMap != NULL) ArrayMap_dispose(planMap);
|
||||
|
||||
if (dftPlanFilePath != NULL && (planMode & SLEEF_PLAN_RESET) == 0) {
|
||||
planMap = ArrayMap_load(dftPlanFilePath, archID, PLANFILEID, (planMode & SLEEF_PLAN_NOLOCK) == 0);
|
||||
}
|
||||
|
||||
if (planMap == NULL) planMap = initArrayMap();
|
||||
|
||||
planFileLoaded = 1;
|
||||
}
|
||||
|
||||
static void savePlanToFile() {
|
||||
assert(planFileLoaded);
|
||||
if ((planMode & SLEEF_PLAN_READONLY) == 0 && dftPlanFilePath != NULL) {
|
||||
ArrayMap_save(planMap, dftPlanFilePath, archID, PLANFILEID);
|
||||
}
|
||||
}
|
||||
|
||||
// Bit widths of the fields packed into 64-bit plan-cache keys
// (see keyButStat/keyTrans/keyPath/keyPathConfig below).
#define CATBIT 8         // entry category
#define BASETYPEIDBIT 2  // base element type id
#define LOG2LENBIT 8     // log2 of transform length
#define DIRBIT 1         // transform direction (1 = forward)

#define BUTSTATBIT 16    // butterfly-statistics payload
|
||||
|
||||
static uint64_t keyButStat(int baseTypeID, int log2len, int dir, int butStat) {
|
||||
dir = (dir & SLEEF_MODE_BACKWARD) == 0;
|
||||
int cat = 0;
|
||||
uint64_t k = 0;
|
||||
k = (k << BUTSTATBIT) | (butStat & ~(~(uint64_t)0 << BUTSTATBIT));
|
||||
k = (k << LOG2LENBIT) | (log2len & ~(~(uint64_t)0 << LOG2LENBIT));
|
||||
k = (k << DIRBIT) | (dir & ~(~(uint64_t)0 << LOG2LENBIT));
|
||||
k = (k << BASETYPEIDBIT) | (baseTypeID & ~(~(uint64_t)0 << BASETYPEIDBIT));
|
||||
k = (k << CATBIT) | (cat & ~(~(uint64_t)0 << CATBIT));
|
||||
return k;
|
||||
}
|
||||
|
||||
// Additional key-field widths used by keyTrans/keyPath/keyPathConfig.
#define LEVELBIT LOG2LENBIT  // butterfly level within the transform
#define BUTCONFIGBIT 8       // butterfly configuration
#define TRANSCONFIGBIT 8     // transpose configuration
|
||||
|
||||
static uint64_t keyTrans(int baseTypeID, int hlen, int vlen, int transConfig) {
|
||||
int max = MAX(hlen, vlen), min = MIN(hlen, vlen);
|
||||
int cat = 2;
|
||||
uint64_t k = 0;
|
||||
k = (k << TRANSCONFIGBIT) | (transConfig & ~(~(uint64_t)0 << TRANSCONFIGBIT));
|
||||
k = (k << LOG2LENBIT) | (max & ~(~(uint64_t)0 << LOG2LENBIT));
|
||||
k = (k << LOG2LENBIT) | (min & ~(~(uint64_t)0 << LOG2LENBIT));
|
||||
k = (k << BASETYPEIDBIT) | (baseTypeID & ~(~(uint64_t)0 << BASETYPEIDBIT));
|
||||
k = (k << CATBIT) | (cat & ~(~(uint64_t)0 << CATBIT));
|
||||
return k;
|
||||
}
|
||||
|
||||
static uint64_t keyPath(int baseTypeID, int log2len, int dir, int level, int config) {
|
||||
dir = (dir & SLEEF_MODE_BACKWARD) == 0;
|
||||
int cat = 3;
|
||||
uint64_t k = 0;
|
||||
k = (k << BUTCONFIGBIT) | (config & ~(~(uint64_t)0 << BUTCONFIGBIT));
|
||||
k = (k << LEVELBIT) | (level & ~(~(uint64_t)0 << LEVELBIT));
|
||||
k = (k << LOG2LENBIT) | (log2len & ~(~(uint64_t)0 << LOG2LENBIT));
|
||||
k = (k << DIRBIT) | (dir & ~(~(uint64_t)0 << LOG2LENBIT));
|
||||
k = (k << BASETYPEIDBIT) | (baseTypeID & ~(~(uint64_t)0 << BASETYPEIDBIT));
|
||||
k = (k << CATBIT) | (cat & ~(~(uint64_t)0 << CATBIT));
|
||||
return k;
|
||||
}
|
||||
|
||||
static uint64_t keyPathConfig(int baseTypeID, int log2len, int dir, int level, int config) {
|
||||
dir = (dir & SLEEF_MODE_BACKWARD) == 0;
|
||||
int cat = 4;
|
||||
uint64_t k = 0;
|
||||
k = (k << BUTCONFIGBIT) | (config & ~(~(uint64_t)0 << BUTCONFIGBIT));
|
||||
k = (k << LEVELBIT) | (level & ~(~(uint64_t)0 << LEVELBIT));
|
||||
k = (k << LOG2LENBIT) | (log2len & ~(~(uint64_t)0 << LOG2LENBIT));
|
||||
k = (k << DIRBIT) | (dir & ~(~(uint64_t)0 << LOG2LENBIT));
|
||||
k = (k << BASETYPEIDBIT) | (baseTypeID & ~(~(uint64_t)0 << BASETYPEIDBIT));
|
||||
k = (k << CATBIT) | (cat & ~(~(uint64_t)0 << CATBIT));
|
||||
return k;
|
||||
}
|
||||
|
||||
static uint64_t planMap_getU64(uint64_t key) {
|
||||
char *s = ArrayMap_get(planMap, key);
|
||||
if (s == NULL) return 0;
|
||||
uint64_t ret;
|
||||
if (sscanf(s, "%" SCNx64, &ret) != 1) return 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void planMap_putU64(uint64_t key, uint64_t value) {
|
||||
char *s = malloc(100);
|
||||
sprintf(s, "%" PRIx64, value);
|
||||
s = ArrayMap_put(planMap, key, s);
|
||||
if (s != NULL) free(s);
|
||||
}
|
||||
|
||||
// Restore a previously measured execution path for plan *p from the plan
// cache.  Returns 1 when a complete, valid path was restored, 0 when no
// results exist for this category or an entry is corrupted.
int PlanManager_loadMeasurementResultsP(SleefDFT *p, int pathCat) {
  assert(p != NULL && (p->magic == MAGIC_FLOAT || p->magic == MAGIC_DOUBLE));

  initPlanMapLock();

#ifdef _OPENMP
  omp_set_lock(&planMapLock);
#endif
  if (!planFileLoaded) loadPlanFromFile();

  // The butStat entry (pathCat+10) acts as a "results present" marker.
  // Fix: keep the full 64-bit value — the original narrowed it to int,
  // which could in principle truncate a nonzero marker to 0.
  uint64_t stat = planMap_getU64(keyButStat(p->baseTypeID, p->log2len, p->mode, pathCat+10));
  if (stat == 0) {
#ifdef _OPENMP
    omp_unset_lock(&planMapLock);
#endif
    return 0;
  }

  int ret = 1;

  // Restore per-level butterfly width and configuration.
  for(int j = p->log2len;j >= 0;j--) {
    p->bestPath[j] = planMap_getU64(keyPath(p->baseTypeID, p->log2len, p->mode, j, pathCat));
    p->bestPathConfig[j] = planMap_getU64(keyPathConfig(p->baseTypeID, p->log2len, p->mode, j, pathCat));
    if (p->bestPath[j] > MAXBUTWIDTH) ret = 0;   // corrupted entry
  }

  p->pathLen = 0;
  for(int j = p->log2len;j >= 0;j--) if (p->bestPath[j] != 0) p->pathLen++;

#ifdef _OPENMP
  omp_unset_lock(&planMapLock);
#endif
  return ret;
}
|
||||
|
||||
// Persist the execution path currently selected in *p under the given
// path category, then flush the plan cache to disk when permitted.
// Skips writing if results for this category are already recorded.
void PlanManager_saveMeasurementResultsP(SleefDFT *p, int pathCat) {
  assert(p != NULL && (p->magic == MAGIC_FLOAT || p->magic == MAGIC_DOUBLE));

  initPlanMapLock();

#ifdef _OPENMP
  omp_set_lock(&planMapLock);
#endif
  if (!planFileLoaded) loadPlanFromFile();

  // The butStat entry (pathCat+10) is the "results present" marker;
  // bail out if this category was already stored.
  if (planMap_getU64(keyButStat(p->baseTypeID, p->log2len, p->mode, pathCat+10)) != 0) {
#ifdef _OPENMP
    omp_unset_lock(&planMapLock);
#endif
    return;
  }

  // Store the butterfly width and configuration chosen for every level.
  for(int j = p->log2len;j >= 0;j--) {
    planMap_putU64(keyPath(p->baseTypeID, p->log2len, p->mode, j, pathCat), p->bestPath[j]);
    planMap_putU64(keyPathConfig(p->baseTypeID, p->log2len, p->mode, j, pathCat), p->bestPathConfig[j]);
  }

  // Mark this category as measured.
  planMap_putU64(keyButStat(p->baseTypeID, p->log2len, p->mode, pathCat+10), 1);

  if ((planMode & SLEEF_PLAN_READONLY) == 0) savePlanToFile();

#ifdef _OPENMP
  omp_unset_lock(&planMapLock);
#endif
}
|
||||
|
||||
// Restore cached 2D transpose timings (with and without multithreading)
// into *p.  Returns nonzero when a cached measurement was found.
int PlanManager_loadMeasurementResultsT(SleefDFT *p) {
  assert(p != NULL && (p->magic == MAGIC2D_FLOAT || p->magic == MAGIC2D_DOUBLE));

  initPlanMapLock();

#ifdef _OPENMP
  omp_set_lock(&planMapLock);
#endif
  if (!planFileLoaded) loadPlanFromFile();

  const uint64_t keyNoMT = keyTrans(p->baseTypeID, p->log2hlen, p->log2vlen, 0);
  const uint64_t keyMT   = keyTrans(p->baseTypeID, p->log2hlen, p->log2vlen, 1);
  p->tmNoMT = planMap_getU64(keyNoMT);
  p->tmMT   = planMap_getU64(keyMT);

  const int found = p->tmNoMT != 0;

#ifdef _OPENMP
  omp_unset_lock(&planMapLock);
#endif
  return found;
}
|
||||
|
||||
// Persist the 2D transpose timings held in *p, then flush the plan cache
// to disk when the planner is not read-only.
void PlanManager_saveMeasurementResultsT(SleefDFT *p) {
  assert(p != NULL && (p->magic == MAGIC2D_FLOAT || p->magic == MAGIC2D_DOUBLE));

  initPlanMapLock();

#ifdef _OPENMP
  omp_set_lock(&planMapLock);
#endif
  if (!planFileLoaded) loadPlanFromFile();

  // Config 0 = single-threaded timing, config 1 = multithreaded timing.
  planMap_putU64(keyTrans(p->baseTypeID, p->log2hlen, p->log2vlen, 0), p->tmNoMT);
  planMap_putU64(keyTrans(p->baseTypeID, p->log2hlen, p->log2vlen, 1), p->tmMT  );

  if ((planMode & SLEEF_PLAN_READONLY) == 0) savePlanToFile();

#ifdef _OPENMP
  omp_unset_lock(&planMapLock);
#endif
}
|
||||
@@ -0,0 +1,69 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#define CONFIGMAX 4
|
||||
#define CONFIG_STREAM 1
|
||||
#define CONFIG_MT 2
|
||||
|
||||
#define MAXLOG2LEN 32
|
||||
|
||||
// A DFT execution plan.  The magic tag selects which union arm is live:
// MAGIC_FLOAT/MAGIC_DOUBLE -> the 1D arm, MAGIC2D_FLOAT/MAGIC2D_DOUBLE
// -> the 2D arm (see SleefDFT_dispose).
typedef struct SleefDFT {
  uint32_t magic;              // plan-kind tag; zeroed on dispose
  uint64_t mode, mode2, mode3; // SLEEF_MODE* flag words
  int baseTypeID;              // base element type id (float vs double)
  const void *in;              // input buffer
  void *out;                   // output buffer

  union {
    // 1D transform state.
    struct {
      uint32_t log2len;        // log2 of transform length

      void **tbl[MAXBUTWIDTH+1]; // twiddle tables per butterfly width and level (freed by freeTables)
      void *rtCoef0, *rtCoef1;   // real-transform coefficients (SLEEF_MODE_REAL only)
      uint32_t **perm;           // per-level permutation tables (levels 1..log2len)

      void **x0, **x1;           // work buffers -- TODO confirm role

      int isa;                   // selected ISA index
      int planMode;

      int vecwidth, log2vecwidth;
      int nThread;

      // Timing measurements -- presumably indexed by config and
      // width/level; verify against the measurement code.
      uint64_t tm[CONFIGMAX][(MAXBUTWIDTH+1)*32];
      uint64_t bestTime;
      // Chosen butterfly path, indexed by level (see SleefDFT_setPath).
      int16_t bestPath[32], bestPathConfig[32], pathLen;
    };

    // 2D transform state.
    struct {
      int32_t hlen, vlen;          // horizontal / vertical lengths
      int32_t log2hlen, log2vlen;
      uint64_t tmNoMT, tmMT;       // transpose timings without / with MT
      struct SleefDFT *instH, *instV; // 1D sub-plans; instV disposed only when hlen != vlen
      void *tBuf;                  // transpose buffer (Sleef_free'd on dispose)
    };
  };
} SleefDFT;
|
||||
|
||||
#define SLEEF_MODE2_MT1D (1 << 0)
|
||||
#define SLEEF_MODE3_MT2D (1 << 0)
|
||||
|
||||
#define PLANFILEID "SLEEFDFT0\n"
|
||||
#define ENVVAR "SLEEFDFTPLAN"
|
||||
|
||||
#define SLEEF_MODE_MEASUREBITS (3 << 20)
|
||||
|
||||
void freeTables(SleefDFT *p);
|
||||
uint32_t ilog2(uint32_t q);
|
||||
|
||||
//int PlanManager_loadMeasurementResultsB(SleefDFT *p);
|
||||
//void PlanManager_saveMeasurementResultsB(SleefDFT *p, int butStat);
|
||||
int PlanManager_loadMeasurementResultsT(SleefDFT *p);
|
||||
void PlanManager_saveMeasurementResultsT(SleefDFT *p);
|
||||
int PlanManager_loadMeasurementResultsP(SleefDFT *p, int pathCat);
|
||||
void PlanManager_saveMeasurementResultsP(SleefDFT *p, int pathCat);
|
||||
|
||||
#define GETINT_VECWIDTH 100
|
||||
#define GETINT_DFTPRIORITY 101
|
||||
@@ -0,0 +1,193 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifndef ENABLE_STREAM
|
||||
#error ENABLE_STREAM not defined
|
||||
#endif
|
||||
|
||||
// Emit one CONFIGMAX x ISAMAX x (MAXBUTWIDTH+1) dispatch table named
// <tblName>_<basetype>.  Entry [config][isa][width] is the kernel
// <funcName><2^width><dir>_<config>_<isa>, or NULL for streaming configs
// when streaming support is compiled out.  When firstIsForward is set,
// the width-1 entry uses the forward ('f') kernel regardless of dirChar
// (the backward dft table reuses the forward width-2 kernel).
static void emitTable(const char *tblName, const char *basetype, const char *argList,
                      const char *funcName, char dirChar, int firstIsForward,
                      int maxbutwidth, int enable_stream, int isastart,
                      int argc, char **argv) {
  printf("void (*%s_%s[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])%s = {\n", tblName, basetype, argList);
  for(int config=0;config<4;config++) {
    printf(" {\n");
    for(int k=isastart;k<argc;k++) {
      printf(" {NULL, ");
      for(int i=1;i<=maxbutwidth;i++) {
        if (enable_stream || (config & 1) == 0) {
          char dir = (firstIsForward && i == 1) ? 'f' : dirChar;
          printf("%s%d%c_%d_%s, ", funcName, 1 << i, dir, config, argv[k]);
        } else {
          printf("NULL, ");
        }
      }
      printf("},\n");
    }
    printf("},\n");
  }
  printf("};\n\n");
}

// Generate (on stdout) the kernel prototypes and dispatch tables for the
// given base type, maximum butterfly width and list of ISA names.
int main(int argc, char **argv) {
  if (argc < 3) {
    // Fix: the usage line now matches what is actually parsed below
    // (argv[1] = base type, argv[2] = max butterfly width, rest = ISAs);
    // it previously mentioned nonexistent <unrollmax> arguments.
    fprintf(stderr, "Usage : %s <basetype> <maxbutwidth> <isa> ...\n", argv[0]);
    exit(-1);
  }

  const char *basetype = argv[1];
  const int maxbutwidth = atoi(argv[2]);
  const int isastart = 3;
  const int isamax = argc - isastart;

#if ENABLE_STREAM == 1
  const int enable_stream = 1;
#else
  const int enable_stream = 0;
#endif

  printf("#define MAXBUTWIDTH %d\n", maxbutwidth);
  printf("\n");

  // "paramonly" mode emits just the width macro above.
  if (strcmp(basetype, "paramonly") == 0) exit(0);

  printf("#define ISAMAX %d\n", isamax);
  printf("#define CONFIGMAX 4\n");

  // Prototypes for every kernel referenced by the tables below.
  for(int k=isastart;k<argc;k++) {
    for(int config=0;config<4;config++) {
#if ENABLE_STREAM == 0
      if ((config & 1) != 0) continue;
#endif
      for(int j=1;j<=maxbutwidth;j++) {
        printf("void dft%df_%d_%s(real *, const real *, const int);\n", 1 << j, config, argv[k]);
        printf("void dft%db_%d_%s(real *, const real *, const int);\n", 1 << j, config, argv[k]);
        printf("void tbut%df_%d_%s(real *, uint32_t *, const real *, const int, const real *, const int);\n", 1 << j, config, argv[k]);
        printf("void tbut%db_%d_%s(real *, uint32_t *, const real *, const int, const real *, const int);\n", 1 << j, config, argv[k]);
        printf("void but%df_%d_%s(real *, uint32_t *, const int, const real *, const int, const real *, const int);\n", 1 << j, config, argv[k]);
        printf("void but%db_%d_%s(real *, uint32_t *, const int, const real *, const int, const real *, const int);\n", 1 << j, config, argv[k]);
      }
    }
    printf("void realSub0_%s(real *, const real *, const int, const real *, const real *);\n", argv[k]);
    printf("void realSub1_%s(real *, const real *, const int, const real *, const real *, const int);\n", argv[k]);
    printf("int getInt_%s(int);\n", argv[k]);
    printf("const void *getPtr_%s(int);\n", argv[k]);
  }

  printf("\n");

  // Argument lists shared by the kernel families.
  static const char *dftArgs  = "(real *, const real *, const int)";
  static const char *tbutArgs = "(real *, uint32_t *, const real *, const int, const real *, const int)";
  static const char *butArgs  = "(real *, uint32_t *, const int, const real *, const int, const real *, const int)";

  // The six dispatch tables differ only in name, argument list, kernel
  // prefix, direction letter and the dftb width-1 special case.
  emitTable("dftf",  basetype, dftArgs,  "dft",  'f', 0, maxbutwidth, enable_stream, isastart, argc, argv);
  emitTable("dftb",  basetype, dftArgs,  "dft",  'b', 1, maxbutwidth, enable_stream, isastart, argc, argv);
  emitTable("tbutf", basetype, tbutArgs, "tbut", 'f', 0, maxbutwidth, enable_stream, isastart, argc, argv);
  emitTable("tbutb", basetype, tbutArgs, "tbut", 'b', 0, maxbutwidth, enable_stream, isastart, argc, argv);
  emitTable("butf",  basetype, butArgs,  "but",  'f', 0, maxbutwidth, enable_stream, isastart, argc, argv);
  emitTable("butb",  basetype, butArgs,  "but",  'b', 0, maxbutwidth, enable_stream, isastart, argc, argv);

  //

  // Per-ISA helper tables.
  printf("void (*realSub0_%s[ISAMAX])(real *, const real *, const int, const real *, const real *) = {\n ", basetype);
  for(int k=isastart;k<argc;k++) printf("realSub0_%s, ", argv[k]);
  printf("\n};\n\n");

  printf("void (*realSub1_%s[ISAMAX])(real *, const real *, const int, const real *, const real *, const int) = {\n ", basetype);
  for(int k=isastart;k<argc;k++) printf("realSub1_%s, ", argv[k]);
  printf("\n};\n\n");

  // Fixed-size 16-entry tables, NULL-padded past the last ISA.
  printf("int (*getInt_%s[16])(int) = {\n ", basetype);
  for(int k=isastart;k<argc;k++) printf("getInt_%s, ", argv[k]);
  for(int k=0;k<16-(argc-isastart);k++) printf("NULL, ");
  printf("\n};\n\n");

  printf("const void *(*getPtr_%s[16])(int) = {\n ", basetype);
  for(int k=isastart;k<argc;k++) printf("getPtr_%s, ", argv[k]);
  for(int k=0;k<16-(argc-isastart);k++) printf("NULL, ");
  printf("\n};\n\n");

  return 0;
}
|
||||
@@ -0,0 +1,104 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#define CONFIGMAX 4
|
||||
|
||||
// Return a freshly malloc'd copy of `in` with every occurrence of `pat`
// replaced by `replace`.  The scan restarts from the beginning of the
// string after each substitution (so replacements whose result recreates
// the pattern are substituted again).  Caller frees the result.
char *replaceAll(const char *in, const char *pat, const char *replace) {
  const int repLen = (int)strlen(replace);
  const int patLen = (int)strlen(pat);

  char *cur = malloc(strlen(in)+1);
  strcpy(cur, in);

  for(;;) {
    char *hit = strstr(cur, pat);
    if (hit == NULL) break;

    const int prefixLen = (int)(hit - cur);
    const int tailLen = (int)strlen(hit + patLen);

    // Build prefix + replacement + tail (incl. NUL) in a new buffer.
    char *next = malloc(strlen(cur) + (repLen - patLen) + 1);
    memcpy(next, cur, prefixLen);
    memcpy(next + prefixLen, replace, repLen);
    memcpy(next + prefixLen + repLen, cur + prefixLen + patLen, tailLen + 1);

    free(cur);
    cur = next;
  }

  return cur;
}
|
||||
|
||||
#define LEN 1024
|
||||
char line[LEN+10];
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "Usage : %s <Base type> <ISA> ...\n", argv[0]);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
const char *baseType = argv[1];
|
||||
const int isastart = 2;
|
||||
|
||||
for(int config=0;config<CONFIGMAX;config++) {
|
||||
#if ENABLE_STREAM == 0
|
||||
if ((config & 1) != 0) continue;
|
||||
#endif
|
||||
for(int isa=isastart;isa<argc;isa++) {
|
||||
char *isaString = argv[isa];
|
||||
char configString[100];
|
||||
sprintf(configString, "%d", config);
|
||||
|
||||
FILE *fpin = fopen("unroll0.org", "r");
|
||||
|
||||
sprintf(line, "unroll_%d_%s.c", config, isaString);
|
||||
FILE *fpout = fopen(line, "w");
|
||||
fputs("#include \"vectortype.h\"\n\n", fpout);
|
||||
fprintf(fpout, "extern %s ctbl_%s[];\n", baseType, baseType);
|
||||
fprintf(fpout, "#define ctbl ctbl_%s\n\n", baseType);
|
||||
|
||||
for(;;) {
|
||||
if (fgets(line, LEN, fpin) == NULL) break;
|
||||
char *s;
|
||||
if ((config & 1) == 0) {
|
||||
char *s0 = replaceAll(line, "%ISA%", isaString);
|
||||
s = replaceAll(s0, "%CONFIG%", configString);
|
||||
free(s0);
|
||||
} else {
|
||||
char *s0 = replaceAll(line, "%ISA%", isaString);
|
||||
char *s1 = replaceAll(s0, "%CONFIG%", configString);
|
||||
char *s2 = replaceAll(s1, "store(", "stream(");
|
||||
s = replaceAll(s2, "scatter(", "scstream(");
|
||||
free(s0); free(s1); free(s2);
|
||||
}
|
||||
|
||||
if ((config & 2) == 0) {
|
||||
char *s0 = replaceAll(s, "#pragma", "//");
|
||||
free(s);
|
||||
s = s0;
|
||||
}
|
||||
|
||||
if (config == 0) {
|
||||
char *s0 = replaceAll(s, "#undef EMITREALSUB", "#define EMITREALSUB");
|
||||
free(s);
|
||||
s = s0;
|
||||
}
|
||||
|
||||
fputs(s, fpout);
|
||||
free(s);
|
||||
}
|
||||
|
||||
fclose(fpin);
|
||||
fclose(fpout);
|
||||
}
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,145 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#ifndef __VECTORTYPE_H__
|
||||
#define __VECTORTYPE_H__
|
||||
|
||||
#include <math.h>
|
||||
#include "sleef.h"
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#include "helpersse2.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_AVX
|
||||
#include "helperavx.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_AVX2
|
||||
#include "helperavx2.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_AVX512F
|
||||
#include "helperavx512f.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_NEON32
|
||||
#include "helperneon32.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_ADVSIMD
|
||||
#include "helperadvsimd.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SVE
|
||||
#include "helpersve.h"
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_RVVM1) || defined(ENABLE_RVVM2)
|
||||
#include "helperrvv.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VSX
|
||||
#include "helperpower_128.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VSX3
|
||||
#include "helperpower_128.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VXE
|
||||
#include "helpers390x_128.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VXE2
|
||||
#include "helpers390x_128.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VECEXT
|
||||
#include "helpervecext.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_PUREC
|
||||
#include "helperpurec.h"
|
||||
#endif
|
||||
|
||||
#define IMPORT_IS_EXPORT
|
||||
#include "sleefdft.h"
|
||||
|
||||
#if BASETYPEID == 1
// Double-precision configuration: "real" is double, "real2" is one SIMD
// vector of doubles.  LOG2VECWIDTH is one less than the vector's log2
// lane count — presumably because lanes are used in (re,im) pairs; see
// the 2*offset addressing below.
#define LOG2VECWIDTH (LOG2VECTLENDP-1)
#define VECWIDTH (1 << LOG2VECWIDTH)

typedef double real;
typedef vdouble real2;

// Runtime availability check for the ISA the helper header was built for.
static int available(int name) { return vavailability_i(name); }

// Sign manipulation wrappers (helper-defined lane patterns).
static INLINE real2 uminus(real2 d0) { return vneg_vd_vd(d0); }
static INLINE real2 uplusminus(real2 d0) { return vposneg_vd_vd(d0); }
static INLINE real2 uminusplus(real2 d0) { return vnegpos_vd_vd(d0); }

// Arithmetic wrappers; the c* variants broadcast a scalar constant.
static INLINE real2 plus(real2 d0, real2 d1) { return vadd_vd_vd_vd(d0, d1); }
static INLINE real2 minus(real2 d0, real2 d1) { return vsub_vd_vd_vd(d0, d1); }
static INLINE real2 minusplus(real2 d0, real2 d1) { return vsubadd_vd_vd_vd(d0, d1); }
static INLINE real2 times(real2 d0, real2 d1) { return vmul_vd_vd_vd(d0, d1); }
static INLINE real2 timesminusplus(real2 d0, real2 d2, real2 d1) { return vmlsubadd_vd_vd_vd_vd(d0, d2, d1); }
static INLINE real2 ctimes(real2 d0, real d) { return vmul_vd_vd_vd(d0, vcast_vd_d(d)); }
static INLINE real2 ctimesminusplus(real2 d0, real c, real2 d1) { return vmlsubadd_vd_vd_vd_vd(d0, vcast_vd_d(c), d1); }

// In-vector element reordering (semantics defined by the helper header).
static INLINE real2 reverse(real2 d0) { return vrev21_vd_vd(d0); }
static INLINE real2 reverse2(real2 d0) { return vreva2_vd_vd(d0); }

// Broadcast a scalar constant into all lanes.
static INLINE real2 loadc(real c) { return vcast_vd_d(c); }

// Memory access; offsets are scaled by 2 reals per element.
// "stream"/"scstream" map to the helper's vstream/vsscatter variants —
// presumably non-temporal stores; see the helper headers.
static INLINE real2 load(const real *ptr, int offset) { return vload_vd_p(&ptr[2*offset]); }
static INLINE real2 loadu(const real *ptr, int offset) { return vloadu_vd_p(&ptr[2*offset]); }
static INLINE void store(real *ptr, int offset, real2 v) { vstore_v_p_vd(&ptr[2*offset], v); }
static INLINE void storeu(real *ptr, int offset, real2 v) { vstoreu_v_p_vd(&ptr[2*offset], v); }
static INLINE void stream(real *ptr, int offset, real2 v) { vstream_v_p_vd(&ptr[2*offset], v); }
static INLINE void scatter(real *ptr, int offset, int step, real2 v) { vscatter2_v_p_i_i_vd(ptr, offset, step, v); }
static INLINE void scstream(real *ptr, int offset, int step, real2 v) { vsscatter2_v_p_i_i_vd(ptr, offset, step, v); }

static INLINE void prefetch(real *ptr, int offset) { vprefetch_v_p(&ptr[2*offset]); }
|
||||
#elif BASETYPEID == 2
// Single-precision configuration: "real" is float, "real2" is one SIMD
// vector of floats.  Mirrors the double-precision branch above with the
// _vf_ helper intrinsics; offsets are likewise scaled by 2 reals.
#define LOG2VECWIDTH (LOG2VECTLENSP-1)
#define VECWIDTH (1 << LOG2VECWIDTH)

typedef float real;
typedef vfloat real2;

// Runtime availability check for the ISA the helper header was built for.
static int available(int name) { return vavailability_i(name); }

// Sign manipulation wrappers (helper-defined lane patterns).
static INLINE real2 uminus(real2 d0) { return vneg_vf_vf(d0); }
static INLINE real2 uplusminus(real2 d0) { return vposneg_vf_vf(d0); }
static INLINE real2 uminusplus(real2 d0) { return vnegpos_vf_vf(d0); }

// Arithmetic wrappers; the c* variants broadcast a scalar constant.
static INLINE real2 plus(real2 d0, real2 d1) { return vadd_vf_vf_vf(d0, d1); }
static INLINE real2 minus(real2 d0, real2 d1) { return vsub_vf_vf_vf(d0, d1); }
static INLINE real2 minusplus(real2 d0, real2 d1) { return vsubadd_vf_vf_vf(d0, d1); }
static INLINE real2 times(real2 d0, real2 d1) { return vmul_vf_vf_vf(d0, d1); }
static INLINE real2 ctimes(real2 d0, real d) { return vmul_vf_vf_vf(d0, vcast_vf_f(d)); }
static INLINE real2 timesminusplus(real2 d0, real2 d2, real2 d1) { return vmlsubadd_vf_vf_vf_vf(d0, d2, d1); }
static INLINE real2 ctimesminusplus(real2 d0, real c, real2 d1) { return vmlsubadd_vf_vf_vf_vf(d0, vcast_vf_f(c), d1); }

// In-vector element reordering (semantics defined by the helper header).
static INLINE real2 reverse(real2 d0) { return vrev21_vf_vf(d0); }
static INLINE real2 reverse2(real2 d0) { return vreva2_vf_vf(d0); }

// Broadcast a scalar constant into all lanes.
static INLINE real2 loadc(real c) { return vcast_vf_f(c); }

// Memory access wrappers; "stream"/"scstream" use the helper's
// vstream/vsscatter variants — presumably non-temporal stores.
static INLINE real2 load(const real *ptr, int offset) { return vload_vf_p(&ptr[2*offset]); }
static INLINE real2 loadu(const real *ptr, int offset) { return vloadu_vf_p(&ptr[2*offset]); }
static INLINE void store(real *ptr, int offset, real2 v) { vstore_v_p_vf(&ptr[2*offset], v); }
static INLINE void storeu(real *ptr, int offset, real2 v) { vstoreu_v_p_vf(&ptr[2*offset], v); }
static INLINE void stream(real *ptr, int offset, real2 v) { vstream_v_p_vf(&ptr[2*offset], v); }
static INLINE void scatter(real *ptr, int offset, int step, real2 v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
static INLINE void scstream(real *ptr, int offset, int step, real2 v) { vsscatter2_v_p_i_i_vf(ptr, offset, step, v); }

static INLINE void prefetch(real *ptr, int offset) { vprefetch_v_p(&ptr[2*offset]); }
|
||||
#else
|
||||
#error No BASETYPEID specified
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,16 @@
|
||||
# Standalone tool makefile for the coefficient/table generators.
# All tools link against GNU MPFR (arbitrary-precision arithmetic).
.PHONY: all
all : gencoef mkrempitab mkrempitabqp

# Polynomial coefficient generator; the precision preset headers are
# prerequisites so editing them triggers a rebuild.
gencoef : gencoef.c simplexfr.c sp.h dp.h ld.h qp.h
	gcc -O gencoef.c simplexfr.c -o gencoef -lmpfr -lm

mkrempitab : mkrempitab.c
	gcc -O mkrempitab.c -o mkrempitab -lmpfr

mkrempitabqp : mkrempitabqp.c
	gcc -O mkrempitabqp.c -o mkrempitabqp -lmpfr

.PHONY: clean
clean :
	rm -f gencoef gencoefdp gencoefld mkrempitab mkrempitabqp a.out *~
	rm -f *.obj *.lib *.dll *.exp *.exe
|
||||
@@ -0,0 +1,196 @@
|
||||
// This is part of SLEEF, written by Naoki
|
||||
// Shibata. http://shibatch.sourceforge.net
|
||||
|
||||
// The code in this file is distributed under the Creative Commons
|
||||
// Attribution 4.0 International License.
|
||||
|
||||
// Working precision of the generated coefficients, in bits (53 = double).
#define PREC_TARGET 53

#if 0  // disabled preset: minimax polynomial for sin(x) on [0, pi/4]
#define N 8 // Degree of equation
#define S 40 // Number of samples for phase 1
#define L 4 // Number of high precision coefficients
#define MIN 0.0 // Min argument
#define MAX (M_PI/4) // Max argument
#define PMUL 2 // The form of polynomial is y = x^(PADD+PMUL*0) + x^(PADD+PMUL*1) + ...
#define PADD 1

// Function to approximate, evaluated in MPFR arbitrary precision.
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_sin(ret, a, GMP_RNDN); } // The function to approximate
// Argument transformation applied before evaluation (identity here).
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0 // Fix coef 0 to 1.0
#endif
|
||||
|
||||
#if 0
|
||||
#define N 10
|
||||
#define S 40
|
||||
#define L 2
|
||||
#define MIN 0.0
|
||||
#define MAX (M_PI/4)
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { // cos(x) - 1
|
||||
mpfr_t x;
|
||||
mpfr_init(x);
|
||||
mpfr_cos(ret, a, GMP_RNDN);
|
||||
mpfr_set_ld(x, 1, GMP_RNDN);
|
||||
mpfr_sub(ret, ret, x, GMP_RNDN);
|
||||
mpfr_clear(x);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
|
||||
#define PMUL 2
|
||||
#define PADD 2
|
||||
#define FIXCOEF0 (-0.5)
|
||||
#endif
|
||||
|
||||
|
||||
#if 0 // for xsincospi4_u05
|
||||
#define S 40
|
||||
#define N 8
|
||||
#define L 2
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) {
|
||||
mpfr_t x, y;
|
||||
mpfr_inits(x, y, NULL);
|
||||
mpfr_const_pi(x, GMP_RNDN);
|
||||
mpfr_set_d(y, 1.0/4, GMP_RNDN);
|
||||
mpfr_mul(x, x, y, GMP_RNDN);
|
||||
mpfr_mul(x, x, a, GMP_RNDN);
|
||||
mpfr_sin(ret, x, GMP_RNDN);
|
||||
mpfr_clears(x, y, NULL);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#endif
|
||||
|
||||
#if 0 // for xsincospi4_u05
|
||||
#define N 8
|
||||
#define S 40
|
||||
#define L 2
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) {
|
||||
mpfr_t x, y;
|
||||
mpfr_inits(x, y, NULL);
|
||||
mpfr_const_pi(x, GMP_RNDN);
|
||||
mpfr_set_d(y, 1.0/4, GMP_RNDN);
|
||||
mpfr_mul(x, x, y, GMP_RNDN);
|
||||
mpfr_mul(x, x, a, GMP_RNDN);
|
||||
mpfr_cos(ret, x, GMP_RNDN);
|
||||
mpfr_set_ld(x, 1, GMP_RNDN);
|
||||
mpfr_sub(ret, ret, x, GMP_RNDN);
|
||||
mpfr_clears(x, y, NULL);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define PMUL 2
|
||||
#define PADD 2
|
||||
#endif
|
||||
|
||||
|
||||
#if 0 // for xsincospi4
|
||||
#define N 7
|
||||
#define S 40
|
||||
#define L 0
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) {
|
||||
mpfr_t x, y;
|
||||
mpfr_inits(x, y, NULL);
|
||||
mpfr_const_pi(x, GMP_RNDN);
|
||||
mpfr_set_d(y, 1.0/4, GMP_RNDN);
|
||||
mpfr_mul(x, x, y, GMP_RNDN);
|
||||
mpfr_mul(x, x, a, GMP_RNDN);
|
||||
mpfr_sin(ret, x, GMP_RNDN);
|
||||
mpfr_clears(x, y, NULL);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#endif
|
||||
|
||||
|
||||
#if 0
|
||||
#define N 17
|
||||
#define S 60
|
||||
#define L 0
|
||||
#define MIN 0.0
|
||||
#define MAX (M_PI/4)
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_tan(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define N 11
|
||||
#define S 35
|
||||
#define L 2
|
||||
#define MIN 1 //0.75
|
||||
#define MAX 1.5
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_log(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t frd, mpfr_t fra) {
|
||||
mpfr_t tmp, one;
|
||||
mpfr_inits(tmp, one, NULL);
|
||||
mpfr_set_d(one, 1, GMP_RNDN);
|
||||
mpfr_add(tmp, fra, one, GMP_RNDN);
|
||||
mpfr_sub(frd, fra, one, GMP_RNDN);
|
||||
mpfr_div(frd, frd, tmp, GMP_RNDN);
|
||||
mpfr_clears(tmp, one, NULL);
|
||||
}
|
||||
#define FIXCOEF0 2.0
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
#define N 12
|
||||
#define S 50
|
||||
#define L 2
|
||||
#define MIN -0.347
|
||||
#define MAX 0.347 // 0.5 log 2
|
||||
#define PMUL 1
|
||||
#define PADD 0
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_exp(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0
|
||||
#define FIXCOEF1 1.0
|
||||
//#define FIXCOEF2 0.5
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define N 21
|
||||
#define S 100
|
||||
#define L 1
|
||||
#define P 1.1
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_atan(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define N 20
|
||||
#define S 100
|
||||
#define L 0
|
||||
#define P 1.54
|
||||
#define MIN 0.0
|
||||
#define MAX 0.708
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_asin(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0
|
||||
#endif
|
||||
@@ -0,0 +1,375 @@
|
||||
// This is part of SLEEF, written by Naoki Shibata. http://shibatch.sourceforge.net
|
||||
|
||||
// Since the original code for simplex algorithm is developed by Haruhiko Okumura and
|
||||
// the code is distributed under the Creative Commons Attribution 4.0 International License,
|
||||
// the contents under this directory are also distributed under the same license.
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
#include <time.h>
|
||||
#include <mpfr.h>
|
||||
|
||||
//#include "sp.h"
|
||||
#include "dp.h"
|
||||
//#include "ld.h"
|
||||
//#include "qp.h"
|
||||
|
||||
#undef VERBOSE
|
||||
|
||||
#define PREC 4096
|
||||
|
||||
#define EPS 1e-50
|
||||
|
||||
#define PREC2 (PREC_TARGET*4)
|
||||
|
||||
#ifndef P
|
||||
#define P 1
|
||||
#endif
|
||||
|
||||
#ifndef Q
|
||||
#define Q 10000
|
||||
#endif
|
||||
|
||||
void mpfr_zinit(mpfr_t m);
|
||||
void regressMinRelError_fr(int n, int m, mpfr_t **x, mpfr_t *result);
|
||||
|
||||
char *mpfrToStr(mpfr_t m) {
|
||||
mpfr_t fra;
|
||||
mpfr_init2(fra, mpfr_get_prec(m));
|
||||
|
||||
mpfr_abs(fra, m, GMP_RNDN);
|
||||
mpfr_exp_t e;
|
||||
char *s = mpfr_get_str(NULL, &e, 10, 0, fra, GMP_RNDN);
|
||||
|
||||
char *ret = malloc(strlen(s) + 20);
|
||||
|
||||
if (mpfr_sgn(m) == -1) ret[0] = '-'; else ret[0] = '+';
|
||||
ret[1] = '0';
|
||||
ret[2] = '.';
|
||||
|
||||
strcpy(&ret[3], s);
|
||||
mpfr_free_str(s);
|
||||
|
||||
char estr[10];
|
||||
sprintf(estr, "e%+d", (int)e);
|
||||
strcat(ret, estr);
|
||||
|
||||
mpfr_clears(fra, NULL);
|
||||
return ret;
|
||||
}
|
||||
|
||||
double countULP(mpfr_t d, mpfr_t c) {
|
||||
mpfr_t fry, frw;
|
||||
mpfr_inits(fry, frw, NULL);
|
||||
|
||||
double c2 = mpfr_get_d(c, GMP_RNDN);
|
||||
if (c2 == 0 && mpfr_cmp_d(d, 0) != 0) return 10000;
|
||||
|
||||
long e;
|
||||
mpfr_get_d_2exp(&e, c, GMP_RNDN);
|
||||
mpfr_set_ui_2exp(frw, 1, e-PREC_TARGET, GMP_RNDN);
|
||||
|
||||
mpfr_sub(fry, d, c, GMP_RNDN);
|
||||
mpfr_div(fry, fry, frw, GMP_RNDN);
|
||||
double u = fabs(mpfr_get_d(fry, GMP_RNDN));
|
||||
|
||||
mpfr_clears(fry, frw, NULL);
|
||||
|
||||
return u;
|
||||
}
|
||||
|
||||
void func(mpfr_t s, mpfr_t x, mpfr_t *coef, int n) {
|
||||
mpfr_set_prec(s, PREC_TARGET);
|
||||
mpfr_set(s, coef[n-1], GMP_RNDN);
|
||||
|
||||
for(int i=n-1;i>0;i--) {
|
||||
if (i == L-1) {
|
||||
mpfr_t t;
|
||||
mpfr_init2(t, PREC2);
|
||||
mpfr_set(t, s, GMP_RNDN);
|
||||
mpfr_set_prec(s, PREC2);
|
||||
mpfr_set(s, t, GMP_RNDN);
|
||||
mpfr_clear(t);
|
||||
}
|
||||
mpfr_mul(s, s, x, GMP_RNDN);
|
||||
mpfr_add(s, s, coef[i-1], GMP_RNDN);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int i, j;
|
||||
int n, m;
|
||||
double p;
|
||||
|
||||
mpfr_set_default_prec(PREC);
|
||||
|
||||
#if 0
|
||||
{
|
||||
mpfr_t a, b;
|
||||
mpfr_inits(a, b, NULL);
|
||||
|
||||
float x = M_PI;
|
||||
mpfr_set_d(a, x, GMP_RNDN);
|
||||
x = nexttowardf(x, 100);
|
||||
x = nexttowardf(x, 100);
|
||||
x = nexttowardf(x, 100);
|
||||
mpfr_set_d(b, x, GMP_RNDN);
|
||||
|
||||
printf("%g\n", countULP(b, a));
|
||||
mpfr_clears(a, b, NULL);
|
||||
exit(0);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
{
|
||||
mpfr_t a, b;
|
||||
mpfr_inits(a, b, NULL);
|
||||
|
||||
double x = M_PI;
|
||||
mpfr_set_d(a, x, GMP_RNDN);
|
||||
x = nexttoward(x, 100);
|
||||
x = nexttoward(x, 100);
|
||||
x = nexttoward(x, 100);
|
||||
mpfr_set_d(b, x, GMP_RNDN);
|
||||
|
||||
printf("%g\n", countULP(b, a));
|
||||
mpfr_clears(a, b, NULL);
|
||||
exit(0);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
{
|
||||
mpfr_t a, b;
|
||||
mpfr_inits(a, b, NULL);
|
||||
|
||||
long double x = M_PI;
|
||||
mpfr_set_ld(a, x, GMP_RNDN);
|
||||
x = nexttowardl(x, 100);
|
||||
x = nexttowardl(x, 100);
|
||||
x = nexttowardl(x, 100);
|
||||
mpfr_set_ld(b, x, GMP_RNDN);
|
||||
|
||||
printf("%g\n", countULP(b, a));
|
||||
mpfr_clears(a, b, NULL);
|
||||
exit(0);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
{
|
||||
mpfr_t a, b;
|
||||
mpfr_inits(a, b, NULL);
|
||||
|
||||
__float128 x = M_PI;
|
||||
mpfr_set_f128(a, x, GMP_RNDN);
|
||||
x = nextafterq(x, 100);
|
||||
x = nextafterq(x, 100);
|
||||
x = nextafterq(x, 100);
|
||||
mpfr_set_f128(b, x, GMP_RNDN);
|
||||
|
||||
printf("%g\n", countULP(b, a));
|
||||
mpfr_clears(a, b, NULL);
|
||||
exit(0);
|
||||
}
|
||||
#endif
|
||||
|
||||
m = N+1;
|
||||
n = argc >= 2 ? atoi(argv[1]) : S;
|
||||
p = argc >= 3 ? atof(argv[2]) : P;
|
||||
|
||||
mpfr_t **x, *result; // x[m][n], result[m]
|
||||
|
||||
x = calloc(sizeof(mpfr_t *), m);
|
||||
result = calloc(sizeof(mpfr_t), m);
|
||||
for(i=0;i<m;i++) {
|
||||
x[i] = calloc(sizeof(mpfr_t), n);
|
||||
for(j=0;j<n;j++) mpfr_zinit(x[i][j]);
|
||||
mpfr_zinit(result[i]);
|
||||
}
|
||||
|
||||
mpfr_t fra, frb, frc, frd, fre;
|
||||
|
||||
mpfr_zinit(fra);
|
||||
mpfr_zinit(frb);
|
||||
mpfr_zinit(frc);
|
||||
mpfr_zinit(frd);
|
||||
mpfr_zinit(fre);
|
||||
|
||||
for(i=0;i<n;i++) {
|
||||
double b = 1.0 - pow((double)i / (n-1), p);
|
||||
double a = ((double)MAX - MIN) * b + MIN;
|
||||
mpfr_set_d(fra, a, GMP_RNDN);
|
||||
CFUNC(frd, fra);
|
||||
|
||||
for(j=0;j<m-1;j++) {
|
||||
mpfr_set_d(frb, (double)j*PMUL+PADD, GMP_RNDN);
|
||||
mpfr_pow(x[j][i], frd, frb, GMP_RNDN);
|
||||
//printf("%g ", mpfr_get_d(x[j][i], GMP_RNDN));
|
||||
}
|
||||
|
||||
TARGET(x[m-1][i], fra);
|
||||
//printf(" : %g\n", mpfr_get_d(x[m-1][i], GMP_RNDN));
|
||||
}
|
||||
|
||||
for(i=0;i<m-1;i++) mpfr_set_d(result[i], 0, GMP_RNDN);
|
||||
|
||||
regressMinRelError_fr(n, m-1, x, result);
|
||||
|
||||
for(i=m-2;i>=0;i--) {
|
||||
mpfr_set_prec(fra, PREC_TARGET+4);
|
||||
mpfr_set(fra, result[i], GMP_RNDN);
|
||||
|
||||
char *s;
|
||||
printf("%s, \n", s = mpfrToStr(fra));
|
||||
free(s);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
mpfr_set_prec(fra, PREC);
|
||||
|
||||
double emax = 0;
|
||||
|
||||
for(i=0;i<=n*10;i++) {
|
||||
double a = i * (double)(MAX - MIN) / (n*10.0) + MIN;
|
||||
mpfr_set_d(fra, a, GMP_RNDN);
|
||||
|
||||
CFUNC(frd, fra);
|
||||
|
||||
mpfr_set_d(frb, 0, GMP_RNDN);
|
||||
|
||||
for(j=m-1;j>=0;j--) {
|
||||
mpfr_set_d(frc, (double)j*PMUL+PADD, GMP_RNDN);
|
||||
mpfr_pow(frc, frd, frc, GMP_RNDN);
|
||||
mpfr_mul(frc, frc, result[j], GMP_RNDN);
|
||||
mpfr_add(frb, frb, frc, GMP_RNDN);
|
||||
}
|
||||
|
||||
TARGET(frc, fra);
|
||||
double u = countULP(frb, frc);
|
||||
|
||||
if (u > emax) emax = u;
|
||||
}
|
||||
|
||||
printf("Phase 1 : Max error = %g ULP\n\n", emax);
|
||||
fflush(stdout);
|
||||
|
||||
//
|
||||
|
||||
mpfr_t bestcoef[N], curcoef[N];
|
||||
|
||||
for(i=0;i<N;i++) {
|
||||
mpfr_init2(bestcoef[i], i >= L ? PREC_TARGET : PREC2);
|
||||
mpfr_set(bestcoef[i], result[i], GMP_RNDN);
|
||||
|
||||
mpfr_init2(curcoef[i], i >= L ? PREC_TARGET : PREC2);
|
||||
mpfr_set(curcoef[i], result[i], GMP_RNDN);
|
||||
}
|
||||
|
||||
srandom(time(NULL));
|
||||
|
||||
mpfr_set_default_prec(PREC2);
|
||||
|
||||
static mpfr_t a[Q], v[Q], am[Q], aa[Q];
|
||||
|
||||
for(i=0;i<Q;i++) {
|
||||
mpfr_inits(a[i], v[i], am[i], aa[i], NULL);
|
||||
|
||||
mpfr_set_d(fra, ((double)MAX - (double)MIN) * i / (double)(Q-1) + (double)MIN, GMP_RNDN);
|
||||
|
||||
TARGET(v[i], fra);
|
||||
CFUNC(a[i], fra);
|
||||
mpfr_set_d(frb, PMUL, GMP_RNDN);
|
||||
mpfr_pow(am[i], a[i], frb, GMP_RNDN);
|
||||
mpfr_set_d(frb, PADD, GMP_RNDN);
|
||||
mpfr_pow(aa[i], a[i], frb, GMP_RNDN);
|
||||
mpfr_clears(a[i], v[i], am[i], aa[i], NULL);
|
||||
}
|
||||
|
||||
double best = 1e+100, bestsum = 1e+100, bestworstx;
|
||||
|
||||
for(int k=0;k<10000;k++) {
|
||||
double emax = 0, esum = 0, worstx = 0;
|
||||
|
||||
#ifdef FIXCOEF0
|
||||
mpfr_set_d(curcoef[0], FIXCOEF0, GMP_RNDN);
|
||||
#endif
|
||||
|
||||
#ifdef FIXCOEF1
|
||||
mpfr_set_d(curcoef[1], FIXCOEF1, GMP_RNDN);
|
||||
#endif
|
||||
|
||||
#ifdef FIXCOEF2
|
||||
mpfr_set_d(curcoef[2], FIXCOEF2, GMP_RNDN);
|
||||
#endif
|
||||
|
||||
for(i=0;i<Q;i++) {
|
||||
if (mpfr_cmp_d(v[i], 0) == 0) continue;
|
||||
|
||||
mpfr_set_d(frb, 0, GMP_RNDN);
|
||||
for(j=N-1;j>=0;j--) {
|
||||
mpfr_set_d(frc, (double)j*PMUL+PADD, GMP_RNDN);
|
||||
mpfr_pow(frc, a[i], frc, GMP_RNDN);
|
||||
mpfr_mul(frc, frc, curcoef[j], GMP_RNDN);
|
||||
mpfr_add(frb, frb, frc, GMP_RNDN);
|
||||
}
|
||||
|
||||
double e = countULP(frb, v[i]);
|
||||
|
||||
//printf("c = %.20g, t = %.20g, ulp = %g\n", mpfr_get_d(v[i], GMP_RNDN), mpfr_get_d(frb, GMP_RNDN), e);
|
||||
|
||||
if (!isfinite(e)) continue;
|
||||
if (e > emax) { emax = e; worstx = mpfr_get_d(a[i], GMP_RNDN); }
|
||||
esum += e;
|
||||
}
|
||||
mpfr_set_prec(frb, PREC);
|
||||
|
||||
//printf("emax = %g\n", emax);
|
||||
|
||||
if (emax < best || (emax == best && esum < bestsum)) {
|
||||
for(i=0;i<N;i++) {
|
||||
mpfr_set(bestcoef[i], curcoef[i], GMP_RNDN);
|
||||
}
|
||||
if (best == 1e+100 || k > 10) printf("Max error = %g ULP, Sum error = %g (Max error at %g)\n", emax, esum, worstx);
|
||||
if ((best - emax) / best > 0.0001) k = 0;
|
||||
best = emax;
|
||||
bestsum = esum;
|
||||
bestworstx = worstx;
|
||||
}
|
||||
|
||||
for(i=0;i<N;i++) {
|
||||
mpfr_set(curcoef[i], bestcoef[i], GMP_RNDN);
|
||||
}
|
||||
|
||||
for(i=0;i<N;i++) {
|
||||
static int tab[] = {0, 0, 0, 0, 0, 0, 1, -1};
|
||||
//static int tab[] = {0, 0, 0, 0, 2, -2, 1, -1};
|
||||
int r = tab[random() & 7];
|
||||
if (r > 0) {
|
||||
for(int j=0;j<r;j++) mpfr_nextabove(curcoef[i]);
|
||||
} else if (r < 0) {
|
||||
for(int j=0;j>r;j--) mpfr_nextbelow(curcoef[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
|
||||
for(i=N-1;i>=0;i--) {
|
||||
mpfr_set_prec(fra, i >= L ? PREC_TARGET+4 : PREC2);
|
||||
mpfr_set(fra, bestcoef[i], GMP_RNDN);
|
||||
|
||||
char *s;
|
||||
printf("%s, \n", s = mpfrToStr(fra));
|
||||
free(s);
|
||||
}
|
||||
printf("\nPhase 2 : max error = %g ULP at %g\n", best, bestworstx);
|
||||
|
||||
exit(0);
|
||||
}
|
||||
@@ -0,0 +1,43 @@
|
||||
|
||||
With this small tool, the coefficients for polynomial approximation
|
||||
used in kernels can be generated.
|
||||
|
||||
Usage
|
||||
|
||||
Edit gencoefdp.c. In the beginning of the file, specifications of the
|
||||
parameters for generating coefficients are listed. Enable one of them
|
||||
by changing #if. Then, run make to compile the source code. Run the
|
||||
gencoef, and it will show the generated coefficients in a few minutes.
|
||||
|
||||
|
||||
How it works
|
||||
|
||||
There are two phases of the program.
|
||||
|
||||
The first phase is the regression for minimizing the maximum relative
|
||||
error. This problem can be reduced to a linear programming problem,
|
||||
and the Simplex method is used in this implementation. This requires
|
||||
multi-precision calculation, and the implementation uses the MPFR
|
||||
library to do this. In this phase, only a small number of values
|
||||
(specified by S macro, usually 40 or so) of the function to
|
||||
approximate are sampled within the argument range. The function to
|
||||
approximate can be given by FRFUNC function. Specifying higher values
|
||||
for S does not always give better results.
|
||||
|
||||
The second phase is to optimize the coefficients so that it gives good
|
||||
accuracy with double precision calculation. In this phase, it checks
|
||||
100000 points (specified by Q macro) within the specified argument
|
||||
range to see if the polynomial gives good error bound. In some cases,
|
||||
the last few terms have to be calculated in higher precision in order
|
||||
to achieve 1 ULP overall accuracy, and this implementation can take
|
||||
care of that. The L parameter specifies the number of high precision
|
||||
coefficients.
|
||||
|
||||
In some cases, it is desirable to fix the last few coefficients to
|
||||
values like 1. This can be specified if you define FIXCOEF0
|
||||
macro. This sometimes does not work, however. In this case, you need
|
||||
to specify the function to approximate as shown in the definition for
|
||||
cos.
|
||||
|
||||
Finding a set of good parameters is not a straightforward process. You
|
||||
usually need many iterations of trial and error.
|
||||
@@ -0,0 +1,178 @@
|
||||
// This is part of SLEEF, written by Naoki
|
||||
// Shibata. http://shibatch.sourceforge.net
|
||||
|
||||
// The code in this file is distributed under the Creative Commons
|
||||
// Attribution 4.0 International License.
|
||||
|
||||
#define PREC_TARGET 64
|
||||
|
||||
#if 0
|
||||
#define N 8 // Degree of equation
|
||||
#define S 40 // Number of samples for phase 1
|
||||
#define L 4 // Number of high precision coefficients
|
||||
#define MIN 0.0 // Min argument
|
||||
#define MAX (M_PI/4) // Max argument
|
||||
#define PMUL 2 // The form of polynomial is y = x^(PADD+PMUL*0) + x^(PADD+PMUL*1) + ...
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_sin(ret, a, GMP_RNDN); } // The function to approximate
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0 // Fix coef 0 to 1.0
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define N 10
|
||||
#define S 40
|
||||
#define L 2
|
||||
#define MIN 0.0
|
||||
#define MAX (M_PI/4)
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { // cos(x) - 1
|
||||
mpfr_t x;
|
||||
mpfr_init(x);
|
||||
mpfr_cos(ret, a, GMP_RNDN);
|
||||
mpfr_set_ld(x, 1, GMP_RNDN);
|
||||
mpfr_sub(ret, ret, x, GMP_RNDN);
|
||||
mpfr_clear(x);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
|
||||
#define PMUL 2
|
||||
#define PADD 2
|
||||
#define FIXCOEF0 (-0.5)
|
||||
#endif
|
||||
|
||||
|
||||
#if 0 // for xsincospi4_u05
|
||||
#define N 9
|
||||
#define S 40
|
||||
#define L 2
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) {
|
||||
mpfr_t x, y;
|
||||
mpfr_inits(x, y, NULL);
|
||||
mpfr_const_pi(x, GMP_RNDN);
|
||||
mpfr_set_d(y, 1.0/4, GMP_RNDN);
|
||||
mpfr_mul(x, x, y, GMP_RNDN);
|
||||
mpfr_mul(x, x, a, GMP_RNDN);
|
||||
mpfr_sin(ret, x, GMP_RNDN);
|
||||
mpfr_clears(x, y, NULL);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#endif
|
||||
|
||||
#if 0 // for xsincospi4_u05
|
||||
#define N 9
|
||||
#define S 40
|
||||
#define L 2
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { // cos(x) - 1
|
||||
mpfr_t x, y;
|
||||
mpfr_inits(x, y, NULL);
|
||||
mpfr_const_pi(x, GMP_RNDN);
|
||||
mpfr_set_d(y, 1.0/4, GMP_RNDN);
|
||||
mpfr_mul(x, x, y, GMP_RNDN);
|
||||
mpfr_mul(x, x, a, GMP_RNDN);
|
||||
mpfr_cos(ret, x, GMP_RNDN);
|
||||
mpfr_set_ld(x, 1, GMP_RNDN);
|
||||
mpfr_sub(ret, ret, x, GMP_RNDN);
|
||||
mpfr_clears(x, y, NULL);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define PMUL 2
|
||||
#define PADD 2
|
||||
#endif
|
||||
|
||||
|
||||
#if 0 // for xsincospi4
|
||||
#define N 7
|
||||
#define S 40
|
||||
#define L 0
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) {
|
||||
mpfr_t x, y;
|
||||
mpfr_inits(x, y, NULL);
|
||||
mpfr_const_pi(x, GMP_RNDN);
|
||||
mpfr_set_d(y, 1.0/4, GMP_RNDN);
|
||||
mpfr_mul(x, x, y, GMP_RNDN);
|
||||
mpfr_mul(x, x, a, GMP_RNDN);
|
||||
mpfr_sin(ret, x, GMP_RNDN);
|
||||
mpfr_clears(x, y, NULL);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#endif
|
||||
|
||||
|
||||
#if 0
|
||||
#define N 17
|
||||
#define S 40
|
||||
#define L 0
|
||||
#define MIN 0.0
|
||||
#define MAX (M_PI/4)
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_tan(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define N 9
|
||||
#define S 40
|
||||
#define L 2
|
||||
#define MIN 1 //0.75
|
||||
#define MAX 1.5
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_log(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t frd, mpfr_t fra) {
|
||||
mpfr_t tmp, one;
|
||||
mpfr_inits(tmp, one, NULL);
|
||||
mpfr_set_d(one, 1, GMP_RNDN);
|
||||
mpfr_add(tmp, fra, one, GMP_RNDN);
|
||||
mpfr_sub(frd, fra, one, GMP_RNDN);
|
||||
mpfr_div(frd, frd, tmp, GMP_RNDN);
|
||||
mpfr_clear(tmp, one, NULL);
|
||||
}
|
||||
#define FIXCOEF0 2.0
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define N 12
|
||||
#define S 50
|
||||
#define L 0
|
||||
#define MIN -0.347
|
||||
#define MAX 0.347 // 0.5 log 2
|
||||
#define PMUL 1
|
||||
#define PADD 0
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_exp(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0
|
||||
#define FIXCOEF1 1.0
|
||||
#define FIXCOEF2 0.5
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define N 22
|
||||
#define S 100
|
||||
#define L 2
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_atan(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0
|
||||
#endif
|
||||
@@ -0,0 +1,121 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <math.h>
|
||||
#include <mpfr.h>
|
||||
|
||||
static int64_t doubleToRawLongBits(double d) {
|
||||
union {
|
||||
double f;
|
||||
int64_t i;
|
||||
} tmp;
|
||||
tmp.f = d;
|
||||
return tmp.i;
|
||||
}
|
||||
|
||||
static double longBitsToDouble(int64_t i) {
|
||||
union {
|
||||
double f;
|
||||
int64_t i;
|
||||
} tmp;
|
||||
tmp.i = i;
|
||||
return tmp.f;
|
||||
}
|
||||
|
||||
static double removelsb(double d) {
|
||||
return longBitsToDouble(doubleToRawLongBits(d) & 0xfffffffffffffffeLL);
|
||||
}
|
||||
|
||||
static int32_t floatToRawIntBits(float d) {
|
||||
union {
|
||||
float f;
|
||||
int32_t i;
|
||||
} tmp;
|
||||
tmp.f = d;
|
||||
return tmp.i;
|
||||
}
|
||||
|
||||
static float intBitsToFloat(int32_t i) {
|
||||
union {
|
||||
float f;
|
||||
int32_t i;
|
||||
} tmp;
|
||||
tmp.i = i;
|
||||
return tmp.f;
|
||||
}
|
||||
|
||||
static float removelsbf(float x) {
|
||||
return intBitsToFloat(0xfffffffc & floatToRawIntBits(x));
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
mpfr_set_default_prec(2048);
|
||||
mpfr_t pi, rpi, xrpi, x, y, z, r;
|
||||
mpfr_inits(pi, rpi, xrpi, x, y, z, r, NULL);
|
||||
mpfr_const_pi(pi, GMP_RNDN);
|
||||
mpfr_set_d(x, 0.5, GMP_RNDN);
|
||||
mpfr_div(rpi, x, pi, GMP_RNDN);
|
||||
|
||||
printf("NOEXPORT ALIGNED(64) const double rempitabdp[] = {\n");
|
||||
for(int i=55;i<1024;i++) {
|
||||
int M = i > 700 ? -64 : 0;
|
||||
int ex = i - 53;
|
||||
if (ex < -52) ex = -52;
|
||||
mpfr_set_d(x, ldexp(1, ex), GMP_RNDN);
|
||||
mpfr_mul(y, x, rpi, GMP_RNDN);
|
||||
mpfr_frac(xrpi, y, GMP_RNDN);
|
||||
mpfr_div(xrpi, xrpi, x, GMP_RNDN);
|
||||
|
||||
mpfr_set_exp(xrpi, mpfr_get_exp(xrpi) - M);
|
||||
|
||||
mpfr_set(x, xrpi, GMP_RNDN);
|
||||
|
||||
double rpi0 = removelsb(mpfr_get_d(x, GMP_RNDN));
|
||||
mpfr_set_d(y, rpi0, GMP_RNDN);
|
||||
mpfr_sub(x, x, y, GMP_RNDN);
|
||||
|
||||
double rpi1 = removelsb(mpfr_get_d(x, GMP_RNDN));
|
||||
mpfr_set_d(y, rpi1, GMP_RNDN);
|
||||
mpfr_sub(x, x, y, GMP_RNDN);
|
||||
|
||||
double rpi2 = removelsb(mpfr_get_d(x, GMP_RNDN));
|
||||
mpfr_set_d(y, rpi2, GMP_RNDN);
|
||||
mpfr_sub(x, x, y, GMP_RNDN);
|
||||
|
||||
double rpi3 = mpfr_get_d(x, GMP_RNDN);
|
||||
|
||||
printf(" %.20g, %.20g, %.20g, %.20g,\n", rpi0, rpi1, rpi2, rpi3);
|
||||
}
|
||||
printf("};\n\n");
|
||||
|
||||
printf("NOEXPORT ALIGNED(64) const float rempitabsp[] = {\n");
|
||||
for(int i=25;i<128;i++) {
|
||||
int M = i > 90 ? -64 : 0;
|
||||
int ex = i - 23;
|
||||
mpfr_set_d(x, ldexp(1, ex), GMP_RNDN);
|
||||
mpfr_mul(y, x, rpi, GMP_RNDN);
|
||||
mpfr_frac(xrpi, y, GMP_RNDN);
|
||||
mpfr_div(xrpi, xrpi, x, GMP_RNDN);
|
||||
|
||||
mpfr_set_exp(xrpi, mpfr_get_exp(xrpi) - M);
|
||||
|
||||
mpfr_set(x, xrpi, GMP_RNDN);
|
||||
|
||||
float rpi20 = removelsbf(mpfr_get_d(x, GMP_RNDN));
|
||||
mpfr_set_d(y, rpi20, GMP_RNDN);
|
||||
mpfr_sub(x, x, y, GMP_RNDN);
|
||||
|
||||
float rpi21 = removelsbf(mpfr_get_d(x, GMP_RNDN));
|
||||
mpfr_set_d(y, rpi21, GMP_RNDN);
|
||||
mpfr_sub(x, x, y, GMP_RNDN);
|
||||
|
||||
float rpi22 = removelsbf(mpfr_get_d(x, GMP_RNDN));
|
||||
mpfr_set_d(y, rpi22, GMP_RNDN);
|
||||
mpfr_sub(x, x, y, GMP_RNDN);
|
||||
|
||||
float rpi23 = mpfr_get_d(x, GMP_RNDN);
|
||||
|
||||
printf(" %.10g, %.10g, %.10g, %.10g,\n", rpi20, rpi21, rpi22, rpi23);
|
||||
}
|
||||
printf("};\n");
|
||||
}
|
||||
@@ -0,0 +1,63 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdarg.h>
|
||||
#include <stdint.h>
|
||||
#include <math.h>
|
||||
#include <mpfr.h>
|
||||
#include <quadmath.h>
|
||||
|
||||
#define N 8
|
||||
#define B 8
|
||||
#define NCOL (53-B)
|
||||
#define NROW ((16385+(53-B)*N-106)/NCOL+1)
|
||||
|
||||
static double *rempitabqp = NULL;
|
||||
|
||||
void generateRempitabqp() {
|
||||
rempitabqp = calloc(16385-106+(53-B)*(N+1), sizeof(double));
|
||||
|
||||
int orgprec = mpfr_get_default_prec();
|
||||
mpfr_set_default_prec(18000);
|
||||
|
||||
mpfr_t pi, m, n, o;
|
||||
mpfr_inits(pi, m, n, o, NULL);
|
||||
mpfr_const_pi(pi, GMP_RNDN);
|
||||
|
||||
mpfr_d_div(n, 0.5, pi, GMP_RNDN);
|
||||
|
||||
for(int e=106;e<16385+(53-B)*N;e++) {
|
||||
mpfr_set(m, n, GMP_RNDN);
|
||||
|
||||
mpfr_set_ui_2exp(o, 1, -(113 - e), GMP_RNDN);
|
||||
mpfr_mul(m, m, o, GMP_RNDN);
|
||||
|
||||
mpfr_frac(m, m, GMP_RNDN);
|
||||
|
||||
mpfr_set_ui_2exp(o, 1, (53-B), GMP_RNDN);
|
||||
mpfr_mul(m, m, o, GMP_RNDN);
|
||||
|
||||
mpfr_trunc(m, m);
|
||||
|
||||
mpfr_set_ui_2exp(o, 1, 7-(53-B), GMP_RNDN);
|
||||
mpfr_mul(m, m, o, GMP_RNDN);
|
||||
|
||||
int col = (e - 106) % NCOL;
|
||||
int row = (e - 106) / NCOL;
|
||||
rempitabqp[col * NROW + row] = mpfr_get_d(m, GMP_RNDN);
|
||||
}
|
||||
|
||||
mpfr_clears(pi, m, n, o, NULL);
|
||||
mpfr_set_default_prec(orgprec);
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
generateRempitabqp();
|
||||
|
||||
printf("NOEXPORT const double Sleef_rempitabqp[] = {\n ");
|
||||
for(int i=0;i<16385-106+(53-B)*(N+1);i++) {
|
||||
printf("%.20g, ", rempitabqp[i]);
|
||||
if ((i & 3) == 3) printf("\n ");
|
||||
}
|
||||
printf("\n};\n");
|
||||
}
|
||||
@@ -0,0 +1,161 @@
|
||||
// This is part of SLEEF, written by Naoki
|
||||
// Shibata. http://shibatch.sourceforge.net
|
||||
|
||||
// The code in this file is distributed under the Creative Commons
|
||||
// Attribution 4.0 International License.
|
||||
|
||||
#define PREC_TARGET 113
|
||||
|
||||
//
|
||||
|
||||
#if 0
|
||||
#define N 15 // Degree of equation
|
||||
#define S 150 // Number of samples for phase 1
|
||||
#define L 0 // Number of high precision coefficients
|
||||
#define P 0.37
|
||||
#define MIN 0.0 // Min argument
|
||||
#define MAX (M_PI/2) // Max argument
|
||||
#define PMUL 2 // The form of polynomial is y = x^(PADD+PMUL*0) + x^(PADD+PMUL*1) + ...
|
||||
#define PADD 3
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { // The function to approximate
|
||||
mpfr_sin(ret, a, GMP_RNDN);
|
||||
mpfr_sub(ret, ret, a, GMP_RNDN); // ret = sin(a) - a
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define N 15
|
||||
#define S 150
|
||||
#define L 0
|
||||
#define MIN 0.0
|
||||
#define MAX (M_PI/2)
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { // cos(x) - 1
|
||||
mpfr_t x;
|
||||
mpfr_init(x);
|
||||
mpfr_cos(ret, a, GMP_RNDN);
|
||||
mpfr_set_ld(x, 1, GMP_RNDN);
|
||||
mpfr_sub(ret, ret, x, GMP_RNDN);
|
||||
mpfr_clear(x);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
|
||||
#define PMUL 2
|
||||
#define PADD 2
|
||||
//#define FIXCOEF0 (-0.5)
|
||||
#endif
|
||||
|
||||
|
||||
#if 0 // for xsincospi4_u05
|
||||
#define N 13
|
||||
#define S 150
|
||||
#define L 2
|
||||
#define P 0.9
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) {
|
||||
mpfr_t x, y;
|
||||
mpfr_inits(x, y, NULL);
|
||||
mpfr_const_pi(x, GMP_RNDN);
|
||||
mpfr_set_d(y, 1.0/4, GMP_RNDN);
|
||||
mpfr_mul(x, x, y, GMP_RNDN);
|
||||
mpfr_mul(x, x, a, GMP_RNDN);
|
||||
mpfr_sin(ret, x, GMP_RNDN);
|
||||
mpfr_clears(x, y, NULL);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#endif
|
||||
|
||||
#if 0 // for xsincospi4_u05
|
||||
#define N 13
|
||||
#define S 150
|
||||
#define L 2
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { // cos(x) - 1
|
||||
mpfr_t x, y;
|
||||
mpfr_inits(x, y, NULL);
|
||||
mpfr_const_pi(x, GMP_RNDN);
|
||||
mpfr_set_d(y, 1.0/4, GMP_RNDN);
|
||||
mpfr_mul(x, x, y, GMP_RNDN);
|
||||
mpfr_mul(x, x, a, GMP_RNDN);
|
||||
mpfr_cos(ret, x, GMP_RNDN);
|
||||
mpfr_set_ld(x, 1, GMP_RNDN);
|
||||
mpfr_sub(ret, ret, x, GMP_RNDN);
|
||||
mpfr_clears(x, y, NULL);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define PMUL 2
|
||||
#define PADD 2
|
||||
#endif
|
||||
|
||||
#if 0 // running
|
||||
#define N 31
|
||||
#define S 100
|
||||
#define P 1.7
|
||||
#define L 0
|
||||
#define MIN 0.0
|
||||
#define MAX (M_PI/4)
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_tan(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0
|
||||
#endif
|
||||
|
||||
#if 0 // running
|
||||
#define N 20
|
||||
#define S 110
|
||||
#define L 2
|
||||
#define MIN 1 //0.75
|
||||
#define MAX 1.5
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_log(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t frd, mpfr_t fra) {
|
||||
mpfr_t tmp, one;
|
||||
mpfr_inits(tmp, one, NULL);
|
||||
mpfr_set_d(one, 1, GMP_RNDN);
|
||||
mpfr_add(tmp, fra, one, GMP_RNDN);
|
||||
mpfr_sub(frd, fra, one, GMP_RNDN);
|
||||
mpfr_div(frd, frd, tmp, GMP_RNDN);
|
||||
mpfr_clears(tmp, one, NULL);
|
||||
}
|
||||
#define FIXCOEF0 2.0
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
#define N 22
|
||||
#define S 140
|
||||
#define L 2
|
||||
#define MIN -0.347
|
||||
#define MAX 0.347 // 0.5 log 2
|
||||
#define PMUL 1
|
||||
#define PADD 0
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_exp(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0
|
||||
#define FIXCOEF1 1.0
|
||||
//#define FIXCOEF2 0.5
|
||||
#endif
|
||||
|
||||
#if 0 // running
|
||||
#define N 45
|
||||
#define S 100
|
||||
#define P 1.55
|
||||
#define L 2
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_atan(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0
|
||||
#endif
|
||||
@@ -0,0 +1,459 @@
|
||||
// The original code for simplex algorithm is taken from Haruhiko Okumura's book.
|
||||
// https://oku.edu.mie-u.ac.jp/~okumura/algo/
|
||||
// The code is distributed under the Creative Commons Attribution 4.0 International License.
|
||||
// https://creativecommons.org/licenses/by/4.0/
|
||||
|
||||
// The code is modified by Naoki Shibata to process arbitrary precision numbers.
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
#include <time.h>
|
||||
#include <mpfr.h>
|
||||
|
||||
#define PREC 4096
|
||||
#define EPS 1e-50
|
||||
|
||||
#define OK 0
|
||||
#define MAXIMIZABLE_TO_INFINITY 1
|
||||
#define NOT_FEASIBLE 2
|
||||
#define ERROR (-1)
|
||||
|
||||
#define NOP (-1)
|
||||
#define EQU (0)
|
||||
#define LEQ 1
|
||||
#define GEQ 2
|
||||
|
||||
static int m, n, n1, n2, n3, jmax;
|
||||
static int *col, *row, *nonzero_row, *inequality;
|
||||
static mpfr_t **a, *c, **q, *pivotcolumn;
|
||||
|
||||
static mpfr_t zero, one, eps, minuseps, large;
|
||||
|
||||
void mpfr_zinit(mpfr_t m) {
|
||||
mpfr_init(m);
|
||||
mpfr_set_d(m, 0, GMP_RNDN);
|
||||
}
|
||||
|
||||
static void init(int n0, int m0) {
|
||||
int i, j;
|
||||
|
||||
m = m0; n = n0;
|
||||
|
||||
mpfr_init(zero); mpfr_set_d(zero, 0, GMP_RNDN);
|
||||
mpfr_init(one); mpfr_set_d(one, 1, GMP_RNDN);
|
||||
|
||||
mpfr_init(eps);
|
||||
mpfr_set_d(eps, EPS, GMP_RNDN);
|
||||
|
||||
mpfr_init(minuseps);
|
||||
mpfr_set_d(minuseps, -EPS, GMP_RNDN);
|
||||
|
||||
mpfr_init(large);
|
||||
mpfr_set_d(large, 1.0 / EPS, GMP_RNDN);
|
||||
|
||||
a = malloc(sizeof(mpfr_t *) * (m + 1));
|
||||
for(i=0;i < m+1;i++) {
|
||||
a[i] = malloc(sizeof(mpfr_t) * (n + 1));
|
||||
for(j=0;j < (n+1);j++) {
|
||||
mpfr_zinit(a[i][j]);
|
||||
}
|
||||
}
|
||||
|
||||
q = malloc(sizeof(mpfr_t *) * (m + 1));
|
||||
for(i=0;i < m+1;i++) {
|
||||
q[i] = malloc(sizeof(mpfr_t) * (m + 1));
|
||||
for(j=0;j < m+1;j++) {
|
||||
mpfr_zinit(q[i][j]);
|
||||
}
|
||||
}
|
||||
|
||||
c = malloc(sizeof(mpfr_t) * (n + 1));
|
||||
for(j=0;j < (n+1);j++) {
|
||||
mpfr_zinit(c[j]);
|
||||
}
|
||||
|
||||
pivotcolumn = malloc(sizeof(mpfr_t) * (m + 1));
|
||||
for(j=0;j < (m+1);j++) {
|
||||
mpfr_zinit(pivotcolumn[j]);
|
||||
}
|
||||
|
||||
col = calloc(m+1, sizeof(int));
|
||||
row = calloc(n+2*m+1, sizeof(int));
|
||||
nonzero_row = calloc(n+2*m+1, sizeof(int));
|
||||
inequality = calloc(m+1, sizeof(int));
|
||||
}
|
||||
|
||||
static void dispose() {
|
||||
mpfr_clears(zero, one, eps, minuseps, large, (mpfr_ptr)0);
|
||||
|
||||
int i, j;
|
||||
|
||||
for(i=0;i < m+1;i++) {
|
||||
for(j=0;j < m+1;j++) {
|
||||
mpfr_clear(q[i][j]);
|
||||
}
|
||||
free(q[i]);
|
||||
}
|
||||
free(q);
|
||||
|
||||
for(i=0;i < m+1;i++) {
|
||||
for(j=0;j < n+1;j++) {
|
||||
mpfr_clear(a[i][j]);
|
||||
}
|
||||
free(a[i]);
|
||||
}
|
||||
free(a);
|
||||
|
||||
for(j=0;j < n+1;j++) {
|
||||
mpfr_clear(c[j]);
|
||||
}
|
||||
free(c);
|
||||
|
||||
for(j=0;j < m+1;j++) {
|
||||
mpfr_clear(pivotcolumn[j]);
|
||||
}
|
||||
free(pivotcolumn);
|
||||
|
||||
free(col);
|
||||
free(row);
|
||||
free(nonzero_row);
|
||||
free(inequality);
|
||||
}
|
||||
|
||||
static void prepare() {
|
||||
int i;
|
||||
|
||||
n1 = n;
|
||||
for (i = 1; i <= m; i++)
|
||||
if (inequality[i] == GEQ) {
|
||||
n1++; nonzero_row[n1] = i;
|
||||
}
|
||||
n2 = n1;
|
||||
for (i = 1; i <= m; i++)
|
||||
if (inequality[i] == LEQ) {
|
||||
n2++; col[i] = n2;
|
||||
nonzero_row[n2] = row[n2] = i;
|
||||
}
|
||||
n3 = n2;
|
||||
for (i = 1; i <= m; i++)
|
||||
if (inequality[i] != LEQ) {
|
||||
n3++; col[i] = n3;
|
||||
nonzero_row[n3] = row[n3] = i;
|
||||
}
|
||||
|
||||
for (i = 0; i <= m; i++) {
|
||||
mpfr_set_d(q[i][i], 1, GMP_RNDN);
|
||||
}
|
||||
}
|
||||
|
||||
static void tableau(mpfr_t ret, int i, int j) {
|
||||
int k;
|
||||
|
||||
if (col[i] < 0) { mpfr_set_d(ret, 0, GMP_RNDN); return; }
|
||||
|
||||
if (j <= n) {
|
||||
mpfr_t s;
|
||||
mpfr_zinit(s);
|
||||
mpfr_set_d(s, 0, GMP_RNDN);
|
||||
|
||||
mpfr_t *tab = malloc(sizeof(mpfr_t) * (m + 1));
|
||||
mpfr_ptr *ptab = malloc(sizeof(mpfr_ptr) * (m + 1));
|
||||
for (k = 0; k <= m; k++) {
|
||||
mpfr_zinit(tab[k]);
|
||||
ptab[k] = (mpfr_ptr)&tab[k];
|
||||
mpfr_mul(tab[k], q[i][k], a[k][j], GMP_RNDN);
|
||||
}
|
||||
mpfr_sum(s, ptab, m+1, GMP_RNDN);
|
||||
for (k = 0; k <= m; k++) {
|
||||
mpfr_clear(tab[k]);
|
||||
}
|
||||
free(ptab);
|
||||
free(tab);
|
||||
|
||||
mpfr_set(ret, s, GMP_RNDN);
|
||||
mpfr_clear(s);
|
||||
return;
|
||||
}
|
||||
|
||||
mpfr_set(ret, q[i][nonzero_row[j]], GMP_RNDN);
|
||||
|
||||
if (j <= n1) { mpfr_neg(ret, ret, GMP_RNDN); return; }
|
||||
if (j <= n2 || i != 0) return;
|
||||
|
||||
mpfr_add(ret, ret, one, GMP_RNDN);
|
||||
return;
|
||||
}
|
||||
|
||||
static void pivot(int ipivot, int jpivot) {
|
||||
int i, j;
|
||||
mpfr_t u;
|
||||
|
||||
mpfr_zinit(u);
|
||||
|
||||
mpfr_set(u, pivotcolumn[ipivot], GMP_RNDN);
|
||||
|
||||
for (j = 1; j <= m; j++) {
|
||||
mpfr_div(q[ipivot][j], q[ipivot][j], u, GMP_RNDN);
|
||||
}
|
||||
|
||||
for (i = 0; i <= m; i++)
|
||||
if (i != ipivot) {
|
||||
mpfr_set(u, pivotcolumn[i], GMP_RNDN);
|
||||
|
||||
for (j = 1; j <= m; j++) {
|
||||
mpfr_fms(q[i][j], q[ipivot][j], u, q[i][j], GMP_RNDN);
|
||||
mpfr_neg(q[i][j], q[i][j], GMP_RNDN);
|
||||
}
|
||||
}
|
||||
|
||||
row[col[ipivot]] = 0;
|
||||
|
||||
col[ipivot] = jpivot; row[jpivot] = ipivot;
|
||||
|
||||
mpfr_clear(u);
|
||||
}
|
||||
|
||||
static int minimize() {
|
||||
int i, ipivot, jpivot;
|
||||
mpfr_t t, u;
|
||||
mpfr_inits(t, u, (mpfr_ptr)0);
|
||||
|
||||
for (;;) {
|
||||
for (jpivot = 1; jpivot <= jmax; jpivot++) {
|
||||
if (row[jpivot] == 0) {
|
||||
tableau(pivotcolumn[0], 0, jpivot);
|
||||
if (mpfr_cmp(pivotcolumn[0], minuseps) < 0) break;
|
||||
}
|
||||
}
|
||||
if (jpivot > jmax) {
|
||||
mpfr_clears(t, u, (mpfr_ptr)0);
|
||||
return 1;
|
||||
}
|
||||
|
||||
mpfr_set(u, large, GMP_RNDN);
|
||||
ipivot = 0;
|
||||
for (i = 1; i <= m; i++) {
|
||||
tableau(pivotcolumn[i], i, jpivot);
|
||||
if (mpfr_cmp(pivotcolumn[i], eps) > 0) {
|
||||
tableau(t, i, 0);
|
||||
mpfr_div(t, t, pivotcolumn[i], GMP_RNDN);
|
||||
if (mpfr_cmp(t, u) < 0) { ipivot = i; mpfr_set(u, t, GMP_RNDN); }
|
||||
}
|
||||
}
|
||||
if (ipivot == 0) {
|
||||
mpfr_clears(t, u, (mpfr_ptr)0);
|
||||
return 0; // the objective function can be minimized to -infinite
|
||||
}
|
||||
pivot(ipivot, jpivot);
|
||||
}
|
||||
}
|
||||
|
||||
static int phase1() {
|
||||
int i, j;
|
||||
mpfr_t u;
|
||||
mpfr_zinit(u);
|
||||
|
||||
jmax = n3;
|
||||
for (i = 0; i <= m; i++) {
|
||||
if (col[i] > n2) mpfr_set_d(q[0][i], -1, GMP_RNDN);
|
||||
}
|
||||
|
||||
minimize();
|
||||
|
||||
tableau(u, 0, 0);
|
||||
if (mpfr_cmp(u, minuseps) < 0) {
|
||||
mpfr_clear(u);
|
||||
return 0;
|
||||
}
|
||||
for (i = 1; i <= m; i++) {
|
||||
if (col[i] > n2) {
|
||||
col[i] = -1;
|
||||
}
|
||||
}
|
||||
mpfr_set_d(q[0][0], 1, GMP_RNDN);
|
||||
for (j = 1; j <= m; j++) mpfr_set_d(q[0][j], 0, GMP_RNDN);
|
||||
for (i = 1; i <= m; i++) {
|
||||
if ((j = col[i]) > 0 && j <= n && mpfr_cmp_d(c[j], 0) != 0) {
|
||||
mpfr_set(u, c[j], GMP_RNDN);
|
||||
for (j = 1; j <= m; j++) {
|
||||
mpfr_fms(q[0][j], q[i][j], u, q[0][j], GMP_RNDN);
|
||||
mpfr_neg(q[0][j], q[0][j], GMP_RNDN);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
mpfr_clear(u);
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int phase2() {
|
||||
int j;
|
||||
jmax = n2;
|
||||
for (j = 0; j <= n; j++) {
|
||||
mpfr_set(a[0][j], c[j], GMP_RNDN);
|
||||
}
|
||||
|
||||
return minimize();
|
||||
}
|
||||
|
||||
int solve_fr(mpfr_t *result, int n0, int m0, mpfr_t **a0, int *ineq0, mpfr_t *c0) {
|
||||
int i,j;
|
||||
|
||||
m = m0; // number of inequations
|
||||
n = n0+1; // number of variables
|
||||
|
||||
init(n, m);
|
||||
|
||||
mpfr_t csum;
|
||||
mpfr_zinit(csum);
|
||||
|
||||
for(j=0;j<n0+1;j++) {
|
||||
mpfr_set(c[j], c0[j], GMP_RNDN);
|
||||
}
|
||||
|
||||
for(j=1;j<n0+1;j++) {
|
||||
mpfr_add(csum, csum, c0[j], GMP_RNDN);
|
||||
}
|
||||
|
||||
mpfr_set(c[n], csum, GMP_RNDN);
|
||||
mpfr_neg(c[n], c[n], GMP_RNDN);
|
||||
|
||||
for(i=0;i<m;i++) {
|
||||
mpfr_set_d(csum, 0, GMP_RNDN);
|
||||
|
||||
for(j=0;j<n0+1;j++) mpfr_set(a[i+1][j], a0[i][j], GMP_RNDN);
|
||||
mpfr_neg(a[i+1][0], a[i+1][0], GMP_RNDN);
|
||||
|
||||
for(j=1;j<n0+1;j++) {
|
||||
mpfr_add(csum, csum, a0[i][j], GMP_RNDN);
|
||||
}
|
||||
|
||||
mpfr_set(a[i+1][n], csum, GMP_RNDN);
|
||||
mpfr_neg(a[i+1][n], a[i+1][n], GMP_RNDN);
|
||||
inequality[i+1] = ineq0[i];
|
||||
|
||||
if (mpfr_cmp_d(a[i+1][0], 0) < 0) {
|
||||
if (inequality[i+1] == GEQ) inequality[i+1] = LEQ;
|
||||
else if (inequality[i+1] == LEQ) inequality[i+1] = GEQ;
|
||||
for (j = 0; j <= n; j++) mpfr_neg(a[i+1][j], a[i+1][j], GMP_RNDN);
|
||||
} else if (mpfr_cmp_d(a[i+1][0], 0) == 0 && inequality[i+1] == GEQ) {
|
||||
inequality[i+1] = LEQ;
|
||||
for (j = 1; j <= n; j++) mpfr_neg(a[i+1][j], a[i+1][j], GMP_RNDN);
|
||||
}
|
||||
}
|
||||
|
||||
int p1r = 1;
|
||||
|
||||
prepare();
|
||||
if (n3 != n2) p1r = phase1();
|
||||
|
||||
if (!p1r) {
|
||||
dispose();
|
||||
return NOT_FEASIBLE;
|
||||
}
|
||||
|
||||
int b = phase2();
|
||||
|
||||
mpfr_t *s = calloc(sizeof(mpfr_t), n);
|
||||
for(j=0;j<n;j++) {
|
||||
mpfr_zinit(s[j]);
|
||||
}
|
||||
|
||||
for (j = 1; j < n; j++) {
|
||||
if ((i = row[j]) != 0) {
|
||||
tableau(s[j], i, 0);
|
||||
}
|
||||
}
|
||||
|
||||
mpfr_t cs;
|
||||
mpfr_zinit(cs);
|
||||
if (row[n] != 0) tableau(cs, row[n], 0);
|
||||
|
||||
for (j = 1; j < n; j++) {
|
||||
mpfr_sub(s[j], s[j], cs, GMP_RNDN);
|
||||
}
|
||||
|
||||
for(j=0;j<n;j++) {
|
||||
mpfr_set(result[j], s[j], GMP_RNDN);
|
||||
}
|
||||
|
||||
mpfr_clear(cs);
|
||||
|
||||
for(j=0;j<n;j++) mpfr_clear(s[j]);
|
||||
free(s);
|
||||
|
||||
dispose();
|
||||
|
||||
return b ? OK : MAXIMIZABLE_TO_INFINITY;
|
||||
}
|
||||
|
||||
void regressMinRelError_fr(int n, int m, mpfr_t **x, mpfr_t *result) {
|
||||
int m0 = n * 3, n0 = m + 2 * n, i, j;
|
||||
mpfr_t **a0, *c0, *result0;
|
||||
int in0[m0];
|
||||
|
||||
a0 = malloc(sizeof(mpfr_t *) * m0);
|
||||
for(i=0;i<m0;i++) {
|
||||
a0[i] = calloc(n0+1, sizeof(mpfr_t));
|
||||
for(j=0;j<n0+1;j++) mpfr_zinit(a0[i][j]);
|
||||
}
|
||||
|
||||
c0 = calloc(n0+1, sizeof(mpfr_t));
|
||||
result0 = calloc(n0+1, sizeof(mpfr_t));
|
||||
|
||||
for(j=0;j<n0+1;j++) {
|
||||
mpfr_zinit(c0[j]);
|
||||
mpfr_zinit(result0[j]);
|
||||
}
|
||||
|
||||
for(i=0;i<n;i++) {
|
||||
long double ld = mpfr_get_ld(x[m][i], GMP_RNDN);
|
||||
if (ld < DBL_MIN) ld = 1;
|
||||
|
||||
#if 1
|
||||
mpfr_set_ld(c0[m+i +1], 1.0/fabsl(ld), GMP_RNDN);
|
||||
mpfr_set_ld(c0[m+n+i+1], 1.0/fabsl(ld), GMP_RNDN);
|
||||
#else
|
||||
int e;
|
||||
frexpl(ld, &e);
|
||||
ld = 1.0 / ldexpl(1.0, e);
|
||||
mpfr_set_ld(c0[m+i +1], ld, GMP_RNDN);
|
||||
mpfr_set_ld(c0[m+n+i+1], ld, GMP_RNDN);
|
||||
#endif
|
||||
|
||||
mpfr_set_d(a0[i*3+0][m+i+1], 1, GMP_RNDN);
|
||||
in0[i*3+0] = GEQ;
|
||||
|
||||
mpfr_set_d(a0[i*3+1][m+n+i+1], 1, GMP_RNDN);
|
||||
in0[i*3+1] = GEQ;
|
||||
|
||||
for(j=0;j<m;j++) {
|
||||
mpfr_set(a0[i*3+2][j+1], x[j][i], GMP_RNDN);
|
||||
}
|
||||
|
||||
mpfr_set_d(a0[i*3+2][m+i+1], 1, GMP_RNDN);
|
||||
mpfr_set_d(a0[i*3+2][m+n+i+1], -1, GMP_RNDN);
|
||||
in0[i*3+2] = EQU;
|
||||
mpfr_set(a0[i*3+2][0], x[m][i], GMP_RNDN);
|
||||
mpfr_neg(a0[i*3+2][0], a0[i*3+2][0], GMP_RNDN);
|
||||
}
|
||||
|
||||
int status = solve_fr(result0, n0, m0, a0, in0, c0);
|
||||
|
||||
if (status == NOT_FEASIBLE) {
|
||||
printf("not feasible\n");
|
||||
} else {
|
||||
if (status == MAXIMIZABLE_TO_INFINITY) printf("maximizable to inf\n");
|
||||
}
|
||||
|
||||
for(i=0;i<m;i++) {
|
||||
mpfr_set(result[i], result0[i+1], GMP_RNDN);
|
||||
}
|
||||
|
||||
free(result0);
|
||||
free(c0);
|
||||
}
|
||||
@@ -0,0 +1,159 @@
|
||||
// This is part of SLEEF, written by Naoki
|
||||
// Shibata. http://shibatch.sourceforge.net
|
||||
|
||||
// The code in this file is distributed under the Creative Commons
|
||||
// Attribution 4.0 International License.
|
||||
|
||||
#define PREC_TARGET 24
|
||||
|
||||
#if 1
|
||||
#define N 5 // Degree of equation
|
||||
#define S 81 // Number of samples for phase 1
|
||||
#define L 0 // Number of high precision coefficients
|
||||
#define P 0.37
|
||||
#define MIN 0.0 // Min argument
|
||||
#define MAX (M_PI/2) // Max argument
|
||||
#define PMUL 2 // The form of polynomial is y = x^(PADD+PMUL*0) + x^(PADD+PMUL*1) + ...
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_sin(ret, a, GMP_RNDN); } // The function to approximate
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0 // Fix coef 0 to 1.0
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define N 5
|
||||
#define S 40
|
||||
#define L 0
|
||||
#define MIN 0.0
|
||||
#define MAX (M_PI/2)
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { // cos(x) - 1
|
||||
mpfr_t x;
|
||||
mpfr_init(x);
|
||||
mpfr_cos(ret, a, GMP_RNDN);
|
||||
mpfr_set_ld(x, 1, GMP_RNDN);
|
||||
mpfr_sub(ret, ret, x, GMP_RNDN);
|
||||
mpfr_clear(x);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
|
||||
#define PMUL 2
|
||||
#define PADD 2
|
||||
#define FIXCOEF0 (-0.5)
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
// xsincospi4
|
||||
#define N 5
|
||||
#define S 30
|
||||
#define P 0.69
|
||||
#define L 2
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) {
|
||||
mpfr_t x, y;
|
||||
mpfr_inits(x, y, NULL);
|
||||
mpfr_const_pi(x, GMP_RNDN);
|
||||
mpfr_set_d(y, 1.0/4, GMP_RNDN);
|
||||
mpfr_mul(x, x, y, GMP_RNDN);
|
||||
mpfr_mul(x, x, a, GMP_RNDN);
|
||||
mpfr_sin(ret, x, GMP_RNDN);
|
||||
mpfr_clears(x, y, NULL);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
// xsincospi4
|
||||
#define N 5
|
||||
#define S 60
|
||||
#define P 0.7
|
||||
#define L 1
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
void TARGET(mpfr_t ret, mpfr_t a) {
|
||||
mpfr_t x, y;
|
||||
mpfr_inits(x, y, NULL);
|
||||
mpfr_const_pi(x, GMP_RNDN);
|
||||
mpfr_set_d(y, 1.0/4, GMP_RNDN);
|
||||
mpfr_mul(x, x, y, GMP_RNDN);
|
||||
mpfr_mul(x, x, a, GMP_RNDN);
|
||||
mpfr_cos(ret, x, GMP_RNDN);
|
||||
mpfr_set_ld(x, 1, GMP_RNDN);
|
||||
mpfr_sub(ret, ret, x, GMP_RNDN);
|
||||
mpfr_clears(x, y, NULL);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define PMUL 2
|
||||
#define PADD 2
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define N 7
|
||||
#define S 40
|
||||
#define L 2
|
||||
#define MIN 0.0
|
||||
#define MAX (M_PI/4)
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_tan(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define N 5
|
||||
#define S 40
|
||||
#define L 2
|
||||
#define MIN 1 //0.75
|
||||
#define MAX 1.5
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_log(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t frd, mpfr_t fra) {
|
||||
mpfr_t tmp, one;
|
||||
mpfr_inits(tmp, one, NULL);
|
||||
mpfr_set_d(one, 1, GMP_RNDN);
|
||||
mpfr_add(tmp, fra, one, GMP_RNDN);
|
||||
mpfr_sub(frd, fra, one, GMP_RNDN);
|
||||
mpfr_div(frd, frd, tmp, GMP_RNDN);
|
||||
mpfr_clears(tmp, one, NULL);
|
||||
}
|
||||
#define FIXCOEF0 2.0
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define N 7
|
||||
#define S 50
|
||||
#define L 0
|
||||
#define MIN -0.347
|
||||
#define MAX 0.347 // 0.5 log 2
|
||||
#define PMUL 1
|
||||
#define PADD 0
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_exp(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0
|
||||
#define FIXCOEF1 1.0
|
||||
//#define FIXCOEF2 0.5
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define N 10
|
||||
#define S 100
|
||||
#define L 2
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_atan(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0
|
||||
#endif
|
||||
@@ -0,0 +1,153 @@
|
||||
ICCAVAILABLE := $(shell command -v icc 2> /dev/null)
|
||||
ARCH := $(shell uname -p)
|
||||
|
||||
all :
|
||||
ifndef BUILDDIR
|
||||
@echo
|
||||
@echo Please set the build directory to BUILDDIR environment variable and run make once again.
|
||||
@echo e.g. export BUILDDIR='`pwd`'/../../build
|
||||
@echo
|
||||
else
|
||||
@echo
|
||||
@echo You can start measurement by "'"make measure"'".
|
||||
ifdef ICCAVAILABLE
|
||||
@echo You can start measurement with SVML by "'"make measureSVML"'".
|
||||
endif
|
||||
@echo Then, you can plot the results of measurement by "'"make plot"'".
|
||||
@echo
|
||||
@echo You have to install java and gnuplot to do plotting.
|
||||
@echo Stop all tasks on the computer before starting measurement.
|
||||
@echo
|
||||
endif
|
||||
|
||||
benchsvml128_10.o : benchsvml128.c bench.h
|
||||
-command -v icc >/dev/null 2>&1 && icc benchsvml128.c -Wall -I.. -DSVMLULP=1 -fimf-max-error=1.0 -fimf-domain-exclusion=0 -march=core-avx2 -O0 -lm -c -o benchsvml128_10.o
|
||||
|
||||
benchsvml128_40.o : benchsvml128.c bench.h
|
||||
-command -v icc >/dev/null 2>&1 && icc benchsvml128.c -Wall -I.. -DSVMLULP=4 -fimf-max-error=4.0 -fimf-domain-exclusion=0 -march=core-avx2 -O0 -lm -c -o benchsvml128_40.o
|
||||
|
||||
benchsvml256_10.o : benchsvml256.c bench.h
|
||||
-command -v icc >/dev/null 2>&1 && icc benchsvml256.c -Wall -I.. -DSVMLULP=1 -fimf-max-error=1.0 -fimf-domain-exclusion=0 -march=core-avx2 -O0 -lm -c -o benchsvml256_10.o
|
||||
|
||||
benchsvml256_40.o : benchsvml256.c bench.h
|
||||
-command -v icc >/dev/null 2>&1 && icc benchsvml256.c -Wall -I.. -DSVMLULP=4 -fimf-max-error=4.0 -fimf-domain-exclusion=0 -march=core-avx2 -O0 -lm -c -o benchsvml256_40.o
|
||||
|
||||
benchsvml512_10.o : benchsvml512.c bench.h
|
||||
-command -v icc >/dev/null 2>&1 && icc benchsvml512.c -Wall -I.. -DSVMLULP=1 -fimf-max-error=1.0 -fimf-domain-exclusion=0 -xCOMMON-AVX512 -O0 -lm -c -o benchsvml512_10.o
|
||||
|
||||
benchsvml512_40.o : benchsvml512.c bench.h
|
||||
-command -v icc >/dev/null 2>&1 && icc benchsvml512.c -Wall -I.. -DSVMLULP=4 -fimf-max-error=4.0 -fimf-domain-exclusion=0 -xCOMMON-AVX512 -O0 -lm -c -o benchsvml512_40.o
|
||||
|
||||
|
||||
benchsvml_10 : benchsvml.c benchsvml128_10.o benchsvml256_10.o benchsvml512_10.o bench.h
|
||||
-command -v icc >/dev/null 2>&1 && icc benchsvml.c benchsvml128_10.o benchsvml256_10.o benchsvml512_10.o -Wall -I.. -DSVMLULP=1 -fimf-max-error=1.0 -fimf-domain-exclusion=0 -O0 -march=native -lm -o benchsvml_10
|
||||
|
||||
benchsvml_40 : benchsvml.c benchsvml128_40.o benchsvml256_40.o benchsvml512_40.o bench.h
|
||||
-command -v icc >/dev/null 2>&1 && icc benchsvml.c benchsvml128_40.o benchsvml256_40.o benchsvml512_40.o -Wall -I.. -DSVMLULP=4 -fimf-max-error=4.0 -fimf-domain-exclusion=0 -O0 -march=native -lm -o benchsvml_40
|
||||
|
||||
#
|
||||
|
||||
ifeq ($(ARCH),aarch64)
|
||||
|
||||
benchsleef : benchsleef.c benchsleef128.o bench.h
|
||||
$(CC) benchsleef.c benchsleef128.o -Wall -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -lsleef -lm -o benchsleef
|
||||
|
||||
benchsleef128.o : benchsleef128.c bench.h
|
||||
$(CC) benchsleef128.c -Wall -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
|
||||
|
||||
else ifeq ($(ARCH),s390x)
|
||||
|
||||
benchsleef : benchsleef.c benchsleef128.o bench.h
|
||||
$(CC) benchsleef.c benchsleef128.o -Wall -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -lsleef -lm -o benchsleef
|
||||
|
||||
benchsleef128.o : benchsleef128.c bench.h
|
||||
$(CC) benchsleef128.c -Wall -mzvector -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
|
||||
|
||||
else ifeq ($(ARCH),ppc64le)
|
||||
|
||||
benchsleef : benchsleef.c benchsleef128.o bench.h
|
||||
$(CC) benchsleef.c benchsleef128.o -Wall -mcpu=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -lsleef -lm -o benchsleef
|
||||
|
||||
benchsleef128.o : benchsleef128.c bench.h
|
||||
$(CC) benchsleef128.c -Wall -mcpu=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
|
||||
|
||||
else
|
||||
|
||||
benchsleef : benchsleef.c benchsleef128.o benchsleef256.o benchsleef512.o bench.h
|
||||
$(CC) benchsleef.c benchsleef128.o benchsleef256.o benchsleef512.o -Wall -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -lsleef -lm -o benchsleef
|
||||
|
||||
benchsleef128.o : benchsleef128.c bench.h
|
||||
$(CC) benchsleef128.c -Wall -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
|
||||
|
||||
benchsleef256.o : benchsleef256.c bench.h
|
||||
$(CC) benchsleef256.c -Wall -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
|
||||
|
||||
benchsleef512.o : benchsleef512.c bench.h
|
||||
$(CC) benchsleef512.c -Wall -mavx512f -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
|
||||
|
||||
endif
|
||||
|
||||
#
|
||||
|
||||
ProcessData.class : ProcessData.java
|
||||
javac ProcessData.java
|
||||
|
||||
#
|
||||
|
||||
ifndef BUILDDIR
|
||||
measure :
|
||||
@echo
|
||||
@echo Please set the build directory to BUILDDIR environment variable and run make once again.
|
||||
@echo e.g. export BUILDDIR='`pwd`'/../../build
|
||||
@echo
|
||||
else
|
||||
measure : benchsleef
|
||||
chmod +x ./measure.sh
|
||||
LD_LIBRARY_PATH=$(BUILDDIR)/lib ./measure.sh ./benchsleef
|
||||
@echo
|
||||
@echo Now, you can plot the results of measurement by "'"make plot"'".
|
||||
@echo You can do another measurement by "'"make measure"'".
|
||||
ifdef ICCAVAILABLE
|
||||
@echo You can start another measurement with SVML by "'"make measureSVML"'".
|
||||
endif
|
||||
@echo You can start over by "'"make restart"'".
|
||||
@echo
|
||||
endif
|
||||
|
||||
measureSVML : all benchsvml_10 benchsvml_40
|
||||
chmod +x ./measure.sh
|
||||
./measure.sh ./benchsvml_10 ./benchsvml_40
|
||||
@echo
|
||||
@echo Now, you can plot the results of measurement by "'"make plot"'".
|
||||
@echo You can do another measurement by "'"make measure"'".
|
||||
ifdef ICCAVAILABLE
|
||||
@echo You can start another measurement with SVML by "'"make measureSVML"'".
|
||||
endif
|
||||
@echo You can start over by "'"make restart"'".
|
||||
@echo
|
||||
|
||||
plot : ProcessData.class counter.txt
|
||||
java ProcessData *dptrig*.out
|
||||
gnuplot script.out
|
||||
mv output.png trigdp.png
|
||||
java ProcessData *dpnontrig*.out
|
||||
gnuplot script.out
|
||||
mv output.png nontrigdp.png
|
||||
java ProcessData *sptrig*.out
|
||||
gnuplot script.out
|
||||
mv output.png trigsp.png
|
||||
java ProcessData *spnontrig*.out
|
||||
gnuplot script.out
|
||||
mv output.png nontrigsp.png
|
||||
@echo
|
||||
@echo Plotted results are in trigdp.png, nontrigdp.png, trigsp.png and nontrigsp.png.
|
||||
@echo
|
||||
|
||||
clean :
|
||||
rm -f *~ a.out *.so *.so.* *.a *.s *.o
|
||||
rm -rf *.dSYM *.dylib
|
||||
rm -f *.obj *.lib *.dll *.exp *.exe *.stackdump
|
||||
rm -f *.class *.png benchsleef benchsvml_10 benchsvml_40 *.out counter.txt
|
||||
|
||||
restart :
|
||||
rm -f *.out counter.txt
|
||||
@@ -0,0 +1,193 @@
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
public class ProcessData {
|
||||
static final int DP = 64, SP = 32;
|
||||
|
||||
static LinkedHashMap<String, Integer> funcNameOrder = new LinkedHashMap<String, Integer>();
|
||||
|
||||
static class Key {
|
||||
final String funcName;
|
||||
|
||||
final int prec, bits;
|
||||
final ArrayList<Double> range = new ArrayList<Double>();
|
||||
final double ulps;
|
||||
|
||||
Key(String s) {
|
||||
String[] a = s.split(",");
|
||||
|
||||
funcName = a[0].trim();
|
||||
if (funcNameOrder.get(funcName) == null) {
|
||||
funcNameOrder.put(funcName, funcNameOrder.size());
|
||||
}
|
||||
|
||||
prec =
|
||||
a[1].trim().equals("DP") ? DP :
|
||||
a[1].trim().equals("SP") ? SP :
|
||||
0;
|
||||
|
||||
bits = Integer.parseInt(a[2].trim());
|
||||
|
||||
int c;
|
||||
|
||||
for(c = 3;;c++) {
|
||||
if (a[c].trim().endsWith("ulps")) break;
|
||||
range.add(Double.parseDouble(a[c]));
|
||||
}
|
||||
|
||||
ulps = Double.parseDouble(a[c].trim().replace("ulps", ""));
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
int h = funcName.hashCode();
|
||||
h ^= prec ^ bits;
|
||||
return h;
|
||||
}
|
||||
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
Key k = (Key) o;
|
||||
if (funcName.compareTo(k.funcName) != 0) return false;
|
||||
if (prec != k.prec) return false;
|
||||
if (bits != k.bits) return false;
|
||||
if (range.size() != k.range.size()) return false;
|
||||
for(int i=0;i<range.size();i++) {
|
||||
if ((double)range.get(i) != (double)k.range.get(i)) return false;
|
||||
}
|
||||
|
||||
if (ulps != k.ulps) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
String s = funcName + " ";
|
||||
s += prec == DP ? "DP " : "SP ";
|
||||
s += bits + "bit ";
|
||||
s += String.format(" %.0fulp ", ulps);
|
||||
for(int i=0;i<range.size();i+=2) {
|
||||
s += "[" + String.format("%.3g", range.get(i)) + ", " + String.format("%.3g", range.get(i+1)) + "]";
|
||||
if (i + 2 < range.size()) s += " ";
|
||||
}
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
||||
static class KeyComparator implements Comparator<Key> {
|
||||
public int compare(Key d0, Key d1) {
|
||||
if (d0 == d1) return 0;
|
||||
if (d0.prec < d1.prec) return 1;
|
||||
if (d0.prec > d1.prec) return -1;
|
||||
if (d0.ulps > d1.ulps) return 1;
|
||||
if (d0.ulps < d1.ulps) return -1;
|
||||
|
||||
int fc = (int)funcNameOrder.get(d0.funcName) - (int)funcNameOrder.get(d1.funcName);
|
||||
if (fc != 0) return fc;
|
||||
|
||||
if (d0.bits > d1.bits) return 1;
|
||||
if (d0.bits < d1.bits) return -1;
|
||||
|
||||
if (d0.range.size() > d1.range.size()) return 1;
|
||||
if (d0.range.size() < d1.range.size()) return -1;
|
||||
|
||||
for(int i=0;i<d0.range.size();i++) {
|
||||
if (d0.range.get(i) > d1.range.get(i)) return 1;
|
||||
if (d0.range.get(i) < d1.range.get(i)) return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
LinkedHashMap<Key, LinkedHashMap<String, Double>> allData = new LinkedHashMap<Key, LinkedHashMap<String, Double>>();
|
||||
TreeSet<Key> allKeys = new TreeSet<Key>(new KeyComparator());
|
||||
LinkedHashSet<String> allColumnTitles = new LinkedHashSet<String>();
|
||||
double maximum = 0;
|
||||
|
||||
for(int i=0;i<args.length;i++) {
|
||||
LineNumberReader lnr = new LineNumberReader(new FileReader(args[i]));
|
||||
|
||||
String columnTitle = lnr.readLine();
|
||||
allColumnTitles.add(columnTitle);
|
||||
|
||||
for(;;) {
|
||||
String s = lnr.readLine();
|
||||
if (s == null) break;
|
||||
|
||||
Key key = new Key(s);
|
||||
allKeys.add(key);
|
||||
|
||||
LinkedHashMap<String, Double> v = allData.get(key);
|
||||
if (v == null) {
|
||||
v = new LinkedHashMap<String, Double>();
|
||||
allData.put(key, v);
|
||||
}
|
||||
String[] a = s.split(",");
|
||||
|
||||
double time = Double.parseDouble(a[a.length-1]);
|
||||
v.put(columnTitle, time);
|
||||
maximum = Math.max(maximum, time);
|
||||
}
|
||||
|
||||
lnr.close();
|
||||
}
|
||||
|
||||
PrintStream ps = new PrintStream("data.out");
|
||||
|
||||
for(Key k : allKeys) {
|
||||
ps.print("\"" + k + "\" ");
|
||||
|
||||
LinkedHashMap<String, Double> v = allData.get(k);
|
||||
|
||||
for(String s : allColumnTitles) {
|
||||
Double d = v.get(s);
|
||||
if (d != null) ps.print(d);
|
||||
if (d == null) ps.print("0");
|
||||
ps.print("\t");
|
||||
}
|
||||
ps.println();
|
||||
}
|
||||
|
||||
ps.close();
|
||||
|
||||
ps = new PrintStream("script.out");
|
||||
|
||||
ps.println("set terminal pngcairo size 1280, 800 font \",10\"");
|
||||
ps.println("set output \"output.png\"");
|
||||
|
||||
ps.println("color00 = \"#FF5050\";"); // red
|
||||
ps.println("color01 = \"#0066FF\";"); // blue
|
||||
ps.println("color02 = \"#00FF00\";"); // green
|
||||
ps.println("color03 = \"#FF9900\";"); // orange
|
||||
ps.println("color04 = \"#CC00CC\";"); // purple
|
||||
ps.println("color05 = \"#880000\";"); // brown
|
||||
ps.println("color06 = \"#003300\";"); // dark green
|
||||
ps.println("color07 = \"#000066\";"); // dark blue
|
||||
|
||||
ps.println("set style data histogram");
|
||||
ps.println("set style histogram cluster gap 1");
|
||||
ps.println("set style fill solid 1.00");
|
||||
ps.println("set boxwidth 0.9");
|
||||
ps.println("set xtics format \"\"");
|
||||
ps.println("set xtics rotate by -90");
|
||||
ps.println("set grid ytics");
|
||||
|
||||
ps.println("set ylabel \"Execution time in micro sec.\"");
|
||||
ps.println("set yrange [0:*]");
|
||||
ps.println("set bmargin 24");
|
||||
|
||||
ps.println("set title \"Single execution time in micro sec.\"");
|
||||
ps.print("plot");
|
||||
|
||||
int i = 0;
|
||||
for(String s : allColumnTitles) {
|
||||
ps.print("\"data.out\" using " + (i+2) + ":xtic(1) title \"" + s +
|
||||
"\" linecolor rgb color" + String.format("%02d", i));
|
||||
if (i != allColumnTitles.size()-1) ps.print(", ");
|
||||
i++;
|
||||
}
|
||||
ps.println();
|
||||
|
||||
ps.close();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,58 @@
|
||||
#define NITER1 100000
|
||||
#define NITER2 10000
|
||||
#define NITER (NITER1 * NITER2)
|
||||
|
||||
#define callFuncSLEEF1_1(funcName, name, xmin, xmax, ulp, arg, type) ({ \
|
||||
printf("%s\n", #funcName); \
|
||||
uint64_t t = Sleef_currentTimeMicros(); \
|
||||
for(int j=0;j<NITER2;j++) { \
|
||||
type *p = (type *)(arg); \
|
||||
for(int i=0;i<NITER1;i++) funcName(*p++); \
|
||||
} \
|
||||
fprintf(fp, name ", %.3g, %.3g, %gulps, %g\n", \
|
||||
(double)xmin, (double)xmax, ulp, (double)(Sleef_currentTimeMicros() - t) / NITER); \
|
||||
})
|
||||
|
||||
#define callFuncSLEEF1_2(funcName, name, xmin, xmax, ymin, ymax, ulp, arg1, arg2, type) ({ \
|
||||
printf("%s\n", #funcName); \
|
||||
uint64_t t = Sleef_currentTimeMicros(); \
|
||||
for(int j=0;j<NITER2;j++) { \
|
||||
type *p1 = (type *)(arg1), *p2 = (type *)(arg2); \
|
||||
for(int i=0;i<NITER1;i++) funcName(*p1++, *p2++); \
|
||||
} \
|
||||
fprintf(fp, name ", %.3g, %.3g, %.3g, %.3g, %gulps, %g\n", \
|
||||
(double)xmin, (double)xmax, (double)ymin, (double)ymax, ulp, (double)(Sleef_currentTimeMicros() - t) / NITER); \
|
||||
})
|
||||
|
||||
#define callFuncSVML1_1(funcName, name, xmin, xmax, arg, type) ({ \
|
||||
printf("%s\n", #funcName); \
|
||||
uint64_t t = Sleef_currentTimeMicros(); \
|
||||
for(int j=0;j<NITER2;j++) { \
|
||||
type *p = (type *)(arg); \
|
||||
for(int i=0;i<NITER1;i++) funcName(*p++); \
|
||||
} \
|
||||
fprintf(fp, name ", %.3g, %.3g, %gulps, %g\n", \
|
||||
(double)xmin, (double)xmax, (double)SVMLULP, (double)(Sleef_currentTimeMicros() - t) / NITER); \
|
||||
})
|
||||
|
||||
#define callFuncSVML2_1(funcName, name, xmin, xmax, arg, type) ({ \
|
||||
printf("%s\n", #funcName); \
|
||||
uint64_t t = Sleef_currentTimeMicros(); \
|
||||
for(int j=0;j<NITER2;j++) { \
|
||||
type *p = (type *)(arg), c; \
|
||||
for(int i=0;i<NITER1;i++) funcName(&c, *p++); \
|
||||
} \
|
||||
fprintf(fp, name ", %.3g, %.3g, %gulps, %g\n", \
|
||||
(double)xmin, (double)xmax, (double)SVMLULP, (double)(Sleef_currentTimeMicros() - t) / NITER); \
|
||||
})
|
||||
|
||||
#define callFuncSVML1_2(funcName, name, xmin, xmax, ymin, ymax, arg1, arg2, type) ({ \
|
||||
printf("%s\n", #funcName); \
|
||||
uint64_t t = Sleef_currentTimeMicros(); \
|
||||
for(int j=0;j<NITER2;j++) { \
|
||||
type *p1 = (type *)(arg1), *p2 = (type *)(arg2); \
|
||||
for(int i=0;i<NITER1;i++) funcName(*p1++, *p2++); \
|
||||
} \
|
||||
fprintf(fp, name ", %.3g, %.3g, %.3g, %.3g, %gulps, %g\n", \
|
||||
(double)xmin, (double)xmax, (double)ymin, (double)ymax, (double)SVMLULP, (double)(Sleef_currentTimeMicros() - t) / NITER); \
|
||||
})
|
||||
@@ -0,0 +1,144 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <sleef.h>
|
||||
|
||||
#include "bench.h"
|
||||
|
||||
int veclen = 16;
|
||||
double *abufdp, *bbufdp;
|
||||
float *abufsp, *bbufsp;
|
||||
FILE *fp;
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
void x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) {
|
||||
uint32_t a, b, c, d;
|
||||
__asm__ __volatile__ ("cpuid" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "a" (eax), "c"(ecx));
|
||||
out[0] = a; out[1] = b; out[2] = c; out[3] = d;
|
||||
}
|
||||
|
||||
int cpuSupportsAVX() {
|
||||
int32_t reg[4];
|
||||
x86CpuID(reg, 1, 0);
|
||||
return (reg[2] & (1 << 28)) != 0;
|
||||
}
|
||||
|
||||
int cpuSupportsAVX512F() {
|
||||
int32_t reg[4];
|
||||
x86CpuID(reg, 7, 0);
|
||||
return (reg[1] & (1 << 16)) != 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
void fillDP(double *buf, double min, double max) {
|
||||
for(int i=0;i<NITER1*veclen;i++) {
|
||||
double r = ((double)random() + RAND_MAX * (double)random()) / (RAND_MAX * (double)RAND_MAX);
|
||||
buf[i] = r * (max - min) + min;
|
||||
}
|
||||
}
|
||||
|
||||
void fillSP(float *buf, double min, double max) {
|
||||
for(int i=0;i<NITER1*veclen;i++) {
|
||||
double r = ((double)random() + RAND_MAX * (double)random()) / (RAND_MAX * (double)RAND_MAX);
|
||||
buf[i] = r * (max - min) + min;
|
||||
}
|
||||
}
|
||||
|
||||
void benchSleef128_DPTrig();
|
||||
void benchSleef256_DPTrig();
|
||||
void benchSleef512_DPTrig();
|
||||
void benchSleef128_DPNontrig();
|
||||
void benchSleef256_DPNontrig();
|
||||
void benchSleef512_DPNontrig();
|
||||
void benchSleef128_SPTrig();
|
||||
void benchSleef256_SPTrig();
|
||||
void benchSleef512_SPTrig();
|
||||
void benchSleef128_SPNontrig();
|
||||
void benchSleef256_SPNontrig();
|
||||
void benchSleef512_SPNontrig();
|
||||
|
||||
//
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
char *columnTitle = "SLEEF", *fnBase = "sleef";
|
||||
char fn[1024];
|
||||
|
||||
if (argc != 1) columnTitle = argv[1];
|
||||
if (argc >= 3) fnBase = argv[2];
|
||||
|
||||
srandom(time(NULL));
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
int do128bit = 1;
|
||||
int do256bit = cpuSupportsAVX();
|
||||
int do512bit = cpuSupportsAVX512F();
|
||||
#elif defined(__ARM_NEON) || defined(__VSX__) || defined(__VX__)
|
||||
int do128bit = 1;
|
||||
#else
|
||||
#error Unsupported architecture
|
||||
#endif
|
||||
|
||||
posix_memalign((void **)&abufdp, veclen*sizeof(double), NITER1*veclen*sizeof(double));
|
||||
posix_memalign((void **)&bbufdp, veclen*sizeof(double), NITER1*veclen*sizeof(double));
|
||||
|
||||
abufsp = (float *)abufdp;
|
||||
bbufsp = (float *)bbufdp;
|
||||
|
||||
sprintf(fn, "%sdptrig.out", fnBase);
|
||||
fp = fopen(fn, "w");
|
||||
fprintf(fp, "%s\n", columnTitle);
|
||||
|
||||
if (do128bit) benchSleef128_DPTrig();
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
if (do256bit) benchSleef256_DPTrig();
|
||||
if (do512bit) benchSleef512_DPTrig();
|
||||
#endif
|
||||
|
||||
fclose(fp);
|
||||
|
||||
sprintf(fn, "%sdpnontrig.out", fnBase);
|
||||
fp = fopen(fn, "w");
|
||||
fprintf(fp, "%s\n", columnTitle);
|
||||
|
||||
if (do128bit) benchSleef128_DPNontrig();
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
if (do256bit) benchSleef256_DPNontrig();
|
||||
if (do512bit) benchSleef512_DPNontrig();
|
||||
#endif
|
||||
|
||||
fclose(fp);
|
||||
|
||||
sprintf(fn, "%ssptrig.out", fnBase);
|
||||
fp = fopen(fn, "w");
|
||||
fprintf(fp, "%s\n", columnTitle);
|
||||
|
||||
if (do128bit) benchSleef128_SPTrig();
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
if (do256bit) benchSleef256_SPTrig();
|
||||
if (do512bit) benchSleef512_SPTrig();
|
||||
#endif
|
||||
|
||||
fclose(fp);
|
||||
|
||||
sprintf(fn, "%sspnontrig.out", fnBase);
|
||||
fp = fopen(fn, "w");
|
||||
fprintf(fp, "%s\n", columnTitle);
|
||||
|
||||
if (do128bit) benchSleef128_SPNontrig();
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
if (do256bit) benchSleef256_SPNontrig();
|
||||
if (do512bit) benchSleef512_SPNontrig();
|
||||
#endif
|
||||
|
||||
fclose(fp);
|
||||
|
||||
exit(0);
|
||||
}
|
||||
@@ -0,0 +1,195 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <sleef.h>
|
||||
|
||||
void fillDP(double *buf, double min, double max);
|
||||
void fillSP(float *buf, double min, double max);
|
||||
|
||||
extern char x86BrandString[256], versionString[1024];
|
||||
extern int veclen;
|
||||
extern double *abufdp, *bbufdp;
|
||||
extern float *abufsp, *bbufsp;
|
||||
extern FILE *fp;
|
||||
|
||||
#include "bench.h"
|
||||
|
||||
#ifdef __SSE2__
|
||||
#if defined(_MSC_VER)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
typedef __m128d vdouble;
|
||||
typedef __m128 vfloat;
|
||||
#define ENABLED
|
||||
#elif defined(__ARM_NEON)
|
||||
#include <arm_neon.h>
|
||||
typedef float64x2_t vdouble;
|
||||
typedef float32x4_t vfloat;
|
||||
#define ENABLED
|
||||
#elif defined(__VSX__)
|
||||
#include <altivec.h>
|
||||
typedef __vector double vdouble;
|
||||
typedef __vector float vfloat;
|
||||
#define ENABLED
|
||||
#elif defined(__VX__)
|
||||
#include <vecintrin.h>
|
||||
typedef __vector double vdouble;
|
||||
typedef __vector float vfloat;
|
||||
#define ENABLED
|
||||
#endif
|
||||
|
||||
#ifdef ENABLED
|
||||
void benchSleef128_DPTrig() {
|
||||
fillDP(abufdp, 0, 6.28);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind2_u10 , "sin, DP, 128", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd2_u10 , "cos, DP, 128", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand2_u10 , "tan, DP, 128", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd2_u10, "sincos, DP, 128", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind2_u35 , "sin, DP, 128", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd2_u35 , "cos, DP, 128", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand2_u35 , "tan, DP, 128", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd2_u35, "sincos, DP, 128", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+6);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind2_u10 , "sin, DP, 128", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd2_u10 , "cos, DP, 128", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand2_u10 , "tan, DP, 128", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd2_u10, "sincos, DP, 128", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind2_u35 , "sin, DP, 128", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd2_u35 , "cos, DP, 128", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand2_u35 , "tan, DP, 128", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd2_u35, "sincos, DP, 128", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+100);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind2_u10 , "sin, DP, 128", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd2_u10 , "cos, DP, 128", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand2_u10 , "tan, DP, 128", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd2_u10, "sincos, DP, 128", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind2_u35 , "sin, DP, 128", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd2_u35 , "cos, DP, 128", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand2_u35 , "tan, DP, 128", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd2_u35, "sincos, DP, 128", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSleef128_DPNontrig() {
|
||||
fillDP(abufdp, 0, 1e+300);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_logd2_u10 , "log, DP, 128", 0, 1e+300, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_log10d2_u10, "log10, DP, 128", 0, 1e+300, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_log1pd2_u10, "log1p, DP, 128", 0, 1e+300, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_logd2_u35 , "log, DP, 128", 0, 1e+300, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -700, 700);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_expd2_u10 , "exp, DP, 128", -700, 700, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_exp2d2_u10 , "exp2, DP, 128", -700, 700, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_exp10d2_u10, "exp10, DP, 128", -700, 700, 1.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -30, 30);
|
||||
fillDP(bbufdp, -30, 30);
|
||||
|
||||
callFuncSLEEF1_2(Sleef_powd2_u10, "pow, DP, 128", -30, 30, -30, 30, 1.0, abufdp, bbufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -1.0, 1.0);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_asind2_u10, "asin, DP, 128", -1.0, 1.0, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_acosd2_u10, "acos, DP, 128", -1.0, 1.0, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_asind2_u35, "asin, DP, 128", -1.0, 1.0, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_acosd2_u35, "acos, DP, 128", -1.0, 1.0, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -10, 10);
|
||||
fillDP(bbufdp, -10, 10);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_atand2_u10, "atan, DP, 128", -10, 10, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_2(Sleef_atan2d2_u10, "atan2, DP, 128", -10, 10, -10, 10, 1.0, abufdp, bbufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_atand2_u35, "atan, DP, 128", -10, 10, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_2(Sleef_atan2d2_u35, "atan2, DP, 128", -10, 10, -10, 10, 4.0, abufdp, bbufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSleef128_SPTrig() {
|
||||
fillSP(abufsp, 0, 6.28);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf4_u10 , "sin, SP, 128", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf4_u10 , "cos, SP, 128", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf4_u10 , "tan, SP, 128", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf4_u10, "sincos, SP, 128", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf4_u35 , "sin, SP, 128", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf4_u35 , "cos, SP, 128", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf4_u35 , "tan, SP, 128", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf4_u35, "sincos, SP, 128", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, 0, 1e+20);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf4_u10 , "sin, SP, 128", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf4_u10 , "cos, SP, 128", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf4_u10 , "tan, SP, 128", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf4_u10, "sincos, SP, 128", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf4_u35 , "sin, SP, 128", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf4_u35 , "cos, SP, 128", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf4_u35 , "tan, SP, 128", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf4_u35, "sincos, SP, 128", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
}
|
||||
|
||||
void benchSleef128_SPNontrig() {
|
||||
fillSP(abufsp, 0, 1e+38);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_logf4_u10 , "log, SP, 128", 0, 1e+38, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_log10f4_u10, "log10, SP, 128", 0, 1e+38, 1.0, abufsp, vfloat);
|
||||
//callFuncSLEEF1_1(Sleef_log1pf4_u10, "log1p, SP, 128", 0, 1e+38, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_logf4_u35 , "log, SP, 128", 0, 1e+38, 4.0, abufsp, vfloat);
|
||||
//callFuncSLEEF1_1(Sleef_log10f4_u35, "log10, SP, 128", 0, 1e+38, 4.0, abufsp, vfloat);
|
||||
//callFuncSLEEF1_1(Sleef_log1pf4_u35, "log1p, SP, 128", 0, 1e+38, 4.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -100, 100);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_expf4_u10 , "exp, SP, 128", -100, 100, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_exp2f4_u10 , "exp2, SP, 128", -100, 100, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_exp10f4_u10, "exp10, SP, 128", -100, 100, 1.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -30, 30);
|
||||
fillSP(bbufsp, -30, 30);
|
||||
|
||||
callFuncSLEEF1_2(Sleef_powf4_u10, "pow, SP, 128", -30, 30, -30, 30, 1.0, abufsp, bbufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -1.0, 1.0);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_asinf4_u10, "asin, SP, 128", -1.0, 1, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_acosf4_u10, "acos, SP, 128", -1.0, 1, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_asinf4_u35, "asin, SP, 128", -1.0, 1.0, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_acosf4_u35, "acos, SP, 128", -1.0, 1.0, 4.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -10, 10);
|
||||
fillSP(bbufsp, -10, 10);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_atanf4_u10, "atan, SP, 128", -10, 10, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_2(Sleef_atan2f4_u10, "atan2, SP, 128", -10, 10, -10, 10, 1.0, abufsp, bbufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_atanf4_u35, "atan, SP, 128", -10, 10, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_2(Sleef_atan2f4_u35, "atan2, SP, 128", -10, 10, -10, 10, 4.0, abufsp, bbufsp, vfloat);
|
||||
}
|
||||
#else // #ifdef ENABLED
|
||||
void benchSleef128_DPTrig() {}
|
||||
void benchSleef128_DPNontrig() {}
|
||||
void benchSleef128_SPTrig() {}
|
||||
void benchSleef128_SPNontrig() {}
|
||||
#endif // #ifdef ENABLED
|
||||
@@ -0,0 +1,181 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <sleef.h>
|
||||
|
||||
void fillDP(double *buf, double min, double max);
|
||||
void fillSP(float *buf, double min, double max);
|
||||
|
||||
extern char x86BrandString[256], versionString[1024];
|
||||
extern int veclen;
|
||||
extern double *abufdp, *bbufdp;
|
||||
extern float *abufsp, *bbufsp;
|
||||
extern FILE *fp;
|
||||
|
||||
#include "bench.h"
|
||||
|
||||
#ifdef __AVX__
|
||||
#if defined(_MSC_VER)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
typedef __m256d vdouble;
|
||||
typedef __m256 vfloat;
|
||||
#define ENABLED
|
||||
#endif
|
||||
|
||||
#ifdef ENABLED
|
||||
void benchSleef256_DPTrig() {
|
||||
fillDP(abufdp, 0, 6.28);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind4_u10 , "sin, DP, 256", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd4_u10 , "cos, DP, 256", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand4_u10 , "tan, DP, 256", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd4_u10, "sincos, DP, 256", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind4_u35 , "sin, DP, 256", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd4_u35 , "cos, DP, 256", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand4_u35 , "tan, DP, 256", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd4_u35, "sincos, DP, 256", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+6);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind4_u10 , "sin, DP, 256", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd4_u10 , "cos, DP, 256", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand4_u10 , "tan, DP, 256", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd4_u10, "sincos, DP, 256", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind4_u35 , "sin, DP, 256", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd4_u35 , "cos, DP, 256", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand4_u35 , "tan, DP, 256", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd4_u35, "sincos, DP, 256", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+100);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind4_u10 , "sin, DP, 256", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd4_u10 , "cos, DP, 256", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand4_u10 , "tan, DP, 256", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd4_u10, "sincos, DP, 256", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind4_u35 , "sin, DP, 256", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd4_u35 , "cos, DP, 256", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand4_u35 , "tan, DP, 256", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd4_u35, "sincos, DP, 256", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSleef256_DPNontrig() {
|
||||
fillDP(abufdp, 0, 1e+300);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_logd4_u10 , "log, DP, 256", 0, 1e+300, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_log10d4_u10, "log10, DP, 256", 0, 1e+300, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_log1pd4_u10, "log1p, DP, 256", 0, 1e+300, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_logd4_u35 , "log, DP, 256", 0, 1e+300, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -700, 700);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_expd4_u10 , "exp, DP, 256", -700, 700, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_exp2d4_u10 , "exp2, DP, 256", -700, 700, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_exp10d4_u10, "exp10, DP, 256", -700, 700, 1.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -30, 30);
|
||||
fillDP(bbufdp, -30, 30);
|
||||
|
||||
callFuncSLEEF1_2(Sleef_powd4_u10, "pow, DP, 256", -30, 30, -30, 30, 1.0, abufdp, bbufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -1.0, 1.0);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_asind4_u10, "asin, DP, 256", -1.0, 1.0, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_acosd4_u10, "acos, DP, 256", -1.0, 1.0, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_asind4_u35, "asin, DP, 256", -1.0, 1.0, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_acosd4_u35, "acos, DP, 256", -1.0, 1.0, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -10, 10);
|
||||
fillDP(bbufdp, -10, 10);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_atand4_u10, "atan, DP, 256", -10, 10, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_2(Sleef_atan2d4_u10, "atan2, DP, 256", -10, 10, -10, 10, 1.0, abufdp, bbufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_atand4_u35, "atan, DP, 256", -10, 10, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_2(Sleef_atan2d4_u35, "atan2, DP, 256", -10, 10, -10, 10, 4.0, abufdp, bbufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSleef256_SPTrig() {
|
||||
fillSP(abufsp, 0, 6.28);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf8_u10 , "sin, SP, 256", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf8_u10 , "cos, SP, 256", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf8_u10 , "tan, SP, 256", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf8_u10, "sincos, SP, 256", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf8_u35 , "sin, SP, 256", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf8_u35 , "cos, SP, 256", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf8_u35 , "tan, SP, 256", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf8_u35, "sincos, SP, 256", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, 0, 1e+20);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf8_u10 , "sin, SP, 256", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf8_u10 , "cos, SP, 256", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf8_u10 , "tan, SP, 256", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf8_u10, "sincos, SP, 256", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf8_u35 , "sin, SP, 256", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf8_u35 , "cos, SP, 256", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf8_u35 , "tan, SP, 256", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf8_u35, "sincos, SP, 256", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
}
|
||||
|
||||
void benchSleef256_SPNontrig() {
|
||||
fillSP(abufsp, 0, 1e+38);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_logf8_u10 , "log, SP, 256", 0, 1e+38, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_log10f8_u10, "log10, SP, 256", 0, 1e+38, 1.0, abufsp, vfloat);
|
||||
//callFuncSLEEF1_1(Sleef_log1pf8_u10, "log1p, SP, 256", 0, 1e+38, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_logf8_u35 , "log, SP, 256", 0, 1e+38, 4.0, abufsp, vfloat);
|
||||
//callFuncSLEEF1_1(Sleef_log10f8_u35, "log10, SP, 256", 0, 1e+38, 4.0, abufsp, vfloat);
|
||||
//callFuncSLEEF1_1(Sleef_log1pf8_u35, "log1p, SP, 256", 0, 1e+38, 4.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -100, 100);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_expf8_u10 , "exp, SP, 256", -100, 100, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_exp2f8_u10 , "exp2, SP, 256", -100, 100, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_exp10f8_u10, "exp10, SP, 256", -100, 100, 1.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -30, 30);
|
||||
fillSP(bbufsp, -30, 30);
|
||||
|
||||
callFuncSLEEF1_2(Sleef_powf8_u10, "pow, SP, 256", -30, 30, -30, 30, 1.0, abufsp, bbufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -1.0, 1.0);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_asinf8_u10, "asin, SP, 256", -1.0, 1, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_acosf8_u10, "acos, SP, 256", -1.0, 1, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_asinf8_u35, "asin, SP, 256", -1.0, 1.0, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_acosf8_u35, "acos, SP, 256", -1.0, 1.0, 4.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -10, 10);
|
||||
fillSP(bbufsp, -10, 10);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_atanf8_u10, "atan, SP, 256", -10, 10, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_2(Sleef_atan2f8_u10, "atan2, SP, 256", -10, 10, -10, 10, 1.0, abufsp, bbufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_atanf8_u35, "atan, SP, 256", -10, 10, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_2(Sleef_atan2f8_u35, "atan2, SP, 256", -10, 10, -10, 10, 4.0, abufsp, bbufsp, vfloat);
|
||||
}
|
||||
#else // #ifdef ENABLED
|
||||
void zeroupper256() {}
|
||||
void benchSleef256_DPTrig() {}
|
||||
void benchSleef256_DPNontrig() {}
|
||||
void benchSleef256_SPTrig() {}
|
||||
void benchSleef256_SPNontrig() {}
|
||||
#endif // #ifdef ENABLED
|
||||
@@ -0,0 +1,180 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <sleef.h>
|
||||
|
||||
void fillDP(double *buf, double min, double max);
|
||||
void fillSP(float *buf, double min, double max);
|
||||
|
||||
extern char x86BrandString[256], versionString[1024];
|
||||
extern int veclen;
|
||||
extern double *abufdp, *bbufdp;
|
||||
extern float *abufsp, *bbufsp;
|
||||
extern FILE *fp;
|
||||
|
||||
#include "bench.h"
|
||||
|
||||
#ifdef __AVX512F__
|
||||
#if defined(_MSC_VER)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
typedef __m512d vdouble;
|
||||
typedef __m512 vfloat;
|
||||
#define ENABLED
|
||||
#endif
|
||||
|
||||
#ifdef ENABLED
|
||||
void benchSleef512_DPTrig() {
|
||||
fillDP(abufdp, 0, 6.28);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind8_u10 , "sin, DP, 512", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd8_u10 , "cos, DP, 512", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand8_u10 , "tan, DP, 512", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd8_u10, "sincos, DP, 512", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind8_u35 , "sin, DP, 512", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd8_u35 , "cos, DP, 512", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand8_u35 , "tan, DP, 512", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd8_u35, "sincos, DP, 512", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+6);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind8_u10 , "sin, DP, 512", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd8_u10 , "cos, DP, 512", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand8_u10 , "tan, DP, 512", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd8_u10, "sincos, DP, 512", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind8_u35 , "sin, DP, 512", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd8_u35 , "cos, DP, 512", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand8_u35 , "tan, DP, 512", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd8_u35, "sincos, DP, 512", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+100);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind8_u10 , "sin, DP, 512", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd8_u10 , "cos, DP, 512", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand8_u10 , "tan, DP, 512", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd8_u10, "sincos, DP, 512", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind8_u35 , "sin, DP, 512", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd8_u35 , "cos, DP, 512", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand8_u35 , "tan, DP, 512", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd8_u35, "sincos, DP, 512", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSleef512_DPNontrig() {
|
||||
fillDP(abufdp, 0, 1e+300);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_logd8_u10 , "log, DP, 512", 0, 1e+300, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_log10d8_u10, "log10, DP, 512", 0, 1e+300, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_log1pd8_u10, "log1p, DP, 512", 0, 1e+300, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_logd8_u35 , "log, DP, 512", 0, 1e+300, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -700, 700);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_expd8_u10 , "exp, DP, 512", -700, 700, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_exp2d8_u10 , "exp2, DP, 512", -700, 700, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_exp10d8_u10, "exp10, DP, 512", -700, 700, 1.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -30, 30);
|
||||
fillDP(bbufdp, -30, 30);
|
||||
|
||||
callFuncSLEEF1_2(Sleef_powd8_u10, "pow, DP, 512", -30, 30, -30, 30, 1.0, abufdp, bbufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -1.0, 1.0);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_asind8_u10, "asin, DP, 512", -1.0, 1.0, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_acosd8_u10, "acos, DP, 512", -1.0, 1.0, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_asind8_u35, "asin, DP, 512", -1.0, 1.0, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_acosd8_u35, "acos, DP, 512", -1.0, 1.0, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -10, 10);
|
||||
fillDP(bbufdp, -10, 10);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_atand8_u10, "atan, DP, 512", -10, 10, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_2(Sleef_atan2d8_u10, "atan2, DP, 512", -10, 10, -10, 10, 1.0, abufdp, bbufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_atand8_u35, "atan, DP, 512", -10, 10, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_2(Sleef_atan2d8_u35, "atan2, DP, 512", -10, 10, -10, 10, 4.0, abufdp, bbufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSleef512_SPTrig() {
|
||||
fillSP(abufsp, 0, 6.28);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf16_u10 , "sin, SP, 512", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf16_u10 , "cos, SP, 512", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf16_u10 , "tan, SP, 512", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf16_u10, "sincos, SP, 512", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf16_u35 , "sin, SP, 512", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf16_u35 , "cos, SP, 512", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf16_u35 , "tan, SP, 512", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf16_u35, "sincos, SP, 512", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, 0, 1e+20);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf16_u10 , "sin, SP, 512", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf16_u10 , "cos, SP, 512", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf16_u10 , "tan, SP, 512", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf16_u10, "sincos, SP, 512", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf16_u35 , "sin, SP, 512", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf16_u35 , "cos, SP, 512", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf16_u35 , "tan, SP, 512", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf16_u35, "sincos, SP, 512", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
}
|
||||
|
||||
void benchSleef512_SPNontrig() {
|
||||
fillSP(abufsp, 0, 1e+38);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_logf16_u10 , "log, SP, 512", 0, 1e+38, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_log10f16_u10, "log10, SP, 512", 0, 1e+38, 1.0, abufsp, vfloat);
|
||||
//callFuncSLEEF1_1(Sleef_log1pf16_u10, "log1p, SP, 512", 0, 1e+38, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_logf16_u35 , "log, SP, 512", 0, 1e+38, 4.0, abufsp, vfloat);
|
||||
//callFuncSLEEF1_1(Sleef_log10f16_u35, "log10, SP, 512", 0, 1e+38, 4.0, abufsp, vfloat);
|
||||
//callFuncSLEEF1_1(Sleef_log1pf16_u35, "log1p, SP, 512", 0, 1e+38, 4.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -100, 100);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_expf16_u10 , "exp, SP, 512", -100, 100, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_exp2f16_u10 , "exp2, SP, 512", -100, 100, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_exp10f16_u10, "exp10, SP, 512", -100, 100, 1.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -30, 30);
|
||||
fillSP(bbufsp, -30, 30);
|
||||
|
||||
callFuncSLEEF1_2(Sleef_powf16_u10, "pow, SP, 512", -30, 30, -30, 30, 1.0, abufsp, bbufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -1.0, 1.0);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_asinf16_u10, "asin, SP, 512", -1.0, 1, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_acosf16_u10, "acos, SP, 512", -1.0, 1, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_asinf16_u35, "asin, SP, 512", -1.0, 1.0, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_acosf16_u35, "acos, SP, 512", -1.0, 1.0, 4.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -10, 10);
|
||||
fillSP(bbufsp, -10, 10);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_atanf16_u10, "atan, SP, 512", -10, 10, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_2(Sleef_atan2f16_u10, "atan2, SP, 512", -10, 10, -10, 10, 1.0, abufsp, bbufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_atanf16_u35, "atan, SP, 512", -10, 10, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_2(Sleef_atan2f16_u35, "atan2, SP, 512", -10, 10, -10, 10, 4.0, abufsp, bbufsp, vfloat);
|
||||
}
|
||||
#else // #ifdef ENABLED
|
||||
void benchSleef512_DPTrig() {}
|
||||
void benchSleef512_DPNontrig() {}
|
||||
void benchSleef512_SPTrig() {}
|
||||
void benchSleef512_SPNontrig() {}
|
||||
#endif // #ifdef ENABLED
|
||||
@@ -0,0 +1,153 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <x86intrin.h>
|
||||
|
||||
#include "bench.h"
|
||||
|
||||
int veclen = 16;
|
||||
int enableLogExp;
|
||||
double *abufdp, *bbufdp;
|
||||
float *abufsp, *bbufsp;
|
||||
FILE *fp;
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
void x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) {
|
||||
uint32_t a, b, c, d;
|
||||
__asm__ __volatile__ ("cpuid" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "a" (eax), "c"(ecx));
|
||||
out[0] = a; out[1] = b; out[2] = c; out[3] = d;
|
||||
}
|
||||
|
||||
int cpuSupportsAVX() {
|
||||
int32_t reg[4];
|
||||
x86CpuID(reg, 1, 0);
|
||||
return (reg[2] & (1 << 28)) != 0;
|
||||
}
|
||||
|
||||
int cpuSupportsAVX512F() {
|
||||
int32_t reg[4];
|
||||
x86CpuID(reg, 7, 0);
|
||||
return (reg[1] & (1 << 16)) != 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
uint64_t Sleef_currentTimeMicros() {
|
||||
struct timespec tp;
|
||||
clock_gettime(CLOCK_MONOTONIC, &tp);
|
||||
return (uint64_t)tp.tv_sec * 1000000LL + ((uint64_t)tp.tv_nsec/1000);
|
||||
}
|
||||
|
||||
void fillDP(double *buf, double min, double max) {
|
||||
for(int i=0;i<NITER1*veclen;i++) {
|
||||
double r = ((double)random() + RAND_MAX * (double)random()) / (RAND_MAX * (double)RAND_MAX);
|
||||
buf[i] = r * (max - min) + min;
|
||||
}
|
||||
}
|
||||
|
||||
void fillSP(float *buf, double min, double max) {
|
||||
for(int i=0;i<NITER1*veclen;i++) {
|
||||
double r = ((double)random() + RAND_MAX * (double)random()) / (RAND_MAX * (double)RAND_MAX);
|
||||
buf[i] = r * (max - min) + min;
|
||||
}
|
||||
}
|
||||
|
||||
void zeroupper256();
|
||||
void benchSVML128_DPTrig();
|
||||
void benchSVML256_DPTrig();
|
||||
void benchSVML512_DPTrig();
|
||||
void benchSVML128_DPNontrig();
|
||||
void benchSVML256_DPNontrig();
|
||||
void benchSVML512_DPNontrig();
|
||||
void benchSVML128_SPTrig();
|
||||
void benchSVML256_SPTrig();
|
||||
void benchSVML512_SPTrig();
|
||||
void benchSVML128_SPNontrig();
|
||||
void benchSVML256_SPNontrig();
|
||||
void benchSVML512_SPNontrig();
|
||||
|
||||
//
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
char *columnTitle = "SVML", *fnBase = "svml";
|
||||
char fn[1024];
|
||||
|
||||
if (argc != 1) columnTitle = argv[1];
|
||||
if (argc >= 3) fnBase = argv[2];
|
||||
|
||||
srandom(time(NULL));
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
int do128bit = 1;
|
||||
int do256bit = cpuSupportsAVX();
|
||||
int do512bit = cpuSupportsAVX512F();
|
||||
#elif defined(__ARM_NEON)
|
||||
int do128bit = 1;
|
||||
int do256bit = 0;
|
||||
int do512bit = 0;
|
||||
#else
|
||||
#error Unsupported architecture
|
||||
#endif
|
||||
|
||||
posix_memalign((void **)&abufdp, veclen*sizeof(double), NITER1*veclen*sizeof(double));
|
||||
posix_memalign((void **)&bbufdp, veclen*sizeof(double), NITER1*veclen*sizeof(double));
|
||||
|
||||
abufsp = (float *)abufdp;
|
||||
bbufsp = (float *)bbufdp;
|
||||
|
||||
enableLogExp = SVMLULP < 2;
|
||||
|
||||
sprintf(fn, "%sdptrig%gulp.out", fnBase, (double)SVMLULP);
|
||||
fp = fopen(fn, "w");
|
||||
fprintf(fp, "%s\n", columnTitle);
|
||||
|
||||
if (do256bit) zeroupper256();
|
||||
if (do128bit) benchSVML128_DPTrig();
|
||||
if (do256bit) benchSVML256_DPTrig();
|
||||
if (do512bit) benchSVML512_DPTrig();
|
||||
|
||||
fclose(fp);
|
||||
|
||||
sprintf(fn, "%sdpnontrig%gulp.out", fnBase, (double)SVMLULP);
|
||||
fp = fopen(fn, "w");
|
||||
fprintf(fp, "%s\n", columnTitle);
|
||||
|
||||
if (do256bit) zeroupper256();
|
||||
if (do128bit) benchSVML128_DPNontrig();
|
||||
if (do256bit) benchSVML256_DPNontrig();
|
||||
if (do512bit) benchSVML512_DPNontrig();
|
||||
|
||||
fclose(fp);
|
||||
|
||||
sprintf(fn, "%ssptrig%gulp.out", fnBase, (double)SVMLULP);
|
||||
fp = fopen(fn, "w");
|
||||
fprintf(fp, "%s\n", columnTitle);
|
||||
|
||||
if (do256bit) zeroupper256();
|
||||
if (do128bit) benchSVML128_SPTrig();
|
||||
if (do256bit) benchSVML256_SPTrig();
|
||||
if (do512bit) benchSVML512_SPTrig();
|
||||
|
||||
fclose(fp);
|
||||
|
||||
sprintf(fn, "%sspnontrig%gulp.out", fnBase, (double)SVMLULP);
|
||||
fp = fopen(fn, "w");
|
||||
fprintf(fp, "%s\n", columnTitle);
|
||||
|
||||
if (do256bit) zeroupper256();
|
||||
if (do128bit) benchSVML128_SPNontrig();
|
||||
if (do256bit) benchSVML256_SPNontrig();
|
||||
if (do512bit) benchSVML512_SPNontrig();
|
||||
|
||||
fclose(fp);
|
||||
|
||||
exit(0);
|
||||
}
|
||||
@@ -0,0 +1,144 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <x86intrin.h>
|
||||
|
||||
uint64_t Sleef_currentTimeMicros();
|
||||
void fillDP(double *buf, double min, double max);
|
||||
void fillSP(float *buf, double min, double max);
|
||||
|
||||
extern char x86BrandString[256], versionString[1024];
|
||||
extern int veclen;
|
||||
extern int enableLogExp;
|
||||
extern double *abufdp, *bbufdp;
|
||||
extern float *abufsp, *bbufsp;
|
||||
extern FILE *fp;
|
||||
|
||||
#include "bench.h"
|
||||
|
||||
#ifdef __SSE2__
|
||||
typedef __m128d vdouble;
|
||||
typedef __m128 vfloat;
|
||||
#define ENABLED
|
||||
#endif
|
||||
|
||||
#ifdef ENABLED
|
||||
void benchSVML128_DPTrig() {
|
||||
fillDP(abufdp, 0, 6.28);
|
||||
|
||||
callFuncSVML1_1(_mm_sin_pd , "sin, DP, 128", 0, 6.28, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_cos_pd , "cos, DP, 128", 0, 6.28, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_tan_pd , "tan, DP, 128", 0, 6.28, abufdp, vdouble);
|
||||
callFuncSVML2_1(_mm_sincos_pd, "sincos, DP, 128", 0, 6.28, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+6);
|
||||
|
||||
callFuncSVML1_1(_mm_sin_pd , "sin, DP, 128", 0, 1e+6, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_cos_pd , "cos, DP, 128", 0, 1e+6, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_tan_pd , "tan, DP, 128", 0, 1e+6, abufdp, vdouble);
|
||||
callFuncSVML2_1(_mm_sincos_pd, "sincos, DP, 128", 0, 1e+6, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+100);
|
||||
|
||||
callFuncSVML1_1(_mm_sin_pd , "sin, DP, 128", 0, 1e+100, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_cos_pd , "cos, DP, 128", 0, 1e+100, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_tan_pd , "tan, DP, 128", 0, 1e+100, abufdp, vdouble);
|
||||
callFuncSVML2_1(_mm_sincos_pd, "sincos, DP, 128", 0, 1e+100, abufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSVML128_DPNontrig() {
|
||||
fillDP(abufdp, 0, 1e+300);
|
||||
|
||||
callFuncSVML1_1(_mm_log_pd , "log, DP, 128", 0, 1e+300, abufdp, vdouble);
|
||||
|
||||
if (enableLogExp) {
|
||||
callFuncSVML1_1(_mm_log10_pd, "log10, DP, 128", 0, 1e+300, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_log1p_pd, "log1p, DP, 128", 0, 1e+300, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -700, 700);
|
||||
|
||||
callFuncSVML1_1(_mm_exp_pd , "exp, DP, 128", -700, 700, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_exp2_pd , "exp2, DP, 128", -700, 700, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_exp10_pd, "exp10, DP, 128", -700, 700, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -30, 30);
|
||||
fillDP(bbufdp, -30, 30);
|
||||
|
||||
callFuncSVML1_2(_mm_pow_pd, "pow, DP, 128", -30, 30, -30, 30, abufdp, bbufdp, vdouble);
|
||||
}
|
||||
|
||||
fillDP(abufdp, -1.0, 1.0);
|
||||
|
||||
callFuncSVML1_1(_mm_asin_pd, "asin, DP, 128", -1.0, 1.0, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_acos_pd, "acos, DP, 128", -1.0, 1.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -10, 10);
|
||||
fillDP(bbufdp, -10, 10);
|
||||
|
||||
callFuncSVML1_1(_mm_atan_pd, "atan, DP, 128", -10, 10, abufdp, vdouble);
|
||||
callFuncSVML1_2(_mm_atan2_pd, "atan2, DP, 128", -10, 10, -10, 10, abufdp, bbufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSVML128_SPTrig() {
|
||||
fillSP(abufsp, 0, 6.28);
|
||||
|
||||
callFuncSVML1_1(_mm_sin_ps , "sin, SP, 128", 0, 6.28, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm_cos_ps , "cos, SP, 128", 0, 6.28, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm_tan_ps , "tan, SP, 128", 0, 6.28, abufsp, vfloat);
|
||||
callFuncSVML2_1(_mm_sincos_ps, "sincos, SP, 128", 0, 6.28, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, 0, 1e+20);
|
||||
|
||||
callFuncSVML1_1(_mm_sin_ps , "sin, SP, 128", 0, 1e+20, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm_cos_ps , "cos, SP, 128", 0, 1e+20, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm_tan_ps , "tan, SP, 128", 0, 1e+20, abufsp, vfloat);
|
||||
callFuncSVML2_1(_mm_sincos_ps, "sincos, SP, 128", 0, 1e+20, abufsp, vfloat);
|
||||
}
|
||||
|
||||
void benchSVML128_SPNontrig() {
|
||||
fillSP(abufsp, 0, 1e+38);
|
||||
|
||||
callFuncSVML1_1(_mm_log_ps , "log, SP, 128", 0, 1e+38, abufsp, vfloat);
|
||||
|
||||
if (enableLogExp) {
|
||||
callFuncSVML1_1(_mm_log10_ps, "log10, SP, 128", 0, 1e+38, abufsp, vfloat);
|
||||
//callFuncSVML1_1(_mm_log1p_ps, "log1p, SP, 128", 0, 1e+38, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -100, 100);
|
||||
|
||||
callFuncSVML1_1(_mm_exp_ps , "exp, SP, 128", -100, 100, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm_exp2_ps , "exp2, SP, 128", -100, 100, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm_exp10_ps, "exp10, SP, 128", -100, 100, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -30, 30);
|
||||
fillSP(bbufsp, -30, 30);
|
||||
|
||||
callFuncSVML1_2(_mm_pow_ps, "pow, SP, 128", -30, 30, -30, 30, abufsp, bbufsp, vfloat);
|
||||
}
|
||||
|
||||
fillSP(abufsp, -1.0, 1.0);
|
||||
|
||||
callFuncSVML1_1(_mm_asin_ps, "asin, SP, 128", -1.0, 1, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm_acos_ps, "acos, SP, 128", -1.0, 1, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -10, 10);
|
||||
fillSP(bbufsp, -10, 10);
|
||||
|
||||
callFuncSVML1_1(_mm_atan_ps, "atan, SP, 128", -10, 10, abufsp, vfloat);
|
||||
callFuncSVML1_2(_mm_atan2_ps, "atan2, SP, 128", -10, 10, -10, 10, abufsp, bbufsp, vfloat);
|
||||
}
|
||||
#else // #ifdef ENABLED
|
||||
void benchSVML128_DPTrig() {}
|
||||
void benchSVML128_DPNontrig() {}
|
||||
void benchSVML128_SPTrig() {}
|
||||
void benchSVML128_SPNontrig() {}
|
||||
#endif // #ifdef ENABLED
|
||||
@@ -0,0 +1,147 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <x86intrin.h>
|
||||
|
||||
uint64_t Sleef_currentTimeMicros();
|
||||
void fillDP(double *buf, double min, double max);
|
||||
void fillSP(float *buf, double min, double max);
|
||||
|
||||
extern char x86BrandString[256], versionString[1024];
|
||||
extern int veclen;
|
||||
extern int enableLogExp;
|
||||
extern double *abufdp, *bbufdp;
|
||||
extern float *abufsp, *bbufsp;
|
||||
extern FILE *fp;
|
||||
|
||||
#include "bench.h"
|
||||
|
||||
#ifdef __AVX__
|
||||
typedef __m256d vdouble;
|
||||
typedef __m256 vfloat;
|
||||
#define ENABLED
|
||||
#endif
|
||||
|
||||
#ifdef ENABLED
|
||||
void zeroupper256() { _mm256_zeroupper(); }
|
||||
|
||||
void benchSVML256_DPTrig() {
|
||||
fillDP(abufdp, 0, 6.28);
|
||||
|
||||
callFuncSVML1_1(_mm256_sin_pd , "sin, DP, 256", 0, 6.28, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_cos_pd , "cos, DP, 256", 0, 6.28, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_tan_pd , "tan, DP, 256", 0, 6.28, abufdp, vdouble);
|
||||
callFuncSVML2_1(_mm256_sincos_pd, "sincos, DP, 256", 0, 6.28, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+6);
|
||||
|
||||
callFuncSVML1_1(_mm256_sin_pd , "sin, DP, 256", 0, 1e+6, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_cos_pd , "cos, DP, 256", 0, 1e+6, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_tan_pd , "tan, DP, 256", 0, 1e+6, abufdp, vdouble);
|
||||
callFuncSVML2_1(_mm256_sincos_pd, "sincos, DP, 256", 0, 1e+6, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+100);
|
||||
|
||||
callFuncSVML1_1(_mm256_sin_pd , "sin, DP, 256", 0, 1e+100, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_cos_pd , "cos, DP, 256", 0, 1e+100, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_tan_pd , "tan, DP, 256", 0, 1e+100, abufdp, vdouble);
|
||||
callFuncSVML2_1(_mm256_sincos_pd, "sincos, DP, 256", 0, 1e+100, abufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSVML256_DPNontrig() {
|
||||
fillDP(abufdp, 0, 1e+300);
|
||||
|
||||
callFuncSVML1_1(_mm256_log_pd , "log, DP, 256", 0, 1e+300, abufdp, vdouble);
|
||||
|
||||
if (enableLogExp) {
|
||||
callFuncSVML1_1(_mm256_log10_pd, "log10, DP, 256", 0, 1e+300, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_log1p_pd, "log1p, DP, 256", 0, 1e+300, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -700, 700);
|
||||
|
||||
callFuncSVML1_1(_mm256_exp_pd , "exp, DP, 256", -700, 700, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_exp2_pd , "exp2, DP, 256", -700, 700, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_exp10_pd, "exp10, DP, 256", -700, 700, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -30, 30);
|
||||
fillDP(bbufdp, -30, 30);
|
||||
|
||||
callFuncSVML1_2(_mm256_pow_pd, "pow, DP, 256", -30, 30, -30, 30, abufdp, bbufdp, vdouble);
|
||||
}
|
||||
|
||||
fillDP(abufdp, -1.0, 1.0);
|
||||
|
||||
callFuncSVML1_1(_mm256_asin_pd, "asin, DP, 256", -1.0, 1.0, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_acos_pd, "acos, DP, 256", -1.0, 1.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -10, 10);
|
||||
fillDP(bbufdp, -10, 10);
|
||||
|
||||
callFuncSVML1_1(_mm256_atan_pd, "atan, DP, 256", -10, 10, abufdp, vdouble);
|
||||
callFuncSVML1_2(_mm256_atan2_pd, "atan2, DP, 256", -10, 10, -10, 10, abufdp, bbufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSVML256_SPTrig() {
|
||||
fillSP(abufsp, 0, 6.28);
|
||||
|
||||
callFuncSVML1_1(_mm256_sin_ps , "sin, SP, 256", 0, 6.28, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm256_cos_ps , "cos, SP, 256", 0, 6.28, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm256_tan_ps , "tan, SP, 256", 0, 6.28, abufsp, vfloat);
|
||||
callFuncSVML2_1(_mm256_sincos_ps, "sincos, SP, 256", 0, 6.28, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, 0, 1e+20);
|
||||
|
||||
callFuncSVML1_1(_mm256_sin_ps , "sin, SP, 256", 0, 1e+20, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm256_cos_ps , "cos, SP, 256", 0, 1e+20, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm256_tan_ps , "tan, SP, 256", 0, 1e+20, abufsp, vfloat);
|
||||
callFuncSVML2_1(_mm256_sincos_ps, "sincos, SP, 256", 0, 1e+20, abufsp, vfloat);
|
||||
}
|
||||
|
||||
void benchSVML256_SPNontrig() {
|
||||
fillSP(abufsp, 0, 1e+38);
|
||||
|
||||
callFuncSVML1_1(_mm256_log_ps , "log, SP, 256", 0, 1e+38, abufsp, vfloat);
|
||||
|
||||
if (enableLogExp) {
|
||||
callFuncSVML1_1(_mm256_log10_ps, "log10, SP, 256", 0, 1e+38, abufsp, vfloat);
|
||||
//callFuncSVML1_1(_mm256_log1p_ps, "log1p, SP, 256", 0, 1e+38, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -100, 100);
|
||||
|
||||
callFuncSVML1_1(_mm256_exp_ps , "exp, SP, 256", -100, 100, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm256_exp2_ps , "exp2, SP, 256", -100, 100, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm256_exp10_ps, "exp10, SP, 256", -100, 100, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -30, 30);
|
||||
fillSP(bbufsp, -30, 30);
|
||||
|
||||
callFuncSVML1_2(_mm256_pow_ps, "pow, SP, 256", -30, 30, -30, 30, abufsp, bbufsp, vfloat);
|
||||
}
|
||||
|
||||
fillSP(abufsp, -1.0, 1.0);
|
||||
|
||||
callFuncSVML1_1(_mm256_asin_ps, "asin, SP, 256", -1.0, 1, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm256_acos_ps, "acos, SP, 256", -1.0, 1, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -10, 10);
|
||||
fillSP(bbufsp, -10, 10);
|
||||
|
||||
callFuncSVML1_1(_mm256_atan_ps, "atan, SP, 256", -10, 10, abufsp, vfloat);
|
||||
callFuncSVML1_2(_mm256_atan2_ps, "atan2, SP, 256", -10, 10, -10, 10, abufsp, bbufsp, vfloat);
|
||||
}
|
||||
#else // #ifdef ENABLED
|
||||
void zeroupper256() {}
|
||||
void benchSVML256_DPTrig() {}
|
||||
void benchSVML256_DPNontrig() {}
|
||||
void benchSVML256_SPTrig() {}
|
||||
void benchSVML256_SPNontrig() {}
|
||||
#endif // #ifdef ENABLED
|
||||
@@ -0,0 +1,144 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <x86intrin.h>
|
||||
|
||||
uint64_t Sleef_currentTimeMicros();
|
||||
void fillDP(double *buf, double min, double max);
|
||||
void fillSP(float *buf, double min, double max);
|
||||
|
||||
extern char x86BrandString[256], versionString[1024];
|
||||
extern int veclen;
|
||||
extern int enableLogExp;
|
||||
extern double *abufdp, *bbufdp;
|
||||
extern float *abufsp, *bbufsp;
|
||||
extern FILE *fp;
|
||||
|
||||
#include "bench.h"
|
||||
|
||||
#ifdef __AVX512F__
|
||||
typedef __m512d vdouble;
|
||||
typedef __m512 vfloat;
|
||||
#define ENABLED
|
||||
#endif
|
||||
|
||||
#ifdef ENABLED
|
||||
void benchSVML512_DPTrig() {
|
||||
fillDP(abufdp, 0, 6.28);
|
||||
|
||||
callFuncSVML1_1(_mm512_sin_pd , "sin, DP, 512", 0, 6.28, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_cos_pd , "cos, DP, 512", 0, 6.28, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_tan_pd , "tan, DP, 512", 0, 6.28, abufdp, vdouble);
|
||||
callFuncSVML2_1(_mm512_sincos_pd, "sincos, DP, 512", 0, 6.28, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+6);
|
||||
|
||||
callFuncSVML1_1(_mm512_sin_pd , "sin, DP, 512", 0, 1e+6, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_cos_pd , "cos, DP, 512", 0, 1e+6, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_tan_pd , "tan, DP, 512", 0, 1e+6, abufdp, vdouble);
|
||||
callFuncSVML2_1(_mm512_sincos_pd, "sincos, DP, 512", 0, 1e+6, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+100);
|
||||
|
||||
callFuncSVML1_1(_mm512_sin_pd , "sin, DP, 512", 0, 1e+100, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_cos_pd , "cos, DP, 512", 0, 1e+100, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_tan_pd , "tan, DP, 512", 0, 1e+100, abufdp, vdouble);
|
||||
callFuncSVML2_1(_mm512_sincos_pd, "sincos, DP, 512", 0, 1e+100, abufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSVML512_DPNontrig() {
|
||||
fillDP(abufdp, 0, 1e+300);
|
||||
|
||||
callFuncSVML1_1(_mm512_log_pd , "log, DP, 512", 0, 1e+300, abufdp, vdouble);
|
||||
|
||||
if (enableLogExp) {
|
||||
callFuncSVML1_1(_mm512_log10_pd, "log10, DP, 512", 0, 1e+300, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_log1p_pd, "log1p, DP, 512", 0, 1e+300, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -700, 700);
|
||||
|
||||
callFuncSVML1_1(_mm512_exp_pd , "exp, DP, 512", -700, 700, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_exp2_pd , "exp2, DP, 512", -700, 700, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_exp10_pd, "exp10, DP, 512", -700, 700, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -30, 30);
|
||||
fillDP(bbufdp, -30, 30);
|
||||
|
||||
callFuncSVML1_2(_mm512_pow_pd, "pow, DP, 512", -30, 30, -30, 30, abufdp, bbufdp, vdouble);
|
||||
}
|
||||
|
||||
fillDP(abufdp, -1.0, 1.0);
|
||||
|
||||
callFuncSVML1_1(_mm512_asin_pd, "asin, DP, 512", -1.0, 1.0, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_acos_pd, "acos, DP, 512", -1.0, 1.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -10, 10);
|
||||
fillDP(bbufdp, -10, 10);
|
||||
|
||||
callFuncSVML1_1(_mm512_atan_pd, "atan, DP, 512", -10, 10, abufdp, vdouble);
|
||||
callFuncSVML1_2(_mm512_atan2_pd, "atan2, DP, 512", -10, 10, -10, 10, abufdp, bbufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSVML512_SPTrig() {
|
||||
fillSP(abufsp, 0, 6.28);
|
||||
|
||||
callFuncSVML1_1(_mm512_sin_ps , "sin, SP, 512", 0, 6.28, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm512_cos_ps , "cos, SP, 512", 0, 6.28, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm512_tan_ps , "tan, SP, 512", 0, 6.28, abufsp, vfloat);
|
||||
callFuncSVML2_1(_mm512_sincos_ps, "sincos, SP, 512", 0, 6.28, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, 0, 1e+20);
|
||||
|
||||
callFuncSVML1_1(_mm512_sin_ps , "sin, SP, 512", 0, 1e+20, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm512_cos_ps , "cos, SP, 512", 0, 1e+20, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm512_tan_ps , "tan, SP, 512", 0, 1e+20, abufsp, vfloat);
|
||||
callFuncSVML2_1(_mm512_sincos_ps, "sincos, SP, 512", 0, 1e+20, abufsp, vfloat);
|
||||
}
|
||||
|
||||
void benchSVML512_SPNontrig() {
|
||||
fillSP(abufsp, 0, 1e+38);
|
||||
|
||||
callFuncSVML1_1(_mm512_log_ps , "log, SP, 512", 0, 1e+38, abufsp, vfloat);
|
||||
|
||||
if (enableLogExp) {
|
||||
callFuncSVML1_1(_mm512_log10_ps, "log10, SP, 512", 0, 1e+38, abufsp, vfloat);
|
||||
//callFuncSVML1_1(_mm512_log1p_ps, "log1p, SP, 512", 0, 1e+38, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -100, 100);
|
||||
|
||||
callFuncSVML1_1(_mm512_exp_ps , "exp, SP, 512", -100, 100, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm512_exp2_ps , "exp2, SP, 512", -100, 100, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm512_exp10_ps, "exp10, SP, 512", -100, 100, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -30, 30);
|
||||
fillSP(bbufsp, -30, 30);
|
||||
|
||||
callFuncSVML1_2(_mm512_pow_ps, "pow, SP, 512", -30, 30, -30, 30, abufsp, bbufsp, vfloat);
|
||||
}
|
||||
|
||||
fillSP(abufsp, -1.0, 1.0);
|
||||
|
||||
callFuncSVML1_1(_mm512_asin_ps, "asin, SP, 512", -1.0, 1, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm512_acos_ps, "acos, SP, 512", -1.0, 1, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -10, 10);
|
||||
fillSP(bbufsp, -10, 10);
|
||||
|
||||
callFuncSVML1_1(_mm512_atan_ps, "atan, SP, 512", -10, 10, abufsp, vfloat);
|
||||
callFuncSVML1_2(_mm512_atan2_ps, "atan2, SP, 512", -10, 10, -10, 10, abufsp, bbufsp, vfloat);
|
||||
}
|
||||
#else // #ifdef ENABLED
|
||||
void benchSVML512_DPTrig() {}
|
||||
void benchSVML512_DPNontrig() {}
|
||||
void benchSVML512_SPTrig() {}
|
||||
void benchSVML512_SPNontrig() {}
|
||||
#endif // #ifdef ENABLED
|
||||
@@ -0,0 +1,17 @@
|
||||
#!/bin/sh
|
||||
echo
|
||||
read -p "Enter label of measurement(e.g. My desktop PC) : " label
|
||||
|
||||
if [ -f counter.txt ]
|
||||
then
|
||||
counter=`cat counter.txt`
|
||||
else
|
||||
counter=0
|
||||
fi
|
||||
|
||||
echo Measurement in progress. This may take several minutes.
|
||||
for i in $*; do
|
||||
$i "$label" $counter
|
||||
done
|
||||
counter=$((counter+1))
|
||||
echo $counter > counter.txt
|
||||
@@ -0,0 +1,517 @@
|
||||
|
||||
# Settings
|
||||
|
||||
# TESTER3_DEFINITIONS
|
||||
|
||||
set(TESTER3_DEFINITIONS_SSE2 ATR=cinz_ DPTYPE=__m128d SPTYPE=__m128 DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=sse2)
|
||||
set(TESTER3_DEFINITIONS_SSE4 ATR=cinz_ DPTYPE=__m128d SPTYPE=__m128 DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=sse4)
|
||||
set(TESTER3_DEFINITIONS_AVX2128 ATR=finz_ DPTYPE=__m128d SPTYPE=__m128 DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=avx2128)
|
||||
set(TESTER3_DEFINITIONS_AVX ATR=cinz_ DPTYPE=__m256d SPTYPE=__m256 DPTYPESPEC=d4 SPTYPESPEC=f8 EXTSPEC=avx)
|
||||
set(TESTER3_DEFINITIONS_FMA4 ATR=finz_ DPTYPE=__m256d SPTYPE=__m256 DPTYPESPEC=d4 SPTYPESPEC=f8 EXTSPEC=fma4)
|
||||
set(TESTER3_DEFINITIONS_AVX2 ATR=finz_ DPTYPE=__m256d SPTYPE=__m256 DPTYPESPEC=d4 SPTYPESPEC=f8 EXTSPEC=avx2)
|
||||
set(TESTER3_DEFINITIONS_AVX512F ATR=finz_ DPTYPE=__m512d SPTYPE=__m512 DPTYPESPEC=d8 SPTYPESPEC=f16 EXTSPEC=avx512f)
|
||||
set(TESTER3_DEFINITIONS_AVX512FNOFMA ATR=cinz_ DPTYPE=__m512d SPTYPE=__m512 DPTYPESPEC=d8 SPTYPESPEC=f16 EXTSPEC=avx512fnofma)
|
||||
|
||||
set(TESTER3_DEFINITIONS_ADVSIMD ATR=finz_ DPTYPE=float64x2_t SPTYPE=float32x4_t DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=advsimd)
|
||||
set(TESTER3_DEFINITIONS_ADVSIMDNOFMA ATR=cinz_ DPTYPE=float64x2_t SPTYPE=float32x4_t DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=advsimdnofma)
|
||||
set(TESTER3_DEFINITIONS_SVE ATR=finz_ DPTYPE=svfloat64_t SPTYPE=svfloat32_t DPTYPESPEC=dx SPTYPESPEC=fx EXTSPEC=sve)
|
||||
set(TESTER3_DEFINITIONS_SVENOFMA ATR=cinz_ DPTYPE=svfloat64_t SPTYPE=svfloat32_t DPTYPESPEC=dx SPTYPESPEC=fx EXTSPEC=svenofma)
|
||||
|
||||
set(TESTER3_DEFINITIONS_VSX ATR=finz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vsx)
|
||||
set(TESTER3_DEFINITIONS_VSXNOFMA ATR=cinz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vsxnofma)
|
||||
set(TESTER3_DEFINITIONS_VSX3 ATR=finz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vsx3)
|
||||
set(TESTER3_DEFINITIONS_VSX3NOFMA ATR=cinz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vsx3nofma)
|
||||
|
||||
set(TESTER3_DEFINITIONS_VXE ATR=finz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vxe)
|
||||
set(TESTER3_DEFINITIONS_VXENOFMA ATR=cinz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vxenofma)
|
||||
set(TESTER3_DEFINITIONS_VXE2 ATR=finz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vxe2)
|
||||
set(TESTER3_DEFINITIONS_VXE2NOFMA ATR=cinz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vxe2nofma)
|
||||
|
||||
set(TESTER3_DEFINITIONS_RVVM1 ATR=finz_ DPTYPE=vfloat64m1_t SPTYPE=vfloat32m1_t DPTYPESPEC=dx SPTYPESPEC=fx EXTSPEC=rvvm1 ENABLE_RVVM1)
|
||||
set(TESTER3_DEFINITIONS_RVVM1NOFMA ATR=cinz_ DPTYPE=vfloat64m1_t SPTYPE=vfloat32m1_t DPTYPESPEC=dx SPTYPESPEC=fx EXTSPEC=rvvm1nofma ENABLE_RVVM1)
|
||||
set(TESTER3_DEFINITIONS_RVVM2 ATR=finz_ DPTYPE=vfloat64m2_t SPTYPE=vfloat32m2_t DPTYPESPEC=dx SPTYPESPEC=fx EXTSPEC=rvvm2 ENABLE_RVVM2)
|
||||
set(TESTER3_DEFINITIONS_RVVM2NOFMA ATR=cinz_ DPTYPE=vfloat64m2_t SPTYPE=vfloat32m2_t DPTYPESPEC=dx SPTYPESPEC=fx EXTSPEC=rvvm2nofma ENABLE_RVVM2)
|
||||
|
||||
set(TESTER3_DEFINITIONS_PUREC_SCALAR ATR=cinz_ DPTYPE=double SPTYPE=float DPTYPESPEC=d1 SPTYPESPEC=f1 EXTSPEC=purec)
|
||||
set(TESTER3_DEFINITIONS_PURECFMA_SCALAR ATR=finz_ DPTYPE=double SPTYPE=float DPTYPESPEC=d1 SPTYPESPEC=f1 EXTSPEC=purecfma)
|
||||
|
||||
#
|
||||
|
||||
if (SLEEF_ARCH_X86)
|
||||
set(TEST3_CINZ purec_scalar sse2 sse4 avx avx512fnofma)
|
||||
set(TEST3_FINZ purecfma_scalar avx2128 avx2 avx512f)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
|
||||
set(TEST3_CINZ purec_scalar advsimdnofma svenofma)
|
||||
set(TEST3_FINZ purecfma_scalar advsimd sve)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
|
||||
set(TEST3_CINZ purec_scalar)
|
||||
set(TEST3_FINZ purecfma_scalar)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64")
|
||||
set(TEST3_CINZ purec_scalar vsxnofma vsx3nofma)
|
||||
set(TEST3_FINZ purecfma_scalar vsx vsx3)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x")
|
||||
set(TEST3_CINZ purec_scalar vxenofma vxe2nofma)
|
||||
set(TEST3_FINZ purecfma_scalar vxe vxe2)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
|
||||
set(TEST3_CINZ purec_scalar rvvm1nofma rvvm2nofma)
|
||||
set(TEST3_FINZ purecfma_scalar rvvm1 rvvm2)
|
||||
endif()
|
||||
|
||||
#
|
||||
|
||||
link_directories(${sleef_BINARY_DIR}/lib) # libsleef
|
||||
link_directories(${sleef_BINARY_DIR}/src/common) # common.a
|
||||
include_directories(${sleef_BINARY_DIR}/include) # sleef.h
|
||||
include_directories(${sleef_SOURCE_DIR}/src/libm) # rename.h
|
||||
include_directories(${sleef_BINARY_DIR}/src/libm/include) # rename headers
|
||||
|
||||
if(NOT LIB_MPFR)
|
||||
find_program(TESTER_COMMAND tester)
|
||||
endif(NOT LIB_MPFR)
|
||||
|
||||
if (SLEEF_ENFORCE_TESTER AND NOT LIB_MPFR AND NOT TESTER_COMMAND)
|
||||
message(FATAL_ERROR "SLEEF_ENFORCE_TESTER is specified and tester is not available")
|
||||
endif(SLEEF_ENFORCE_TESTER AND NOT LIB_MPFR AND NOT TESTER_COMMAND)
|
||||
|
||||
find_library(LIBRT rt)
|
||||
if (NOT LIBRT)
|
||||
set(LIBRT "")
|
||||
endif()
|
||||
|
||||
set(CMAKE_C_FLAGS "${ORG_CMAKE_C_FLAGS} ${SLEEF_C_FLAGS} ${FLAGS_NOSTRICTALIASING}")
|
||||
|
||||
set(COMMON_TARGET_PROPERTIES
|
||||
C_STANDARD 99 # -std=gnu99
|
||||
)
|
||||
|
||||
if (SLEEF_ENABLE_LTO)
|
||||
list(APPEND COMMON_TARGET_PROPERTIES INTERPROCEDURAL_OPTIMIZATION TRUE) # -flto
|
||||
endif()
|
||||
|
||||
#
|
||||
|
||||
function(add_test_iut IUT C)
|
||||
if (LIB_MPFR)
|
||||
set(TESTER ${TARGET_TESTER})
|
||||
elseif(TESTER_COMMAND)
|
||||
set(TESTER ${TESTER_COMMAND})
|
||||
endif()
|
||||
# When we are crosscompiling using the mkrename* tools from a native
|
||||
# build, we use the tester executable from the native build.
|
||||
if (CMAKE_CROSSCOMPILING AND NATIVE_BUILD_DIR)
|
||||
set(TESTER ${NATIVE_BUILD_DIR}/bin/${TARGET_TESTER})
|
||||
endif(CMAKE_CROSSCOMPILING AND NATIVE_BUILD_DIR)
|
||||
if (TESTER)
|
||||
if (NOT EMULATOR)
|
||||
if (SDE_COMMAND)
|
||||
set(FLAGS_SDE "--sde" ${SDE_COMMAND})
|
||||
else()
|
||||
set(FLAGS_SDE)
|
||||
endif()
|
||||
if (ARMIE_COMMAND)
|
||||
set(FLAGS_ARMIE ${ARMIE_COMMAND} -msve-vector-bits=${SVE_VECTOR_BITS})
|
||||
else()
|
||||
set(FLAGS_ARMIE)
|
||||
endif()
|
||||
add_test(NAME ${IUT}
|
||||
COMMAND ${TESTER} ${FLAGS_SDE} ${FLAGS_ARMIE} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${IUT}
|
||||
WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
|
||||
set_tests_properties(${IUT} PROPERTIES COST ${C})
|
||||
else()
|
||||
add_test(NAME ${IUT}
|
||||
COMMAND ${TESTER} "--qemu" ${EMULATOR} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${IUT}
|
||||
WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
|
||||
set_tests_properties(${IUT} PROPERTIES COST ${C})
|
||||
endif()
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
# Compile executable 'iut'
|
||||
add_executable(${TARGET_IUT} iut.c testerutil.c)
|
||||
target_compile_definitions(${TARGET_IUT} PRIVATE ${COMMON_TARGET_DEFINITIONS})
|
||||
target_link_libraries(${TARGET_IUT} ${TARGET_LIBSLEEF}
|
||||
${LIBM} ${LIBRT})
|
||||
set_target_properties(${TARGET_IUT} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
add_test_iut(${TARGET_IUT} 1.0)
|
||||
set(IUT_LIST ${TARGET_IUT})
|
||||
|
||||
# Compile executable 'iutcuda'
|
||||
if (SLEEF_BUILD_INLINE_HEADERS AND SED_COMMAND AND CMAKE_CUDA_COMPILER)
|
||||
add_executable(iutcuda iutcuda.cu)
|
||||
set_target_properties(iutcuda PROPERTIES LINKER_LANGUAGE CUDA)
|
||||
target_compile_options(iutcuda PRIVATE "--fmad=false;-Xcompiler;-ffp-contract=off")
|
||||
add_dependencies(iutcuda ${TARGET_INLINE_HEADERS})
|
||||
add_test_iut(iutcuda 20.0)
|
||||
list(APPEND IUT_LIST iutcuda)
|
||||
endif()
|
||||
|
||||
set(IUT_SRC iutsimd.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c testerutil.c)
|
||||
|
||||
# Add vector extension `iut`s
|
||||
macro(test_extension SIMD)
|
||||
if(COMPILER_SUPPORTS_${SIMD})
|
||||
string(TOLOWER ${SIMD} LCSIMD)
|
||||
string(CONCAT TARGET_IUT${SIMD} "iut" ${LCSIMD})
|
||||
|
||||
add_executable(${TARGET_IUT${SIMD}} ${IUT_SRC})
|
||||
target_compile_options(${TARGET_IUT${SIMD}}
|
||||
PRIVATE ${FLAGS_ENABLE_${SIMD}})
|
||||
target_compile_definitions(${TARGET_IUT${SIMD}}
|
||||
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_link_libraries(${TARGET_IUT${SIMD}} ${TARGET_LIBSLEEF}
|
||||
${LIBM} ${LIBRT})
|
||||
if (FORCE_AAVPCS)
|
||||
target_compile_definitions(${TARGET_IUT${SIMD}} PRIVATE ENABLE_AAVPCS=1)
|
||||
endif(FORCE_AAVPCS)
|
||||
|
||||
add_dependencies(${TARGET_IUT${SIMD}} ${TARGET_HEADERS})
|
||||
add_dependencies(${TARGET_IUT${SIMD}} ${TARGET_LIBSLEEF})
|
||||
set_target_properties(${TARGET_IUT${SIMD}} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
if (DEFINED COSTOVERRIDE_${SIMD})
|
||||
add_test_iut(${TARGET_IUT${SIMD}} ${COSTOVERRIDE_${SIMD}})
|
||||
else()
|
||||
add_test_iut(${TARGET_IUT${SIMD}} 1.0)
|
||||
endif()
|
||||
list(APPEND IUT_LIST ${TARGET_IUT${SIMD}})
|
||||
|
||||
# The iut programs whose names begin with "iuty" are the iut for the
|
||||
# deterministic version of functions. By checking the result of
|
||||
# testing with iutysse2, for example, it can be checked that the
|
||||
# corresponding deterministic functions passes the accuracy and
|
||||
# nonnumber tests.
|
||||
|
||||
string(CONCAT IUTYNAME "iuty" ${LCSIMD})
|
||||
add_executable(${IUTYNAME} ${IUT_SRC})
|
||||
target_compile_options(${IUTYNAME}
|
||||
PRIVATE ${FLAGS_ENABLE_${SIMD}})
|
||||
target_compile_definitions(${IUTYNAME}
|
||||
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS} DETERMINISTIC=1)
|
||||
target_link_libraries(${IUTYNAME} ${TARGET_LIBSLEEF}
|
||||
${LIBM} ${LIBRT})
|
||||
add_dependencies(${IUTYNAME} ${TARGET_HEADERS})
|
||||
add_dependencies(${IUTYNAME} ${TARGET_LIBSLEEF})
|
||||
set_target_properties(${IUTYNAME} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
if (DEFINED COSTOVERRIDE_${SIMD})
|
||||
add_test_iut(${IUTYNAME} ${COSTOVERRIDE_${SIMD}})
|
||||
else()
|
||||
add_test_iut(${IUTYNAME} 1.0)
|
||||
endif()
|
||||
list(APPEND IUT_LIST ${IUTYNAME})
|
||||
|
||||
# The iut programs whose names begin with "iuti" are the iut for the
|
||||
# inline version of functions.
|
||||
|
||||
if (SLEEF_BUILD_INLINE_HEADERS AND SED_COMMAND)
|
||||
string(CONCAT IUTINAME "iuti" ${LCSIMD})
|
||||
add_executable(${IUTINAME} ${IUT_SRC})
|
||||
target_compile_options(${IUTINAME} PRIVATE ${FLAGS_ENABLE_${SIMD}})
|
||||
target_compile_definitions(${IUTINAME}
|
||||
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS}
|
||||
USE_INLINE_HEADER="sleefinline_${LCSIMD}.h"
|
||||
MACRO_ONLY_HEADER="macroonly${SIMD}.h"
|
||||
SIMD_SUFFIX=_${LCSIMD}_sleef
|
||||
)
|
||||
target_include_directories(${IUTINAME} PRIVATE ${PROJECT_BINARY_DIR}/include)
|
||||
target_link_libraries(${IUTINAME} ${LIBM} ${LIBRT})
|
||||
add_dependencies(${IUTINAME} ${TARGET_INLINE_HEADERS})
|
||||
set_target_properties(${IUTINAME} PROPERTIES C_STANDARD 99)
|
||||
if (DEFINED COSTOVERRIDE_${SIMD})
|
||||
add_test_iut(${IUTINAME} ${COSTOVERRIDE_${SIMD}})
|
||||
else()
|
||||
add_test_iut(${IUTINAME} 1.0)
|
||||
endif()
|
||||
list(APPEND IUT_LIST ${IUTINAME})
|
||||
endif(SLEEF_BUILD_INLINE_HEADERS AND SED_COMMAND)
|
||||
|
||||
if(LIB_MPFR AND NOT ${SIMD} STREQUAL NEON32 AND NOT ${SIMD} STREQUAL NEON32VFPV4 AND NOT MINGW)
|
||||
# Build tester2 SIMD
|
||||
string(TOLOWER ${SIMD} SCSIMD)
|
||||
foreach(P dp sp)
|
||||
set(T "tester2${SCSIMD}${P}")
|
||||
add_executable(${T} tester2simd${P}.c testerutil.c)
|
||||
if(FORCE_AAVPCS)
|
||||
target_compile_definitions(${T} PRIVATE ENABLE_AAVPCS=1)
|
||||
endif(FORCE_AAVPCS)
|
||||
target_compile_options(${T} PRIVATE ${FLAGS_ENABLE_${SIMD}})
|
||||
target_compile_definitions(${T} PRIVATE ENABLE_${SIMD}=1 USEMPFR=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
set_target_properties(${T} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
target_link_libraries(${T} ${TARGET_LIBSLEEF} ${LIB_MPFR} ${LIBM} ${LIBGMP})
|
||||
add_dependencies(${T} ${TARGET_HEADERS})
|
||||
add_dependencies(${T} ${TARGET_LIBSLEEF})
|
||||
if (MPFR_INCLUDE_DIR)
|
||||
target_include_directories(${T} PRIVATE ${MPFR_INCLUDE_DIR})
|
||||
endif()
|
||||
|
||||
# The tester2 programs whose name begins with "tester2y" are the
|
||||
# testing program for the deterministic version of functions.
|
||||
|
||||
set(T "tester2y${SCSIMD}${P}")
|
||||
add_executable(${T} tester2simd${P}.c testerutil.c)
|
||||
target_compile_options(${T} PRIVATE ${FLAGS_ENABLE_${SIMD}})
|
||||
target_compile_definitions(${T} PRIVATE ENABLE_${SIMD}=1 USEMPFR=1 ${COMMON_TARGET_DEFINITIONS} DETERMINISTIC=1)
|
||||
set_target_properties(${T} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
target_link_libraries(${T} ${TARGET_LIBSLEEF} ${LIB_MPFR} ${LIBM} ${LIBGMP})
|
||||
add_dependencies(${T} ${TARGET_HEADERS})
|
||||
add_dependencies(${T} ${TARGET_LIBSLEEF})
|
||||
if (MPFR_INCLUDE_DIR)
|
||||
target_include_directories(${T} PRIVATE ${MPFR_INCLUDE_DIR})
|
||||
endif()
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
if(NOT ${SIMD} STREQUAL NEON32 AND NOT ${SIMD} STREQUAL NEON32VFPV4 AND SLEEF_OPENSSL_FOUND)
|
||||
# Build tester3
|
||||
string(TOLOWER ${SIMD} SCSIMD)
|
||||
set(T "tester3${SCSIMD}")
|
||||
add_executable(${T} tester3.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c testerutil.c)
|
||||
target_compile_options(${T} PRIVATE ${FLAGS_ENABLE_${SIMD}})
|
||||
target_compile_definitions(${T} PRIVATE ${COMMON_TARGET_DEFINITIONS} ${TESTER3_DEFINITIONS_${SIMD}})
|
||||
set_target_properties(${T} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
|
||||
# Enable Vector PCS for Advanced SIMD (if supported)
|
||||
if(FORCE_AAVPCS)
|
||||
host_target_AAVPCS_definitions(${T})
|
||||
endif()
|
||||
|
||||
target_link_libraries(${T} ${TARGET_LIBSLEEF} ${LIBM} ${SLEEF_OPENSSL_LIBRARIES})
|
||||
target_include_directories(${T} PRIVATE ${SLEEF_OPENSSL_INCLUDE_DIR})
|
||||
add_dependencies(${T} ${TARGET_HEADERS})
|
||||
add_dependencies(${T} ${TARGET_LIBSLEEF})
|
||||
|
||||
# Add test with tester3
|
||||
list(FIND TEST3_CINZ ${SCSIMD} INDEX_TEST3_CINZ)
|
||||
if (NOT INDEX_TEST3_CINZ EQUAL -1)
|
||||
if (SDE_COMMAND)
|
||||
add_test(NAME tester3${SCSIMD} COMMAND ${SDE_COMMAND} "--" ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/tester3${SCSIMD} ${sleef_SOURCE_DIR}/src/libm-tester/hash_cinz.txt)
|
||||
elseif(EMULATOR)
|
||||
add_test(NAME tester3${SCSIMD} COMMAND ${EMULATOR} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/tester3${SCSIMD} ${sleef_SOURCE_DIR}/src/libm-tester/hash_cinz.txt)
|
||||
else()
|
||||
add_test(NAME tester3${SCSIMD} COMMAND tester3${SCSIMD} ${sleef_SOURCE_DIR}/src/libm-tester/hash_cinz.txt)
|
||||
endif()
|
||||
if (DEFINED COSTOVERRIDE_${SIMD})
|
||||
set_tests_properties(tester3${SCSIMD} PROPERTIES COST ${COSTOVERRIDE_${SIMD}})
|
||||
else()
|
||||
set_tests_properties(tester3${SCSIMD} PROPERTIES COST 0.5)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
list(FIND TEST3_FINZ ${SCSIMD} INDEX_TEST3_FINZ)
|
||||
if (NOT INDEX_TEST3_FINZ EQUAL -1)
|
||||
if (SDE_COMMAND)
|
||||
add_test(NAME tester3${SCSIMD} COMMAND ${SDE_COMMAND} "--" ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/tester3${SCSIMD} ${sleef_SOURCE_DIR}/src/libm-tester/hash_finz.txt)
|
||||
elseif(EMULATOR)
|
||||
add_test(NAME tester3${SCSIMD} COMMAND ${EMULATOR} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/tester3${SCSIMD} ${sleef_SOURCE_DIR}/src/libm-tester/hash_finz.txt)
|
||||
else()
|
||||
add_test(NAME tester3${SCSIMD} COMMAND tester3${SCSIMD} ${sleef_SOURCE_DIR}/src/libm-tester/hash_finz.txt)
|
||||
endif()
|
||||
if (DEFINED COSTOVERRIDE_${SIMD})
|
||||
set_tests_properties(tester3${SCSIMD} PROPERTIES COST ${COSTOVERRIDE_${SIMD}})
|
||||
else()
|
||||
set_tests_properties(tester3${SCSIMD} PROPERTIES COST 0.5)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
endif(COMPILER_SUPPORTS_${SIMD})
|
||||
endmacro(test_extension)
|
||||
|
||||
foreach(SIMD ${SLEEF_SUPPORTED_LIBM_EXTENSIONS})
|
||||
test_extension(${SIMD})
|
||||
endforeach()
|
||||
|
||||
function(add_gnuabi_compatibility_test SIMD MASKED)
|
||||
if (MASKED)
|
||||
set(GNUABI_COMPATIBILITY_TEST gnuabi_compatibility_${SIMD}_masked)
|
||||
else(MASKED)
|
||||
set(GNUABI_COMPATIBILITY_TEST gnuabi_compatibility_${SIMD})
|
||||
endif(MASKED)
|
||||
add_executable(${GNUABI_COMPATIBILITY_TEST} gnuabi_compatibility.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c)
|
||||
set_target_properties(${GNUABI_COMPATIBILITY_TEST} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
target_compile_options(${GNUABI_COMPATIBILITY_TEST}
|
||||
PRIVATE ${FLAGS_ENABLE_${SIMD}})
|
||||
if (MASKED)
|
||||
target_compile_definitions(${GNUABI_COMPATIBILITY_TEST}
|
||||
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS} MASKED_GNUABI=1)
|
||||
else(MASKED)
|
||||
target_compile_definitions(${GNUABI_COMPATIBILITY_TEST}
|
||||
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
endif(MASKED)
|
||||
if (FORCE_AAVPCS)
|
||||
target_compile_definitions(${GNUABI_COMPATIBILITY_TEST} PRIVATE ENABLE_AAVPCS=1)
|
||||
endif(FORCE_AAVPCS)
|
||||
target_link_libraries(${GNUABI_COMPATIBILITY_TEST} ${TARGET_LIBSLEEFGNUABI} ${LIBM})
|
||||
# These are linker tests that don't really need to be executed,
|
||||
# but seeing them in the report of ctest gives an idea of what
|
||||
# has been built for testing.
|
||||
if (EMULATOR)
|
||||
add_test(NAME ${GNUABI_COMPATIBILITY_TEST}
|
||||
COMMAND ${EMULATOR} $<TARGET_FILE:${GNUABI_COMPATIBILITY_TEST}>
|
||||
WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
|
||||
elseif(SDE_COMMAND)
|
||||
add_test(NAME ${GNUABI_COMPATIBILITY_TEST}
|
||||
COMMAND ${SDE_COMMAND} "--" $<TARGET_FILE:${GNUABI_COMPATIBILITY_TEST}>)
|
||||
else()
|
||||
add_test(NAME ${GNUABI_COMPATIBILITY_TEST}
|
||||
COMMAND $<TARGET_FILE:${GNUABI_COMPATIBILITY_TEST}>)
|
||||
endif(EMULATOR)
|
||||
endfunction(add_gnuabi_compatibility_test)
|
||||
|
||||
if(ENABLE_GNUABI)
|
||||
foreach(SIMD ${SLEEF_SUPPORTED_GNUABI_EXTENSIONS})
|
||||
if(COMPILER_SUPPORTS_${SIMD})
|
||||
# GNUABI compatibility for the unmasked symbols.
|
||||
add_gnuabi_compatibility_test(${SIMD} OFF)
|
||||
# GNUABI compatibility for the masked symbols.
|
||||
if (MKMASKED_PARAMS_GNUABI_${SIMD}_sp)
|
||||
add_gnuabi_compatibility_test(${SIMD} ON)
|
||||
endif(MKMASKED_PARAMS_GNUABI_${SIMD}_sp)
|
||||
endif (COMPILER_SUPPORTS_${SIMD})
|
||||
endforeach(SIMD ${SLEEF_SUPPORTED_GNUABI_EXTENSIONS})
|
||||
endif(ENABLE_GNUABI)
|
||||
|
||||
#
|
||||
|
||||
if (SLEEF_ARCH_X86)
|
||||
# iutdsp128
|
||||
add_executable(iutdsp128 ${IUT_SRC})
|
||||
target_compile_definitions(iutdsp128 PRIVATE ENABLE_DSP128=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_compile_options(iutdsp128 PRIVATE ${FLAGS_ENABLE_SSE2})
|
||||
target_link_libraries(iutdsp128 ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT})
|
||||
add_dependencies(iutdsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF})
|
||||
add_test_iut(iutdsp128 1.0)
|
||||
list(APPEND IUT_LIST iutdsp128)
|
||||
|
||||
# iutdsp256
|
||||
add_executable(iutdsp256 ${IUT_SRC})
|
||||
target_compile_definitions(iutdsp256 PRIVATE ENABLE_DSP256=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_compile_options(iutdsp256 PRIVATE ${FLAGS_ENABLE_AVX})
|
||||
target_link_libraries(iutdsp256 ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT})
|
||||
add_dependencies(iutdsp256 ${TARGET_HEADERS} ${TARGET_LIBSLEEF})
|
||||
add_test_iut(iutdsp256 1.0)
|
||||
list(APPEND IUT_LIST iutdsp256)
|
||||
endif(SLEEF_ARCH_X86)
|
||||
|
||||
if (SLEEF_ARCH_PPC64)
|
||||
add_executable(iutdsp128 ${IUT_SRC})
|
||||
target_compile_definitions(iutdsp128 PRIVATE ENABLE_DSPPOWER_128=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_compile_options(iutdsp128 PRIVATE ${FLAGS_ENABLE_VSX})
|
||||
target_link_libraries(iutdsp128 ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT})
|
||||
add_dependencies(iutdsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF})
|
||||
add_test_iut(iutdsp128 1.0)
|
||||
list(APPEND IUT_LIST iutdsp128)
|
||||
endif(SLEEF_ARCH_PPC64)
|
||||
|
||||
if (SLEEF_ARCH_S390X)
|
||||
add_executable(iutdsp128 ${IUT_SRC})
|
||||
target_compile_definitions(iutdsp128 PRIVATE ENABLE_DSPS390X_128=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_compile_options(iutdsp128 PRIVATE ${FLAGS_ENABLE_VXE})
|
||||
target_link_libraries(iutdsp128 ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT})
|
||||
add_dependencies(iutdsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF})
|
||||
add_test_iut(iutdsp128 1.0)
|
||||
list(APPEND IUT_LIST iutdsp128)
|
||||
endif(SLEEF_ARCH_S390X)
|
||||
|
||||
if(SLEEF_BUILD_SCALAR_LIB)
|
||||
# Compile executable 'iutscalar'
|
||||
add_executable(iutscalar iut.c testerutil.c)
|
||||
target_compile_definitions(iutscalar PRIVATE ${COMMON_TARGET_DEFINITIONS})
|
||||
target_link_libraries(iutscalar sleefscalar ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT})
|
||||
set_target_properties(iutscalar PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
add_test_iut(iutscalar 1.0)
|
||||
list(APPEND IUT_LIST iutscalar)
|
||||
endif()
|
||||
|
||||
if(LIB_MPFR AND NOT MINGW)
|
||||
# Build tester2 scalar
|
||||
set(PRECISIONS dp sp)
|
||||
if(COMPILER_SUPPORTS_LONG_DOUBLE)
|
||||
list(APPEND PRECISIONS ld)
|
||||
endif()
|
||||
if(COMPILER_SUPPORTS_QUADMATH)
|
||||
list(APPEND PRECISIONS qp)
|
||||
set(LIBQUADMATH "-lquadmath")
|
||||
set(ENABLEFLOAT128 PRIVATE ENABLEFLOAT128=1)
|
||||
endif()
|
||||
foreach(P ${PRECISIONS})
|
||||
set(T "tester2${P}")
|
||||
add_executable(${T} tester2${P}.c testerutil.c)
|
||||
target_compile_definitions(${T} PRIVATE USEMPFR=1 ${ENABLEFLOAT128} ${COMMON_TARGET_DEFINITIONS})
|
||||
set_target_properties(${T} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
if (FORCE_AAVPCS)
|
||||
target_compile_definitions(${T} PRIVATE ENABLE_AAVPCS=1)
|
||||
endif(FORCE_AAVPCS)
|
||||
if (MPFR_INCLUDE_DIR)
|
||||
target_include_directories(${T} PRIVATE ${MPFR_INCLUDE_DIR})
|
||||
endif()
|
||||
target_link_libraries(${T} ${TARGET_LIBSLEEF} ${LIBQUADMATH} ${LIB_MPFR} ${LIBM} ${LIBGMP})
|
||||
add_dependencies(${T} ${TARGET_HEADERS})
|
||||
add_dependencies(${T} ${TARGET_LIBSLEEF})
|
||||
endforeach()
|
||||
|
||||
# Compile executable 'tester'
|
||||
add_host_executable(${TARGET_TESTER} tester.c testerutil.c)
|
||||
if (NOT CMAKE_CROSSCOMPILING)
|
||||
target_link_libraries(${TARGET_TESTER} ${LIB_MPFR} ${TARGET_LIBSLEEF} ${LIBM} ${LIBGMP})
|
||||
target_compile_definitions(${TARGET_TESTER}
|
||||
PRIVATE USEMPFR=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_compile_options(${TARGET_TESTER} PRIVATE -Wno-unused-result)
|
||||
set_target_properties(${TARGET_TESTER} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
if (MPFR_INCLUDE_DIR)
|
||||
target_include_directories(${TARGET_TESTER} PRIVATE ${MPFR_INCLUDE_DIR})
|
||||
endif()
|
||||
endif()
|
||||
endif(LIB_MPFR AND NOT MINGW)
|
||||
|
||||
if(ENABLE_GNUABI AND COMPILER_SUPPORTS_OMP_SIMD AND NOT SLEEF_TARGET_PROCESSOR MATCHES "^i.86$")
|
||||
# Build tester for vectorabi
|
||||
add_executable(testervecabi testervecabi.c)
|
||||
target_compile_definitions(testervecabi PRIVATE ${COMMON_TARGET_DEFINITIONS})
|
||||
target_compile_options(testervecabi PRIVATE ${OpenMP_C_FLAGS})
|
||||
target_link_libraries(testervecabi ${TARGET_LIBSLEEF} ${OpenMP_C_FLAGS})
|
||||
set_target_properties(testervecabi PROPERTIES C_STANDARD 99)
|
||||
add_test(NAME testervecabi COMMAND ${EMULATOR} testervecabi
|
||||
WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
|
||||
endif()
|
||||
|
||||
# mveclibtest
|
||||
|
||||
if (ENABLE_GNUABI AND SLEEF_ARCH_X86 AND CMAKE_C_COMPILER_ID MATCHES "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 7.99)
|
||||
add_executable(mveclibtest-sse2 mveclibtest.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c)
|
||||
set_target_properties(mveclibtest-sse2 PROPERTIES C_STANDARD 99)
|
||||
target_compile_options(mveclibtest-sse2 PRIVATE ${FLAGS_FASTMATH} "-O3")
|
||||
target_link_libraries(mveclibtest-sse2 ${TARGET_LIBSLEEF} ${TARGET_LIBSLEEFGNUABI})
|
||||
add_dependencies(mveclibtest-sse2 ${TARGET_HEADERS})
|
||||
add_test(NAME mveclibtest-sse2 COMMAND mveclibtest-sse2)
|
||||
|
||||
add_executable(mveclibtest-avx mveclibtest.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c)
|
||||
set_target_properties(mveclibtest-avx PROPERTIES C_STANDARD 99)
|
||||
target_compile_options(mveclibtest-avx PRIVATE ${FLAGS_FASTMATH} ${FLAGS_ENABLE_AVX} "-O3")
|
||||
target_link_libraries(mveclibtest-avx ${TARGET_LIBSLEEF} ${TARGET_LIBSLEEFGNUABI})
|
||||
add_dependencies(mveclibtest-avx ${TARGET_HEADERS})
|
||||
add_test(NAME mveclibtest-avx COMMAND mveclibtest-avx)
|
||||
|
||||
add_executable(mveclibtest-avx2 mveclibtest.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c)
|
||||
set_target_properties(mveclibtest-avx2 PROPERTIES C_STANDARD 99)
|
||||
target_compile_options(mveclibtest-avx2 PRIVATE ${FLAGS_FASTMATH} ${FLAGS_ENABLE_AVX2} "-O3")
|
||||
target_link_libraries(mveclibtest-avx2 ${TARGET_LIBSLEEF} ${TARGET_LIBSLEEFGNUABI})
|
||||
add_dependencies(mveclibtest-avx2 ${TARGET_HEADERS})
|
||||
add_test(NAME mveclibtest-avx2 COMMAND mveclibtest-avx2)
|
||||
|
||||
add_executable(mveclibtest-avx512f mveclibtest.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c)
|
||||
set_target_properties(mveclibtest-avx512f PROPERTIES C_STANDARD 99)
|
||||
target_compile_options(mveclibtest-avx512f PRIVATE ${FLAGS_FASTMATH} ${FLAGS_ENABLE_AVX512F} "-O3")
|
||||
target_link_libraries(mveclibtest-avx512f ${TARGET_LIBSLEEF} ${TARGET_LIBSLEEFGNUABI})
|
||||
add_dependencies(mveclibtest-avx512f ${TARGET_HEADERS})
|
||||
add_test(NAME mveclibtest-avx512f COMMAND mveclibtest-avx512f)
|
||||
endif()
|
||||
|
||||
#
|
||||
|
||||
if (FILECHECK_COMMAND AND COMPILER_SUPPORTS_OPENMP AND SLEEF_ARCH_X86 AND CMAKE_C_COMPILER_ID MATCHES "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 7.99)
|
||||
add_test(NAME autovec-avx2 COMMAND sh -c "${CMAKE_C_COMPILER} -Wno-attributes -fopenmp -mavx2 -O3 ${CMAKE_CURRENT_SOURCE_DIR}/autovec.c -I${sleef_BINARY_DIR}/include -S -o- | ${FILECHECK_COMMAND} ${CMAKE_CURRENT_SOURCE_DIR}/autovec.c -check-prefix=CHECK-AVX2")
|
||||
add_test(NAME autovec-sse2 COMMAND sh -c "${CMAKE_C_COMPILER} -Wno-attributes -fopenmp -msse2 -O3 ${CMAKE_CURRENT_SOURCE_DIR}/autovec.c -I${sleef_BINARY_DIR}/include -S -o- | ${FILECHECK_COMMAND} ${CMAKE_CURRENT_SOURCE_DIR}/autovec.c -check-prefix=CHECK-SSE2")
|
||||
add_test(NAME testervecabi-sse2 COMMAND sh -c "${CMAKE_C_COMPILER} -Wno-attributes -fopenmp -msse2 -O3 ${CMAKE_CURRENT_SOURCE_DIR}/testervecabi.c -I${sleef_BINARY_DIR}/include -S -o- | ${FILECHECK_COMMAND} ${CMAKE_CURRENT_SOURCE_DIR}/testervecabi.c -check-prefix=CHECK-SSE2")
|
||||
add_test(NAME testervecabi-avx2 COMMAND sh -c "${CMAKE_C_COMPILER} -Wno-attributes -fopenmp -mavx2 -O3 ${CMAKE_CURRENT_SOURCE_DIR}/testervecabi.c -I${sleef_BINARY_DIR}/include -S -o- | ${FILECHECK_COMMAND} ${CMAKE_CURRENT_SOURCE_DIR}/testervecabi.c -check-prefix=CHECK-AVX2")
|
||||
endif()
|
||||
|
||||
# Tests depends on the library
|
||||
add_dependencies(${TARGET_IUT} ${TARGET_HEADERS})
|
||||
@@ -0,0 +1,651 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#define SLEEF_ENABLE_OMP_SIMD
|
||||
#include "sleef.h"
|
||||
|
||||
#define N 1024
|
||||
double a[N], b[N], c[N], d[N];
|
||||
float e[N], f[N], g[N], h[N];
|
||||
|
||||
void testsind1_u10() {
|
||||
// CHECK-SSE2: testsind1_u10
|
||||
// CHECK-AVX2: testsind1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_sind1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_sind1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_sind1_u10
|
||||
}
|
||||
|
||||
void testsind1_u35() {
|
||||
// CHECK-SSE2: testsind1_u35
|
||||
// CHECK-AVX2: testsind1_u35
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_sind1_u35(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_sind1_u35
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_sind1_u35
|
||||
}
|
||||
|
||||
void testsinf1_u10() {
|
||||
// CHECK-SSE2: testsinf1_u10
|
||||
// CHECK-AVX2: testsinf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_sinf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_sinf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_sinf1_u10
|
||||
}
|
||||
|
||||
void testsinf1_u35() {
|
||||
// CHECK-SSE2: testsinf1_u35
|
||||
// CHECK-AVX2: testsinf1_u35
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_sinf1_u35(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_sinf1_u35
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_sinf1_u35
|
||||
}
|
||||
|
||||
void testcosd1_u10() {
|
||||
// CHECK-SSE2: testcosd1_u10
|
||||
// CHECK-AVX2: testcosd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_cosd1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_cosd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_cosd1_u10
|
||||
}
|
||||
|
||||
void testcosd1_u35() {
|
||||
// CHECK-SSE2: testcosd1_u35
|
||||
// CHECK-AVX2: testcosd1_u35
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_cosd1_u35(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_cosd1_u35
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_cosd1_u35
|
||||
}
|
||||
|
||||
void testcosf1_u10() {
|
||||
// CHECK-SSE2: testcosf1_u10
|
||||
// CHECK-AVX2: testcosf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_cosf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_cosf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_cosf1_u10
|
||||
}
|
||||
|
||||
void testcosf1_u35() {
|
||||
// CHECK-SSE2: testcosf1_u35
|
||||
// CHECK-AVX2: testcosf1_u35
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_cosf1_u35(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_cosf1_u35
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_cosf1_u35
|
||||
}
|
||||
|
||||
void testtand1_u10() {
|
||||
// CHECK-SSE2: testtand1_u10
|
||||
// CHECK-AVX2: testtand1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_tand1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_tand1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_tand1_u10
|
||||
}
|
||||
|
||||
void testtand1_u35() {
|
||||
// CHECK-SSE2: testtand1_u35
|
||||
// CHECK-AVX2: testtand1_u35
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_tand1_u35(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_tand1_u35
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_tand1_u35
|
||||
}
|
||||
|
||||
void testtanf1_u10() {
|
||||
// CHECK-SSE2: testtanf1_u10
|
||||
// CHECK-AVX2: testtanf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_tanf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_tanf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_tanf1_u10
|
||||
}
|
||||
|
||||
void testtanf1_u35() {
|
||||
// CHECK-SSE2: testtanf1_u35
|
||||
// CHECK-AVX2: testtanf1_u35
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_tanf1_u35(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_tanf1_u35
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_tanf1_u35
|
||||
}
|
||||
|
||||
void testasind1_u10() {
|
||||
// CHECK-SSE2: testasind1_u10
|
||||
// CHECK-AVX2: testasind1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_asind1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_asind1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_asind1_u10
|
||||
}
|
||||
|
||||
void testasind1_u35() {
|
||||
// CHECK-SSE2: testasind1_u35
|
||||
// CHECK-AVX2: testasind1_u35
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_asind1_u35(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_asind1_u35
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_asind1_u35
|
||||
}
|
||||
|
||||
void testasinf1_u10() {
|
||||
// CHECK-SSE2: testasinf1_u10
|
||||
// CHECK-AVX2: testasinf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_asinf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_asinf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_asinf1_u10
|
||||
}
|
||||
|
||||
void testasinf1_u35() {
|
||||
// CHECK-SSE2: testasinf1_u35
|
||||
// CHECK-AVX2: testasinf1_u35
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_asinf1_u35(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_asinf1_u35
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_asinf1_u35
|
||||
}
|
||||
|
||||
void testacosd1_u10() {
|
||||
// CHECK-SSE2: testacosd1_u10
|
||||
// CHECK-AVX2: testacosd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_acosd1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_acosd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_acosd1_u10
|
||||
}
|
||||
|
||||
void testacosd1_u35() {
|
||||
// CHECK-SSE2: testacosd1_u35
|
||||
// CHECK-AVX2: testacosd1_u35
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_acosd1_u35(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_acosd1_u35
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_acosd1_u35
|
||||
}
|
||||
|
||||
void testacosf1_u10() {
|
||||
// CHECK-SSE2: testacosf1_u10
|
||||
// CHECK-AVX2: testacosf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_acosf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_acosf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_acosf1_u10
|
||||
}
|
||||
|
||||
void testacosf1_u35() {
|
||||
// CHECK-SSE2: testacosf1_u35
|
||||
// CHECK-AVX2: testacosf1_u35
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_acosf1_u35(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_acosf1_u35
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_acosf1_u35
|
||||
}
|
||||
|
||||
void testatand1_u10() {
|
||||
// CHECK-SSE2: testatand1_u10
|
||||
// CHECK-AVX2: testatand1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_atand1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_atand1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_atand1_u10
|
||||
}
|
||||
|
||||
void testatand1_u35() {
|
||||
// CHECK-SSE2: testatand1_u35
|
||||
// CHECK-AVX2: testatand1_u35
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_atand1_u35(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_atand1_u35
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_atand1_u35
|
||||
}
|
||||
|
||||
void testatanf1_u10() {
|
||||
// CHECK-SSE2: testatanf1_u10
|
||||
// CHECK-AVX2: testatanf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_atanf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_atanf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_atanf1_u10
|
||||
}
|
||||
|
||||
void testatanf1_u35() {
|
||||
// CHECK-SSE2: testatanf1_u35
|
||||
// CHECK-AVX2: testatanf1_u35
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_atanf1_u35(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_atanf1_u35
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_atanf1_u35
|
||||
}
|
||||
|
||||
void testatan2d1_u10() {
|
||||
// CHECK-SSE2: testatan2d1_u10
|
||||
// CHECK-AVX2: testatan2d1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_atan2d1_u10(b[i], c[i]);
|
||||
// CHECK-SSE2: _ZGVbN2vv_Sleef_atan2d1_u10
|
||||
// CHECK-AVX2: _ZGVdN4vv_Sleef_atan2d1_u10
|
||||
}
|
||||
|
||||
void testatan2d1_u35() {
|
||||
// CHECK-SSE2: testatan2d1_u35
|
||||
// CHECK-AVX2: testatan2d1_u35
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_atan2d1_u35(b[i], c[i]);
|
||||
// CHECK-SSE2: _ZGVbN2vv_Sleef_atan2d1_u35
|
||||
// CHECK-AVX2: _ZGVdN4vv_Sleef_atan2d1_u35
|
||||
}
|
||||
|
||||
void testatan2f1_u10() {
|
||||
// CHECK-SSE2: testatan2f1_u10
|
||||
// CHECK-AVX2: testatan2f1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_atan2f1_u10(f[i], g[i]);
|
||||
// CHECK-SSE2: _ZGVbN4vv_Sleef_atan2f1_u10
|
||||
// CHECK-AVX2: _ZGVdN8vv_Sleef_atan2f1_u10
|
||||
}
|
||||
|
||||
void testatan2f1_u35() {
|
||||
// CHECK-SSE2: testatan2f1_u35
|
||||
// CHECK-AVX2: testatan2f1_u35
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_atan2f1_u35(f[i], g[i]);
|
||||
// CHECK-SSE2: _ZGVbN4vv_Sleef_atan2f1_u35
|
||||
// CHECK-AVX2: _ZGVdN8vv_Sleef_atan2f1_u35
|
||||
}
|
||||
|
||||
void testsinhd1_u10() {
|
||||
// CHECK-SSE2: testsinhd1_u10
|
||||
// CHECK-AVX2: testsinhd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_sinhd1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_sinhd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_sinhd1_u10
|
||||
}
|
||||
|
||||
void testsinhd1_u35() {
|
||||
// CHECK-SSE2: testsinhd1_u35
|
||||
// CHECK-AVX2: testsinhd1_u35
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_sinhd1_u35(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_sinhd1_u35
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_sinhd1_u35
|
||||
}
|
||||
|
||||
void testsinhf1_u10() {
|
||||
// CHECK-SSE2: testsinhf1_u10
|
||||
// CHECK-AVX2: testsinhf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_sinhf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_sinhf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_sinhf1_u10
|
||||
}
|
||||
|
||||
void testsinhf1_u35() {
|
||||
// CHECK-SSE2: testsinhf1_u35
|
||||
// CHECK-AVX2: testsinhf1_u35
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_sinhf1_u35(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_sinhf1_u35
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_sinhf1_u35
|
||||
}
|
||||
|
||||
void testcoshd1_u10() {
|
||||
// CHECK-SSE2: testcoshd1_u10
|
||||
// CHECK-AVX2: testcoshd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_coshd1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_coshd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_coshd1_u10
|
||||
}
|
||||
|
||||
void testcoshd1_u35() {
|
||||
// CHECK-SSE2: testcoshd1_u35
|
||||
// CHECK-AVX2: testcoshd1_u35
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_coshd1_u35(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_coshd1_u35
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_coshd1_u35
|
||||
}
|
||||
|
||||
void testcoshf1_u10() {
|
||||
// CHECK-SSE2: testcoshf1_u10
|
||||
// CHECK-AVX2: testcoshf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_coshf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_coshf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_coshf1_u10
|
||||
}
|
||||
|
||||
void testcoshf1_u35() {
|
||||
// CHECK-SSE2: testcoshf1_u35
|
||||
// CHECK-AVX2: testcoshf1_u35
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_coshf1_u35(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_coshf1_u35
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_coshf1_u35
|
||||
}
|
||||
|
||||
void testtanhd1_u10() {
|
||||
// CHECK-SSE2: testtanhd1_u10
|
||||
// CHECK-AVX2: testtanhd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_tanhd1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_tanhd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_tanhd1_u10
|
||||
}
|
||||
|
||||
void testtanhd1_u35() {
|
||||
// CHECK-SSE2: testtanhd1_u35
|
||||
// CHECK-AVX2: testtanhd1_u35
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_tanhd1_u35(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_tanhd1_u35
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_tanhd1_u35
|
||||
}
|
||||
|
||||
void testtanhf1_u10() {
|
||||
// CHECK-SSE2: testtanhf1_u10
|
||||
// CHECK-AVX2: testtanhf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_tanhf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_tanhf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_tanhf1_u10
|
||||
}
|
||||
|
||||
void testtanhf1_u35() {
|
||||
// CHECK-SSE2: testtanhf1_u35
|
||||
// CHECK-AVX2: testtanhf1_u35
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_tanhf1_u35(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_tanhf1_u35
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_tanhf1_u35
|
||||
}
|
||||
|
||||
void testasinhd1_u10() {
|
||||
// CHECK-SSE2: testasinhd1_u10
|
||||
// CHECK-AVX2: testasinhd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_asinhd1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_asinhd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_asinhd1_u10
|
||||
}
|
||||
|
||||
void testasinhf1_u10() {
|
||||
// CHECK-SSE2: testasinhf1_u10
|
||||
// CHECK-AVX2: testasinhf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_asinhf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_asinhf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_asinhf1_u10
|
||||
}
|
||||
|
||||
void testacoshd1_u10() {
|
||||
// CHECK-SSE2: testacoshd1_u10
|
||||
// CHECK-AVX2: testacoshd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_acoshd1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_acoshd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_acoshd1_u10
|
||||
}
|
||||
|
||||
void testacoshf1_u10() {
|
||||
// CHECK-SSE2: testacoshf1_u10
|
||||
// CHECK-AVX2: testacoshf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_acoshf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_acoshf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_acoshf1_u10
|
||||
}
|
||||
|
||||
void testatanhd1_u10() {
|
||||
// CHECK-SSE2: testatanhd1_u10
|
||||
// CHECK-AVX2: testatanhd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_atanhd1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_atanhd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_atanhd1_u10
|
||||
}
|
||||
|
||||
void testatanhf1_u10() {
|
||||
// CHECK-SSE2: testatanhf1_u10
|
||||
// CHECK-AVX2: testatanhf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_atanhf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_atanhf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_atanhf1_u10
|
||||
}
|
||||
|
||||
void testlogd1_u10() {
|
||||
// CHECK-SSE2: testlogd1_u10
|
||||
// CHECK-AVX2: testlogd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_logd1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_logd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_logd1_u10
|
||||
}
|
||||
|
||||
void testlogd1_u35() {
|
||||
// CHECK-SSE2: testlogd1_u35
|
||||
// CHECK-AVX2: testlogd1_u35
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_logd1_u35(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_logd1_u35
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_logd1_u35
|
||||
}
|
||||
|
||||
void testlogf1_u10() {
|
||||
// CHECK-SSE2: testlogf1_u10
|
||||
// CHECK-AVX2: testlogf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_logf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_logf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_logf1_u10
|
||||
}
|
||||
|
||||
void testlogf1_u35() {
|
||||
// CHECK-SSE2: testlogf1_u35
|
||||
// CHECK-AVX2: testlogf1_u35
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_logf1_u35(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_logf1_u35
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_logf1_u35
|
||||
}
|
||||
|
||||
void testlog2d1_u10() {
|
||||
// CHECK-SSE2: testlog2d1_u10
|
||||
// CHECK-AVX2: testlog2d1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_log2d1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_log2d1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_log2d1_u10
|
||||
}
|
||||
|
||||
void testlog2f1_u10() {
|
||||
// CHECK-SSE2: testlog2f1_u10
|
||||
// CHECK-AVX2: testlog2f1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_log2f1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_log2f1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_log2f1_u10
|
||||
}
|
||||
|
||||
void testlog10d1_u10() {
|
||||
// CHECK-SSE2: testlog10d1_u10
|
||||
// CHECK-AVX2: testlog10d1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_log10d1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_log10d1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_log10d1_u10
|
||||
}
|
||||
|
||||
void testlog10f1_u10() {
|
||||
// CHECK-SSE2: testlog10f1_u10
|
||||
// CHECK-AVX2: testlog10f1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_log10f1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_log10f1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_log10f1_u10
|
||||
}
|
||||
|
||||
void testlog1pd1_u10() {
|
||||
// CHECK-SSE2: testlog1pd1_u10
|
||||
// CHECK-AVX2: testlog1pd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_log1pd1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_log1pd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_log1pd1_u10
|
||||
}
|
||||
|
||||
void testlog1pf1_u10() {
|
||||
// CHECK-SSE2: testlog1pf1_u10
|
||||
// CHECK-AVX2: testlog1pf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_log1pf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_log1pf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_log1pf1_u10
|
||||
}
|
||||
|
||||
void testexpd1_u10() {
|
||||
// CHECK-SSE2: testexpd1_u10
|
||||
// CHECK-AVX2: testexpd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_expd1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_expd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_expd1_u10
|
||||
}
|
||||
|
||||
void testexpf1_u10() {
|
||||
// CHECK-SSE2: testexpf1_u10
|
||||
// CHECK-AVX2: testexpf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_expf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_expf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_expf1_u10
|
||||
}
|
||||
|
||||
void testexp2d1_u10() {
|
||||
// CHECK-SSE2: testexp2d1_u10
|
||||
// CHECK-AVX2: testexp2d1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_exp2d1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_exp2d1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_exp2d1_u10
|
||||
}
|
||||
|
||||
void testexp2f1_u10() {
|
||||
// CHECK-SSE2: testexp2f1_u10
|
||||
// CHECK-AVX2: testexp2f1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_exp2f1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_exp2f1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_exp2f1_u10
|
||||
}
|
||||
|
||||
void testexp10d1_u10() {
|
||||
// CHECK-SSE2: testexp10d1_u10
|
||||
// CHECK-AVX2: testexp10d1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_exp10d1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_exp10d1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_exp10d1_u10
|
||||
}
|
||||
|
||||
void testexp10f1_u10() {
|
||||
// CHECK-SSE2: testexp10f1_u10
|
||||
// CHECK-AVX2: testexp10f1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_exp10f1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_exp10f1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_exp10f1_u10
|
||||
}
|
||||
|
||||
void testexpm1d1_u10() {
|
||||
// CHECK-SSE2: testexpm1d1_u10
|
||||
// CHECK-AVX2: testexpm1d1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_expm1d1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_expm1d1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_expm1d1_u10
|
||||
}
|
||||
|
||||
void testexpm1f1_u10() {
|
||||
// CHECK-SSE2: testexpm1f1_u10
|
||||
// CHECK-AVX2: testexpm1f1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_expm1f1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_expm1f1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_expm1f1_u10
|
||||
}
|
||||
|
||||
void testpowd1_u10() {
|
||||
// CHECK-SSE2: testpowd1_u10
|
||||
// CHECK-AVX2: testpowd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_powd1_u10(b[i], c[i]);
|
||||
// CHECK-SSE2: _ZGVbN2vv_Sleef_powd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4vv_Sleef_powd1_u10
|
||||
}
|
||||
|
||||
void testpowf1_u10() {
|
||||
// CHECK-SSE2: testpowf1_u10
|
||||
// CHECK-AVX2: testpowf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_powf1_u10(f[i], g[i]);
|
||||
// CHECK-SSE2: _ZGVbN4vv_Sleef_powf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8vv_Sleef_powf1_u10
|
||||
}
|
||||
|
||||
void testcbrtd1_u10() {
|
||||
// CHECK-SSE2: testcbrtd1_u10
|
||||
// CHECK-AVX2: testcbrtd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_cbrtd1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_cbrtd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_cbrtd1_u10
|
||||
}
|
||||
|
||||
void testcbrtd1_u35() {
|
||||
// CHECK-SSE2: testcbrtd1_u35
|
||||
// CHECK-AVX2: testcbrtd1_u35
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_cbrtd1_u35(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_cbrtd1_u35
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_cbrtd1_u35
|
||||
}
|
||||
|
||||
void testcbrtf1_u10() {
|
||||
// CHECK-SSE2: testcbrtf1_u10
|
||||
// CHECK-AVX2: testcbrtf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_cbrtf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_cbrtf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_cbrtf1_u10
|
||||
}
|
||||
|
||||
void testcbrtf1_u35() {
|
||||
// CHECK-SSE2: testcbrtf1_u35
|
||||
// CHECK-AVX2: testcbrtf1_u35
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_cbrtf1_u35(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_cbrtf1_u35
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_cbrtf1_u35
|
||||
}
|
||||
|
||||
void testhypotd1_u05() {
|
||||
// CHECK-SSE2: testhypotd1_u05
|
||||
// CHECK-AVX2: testhypotd1_u05
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_hypotd1_u05(b[i], c[i]);
|
||||
// CHECK-SSE2: _ZGVbN2vv_Sleef_hypotd1_u05
|
||||
// CHECK-AVX2: _ZGVdN4vv_Sleef_hypotd1_u05
|
||||
}
|
||||
|
||||
void testhypotd1_u35() {
|
||||
// CHECK-SSE2: testhypotd1_u35
|
||||
// CHECK-AVX2: testhypotd1_u35
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_hypotd1_u35(b[i], c[i]);
|
||||
// CHECK-SSE2: _ZGVbN2vv_Sleef_hypotd1_u35
|
||||
// CHECK-AVX2: _ZGVdN4vv_Sleef_hypotd1_u35
|
||||
}
|
||||
|
||||
void testhypotf1_u05() {
|
||||
// CHECK-SSE2: testhypotf1_u05
|
||||
// CHECK-AVX2: testhypotf1_u05
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_hypotf1_u05(f[i], g[i]);
|
||||
// CHECK-SSE2: _ZGVbN4vv_Sleef_hypotf1_u05
|
||||
// CHECK-AVX2: _ZGVdN8vv_Sleef_hypotf1_u05
|
||||
}
|
||||
|
||||
void testhypotf1_u35() {
|
||||
// CHECK-SSE2: testhypotf1_u35
|
||||
// CHECK-AVX2: testhypotf1_u35
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_hypotf1_u35(f[i], g[i]);
|
||||
// CHECK-SSE2: _ZGVbN4vv_Sleef_hypotf1_u35
|
||||
// CHECK-AVX2: _ZGVdN8vv_Sleef_hypotf1_u35
|
||||
}
|
||||
|
||||
void testerfd1_u10() {
|
||||
// CHECK-SSE2: testerfd1_u10
|
||||
// CHECK-AVX2: testerfd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_erfd1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_erfd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_erfd1_u10
|
||||
}
|
||||
|
||||
void testerff1_u10() {
|
||||
// CHECK-SSE2: testerff1_u10
|
||||
// CHECK-AVX2: testerff1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_erff1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_erff1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_erff1_u10
|
||||
}
|
||||
|
||||
void testfmodd1() {
|
||||
// CHECK-SSE2: testfmodd1
|
||||
// CHECK-AVX2: testfmodd1
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_fmodd1(b[i], c[i]);
|
||||
// CHECK-SSE2: _ZGVbN2vv_Sleef_fmodd1
|
||||
// CHECK-AVX2: _ZGVdN4vv_Sleef_fmodd1
|
||||
}
|
||||
|
||||
void testfmodf1() {
|
||||
// CHECK-SSE2: testfmodf1
|
||||
// CHECK-AVX2: testfmodf1
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_fmodf1(f[i], g[i]);
|
||||
// CHECK-SSE2: _ZGVbN4vv_Sleef_fmodf1
|
||||
// CHECK-AVX2: _ZGVdN8vv_Sleef_fmodf1
|
||||
}
|
||||
|
||||
void testremainderd1() {
|
||||
// CHECK-SSE2: testremainderd1
|
||||
// CHECK-AVX2: testremainderd1
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_remainderd1(b[i], c[i]);
|
||||
// CHECK-SSE2: _ZGVbN2vv_Sleef_remainderd1
|
||||
// CHECK-AVX2: _ZGVdN4vv_Sleef_remainderd1
|
||||
}
|
||||
|
||||
void testremainderf1() {
|
||||
// CHECK-SSE2: testremainderf1
|
||||
// CHECK-AVX2: testremainderf1
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_remainderf1(f[i], g[i]);
|
||||
// CHECK-SSE2: _ZGVbN4vv_Sleef_remainderf1
|
||||
// CHECK-AVX2: _ZGVdN8vv_Sleef_remainderf1
|
||||
}
|
||||
@@ -0,0 +1,714 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
/// This program makes sure that all the symbols that a
|
||||
/// GNUABI-compatible compiler (clang or gcc) can generate when
|
||||
/// vectorizing functions call from `#include <math.h>` are present in
|
||||
/// `libsleefgnuabi.so`.
|
||||
///
|
||||
/// The header `math.h` is not the same on all systems, and different
|
||||
/// macros can activate different sets of functions. The list provide
|
||||
/// here shoudl cover the union of all possible systems that we want
|
||||
/// to support. In particular, the test is checking that the "finite"
|
||||
/// symmbols from `#include <bits/math-finite.h>` are present for
|
||||
/// those systems supporting them.
|
||||
|
||||
#include <setjmp.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#if defined(ENABLE_SSE4) || defined(ENABLE_SSE2)
|
||||
#include <x86intrin.h>
|
||||
|
||||
#define ISA_TOKEN b
|
||||
#define VLEN_SP 4
|
||||
#define VLEN_DP 2
|
||||
#define VECTOR_CC
|
||||
|
||||
typedef __m128i vopmask;
|
||||
typedef __m128d vdouble;
|
||||
typedef __m128 vfloat;
|
||||
typedef __m128i vint;
|
||||
typedef __m128i vint2;
|
||||
#endif /* defined(ENABLE_SSE4) || defined(ENABLE_SSE2) */
|
||||
|
||||
#ifdef ENABLE_AVX
|
||||
#include <x86intrin.h>
|
||||
|
||||
#define ISA_TOKEN c
|
||||
#define VLEN_SP 8
|
||||
#define VLEN_DP 4
|
||||
#define VECTOR_CC
|
||||
|
||||
typedef __m256i vopmask;
|
||||
typedef __m256d vdouble;
|
||||
typedef __m256 vfloat;
|
||||
typedef __m128i vint;
|
||||
typedef struct { __m128i x, y; } vint2;
|
||||
#endif /* ENABLE_AVX */
|
||||
|
||||
#ifdef ENABLE_AVX2
|
||||
#include <x86intrin.h>
|
||||
|
||||
#define ISA_TOKEN d
|
||||
#define VLEN_SP 8
|
||||
#define VLEN_DP 4
|
||||
#define VECTOR_CC
|
||||
|
||||
typedef __m256i vopmask;
|
||||
typedef __m256d vdouble;
|
||||
typedef __m256 vfloat;
|
||||
typedef __m128i vint;
|
||||
typedef __m256i vint2;
|
||||
#endif /* ENABLE_AVX2 */
|
||||
|
||||
#ifdef ENABLE_AVX512F
|
||||
#include <x86intrin.h>
|
||||
|
||||
#define ISA_TOKEN e
|
||||
#define VLEN_SP 16
|
||||
#define VLEN_DP 8
|
||||
#define VECTOR_CC
|
||||
|
||||
typedef __mmask16 vopmask;
|
||||
typedef __m512d vdouble;
|
||||
typedef __m512 vfloat;
|
||||
typedef __m256i vint;
|
||||
typedef __m512i vint2;
|
||||
#endif /* ENABLE_AVX512F */
|
||||
|
||||
#ifdef ENABLE_ADVSIMD
|
||||
#include <arm_neon.h>
|
||||
#define ISA_TOKEN n
|
||||
#define VLEN_DP 2
|
||||
#define VLEN_SP 4
|
||||
|
||||
#ifdef ENABLE_AAVPCS
|
||||
#define VECTOR_CC __attribute__((aarch64_vector_pcs))
|
||||
#else
|
||||
#define VECTOR_CC
|
||||
#endif
|
||||
|
||||
typedef uint32x4_t vopmask;
|
||||
typedef float64x2_t vdouble;
|
||||
typedef float32x4_t vfloat;
|
||||
typedef int32x2_t vint;
|
||||
typedef int32x4_t vint2;
|
||||
#endif /* ENABLE_ADVSIMDF */
|
||||
|
||||
#ifdef ENABLE_SVE
|
||||
#include <arm_sve.h>
|
||||
#define ISA_TOKEN s
|
||||
#define VLEN_SP (svcntw())
|
||||
#define VLEN_DP (svcntd())
|
||||
#define VLA_TOKEN x
|
||||
#define VECTOR_CC
|
||||
|
||||
typedef svbool_t vopmask;
|
||||
typedef svfloat64_t vdouble;
|
||||
typedef svfloat32_t vfloat;
|
||||
typedef svint32_t vint;
|
||||
typedef svint32_t vint2;
|
||||
#endif /* ENABLE_SVE */
|
||||
|
||||
// GNUABI name mangling macro.
|
||||
#ifndef MASKED_GNUABI
|
||||
|
||||
#define __MAKE_FN_NAME(name, t, vl, p) _ZGV##t##N##vl##p##_##name
|
||||
|
||||
#define __DECLARE_vd_vd(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble)
|
||||
#define __CALL_vd_vd(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1); } while(0)
|
||||
|
||||
#define __DECLARE_vi_vd(name, t, vl, p) \
|
||||
extern vint VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble)
|
||||
#define __CALL_vi_vd(name, t, vl, p) \
|
||||
do { vi0 = __MAKE_FN_NAME(name, t, vl, p)(vd1); } while(0)
|
||||
|
||||
#define __DECLARE_vd_vd_vi(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vint)
|
||||
#define __CALL_vd_vd_vi(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vi2); } while(0)
|
||||
|
||||
#define __DECLARE_vd_vd_vd(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble)
|
||||
#define __CALL_vd_vd_vd(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2); } while(0)
|
||||
|
||||
#define __DECLARE_vd_vd_vd_vd(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble, vdouble)
|
||||
#define __CALL_vd_vd_vd_vd(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2, vd3); } while(0)
|
||||
|
||||
#define __DECLARE_vd_vd_pvd(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *)
|
||||
#define __CALL_vd_vd_pvd(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, &vd2); } while(0)
|
||||
|
||||
#define __DECLARE_v_vd_pvd_pvd(name, t, vl, p) \
|
||||
extern void VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *, vdouble *)
|
||||
#define __CALL_v_vd_pvd_pvd(name, t, vl, p) \
|
||||
do { __MAKE_FN_NAME(name, t, vl, p)(vd0, &vd1, &vd2); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat)
|
||||
#define __CALL_vf_vf(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf_vf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat)
|
||||
#define __CALL_vf_vf_vf(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vf2); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf_vf_vf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat, vfloat)
|
||||
#define __CALL_vf_vf_vf_vf(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vf2, vf3); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf_pvf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *)
|
||||
#define __CALL_vf_vf_pvf(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, &vf2); } while(0)
|
||||
|
||||
#define __DECLARE_vi_vf(name, t, vl, p) \
|
||||
extern vint2 VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat)
|
||||
#define __CALL_vi_vf(name, t, vl, p) \
|
||||
do { vi20 = __MAKE_FN_NAME(name, t, vl, p)(vf1); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf_vi(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vint2)
|
||||
#define __CALL_vf_vf_vi(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vi22); } while(0)
|
||||
|
||||
#define __DECLARE_v_vf_pvf_pvf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *, vfloat*)
|
||||
#define __CALL_v_vf_pvf_pvf(name, t, vl, p) \
|
||||
do { __MAKE_FN_NAME(name, t, vl, p)(vf0, &vf1, &vf2); } while(0)
|
||||
|
||||
#else /******************** MASKED_GNUABI *****************************/
|
||||
|
||||
#define __MAKE_FN_NAME(name, t, vl, p) _ZGV##t##M##vl##p##_##name
|
||||
|
||||
#define __DECLARE_vd_vd(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vopmask)
|
||||
#define __CALL_vd_vd(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vi_vd(name, t, vl, p) \
|
||||
extern vint VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vopmask)
|
||||
#define __CALL_vi_vd(name, t, vl, p) \
|
||||
do { vi0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vd_vd_vi(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vint, vopmask)
|
||||
#define __CALL_vd_vd_vi(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vi2, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vd_vd_vd(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble, vopmask)
|
||||
#define __CALL_vd_vd_vd(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vd_vd_vd_vd(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble, vdouble, vopmask)
|
||||
#define __CALL_vd_vd_vd_vd(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2, vd3, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vd_vd_pvd(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *, vopmask)
|
||||
#define __CALL_vd_vd_pvd(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, &vd2, mask); } while(0)
|
||||
|
||||
#define __DECLARE_v_vd_pvd_pvd(name, t, vl, p) \
|
||||
extern void VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *, vdouble *, vopmask)
|
||||
#define __CALL_v_vd_pvd_pvd(name, t, vl, p) \
|
||||
do { __MAKE_FN_NAME(name, t, vl, p)(vd0, &vd1, &vd2, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vopmask)
|
||||
#define __CALL_vf_vf(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf_vf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat, vopmask)
|
||||
#define __CALL_vf_vf_vf(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vf2, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf_vf_vf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat, vfloat, vopmask)
|
||||
#define __CALL_vf_vf_vf_vf(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vf2, vf3, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf_pvf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *, vopmask)
|
||||
#define __CALL_vf_vf_pvf(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, &vf2, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vi_vf(name, t, vl, p) \
|
||||
extern vint2 VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vopmask)
|
||||
#define __CALL_vi_vf(name, t, vl, p) \
|
||||
do { vi20 = __MAKE_FN_NAME(name, t, vl, p)(vf1, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf_vi(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vint2, vopmask)
|
||||
#define __CALL_vf_vf_vi(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vi22, mask); } while(0)
|
||||
|
||||
#define __DECLARE_v_vf_pvf_pvf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *, vfloat*, vopmask)
|
||||
#define __CALL_v_vf_pvf_pvf(name, t, vl, p) \
|
||||
do { __MAKE_FN_NAME(name, t, vl, p)(vf0, &vf1, &vf2, mask); } while(0)
|
||||
|
||||
#endif /* MASKED_GNUABI */
|
||||
// Level-1 expansion macros for declaration and call. The signature of
|
||||
// each function has three input paramters to avoid segfaults of
|
||||
// sincos-like functions that are effectively loading data from
|
||||
// memory.
|
||||
|
||||
|
||||
// Make sure that the architectural macros are defined for each vector
|
||||
// extension.
|
||||
#ifndef ISA_TOKEN
|
||||
#error "Missing ISA token"
|
||||
#endif
|
||||
|
||||
#ifndef VLEN_DP
|
||||
#error "Missing VLEN_DP"
|
||||
#endif
|
||||
|
||||
#ifndef VLEN_DP
|
||||
#error "Missing VLEN_SP"
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_SVE) && !defined(VLA_TOKEN)
|
||||
#error "Missing VLA_TOKEN"
|
||||
#endif /* defined(ENABLE_SVE) && !defined(VLA_TOKEN) */
|
||||
|
||||
// Declaration and call, first level expantion to pick up the
|
||||
// ISA_TOKEN and VLEN_* architectural macros.
|
||||
#ifndef ENABLE_SVE
|
||||
|
||||
#define DECLARE_DP_vd_vd(name, p) __DECLARE_vd_vd(name, ISA_TOKEN, VLEN_DP, p)
|
||||
#define CALL_DP_vd_vd(name, p) __CALL_vd_vd(name, ISA_TOKEN, VLEN_DP, p)
|
||||
|
||||
#define DECLARE_DP_vd_vd_vd(name, p) __DECLARE_vd_vd_vd(name, ISA_TOKEN, VLEN_DP, p)
|
||||
#define CALL_DP_vd_vd_vd(name, p) __CALL_vd_vd_vd(name, ISA_TOKEN, VLEN_DP, p)
|
||||
|
||||
#define DECLARE_DP_vd_vd_vd_vd(name, p) __DECLARE_vd_vd_vd_vd(name, ISA_TOKEN, VLEN_DP, p)
|
||||
#define CALL_DP_vd_vd_vd_vd(name, p) __CALL_vd_vd_vd_vd(name, ISA_TOKEN, VLEN_DP, p)
|
||||
|
||||
#define DECLARE_DP_vd_vd_pvd(name, p) __DECLARE_vd_vd_pvd(name, ISA_TOKEN, VLEN_DP, p)
|
||||
#define CALL_DP_vd_vd_pvd(name, p) __CALL_vd_vd_pvd(name, ISA_TOKEN, VLEN_DP, p)
|
||||
|
||||
#define DECLARE_DP_vi_vd(name, p) __DECLARE_vi_vd(name, ISA_TOKEN, VLEN_DP, p)
|
||||
#define CALL_DP_vi_vd(name, p) __CALL_vi_vd(name, ISA_TOKEN, VLEN_DP, p)
|
||||
|
||||
#define DECLARE_DP_vd_vd_vi(name, p) __DECLARE_vd_vd_vi(name, ISA_TOKEN, VLEN_DP, p)
|
||||
#define CALL_DP_vd_vd_vi(name, p) __CALL_vd_vd_vi(name, ISA_TOKEN, VLEN_DP, p)
|
||||
|
||||
#define DECLARE_DP_v_vd_pvd_pvd(name, p) __DECLARE_v_vd_pvd_pvd(name, ISA_TOKEN, VLEN_DP, p)
|
||||
#define CALL_DP_v_vd_pvd_pvd(name, p) __CALL_v_vd_pvd_pvd(name, ISA_TOKEN, VLEN_DP, p)
|
||||
|
||||
#define DECLARE_SP_vf_vf(name, p) __DECLARE_vf_vf(name, ISA_TOKEN, VLEN_SP, p)
|
||||
#define CALL_SP_vf_vf(name, p) __CALL_vf_vf(name, ISA_TOKEN, VLEN_SP, p)
|
||||
|
||||
#define DECLARE_SP_vf_vf_vf(name, p) __DECLARE_vf_vf_vf(name, ISA_TOKEN, VLEN_SP, p)
|
||||
#define CALL_SP_vf_vf_vf(name, p) __CALL_vf_vf_vf(name, ISA_TOKEN, VLEN_SP, p)
|
||||
|
||||
#define DECLARE_SP_vf_vf_vf_vf(name, p) __DECLARE_vf_vf_vf_vf(name, ISA_TOKEN, VLEN_SP, p)
|
||||
#define CALL_SP_vf_vf_vf_vf(name, p) __CALL_vf_vf_vf_vf(name, ISA_TOKEN, VLEN_SP, p)
|
||||
|
||||
#define DECLARE_SP_vf_vf_pvf(name, p) __DECLARE_vf_vf_pvf(name, ISA_TOKEN, VLEN_SP, p)
|
||||
#define CALL_SP_vf_vf_pvf(name, p) __CALL_vf_vf_pvf(name, ISA_TOKEN, VLEN_SP, p)
|
||||
|
||||
#define DECLARE_SP_vi_vf(name, p) __DECLARE_vi_vf(name, ISA_TOKEN, VLEN_SP, p)
|
||||
#define CALL_SP_vi_vf(name, p) __CALL_vi_vf(name, ISA_TOKEN, VLEN_SP, p)
|
||||
|
||||
#define DECLARE_SP_vf_vf_vi(name, p) __DECLARE_vf_vf_vi(name, ISA_TOKEN, VLEN_SP, p)
|
||||
#define CALL_SP_vf_vf_vi(name, p) __CALL_vf_vf_vi(name, ISA_TOKEN, VLEN_SP, p)
|
||||
|
||||
#define DECLARE_SP_v_vf_pvf_pvf(name, p) __DECLARE_v_vf_pvf_pvf(name, ISA_TOKEN, VLEN_SP, p)
|
||||
#define CALL_SP_v_vf_pvf_pvf(name, p) __CALL_v_vf_pvf_pvf(name, ISA_TOKEN, VLEN_SP, p)
|
||||
|
||||
#else /* ENABLE_SVE */
|
||||
|
||||
#define DECLARE_DP_vd_vd(name, p) __DECLARE_vd_vd(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_DP_vd_vd(name, p) __CALL_vd_vd(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f64(svptrue_b8(), (double *)outbuf, vd0)
|
||||
|
||||
#define DECLARE_DP_vd_vd_vd(name, p) __DECLARE_vd_vd_vd(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_DP_vd_vd_vd(name, p) __CALL_vd_vd_vd(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f64(svptrue_b8(), (double *)outbuf, vd0)
|
||||
|
||||
#define DECLARE_DP_vd_vd_vd_vd(name, p) __DECLARE_vd_vd_vd_vd(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_DP_vd_vd_vd_vd(name, p) __CALL_vd_vd_vd_vd(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f64(svptrue_b8(), (double *)outbuf, vd0)
|
||||
|
||||
#define DECLARE_DP_vd_vd_pvd(name, p) __DECLARE_vd_vd_pvd(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_DP_vd_vd_pvd(name, p) __CALL_vd_vd_pvd(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f64(svptrue_b8(), (double *)outbuf, vd2)
|
||||
|
||||
#define DECLARE_DP_vi_vd(name, p) __DECLARE_vi_vd(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_DP_vi_vd(name, p) __CALL_vi_vd(name, ISA_TOKEN, VLA_TOKEN, p); svst1_s32(svptrue_b8(), (int *)outbuf, vi0)
|
||||
|
||||
#define DECLARE_DP_vd_vd_vi(name, p) __DECLARE_vd_vd_vi(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_DP_vd_vd_vi(name, p) __CALL_vd_vd_vi(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f64(svptrue_b8(), (double *)outbuf, vd0)
|
||||
|
||||
#define DECLARE_DP_v_vd_pvd_pvd(name, p) __DECLARE_v_vd_pvd_pvd(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_DP_v_vd_pvd_pvd(name, p) __CALL_v_vd_pvd_pvd(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f64(svptrue_b8(), (double *)outbuf, vd2)
|
||||
|
||||
#define DECLARE_SP_vf_vf(name, p) __DECLARE_vf_vf(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_SP_vf_vf(name, p) __CALL_vf_vf(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f32(svptrue_b8(), (float *)outbuf, vf0)
|
||||
|
||||
#define DECLARE_SP_vf_vf_vf(name, p) __DECLARE_vf_vf_vf(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_SP_vf_vf_vf(name, p) __CALL_vf_vf_vf(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f32(svptrue_b8(), (float *)outbuf, vf0)
|
||||
|
||||
#define DECLARE_SP_vf_vf_vf_vf(name, p) __DECLARE_vf_vf_vf_vf(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_SP_vf_vf_vf_vf(name, p) __CALL_vf_vf_vf_vf(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f32(svptrue_b8(), (float *)outbuf, vf0)
|
||||
|
||||
#define DECLARE_SP_vf_vf_pvf(name, p) __DECLARE_vf_vf_pvf(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_SP_vf_vf_pvf(name, p) __CALL_vf_vf_pvf(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f32(svptrue_b8(), (float *)outbuf, vf2)
|
||||
|
||||
#define DECLARE_SP_vi_vf(name, p) __DECLARE_vi_vf(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_SP_vi_vf(name, p) __CALL_vi_vf(name, ISA_TOKEN, VLA_TOKEN, p); svst1_s32(svptrue_b8(), (int *)outbuf, vi20)
|
||||
|
||||
#define DECLARE_SP_vf_vf_vi(name, p) __DECLARE_vf_vf_vi(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_SP_vf_vf_vi(name, p) __CALL_vf_vf_vi(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f32(svptrue_b8(), (float *)outbuf, vf0)
|
||||
|
||||
#define DECLARE_SP_v_vf_pvf_pvf(name, p) __DECLARE_v_vf_pvf_pvf(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_SP_v_vf_pvf_pvf(name, p) __CALL_v_vf_pvf_pvf(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f32(svptrue_b8(), (float *)outbuf, vf2)
|
||||
|
||||
#endif /* ENABLE_SVE */
|
||||
|
||||
//
|
||||
|
||||
// Douple precision function declarations.
|
||||
DECLARE_DP_vd_vd(__acos_finite, v);
|
||||
DECLARE_DP_vd_vd(__acosh_finite, v);
|
||||
DECLARE_DP_vd_vd(__asin_finite, v);
|
||||
DECLARE_DP_vd_vd_vd(__atan2_finite, vv);
|
||||
DECLARE_DP_vd_vd(__atanh_finite, v);
|
||||
DECLARE_DP_vd_vd(__cosh_finite, v);
|
||||
DECLARE_DP_vd_vd(__exp10_finite, v);
|
||||
DECLARE_DP_vd_vd(__exp2_finite, v);
|
||||
DECLARE_DP_vd_vd(__exp_finite, v);
|
||||
DECLARE_DP_vd_vd_vd(__fmod_finite, vv);
|
||||
DECLARE_DP_vd_vd_pvd(__modf_finite, vl8);
|
||||
DECLARE_DP_vd_vd_vd(__hypot_finite, vv);
|
||||
DECLARE_DP_vd_vd(__log10_finite, v);
|
||||
// DECLARE_DP_vd_vd(__log2_finite,v);
|
||||
DECLARE_DP_vd_vd(__log_finite, v);
|
||||
DECLARE_DP_vd_vd_vd(__pow_finite, vv);
|
||||
DECLARE_DP_vd_vd(__sinh_finite, v);
|
||||
DECLARE_DP_vd_vd(__sqrt_finite, v);
|
||||
DECLARE_DP_vd_vd(acos, v);
|
||||
DECLARE_DP_vd_vd(acosh, v);
|
||||
DECLARE_DP_vd_vd(asin, v);
|
||||
DECLARE_DP_vd_vd(asinh, v);
|
||||
DECLARE_DP_vd_vd(atan, v);
|
||||
DECLARE_DP_vd_vd_vd(atan2, vv);
|
||||
DECLARE_DP_vd_vd_vd(__atan2_finite, vv);
|
||||
DECLARE_DP_vd_vd(atanh, v);
|
||||
DECLARE_DP_vd_vd(cbrt, v);
|
||||
DECLARE_DP_vd_vd(ceil, v);
|
||||
DECLARE_DP_vd_vd_vd(copysign, vv);
|
||||
DECLARE_DP_vd_vd(cos, v);
|
||||
DECLARE_DP_vd_vd(cosh, v);
|
||||
DECLARE_DP_vd_vd(cospi, v);
|
||||
DECLARE_DP_vd_vd(erf, v);
|
||||
DECLARE_DP_vd_vd(erfc, v);
|
||||
DECLARE_DP_vd_vd(exp, v);
|
||||
DECLARE_DP_vd_vd(exp10, v);
|
||||
DECLARE_DP_vd_vd(exp2, v);
|
||||
DECLARE_DP_vi_vd(expfrexp, v);
|
||||
DECLARE_DP_vd_vd(expm1, v);
|
||||
DECLARE_DP_vd_vd(fabs, v);
|
||||
DECLARE_DP_vd_vd_vd(fdim, vv);
|
||||
DECLARE_DP_vd_vd(floor, v);
|
||||
DECLARE_DP_vd_vd_vd_vd(fma, vvv);
|
||||
DECLARE_DP_vd_vd_vd(fmax, vv);
|
||||
DECLARE_DP_vd_vd_vd(fmin, vv);
|
||||
DECLARE_DP_vd_vd_vd(fmod, vv);
|
||||
DECLARE_DP_vd_vd(frfrexp, v);
|
||||
DECLARE_DP_vd_vd_vd(hypot, vv);
|
||||
DECLARE_DP_vi_vd(ilogb, v);
|
||||
DECLARE_DP_vd_vd_vi(ldexp, vv);
|
||||
DECLARE_DP_vd_vd(lgamma, v);
|
||||
DECLARE_DP_vd_vd(log, v);
|
||||
DECLARE_DP_vd_vd(log10, v);
|
||||
DECLARE_DP_vd_vd(log1p, v);
|
||||
DECLARE_DP_vd_vd(log2, v);
|
||||
DECLARE_DP_vd_vd_pvd(modf, vl8);
|
||||
DECLARE_DP_vd_vd_vd(nextafter, vv);
|
||||
DECLARE_DP_vd_vd_vd(pow, vv);
|
||||
DECLARE_DP_vd_vd(rint, v);
|
||||
DECLARE_DP_vd_vd(round, v);
|
||||
DECLARE_DP_vd_vd(sin, v);
|
||||
DECLARE_DP_v_vd_pvd_pvd(sincos, vl8l8);
|
||||
DECLARE_DP_v_vd_pvd_pvd(sincospi, vl8l8);
|
||||
DECLARE_DP_vd_vd(sinh, v);
|
||||
DECLARE_DP_vd_vd(sinpi, v);
|
||||
DECLARE_DP_vd_vd(sqrt, v);
|
||||
DECLARE_DP_vd_vd(tan, v);
|
||||
DECLARE_DP_vd_vd(tanh, v);
|
||||
DECLARE_DP_vd_vd(tgamma, v);
|
||||
DECLARE_DP_vd_vd(trunc, v);
|
||||
|
||||
// Single precision function declarations.
|
||||
DECLARE_SP_vf_vf(__acosf_finite, v);
|
||||
DECLARE_SP_vf_vf(__acoshf_finite, v);
|
||||
// Single-precision (SP) declarations for the GNUABI compatibility
// tester.  Each DECLARE_* macro presumably emits an extern prototype
// for the vectorized libm entry point named in its first argument;
// the macro-name suffix encodes the signature (vf = vector float,
// vi = vector int, pvf = pointer to vector float, ...) and the second
// argument names the operand pattern consumed by the matching CALL_*
// macro later in the file.
// NOTE(review): the DECLARE_* macro definitions are above this chunk
// and not visible here - confirm the signature encoding against them.

// glibc finite-math (__*_finite) aliases.
DECLARE_SP_vf_vf(__asinf_finite, v);
DECLARE_SP_vf_vf_vf(__atan2f_finite, vv);
DECLARE_SP_vf_vf(__atanhf_finite, v);
DECLARE_SP_vf_vf(__coshf_finite, v);
DECLARE_SP_vf_vf(__exp10f_finite, v);
DECLARE_SP_vf_vf(__exp2f_finite, v);
DECLARE_SP_vf_vf(__expf_finite, v);
DECLARE_SP_vf_vf_vf(__fmodf_finite, vv);
DECLARE_SP_vf_vf_pvf(__modff_finite, vl4);
DECLARE_SP_vf_vf_vf(__hypotf_finite, vv);
DECLARE_SP_vf_vf(__log10f_finite, v);
// DECLARE_SP_vf_vf(__log2f_finite,v);
DECLARE_SP_vf_vf(__logf_finite, v);
DECLARE_SP_vf_vf_vf(__powf_finite, vv);
DECLARE_SP_vf_vf(__sinhf_finite, v);
DECLARE_SP_vf_vf(__sqrtf_finite, v);

// Standard C99 float math entry points.
DECLARE_SP_vf_vf(acosf, v);
DECLARE_SP_vf_vf(acoshf, v);
DECLARE_SP_vf_vf(asinf, v);
DECLARE_SP_vf_vf(asinhf, v);
DECLARE_SP_vf_vf(atanf, v);
DECLARE_SP_vf_vf_vf(atan2f, vv);
DECLARE_SP_vf_vf(atanhf, v);
DECLARE_SP_vf_vf(cbrtf, v);
DECLARE_SP_vf_vf(ceilf, v);
DECLARE_SP_vf_vf_vf(copysignf, vv);
DECLARE_SP_vf_vf(cosf, v);
DECLARE_SP_vf_vf(coshf, v);
DECLARE_SP_vf_vf(cospif, v);
DECLARE_SP_vf_vf(erff, v);
DECLARE_SP_vf_vf(erfcf, v);
DECLARE_SP_vf_vf(expf, v);
DECLARE_SP_vf_vf(exp10f, v);
DECLARE_SP_vf_vf(exp2f, v);
DECLARE_SP_vf_vf(expm1f, v);
DECLARE_SP_vf_vf(fabsf, v);
DECLARE_SP_vf_vf_vf(fdimf, vv);
DECLARE_SP_vf_vf(floorf, v);
DECLARE_SP_vf_vf_vf_vf(fmaf, vvv);
DECLARE_SP_vf_vf_vf(fmaxf, vv);
DECLARE_SP_vf_vf_vf(fminf, vv);
DECLARE_SP_vf_vf_vf(fmodf, vv);
DECLARE_SP_vf_vf(frfrexpf, v);
DECLARE_SP_vf_vf_vf(hypotf, vv);
#ifndef ENABLE_AVX
// These two functions are not checked in some configurations due to
// the issue in https://github.com/shibatch/sleef/issues/221
DECLARE_SP_vi_vf(expfrexpf, v);
DECLARE_SP_vi_vf(ilogbf, v);
#endif
DECLARE_SP_vf_vf_vi(ldexpf, vv);
DECLARE_SP_vf_vf(lgammaf, v);
DECLARE_SP_vf_vf(logf, v);
DECLARE_SP_vf_vf(log10f, v);
DECLARE_SP_vf_vf(log1pf, v);
DECLARE_SP_vf_vf(log2f, v);
DECLARE_SP_vf_vf_pvf(modff, vl4);
DECLARE_SP_vf_vf_vf(nextafterf, vv);
DECLARE_SP_vf_vf_vf(powf, vv);
DECLARE_SP_vf_vf(rintf, v);
DECLARE_SP_vf_vf(roundf, v);
DECLARE_SP_vf_vf(sinf, v);
DECLARE_SP_v_vf_pvf_pvf(sincosf, vl4l4);
DECLARE_SP_v_vf_pvf_pvf(sincospif, vl4l4);
DECLARE_SP_vf_vf(sinhf, v);
DECLARE_SP_vf_vf(sinpif, v);
DECLARE_SP_vf_vf(sqrtf, v);
DECLARE_SP_vf_vf(tanf, v);
DECLARE_SP_vf_vf(tanhf, v);
DECLARE_SP_vf_vf(tgammaf, v);
DECLARE_SP_vf_vf(truncf, v);

#ifndef ENABLE_SVE
// File-scope operands the CALL_* macros read/write by name.
vdouble vd0, vd1, vd2, vd3;
vfloat vf0, vf1, vf2, vf3;
vint vi0, vi1, vi2, vi3;
vint2 vi20, vi21, vi22, vi23;
vopmask mask;
#else
// SVE vector types are sizeless and cannot be file-scope objects, so
// only a plain output buffer lives at file scope in that build.
volatile char outbuf[1024];
#endif
// Invokes one representative double-precision routine
// (__acos_finite) through the CALL_DP macro and returns 1.
// NOTE(review): presumably used as a probe that the required vector
// ISA extension is usable at runtime (the caller interprets survival
// of the call as success) - confirm against the harness that calls it.
int check_feature(double d, float f) {
#ifdef ENABLE_SVE
  // SVE build: the vd0/vd1 operands must be locals (sizeless types
  // cannot be globals); seed them from the scalar arguments.
  vdouble vd0 = svdup_n_f64(d), vd1 = svdup_n_f64(d);
#ifdef MASKED_GNUABI
  vopmask mask = svcmpne_s32(svptrue_b8(), svdup_n_s32(f), svdup_n_s32(0));
#endif
#endif

  CALL_DP_vd_vd(__acos_finite, v);
#ifdef ENABLE_SVE
  // Store the result to outbuf so the call above has an observable
  // effect and cannot be optimized away.
  svst1_f64(svptrue_b8(), (double *)outbuf, vd0);
#endif
  return 1;
}
// Exercises every GNUABI entry point declared above exactly once, via
// the CALL_DP_* / CALL_SP_* macros.  The macros presumably expand to a
// real call using the vd*/vf*/vi* operands named by their second
// argument, so linking this translation unit fails if any expected
// symbol is missing from the library under test.  Always returns 0.
// NOTE(review): CALL_* macro definitions are outside this chunk -
// confirm the operand encoding against them.
int main2(int argc, char **argv) {
#ifdef ENABLE_SVE
  // SVE build: operands are locals (sizeless types cannot be globals),
  // seeded from argc so the compiler cannot constant-fold them.
  vdouble vd0 = svdup_n_f64(argc), vd1 = svdup_n_f64(argc), vd2 = svdup_n_f64(argc), vd3 = svdup_n_f64(argc);
  vfloat vf0 = svdup_n_f32(argc), vf1 = svdup_n_f32(argc), vf2 = svdup_n_f32(argc), vf3 = svdup_n_f32(argc);
  vint vi0 = svdup_n_s32(argc), vi2 = svdup_n_s32(argc);
  vint2 vi20 = svdup_n_s32(argc), vi22 = svdup_n_s32(argc);
#ifdef MASKED_GNUABI
  vopmask mask = svcmpne_s32(svptrue_b8(), svdup_n_s32(argc), svdup_n_s32(0));
#endif
#endif

  // Double precision function call.
  CALL_DP_vd_vd(__acos_finite, v);
  CALL_DP_vd_vd(__acosh_finite, v);
  CALL_DP_vd_vd(__asin_finite, v);
  CALL_DP_vd_vd_vd(__atan2_finite, vv);
  CALL_DP_vd_vd(__atanh_finite, v);
  CALL_DP_vd_vd(__cosh_finite, v);
  CALL_DP_vd_vd(__exp10_finite, v);
  CALL_DP_vd_vd(__exp2_finite, v);
  CALL_DP_vd_vd(__exp_finite, v);
  CALL_DP_vd_vd_vd(__fmod_finite, vv);
  CALL_DP_vd_vd_pvd(__modf_finite, vl8);
  CALL_DP_vd_vd_vd(__hypot_finite, vv);
  CALL_DP_vd_vd(__log10_finite, v);
  // CALL_DP_vd_vd(__log2_finite,v);
  CALL_DP_vd_vd(__log_finite, v);
  CALL_DP_vd_vd_vd(__pow_finite, vv);
  CALL_DP_vd_vd(__sinh_finite, v);
  CALL_DP_vd_vd(__sqrt_finite, v);
  CALL_DP_vd_vd(acos, v);
  CALL_DP_vd_vd(acosh, v);
  CALL_DP_vd_vd(asin, v);
  CALL_DP_vd_vd(asinh, v);
  CALL_DP_vd_vd(atan, v);
  CALL_DP_vd_vd_vd(atan2, vv);
  CALL_DP_vd_vd(atanh, v);
  CALL_DP_vd_vd(cbrt, v);
  CALL_DP_vd_vd(ceil, v);
  CALL_DP_vd_vd_vd(copysign, vv);
  CALL_DP_vd_vd(cos, v);
  CALL_DP_vd_vd(cosh, v);
  CALL_DP_vd_vd(cospi, v);
  CALL_DP_vd_vd(erf, v);
  CALL_DP_vd_vd(erfc, v);
  CALL_DP_vd_vd(exp, v);
  CALL_DP_vd_vd(exp10, v);
  CALL_DP_vd_vd(exp2, v);
  CALL_DP_vi_vd(expfrexp, v);
  CALL_DP_vd_vd(expm1, v);
  CALL_DP_vd_vd(fabs, v);
  CALL_DP_vd_vd_vd(fdim, vv);
  CALL_DP_vd_vd(floor, v);
  CALL_DP_vd_vd_vd_vd(fma, vvv);
  CALL_DP_vd_vd_vd(fmax, vv);
  CALL_DP_vd_vd_vd(fmin, vv);
  CALL_DP_vd_vd_vd(fmod, vv);
  CALL_DP_vd_vd(frfrexp, v);
  CALL_DP_vd_vd_vd(hypot, vv);
  CALL_DP_vi_vd(ilogb, v);
  CALL_DP_vd_vd_vi(ldexp, vv);
  CALL_DP_vd_vd(lgamma, v);
  CALL_DP_vd_vd(log, v);
  CALL_DP_vd_vd(log10, v);
  CALL_DP_vd_vd(log1p, v);
  CALL_DP_vd_vd(log2, v);
  CALL_DP_vd_vd_pvd(modf, vl8);
  CALL_DP_vd_vd_vd(nextafter, vv);
  CALL_DP_vd_vd_vd(pow, vv);
  CALL_DP_vd_vd(rint, v);
  CALL_DP_vd_vd(round, v);
  CALL_DP_vd_vd(sin, v);
  CALL_DP_v_vd_pvd_pvd(sincos, vl8l8);
  CALL_DP_v_vd_pvd_pvd(sincospi, vl8l8);
  CALL_DP_vd_vd(sinh, v);
  CALL_DP_vd_vd(sinpi, v);
  CALL_DP_vd_vd(sqrt, v);
  CALL_DP_vd_vd(tan, v);
  CALL_DP_vd_vd(tanh, v);
  CALL_DP_vd_vd(tgamma, v);
  CALL_DP_vd_vd(trunc, v);

  // Single precision function call.
  CALL_SP_vf_vf(__acosf_finite, v);
  CALL_SP_vf_vf(__acoshf_finite, v);
  CALL_SP_vf_vf(__asinf_finite, v);
  CALL_SP_vf_vf_vf(__atan2f_finite, vv);
  CALL_SP_vf_vf(__atanhf_finite, v);
  CALL_SP_vf_vf(__coshf_finite, v);
  CALL_SP_vf_vf(__exp10f_finite, v);
  CALL_SP_vf_vf(__exp2f_finite, v);
  CALL_SP_vf_vf(__expf_finite, v);
  CALL_SP_vf_vf_vf(__fmodf_finite, vv);
  CALL_SP_vf_vf_pvf(__modff_finite, vl4);
  CALL_SP_vf_vf_vf(__hypotf_finite, vv);
  CALL_SP_vf_vf(__log10f_finite, v);
  // CALL_SP_vf_vf(__log2f_finite,v);
  CALL_SP_vf_vf(__logf_finite, v);
  CALL_SP_vf_vf_vf(__powf_finite, vv);
  CALL_SP_vf_vf(__sinhf_finite, v);
  CALL_SP_vf_vf(__sqrtf_finite, v);
  CALL_SP_vf_vf(acosf, v);
  CALL_SP_vf_vf(acoshf, v);
  CALL_SP_vf_vf(asinf, v);
  CALL_SP_vf_vf(asinhf, v);
  CALL_SP_vf_vf(atanf, v);
  CALL_SP_vf_vf_vf(atan2f, vv);
  CALL_SP_vf_vf(atanhf, v);
  CALL_SP_vf_vf(cbrtf, v);
  CALL_SP_vf_vf(ceilf, v);
  CALL_SP_vf_vf_vf(copysignf, vv);
  CALL_SP_vf_vf(cosf, v);
  CALL_SP_vf_vf(coshf, v);
  CALL_SP_vf_vf(cospif, v);
  CALL_SP_vf_vf(erff, v);
  CALL_SP_vf_vf(erfcf, v);
  CALL_SP_vf_vf(expf, v);
  CALL_SP_vf_vf(exp10f, v);
  CALL_SP_vf_vf(exp2f, v);
  CALL_SP_vf_vf(expm1f, v);
  CALL_SP_vf_vf(fabsf, v);
  CALL_SP_vf_vf_vf(fdimf, vv);
  CALL_SP_vf_vf(floorf, v);
  CALL_SP_vf_vf_vf_vf(fmaf, vvv);
  CALL_SP_vf_vf_vf(fmaxf, vv);
  CALL_SP_vf_vf_vf(fminf, vv);
  CALL_SP_vf_vf_vf(fmodf, vv);
  CALL_SP_vf_vf(frfrexpf, v);
  CALL_SP_vf_vf_vf(hypotf, vv);
#ifndef ENABLE_AVX
  // These two functions are not checked in some configurations due to
  // the issue in https://github.com/shibatch/sleef/issues/221
  CALL_SP_vi_vf(expfrexpf, v);
  CALL_SP_vi_vf(ilogbf, v);
#endif
  CALL_SP_vf_vf_vi(ldexpf, vv);
  CALL_SP_vf_vf(lgammaf, v);
  CALL_SP_vf_vf(logf, v);
  CALL_SP_vf_vf(log10f, v);
  CALL_SP_vf_vf(log1pf, v);
  CALL_SP_vf_vf(log2f, v);
  CALL_SP_vf_vf_pvf(modff, vl4);
  CALL_SP_vf_vf_vf(nextafterf, vv);
  CALL_SP_vf_vf_vf(powf, vv);
  CALL_SP_vf_vf(rintf, v);
  CALL_SP_vf_vf(roundf, v);
  CALL_SP_vf_vf(sinf, v);
  CALL_SP_v_vf_pvf_pvf(sincosf, vl4l4);
  CALL_SP_v_vf_pvf_pvf(sincospif, vl4l4);
  CALL_SP_vf_vf(sinhf, v);
  CALL_SP_vf_vf(sinpif, v);
  CALL_SP_vf_vf(sqrtf, v);
  CALL_SP_vf_vf(tanf, v);
  CALL_SP_vf_vf(tanhf, v);
  CALL_SP_vf_vf(tgammaf, v);
  CALL_SP_vf_vf(truncf, v);

  return 0;
}
@@ -0,0 +1,129 @@
|
||||
sin u35 bc50dfbcbd8ef534541d1babe90860c7
|
||||
sin u10 dbc2cf81f292ef50fa0119e222c6c9f9
|
||||
cos u35 506e34a809b80ad3603ed46ba2a574b0
|
||||
cos u10 a0f69df5937152b8f8f0e671f3676289
|
||||
tan u35 970b5cd7f0e05defa22ebb155ab61a40
|
||||
tan u10 5fd08e0552e3ab853439bf5fd2bd344d
|
||||
sincos u10 7c164edcaa45988f6165b653fc76c495
|
||||
sincos u35 38fe7e261e184ed8dbf432ce6bedc5c4
|
||||
sincospi u05 0c6fc00c7aaf0b6e67d66542d1be833d
|
||||
sincospi u35 c428b0fc3e6c5be4d2c03dcd8bb27a7c
|
||||
log u10 4855b27222d900bea47a27cadba71727
|
||||
log u35 c95484de57c167da3d8d6d1baadf9ffa
|
||||
log2 u10 2662df9af919680ca62e1752fb1b7539
|
||||
log2 u35 1cd6d7f194a5e8364191497adc5c5cec
|
||||
log10 u10 36645e8031d873d66fd0ec2c5959f273
|
||||
log1p u10 1383924fb56cf2e7eda27de21320c591
|
||||
exp u10 13692a48edf2cf7a3e047b16ddfb7b81
|
||||
exp2 u10 436146f8d6dcaa4a754837108a9aa3e1
|
||||
exp2 u35 8881d075d9101a1dfa3f6a10b9ee8373
|
||||
exp10 u10 9d704b310f683872a6446cfc97726a4d
|
||||
exp10 u35 bc07745ebc22a7ee97679154c24b23cc
|
||||
expm1 u10 cd3f0b8e86943d52c278394b60e2d22e
|
||||
pow u10 a0ea63b27d33262346a35c9439741075
|
||||
cbrt u10 5d8bf28ac74624594fd1be9217817690
|
||||
cbrt u10 3c896e03746bcf1b3f70182dfec3d93b
|
||||
cbrt u35 73daa306764e208aab1627ac110b10d7
|
||||
cbrt u35 c29b7bf200215425b4ba948c8cc94c42
|
||||
hypot u05 cc2f18e409e19a02cadf7b91fd869120
|
||||
hypot u35 5194e0a554174a6145511ce3df9c1f46
|
||||
asin u10 86c061caec3fa2e1bc71bda4dad29f4c
|
||||
asin u35 31303b88bdc00206265002d6cc5e89e4
|
||||
acos u10 0a1a403590f2ac8364f132b334920945
|
||||
acos u35 493f960c1cce57931d95a5a22a0587a3
|
||||
atan u10 c97624a24ec034cc0c8985acb61d13cd
|
||||
atan u10 0be0f550406923016cfeb5ef62c25b15
|
||||
atan u35 9d6d83e066b5a4851d44771418c9948c
|
||||
atan u35 f32c1aa4caa08c6945afd1125ba8b113
|
||||
atan2 u10 6b1d9d25fcd96053acc19d1633fab36a
|
||||
atan2 u35 afb07894347062a96dab705b34eb1763
|
||||
sinh u10 61d459b1f368087f6f23ebf8e9f0ea01
|
||||
cosh u10 f77eb95f79e274c12b4e92dc0389259b
|
||||
tanh u10 2bb9dd54ed0fa22bb5f3b6d557eb58a3
|
||||
asinh u10 01136e54e2a434839530dda54f33cfdb
|
||||
acosh u10 2f3c28c9ee2eb2b3d5659c6cb2a58e3e
|
||||
atanh u10 601a77ba8c1d5175f2808b48a41260c1
|
||||
lgamma u10 90cdc41063f4198c6ad592c0cdd0f5da
|
||||
tgamma u10 6f864c3a1f17fbdf914cac7ffcd82cb7
|
||||
erf u10 f4ae148b59bb7501d8f5746300850376
|
||||
erfc u15 5e116a4316dafa742769f71e18f6f9fe
|
||||
fabs bef2f2ac8a4789357e580b4da4f9b9fe
|
||||
copysign 3219022f267464e3704f90558e8df3bc
|
||||
fmax 4e4f5220ccfef191864c316df0d18fc0
|
||||
fmin c0f8effb6c611e2b3b91b820ad943f62
|
||||
fdim e876d103931f18ceede5bfd7e3df7ab0
|
||||
fmod 618aa751e13012afdb41ec80dd35e6ba
|
||||
remainder 8d692dbb44bbc9be5af0c0657d3008b8
|
||||
modf f03ce73cd4f9ea7f69c017f6e53355d5
|
||||
nextafter 9eba4e30d12d74dc4e8003fcff0f1582
|
||||
trunc 1bc7e909eba121dcef7f0e4046937ae5
|
||||
floor 2cff66b499dc8a30cec9467de659b774
|
||||
ceil b080e632dcb8f8134d8715752be12917
|
||||
round 8907e21687ca9c2a539297536e754950
|
||||
rint e49f837096bc661fe1c742801dd99a30
|
||||
sinf u35 833d845950b9cbb025629fe4c040f8f6
|
||||
sinf u10 9c21afa4d7d6af3fc666309c3cd647fe
|
||||
cosf u35 74d7f871a6553cd0019087895e2052ad
|
||||
cosf u10 35349e94c323c1614f22093959288010
|
||||
tanf u35 bbb7c092d017e96d2454a38a20687735
|
||||
tanf u10 227423bc04f42d76a8f68082ba696126
|
||||
sincosf u10 83ecc4e3d5295056e9d8c52bc196b666
|
||||
sincosf u35 533319caa49a961e4909bd6dcab40721
|
||||
sincospif u05 8b3762b67a661957c1414c351ec49034
|
||||
sincospif u35 cec15ed76a358091632634166fa77b66
|
||||
logf u10 c5a90119943acc4199e1cc7030b5def8
|
||||
logf u35 af2fbe4bfa2caaf59c734e3749dd15be
|
||||
log2f u10 ba8acae369bbb7b6404cccbc633fe25b
|
||||
log2f u35 ba32ebaa8c470899ebd433d190c00f03
|
||||
log10f u10 7e235a82d960e4434575dd39648d8bb7
|
||||
log1pf u10 350fc4f13502b36bb1107e1b1122acb1
|
||||
expf u10 ee4adaabefa3fac6c0f1925b2a948eea
|
||||
exp2f u10 b0d283dbae0f36f1b3c7eed9871f0d0d
|
||||
exp2f u35 522cc30f722f77fceb07015830b351a3
|
||||
exp10f u10 b0564be151965600f5744ff2e4992bc9
|
||||
exp10f u35 d142f1fb40e44f0c9e042718f27ee3e0
|
||||
expm1f u10 ebfd6498cb40f61b609882de8a7f3c74
|
||||
powf u10 a7cba3239c87969662e8b41a4dd8b4ab
|
||||
cbrtf u10 01c5cac23fe21638be1c3eab6e368fd6
|
||||
cbrtf u10 2a245b03f83e9114644d03b40dac707b
|
||||
cbrtf u35 3ce62350fd585f0524a12c974fbe6cf5
|
||||
cbrtf u35 2aca0404626a28f7af7f60105ad6e217
|
||||
hypotf u05 bc5971cbeebee27b4c0d91fbe3f6bf30
|
||||
hypotf u35 a6f0f774b346a6bba08889ff9ba3f193
|
||||
asinf u10 7f77f7453b961512c89e87e49c549cfe
|
||||
asinf u35 22ed8760aa328e1f714031eec592a4d8
|
||||
acosf u10 15617dd0429b90e59d2923415934c2a6
|
||||
acosf u35 af0b132d9e263721f9296187dbf9b9bf
|
||||
atanf u10 26b77fb423104b45633cf24500237d6e
|
||||
atanf u10 4313d0bc2708de53f74d804aac6564d4
|
||||
atanf u35 97a1797897955643c722c7d291987331
|
||||
atanf u35 7d3f47169415058e8578f11d899bfd10
|
||||
atan2f u10 098a33f730fe95ce4774a991db4cee14
|
||||
atan2f u35 56fc6bd8349979f0d0b1dcdb57f68363
|
||||
sinhf u10 0780a2f57df3a831718195d1ee5c19ef
|
||||
coshf u10 cfbb6aed408e43a7b7f053474100ff2d
|
||||
tanhf u10 d19f254d41e8726c748df87b95bc9acd
|
||||
asinhf u10 260d129221468a86bbfd609c27bfea6a
|
||||
acoshf u10 24ced7e5631c78b20a5716faeedbaa92
|
||||
atanhf u10 164fd77b8372b8c131baaacab1c9e650
|
||||
lgammaf u10 3bf6d824175c4f4d86f3073064e41e84
|
||||
tgammaf u10 f3a8d25c852068622bdfcae4cb813583
|
||||
erff u10 f34af3814153de040b93e573ca7d21d8
|
||||
erfcf u15 915ab9830de89a5a504b3ce7cd2fecda
|
||||
fabsf a3c72220bc0ade68fe22e0a15eb730d4
|
||||
copysignf 6b35517b8e1da78d9c9b52915d9a9b19
|
||||
fmaxf 9833a60a2080e8fd9ae8de32c758966f
|
||||
fminf 2dcfa19e1f1ab4973a7dec9f2cc09fa0
|
||||
fdimf c5c0fe7b095eb8ccbb19fbf934a36b24
|
||||
fmodf 77aa84a9703e202a56e5f4609bd2482b
|
||||
remainderf 5a453b1217c173e4dc0b0211066750be
|
||||
modff 5fa4f044f20478216aa085a01b189697
|
||||
nextafterf 517c1c8f072e9024518d3d9ead98b85b
|
||||
truncf 6937050850be63c44d4b7dbd666febe6
|
||||
floorf 9341be69ee345c8554bf3ab4e9316133
|
||||
ceilf c70874771cbe9741f1f05fedd4b629e9
|
||||
roundf 0cf52f6b8015099771e9a7dfa6b090bc
|
||||
rintf bed68e788e2b11543c09c9d52198abf8
|
||||
fastsinf u3500 8eb51f86fb40414dd21284f020f24b6c
|
||||
fastcosf u3500 69cbc3703f1d2c68695b00b1b09287b2
|
||||
fastpowf u3500 e02e6a692cfa22a6b7149168c67ea1d2
|
||||
@@ -0,0 +1,129 @@
|
||||
sin u35 c163e4a7e9ccebb2181dcc8653367d8c
|
||||
sin u10 0d6bf6f2c935db82588222da95659019
|
||||
cos u35 52f902bd939d751b5b544ac70181fcff
|
||||
cos u10 afcdba92a75a76d56b8cf2f22d4bec9e
|
||||
tan u35 906cc42b6755fe514c5e185fcb4d2f55
|
||||
tan u10 c98f29a62067fa63646d9bcc29a310c6
|
||||
sincos u10 3fe37f4eb805505152f2b14a22a9f94e
|
||||
sincos u35 95a7b7f48c71febf10ec6eff796dd391
|
||||
sincospi u05 0c6fc00c7aaf0b6e67d66542d1be833d
|
||||
sincospi u35 c428b0fc3e6c5be4d2c03dcd8bb27a7c
|
||||
log u10 4855b27222d900bea47a27cadba71727
|
||||
log u35 015f8ae899c9b921d48919dd12ef19a9
|
||||
log2 u10 2662df9af919680ca62e1752fb1b7539
|
||||
log2 u35 908b1949db34ea855944f00089b21e23
|
||||
log10 u10 36645e8031d873d66fd0ec2c5959f273
|
||||
log1p u10 1383924fb56cf2e7eda27de21320c591
|
||||
exp u10 084e5be89c2ad03e356078ea4f287bab
|
||||
exp2 u10 6e36db9ae2cf9eca82e3d9157c622351
|
||||
exp2 u35 6e36db9ae2cf9eca82e3d9157c622351
|
||||
exp10 u10 0cc08bc6a3d08d6e61450b5370c6161e
|
||||
exp10 u35 6904d5509ca794747aa249c13886f90f
|
||||
expm1 u10 cd3f0b8e86943d52c278394b60e2d22e
|
||||
pow u10 7e19796027d7c1d1999be948f90e6181
|
||||
cbrt u10 5d8bf28ac74624594fd1be9217817690
|
||||
cbrt u10 3c896e03746bcf1b3f70182dfec3d93b
|
||||
cbrt u35 fc7ee3e3e6c54365d708b752c242a947
|
||||
cbrt u35 2408714a56d74f8c82389ca6772cdbc1
|
||||
hypot u05 cc2f18e409e19a02cadf7b91fd869120
|
||||
hypot u35 be7bbd41dffd746b70261ee773cbd4b2
|
||||
asin u10 8a21b7c28cdaffc9d3e53f415367932e
|
||||
asin u35 9c9e8107782898e9faed6924ad1b3cb1
|
||||
acos u10 28261e4eb8331865660c814676d5c6bc
|
||||
acos u35 310911130bfc45b10dabe3a072939331
|
||||
atan u10 f931de72f2f6a7928f307a8a382ae255
|
||||
atan u10 453f9ef62f58f9829320baf482a1d457
|
||||
atan u35 6161b6189609f105b017d8768d0a41f1
|
||||
atan u35 6face71d8d93c69448d49ed6140e361d
|
||||
atan2 u10 469babaeee9bd30e17af2f473b3ea500
|
||||
atan2 u35 6a3e764125aab2a0a13e7a0d9ec02f7f
|
||||
sinh u10 61d459b1f368087f6f23ebf8e9f0ea01
|
||||
cosh u10 f77eb95f79e274c12b4e92dc0389259b
|
||||
tanh u10 2bb9dd54ed0fa22bb5f3b6d557eb58a3
|
||||
asinh u10 01136e54e2a434839530dda54f33cfdb
|
||||
acosh u10 2f3c28c9ee2eb2b3d5659c6cb2a58e3e
|
||||
atanh u10 601a77ba8c1d5175f2808b48a41260c1
|
||||
lgamma u10 90cdc41063f4198c6ad592c0cdd0f5da
|
||||
tgamma u10 cb9a93844ad1713d2ab92ff5b6398150
|
||||
erf u10 8a0bc2146a5c67b6bebc58f4b0076568
|
||||
erfc u15 3e247a54183eeddedc33e99c50118995
|
||||
fabs bef2f2ac8a4789357e580b4da4f9b9fe
|
||||
copysign 3219022f267464e3704f90558e8df3bc
|
||||
fmax 4e4f5220ccfef191864c316df0d18fc0
|
||||
fmin c0f8effb6c611e2b3b91b820ad943f62
|
||||
fdim e876d103931f18ceede5bfd7e3df7ab0
|
||||
fmod 618aa751e13012afdb41ec80dd35e6ba
|
||||
remainder 8d692dbb44bbc9be5af0c0657d3008b8
|
||||
modf f03ce73cd4f9ea7f69c017f6e53355d5
|
||||
nextafter 9eba4e30d12d74dc4e8003fcff0f1582
|
||||
trunc 1bc7e909eba121dcef7f0e4046937ae5
|
||||
floor 2cff66b499dc8a30cec9467de659b774
|
||||
ceil b080e632dcb8f8134d8715752be12917
|
||||
round 8907e21687ca9c2a539297536e754950
|
||||
rint e49f837096bc661fe1c742801dd99a30
|
||||
sinf u35 f8f804eae1d9443103e81fec96293477
|
||||
sinf u10 3f12a7381f1cbb1830d92b4ec72d21fe
|
||||
cosf u35 f2f3d1c9f090cde9c02439608dc7066e
|
||||
cosf u10 dc35f27fae65f63f0aa6ad241f8b387b
|
||||
tanf u35 68d42ad1fb412e6b8be3853461e61213
|
||||
tanf u10 97df301d4f59e67d5318b5356b703f06
|
||||
sincosf u10 a97124d810ec461c135dc4fb0c059b6f
|
||||
sincosf u35 0cc521e52ae1227d311012c2919c1ff2
|
||||
sincospif u05 8b3762b67a661957c1414c351ec49034
|
||||
sincospif u35 8720757f221c00cc8de24b7dc4949144
|
||||
logf u10 c5a90119943acc4199e1cc7030b5def8
|
||||
logf u35 b6234302d534d6ccd48155dd6b9a4293
|
||||
log2f u10 ba8acae369bbb7b6404cccbc633fe25b
|
||||
log2f u35 74174c90717c86642b71284452a8aef6
|
||||
log10f u10 7e235a82d960e4434575dd39648d8bb7
|
||||
log1pf u10 e53dbfa80bcc1a7bcfd21000e6950475
|
||||
expf u10 9597388315e4b3e89c4c97ce46374dcf
|
||||
exp2f u10 42d66e5e4cb88feb29c5b36c632159a5
|
||||
exp2f u35 42d66e5e4cb88feb29c5b36c632159a5
|
||||
exp10f u10 954f0824b6d949d0da03b49950dc6642
|
||||
exp10f u35 6fb0e9a829e12a06679d379d05b53ede
|
||||
expm1f u10 ebfd6498cb40f61b609882de8a7f3c74
|
||||
powf u10 2ed84af40d03e307a620365f172d010d
|
||||
cbrtf u10 01c5cac23fe21638be1c3eab6e368fd6
|
||||
cbrtf u10 2a245b03f83e9114644d03b40dac707b
|
||||
cbrtf u35 6c22a6dc132c5212250970f22f42256d
|
||||
cbrtf u35 5ab696ae11f9637413d30e6496d5324b
|
||||
hypotf u05 bc5971cbeebee27b4c0d91fbe3f6bf30
|
||||
hypotf u35 2a7cd97768287084b7fffc7e9fb39072
|
||||
asinf u10 e2e571a01984c4ffb3f6e38e0328d90e
|
||||
asinf u35 70df2dfc3a3569868cce60c38e7b1962
|
||||
acosf u10 5180fde4b02a0ca4cd75f0a786a1bfeb
|
||||
acosf u35 72b0e2f9791f90f1c43570b9e9ba893f
|
||||
atanf u10 fa672e387a204055f735b7af98dd8a35
|
||||
atanf u10 d017670c13bc221b68bc9ee5f41c4b5e
|
||||
atanf u35 f592e46eaa5d29583f86d3e336f20b6b
|
||||
atanf u35 e7087fe40de46921826b373d10c40954
|
||||
atan2f u10 275b2fa8ee554c45551bb142db9f8197
|
||||
atan2f u35 44b187851195d24bab2561eb8f4ff5d0
|
||||
sinhf u10 45bc228a14c3e39eeb35e9764394a23e
|
||||
coshf u10 838d441e85d415ef4fb1e5c5ea966a71
|
||||
tanhf u10 d19f254d41e8726c748df87b95bc9acd
|
||||
asinhf u10 927eeb621a3e2d5039f1a07fcf150901
|
||||
acoshf u10 932520013273174fcabe2be4a55f919f
|
||||
atanhf u10 164fd77b8372b8c131baaacab1c9e650
|
||||
lgammaf u10 3bf6d824175c4f4d86f3073064e41e84
|
||||
tgammaf u10 c3059747811d98846f74a63d3747ac3d
|
||||
erff u10 f34af3814153de040b93e573ca7d21d8
|
||||
erfcf u15 687a9c577512d349ddbc0643013d2c56
|
||||
fabsf a3c72220bc0ade68fe22e0a15eb730d4
|
||||
copysignf 6b35517b8e1da78d9c9b52915d9a9b19
|
||||
fmaxf 9833a60a2080e8fd9ae8de32c758966f
|
||||
fminf 2dcfa19e1f1ab4973a7dec9f2cc09fa0
|
||||
fdimf c5c0fe7b095eb8ccbb19fbf934a36b24
|
||||
fmodf 77aa84a9703e202a56e5f4609bd2482b
|
||||
remainderf 5a453b1217c173e4dc0b0211066750be
|
||||
modff 5fa4f044f20478216aa085a01b189697
|
||||
nextafterf 517c1c8f072e9024518d3d9ead98b85b
|
||||
truncf 6937050850be63c44d4b7dbd666febe6
|
||||
floorf 9341be69ee345c8554bf3ab4e9316133
|
||||
ceilf c70874771cbe9741f1f05fedd4b629e9
|
||||
roundf 0cf52f6b8015099771e9a7dfa6b090bc
|
||||
rintf bed68e788e2b11543c09c9d52198abf8
|
||||
fastsinf u3500 5c48081c74cd0316379b580b047dbfc2
|
||||
fastcosf u3500 6f73d116f109283e5632c31f5988f55b
|
||||
fastpowf u3500 6dbb3110412df4fed5a71f50d40def89
|
||||
@@ -0,0 +1,777 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <inttypes.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#if defined(__MINGW32__) || defined(__MINGW64__) || defined(_MSC_VER)
|
||||
#define STDIN_FILENO 0
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#include <sys/types.h>
|
||||
#endif
|
||||
|
||||
#include "sleef.h"
|
||||
#include "testerutil.h"
|
||||
|
||||
#define DORENAME
|
||||
#include "rename.h"
|
||||
|
||||
#define BUFSIZE 1024
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
char buf[BUFSIZE];
|
||||
|
||||
printf("3\n");
|
||||
fflush(stdout);
|
||||
|
||||
for(;;) {
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;
|
||||
|
||||
if (startsWith(buf, "sin ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "sin %" PRIx64, &u);
|
||||
u = d2u(xsin(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "sin_u1 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "sin_u1 %" PRIx64, &u);
|
||||
u = d2u(xsin_u1(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "cos ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "cos %" PRIx64, &u);
|
||||
u = d2u(xcos(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "cos_u1 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "cos_u1 %" PRIx64, &u);
|
||||
u = d2u(xcos_u1(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "sincos ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "sincos %" PRIx64, &u);
|
||||
Sleef_double2 x = xsincos(u2d(u));
|
||||
printf("%" PRIx64 " %" PRIx64 "\n", d2u(x.x), d2u(x.y));
|
||||
} else if (startsWith(buf, "sincos_u1 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "sincos_u1 %" PRIx64, &u);
|
||||
Sleef_double2 x = xsincos_u1(u2d(u));
|
||||
printf("%" PRIx64 " %" PRIx64 "\n", d2u(x.x), d2u(x.y));
|
||||
} else if (startsWith(buf, "sincospi_u05 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "sincospi_u05 %" PRIx64, &u);
|
||||
Sleef_double2 x = xsincospi_u05(u2d(u));
|
||||
printf("%" PRIx64 " %" PRIx64 "\n", d2u(x.x), d2u(x.y));
|
||||
} else if (startsWith(buf, "sincospi_u35 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "sincospi_u35 %" PRIx64, &u);
|
||||
Sleef_double2 x = xsincospi_u35(u2d(u));
|
||||
printf("%" PRIx64 " %" PRIx64 "\n", d2u(x.x), d2u(x.y));
|
||||
} else if (startsWith(buf, "sinpi_u05 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "sinpi_u05 %" PRIx64, &u);
|
||||
u = d2u(xsinpi_u05(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "cospi_u05 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "cospi_u05 %" PRIx64, &u);
|
||||
u = d2u(xcospi_u05(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "tan ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "tan %" PRIx64, &u);
|
||||
u = d2u(xtan(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "tan_u1 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "tan_u1 %" PRIx64, &u);
|
||||
u = d2u(xtan_u1(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "asin ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "asin %" PRIx64, &u);
|
||||
u = d2u(xasin(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "acos ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "acos %" PRIx64, &u);
|
||||
u = d2u(xacos(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "atan ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "atan %" PRIx64, &u);
|
||||
u = d2u(xatan(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "log ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "log %" PRIx64, &u);
|
||||
u = d2u(xlog(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "exp ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "exp %" PRIx64, &u);
|
||||
u = d2u(xexp(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "atan2 ")) {
|
||||
uint64_t u, v;
|
||||
sscanf(buf, "atan2 %" PRIx64 " %" PRIx64, &u, &v);
|
||||
u = d2u(xatan2(u2d(u), u2d(v)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "asin_u1 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "asin_u1 %" PRIx64, &u);
|
||||
u = d2u(xasin_u1(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "acos_u1 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "acos_u1 %" PRIx64, &u);
|
||||
u = d2u(xacos_u1(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "atan_u1 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "atan_u1 %" PRIx64, &u);
|
||||
u = d2u(xatan_u1(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "atan2_u1 ")) {
|
||||
uint64_t u, v;
|
||||
sscanf(buf, "atan2_u1 %" PRIx64 " %" PRIx64, &u, &v);
|
||||
u = d2u(xatan2_u1(u2d(u), u2d(v)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "log_u1 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "log_u1 %" PRIx64, &u);
|
||||
u = d2u(xlog_u1(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "pow ")) {
|
||||
uint64_t u, v;
|
||||
sscanf(buf, "pow %" PRIx64 " %" PRIx64, &u, &v);
|
||||
u = d2u(xpow(u2d(u), u2d(v)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "sinh ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "sinh %" PRIx64, &u);
|
||||
u = d2u(xsinh(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "cosh ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "cosh %" PRIx64, &u);
|
||||
u = d2u(xcosh(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "tanh ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "tanh %" PRIx64, &u);
|
||||
u = d2u(xtanh(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "sinh_u35 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "sinh_u35 %" PRIx64, &u);
|
||||
u = d2u(xsinh_u35(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "cosh_u35 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "cosh_u35 %" PRIx64, &u);
|
||||
u = d2u(xcosh_u35(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "tanh_u35 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "tanh_u35 %" PRIx64, &u);
|
||||
u = d2u(xtanh_u35(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "asinh ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "asinh %" PRIx64, &u);
|
||||
u = d2u(xasinh(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "acosh ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "acosh %" PRIx64, &u);
|
||||
u = d2u(xacosh(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "atanh ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "atanh %" PRIx64, &u);
|
||||
u = d2u(xatanh(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "fma ")) {
|
||||
uint64_t u, v, w;
|
||||
sscanf(buf, "fma %" PRIx64 " %" PRIx64 " %" PRIx64, &u, &v, &w);
|
||||
u = d2u(xfma(u2d(u), u2d(v), u2d(w)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "sqrt ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "sqrt %" PRIx64, &u);
|
||||
u = d2u(xsqrt(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "sqrt_u05 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "sqrt_u05 %" PRIx64, &u);
|
||||
u = d2u(xsqrt_u05(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "sqrt_u35 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "sqrt_u35 %" PRIx64, &u);
|
||||
u = d2u(xsqrt_u35(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "cbrt ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "cbrt %" PRIx64, &u);
|
||||
u = d2u(xcbrt(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "cbrt_u1 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "cbrt_u1 %" PRIx64, &u);
|
||||
u = d2u(xcbrt_u1(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "exp2 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "exp2 %" PRIx64, &u);
|
||||
u = d2u(xexp2(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "exp2_u35 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "exp2_u35 %" PRIx64, &u);
|
||||
u = d2u(xexp2_u35(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "exp10 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "exp10 %" PRIx64, &u);
|
||||
u = d2u(xexp10(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "exp10_u35 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "exp10_u35 %" PRIx64, &u);
|
||||
u = d2u(xexp10_u35(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "expm1 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "expm1 %" PRIx64, &u);
|
||||
u = d2u(xexpm1(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "log10 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "log10 %" PRIx64, &u);
|
||||
u = d2u(xlog10(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "log2 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "log2 %" PRIx64, &u);
|
||||
u = d2u(xlog2(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "log2_u35 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "log2_u35 %" PRIx64, &u);
|
||||
u = d2u(xlog2_u35(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "log1p ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "log1p %" PRIx64, &u);
|
||||
u = d2u(xlog1p(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "ldexp ")) {
|
||||
uint64_t u, v;
|
||||
sscanf(buf, "ldexp %" PRIx64 " %" PRIx64, &u, &v);
|
||||
u = d2u(xldexp(u2d(u), (int)u2d(v)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
}
|
||||
|
||||
else if (startsWith(buf, "hypot_u05 ")) {
|
||||
uint64_t u, v;
|
||||
sscanf(buf, "hypot_u05 %" PRIx64 " %" PRIx64, &u, &v);
|
||||
u = d2u(xhypot_u05(u2d(u), u2d(v)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "hypot_u35 ")) {
|
||||
uint64_t u, v;
|
||||
sscanf(buf, "hypot_u35 %" PRIx64 " %" PRIx64, &u, &v);
|
||||
u = d2u(xhypot_u35(u2d(u), u2d(v)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "copysign ")) {
|
||||
uint64_t u, v;
|
||||
sscanf(buf, "copysign %" PRIx64 " %" PRIx64, &u, &v);
|
||||
u = d2u(xcopysign(u2d(u), u2d(v)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "fmax ")) {
|
||||
uint64_t u, v;
|
||||
sscanf(buf, "fmax %" PRIx64 " %" PRIx64, &u, &v);
|
||||
u = d2u(xfmax(u2d(u), u2d(v)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "fmin ")) {
|
||||
uint64_t u, v;
|
||||
sscanf(buf, "fmin %" PRIx64 " %" PRIx64, &u, &v);
|
||||
u = d2u(xfmin(u2d(u), u2d(v)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "fdim ")) {
|
||||
uint64_t u, v;
|
||||
sscanf(buf, "fdim %" PRIx64 " %" PRIx64, &u, &v);
|
||||
u = d2u(xfdim(u2d(u), u2d(v)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "nextafter ")) {
|
||||
uint64_t u, v;
|
||||
sscanf(buf, "nextafter %" PRIx64 " %" PRIx64, &u, &v);
|
||||
u = d2u(xnextafter(u2d(u), u2d(v)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "fmod ")) {
|
||||
uint64_t u, v;
|
||||
sscanf(buf, "fmod %" PRIx64 " %" PRIx64, &u, &v);
|
||||
u = d2u(xfmod(u2d(u), u2d(v)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "remainder ")) {
|
||||
uint64_t u, v;
|
||||
sscanf(buf, "remainder %" PRIx64 " %" PRIx64, &u, &v);
|
||||
u = d2u(xremainder(u2d(u), u2d(v)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "fabs ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "fabs %" PRIx64, &u);
|
||||
u = d2u(xfabs(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "trunc ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "trunc %" PRIx64, &u);
|
||||
u = d2u(xtrunc(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "floor ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "floor %" PRIx64, &u);
|
||||
u = d2u(xfloor(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "ceil ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "ceil %" PRIx64, &u);
|
||||
u = d2u(xceil(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "round ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "round %" PRIx64, &u);
|
||||
u = d2u(xround(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "rint ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "rint %" PRIx64, &u);
|
||||
u = d2u(xrint(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "frfrexp ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "frfrexp %" PRIx64, &u);
|
||||
u = d2u(xfrfrexp(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "modf ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "modf %" PRIx64, &u);
|
||||
Sleef_double2 x = xmodf(u2d(u));
|
||||
printf("%" PRIx64 " %" PRIx64 "\n", d2u(x.x), d2u(x.y));
|
||||
} else if (startsWith(buf, "tgamma_u1 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "tgamma_u1 %" PRIx64, &u);
|
||||
u = d2u(xtgamma_u1(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "lgamma_u1 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "lgamma_u1 %" PRIx64, &u);
|
||||
u = d2u(xlgamma_u1(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "erf_u1 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "erf_u1 %" PRIx64, &u);
|
||||
u = d2u(xerf_u1(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "erfc_u15 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "erfc_u15 %" PRIx64, &u);
|
||||
u = d2u(xerfc_u15(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
}
|
||||
|
||||
else if (startsWith(buf, "sinf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "sinf %x", &u);
|
||||
u = f2u(xsinf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "cosf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "cosf %x", &u);
|
||||
u = f2u(xcosf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "sincosf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "sincosf %x", &u);
|
||||
Sleef_float2 x = xsincosf(u2f(u));
|
||||
printf("%x %x\n", f2u(x.x), f2u(x.y));
|
||||
} else if (startsWith(buf, "tanf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "tanf %x", &u);
|
||||
u = f2u(xtanf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "asinf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "asinf %x", &u);
|
||||
u = f2u(xasinf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "acosf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "acosf %x", &u);
|
||||
u = f2u(xacosf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "atanf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "atanf %x", &u);
|
||||
u = f2u(xatanf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "atan2f ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "atan2f %x %x", &u, &v);
|
||||
u = f2u(xatan2f(u2f(u), u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "logf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "logf %x", &u);
|
||||
u = f2u(xlogf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "expf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "expf %x", &u);
|
||||
u = f2u(xexpf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "cbrtf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "cbrtf %x", &u);
|
||||
u = f2u(xcbrtf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "sqrtf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "sqrtf %x", &u);
|
||||
u = f2u(xsqrtf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "sqrtf_u05 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "sqrtf_u05 %x", &u);
|
||||
u = f2u(xsqrtf_u05(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "sqrtf_u35 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "sqrtf_u35 %x", &u);
|
||||
u = f2u(xsqrtf_u35(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "ldexpf ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "ldexpf %x %x", &u, &v);
|
||||
u = f2u(xldexpf(u2f(u), (int)u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "powf ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "powf %x %x", &u, &v);
|
||||
u = f2u(xpowf(u2f(u), u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "fastpowf_u3500 ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "fastpowf_u3500 %x %x", &u, &v);
|
||||
u = f2u(xfastpowf_u3500(u2f(u), u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "sinhf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "sinhf %x", &u);
|
||||
u = f2u(xsinhf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "coshf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "coshf %x", &u);
|
||||
u = f2u(xcoshf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "tanhf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "tanhf %x", &u);
|
||||
u = f2u(xtanhf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "sinhf_u35 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "sinhf_u35 %x", &u);
|
||||
u = f2u(xsinhf_u35(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "coshf_u35 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "coshf_u35 %x", &u);
|
||||
u = f2u(xcoshf_u35(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "tanhf_u35 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "tanhf_u35 %x", &u);
|
||||
u = f2u(xtanhf_u35(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "asinhf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "asinhf %x", &u);
|
||||
u = f2u(xasinhf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "acoshf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "acoshf %x", &u);
|
||||
u = f2u(xacoshf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "atanhf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "atanhf %x", &u);
|
||||
u = f2u(xatanhf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "exp2f ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "exp2f %x", &u);
|
||||
u = f2u(xexp2f(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "exp10f ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "exp10f %x", &u);
|
||||
u = f2u(xexp10f(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "exp2f_u35 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "exp2f_u35 %x", &u);
|
||||
u = f2u(xexp2f_u35(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "exp10f_u35 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "exp10f_u35 %x", &u);
|
||||
u = f2u(xexp10f_u35(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "expm1f ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "expm1f %x", &u);
|
||||
u = f2u(xexpm1f(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "log10f ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "log10f %x", &u);
|
||||
u = f2u(xlog10f(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "log2f ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "log2f %x", &u);
|
||||
u = f2u(xlog2f(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "log2f_u35 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "log2f_u35 %x", &u);
|
||||
u = f2u(xlog2f_u35(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "log1pf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "log1pf %x", &u);
|
||||
u = f2u(xlog1pf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "sinf_u1 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "sinf_u1 %x", &u);
|
||||
u = f2u(xsinf_u1(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "cosf_u1 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "cosf_u1 %x", &u);
|
||||
u = f2u(xcosf_u1(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "sincosf_u1 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "sincosf_u1 %x", &u);
|
||||
Sleef_float2 x = xsincosf_u1(u2f(u));
|
||||
printf("%x %x\n", f2u(x.x), f2u(x.y));
|
||||
} else if (startsWith(buf, "sincospif_u05 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "sincospif_u05 %x", &u);
|
||||
Sleef_float2 x = xsincospif_u05(u2f(u));
|
||||
printf("%x %x\n", f2u(x.x), f2u(x.y));
|
||||
} else if (startsWith(buf, "sincospif_u35 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "sincospif_u35 %x", &u);
|
||||
Sleef_float2 x = xsincospif_u35(u2f(u));
|
||||
printf("%x %x\n", f2u(x.x), f2u(x.y));
|
||||
} else if (startsWith(buf, "sinpif_u05 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "sinpif_u05 %x", &u);
|
||||
u = f2u(xsinpif_u05(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "cospif_u05 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "cospif_u05 %x", &u);
|
||||
u = f2u(xcospif_u05(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "fastsinf_u3500 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "fastsinf_u3500 %x", &u);
|
||||
u = f2u(xfastsinf_u3500(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "fastcosf_u3500 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "fastcosf_u3500 %x", &u);
|
||||
u = f2u(xfastcosf_u3500(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "tanf_u1 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "tanf_u1 %x", &u);
|
||||
u = f2u(xtanf_u1(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "asinf_u1 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "asinf_u1 %x", &u);
|
||||
u = f2u(xasinf_u1(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "acosf_u1 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "acosf_u1 %x", &u);
|
||||
u = f2u(xacosf_u1(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "atanf_u1 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "atanf_u1 %x", &u);
|
||||
u = f2u(xatanf_u1(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "atan2f_u1 ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "atan2f_u1 %x %x", &u, &v);
|
||||
u = f2u(xatan2f_u1(u2f(u), u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "logf_u1 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "logf_u1 %x", &u);
|
||||
u = f2u(xlogf_u1(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "cbrtf_u1 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "cbrtf_u1 %x", &u);
|
||||
u = f2u(xcbrtf_u1(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "ilogb ")) {
|
||||
uint64_t u;
|
||||
int i;
|
||||
sscanf(buf, "ilogb %" PRIx64, &u);
|
||||
i = xilogb(u2d(u));
|
||||
printf("%d\n", i);
|
||||
} else if (startsWith(buf, "ilogbf ")) {
|
||||
uint32_t u;
|
||||
int i;
|
||||
sscanf(buf, "ilogbf %x", &u);
|
||||
i = xilogbf(u2f(u));
|
||||
printf("%d\n", i);
|
||||
}
|
||||
|
||||
else if (startsWith(buf, "hypotf_u05 ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "hypotf_u05 %x %x", &u, &v);
|
||||
u = f2u(xhypotf_u05(u2f(u), u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "hypotf_u35 ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "hypotf_u35 %x %x", &u, &v);
|
||||
u = f2u(xhypotf_u35(u2f(u), u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "copysignf ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "copysignf %x %x", &u, &v);
|
||||
u = f2u(xcopysignf(u2f(u), u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "fmaxf ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "fmaxf %x %x", &u, &v);
|
||||
u = f2u(xfmaxf(u2f(u), u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "fminf ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "fminf %x %x", &u, &v);
|
||||
u = f2u(xfminf(u2f(u), u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "fdimf ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "fdimf %x %x", &u, &v);
|
||||
u = f2u(xfdimf(u2f(u), u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "nextafterf ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "nextafterf %x %x", &u, &v);
|
||||
u = f2u(xnextafterf(u2f(u), u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "fmodf ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "fmodf %x %x", &u, &v);
|
||||
u = f2u(xfmodf(u2f(u), u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "remainderf ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "remainderf %x %x", &u, &v);
|
||||
u = f2u(xremainderf(u2f(u), u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "fabsf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "fabsf %x", &u);
|
||||
u = f2u(xfabsf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "truncf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "truncf %x", &u);
|
||||
u = f2u(xtruncf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "floorf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "floorf %x", &u);
|
||||
u = f2u(xfloorf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "ceilf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "ceilf %x", &u);
|
||||
u = f2u(xceilf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "roundf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "roundf %x", &u);
|
||||
u = f2u(xroundf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "rintf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "rintf %x", &u);
|
||||
u = f2u(xrintf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "frfrexpf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "frfrexpf %x", &u);
|
||||
u = f2u(xfrfrexpf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "modff ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "modff %x", &u);
|
||||
Sleef_float2 x = xmodff(u2f(u));
|
||||
printf("%x %x\n", f2u(x.x), f2u(x.y));
|
||||
} else if (startsWith(buf, "tgammaf_u1 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "tgammaf_u1 %x", &u);
|
||||
u = f2u(xtgammaf_u1(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "lgammaf_u1 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "lgammaf_u1 %x", &u);
|
||||
u = f2u(xlgammaf_u1(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "erff_u1 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "erff_u1 %x", &u);
|
||||
u = f2u(xerff_u1(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "erfcf_u15 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "erfcf_u15 %x", &u);
|
||||
u = f2u(xerfcf_u15(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
}
|
||||
|
||||
else {
|
||||
break;
|
||||
}
|
||||
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,546 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <inttypes.h>
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
#include <stdint.h>
|
||||
#include <cuda.h>
|
||||
|
||||
#include "sleefinline_purec_scalar.h"
|
||||
#include "sleefinline_cuda.h"
|
||||
|
||||
#define STDIN_FILENO 0
|
||||
|
||||
#define SIMD_SUFFIX _cuda_sleef
|
||||
#define CONCAT_SIMD_SUFFIX_(keyword, suffix) keyword ## suffix
|
||||
#define CONCAT_SIMD_SUFFIX(keyword, suffix) CONCAT_SIMD_SUFFIX_(keyword, suffix)
|
||||
|
||||
#define vdouble2 CONCAT_SIMD_SUFFIX(vdouble2, SIMD_SUFFIX)
|
||||
#define vfloat2 CONCAT_SIMD_SUFFIX(vfloat2, SIMD_SUFFIX)
|
||||
|
||||
//
|
||||
|
||||
// Returns nonzero iff 'str' begins with 'prefix' (an empty prefix matches
// any string). Used to dispatch on the command word at the start of each
// line of the test protocol.
static int startsWith(const char *str, const char *prefix) {
  size_t plen = strlen(prefix);
  return strncmp(str, prefix, plen) == 0;
}
|
||||
|
||||
// Reinterpret the 64-bit pattern 'u' as an IEEE-754 double, bit for bit
// (no numeric conversion). memcpy is the standard-conforming way to
// type-pun; compilers lower it to a plain register move.
static double u2d(uint64_t u) {
  double d;
  memcpy(&d, &u, sizeof d);
  return d;
}
|
||||
|
||||
// Reinterpret the double 'd' as its raw 64-bit IEEE-754 encoding, bit for
// bit. Inverse of u2d; results are printed as hex in the test protocol.
static uint64_t d2u(double d) {
  uint64_t u;
  memcpy(&u, &d, sizeof u);
  return u;
}
|
||||
|
||||
// Reinterpret the 32-bit pattern 'u' as an IEEE-754 float, bit for bit
// (no numeric conversion).
static float u2f(uint32_t u) {
  float f;
  memcpy(&f, &u, sizeof f);
  return f;
}
|
||||
|
||||
// Reinterpret the float 'd' as its raw 32-bit IEEE-754 encoding, bit for
// bit. Inverse of u2f.
static uint32_t f2u(float d) {
  uint32_t u;
  memcpy(&u, &d, sizeof u);
  return u;
}
|
||||
|
||||
//
|
||||
|
||||
// Double-precision test kernels. Each is a trivial single-thread CUDA
// kernel wrapping one SLEEF scalar CUDA function (Sleef_<name>d1_<acc>cuda),
// reading its argument(s) and writing its result through device pointers.
// The suffix encodes the accuracy bound: _u10 = 1.0 ULP, _u05 = 0.5 ULP,
// _u35 = 3.5 ULP, _u15 = 1.5 ULP; no suffix = correctly rounded where
// the SLEEF API defines it so.
__global__ void xsin(double *r, double *a0) { *r = Sleef_sind1_u35cuda(*a0); }
__global__ void xcos(double *r, double *a0) { *r = Sleef_cosd1_u35cuda(*a0); }
__global__ void xsincos(vdouble2 *r, double *a0) { *r = Sleef_sincosd1_u35cuda(*a0); }
__global__ void xtan(double *r, double *a0) { *r = Sleef_tand1_u35cuda(*a0); }
__global__ void xasin(double *r, double *a0) { *r = Sleef_asind1_u35cuda(*a0); }
__global__ void xacos(double *r, double *a0) { *r = Sleef_acosd1_u35cuda(*a0); }
__global__ void xatan(double *r, double *a0) { *r = Sleef_atand1_u35cuda(*a0); }
__global__ void xatan2(double *r, double *a0, double *a1) { *r = Sleef_atan2d1_u35cuda(*a0, *a1); }
__global__ void xlog(double *r, double *a0) { *r = Sleef_logd1_u35cuda(*a0); }
__global__ void xcbrt(double *r, double *a0) { *r = Sleef_cbrtd1_u35cuda(*a0); }
__global__ void xsin_u1(double *r, double *a0) { *r = Sleef_sind1_u10cuda(*a0); }
__global__ void xcos_u1(double *r, double *a0) { *r = Sleef_cosd1_u10cuda(*a0); }
__global__ void xsincos_u1(vdouble2 *r, double *a0) { *r = Sleef_sincosd1_u10cuda(*a0); }
__global__ void xtan_u1(double *r, double *a0) { *r = Sleef_tand1_u10cuda(*a0); }
__global__ void xasin_u1(double *r, double *a0) { *r = Sleef_asind1_u10cuda(*a0); }
__global__ void xacos_u1(double *r, double *a0) { *r = Sleef_acosd1_u10cuda(*a0); }
__global__ void xatan_u1(double *r, double *a0) { *r = Sleef_atand1_u10cuda(*a0); }
__global__ void xatan2_u1(double *r, double *a0, double *a1) { *r = Sleef_atan2d1_u10cuda(*a0, *a1); }
__global__ void xlog_u1(double *r, double *a0) { *r = Sleef_logd1_u10cuda(*a0); }
__global__ void xcbrt_u1(double *r, double *a0) { *r = Sleef_cbrtd1_u10cuda(*a0); }
__global__ void xexp(double *r, double *a0) { *r = Sleef_expd1_u10cuda(*a0); }
__global__ void xpow(double *r, double *a0, double *a1) { *r = Sleef_powd1_u10cuda(*a0, *a1); }
__global__ void xsinh(double *r, double *a0) { *r = Sleef_sinhd1_u10cuda(*a0); }
__global__ void xcosh(double *r, double *a0) { *r = Sleef_coshd1_u10cuda(*a0); }
__global__ void xtanh(double *r, double *a0) { *r = Sleef_tanhd1_u10cuda(*a0); }
__global__ void xsinh_u35(double *r, double *a0) { *r = Sleef_sinhd1_u35cuda(*a0); }
__global__ void xcosh_u35(double *r, double *a0) { *r = Sleef_coshd1_u35cuda(*a0); }
__global__ void xtanh_u35(double *r, double *a0) { *r = Sleef_tanhd1_u35cuda(*a0); }
__global__ void xasinh(double *r, double *a0) { *r = Sleef_asinhd1_u10cuda(*a0); }
__global__ void xacosh(double *r, double *a0) { *r = Sleef_acoshd1_u10cuda(*a0); }
__global__ void xatanh(double *r, double *a0) { *r = Sleef_atanhd1_u10cuda(*a0); }
__global__ void xexp2(double *r, double *a0) { *r = Sleef_exp2d1_u10cuda(*a0); }
__global__ void xexp2_u35(double *r, double *a0) { *r = Sleef_exp2d1_u35cuda(*a0); }
__global__ void xexp10(double *r, double *a0) { *r = Sleef_exp10d1_u10cuda(*a0); }
__global__ void xexp10_u35(double *r, double *a0) { *r = Sleef_exp10d1_u35cuda(*a0); }
__global__ void xexpm1(double *r, double *a0) { *r = Sleef_expm1d1_u10cuda(*a0); }
__global__ void xlog10(double *r, double *a0) { *r = Sleef_log10d1_u10cuda(*a0); }
__global__ void xlog2(double *r, double *a0) { *r = Sleef_log2d1_u10cuda(*a0); }
__global__ void xlog2_u35(double *r, double *a0) { *r = Sleef_log2d1_u35cuda(*a0); }
__global__ void xlog1p(double *r, double *a0) { *r = Sleef_log1pd1_u10cuda(*a0); }
__global__ void xsincospi_u05(vdouble2 *r, double *a0) { *r = Sleef_sincospid1_u05cuda(*a0); }
__global__ void xsincospi_u35(vdouble2 *r, double *a0) { *r = Sleef_sincospid1_u35cuda(*a0); }
__global__ void xsinpi_u05(double *r, double *a0) { *r = Sleef_sinpid1_u05cuda(*a0); }
__global__ void xcospi_u05(double *r, double *a0) { *r = Sleef_cospid1_u05cuda(*a0); }
__global__ void xldexp(double *r, double *a0, int *a1) { *r = Sleef_ldexpd1_cuda(*a0, *a1); }
__global__ void xilogb(int *r, double *a0) { *r = Sleef_ilogbd1_cuda(*a0); }
__global__ void xfma(double *r, double *a0, double *a1, double *a2) { *r = Sleef_fmad1_cuda(*a0, *a1, *a2); }
__global__ void xsqrt(double *r, double *a0) { *r = Sleef_sqrtd1_cuda(*a0); }
__global__ void xsqrt_u05(double *r, double *a0) { *r = Sleef_sqrtd1_u05cuda(*a0); }
__global__ void xsqrt_u35(double *r, double *a0) { *r = Sleef_sqrtd1_u35cuda(*a0); }
__global__ void xhypot_u05(double *r, double *a0, double *a1) { *r = Sleef_hypotd1_u05cuda(*a0, *a1); }
__global__ void xhypot_u35(double *r, double *a0, double *a1) { *r = Sleef_hypotd1_u35cuda(*a0, *a1); }
__global__ void xfabs(double *r, double *a0) { *r = Sleef_fabsd1_cuda(*a0); }
__global__ void xcopysign(double *r, double *a0, double *a1) { *r = Sleef_copysignd1_cuda(*a0, *a1); }
__global__ void xfmax(double *r, double *a0, double *a1) { *r = Sleef_fmaxd1_cuda(*a0, *a1); }
__global__ void xfmin(double *r, double *a0, double *a1) { *r = Sleef_fmind1_cuda(*a0, *a1); }
__global__ void xfdim(double *r, double *a0, double *a1) { *r = Sleef_fdimd1_cuda(*a0, *a1); }
__global__ void xtrunc(double *r, double *a0) { *r = Sleef_truncd1_cuda(*a0); }
__global__ void xfloor(double *r, double *a0) { *r = Sleef_floord1_cuda(*a0); }
__global__ void xceil(double *r, double *a0) { *r = Sleef_ceild1_cuda(*a0); }
__global__ void xround(double *r, double *a0) { *r = Sleef_roundd1_cuda(*a0); }
__global__ void xrint(double *r, double *a0) { *r = Sleef_rintd1_cuda(*a0); }
__global__ void xnextafter(double *r, double *a0, double *a1) { *r = Sleef_nextafterd1_cuda(*a0, *a1); }
__global__ void xfrfrexp(double *r, double *a0) { *r = Sleef_frfrexpd1_cuda(*a0); }
__global__ void xexpfrexp(int *r, double *a0) { *r = Sleef_expfrexpd1_cuda(*a0); }
__global__ void xfmod(double *r, double *a0, double *a1) { *r = Sleef_fmodd1_cuda(*a0, *a1); }
__global__ void xremainder(double *r, double *a0, double *a1) { *r = Sleef_remainderd1_cuda(*a0, *a1); }
__global__ void xmodf(vdouble2 *r, double *a0) { *r = Sleef_modfd1_cuda(*a0); }
__global__ void xlgamma_u1(double *r, double *a0) { *r = Sleef_lgammad1_u10cuda(*a0); }
__global__ void xtgamma_u1(double *r, double *a0) { *r = Sleef_tgammad1_u10cuda(*a0); }
__global__ void xerf_u1(double *r, double *a0) { *r = Sleef_erfd1_u10cuda(*a0); }
__global__ void xerfc_u15(double *r, double *a0) { *r = Sleef_erfcd1_u15cuda(*a0); }
|
||||
|
||||
// Single-precision test kernels, mirroring the double-precision set above
// in structure: one single-thread kernel per SLEEF scalar CUDA function
// (Sleef_<name>f1_<acc>cuda). The _u3500 "fast" variants trade accuracy
// for speed per the SLEEF API.
__global__ void xsinf(float *r, float *a0) { *r = Sleef_sinf1_u35cuda(*a0); }
__global__ void xcosf(float *r, float *a0) { *r = Sleef_cosf1_u35cuda(*a0); }
__global__ void xsincosf(vfloat2 *r, float *a0) { *r = Sleef_sincosf1_u35cuda(*a0); }
__global__ void xtanf(float *r, float *a0) { *r = Sleef_tanf1_u35cuda(*a0); }
__global__ void xasinf(float *r, float *a0) { *r = Sleef_asinf1_u35cuda(*a0); }
__global__ void xacosf(float *r, float *a0) { *r = Sleef_acosf1_u35cuda(*a0); }
__global__ void xatanf(float *r, float *a0) { *r = Sleef_atanf1_u35cuda(*a0); }
__global__ void xatan2f(float *r, float *a0, float *a1) { *r = Sleef_atan2f1_u35cuda(*a0, *a1); }
__global__ void xlogf(float *r, float *a0) { *r = Sleef_logf1_u35cuda(*a0); }
__global__ void xcbrtf(float *r, float *a0) { *r = Sleef_cbrtf1_u35cuda(*a0); }
__global__ void xsinf_u1(float *r, float *a0) { *r = Sleef_sinf1_u10cuda(*a0); }
__global__ void xcosf_u1(float *r, float *a0) { *r = Sleef_cosf1_u10cuda(*a0); }
__global__ void xsincosf_u1(vfloat2 *r, float *a0) { *r = Sleef_sincosf1_u10cuda(*a0); }
__global__ void xtanf_u1(float *r, float *a0) { *r = Sleef_tanf1_u10cuda(*a0); }
__global__ void xasinf_u1(float *r, float *a0) { *r = Sleef_asinf1_u10cuda(*a0); }
__global__ void xacosf_u1(float *r, float *a0) { *r = Sleef_acosf1_u10cuda(*a0); }
__global__ void xatanf_u1(float *r, float *a0) { *r = Sleef_atanf1_u10cuda(*a0); }
__global__ void xatan2f_u1(float *r, float *a0, float *a1) { *r = Sleef_atan2f1_u10cuda(*a0, *a1); }
__global__ void xlogf_u1(float *r, float *a0) { *r = Sleef_logf1_u10cuda(*a0); }
__global__ void xcbrtf_u1(float *r, float *a0) { *r = Sleef_cbrtf1_u10cuda(*a0); }
__global__ void xexpf(float *r, float *a0) { *r = Sleef_expf1_u10cuda(*a0); }
__global__ void xpowf(float *r, float *a0, float *a1) { *r = Sleef_powf1_u10cuda(*a0, *a1); }
__global__ void xsinhf(float *r, float *a0) { *r = Sleef_sinhf1_u10cuda(*a0); }
__global__ void xcoshf(float *r, float *a0) { *r = Sleef_coshf1_u10cuda(*a0); }
__global__ void xtanhf(float *r, float *a0) { *r = Sleef_tanhf1_u10cuda(*a0); }
__global__ void xsinhf_u35(float *r, float *a0) { *r = Sleef_sinhf1_u35cuda(*a0); }
__global__ void xcoshf_u35(float *r, float *a0) { *r = Sleef_coshf1_u35cuda(*a0); }
__global__ void xtanhf_u35(float *r, float *a0) { *r = Sleef_tanhf1_u35cuda(*a0); }
__global__ void xfastsinf_u3500(float *r, float *a0) { *r = Sleef_fastsinf1_u3500cuda(*a0); }
__global__ void xfastcosf_u3500(float *r, float *a0) { *r = Sleef_fastcosf1_u3500cuda(*a0); }
__global__ void xfastpowf_u3500(float *r, float *a0, float *a1) { *r = Sleef_fastpowf1_u3500cuda(*a0, *a1); }
__global__ void xasinhf(float *r, float *a0) { *r = Sleef_asinhf1_u10cuda(*a0); }
__global__ void xacoshf(float *r, float *a0) { *r = Sleef_acoshf1_u10cuda(*a0); }
__global__ void xatanhf(float *r, float *a0) { *r = Sleef_atanhf1_u10cuda(*a0); }
__global__ void xexp2f(float *r, float *a0) { *r = Sleef_exp2f1_u10cuda(*a0); }
__global__ void xexp2f_u35(float *r, float *a0) { *r = Sleef_exp2f1_u35cuda(*a0); }
__global__ void xexp10f(float *r, float *a0) { *r = Sleef_exp10f1_u10cuda(*a0); }
__global__ void xexp10f_u35(float *r, float *a0) { *r = Sleef_exp10f1_u35cuda(*a0); }
__global__ void xexpm1f(float *r, float *a0) { *r = Sleef_expm1f1_u10cuda(*a0); }
__global__ void xlog10f(float *r, float *a0) { *r = Sleef_log10f1_u10cuda(*a0); }
__global__ void xlog2f(float *r, float *a0) { *r = Sleef_log2f1_u10cuda(*a0); }
__global__ void xlog2f_u35(float *r, float *a0) { *r = Sleef_log2f1_u35cuda(*a0); }
__global__ void xlog1pf(float *r, float *a0) { *r = Sleef_log1pf1_u10cuda(*a0); }
__global__ void xsincospif_u05(vfloat2 *r, float *a0) { *r = Sleef_sincospif1_u05cuda(*a0); }
__global__ void xsincospif_u35(vfloat2 *r, float *a0) { *r = Sleef_sincospif1_u35cuda(*a0); }
__global__ void xsinpif_u05(float *r, float *a0) { *r = Sleef_sinpif1_u05cuda(*a0); }
__global__ void xcospif_u05(float *r, float *a0) { *r = Sleef_cospif1_u05cuda(*a0); }
__global__ void xldexpf(float *r, float *a0, int *a1) { *r = Sleef_ldexpf1_cuda(*a0, *a1); }
__global__ void xilogbf(int *r, float *a0) { *r = Sleef_ilogbf1_cuda(*a0); }
__global__ void xfmaf(float *r, float *a0, float *a1, float *a2) { *r = Sleef_fmaf1_cuda(*a0, *a1, *a2); }
__global__ void xsqrtf(float *r, float *a0) { *r = Sleef_sqrtf1_cuda(*a0); }
__global__ void xsqrtf_u05(float *r, float *a0) { *r = Sleef_sqrtf1_u05cuda(*a0); }
__global__ void xsqrtf_u35(float *r, float *a0) { *r = Sleef_sqrtf1_u35cuda(*a0); }
__global__ void xhypotf_u05(float *r, float *a0, float *a1) { *r = Sleef_hypotf1_u05cuda(*a0, *a1); }
__global__ void xhypotf_u35(float *r, float *a0, float *a1) { *r = Sleef_hypotf1_u35cuda(*a0, *a1); }
__global__ void xfabsf(float *r, float *a0) { *r = Sleef_fabsf1_cuda(*a0); }
__global__ void xcopysignf(float *r, float *a0, float *a1) { *r = Sleef_copysignf1_cuda(*a0, *a1); }
__global__ void xfmaxf(float *r, float *a0, float *a1) { *r = Sleef_fmaxf1_cuda(*a0, *a1); }
__global__ void xfminf(float *r, float *a0, float *a1) { *r = Sleef_fminf1_cuda(*a0, *a1); }
__global__ void xfdimf(float *r, float *a0, float *a1) { *r = Sleef_fdimf1_cuda(*a0, *a1); }
__global__ void xtruncf(float *r, float *a0) { *r = Sleef_truncf1_cuda(*a0); }
__global__ void xfloorf(float *r, float *a0) { *r = Sleef_floorf1_cuda(*a0); }
__global__ void xceilf(float *r, float *a0) { *r = Sleef_ceilf1_cuda(*a0); }
__global__ void xroundf(float *r, float *a0) { *r = Sleef_roundf1_cuda(*a0); }
__global__ void xrintf(float *r, float *a0) { *r = Sleef_rintf1_cuda(*a0); }
__global__ void xnextafterf(float *r, float *a0, float *a1) { *r = Sleef_nextafterf1_cuda(*a0, *a1); }
__global__ void xfrfrexpf(float *r, float *a0) { *r = Sleef_frfrexpf1_cuda(*a0); }
// NOTE(review): xexpfrexpf stores into a float* while the double version
// xexpfrexp uses int* — looks inconsistent; confirm against the
// Sleef_expfrexpf1_cuda return type before relying on it.
__global__ void xexpfrexpf(float *r, float *a0) { *r = Sleef_expfrexpf1_cuda(*a0); }
__global__ void xfmodf(float *r, float *a0, float *a1) { *r = Sleef_fmodf1_cuda(*a0, *a1); }
__global__ void xremainderf(float *r, float *a0, float *a1) { *r = Sleef_remainderf1_cuda(*a0, *a1); }
__global__ void xmodff(vfloat2 *r, float *a0) { *r = Sleef_modff1_cuda(*a0); }
__global__ void xlgammaf_u1(float *r, float *a0) { *r = Sleef_lgammaf1_u10cuda(*a0); }
__global__ void xtgammaf_u1(float *r, float *a0) { *r = Sleef_tgammaf1_u10cuda(*a0); }
__global__ void xerff_u1(float *r, float *a0) { *r = Sleef_erff1_u10cuda(*a0); }
__global__ void xerfcf_u15(float *r, float *a0) { *r = Sleef_erfcf1_u15cuda(*a0); }
|
||||
|
||||
//
|
||||
|
||||
// Protocol loop for a unary double function: while the current input line in
// caller-scope 'buf' starts with funcStr, parse one hex-encoded double, run
// kernel funcName on device buffers r/a0, print the result bits in hex, and
// read the next line from stdin (stopping on EOF). Relies on caller-scope
// buf, a0, r and BUFSIZE.
#define func_d_d(funcStr, funcName) { \
    while (startsWith(buf, funcStr " ")) { \
      uint64_t u; \
      sscanf(buf, funcStr " %" PRIx64, &u); \
      *a0 = u2d(u); \
      funcName<<<1, 1>>>(r, a0); \
      cudaDeviceSynchronize(); \
      printf("%" PRIx64 "\n", d2u(*r)); \
      fflush(stdout); \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
    } \
  }
|
||||
|
||||
// Protocol loop for a unary function returning a double pair (vdouble2,
// e.g. sincos/modf): same structure as func_d_d but the kernel writes into
// caller-scope 'r2' and both components are printed as hex.
#define func_d2_d(funcStr, funcName) { \
    while (startsWith(buf, funcStr " ")) { \
      uint64_t u; \
      sscanf(buf, funcStr " %" PRIx64, &u); \
      *a0 = u2d(u); \
      funcName<<<1, 1>>>(r2, a0); \
      cudaDeviceSynchronize(); \
      printf("%" PRIx64 " %" PRIx64 "\n", d2u(r2->x), d2u(r2->y)); \
      fflush(stdout); \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
    } \
  }
|
||||
|
||||
// Protocol loop for a binary double function (e.g. atan2/pow/hypot): parses
// two hex-encoded doubles into caller-scope a0/a1, launches funcName, prints
// the double result bits in hex.
#define func_d_d_d(funcStr, funcName) { \
    while (startsWith(buf, funcStr " ")) { \
      uint64_t u, v; \
      sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v); \
      *a0 = u2d(u); \
      *a1 = u2d(v); \
      funcName<<<1, 1>>>(r, a0, a1); \
      cudaDeviceSynchronize(); \
      printf("%" PRIx64 "\n", d2u(*r)); \
      fflush(stdout); \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
    } \
  }
|
||||
|
||||
// Protocol handler: (double, int) -> double. The second argument arrives as
// a 64-bit hex double bit pattern and is truncated to int before the call
// (matches the wire format used by the SLEEF tester harness).
#define func_d_d_i(funcStr, funcName) {                                 \
    while (startsWith(buf, funcStr " ")) {                              \
      uint64_t u, v;                                                    \
      sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v);             \
      *a0 = u2d(u);                                                     \
      *i0 = (int)u2d(v);                                                \
      funcName<<<1, 1>>>(r, a0, i0);                                    \
      cudaDeviceSynchronize();                                          \
      printf("%" PRIx64 "\n", d2u(*r));                                 \
      fflush(stdout);                                                   \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;                  \
    }                                                                   \
  }
|
||||
|
||||
// Protocol handler: double -> int. The kernel writes its integer result into
// the managed slot i0, which is printed in decimal.
#define func_i_d(funcStr, funcName) {                   \
    while (startsWith(buf, funcStr " ")) {              \
      uint64_t u;                                       \
      sscanf(buf, funcStr " %" PRIx64, &u);             \
      *a0 = u2d(u);                                     \
      funcName<<<1, 1>>>(i0, a0);                       \
      cudaDeviceSynchronize();                          \
      printf("%d\n", *i0);                              \
      fflush(stdout);                                   \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;  \
    }                                                   \
  }
|
||||
|
||||
//
|
||||
|
||||
// Protocol handler: float -> float. Same scheme as func_d_d but with 32-bit
// hex bit patterns ("%x") and the float scratch slots s/b0.
#define func_f_f(funcStr, funcName) {                   \
    while (startsWith(buf, funcStr " ")) {              \
      uint32_t u;                                       \
      sscanf(buf, funcStr " %x", &u);                   \
      *b0 = u2f(u);                                     \
      funcName<<<1, 1>>>(s, b0);                        \
      cudaDeviceSynchronize();                          \
      printf("%x\n", f2u(*s));                          \
      fflush(stdout);                                   \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;  \
    }                                                   \
  }
|
||||
|
||||
// Protocol handler: float -> (float, float). The kernel writes a vfloat2
// pair (s2); both members are printed as 32-bit hex bit patterns.
#define func_f2_f(funcStr, funcName) {                  \
    while (startsWith(buf, funcStr " ")) {              \
      uint32_t u;                                       \
      sscanf(buf, funcStr " %x", &u);                   \
      *b0 = u2f(u);                                     \
      funcName<<<1, 1>>>(s2, b0);                       \
      cudaDeviceSynchronize();                          \
      printf("%x %x\n", f2u(s2->x), f2u(s2->y));        \
      fflush(stdout);                                   \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;  \
    }                                                   \
  }
|
||||
|
||||
// Protocol handler: (float, float) -> float. Two 32-bit hex bit patterns in,
// one out, via the managed slots b0/b1/s.
#define func_f_f_f(funcStr, funcName) {                 \
    while (startsWith(buf, funcStr " ")) {              \
      uint32_t u, v;                                    \
      sscanf(buf, funcStr " %x %x", &u, &v);            \
      *b0 = u2f(u);                                     \
      *b1 = u2f(v);                                     \
      funcName<<<1, 1>>>(s, b0, b1);                    \
      cudaDeviceSynchronize();                          \
      printf("%x\n", f2u(*s));                          \
      fflush(stdout);                                   \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;  \
    }                                                   \
  }
|
||||
|
||||
//
|
||||
|
||||
#define BUFSIZE 1024

// Host driver for the scalar CUDA tester. Speaks the SLEEF tester protocol:
// prints a capability word, then for each "<func> <hex-args>" request on
// stdin dispatches to the matching kernel wrapper via the func_* macros and
// answers with the hex bit pattern(s) of the device result.
int main(int argc, char **argv) {
#if 0
  // Optional device enumeration via the CUDA driver API (intentionally
  // disabled upstream; kept for debugging).
  cuInit(0);

  int ndevice;
  cuDeviceGetCount(&ndevice);
  if (ndevice == 0) {
    fprintf(stderr, "No cuda device available\n");
    exit(0);
  }

  CUdevice device;
  char deviceName[1024];
  cuDeviceGet(&device, 0);
  cuDeviceGetName(deviceName, 1000, device);
  fprintf(stderr, "Device : %s\n", deviceName);
#endif

  // Spin-wait on cudaDeviceSynchronize() to minimize per-request latency.
  cudaSetDeviceFlags(cudaDeviceScheduleSpin);

  // Managed-memory scratch slots shared by host and device; one element each.
  vdouble2 *r2;
  vfloat2 *s2;
  double *r, *a0, *a1, *a2;
  float *s, *b0, *b1, *b2;
  int *i0;
  cudaMallocManaged(&r , 1*sizeof(double));
  cudaMallocManaged(&r2, 1*sizeof(vdouble2));
  cudaMallocManaged(&a0, 1*sizeof(double));
  cudaMallocManaged(&a1, 1*sizeof(double));
  cudaMallocManaged(&a2, 1*sizeof(double));
  cudaMallocManaged(&s , 1*sizeof(float));
  cudaMallocManaged(&s2, 1*sizeof(vfloat2));
  cudaMallocManaged(&b0, 1*sizeof(float));
  cudaMallocManaged(&b1, 1*sizeof(float));
  cudaMallocManaged(&b2, 1*sizeof(float));
  cudaMallocManaged(&i0, 1*sizeof(int));

  // Capability word: 1 (double precision) + 2 (single precision).
  printf("3\n");
  fflush(stdout);

  char buf[BUFSIZE];
  if (fgets(buf, BUFSIZE-1, stdin)) {}

  while(!feof(stdin)) {
    func_d_d("sin", xsin);
    func_d_d("cos", xcos);
    func_d_d("tan", xtan);
    func_d_d("asin", xasin);
    func_d_d("acos", xacos);
    func_d_d("atan", xatan);
    func_d_d("log", xlog);
    func_d_d("exp", xexp);

    func_d_d("sqrt", xsqrt);
    func_d_d("sqrt_u05", xsqrt_u05);
    func_d_d("sqrt_u35", xsqrt_u35);
    func_d_d("cbrt", xcbrt);
    func_d_d("cbrt_u1", xcbrt_u1);

    func_d_d("sinh", xsinh);
    func_d_d("cosh", xcosh);
    func_d_d("tanh", xtanh);
    func_d_d("sinh_u35", xsinh_u35);
    func_d_d("cosh_u35", xcosh_u35);
    func_d_d("tanh_u35", xtanh_u35);
    func_d_d("asinh", xasinh);
    func_d_d("acosh", xacosh);
    func_d_d("atanh", xatanh);

    func_d_d("sin_u1", xsin_u1);
    func_d_d("cos_u1", xcos_u1);
    func_d_d("tan_u1", xtan_u1);
    func_d_d("sinpi_u05", xsinpi_u05);
    func_d_d("cospi_u05", xcospi_u05);
    func_d_d("asin_u1", xasin_u1);
    func_d_d("acos_u1", xacos_u1);
    func_d_d("atan_u1", xatan_u1);
    func_d_d("log_u1", xlog_u1);

    func_d_d("exp2", xexp2);
    func_d_d("exp10", xexp10);
    func_d_d("exp2_u35", xexp2_u35);
    func_d_d("exp10_u35", xexp10_u35);
    func_d_d("expm1", xexpm1);
    func_d_d("log10", xlog10);
    func_d_d("log2", xlog2);
    func_d_d("log2_u35", xlog2_u35);
    func_d_d("log1p", xlog1p);
    func_d_d("fabs", xfabs);
    func_d_d("trunc", xtrunc);
    func_d_d("floor", xfloor);
    func_d_d("ceil", xceil);
    func_d_d("round", xround);
    func_d_d("rint", xrint);
    func_d_d("frfrexp", xfrfrexp);
    func_d_d("tgamma_u1", xtgamma_u1);
    func_d_d("lgamma_u1", xlgamma_u1);
    func_d_d("erf_u1", xerf_u1);
    func_d_d("erfc_u15", xerfc_u15);

    func_d2_d("sincos", xsincos);
    func_d2_d("sincos_u1", xsincos_u1);
    func_d2_d("sincospi_u35", xsincospi_u35);
    func_d2_d("sincospi_u05", xsincospi_u05);
    func_d2_d("modf", xmodf);

    func_d_d_d("pow", xpow);
    func_d_d_d("atan2", xatan2);
    func_d_d_d("atan2_u1", xatan2_u1);
    func_d_d_d("hypot_u05", xhypot_u05);
    func_d_d_d("hypot_u35", xhypot_u35);
    func_d_d_d("copysign", xcopysign);
    func_d_d_d("fmax", xfmax);
    func_d_d_d("fmin", xfmin);
    func_d_d_d("fdim", xfdim);
    func_d_d_d("nextafter", xnextafter);
    func_d_d_d("fmod", xfmod);
    func_d_d_d("remainder", xremainder);

    func_d_d_i("ldexp", xldexp);
    func_i_d("ilogb", xilogb);
    func_i_d("expfrexp", xexpfrexp);

    //

    func_f_f("sinf", xsinf);
    func_f_f("cosf", xcosf);
    func_f_f("tanf", xtanf);
    func_f_f("asinf", xasinf);
    func_f_f("acosf", xacosf);
    func_f_f("atanf", xatanf);
    func_f_f("logf", xlogf);
    func_f_f("expf", xexpf);

    func_f_f("sqrtf", xsqrtf);
    func_f_f("sqrtf_u05", xsqrtf_u05);
    func_f_f("sqrtf_u35", xsqrtf_u35);
    func_f_f("cbrtf", xcbrtf);
    func_f_f("cbrtf_u1", xcbrtf_u1);

    func_f_f("sinhf", xsinhf);
    func_f_f("coshf", xcoshf);
    func_f_f("tanhf", xtanhf);
    func_f_f("sinhf_u35", xsinhf_u35);
    func_f_f("coshf_u35", xcoshf_u35);
    func_f_f("tanhf_u35", xtanhf_u35);
    func_f_f("asinhf", xasinhf);
    func_f_f("acoshf", xacoshf);
    func_f_f("atanhf", xatanhf);

    func_f_f("sinf_u1", xsinf_u1);
    func_f_f("cosf_u1", xcosf_u1);
    func_f_f("tanf_u1", xtanf_u1);
    func_f_f("sinpif_u05", xsinpif_u05);
    func_f_f("cospif_u05", xcospif_u05);
    func_f_f("asinf_u1", xasinf_u1);
    func_f_f("acosf_u1", xacosf_u1);
    func_f_f("atanf_u1", xatanf_u1);
    func_f_f("logf_u1", xlogf_u1);

    func_f_f("exp2f", xexp2f);
    func_f_f("exp10f", xexp10f);
    func_f_f("exp2f_u35", xexp2f_u35);
    func_f_f("exp10f_u35", xexp10f_u35);
    func_f_f("expm1f", xexpm1f);
    func_f_f("log10f", xlog10f);
    func_f_f("log2f", xlog2f);
    func_f_f("log2f_u35", xlog2f_u35);
    func_f_f("log1pf", xlog1pf);

    func_f2_f("sincosf", xsincosf);
    func_f2_f("sincosf_u1", xsincosf_u1);
    func_f2_f("sincospif_u35", xsincospif_u35);
    func_f2_f("sincospif_u05", xsincospif_u05);

    func_f_f_f("powf", xpowf);
    func_f_f_f("atan2f", xatan2f);
    func_f_f_f("atan2f_u1", xatan2f_u1);

    func_f_f("fabsf", xfabsf);
    func_f_f("truncf", xtruncf);
    func_f_f("floorf", xfloorf);
    func_f_f("ceilf", xceilf);
    func_f_f("roundf", xroundf);
    func_f_f("rintf", xrintf);
    func_f_f("frfrexpf", xfrfrexpf);

    func_f_f_f("hypotf_u05", xhypotf_u05);
    func_f_f_f("hypotf_u35", xhypotf_u35);
    func_f_f_f("copysignf", xcopysignf);
    func_f_f_f("fmaxf", xfmaxf);
    func_f_f_f("fminf", xfminf);
    func_f_f_f("fdimf", xfdimf);
    func_f_f_f("nextafterf", xnextafterf);
    func_f_f_f("fmodf", xfmodf);
    func_f_f_f("remainderf", xremainderf);

    func_f2_f("modff", xmodff);

    func_f_f("tgammaf_u1", xtgammaf_u1);
    func_f_f("lgammaf_u1", xlgammaf_u1);
    func_f_f("erff_u1", xerff_u1);
    func_f_f("erfcf_u15", xerfcf_u15);

    func_f_f("fastsinf_u3500", xfastsinf_u3500);
    func_f_f("fastcosf_u3500", xfastcosf_u3500);
    func_f_f_f("fastpowf_u3500", xfastpowf_u3500);
  }

  return 0;
}
|
||||
@@ -0,0 +1,859 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2023.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <inttypes.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#define STDIN_FILENO 0
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#include <sys/types.h>
|
||||
#endif
|
||||
|
||||
#include "quaddef.h"
|
||||
#include "misc.h"
|
||||
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#include "sleef.h"
|
||||
#else // #if !defined(USE_INLINE_HEADER)
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <float.h>
|
||||
#include <limits.h>
|
||||
|
||||
#if defined(__AVX2__) || defined(__aarch64__) || defined(__arm__) || defined(__powerpc64__)
|
||||
#ifndef FP_FAST_FMA
|
||||
#define FP_FAST_FMA
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER) && !defined(__STDC__)
|
||||
#define __STDC__ 1
|
||||
#endif
|
||||
|
||||
#if (defined(__GNUC__) || defined(__CLANG__)) && (defined(__i386__) || defined(__x86_64__))
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
|
||||
#if (defined(_MSC_VER))
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
#include <arm_sve.h>
|
||||
#endif
|
||||
|
||||
#if defined(__riscv) && defined(__riscv_v)
|
||||
#include <riscv_vector.h>
|
||||
#endif
|
||||
|
||||
#if defined(__VSX__)
|
||||
#include <altivec.h>
|
||||
#endif
|
||||
|
||||
#if defined(__VX__)
|
||||
#include <vecintrin.h>
|
||||
#endif
|
||||
|
||||
#define SLEEF_ALWAYS_INLINE inline
|
||||
#define SLEEF_INLINE
|
||||
#define SLEEF_CONST
|
||||
#include USE_INLINE_HEADER
|
||||
#include MACRO_ONLY_HEADER
|
||||
|
||||
#ifndef ENABLE_PUREC_SCALAR
|
||||
#include "sleefinline_purec_scalar.h"
|
||||
#endif
|
||||
|
||||
#endif // #if !defined(USE_INLINE_HEADER)
|
||||
|
||||
#include "testerutil.h"
|
||||
|
||||
#define DORENAME
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#include "renamesse2.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 2
|
||||
#include "helpersse2.h"
|
||||
typedef Sleef___m128d_2 vdouble2;
|
||||
typedef Sleef___m128_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSE4
|
||||
#include "renamesse4.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 4
|
||||
#include "helpersse2.h"
|
||||
typedef Sleef___m128d_2 vdouble2;
|
||||
typedef Sleef___m128_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_AVX
|
||||
#include "renameavx.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 1
|
||||
#include "helperavx.h"
|
||||
typedef Sleef___m256d_2 vdouble2;
|
||||
typedef Sleef___m256_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_FMA4
|
||||
#include "renamefma4.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 4
|
||||
#include "helperavx.h"
|
||||
typedef Sleef___m256d_2 vdouble2;
|
||||
typedef Sleef___m256_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_AVX2
|
||||
#include "renameavx2.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 1
|
||||
#include "helperavx2.h"
|
||||
typedef Sleef___m256d_2 vdouble2;
|
||||
typedef Sleef___m256_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_AVX2128
|
||||
#include "renameavx2128.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 1
|
||||
#include "helperavx2_128.h"
|
||||
typedef Sleef___m128d_2 vdouble2;
|
||||
typedef Sleef___m128_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_AVX512F
|
||||
#include "renameavx512f.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 1
|
||||
#include "helperavx512f.h"
|
||||
typedef Sleef___m512d_2 vdouble2;
|
||||
typedef Sleef___m512_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_AVX512FNOFMA
|
||||
#include "renameavx512fnofma.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 2
|
||||
#include "helperavx512f.h"
|
||||
typedef Sleef___m512d_2 vdouble2;
|
||||
typedef Sleef___m512_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VECEXT
|
||||
#define CONFIG 1
|
||||
#include "helpervecext.h"
|
||||
#include "norename.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_PUREC
|
||||
#define CONFIG 1
|
||||
#include "helperpurec.h"
|
||||
#include "norename.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_NEON32
|
||||
#include "renameneon32.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 1
|
||||
#include "helperneon32.h"
|
||||
typedef Sleef_float32x4_t_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_NEON32VFPV4
|
||||
#include "renameneon32vfpv4.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 4
|
||||
#include "helperneon32.h"
|
||||
typedef Sleef_float32x4_t_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_ADVSIMD
|
||||
#include "renameadvsimd.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 1
|
||||
#include "helperadvsimd.h"
|
||||
typedef Sleef_float64x2_t_2 vdouble2;
|
||||
typedef Sleef_float32x4_t_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_ADVSIMDNOFMA
|
||||
#include "renameadvsimdnofma.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 2
|
||||
#include "helperadvsimd.h"
|
||||
typedef Sleef_float64x2_t_2 vdouble2;
|
||||
typedef Sleef_float32x4_t_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_DSP128
|
||||
#define CONFIG 2
|
||||
#include "helpersse2.h"
|
||||
#include "renamedsp128.h"
|
||||
typedef Sleef___m128d_2 vdouble2;
|
||||
typedef Sleef___m128_2 vfloat2;
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SVE
|
||||
#include "renamesve.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 1
|
||||
#include "helpersve.h"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SVENOFMA
|
||||
#include "renamesvenofma.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 2
|
||||
#include "helpersve.h"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_DSP256
|
||||
#define CONFIG 1
|
||||
#include "helperavx.h"
|
||||
#include "renamedsp256.h"
|
||||
typedef Sleef___m256d_2 vdouble2;
|
||||
typedef Sleef___m256_2 vfloat2;
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VSX
|
||||
#include "renamevsx.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 1
|
||||
#include "helperpower_128.h"
|
||||
#include "renamevsx.h"
|
||||
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
|
||||
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VSXNOFMA
|
||||
#include "renamevsxnofma.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 2
|
||||
#include "helperpower_128.h"
|
||||
#include "renamevsxnofma.h"
|
||||
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
|
||||
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VSX3
|
||||
#include "renamevsx3.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 3
|
||||
#include "helperpower_128.h"
|
||||
#include "renamevsx3.h"
|
||||
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
|
||||
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VSX3NOFMA
|
||||
#include "renamevsx3nofma.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 4
|
||||
#include "helperpower_128.h"
|
||||
#include "renamevsx3nofma.h"
|
||||
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
|
||||
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VXE
|
||||
#include "renamevxe.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 140
|
||||
#include "helpers390x_128.h"
|
||||
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
|
||||
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VXENOFMA
|
||||
#include "renamevxenofma.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 141
|
||||
#include "helpers390x_128.h"
|
||||
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
|
||||
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VXE2
|
||||
#include "renamevxe2.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 150
|
||||
#include "helpers390x_128.h"
|
||||
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
|
||||
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VXE2NOFMA
|
||||
#include "renamevxe2nofma.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 151
|
||||
#include "helpers390x_128.h"
|
||||
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
|
||||
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_DSPPOWER_128
|
||||
#define CONFIG 1
|
||||
#include "helperpower_128.h"
|
||||
#include "renamedsp128.h"
|
||||
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
|
||||
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_DSPS390X_128
|
||||
#define CONFIG 140
|
||||
#include "helpers390x_128.h"
|
||||
#include "renamedsp128.h"
|
||||
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
|
||||
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_RVVM1
|
||||
#include "renamervvm1.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 1
|
||||
#include "helperrvv.h"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_RVVM1NOFMA
|
||||
#include "renamervvm1nofma.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 2
|
||||
#include "helperrvv.h"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_RVVM2
|
||||
#include "renamervvm2.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 1
|
||||
#include "helperrvv.h"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_RVVM2NOFMA
|
||||
#include "renamervvm2nofma.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 2
|
||||
#include "helperrvv.h"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_PUREC_SCALAR
|
||||
#include "renamepurec_scalar.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 1
|
||||
#include "helperpurec_scalar.h"
|
||||
typedef Sleef_double_2 vdouble2;
|
||||
typedef Sleef_float_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_PURECFMA_SCALAR
|
||||
#include "renamepurecfma_scalar.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 2
|
||||
#include "helperpurec_scalar.h"
|
||||
typedef Sleef_double_2 vdouble2;
|
||||
typedef Sleef_float_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_DSP_SCALAR
|
||||
#include "renamedspscalar.h"
|
||||
#define CONFIG 1
|
||||
#include "helperpurec_scalar.h"
|
||||
typedef Sleef_double_2 vdouble2;
|
||||
typedef Sleef_float_2 vfloat2;
|
||||
#endif
|
||||
|
||||
#ifdef USE_INLINE_HEADER
|
||||
#define CONCAT_SIMD_SUFFIX_(keyword, suffix) keyword ## suffix
|
||||
#define CONCAT_SIMD_SUFFIX(keyword, suffix) CONCAT_SIMD_SUFFIX_(keyword, suffix)
|
||||
#define vmask CONCAT_SIMD_SUFFIX(vmask, SIMD_SUFFIX)
|
||||
#define vopmask CONCAT_SIMD_SUFFIX(vopmask, SIMD_SUFFIX)
|
||||
#define vdouble CONCAT_SIMD_SUFFIX(vdouble, SIMD_SUFFIX)
|
||||
#define vint CONCAT_SIMD_SUFFIX(vint, SIMD_SUFFIX)
|
||||
#define vfloat CONCAT_SIMD_SUFFIX(vfloat, SIMD_SUFFIX)
|
||||
#define vint2 CONCAT_SIMD_SUFFIX(vint2, SIMD_SUFFIX)
|
||||
#define vdouble2 CONCAT_SIMD_SUFFIX(vdouble2, SIMD_SUFFIX)
|
||||
#define vfloat2 CONCAT_SIMD_SUFFIX(vfloat2, SIMD_SUFFIX)
|
||||
#define vd2getx_vd_vd2 CONCAT_SIMD_SUFFIX(vd2getx_vd_vd2, SIMD_SUFFIX)
|
||||
#define vd2gety_vd_vd2 CONCAT_SIMD_SUFFIX(vd2gety_vd_vd2, SIMD_SUFFIX)
|
||||
#define vf2getx_vf_vf2 CONCAT_SIMD_SUFFIX(vf2getx_vf_vf2, SIMD_SUFFIX)
|
||||
#define vf2gety_vf_vf2 CONCAT_SIMD_SUFFIX(vf2gety_vf_vf2, SIMD_SUFFIX)
|
||||
#define vloadu_vd_p CONCAT_SIMD_SUFFIX(vloadu_vd_p, SIMD_SUFFIX)
|
||||
#define vstoreu_v_p_vd CONCAT_SIMD_SUFFIX(vstoreu_v_p_vd, SIMD_SUFFIX)
|
||||
#define vloadu_vf_p CONCAT_SIMD_SUFFIX(vloadu_vf_p, SIMD_SUFFIX)
|
||||
#define vstoreu_v_p_vf CONCAT_SIMD_SUFFIX(vstoreu_v_p_vf, SIMD_SUFFIX)
|
||||
#define vloadu_vi_p CONCAT_SIMD_SUFFIX(vloadu_vi_p, SIMD_SUFFIX)
|
||||
#define vstoreu_v_p_vi CONCAT_SIMD_SUFFIX(vstoreu_v_p_vi, SIMD_SUFFIX)
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
// Smoke-test that the compiled-in SIMD extension actually executes on this
// host: broadcast the argument across one vector, run xpow/xpowf, and return
// 1 as soon as a lane comes back non-NaN. Returns 0 if neither precision is
// enabled or both produced NaN.
int check_feature(double d, float f) {
#ifdef ENABLE_DP
  {
    double s[VECTLENDP];
    int i;
    for(i=0;i<VECTLENDP;i++) {
      s[i] = d;  // NOTE(review): broadcasts d, not f — matches upstream; confirm intent
    }
    vdouble a = vloadu_vd_p(s);
    a = xpow(a, a);
    vstoreu_v_p_vd(s, a);
    if (s[0] == s[0]) return 1;  // s[0]==s[0] is false only for NaN
  }
#endif
#ifdef ENABLE_SP
  {
    float s[VECTLENSP];
    int i;
    for(i=0;i<VECTLENSP;i++) {
      s[i] = d;  // NOTE(review): also uses d (double) here, f is unused — confirm upstream intent
    }
    vfloat a = vloadu_vf_p(s);
    a = xpowf(a, a);
    vstoreu_v_p_vf(s, a);
    if (s[0] == s[0]) return 1;
  }
#endif
  return 0;
}
|
||||
|
||||
// Accessors for the two halves of a vdouble2 pair. Only defined for targets
// whose pair type is a plain struct with .x/.y members; SVE/RVV and the
// inline-header build provide their own versions.
#if defined(ENABLE_DP) && !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA) || defined(USE_INLINE_HEADER))
static vdouble vd2getx_vd_vd2(vdouble2 v) { return v.x; }
static vdouble vd2gety_vd_vd2(vdouble2 v) { return v.y; }
#endif
|
||||
|
||||
// Single-precision counterparts of the vdouble2 accessors above, under the
// same target restrictions.
#if defined(ENABLE_SP) && !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA) || defined(USE_INLINE_HEADER))
static vfloat vf2getx_vf_vf2(vfloat2 v) { return v.x; }
static vfloat vf2gety_vf_vf2(vfloat2 v) { return v.y; }
#endif
|
||||
|
||||
//
|
||||
|
||||
// Vector-tester protocol handler: double -> double. The probe value is
// planted in one random lane of an otherwise random vector, the SIMD
// function is applied, and only that lane's bit pattern is echoed back —
// this also exercises the other lanes with random data.
#define func_d_d(funcStr, funcName) {                   \
    while (startsWith(buf, funcStr " ")) {              \
      uint64_t u;                                       \
      sscanf(buf, funcStr " %" PRIx64, &u);             \
      double s[VECTLENDP];                              \
      memrand(s, sizeof(s));                            \
      int idx = xrand() & (VECTLENDP-1);                \
      s[idx] = u2d(u);                                  \
      vdouble a = vloadu_vd_p(s);                       \
      a = funcName(a);                                  \
      vstoreu_v_p_vd(s, a);                             \
      u = d2u(s[idx]);                                  \
      printf("%" PRIx64 "\n", u);                       \
      fflush(stdout);                                   \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;  \
    }                                                   \
  }
|
||||
|
||||
// Vector-tester protocol handler: double -> (double, double). The pair
// result is unpacked via vd2getx/vd2gety into two lane arrays and the
// probed lane of each half is printed.
#define func_d2_d(funcStr, funcName) {                                  \
    while (startsWith(buf, funcStr " ")) {                              \
      uint64_t u;                                                       \
      sscanf(buf, funcStr " %" PRIx64, &u);                             \
      double s[VECTLENDP], t[VECTLENDP];                                \
      memrand(s, sizeof(s));                                            \
      memrand(t, sizeof(t));                                            \
      int idx = xrand() & (VECTLENDP-1);                                \
      s[idx] = u2d(u);                                                  \
      vdouble2 v;                                                       \
      vdouble a = vloadu_vd_p(s);                                       \
      v = funcName(a);                                                  \
      vstoreu_v_p_vd(s, vd2getx_vd_vd2(v));                             \
      vstoreu_v_p_vd(t, vd2gety_vd_vd2(v));                             \
      Sleef_double2 d2;                                                 \
      d2.x = s[idx];                                                    \
      d2.y = t[idx];                                                    \
      printf("%" PRIx64 " %" PRIx64 "\n", d2u(d2.x), d2u(d2.y));        \
      fflush(stdout);                                                   \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;                  \
    }                                                                   \
  }
|
||||
|
||||
// Vector-tester protocol handler: (double, double) -> double. Both probe
// values go into the same lane index of two random vectors.
#define func_d_d_d(funcStr, funcName) {                         \
    while (startsWith(buf, funcStr " ")) {                      \
      uint64_t u, v;                                            \
      sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v);     \
      double s[VECTLENDP], t[VECTLENDP];                        \
      memrand(s, sizeof(s));                                    \
      memrand(t, sizeof(t));                                    \
      int idx = xrand() & (VECTLENDP-1);                        \
      s[idx] = u2d(u);                                          \
      t[idx] = u2d(v);                                          \
      vdouble a, b;                                             \
      a = vloadu_vd_p(s);                                       \
      b = vloadu_vd_p(t);                                       \
      a = funcName(a, b);                                       \
      vstoreu_v_p_vd(s, a);                                     \
      u = d2u(s[idx]);                                          \
      printf("%" PRIx64 "\n", u);                               \
      fflush(stdout);                                           \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;          \
    }                                                           \
  }
|
||||
|
||||
// Vector-tester protocol handler: (double, int-vector) -> double. The int
// lane buffer is sized VECTLENDP*2 (matches upstream; the integer vector
// type may be wider than the double vector on some targets).
#define func_d_d_i(funcStr, funcName) {                                 \
    while (startsWith(buf, funcStr " ")) {                              \
      uint64_t u, v;                                                    \
      sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v);             \
      double s[VECTLENDP];                                              \
      int t[VECTLENDP*2];                                               \
      memrand(s, sizeof(s));                                            \
      memrand(t, sizeof(t));                                            \
      int idx = xrand() & (VECTLENDP-1);                                \
      s[idx] = u2d(u);                                                  \
      t[idx] = (int)u2d(v);                                             \
      vstoreu_v_p_vd(s, funcName(vloadu_vd_p(s), vloadu_vi_p(t)));      \
      u = d2u(s[idx]);                                                  \
      printf("%" PRIx64 "\n", u);                                       \
      fflush(stdout);                                                   \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;                  \
    }                                                                   \
  }
|
||||
|
||||
// Vector-tester protocol handler: double -> int. The probed lane of the
// resulting integer vector is printed in decimal.
#define func_i_d(funcStr, funcName) {                   \
    while (startsWith(buf, funcStr " ")) {              \
      uint64_t u;                                       \
      int i;                                            \
      sscanf(buf, funcStr " %" PRIx64, &u);             \
      double s[VECTLENDP];                              \
      int t[VECTLENDP*2];                               \
      memrand(s, sizeof(s));                            \
      memrand(t, sizeof(t));                            \
      int idx = xrand() & (VECTLENDP-1);                \
      s[idx] = u2d(u);                                  \
      vdouble a = vloadu_vd_p(s);                       \
      vint vi = funcName(a);                            \
      vstoreu_v_p_vi(t, vi);                            \
      i = t[idx];                                       \
      printf("%d\n", i);                                \
      fflush(stdout);                                   \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;  \
    }                                                   \
  }
|
||||
|
||||
//
|
||||
|
||||
// Vector-tester protocol handler: float -> float. Single-precision analogue
// of func_d_d, using 32-bit hex bit patterns and VECTLENSP lanes.
#define func_f_f(funcStr, funcName) {                   \
    while (startsWith(buf, funcStr " ")) {              \
      uint32_t u;                                       \
      sscanf(buf, funcStr " %x", &u);                   \
      float s[VECTLENSP];                               \
      memrand(s, sizeof(s));                            \
      int idx = xrand() & (VECTLENSP-1);                \
      s[idx] = u2f(u);                                  \
      vfloat a = vloadu_vf_p(s);                        \
      a = funcName(a);                                  \
      vstoreu_v_p_vf(s, a);                             \
      u = f2u(s[idx]);                                  \
      printf("%x\n", u);                                \
      fflush(stdout);                                   \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;  \
    }                                                   \
  }
|
||||
|
||||
// Vector-tester protocol handler: float -> (float, float). Unpacks the pair
// via vf2getx/vf2gety and prints the probed lane of each half.
#define func_f2_f(funcStr, funcName) {                  \
    while (startsWith(buf, funcStr " ")) {              \
      uint32_t u;                                       \
      sscanf(buf, funcStr " %x", &u);                   \
      float s[VECTLENSP], t[VECTLENSP];                 \
      memrand(s, sizeof(s));                            \
      memrand(t, sizeof(t));                            \
      int idx = xrand() & (VECTLENSP-1);                \
      s[idx] = u2f(u);                                  \
      vfloat2 v;                                        \
      vfloat a = vloadu_vf_p(s);                        \
      v = funcName(a);                                  \
      vstoreu_v_p_vf(s, vf2getx_vf_vf2(v));             \
      vstoreu_v_p_vf(t, vf2gety_vf_vf2(v));             \
      Sleef_float2 d2;                                  \
      d2.x = s[idx];                                    \
      d2.y = t[idx];                                    \
      printf("%x %x\n", f2u(d2.x), f2u(d2.y));          \
      fflush(stdout);                                   \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;  \
    }                                                   \
  }
|
||||
|
||||
// Vector-tester protocol handler: (float, float) -> float. Both probe values
// share the same lane index of two random vectors.
#define func_f_f_f(funcStr, funcName) {                 \
    while (startsWith(buf, funcStr " ")) {              \
      uint32_t u, v;                                    \
      sscanf(buf, funcStr " %x %x", &u, &v);            \
      float s[VECTLENSP], t[VECTLENSP];                 \
      memrand(s, sizeof(s));                            \
      memrand(t, sizeof(t));                            \
      int idx = xrand() & (VECTLENSP-1);                \
      s[idx] = u2f(u);                                  \
      t[idx] = u2f(v);                                  \
      vfloat a, b;                                      \
      a = vloadu_vf_p(s);                               \
      b = vloadu_vf_p(t);                               \
      a = funcName(a, b);                               \
      vstoreu_v_p_vf(s, a);                             \
      u = f2u(s[idx]);                                  \
      printf("%x\n", u);                                \
      fflush(stdout);                                   \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;  \
    }                                                   \
  }
|
||||
|
||||
//
|
||||
|
||||
#define BUFSIZE 1024
|
||||
|
||||
int main2(int argc, char **argv) {
|
||||
xsrand(time(NULL));
|
||||
|
||||
{
|
||||
int k = 0;
|
||||
#ifdef ENABLE_DP
|
||||
k += 1;
|
||||
#endif
|
||||
#ifdef ENABLE_SP
|
||||
k += 2;
|
||||
#endif
|
||||
#if defined(ENABLE_NEON32) || defined(ENABLE_NEON32VFPV4)
|
||||
k += 4; // flush to zero
|
||||
#elif defined(ENABLE_VECEXT)
|
||||
if (vcast_f_vf(xpowf(vcast_vf_f(0.5f), vcast_vf_f(140))) == 0) k += 4;
|
||||
#endif
|
||||
#if defined(DETERMINISTIC)
|
||||
k += 8;
|
||||
#endif
|
||||
|
||||
printf("%d\n", k);
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
fprintf(stderr, "IUT : %s\n", (const char *)xgetPtrf(0));
|
||||
#endif
|
||||
fflush(stderr);
|
||||
|
||||
char buf[BUFSIZE];
|
||||
fgets(buf, BUFSIZE-1, stdin);
|
||||
|
||||
while(!feof(stdin)) {
|
||||
#ifdef ENABLE_DP
|
||||
func_d_d("sin", xsin);
|
||||
func_d_d("cos", xcos);
|
||||
func_d_d("tan", xtan);
|
||||
func_d_d("asin", xasin);
|
||||
func_d_d("acos", xacos);
|
||||
func_d_d("atan", xatan);
|
||||
func_d_d("log", xlog);
|
||||
func_d_d("exp", xexp);
|
||||
|
||||
#ifndef DETERMINISTIC
|
||||
func_d_d("sqrt", xsqrt);
|
||||
func_d_d("sqrt_u05", xsqrt_u05);
|
||||
func_d_d("sqrt_u35", xsqrt_u35);
|
||||
#endif
|
||||
func_d_d("cbrt", xcbrt);
|
||||
func_d_d("cbrt_u1", xcbrt_u1);
|
||||
|
||||
func_d_d("sinh", xsinh);
|
||||
func_d_d("cosh", xcosh);
|
||||
func_d_d("tanh", xtanh);
|
||||
func_d_d("sinh_u35", xsinh_u35);
|
||||
func_d_d("cosh_u35", xcosh_u35);
|
||||
func_d_d("tanh_u35", xtanh_u35);
|
||||
func_d_d("asinh", xasinh);
|
||||
func_d_d("acosh", xacosh);
|
||||
func_d_d("atanh", xatanh);
|
||||
|
||||
func_d_d("sin_u1", xsin_u1);
|
||||
func_d_d("cos_u1", xcos_u1);
|
||||
func_d_d("tan_u1", xtan_u1);
|
||||
func_d_d("sinpi_u05", xsinpi_u05);
|
||||
func_d_d("cospi_u05", xcospi_u05);
|
||||
func_d_d("asin_u1", xasin_u1);
|
||||
func_d_d("acos_u1", xacos_u1);
|
||||
func_d_d("atan_u1", xatan_u1);
|
||||
func_d_d("log_u1", xlog_u1);
|
||||
|
||||
func_d_d("exp2", xexp2);
|
||||
func_d_d("exp10", xexp10);
|
||||
func_d_d("exp2_u35", xexp2_u35);
|
||||
func_d_d("exp10_u35", xexp10_u35);
|
||||
func_d_d("expm1", xexpm1);
|
||||
func_d_d("log10", xlog10);
|
||||
func_d_d("log2", xlog2);
|
||||
func_d_d("log2_u35", xlog2_u35);
|
||||
func_d_d("log1p", xlog1p);
|
||||
|
||||
func_d2_d("sincos", xsincos);
|
||||
func_d2_d("sincos_u1", xsincos_u1);
|
||||
func_d2_d("sincospi_u35", xsincospi_u35);
|
||||
func_d2_d("sincospi_u05", xsincospi_u05);
|
||||
|
||||
func_d_d_d("pow", xpow);
|
||||
func_d_d_d("atan2", xatan2);
|
||||
func_d_d_d("atan2_u1", xatan2_u1);
|
||||
|
||||
func_d_d_i("ldexp", xldexp);
|
||||
|
||||
func_i_d("ilogb", xilogb);
|
||||
|
||||
func_d_d("fabs", xfabs);
|
||||
func_d_d("trunc", xtrunc);
|
||||
func_d_d("floor", xfloor);
|
||||
func_d_d("ceil", xceil);
|
||||
func_d_d("round", xround);
|
||||
func_d_d("rint", xrint);
|
||||
func_d_d("frfrexp", xfrfrexp);
|
||||
func_i_d("expfrexp", xexpfrexp);
|
||||
|
||||
func_d_d_d("hypot_u05", xhypot_u05);
|
||||
func_d_d_d("hypot_u35", xhypot_u35);
|
||||
func_d_d_d("copysign", xcopysign);
|
||||
func_d_d_d("fmax", xfmax);
|
||||
func_d_d_d("fmin", xfmin);
|
||||
func_d_d_d("fdim", xfdim);
|
||||
func_d_d_d("nextafter", xnextafter);
|
||||
func_d_d_d("fmod", xfmod);
|
||||
func_d_d_d("remainder", xremainder);
|
||||
|
||||
func_d2_d("modf", xmodf);
|
||||
|
||||
func_d_d("tgamma_u1", xtgamma_u1);
|
||||
func_d_d("lgamma_u1", xlgamma_u1);
|
||||
func_d_d("erf_u1", xerf_u1);
|
||||
func_d_d("erfc_u15", xerfc_u15);
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SP
|
||||
func_f_f("sinf", xsinf);
|
||||
func_f_f("cosf", xcosf);
|
||||
func_f_f("tanf", xtanf);
|
||||
func_f_f("asinf", xasinf);
|
||||
func_f_f("acosf", xacosf);
|
||||
func_f_f("atanf", xatanf);
|
||||
func_f_f("logf", xlogf);
|
||||
func_f_f("expf", xexpf);
|
||||
|
||||
#ifndef DETERMINISTIC
|
||||
func_f_f("sqrtf", xsqrtf);
|
||||
func_f_f("sqrtf_u05", xsqrtf_u05);
|
||||
func_f_f("sqrtf_u35", xsqrtf_u35);
|
||||
#endif
|
||||
func_f_f("cbrtf", xcbrtf);
|
||||
func_f_f("cbrtf_u1", xcbrtf_u1);
|
||||
|
||||
func_f_f("sinhf", xsinhf);
|
||||
func_f_f("coshf", xcoshf);
|
||||
func_f_f("tanhf", xtanhf);
|
||||
func_f_f("sinhf_u35", xsinhf_u35);
|
||||
func_f_f("coshf_u35", xcoshf_u35);
|
||||
func_f_f("tanhf_u35", xtanhf_u35);
|
||||
func_f_f("asinhf", xasinhf);
|
||||
func_f_f("acoshf", xacoshf);
|
||||
func_f_f("atanhf", xatanhf);
|
||||
|
||||
func_f_f("sinf_u1", xsinf_u1);
|
||||
func_f_f("cosf_u1", xcosf_u1);
|
||||
func_f_f("tanf_u1", xtanf_u1);
|
||||
func_f_f("sinpif_u05", xsinpif_u05);
|
||||
func_f_f("cospif_u05", xcospif_u05);
|
||||
func_f_f("asinf_u1", xasinf_u1);
|
||||
func_f_f("acosf_u1", xacosf_u1);
|
||||
func_f_f("atanf_u1", xatanf_u1);
|
||||
func_f_f("logf_u1", xlogf_u1);
|
||||
|
||||
func_f_f("exp2f", xexp2f);
|
||||
func_f_f("exp10f", xexp10f);
|
||||
func_f_f("exp2f_u35", xexp2f_u35);
|
||||
func_f_f("exp10f_u35", xexp10f_u35);
|
||||
func_f_f("expm1f", xexpm1f);
|
||||
func_f_f("log10f", xlog10f);
|
||||
func_f_f("log2f", xlog2f);
|
||||
func_f_f("log2f_u35", xlog2f_u35);
|
||||
func_f_f("log1pf", xlog1pf);
|
||||
|
||||
func_f2_f("sincosf", xsincosf);
|
||||
func_f2_f("sincosf_u1", xsincosf_u1);
|
||||
func_f2_f("sincospif_u35", xsincospif_u35);
|
||||
func_f2_f("sincospif_u05", xsincospif_u05);
|
||||
|
||||
func_f_f_f("powf", xpowf);
|
||||
func_f_f_f("atan2f", xatan2f);
|
||||
func_f_f_f("atan2f_u1", xatan2f_u1);
|
||||
|
||||
func_f_f("fabsf", xfabsf);
|
||||
func_f_f("truncf", xtruncf);
|
||||
func_f_f("floorf", xfloorf);
|
||||
func_f_f("ceilf", xceilf);
|
||||
func_f_f("roundf", xroundf);
|
||||
func_f_f("rintf", xrintf);
|
||||
func_f_f("frfrexpf", xfrfrexpf);
|
||||
|
||||
func_f_f_f("hypotf_u05", xhypotf_u05);
|
||||
func_f_f_f("hypotf_u35", xhypotf_u35);
|
||||
func_f_f_f("copysignf", xcopysignf);
|
||||
func_f_f_f("fmaxf", xfmaxf);
|
||||
func_f_f_f("fminf", xfminf);
|
||||
func_f_f_f("fdimf", xfdimf);
|
||||
func_f_f_f("nextafterf", xnextafterf);
|
||||
func_f_f_f("fmodf", xfmodf);
|
||||
func_f_f_f("remainderf", xremainderf);
|
||||
|
||||
func_f2_f("modff", xmodff);
|
||||
|
||||
func_f_f("tgammaf_u1", xtgammaf_u1);
|
||||
func_f_f("lgammaf_u1", xlgammaf_u1);
|
||||
func_f_f("erff_u1", xerff_u1);
|
||||
func_f_f("erfcf_u15", xerfcf_u15);
|
||||
|
||||
func_f_f("fastsinf_u3500", xfastsinf_u3500);
|
||||
func_f_f("fastcosf_u3500", xfastcosf_u3500);
|
||||
func_f_f_f("fastpowf_u3500", xfastpowf_u3500);
|
||||
#endif
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,92 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
#include <x86intrin.h>
|
||||
|
||||
#include <sleef.h>
|
||||
|
||||
#define N 64   // number of lanes in the shared work buffers
#define M 256  // number of randomized trials run by main2()

// Shared buffers: r0 receives results, a0 holds the inputs.
// NOTE(review): a1 and a2 are declared but unused in the code visible here.
double r0[N], a0[N], a1[N], a2[N];
|
||||
|
||||
// Reference pass: compute sin() with scalar libm for every lane of a0,
// storing the results into r0.
void do_libm() {
  for (int k = 0; k < N; k++) {
    r0[k] = sin(a0[k]);
  }
}
|
||||
|
||||
#if defined(__SSE2__)
|
||||
void do_sleef_sse2() { _mm_storeu_pd(r0, Sleef_sind2_u10sse2(_mm_loadu_pd(a0))); }
|
||||
#endif
|
||||
|
||||
#if defined(__AVX__)
// 4-lane double sin via the SLEEF AVX kernel (u10 = 1.0-ULP variant);
// reads the first four lanes of a0, writes the first four lanes of r0.
void do_sleef_avx() {
  __m256d v = _mm256_loadu_pd(a0);
  _mm256_storeu_pd(r0, Sleef_sind4_u10avx(v));
}
#endif
|
||||
|
||||
#if defined(__AVX2__)
// 4-lane double sin via the SLEEF AVX2 kernel (u10 = 1.0-ULP variant);
// reads the first four lanes of a0, writes the first four lanes of r0.
void do_sleef_avx2() {
  __m256d v = _mm256_loadu_pd(a0);
  _mm256_storeu_pd(r0, Sleef_sind4_u10avx2(v));
}
#endif
|
||||
|
||||
#if defined(__AVX512F__)
// 8-lane double sin via the SLEEF AVX-512F kernel (u10 = 1.0-ULP variant);
// reads the first eight lanes of a0, writes the first eight lanes of r0.
void do_sleef_avx512f() {
  __m512d v = _mm512_loadu_pd(a0);
  _mm512_storeu_pd(r0, Sleef_sind8_u10avx512f(v));
}
#endif
|
||||
|
||||
// Evaluates sin(d) once with libm and once with each SLEEF vector backend
// that was compiled in, comparing results lane 0 against lane 0.
// Returns 1 (success) as soon as one backend reproduces the libm result
// bit-exactly; returns 0 if no compiled-in backend matches (or none is
// compiled in at all).
// NOTE(review): only r0[0] is compared, and a0 is refilled with the same
// constant d before every call, so each backend is effectively checked on a
// single input value; the bit-exact `==` compare presumably relies on the
// SLEEF u10 kernels agreeing with libm for typical arguments — confirm.
int do_test_once(double d) {
  // Reference result from scalar libm.
  for(int i=0;i<N;i++) a0[i] = d;
  do_libm();
  double rm = r0[0];

#if defined(__SSE2__)
  // a0 is re-seeded before each backend call (the kernels only read a0,
  // but this keeps each probe self-contained).
  for(int i=0;i<N;i++) a0[i] = d;
  do_sleef_sse2();
  if (rm == r0[0]) return 1;
#endif

#if defined(__AVX__)
  for(int i=0;i<N;i++) a0[i] = d;
  do_sleef_avx();
  if (rm == r0[0]) return 1;
#endif

#if defined(__AVX2__)
  for(int i=0;i<N;i++) a0[i] = d;
  do_sleef_avx2();
  if (rm == r0[0]) return 1;
#endif

#if defined(__AVX512F__)
  for(int i=0;i<N;i++) a0[i] = d;
  do_sleef_avx512f();
  if (rm == r0[0]) return 1;
#endif

  // No backend matched the libm reference.
  return 0;
}
|
||||
|
||||
// Executes one SLEEF kernel per vector extension compiled in.
// The arguments d and f are unused; the point of the calls is to run the
// vector instructions themselves — presumably so that a build-time probe
// can detect (via crash/illegal-instruction) whether the host CPU supports
// the extensions, with reaching the return meaning "all usable". Confirm
// against the caller/build script.
int check_feature(double d, float f) {
#if defined(__SSE2__)
  do_sleef_sse2();
#endif

#if defined(__AVX__)
  do_sleef_avx();
#endif

#if defined(__AVX2__)
  do_sleef_avx2();
#endif

#if defined(__AVX512F__)
  do_sleef_avx512f();
#endif

  return 1;
}
|
||||
|
||||
int main2(int argc, char **argv) {
|
||||
for(int i=0;i<M;i++) {
|
||||
if (!do_test_once(10.0 * ((2.0 * rand() / RAND_MAX) - 1))) {
|
||||
printf("fail\n");
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
printf("pass\n");
|
||||
exit(0);
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,991 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <mpfr.h>
|
||||
#include <time.h>
|
||||
#include <float.h>
|
||||
#include <limits.h>
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#ifdef ENABLE_SYS_getrandom
|
||||
#define _GNU_SOURCE
|
||||
#include <unistd.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <linux/random.h>
|
||||
#endif
|
||||
|
||||
#include "sleef.h"
|
||||
#include "testerutil.h"
|
||||
|
||||
#define DORENAME
|
||||
#include "rename.h"
|
||||
|
||||
#define DENORMAL_DBL_MIN (4.9406564584124654418e-324)  // smallest positive subnormal double
#define POSITIVE_INFINITY INFINITY
#define NEGATIVE_INFINITY (-INFINITY)
|
||||
|
||||
// Type-pun between a double and its raw 64-bit IEEE-754 representation.
// Used by the random-input generators below to manufacture and perturb
// test values at the bit/ULP level.
typedef union {
  double d;
  uint64_t u64;
  int64_t i64;
} conv_t;
|
||||
|
||||
// Step the raw IEEE-754 bit pattern of x by n units (subtracts n from the
// 64-bit representation). With n > 0 this moves x toward zero by n ULPs;
// a negative n moves it away from zero.
double nexttoward0(double x, int n) {
  union {
    double f;
    uint64_t u;
  } bits;
  bits.f = x;
  bits.u -= n;
  return bits.f;
}
|
||||
|
||||
// Returns a random double spanning the full bit space (NaNs, infinities
// and subnormals included). With probability 4/64 it instead returns a
// value a random number of ULPs from a boundary: cases 0/1 land just
// above/below +-0 (tiny subnormals), cases 2/3 just below +-infinity.
// NOTE(review): the exact sequence of random() calls is load-bearing for
// reproducing streams — do not reorder.
double rnd() {
  conv_t c;
  switch(random() & 63) {
  case 0: return nexttoward0( 0.0, -(random() & ((1 << (random() & 31)) - 1)));
  case 1: return nexttoward0(-0.0, -(random() & ((1 << (random() & 31)) - 1)));
  case 2: return nexttoward0( INFINITY, (random() & ((1 << (random() & 31)) - 1)));
  case 3: return nexttoward0(-INFINITY, (random() & ((1 << (random() & 31)) - 1)));
  }
#ifdef ENABLE_SYS_getrandom
  // Kernel-quality randomness for all 64 bits when available.
  syscall(SYS_getrandom, &c.u64, sizeof(c.u64), 0);
#else
  // random() yields 31 bits; three calls cover the full 64-bit word.
  c.u64 = random() | ((uint64_t)random() << 31) | ((uint64_t)random() << 62);
#endif
  return c.d;
}
|
||||
|
||||
// Returns a random double with fully random bits, rejecting values for
// which isnumber() is false (isnumber() comes from testerutil.h —
// presumably a finite-value check; confirm there).
double rnd_fr() {
  conv_t c;
  do {
#ifdef ENABLE_SYS_getrandom
    syscall(SYS_getrandom, &c.u64, sizeof(c.u64), 0);
#else
    // random() yields 31 bits; three calls cover the full 64-bit word.
    c.u64 = random() | ((uint64_t)random() << 31) | ((uint64_t)random() << 62);
#endif
  } while(!isnumber(c.d));
  return c.d;
}
|
||||
|
||||
// Returns a random double restricted to [-1, 1] ("zo" = zero..one range,
// both signs), rejecting non-numbers (per isnumber() from testerutil.h)
// and anything outside that interval.
double rnd_zo() {
  conv_t c;
  do {
#ifdef ENABLE_SYS_getrandom
    syscall(SYS_getrandom, &c.u64, sizeof(c.u64), 0);
#else
    // random() yields 31 bits; three calls cover the full 64-bit word.
    c.u64 = random() | ((uint64_t)random() << 31) | ((uint64_t)random() << 62);
#endif
  } while(!isnumber(c.d) || c.d < -1 || 1 < c.d);
  return c.d;
}
|
||||
|
||||
int main(int argc,char **argv)
|
||||
{
|
||||
mpfr_t frw, frx, fry, frz;
|
||||
|
||||
mpfr_set_default_prec(1280);
|
||||
mpfr_inits(frw, frx, fry, frz, NULL);
|
||||
|
||||
conv_t cd;
|
||||
double d, t;
|
||||
double d2, d3, zo;
|
||||
|
||||
int cnt, ecnt = 0;
|
||||
|
||||
srandom(time(NULL));
|
||||
|
||||
for(cnt = 0;ecnt < 1000;cnt++) {
|
||||
switch(cnt & 7) {
|
||||
case 0:
|
||||
d = rnd();
|
||||
d2 = rnd();
|
||||
d3 = rnd();
|
||||
zo = rnd();
|
||||
break;
|
||||
case 1:
|
||||
cd.d = rint(rnd_zo() * 1e+10) * M_PI_4;
|
||||
cd.i64 += (random() & 0xff) - 0x7f;
|
||||
d = cd.d;
|
||||
d2 = rnd();
|
||||
d3 = rnd();
|
||||
zo = rnd();
|
||||
break;
|
||||
case 2:
|
||||
cd.d = rnd_fr() * M_PI_4;
|
||||
cd.i64 += (random() & 0xf) - 0x7;
|
||||
d = cd.d;
|
||||
d2 = rnd();
|
||||
d3 = rnd();
|
||||
zo = rnd();
|
||||
break;
|
||||
default:
|
||||
d = rnd_fr();
|
||||
d2 = rnd_fr();
|
||||
d3 = rnd_fr();
|
||||
zo = rnd_zo();
|
||||
break;
|
||||
}
|
||||
|
||||
Sleef_double2 sc = xsincospi_u05(d);
|
||||
Sleef_double2 sc2 = xsincospi_u35(d);
|
||||
|
||||
{
|
||||
const double rangemax2 = 1e+9/4;
|
||||
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_sinpi(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULP2dp(t = sc.x, frx);
|
||||
|
||||
if (u0 != 0 && ((fabs(d) <= rangemax2 && u0 > 0.506) || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C sincospi_u05 sin arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULP2dp(t = sc2.x, frx);
|
||||
|
||||
if (u1 != 0 && ((fabs(d) <= rangemax2 && u1 > 1.5) || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C sincospi_u35 sin arg=%.20g ulp=%.20g\n", d, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u2 = countULP2dp(t = xsinpi_u05(d), frx);
|
||||
|
||||
if (u2 != 0 && ((fabs(d) <= rangemax2 && u2 > 0.506) || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C sinpi_u05 arg=%.20g ulp=%.20g\n", d, u2);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
const double rangemax2 = 1e+9/4;
|
||||
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_cospi(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULP2dp(t = sc.y, frx);
|
||||
|
||||
if (u0 != 0 && ((fabs(d) <= rangemax2 && u0 > 0.506) || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C sincospi_u05 cos arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULP2dp(t = sc.y, frx);
|
||||
|
||||
if (u1 != 0 && ((fabs(d) <= rangemax2 && u1 > 1.5) || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C sincospi_u35 cos arg=%.20g ulp=%.20g\n", d, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u2 = countULP2dp(t = xcospi_u05(d), frx);
|
||||
|
||||
if (u2 != 0 && ((fabs(d) <= rangemax2 && u2 > 0.506) || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C cospi_u05 arg=%.20g ulp=%.20g\n", d, u2);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
sc = xsincos(d);
|
||||
sc2 = xsincos_u1(d);
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_sin(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xsin(d), frx);
|
||||
|
||||
if (u0 != 0 && (u0 > 3.5 || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C sin arg=%.20g ulp=%.20g\n", d, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULPdp(sc.x, frx);
|
||||
|
||||
if (u1 != 0 && (u1 > 3.5 || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C sincos sin arg=%.20g ulp=%.20g\n", d, u1);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u2 = countULPdp(t = xsin_u1(d), frx);
|
||||
|
||||
if (u2 != 0 && (u2 > 1 || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C sin_u1 arg=%.20g ulp=%.20g\n", d, u2);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u3 = countULPdp(t = sc2.x, frx);
|
||||
|
||||
if (u3 != 0 && (u3 > 1 || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C sincos_u1 sin arg=%.20g ulp=%.20g\n", d, u3);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_cos(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xcos(d), frx);
|
||||
|
||||
if (u0 != 0 && (u0 > 3.5 || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C cos arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULPdp(t = sc.y, frx);
|
||||
|
||||
if (u1 != 0 && (u1 > 3.5 || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C sincos cos arg=%.20g ulp=%.20g\n", d, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u2 = countULPdp(t = xcos_u1(d), frx);
|
||||
|
||||
if (u2 != 0 && (u2 > 1 || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C cos_u1 arg=%.20g ulp=%.20g\n", d, u2);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u3 = countULPdp(t = sc2.y, frx);
|
||||
|
||||
if (u3 != 0 && (u3 > 1 || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C sincos_u1 cos arg=%.20g ulp=%.20g\n", d, u3);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_tan(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xtan(d), frx);
|
||||
|
||||
if (u0 != 0 && (u0 > 3.5 || isnan(t))) {
|
||||
printf("Pure C tan arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULPdp(t = xtan_u1(d), frx);
|
||||
|
||||
if (u1 != 0 && (u1 > 1 || isnan(t))) {
|
||||
printf("Pure C tan_u1 arg=%.20g ulp=%.20g\n", d, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, fabs(d), GMP_RNDN);
|
||||
mpfr_log(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xlog(fabs(d)), frx);
|
||||
|
||||
if (u0 > 3.5) {
|
||||
printf("Pure C log arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULPdp(t = xlog_u1(fabs(d)), frx);
|
||||
|
||||
if (u1 > 1) {
|
||||
printf("Pure C log_u1 arg=%.20g ulp=%.20g\n", d, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, fabs(d), GMP_RNDN);
|
||||
mpfr_log10(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xlog10(fabs(d)), frx);
|
||||
|
||||
if (u0 > 1) {
|
||||
printf("Pure C log10 arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, fabs(d), GMP_RNDN);
|
||||
mpfr_log2(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xlog2(fabs(d)), frx);
|
||||
|
||||
if (u0 > 1) {
|
||||
printf("Pure C log2 arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULPdp(t = xlog2_u35(fabs(d)), frx);
|
||||
|
||||
if (u1 > 3.5) {
|
||||
printf("Pure C log2_u35 arg=%.20g ulp=%.20g\n", d, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_log1p(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xlog1p(d), frx);
|
||||
|
||||
if ((-1 <= d && d <= 1e+307 && u0 > 1) ||
|
||||
(d < -1 && !isnan(t)) ||
|
||||
(d > 1e+307 && !(u0 <= 1 || isinf(t)))) {
|
||||
printf("Pure C log1p arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_exp(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xexp(d), frx);
|
||||
|
||||
if (u0 > 1) {
|
||||
printf("Pure C exp arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_exp2(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xexp2(d), frx);
|
||||
|
||||
if (u0 > 1) {
|
||||
printf("Pure C exp2 arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULPdp(t = xexp2_u35(d), frx);
|
||||
|
||||
if (u1 > 3.5) {
|
||||
printf("Pure C exp2_u35 arg=%.20g ulp=%.20g\n", d, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_exp10(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xexp10(d), frx);
|
||||
|
||||
if (u0 > 1.09) {
|
||||
printf("Pure C exp10 arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULPdp(t = xexp10_u35(d), frx);
|
||||
|
||||
if (u1 > 3.5) {
|
||||
printf("Pure C exp10_u35 arg=%.20g ulp=%.20g\n", d, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_expm1(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xexpm1(d), frx);
|
||||
|
||||
if (u0 > 1) {
|
||||
printf("Pure C expm1 arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_set_d(fry, d2, GMP_RNDN);
|
||||
mpfr_pow(frx, fry, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xpow(d2, d), frx);
|
||||
|
||||
if (u0 > 1) {
|
||||
printf("Pure C pow arg=%.20g, %.20g ulp=%.20g\n", d2, d, u0);
|
||||
printf("correct = %g, test = %g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_cbrt(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xcbrt(d), frx);
|
||||
|
||||
if (u0 > 3.5) {
|
||||
printf("Pure C cbrt arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULPdp(t = xcbrt_u1(d), frx);
|
||||
|
||||
if (u1 > 1) {
|
||||
printf("Pure C cbrt_u1 arg=%.20g ulp=%.20g\n", d, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, zo, GMP_RNDN);
|
||||
mpfr_asin(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xasin(zo), frx);
|
||||
|
||||
if (u0 > 3.5) {
|
||||
printf("Pure C asin arg=%.20g ulp=%.20g\n", zo, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULPdp(t = xasin_u1(zo), frx);
|
||||
|
||||
if (u1 > 1) {
|
||||
printf("Pure C asin_u1 arg=%.20g ulp=%.20g\n", zo, u1);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, zo, GMP_RNDN);
|
||||
mpfr_acos(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xacos(zo), frx);
|
||||
|
||||
if (u0 > 3.5) {
|
||||
printf("Pure C acos arg=%.20g ulp=%.20g\n", zo, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULPdp(t = xacos_u1(zo), frx);
|
||||
|
||||
if (u1 > 1) {
|
||||
printf("Pure C acos_u1 arg=%.20g ulp=%.20g\n", zo, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_atan(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xatan(d), frx);
|
||||
|
||||
if (u0 > 3.5) {
|
||||
printf("Pure C atan arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULPdp(t = xatan_u1(d), frx);
|
||||
|
||||
if (u1 > 1) {
|
||||
printf("Pure C atan_u1 arg=%.20g ulp=%.20g\n", d, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_set_d(fry, d2, GMP_RNDN);
|
||||
mpfr_atan2(frx, fry, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xatan2(d2, d), frx);
|
||||
|
||||
if (u0 > 3.5) {
|
||||
printf("Pure C atan2 arg=%.20g, %.20g ulp=%.20g\n", d2, d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULP2dp(t = xatan2_u1(d2, d), frx);
|
||||
|
||||
if (u1 > 1) {
|
||||
printf("Pure C atan2_u1 arg=%.20g, %.20g ulp=%.20g\n", d2, d, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_sinh(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xsinh(d), frx);
|
||||
|
||||
if ((fabs(d) <= 709 && u0 > 1) ||
|
||||
(d > 709 && !(u0 <= 1 || (isinf(t) && t > 0))) ||
|
||||
(d < -709 && !(u0 <= 1 || (isinf(t) && t < 0)))) {
|
||||
printf("Pure C sinh arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_cosh(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xcosh(d), frx);
|
||||
|
||||
if ((fabs(d) <= 709 && u0 > 1) || !(u0 <= 1 || (isinf(t) && t > 0))) {
|
||||
printf("Pure C cosh arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_tanh(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xtanh(d), frx);
|
||||
|
||||
if (u0 > 1) {
|
||||
printf("Pure C tanh arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_sinh(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xsinh_u35(d), frx);
|
||||
|
||||
if ((fabs(d) <= 709 && u0 > 3.5) ||
|
||||
(d > 709 && !(u0 <= 3.5 || (isinf(t) && t > 0))) ||
|
||||
(d < -709 && !(u0 <= 3.5 || (isinf(t) && t < 0)))) {
|
||||
printf("Pure C sinh_u35 arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_cosh(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xcosh_u35(d), frx);
|
||||
|
||||
if ((fabs(d) <= 709 && u0 > 3.5) || !(u0 <= 3.5 || (isinf(t) && t > 0))) {
|
||||
printf("Pure C cosh_u35 arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_tanh(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xtanh_u35(d), frx);
|
||||
|
||||
if (u0 > 3.5) {
|
||||
printf("Pure C tanh_u35 arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_asinh(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xasinh(d), frx);
|
||||
|
||||
if ((fabs(d) < sqrt(DBL_MAX) && u0 > 1) ||
|
||||
(d >= sqrt(DBL_MAX) && !(u0 <= 1 || (isinf(t) && t > 0))) ||
|
||||
(d <= -sqrt(DBL_MAX) && !(u0 <= 1 || (isinf(t) && t < 0)))) {
|
||||
printf("Pure C asinh arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_acosh(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xacosh(d), frx);
|
||||
|
||||
if ((fabs(d) < sqrt(DBL_MAX) && u0 > 1) ||
|
||||
(d >= sqrt(DBL_MAX) && !(u0 <= 1 || (isinf(t) && t > 0))) ||
|
||||
(d <= -sqrt(DBL_MAX) && !isnan(t))) {
|
||||
printf("Pure C acosh arg=%.20g ulp=%.20g\n", d, u0);
|
||||
printf("%.20g\n", t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_atanh(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xatanh(d), frx);
|
||||
|
||||
if (u0 > 1) {
|
||||
printf("Pure C atanh arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_abs(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xfabs(d), frx);
|
||||
|
||||
if (u0 != 0) {
|
||||
printf("Pure C fabs arg=%.20g ulp=%.20g\n", d, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_set_d(fry, d2, GMP_RNDN);
|
||||
mpfr_copysign(frx, frx, fry, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xcopysign(d, d2), frx);
|
||||
|
||||
if (u0 != 0 && !isnan(d2)) {
|
||||
printf("Pure C copysign arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
|
||||
printf("correct = %g, test = %g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_set_d(fry, d2, GMP_RNDN);
|
||||
mpfr_max(frx, frx, fry, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xfmax(d, d2), frx);
|
||||
|
||||
if (u0 != 0) {
|
||||
printf("Pure C fmax arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_set_d(fry, d2, GMP_RNDN);
|
||||
mpfr_min(frx, frx, fry, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xfmin(d, d2), frx);
|
||||
|
||||
if (u0 != 0) {
|
||||
printf("Pure C fmin arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_set_d(fry, d2, GMP_RNDN);
|
||||
mpfr_dim(frx, frx, fry, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xfdim(d, d2), frx);
|
||||
|
||||
if (u0 > 0.5) {
|
||||
printf("Pure C fdim arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_trunc(frx, frx);
|
||||
|
||||
double u0 = countULPdp(t = xtrunc(d), frx);
|
||||
|
||||
if (u0 != 0) {
|
||||
printf("Pure C trunc arg=%.20g ulp=%.20g\n", d, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_floor(frx, frx);
|
||||
|
||||
double u0 = countULPdp(t = xfloor(d), frx);
|
||||
|
||||
if (u0 != 0) {
|
||||
printf("Pure C floor arg=%.20g ulp=%.20g\n", d, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_ceil(frx, frx);
|
||||
|
||||
double u0 = countULPdp(t = xceil(d), frx);
|
||||
|
||||
if (u0 != 0) {
|
||||
printf("Pure C ceil arg=%.20g ulp=%.20g\n", d, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_round(frx, frx);
|
||||
|
||||
double u0 = countULPdp(t = xround(d), frx);
|
||||
|
||||
if (u0 != 0) {
|
||||
printf("Pure C round arg=%.24g ulp=%.20g\n", d, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_rint(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xrint(d), frx);
|
||||
|
||||
if (u0 != 0) {
|
||||
printf("Pure C rint arg=%.24g ulp=%.20g\n", d, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_set_d(fry, d2, GMP_RNDN);
|
||||
mpfr_set_d(frz, d3, GMP_RNDN);
|
||||
mpfr_fma(frx, frx, fry, frz, GMP_RNDN);
|
||||
|
||||
double u0 = countULP2dp(t = xfma(d, d2, d3), frx);
|
||||
double c = mpfr_get_d(frx, GMP_RNDN);
|
||||
|
||||
if ((-1e+303 < c && c < 1e+303 && u0 > 0.5) ||
|
||||
!(u0 <= 0.5 || isinf(t))) {
|
||||
printf("Pure C fma arg=%.20g, %.20g, %.20g ulp=%.20g\n", d, d2, d3, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_sqrt(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xsqrt_u05(d), frx);
|
||||
|
||||
if (u0 > 0.50001) {
|
||||
printf("Pure C sqrt_u05 arg=%.20g ulp=%.20g\n", d, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_set_d(fry, d2, GMP_RNDN);
|
||||
mpfr_hypot(frx, frx, fry, GMP_RNDN);
|
||||
|
||||
double u0 = countULP2dp(t = xhypot_u05(d, d2), frx);
|
||||
|
||||
if (u0 > 0.5) {
|
||||
printf("Pure C hypot arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_set_d(fry, d2, GMP_RNDN);
|
||||
mpfr_hypot(frx, frx, fry, GMP_RNDN);
|
||||
|
||||
double u0 = countULP2dp(t = xhypot_u35(d, d2), frx);
|
||||
double c = mpfr_get_d(frx, GMP_RNDN);
|
||||
|
||||
if ((-1e+308 < c && c < 1e+308 && u0 > 3.5) ||
|
||||
!(u0 <= 3.5 || isinf(t))) {
|
||||
printf("Pure C hypot arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
t = xnextafter(d, d2);
|
||||
double c = nextafter(d, d2);
|
||||
|
||||
if (!(isnan(t) && isnan(c)) && t != c) {
|
||||
printf("Pure C nextafter arg=%.20g, %.20g\n", d, d2);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_set_exp(frx, 0);
|
||||
|
||||
double u0 = countULPdp(t = xfrfrexp(d), frx);
|
||||
|
||||
if (d != 0 && isnumber(d) && u0 != 0) {
|
||||
printf("Pure C frfrexp arg=%.20g ulp=%.20g\n", d, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
int cexp = mpfr_get_exp(frx);
|
||||
|
||||
int texp = xexpfrexp(d);
|
||||
|
||||
if (d != 0 && isnumber(d) && cexp != texp) {
|
||||
printf("Pure C expfrexp arg=%.20g\n", d);
|
||||
printf("correct = %d, test = %d\n", cexp, texp);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_set_d(fry, d2, GMP_RNDN);
|
||||
mpfr_fmod(frx, frx, fry, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xfmod(d, d2), frx);
|
||||
|
||||
if (fabsl((long double)d / d2) < 1e+300 && u0 > 0.5) {
|
||||
printf("Pure C fmod arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_set_d(fry, d2, GMP_RNDN);
|
||||
mpfr_remainder(frx, frx, fry, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xremainder(d, d2), frx);
|
||||
|
||||
if (fabsl((long double)d / d2) < 1e+300 && u0 > 0.5) {
|
||||
printf("Pure C remainder arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
int exp = (random() & 8191) - 4096;
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_set_exp(frx, mpfr_get_exp(frx) + exp);
|
||||
|
||||
double u0 = countULPdp(t = xldexp(d, exp), frx);
|
||||
|
||||
if (u0 > 0.5) {
|
||||
printf("Pure C ldexp arg=%.20g %d ulp=%.20g\n", d, exp, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_modf(fry, frz, frx, GMP_RNDN);
|
||||
|
||||
Sleef_double2 t2 = xmodf(d);
|
||||
double u0 = countULPdp(t2.x, frz);
|
||||
double u1 = countULPdp(t2.y, fry);
|
||||
|
||||
if (u0 != 0 || u1 != 0) {
|
||||
printf("Pure C modf arg=%.20g ulp=%.20g %.20g\n", d, u0, u1);
|
||||
printf("correct = %.20g, %.20g\n", mpfr_get_d(frz, GMP_RNDN), mpfr_get_d(fry, GMP_RNDN));
|
||||
printf("test = %.20g, %.20g\n", t2.x, t2.y);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
int s;
|
||||
mpfr_lgamma(frx, &s, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xlgamma_u1(d), frx);
|
||||
|
||||
if (((d < 0 && fabsl(t - mpfr_get_ld(frx, GMP_RNDN)) > 1e-15 && u0 > 1) || (0 <= d && d < 2e+305 && u0 > 1) || (2e+305 <= d && !(u0 <= 1 || isinf(t))))) {
|
||||
printf("Pure C xlgamma_u1 arg=%.20g ulp=%.20g\n", d, u0);
|
||||
printf("Correct = %.20Lg, test = %.20g\n", mpfr_get_ld(frx, GMP_RNDN), t);
|
||||
printf("Diff = %.20Lg\n", fabsl(t - mpfr_get_ld(frx, GMP_RNDN)));
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_gamma(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULP2dp(t = xtgamma_u1(d), frx);
|
||||
|
||||
if (u0 > 1.0) {
|
||||
printf("Pure C xtgamma_u1 arg=%.20g ulp=%.20g\n", d, u0);
|
||||
printf("Correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
printf("Diff = %.20Lg\n", fabsl(t - mpfr_get_ld(frx, GMP_RNDN)));
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_erfc(frx, frx, GMP_RNDN);
|
||||
|
||||
static double ebz = 9.8813129168249308835e-324; // nextafter(nextafter(0, 1), 1);
|
||||
|
||||
double u0 = countULP2dp(t = xerfc_u15(d), frx);
|
||||
|
||||
if ((d > 26.2 && u0 > 2.5 && !(mpfr_get_d(frx, GMP_RNDN) == 0 && t <= ebz)) || (d <= 26.2 && u0 > 1.5)) {
|
||||
printf("Pure C xerfc_u15 arg=%.20g ulp=%.20g\n", d, u0);
|
||||
printf("Correct = %.20Lg, test = %.20g\n", mpfr_get_ld(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_erf(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULP2dp(t = xerf_u1(d), frx);
|
||||
|
||||
if (u0 > 0.75) {
|
||||
printf("Pure C xerf_u1 arg=%.20g ulp=%.20g\n", d, u0);
|
||||
printf("Correct = %.20Lg, test = %.20g\n", mpfr_get_ld(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
}
|
||||
mpfr_clears(frw, frx, fry, frz, NULL);
|
||||
exit(0);
|
||||
}
|
||||
@@ -0,0 +1,241 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2024.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <mpfr.h>
|
||||
#include <time.h>
|
||||
#include <float.h>
|
||||
#include <limits.h>
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#include "misc.h"
|
||||
|
||||
#ifdef ENABLE_SYS_getrandom
|
||||
#define _GNU_SOURCE
|
||||
#include <unistd.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <linux/random.h>
|
||||
#endif
|
||||
|
||||
#include "sleef.h"
|
||||
#include "testerutil.h"
|
||||
|
||||
#define DORENAME
|
||||
#include "rename.h"
|
||||
|
||||
#define DENORMAL_LDBL_MIN (3.6451995318824746025284059336194e-4951L)
|
||||
#define XLDBL_MIN (3.3621031431120935062626778173218e-4932L)
|
||||
|
||||
#ifndef M_PIl
|
||||
#define M_PIl 3.141592653589793238462643383279502884L
|
||||
#endif
|
||||
|
||||
#ifndef M_PI_4l
|
||||
#define M_PI_4l .785398163397448309615660845819875721049292L
|
||||
#endif
|
||||
|
||||
#define POSITIVE_INFINITY INFINITY
|
||||
#define NEGATIVE_INFINITY (-INFINITY)
|
||||
|
||||
int isnumberl(long double x) { return x != SLEEF_INFINITYl && x != -SLEEF_INFINITYl && x == x; }
|
||||
int isPlusZerol(long double x) { return x == 0 && copysignl(1, x) == 1; }
|
||||
int isMinusZerol(long double x) { return x == 0 && copysignl(1, x) == -1; }
|
||||
|
||||
mpfr_t fra, frb, frd;
|
||||
|
||||
double countULP(long double d, mpfr_t c) {
|
||||
long double c2 = mpfr_get_ld(c, GMP_RNDN);
|
||||
if (c2 == 0 && d != 0) return 10000;
|
||||
//if (isPlusZerol(c2) && !isPlusZerol(d)) return 10003;
|
||||
//if (isMinusZerol(c2) && !isMinusZerol(d)) return 10004;
|
||||
if (isnanl(c2) && isnanl(d)) return 0;
|
||||
if (isnanl(c2) || isnanl(d)) return 10001;
|
||||
if (c2 == POSITIVE_INFINITY && d == POSITIVE_INFINITY) return 0;
|
||||
if (c2 == NEGATIVE_INFINITY && d == NEGATIVE_INFINITY) return 0;
|
||||
if (!isnumberl(c2) && !isnumberl(d)) return 0;
|
||||
|
||||
int e;
|
||||
frexpl(mpfr_get_ld(c, GMP_RNDN), &e);
|
||||
mpfr_set_ld(frb, fmaxl(ldexpl(1.0, e-64), DENORMAL_LDBL_MIN), GMP_RNDN);
|
||||
|
||||
mpfr_set_ld(frd, d, GMP_RNDN);
|
||||
mpfr_sub(fra, frd, c, GMP_RNDN);
|
||||
mpfr_div(fra, fra, frb, GMP_RNDN);
|
||||
double u = fabs(mpfr_get_d(fra, GMP_RNDN));
|
||||
|
||||
return u;
|
||||
}
|
||||
|
||||
double countULP2(long double d, mpfr_t c) {
|
||||
long double c2 = mpfr_get_ld(c, GMP_RNDN);
|
||||
if (c2 == 0 && d != 0) return 10000;
|
||||
//if (isPlusZerol(c2) && !isPlusZerol(d)) return 10003;
|
||||
//if (isMinusZerol(c2) && !isMinusZerol(d)) return 10004;
|
||||
if (isnanl(c2) && isnanl(d)) return 0;
|
||||
if (isnanl(c2) || isnanl(d)) return 10001;
|
||||
if (c2 == POSITIVE_INFINITY && d == POSITIVE_INFINITY) return 0;
|
||||
if (c2 == NEGATIVE_INFINITY && d == NEGATIVE_INFINITY) return 0;
|
||||
if (!isnumberl(c2) && !isnumberl(d)) return 0;
|
||||
|
||||
int e;
|
||||
frexpl(mpfr_get_ld(c, GMP_RNDN), &e);
|
||||
mpfr_set_ld(frb, fmaxl(ldexpl(1.0, e-64), LDBL_MIN), GMP_RNDN);
|
||||
|
||||
mpfr_set_ld(frd, d, GMP_RNDN);
|
||||
mpfr_sub(fra, frd, c, GMP_RNDN);
|
||||
mpfr_div(fra, fra, frb, GMP_RNDN);
|
||||
double u = fabs(mpfr_get_d(fra, GMP_RNDN));
|
||||
|
||||
return u;
|
||||
}
|
||||
|
||||
typedef union {
|
||||
long double d;
|
||||
__int128 u128;
|
||||
} conv_t;
|
||||
|
||||
long double rnd() {
|
||||
conv_t c;
|
||||
switch(random() & 15) {
|
||||
case 0: return INFINITY;
|
||||
case 1: return -INFINITY;
|
||||
}
|
||||
#ifdef ENABLE_SYS_getrandom
|
||||
syscall(SYS_getrandom, &c.u128, sizeof(c.u128), 0);
|
||||
#else
|
||||
c.u128 = random() | ((__int128)random() << 31) | ((__int128)random() << (31*2)) | ((__int128)random() << (31*3)) | ((__int128)random() << (31*4));
|
||||
#endif
|
||||
return c.d;
|
||||
}
|
||||
|
||||
long double rnd_fr() {
|
||||
conv_t c;
|
||||
do {
|
||||
#ifdef ENABLE_SYS_getrandom
|
||||
syscall(SYS_getrandom, &c.u128, sizeof(c.u128), 0);
|
||||
#else
|
||||
c.u128 = random() | ((__int128)random() << 31) | ((__int128)random() << (31*2)) | ((__int128)random() << (31*3)) | ((__int128)random() << (31*4));
|
||||
#endif
|
||||
} while(!isnumberl(c.d));
|
||||
return c.d;
|
||||
}
|
||||
|
||||
long double rnd_zo() {
|
||||
conv_t c;
|
||||
do {
|
||||
#ifdef ENABLE_SYS_getrandom
|
||||
syscall(SYS_getrandom, &c.u128, sizeof(c.u128), 0);
|
||||
#else
|
||||
c.u128 = random() | ((__int128)random() << 31) | ((__int128)random() << (31*2)) | ((__int128)random() << (31*3)) | ((__int128)random() << (31*4));
|
||||
#endif
|
||||
} while(!isnumberl(c.d) || c.d < -1 || 1 < c.d);
|
||||
return c.d;
|
||||
}
|
||||
|
||||
void sinpifr(mpfr_t ret, long double d) {
|
||||
mpfr_t frpi, frd;
|
||||
mpfr_inits(frpi, frd, NULL);
|
||||
|
||||
mpfr_const_pi(frpi, GMP_RNDN);
|
||||
mpfr_set_d(frd, 1.0, GMP_RNDN);
|
||||
mpfr_mul(frpi, frpi, frd, GMP_RNDN);
|
||||
mpfr_set_ld(frd, d, GMP_RNDN);
|
||||
mpfr_mul(frd, frpi, frd, GMP_RNDN);
|
||||
mpfr_sin(ret, frd, GMP_RNDN);
|
||||
|
||||
mpfr_clears(frpi, frd, NULL);
|
||||
}
|
||||
|
||||
void cospifr(mpfr_t ret, long double d) {
|
||||
mpfr_t frpi, frd;
|
||||
mpfr_inits(frpi, frd, NULL);
|
||||
|
||||
mpfr_const_pi(frpi, GMP_RNDN);
|
||||
mpfr_set_d(frd, 1.0, GMP_RNDN);
|
||||
mpfr_mul(frpi, frpi, frd, GMP_RNDN);
|
||||
mpfr_set_ld(frd, d, GMP_RNDN);
|
||||
mpfr_mul(frd, frpi, frd, GMP_RNDN);
|
||||
mpfr_cos(ret, frd, GMP_RNDN);
|
||||
|
||||
mpfr_clears(frpi, frd, NULL);
|
||||
}
|
||||
|
||||
int main(int argc,char **argv)
|
||||
{
|
||||
mpfr_t frx;
|
||||
|
||||
mpfr_set_default_prec(256);
|
||||
mpfr_inits(fra, frb, frd, frx, NULL);
|
||||
|
||||
conv_t cd;
|
||||
long double d, t;
|
||||
|
||||
int cnt, ecnt = 0;
|
||||
|
||||
srandom(time(NULL));
|
||||
|
||||
for(cnt = 0;ecnt < 1000;cnt++) {
|
||||
switch(cnt & 7) {
|
||||
case 0:
|
||||
d = rnd();
|
||||
break;
|
||||
case 1:
|
||||
cd.d = rint((2 * (double)random() / RAND_MAX - 1) * 1e+10) * M_PI_4;
|
||||
cd.u128 += (random() & 0xff) - 0x7f;
|
||||
d = cd.d;
|
||||
break;
|
||||
default:
|
||||
d = rnd_fr();
|
||||
break;
|
||||
}
|
||||
|
||||
Sleef_longdouble2 sc = xsincospil_u05(d);
|
||||
Sleef_longdouble2 sc2 = xsincospil_u35(d);
|
||||
|
||||
{
|
||||
const double rangemax2 = 1e+9;
|
||||
|
||||
sinpifr(frx, d);
|
||||
|
||||
double u0 = countULP2(t = sc.x, frx);
|
||||
|
||||
if (u0 != 0 && ((fabsl(d) <= rangemax2 && u0 > 0.505) || fabsl(t) > 1 || !isnumberl(t))) {
|
||||
printf("Pure C sincospil_u05 sin arg=%.30Lg ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULP2(t = sc2.x, frx);
|
||||
|
||||
if (u1 != 0 && ((fabsl(d) <= rangemax2 && u1 > 1.5) || fabsl(t) > 1 || !isnumberl(t))) {
|
||||
printf("Pure C sincospil_u35 sin arg=%.30Lg ulp=%.20g\n", d, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
const double rangemax2 = 1e+9;
|
||||
|
||||
cospifr(frx, d);
|
||||
|
||||
double u0 = countULP2(t = sc.y, frx);
|
||||
|
||||
if (u0 != 0 && ((fabsl(d) <= rangemax2 && u0 > 0.505) || fabsl(t) > 1 || !isnumberl(t))) {
|
||||
printf("Pure C sincospil_u05 cos arg=%.30Lg ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULP2(t = sc.y, frx);
|
||||
|
||||
if (u1 != 0 && ((fabsl(d) <= rangemax2 && u1 > 1.5) || fabsl(t) > 1 || !isnumberl(t))) {
|
||||
printf("Pure C sincospil_u35 cos arg=%.30Lg ulp=%.20g\n", d, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user