8329816: Add SLEEF version 3.6.1

Reviewed-by: erikj, mli, luhenry
Author: Magnus Ihse Bursie
Date: 2024-09-17 12:58:36 +00:00
parent 80db6e71b0
commit b39e6a84ef
175 changed files with 120709 additions and 0 deletions


@@ -568,6 +568,10 @@ $(eval $(call SetupTarget, update-build-docs, \
MAKEFILE := UpdateBuildDocs, \
))
$(eval $(call SetupTarget, update-sleef-source, \
MAKEFILE := UpdateSleefSource, \
))
$(eval $(call SetupTarget, update-x11wrappers, \
MAKEFILE := UpdateX11Wrappers, \
DEPS := java.base-copy buildtools-jdk, \

make/UpdateSleefSource.gmk Normal file

@@ -0,0 +1,153 @@
#
# Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This code is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 2 only, as
# published by the Free Software Foundation. Oracle designates this
# particular file as subject to the "Classpath" exception as provided
# by Oracle in the LICENSE file that accompanied this code.
#
# This code is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# version 2 for more details (a copy is included in the LICENSE file that
# accompanied this code).
#
# You should have received a copy of the GNU General Public License version
# 2 along with this work; if not, write to the Free Software Foundation,
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
#
# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
# or visit www.oracle.com if you need additional information or have any
# questions.
#
################################################################################
default: all
include $(SPEC)
include MakeBase.gmk
include CopyFiles.gmk
include Execute.gmk
################################################################################
# This file is responsible for updating the generated sleef source code files
# that are checked in to the JDK repo, and that are actually used when building.
# This target needs to be re-run every time the source code of libsleef is
# updated from upstream.
################################################################################
ifneq ($(COMPILE_TYPE), cross)
$(error Only cross-compilation of libsleef is currently supported)
endif
ifeq ($(CMAKE), )
$(error CMake not found. Please install cmake and rerun configure)
endif
ifneq ($(OPENJDK_BUILD_OS), linux)
$(error This target is only supported on linux)
endif
SLEEF_SUPPORT_DIR := $(MAKESUPPORT_OUTPUTDIR)/sleef
SLEEF_SOURCE_BASE_DIR := $(TOPDIR)/src/jdk.incubator.vector/linux/native/libsleef
SLEEF_SOURCE_DIR := $(SLEEF_SOURCE_BASE_DIR)/upstream
SLEEF_TARGET_DIR := $(SLEEF_SOURCE_BASE_DIR)/generated
SLEEF_NATIVE_BUILD_DIR := $(SLEEF_SUPPORT_DIR)/native
SLEEF_CROSS_BUILD_DIR := $(SLEEF_SUPPORT_DIR)/cross
ifeq ($(OPENJDK_TARGET_CPU), aarch64)
CROSS_COMPILATION_FILENAMES := sleefinline_advsimd.h sleefinline_sve.h
EXTRA_CROSS_OPTIONS := -DSLEEF_ENFORCE_SVE=TRUE
else ifeq ($(OPENJDK_TARGET_CPU), riscv64)
CROSS_COMPILATION_FILENAMES := sleefinline_rvvm1.h
EXTRA_CROSS_OPTIONS := -DSLEEF_ENFORCE_RVVM1=TRUE
else
$(error Unsupported platform)
endif
CROSS_COMPILATION_SRC_FILES := $(addprefix $(SLEEF_CROSS_BUILD_DIR)/include/, \
$(CROSS_COMPILATION_FILENAMES))
ifeq ($(TOOLCHAIN_TYPE), clang)
SLEEF_TOOLCHAIN_TYPE := llvm
else
SLEEF_TOOLCHAIN_TYPE := $(TOOLCHAIN_TYPE)
endif
SLEEF_CMAKE_FILE := toolchains/$(OPENJDK_TARGET_CPU)-$(SLEEF_TOOLCHAIN_TYPE).cmake
# We need to run CMake in two steps: first to configure the build, and then
# to perform the actual build. This is done twice: once for a native build,
# and once for the cross-compilation build.
$(eval $(call SetupExecute, sleef_native_config, \
INFO := Configuring native sleef build, \
OUTPUT_DIR := $(SLEEF_NATIVE_BUILD_DIR), \
COMMAND := cd $(SLEEF_SOURCE_DIR) && $(CMAKE) -S . -B \
$(SLEEF_NATIVE_BUILD_DIR), \
))
TARGETS += $(sleef_native_config)
$(eval $(call SetupExecute, sleef_native_build, \
INFO := Building native sleef, \
DEPS := $(sleef_native_config), \
OUTPUT_DIR := $(SLEEF_NATIVE_BUILD_DIR), \
COMMAND := cd $(SLEEF_SOURCE_DIR) && $(CMAKE) --build \
$(SLEEF_NATIVE_BUILD_DIR) -j, \
))
TARGETS += $(sleef_native_build)
$(eval $(call SetupExecute, sleef_cross_config, \
INFO := Configuring cross-compiling sleef build, \
DEPS := $(sleef_native_build), \
OUTPUT_DIR := $(SLEEF_CROSS_BUILD_DIR), \
COMMAND := cd $(SLEEF_SOURCE_DIR) && $(CMAKE) -S . -B \
$(SLEEF_CROSS_BUILD_DIR) \
-DCMAKE_C_COMPILER=$(CC) \
-DCMAKE_TOOLCHAIN_FILE=$(SLEEF_CMAKE_FILE) \
-DNATIVE_BUILD_DIR=$(SLEEF_NATIVE_BUILD_DIR) \
-DSLEEF_BUILD_INLINE_HEADERS=TRUE \
$(EXTRA_CROSS_OPTIONS), \
))
TARGETS += $(sleef_cross_config)
$(eval $(call SetupExecute, sleef_cross_build, \
INFO := Building cross-compiling sleef, \
DEPS := $(sleef_cross_config), \
OUTPUT_DIR := $(SLEEF_CROSS_BUILD_DIR), \
COMMAND := cd $(SLEEF_SOURCE_DIR) && $(CMAKE) --build \
$(SLEEF_CROSS_BUILD_DIR) -j, \
))
TARGETS += $(sleef_cross_build)
$(CROSS_COMPILATION_SRC_FILES): $(sleef_cross_build)
# Finally, copy the generated files (and one needed static file) into our
# target directory.
$(eval $(call SetupCopyFiles, copy_static_sleef_source, \
FILES := $(SLEEF_SOURCE_DIR)/src/common/misc.h, \
DEST := $(SLEEF_TARGET_DIR), \
))
TARGETS += $(copy_static_sleef_source)
$(eval $(call SetupCopyFiles, copy_generated_sleef_source, \
FILES := $(CROSS_COMPILATION_SRC_FILES), \
DEST := $(SLEEF_TARGET_DIR), \
))
TARGETS += $(copy_generated_sleef_source)
################################################################################
all: $(TARGETS)
.PHONY: all default
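For orientation, the two-phase flow this makefile drives corresponds roughly to
the manual commands below. This is a sketch only: the cross-compiler name and
the build directories are illustrative, while the toolchain file and the `-D`
options match the aarch64/gcc case handled above.

```
# Phase 1: configure and build sleef for the build machine. This produces
# host-side helper tools that the cross build picks up via NATIVE_BUILD_DIR.
cmake -S upstream -B build/native
cmake --build build/native -j

# Phase 2: configure and build for the target, reusing the native tools and
# generating the inline headers.
cmake -S upstream -B build/cross \
    -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc \
    -DCMAKE_TOOLCHAIN_FILE=toolchains/aarch64-gcc.cmake \
    -DNATIVE_BUILD_DIR="$PWD/build/native" \
    -DSLEEF_BUILD_INLINE_HEADERS=TRUE \
    -DSLEEF_ENFORCE_SVE=TRUE
cmake --build build/cross -j

# The generated headers then appear under build/cross/include/.
```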


@@ -99,6 +99,7 @@ AC_DEFUN_ONCE([BASIC_SETUP_TOOLS],
UTIL_REQUIRE_SPECIAL(FGREP, [AC_PROG_FGREP])
# Optional tools, we can do without them
UTIL_LOOKUP_PROGS(CMAKE, cmake)
UTIL_LOOKUP_PROGS(DF, df)
UTIL_LOOKUP_PROGS(GIT, git)
UTIL_LOOKUP_PROGS(NICE, nice)


@@ -719,6 +719,7 @@ CCACHE := @CCACHE@
# CD is going away, but remains to cater for legacy makefiles.
CD := cd
CHMOD := @CHMOD@
CMAKE := @CMAKE@
CODESIGN := @CODESIGN@
CP := @CP@
CUT := @CUT@


@@ -0,0 +1,439 @@
## SLEEF v3.6.1
### Notice
```
Copyright © 2010-2024 SLEEF Project, Naoki Shibata and contributors
-------
src/arch/helpersve.h has the following copyright:
Copyright ARM Ltd. 2010 - 2024.
-------
src/gencoef/{dp.h, gencoef.c, ld.h, qp.h, simplexfr.c, sp.h} have no copyright notice but have the following license text:
// The code is distributed under the Creative Commons Attribution 4.0 International License.
// https://creativecommons.org/licenses/by/4.0/
Attribution 4.0 International
```
### LICENSE Boost v1.0
```
Boost Software License - Version 1.0 - August 17th, 2003
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
```
### LICENSE Creative Commons Attribution 4.0 International License
```
Creative Commons Corporation ("Creative Commons") is not a law firm and
does not provide legal services or legal advice. Distribution of
Creative Commons public licenses does not create a lawyer-client or
other relationship. Creative Commons makes its licenses and related
information available on an "as-is" basis. Creative Commons gives no
warranties regarding its licenses, any material licensed under their
terms and conditions, or any related information. Creative Commons
disclaims all liability for damages resulting from their use to the
fullest extent possible.
Using Creative Commons Public Licenses
Creative Commons public licenses provide a standard set of terms and
conditions that creators and other rights holders may use to share
original works of authorship and other material subject to copyright
and certain other rights specified in the public license below. The
following considerations are for informational purposes only, are not
exhaustive, and do not form part of our licenses.
Considerations for licensors: Our public licenses are
intended for use by those authorized to give the public
permission to use material in ways otherwise restricted by
copyright and certain other rights. Our licenses are
irrevocable. Licensors should read and understand the terms
and conditions of the license they choose before applying it.
Licensors should also secure all rights necessary before
applying our licenses so that the public can reuse the
material as expected. Licensors should clearly mark any
material not subject to the license. This includes other CC-
licensed material, or material used under an exception or
limitation to copyright. More considerations for licensors:
wiki.creativecommons.org/Considerations_for_licensors
Considerations for the public: By using one of our public
licenses, a licensor grants the public permission to use the
licensed material under specified terms and conditions. If
the licensor's permission is not necessary for any reason--for
example, because of any applicable exception or limitation to
copyright--then that use is not regulated by the license. Our
licenses grant only permissions under copyright and certain
other rights that a licensor has authority to grant. Use of
the licensed material may still be restricted for other
reasons, including because others have copyright or other
rights in the material. A licensor may make special requests,
such as asking that all changes be marked or described.
Although not required by our licenses, you are encouraged to
respect those requests where reasonable. More considerations
for the public:
wiki.creativecommons.org/Considerations_for_licensees
=======================================================================
Creative Commons Attribution 4.0 International Public License
By exercising the Licensed Rights (defined below), You accept and agree
to be bound by the terms and conditions of this Creative Commons
Attribution 4.0 International Public License ("Public License"). To the
extent this Public License may be interpreted as a contract, You are
granted the Licensed Rights in consideration of Your acceptance of
these terms and conditions, and the Licensor grants You such rights in
consideration of benefits the Licensor receives from making the
Licensed Material available under these terms and conditions.
Section 1 -- Definitions.
a. Adapted Material means material subject to Copyright and Similar
Rights that is derived from or based upon the Licensed Material
and in which the Licensed Material is translated, altered,
arranged, transformed, or otherwise modified in a manner requiring
permission under the Copyright and Similar Rights held by the
Licensor. For purposes of this Public License, where the Licensed
Material is a musical work, performance, or sound recording,
Adapted Material is always produced where the Licensed Material is
synched in timed relation with a moving image.
b. Adapter's License means the license You apply to Your Copyright
and Similar Rights in Your contributions to Adapted Material in
accordance with the terms and conditions of this Public License.
c. Copyright and Similar Rights means copyright and/or similar rights
closely related to copyright including, without limitation,
performance, broadcast, sound recording, and Sui Generis Database
Rights, without regard to how the rights are labeled or
categorized. For purposes of this Public License, the rights
specified in Section 2(b)(1)-(2) are not Copyright and Similar
Rights.
d. Effective Technological Measures means those measures that, in the
absence of proper authority, may not be circumvented under laws
fulfilling obligations under Article 11 of the WIPO Copyright
Treaty adopted on December 20, 1996, and/or similar international
agreements.
e. Exceptions and Limitations means fair use, fair dealing, and/or
any other exception or limitation to Copyright and Similar Rights
that applies to Your use of the Licensed Material.
f. Licensed Material means the artistic or literary work, database,
or other material to which the Licensor applied this Public
License.
g. Licensed Rights means the rights granted to You subject to the
terms and conditions of this Public License, which are limited to
all Copyright and Similar Rights that apply to Your use of the
Licensed Material and that the Licensor has authority to license.
h. Licensor means the individual(s) or entity(ies) granting rights
under this Public License.
i. Share means to provide material to the public by any means or
process that requires permission under the Licensed Rights, such
as reproduction, public display, public performance, distribution,
dissemination, communication, or importation, and to make material
available to the public including in ways that members of the
public may access the material from a place and at a time
individually chosen by them.
j. Sui Generis Database Rights means rights other than copyright
resulting from Directive 96/9/EC of the European Parliament and of
the Council of 11 March 1996 on the legal protection of databases,
as amended and/or succeeded, as well as other essentially
equivalent rights anywhere in the world.
k. You means the individual or entity exercising the Licensed Rights
under this Public License. Your has a corresponding meaning.
Section 2 -- Scope.
a. License grant.
1. Subject to the terms and conditions of this Public License,
the Licensor hereby grants You a worldwide, royalty-free,
non-sublicensable, non-exclusive, irrevocable license to
exercise the Licensed Rights in the Licensed Material to:
a. reproduce and Share the Licensed Material, in whole or
in part; and
b. produce, reproduce, and Share Adapted Material.
2. Exceptions and Limitations. For the avoidance of doubt, where
Exceptions and Limitations apply to Your use, this Public
License does not apply, and You do not need to comply with
its terms and conditions.
3. Term. The term of this Public License is specified in Section
6(a).
4. Media and formats; technical modifications allowed. The
Licensor authorizes You to exercise the Licensed Rights in
all media and formats whether now known or hereafter created,
and to make technical modifications necessary to do so. The
Licensor waives and/or agrees not to assert any right or
authority to forbid You from making technical modifications
necessary to exercise the Licensed Rights, including
technical modifications necessary to circumvent Effective
Technological Measures. For purposes of this Public License,
simply making modifications authorized by this Section 2(a)
(4) never produces Adapted Material.
5. Downstream recipients.
a. Offer from the Licensor -- Licensed Material. Every
recipient of the Licensed Material automatically
receives an offer from the Licensor to exercise the
Licensed Rights under the terms and conditions of this
Public License.
b. No downstream restrictions. You may not offer or impose
any additional or different terms or conditions on, or
apply any Effective Technological Measures to, the
Licensed Material if doing so restricts exercise of the
Licensed Rights by any recipient of the Licensed
Material.
6. No endorsement. Nothing in this Public License constitutes or
may be construed as permission to assert or imply that You
are, or that Your use of the Licensed Material is, connected
with, or sponsored, endorsed, or granted official status by,
the Licensor or others designated to receive attribution as
provided in Section 3(a)(1)(A)(i).
b. Other rights.
1. Moral rights, such as the right of integrity, are not
licensed under this Public License, nor are publicity,
privacy, and/or other similar personality rights; however, to
the extent possible, the Licensor waives and/or agrees not to
assert any such rights held by the Licensor to the limited
extent necessary to allow You to exercise the Licensed
Rights, but not otherwise.
2. Patent and trademark rights are not licensed under this
Public License.
3. To the extent possible, the Licensor waives any right to
collect royalties from You for the exercise of the Licensed
Rights, whether directly or through a collecting society
under any voluntary or waivable statutory or compulsory
licensing scheme. In all other cases the Licensor expressly
reserves any right to collect such royalties.
Section 3 -- License Conditions.
Your exercise of the Licensed Rights is expressly made subject to the
following conditions.
a. Attribution.
1. If You Share the Licensed Material (including in modified
form), You must:
a. retain the following if it is supplied by the Licensor
with the Licensed Material:
i. identification of the creator(s) of the Licensed
Material and any others designated to receive
attribution, in any reasonable manner requested by
the Licensor (including by pseudonym if
designated);
ii. a copyright notice;
iii. a notice that refers to this Public License;
iv. a notice that refers to the disclaimer of
warranties;
v. a URI or hyperlink to the Licensed Material to the
extent reasonably practicable;
b. indicate if You modified the Licensed Material and
retain an indication of any previous modifications; and
c. indicate the Licensed Material is licensed under this
Public License, and include the text of, or the URI or
hyperlink to, this Public License.
2. You may satisfy the conditions in Section 3(a)(1) in any
reasonable manner based on the medium, means, and context in
which You Share the Licensed Material. For example, it may be
reasonable to satisfy the conditions by providing a URI or
hyperlink to a resource that includes the required
information.
3. If requested by the Licensor, You must remove any of the
information required by Section 3(a)(1)(A) to the extent
reasonably practicable.
4. If You Share Adapted Material You produce, the Adapter's
License You apply must not prevent recipients of the Adapted
Material from complying with this Public License.
Section 4 -- Sui Generis Database Rights.
Where the Licensed Rights include Sui Generis Database Rights that
apply to Your use of the Licensed Material:
a. for the avoidance of doubt, Section 2(a)(1) grants You the right
to extract, reuse, reproduce, and Share all or a substantial
portion of the contents of the database;
b. if You include all or a substantial portion of the database
contents in a database in which You have Sui Generis Database
Rights, then the database in which You have Sui Generis Database
Rights (but not its individual contents) is Adapted Material; and
c. You must comply with the conditions in Section 3(a) if You Share
all or a substantial portion of the contents of the database.
For the avoidance of doubt, this Section 4 supplements and does not
replace Your obligations under this Public License where the Licensed
Rights include other Copyright and Similar Rights.
Section 5 -- Disclaimer of Warranties and Limitation of Liability.
a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
c. The disclaimer of warranties and limitation of liability provided
above shall be interpreted in a manner that, to the extent
possible, most closely approximates an absolute disclaimer and
waiver of all liability.
Section 6 -- Term and Termination.
a. This Public License applies for the term of the Copyright and
Similar Rights licensed here. However, if You fail to comply with
this Public License, then Your rights under this Public License
terminate automatically.
b. Where Your right to use the Licensed Material has terminated under
Section 6(a), it reinstates:
1. automatically as of the date the violation is cured, provided
it is cured within 30 days of Your discovery of the
violation; or
2. upon express reinstatement by the Licensor.
For the avoidance of doubt, this Section 6(b) does not affect any
right the Licensor may have to seek remedies for Your violations
of this Public License.
c. For the avoidance of doubt, the Licensor may also offer the
Licensed Material under separate terms or conditions or stop
distributing the Licensed Material at any time; however, doing so
will not terminate this Public License.
d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
License.
Section 7 -- Other Terms and Conditions.
a. The Licensor shall not be bound by any additional or different
terms or conditions communicated by You unless expressly agreed.
b. Any arrangements, understandings, or agreements regarding the
Licensed Material not stated herein are separate from and
independent of the terms and conditions of this Public License.
Section 8 -- Interpretation.
a. For the avoidance of doubt, this Public License does not, and
shall not be interpreted to, reduce, limit, restrict, or impose
conditions on any use of the Licensed Material that could lawfully
be made without permission under this Public License.
b. To the extent possible, if any provision of this Public License is
deemed unenforceable, it shall be automatically reformed to the
minimum extent necessary to make it enforceable. If the provision
cannot be reformed, it shall be severed from this Public License
without affecting the enforceability of the remaining terms and
conditions.
c. No term or condition of this Public License will be waived and no
failure to comply consented to unless expressly agreed to by the
Licensor.
d. Nothing in this Public License constitutes or may be interpreted
as a limitation upon, or waiver of, any privileges and immunities
that apply to the Licensor or You, including from the legal
processes of any jurisdiction or authority.
=======================================================================
Creative Commons is not a party to its public
licenses. Notwithstanding, Creative Commons may elect to apply one of
its public licenses to material it publishes and in those instances
will be considered the “Licensor.” The text of the Creative Commons
public licenses is dedicated to the public domain under the CC0 Public
Domain Dedication. Except for the limited purpose of indicating that
material is shared under a Creative Commons public license or as
otherwise permitted by the Creative Commons policies published at
creativecommons.org/policies, Creative Commons does not authorize the
use of the trademark "Creative Commons" or any other trademark or logo
of Creative Commons without its prior written consent including,
without limitation, in connection with any unauthorized modifications
to any of its public licenses or any other arrangements,
understandings, or agreements concerning use of licensed material. For
the avoidance of doubt, this paragraph does not form part of the
public licenses.
Creative Commons may be contacted at creativecommons.org.
```


@@ -0,0 +1,54 @@
# About SLEEF
This directory contains the source code for the SLEEF library, the
**SIMD Library for Evaluating Elementary Functions**. For more information on
SLEEF, see https://sleef.org/.
The currently imported libsleef source is version 3.6.1, which has
git tag `3.6.1` and git commit hash `6ee14bcae5fe92c2ff8b000d5a01102dab08d774`.
# About the libsleef integration in the JDK
The original upstream source code is available in
`src/jdk.incubator.vector/linux/native/libsleef/upstream`. This code is not
directly usable in the JDK build system; instead, it serves as the base for
generating additional source code files. This generation is done by the
libsleef CMake files. If it were done at build time, CMake would become a
required dependency for building the JDK.
Instead, we create these generated files only once, when importing a new
version of the libsleef source code, and check the generated files into the
JDK source tree. The generated files reside in
`src/jdk.incubator.vector/linux/native/libsleef/generated`.
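The resulting layout is:

```
src/jdk.incubator.vector/linux/native/libsleef/
├── upstream/     verbatim copy of the SLEEF sources
└── generated/    files generated at import time, checked in
```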
# Import instructions
To update the version of libsleef that is used in the JDK, clone
`https://github.com/shibatch/sleef.git`, and copy all files, except the `docs`,
`.github` and `.git` directories, into
`src/jdk.incubator.vector/linux/native/libsleef/upstream`.
The libsleef source code does not follow the JDK whitespace rules as enforced
by jcheck. You will need to remove trailing whitespace and expand tabs to 8
spaces in the imported source code.
Update the note above with information about the version you are importing.
You will need to repeat the process below for each of the platforms in the JDK
that use libsleef; currently, these are aarch64 and riscv64. The rest of these
instructions assume you are working on linux/x64; at this point, no other setup
is supported. Also, make sure you have CMake installed.
First, run configure for cross-compiling to your selected target platform
(e.g. aarch64).
Run `make update-sleef-source` to process the upstream source code and
store the generated files in the `generated` directory.
Now you can repeat this for the next platform. For instance, you can
create a separate configuration using `configure --with-conf-name=riscv64` and
then generate the updated libsleef source code with
`make update-sleef-source CONF=riscv64`.
Finally, verify with git that the local changes made to the files in
`src/jdk.incubator.vector/linux/native/libsleef/generated` look okay.
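Put together, one import cycle might look like the sketch below. The
repository paths and make targets are the real ones; the release tag, the
whitespace clean-up commands, and the configuration names are illustrative.

```
# Fetch the upstream release and copy it in, minus the docs, .github and
# .git directories.
git clone --depth 1 --branch 3.6.1 https://github.com/shibatch/sleef.git /tmp/sleef
rsync -a --delete --exclude=docs --exclude=.github --exclude=.git \
    /tmp/sleef/ src/jdk.incubator.vector/linux/native/libsleef/upstream/

# Satisfy jcheck: strip trailing whitespace (tabs can additionally be
# expanded with e.g. `expand -t 8` on the affected files).
find src/jdk.incubator.vector/linux/native/libsleef/upstream -type f \
    -exec sed -i 's/[[:space:]]*$//' {} +

# Regenerate the checked-in files for one target platform, then inspect.
bash configure --openjdk-target=aarch64-linux-gnu --with-conf-name=aarch64
make update-sleef-source CONF=aarch64
git status src/jdk.incubator.vector/linux/native/libsleef/generated
```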


@@ -0,0 +1,332 @@
// Copyright Naoki Shibata and contributors 2010 - 2024.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
//
#ifndef __MISC_H__
#define __MISC_H__
#if !defined(SLEEF_GENHEADER)
#include <stdint.h>
#include <string.h>
#endif
#ifndef M_PI
#define M_PI 3.141592653589793238462643383279502884
#endif
#ifndef M_PIl
#define M_PIl 3.141592653589793238462643383279502884L
#endif
#ifndef M_1_PI
#define M_1_PI 0.318309886183790671537767526745028724
#endif
#ifndef M_1_PIl
#define M_1_PIl 0.318309886183790671537767526745028724L
#endif
#ifndef M_2_PI
#define M_2_PI 0.636619772367581343075535053490057448
#endif
#ifndef M_2_PIl
#define M_2_PIl 0.636619772367581343075535053490057448L
#endif
#if !defined(SLEEF_GENHEADER)
#ifndef SLEEF_FP_ILOGB0
#define SLEEF_FP_ILOGB0 ((int)0x80000000)
#endif
#ifndef SLEEF_FP_ILOGBNAN
#define SLEEF_FP_ILOGBNAN ((int)2147483647)
#endif
#endif
#define SLEEF_SNAN (((union { long long int i; double d; }) { .i = INT64_C(0x7ff0000000000001) }).d)
#define SLEEF_SNANf (((union { long int i; float f; }) { .i = 0xff800001 }).f)
#define SLEEF_FLT_MIN 0x1p-126
#define SLEEF_DBL_MIN 0x1p-1022
#define SLEEF_INT_MAX 2147483647
#define SLEEF_DBL_DENORM_MIN 4.9406564584124654e-324
#define SLEEF_FLT_DENORM_MIN 1.40129846e-45F
//
/*
PI_A to PI_D are constants that satisfy the following two conditions.
* For PI_A, PI_B and PI_C, the last 28 bits are zero.
* PI_A + PI_B + PI_C + PI_D is close to PI as much as possible.
The argument of a trig function is multiplied by 1/PI, and the
integral part is divided into two parts, each has at most 28
bits. So, the maximum argument that could be correctly reduced
should be 2^(28*2-1) PI = 1.1e+17. However, due to internal
double precision calculation, the actual maximum argument that can
be correctly reduced is around 2^47.
*/
#define PI_A 3.1415926218032836914
#define PI_B 3.1786509424591713469e-08
#define PI_C 1.2246467864107188502e-16
#define PI_D 1.2736634327021899816e-24
#define TRIGRANGEMAX 1e+14
/*
PI_A2 and PI_B2 are constants that satisfy the following two conditions.
* The last 3 bits of PI_A2 are zero.
* PI_A2 + PI_B2 is close to PI as much as possible.
The argument of a trig function is multiplied by 1/PI, and the
integral part is multiplied by PI_A2. So, the maximum argument that
could be correctly reduced should be 2^(3-1) PI = 12.6. By testing,
we confirmed that it correctly reduces the argument up to around 15.
*/
#define PI_A2 3.141592653589793116
#define PI_B2 1.2246467991473532072e-16
#define TRIGRANGEMAX2 15
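/*
  Editorial sketch, not part of upstream misc.h: how these split constants
  are used. For arguments up to about TRIGRANGEMAX2, a trig argument d is
  reduced Cody-Waite style with the two-term split above (a hypothetical
  helper, assuming rint() from <math.h>):

    double q = rint(d * M_1_PI);  // nearest integer multiple of pi
    d = d - q * PI_A2;            // exact while q needs at most 3 bits
    d = d - q * PI_B2;            // tiny correction term
    // d now holds the remainder in roughly [-pi/2, pi/2]; for sin, the
    // result is negated when q is odd.

  For larger arguments (up to TRIGRANGEMAX), the same subtraction is done
  with the four-way split PI_A..PI_D, with q handled as two 28-bit parts.
*/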
#define M_2_PI_H 0.63661977236758138243
#define M_2_PI_L -3.9357353350364971764e-17
#define SQRT_DBL_MAX 1.3407807929942596355e+154
#define TRIGRANGEMAX3 1e+9
#define M_4_PI 1.273239544735162542821171882678754627704620361328125
#define L2U .69314718055966295651160180568695068359375
#define L2L .28235290563031577122588448175013436025525412068e-12
#define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931
#define L10U 0.30102999566383914498 // log 2 / log 10
#define L10L 1.4205023227266099418e-13
#define LOG10_2 3.3219280948873623478703194294893901758648313930
#define L10Uf 0.3010253906f
#define L10Lf 4.605038981e-06f
//
#define PI_Af 3.140625f
#define PI_Bf 0.0009670257568359375f
#define PI_Cf 6.2771141529083251953e-07f
#define PI_Df 1.2154201256553420762e-10f
#define TRIGRANGEMAXf 39000
#define PI_A2f 3.1414794921875f
#define PI_B2f 0.00011315941810607910156f
#define PI_C2f 1.9841872589410058936e-09f
#define TRIGRANGEMAX2f 125.0f
#define TRIGRANGEMAX4f 8e+6f
#define SQRT_FLT_MAX 18446743523953729536.0
#define L2Uf 0.693145751953125f
#define L2Lf 1.428606765330187045e-06f
#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f
#ifndef M_PIf
# define M_PIf ((float)M_PI)
#endif
//
#ifndef MIN
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#endif
#ifndef MAX
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#endif
#ifndef ABS
#define ABS(x) ((x) < 0 ? -(x) : (x))
#endif
#define stringify(s) stringify_(s)
#define stringify_(s) #s
#if !defined(SLEEF_GENHEADER)
typedef long double longdouble;
#endif
#if !defined(Sleef_double2_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_double2_DEFINED
typedef struct {
double x, y;
} Sleef_double2;
#endif
#if !defined(Sleef_float2_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_float2_DEFINED
typedef struct {
float x, y;
} Sleef_float2;
#endif
#if !defined(Sleef_longdouble2_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_longdouble2_DEFINED
typedef struct {
long double x, y;
} Sleef_longdouble2;
#endif
#if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
#define LIKELY(condition) __builtin_expect(!!(condition), 1)
#define UNLIKELY(condition) __builtin_expect(!!(condition), 0)
#define RESTRICT __restrict__
#ifndef __arm__
#define ALIGNED(x) __attribute__((aligned(x)))
#else
#define ALIGNED(x)
#endif
#if defined(SLEEF_GENHEADER)
#define INLINE SLEEF_ALWAYS_INLINE
#define EXPORT SLEEF_INLINE
#define CONST SLEEF_CONST
#define NOEXPORT
#else // #if defined(SLEEF_GENHEADER)
#define CONST __attribute__((const))
#define INLINE __attribute__((always_inline))
#if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)
#ifndef SLEEF_STATIC_LIBS
#define EXPORT __stdcall __declspec(dllexport)
#define NOEXPORT
#else // #ifndef SLEEF_STATIC_LIBS
#define EXPORT
#define NOEXPORT
#endif // #ifndef SLEEF_STATIC_LIBS
#else // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)
#define EXPORT __attribute__((visibility("default")))
#define NOEXPORT __attribute__ ((visibility ("hidden")))
#endif // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)
#endif // #if defined(SLEEF_GENHEADER)
#define SLEEF_NAN __builtin_nan("")
#define SLEEF_NANf __builtin_nanf("")
#define SLEEF_NANl __builtin_nanl("")
#define SLEEF_INFINITY __builtin_inf()
#define SLEEF_INFINITYf __builtin_inff()
#define SLEEF_INFINITYl __builtin_infl()
#if defined(__INTEL_COMPILER) || defined (__clang__)
#define SLEEF_INFINITYq __builtin_inf()
#define SLEEF_NANq __builtin_nan("")
#else
#define SLEEF_INFINITYq __builtin_infq()
#define SLEEF_NANq (SLEEF_INFINITYq - SLEEF_INFINITYq)
#endif
#elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
#if defined(SLEEF_GENHEADER)
#define INLINE SLEEF_ALWAYS_INLINE
#define CONST SLEEF_CONST
#define EXPORT SLEEF_INLINE
#define NOEXPORT
#else // #if defined(SLEEF_GENHEADER)
#define INLINE __forceinline
#define CONST
#ifndef SLEEF_STATIC_LIBS
#define EXPORT __declspec(dllexport)
#define NOEXPORT
#else
#define EXPORT
#define NOEXPORT
#endif
#endif // #if defined(SLEEF_GENHEADER)
#define RESTRICT
#define ALIGNED(x)
#define LIKELY(condition) (condition)
#define UNLIKELY(condition) (condition)
#if (defined(__GNUC__) || defined(__CLANG__)) && (defined(__i386__) || defined(__x86_64__)) && !defined(SLEEF_GENHEADER)
#include <x86intrin.h>
#endif
#define SLEEF_INFINITY (1e+300 * 1e+300)
#define SLEEF_NAN (SLEEF_INFINITY - SLEEF_INFINITY)
#define SLEEF_INFINITYf ((float)SLEEF_INFINITY)
#define SLEEF_NANf ((float)SLEEF_NAN)
#define SLEEF_INFINITYl ((long double)SLEEF_INFINITY)
#define SLEEF_NANl ((long double)SLEEF_NAN)
#if (defined(_M_AMD64) || defined(_M_X64))
#ifndef __SSE2__
#define __SSE2__
#define __SSE3__
#define __SSE4_1__
#endif
#elif _M_IX86_FP == 2
#ifndef __SSE2__
#define __SSE2__
#define __SSE3__
#define __SSE4_1__
#endif
#elif _M_IX86_FP == 1
#ifndef __SSE__
#define __SSE__
#endif
#endif
#endif // #elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
#if !defined(__linux__)
#define isinff(x) ((x) == SLEEF_INFINITYf || (x) == -SLEEF_INFINITYf)
#define isinfl(x) ((x) == SLEEF_INFINITYl || (x) == -SLEEF_INFINITYl)
#define isnanf(x) ((x) != (x))
#define isnanl(x) ((x) != (x))
#endif
#endif // #ifndef __MISC_H__
#ifdef ENABLE_AAVPCS
#define VECTOR_CC __attribute__((aarch64_vector_pcs))
#else
#define VECTOR_CC
#endif
//
#if defined (__GNUC__) && !defined(__INTEL_COMPILER)
#pragma GCC diagnostic ignored "-Wpragmas"
#pragma GCC diagnostic ignored "-Wunknown-pragmas"
#if !defined (__clang__)
#pragma GCC diagnostic ignored "-Wattribute-alias"
#pragma GCC diagnostic ignored "-Wlto-type-mismatch"
#pragma GCC diagnostic ignored "-Wstringop-overflow"
#endif
#endif
#if defined(_MSC_VER)
#pragma warning(disable:4101) // warning C4101: 'v': unreferenced local variable
#pragma warning(disable:4116) // warning C4116: unnamed type definition in parentheses
#pragma warning(disable:4244) // warning C4244: 'function': conversion from 'vopmask' to '__mmask8', possible loss of data
#pragma warning(disable:4267) // warning C4267: 'initializing': conversion from 'size_t' to 'const int', possible loss of data
#pragma warning(disable:4305) // warning C4305: 'function': truncation from 'double' to 'float'
#endif

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,255 @@
## 3.6.1 - 2024-06-10
This patch release provides important bug fixes, including a fix
for API compatibility with 3.5 (#534).
The support and test for some features is still limited, as
documented in [README](./README.md), however significant progress
was made in order to test on Linux, macOS and Windows.
### Added
- Add support for RISC-V in DFT, QUAD and inline headers (#503,
#522).
- Add GHA workflow to run CI tests on Windows x86 (#540) and macOS
x86/aarch64 (#543). And update test matrix.
- Add GHA workflows to run examples in CI (#550).
### Changed
- Cleanup/Improve support for RISC-V in LIBM (#520, #521).
- Update supported environment in documentation (#529, #549),
including website and test matrix from README.
### Fixed
- Major fix and cleanup of CMakeLists.txt (#531).
- Fix compatibility issue after removal of quad and long double
sincospi (#545). Restores functions that are missing in 3.6.
- Various bug fixes (#528, #533, #536, #537).
## 3.6 - 2024-02-14
This release follows a long period of inactivity. The library is now
being actively maintained. However, the support and test for some
features is currently limited, as documented in [README](./README.md).
### Added
- Add documentation for the quad precision math library
- Enable generation of inline header file for CUDA (PR #337)
- Add support for System/390 z15 support (PR #343)
- Add support for POWER 9 (PR #360)
- Add quad-precision functions (PR #375, #377, #380, #381, #382, #383,
#385, #386, #387)
- Add preliminary support for iOS and Android (PR #388, #389)
- Add OpenMP pragmas to the function declarations in sleef.h to enable
auto-vectorization by GCC (PR #404, #406)
- Add new public CI test infrastructure using GitHub Actions (PR #476)
- Add support for RISC-V in libm (PR #477)
### Removed
- Remove old CI scripts based on Travis/Jenkins/Appveyor (PR #502)
### Changed
- Optimise error functions (PR #370)
- Update CMake package config (PR #412)
- Update documentation and move doc/website to main repository (PR #504,
#513)
- Add SLEEF_ prefix to user-facing CMake options (PR #509)
- Disable SVE on Darwin (PR #512)
### Fixed
- Fix parallel builds with GNU make (PR #491)
- Various bug fixes (PR #492, #499, #508)
## 3.5.1 - 2020-09-15
### Changed
- Fixed a bug in handling compiler options
## 3.5 - 2020-09-01
- IBM System/390 support is added.
- The library can be built with Clang on Windows.
- Static libraries with LTO can be generated.
- Alternative division and sqrt methods can be chosen with AArch64.
- Header files for inlining the whole SLEEF functions can be generated.
- IEEE remainder function is added.
- GCC-10 can now build SLEEF with SVE support.
## 3.4.1 - 2019-10-01
### Changed
- Fixed accuracy problem with tan_u35, atan_u10, log2f_u35 and exp10f_u10.
https://github.com/shibatch/sleef/pull/260
https://github.com/shibatch/sleef/pull/265
https://github.com/shibatch/sleef/pull/267
- SVE intrinsics that are not supported in newer ACLE are replaced.
https://github.com/shibatch/sleef/pull/268
- FMA4 detection problem is fixed.
https://github.com/shibatch/sleef/pull/262
- Compilation problem under Windows with MinGW is fixed.
https://github.com/shibatch/sleef/pull/266
## 3.4 - 2019-04-28
### Added
- Faster and low precision functions are added.
https://github.com/shibatch/sleef/pull/229
- Functions that return consistent results across platforms are
added
https://github.com/shibatch/sleef/pull/216
https://github.com/shibatch/sleef/pull/224
- Quad precision math library (libsleefquad) is added
https://github.com/shibatch/sleef/pull/235
https://github.com/shibatch/sleef/pull/237
https://github.com/shibatch/sleef/pull/240
- AArch64 Vector Procedure Call Standard (AAVPCS) support.
### Changed
- Many functions are now faster
- Testers are now faster
## 3.3.1 - 2018-08-20
### Added
- FreeBSD support is added
### Changed
- i386 build problem is fixed
- Trigonometric functions now evaluate correctly with full FP
domain.
https://github.com/shibatch/sleef/pull/210
## 3.3 - 2018-07-06
### Added
- SVE target support is added to libsleef.
https://github.com/shibatch/sleef/pull/180
- SVE target support is added to DFT. With this patch, DFT operations
can be carried out using 256, 512, 1024 and 2048-bit wide vectors
according to runtime availability of vector registers and operators.
https://github.com/shibatch/sleef/pull/182
- 3.5-ULP versions of sinh, cosh, tanh, sinhf, coshf, tanhf, and the
corresponding testing functionalities are added.
https://github.com/shibatch/sleef/pull/192
- Power VSX target support is added to libsleef.
https://github.com/shibatch/sleef/pull/195
- Payne-Hanek like argument reduction is added to libsleef.
https://github.com/shibatch/sleef/pull/197
## 3.2 - 2018-02-26
### Added
- The whole build system of the project migrated from makefiles to
cmake. In particular this includes `libsleef`, `libsleefgnuabi`,
`libdft` and all the tests.
- Benchmarks that compare `libsleef` vs `SVML` on X86 Linux are
available in the project tree under src/libm-benchmarks directory.
- Extensive upstream testing via Travis CI and Appveyor, on the
following systems:
* OS: Windows / Linux / OSX.
* Compilers: gcc / clang / MSVC.
* Targets: X86 (SSE/AVX/AVX2/AVX512F), AArch64 (Advanced SIMD), ARM
(NEON). Emulators like QEMU or SDE can be used to run the tests.
- Added the following new vector functions (with relative testing):
* `log2`
- New compatibility tests have been added to check that
`libsleefgnuabi` exports the GNUABI symbols correctly.
- The library can be compiled to an LLVM bitcode object.
- Added masked interface to the library to support AVX512F masked
vectorization.
### Changed
- Use native instructions if available for `sqrt`.
- Fixed fmax and fmin behavior on AArch64:
https://github.com/shibatch/sleef/pull/140
- Speed improvements for `asin`, `acos`, `fmod` and `log`. Computation
speed of other functions are also improved by general optimization.
https://github.com/shibatch/sleef/pull/97
- Removed `libm` dependency.
### Removed
- Makefile build system
## 3.1 - 2017-07-19
- Added AArch64 support
- Implemented the remaining C99 math functions : lgamma, tgamma,
erf, erfc, fabs, copysign, fmax, fmin, fdim, trunc, floor, ceil,
round, rint, modf, ldexp, nextafter, frexp, hypot, and fmod.
- Added dispatcher for x86 functions
- Improved reduction of trigonometric functions
- Added support for 32-bit x86, Cygwin, etc.
- Improved tester
## 3.0 - 2017-02-07
- New API is defined
- Functions for DFT are added
- sincospi functions are added
- gencoef now supports single, extended and quad precision in addition to double precision
- Linux, Windows and Mac OS X are supported
- GCC, Clang, Intel Compiler, Microsoft Visual C++ are supported
- The library can be compiled as DLLs
- Files needed for creating a debian package are now included
## 2.120 - 2017-01-30
- Relicensed to Boost Software License Version 1.0
## 2.110 - 2016-12-11
- The valid range of argument is extended for trig functions
- Specification of each functions regarding to the domain and accuracy is added
- A coefficient generation tool is added
- New testing tools are introduced
- Following functions returned incorrect values when the argument is very large or small : exp, pow, asinh, acosh
- SIMD xsin and xcos returned values more than 1 when FMA is enabled
- Pure C cbrt returned incorrect values when the argument is negative
- tan_u1 returned values with more than 1 ulp of error on rare occasions
- Removed support for Java language (because no one seems to be using it)
## 2.100 - 2016-12-04
- Added support for AVX-512F and Clang Extended Vectors.
## 2.90 - 2016-11-27
- Added ilogbf. All the reported bugs(listed below) are fixed.
- Log function returned incorrect values when the argument is very small.
- Signs of returned values were incorrect when the argument is signed zero.
- Tester incorrectly counted ULP in some cases.
- ilogb function returned incorrect values in some cases.
## 2.80 - 2013-05-18
- Added support for ARM NEON. Added higher accuracy single
precision functions : sinf_u1, cosf_u1, sincosf_u1, tanf_u1, asinf_u1,
acosf_u1, atanf_u1, atan2f_u1, logf_u1, and cbrtf_u1.
## 2.70 - 2013-04-30
- Added higher accuracy functions : sin_u1, cos_u1, sincos_u1,
tan_u1, asin_u1, acos_u1, atan_u1, atan2_u1, log_u1, and
cbrt_u1. These functions evaluate the corresponding function with at
most 1 ulp of error.
## 2.60 - 2013-03-26
- Added the remaining single precision functions : powf, sinhf,
coshf, tanhf, exp2f, exp10f, log10f, log1pf. Added support for FMA4
(for AMD Bulldozer). Added more test cases. Fixed minor bugs (which
degraded accuracy in some rare cases).
## 2.50 - 2013-03-12
- Added support for AVX2. SLEEF now compiles with ICC.
## 2.40 - 2013-03-07
- Fixed incorrect denormal/nonnumber handling in ldexp, ldexpf,
sinf and cosf. Removed support for Go language.
## 2.31 - 2012-07-05
- Added sincosf.
## 2.30 - 2012-01-20
- Added single precision functions : sinf, cosf, tanf, asinf,
acosf, atanf, logf, expf, atan2f and cbrtf.
## 2.20 - 2012-01-09
- Added exp2, exp10, expm1, log10, log1p, and cbrt.
## 2.10 - 2012-01-05
- asin() and acos() are back.
- Added ilogb() and ldexp().
- Added hyperbolic functions.
- Eliminated dependency on frexp, ldexp, fabs, isnan and isinf.
## 2.00 - 2011-12-30
- All of the algorithm has been updated.
- Both accuracy and speed are improved since version 1.10.
- Denormal number handling is also improved.
## 1.10 - 2010-06-22
- AVX support is added. Accuracy tester is added.
## 1.00 - 2010-05-15
- Initial release


@@ -0,0 +1,339 @@
cmake_minimum_required(VERSION 3.18)
project(SLEEF VERSION 3.6.1 LANGUAGES C)
set(SLEEF_SOVERSION ${SLEEF_VERSION_MAJOR})
# Options
option(SLEEF_BUILD_STATIC_TEST_BINS "Build statically linked test executables" OFF)
option(SLEEF_ENABLE_LTO "Enable LTO on GCC or ThinLTO on clang" OFF)
option(SLEEF_BUILD_LIBM "libsleef will be built." ON)
option(SLEEF_BUILD_DFT "libsleefdft will be built." OFF)
option(SLEEF_BUILD_QUAD "libsleefquad will be built." OFF)
option(SLEEF_BUILD_GNUABI_LIBS "libsleefgnuabi will be built." ON)
option(SLEEF_BUILD_SCALAR_LIB "libsleefscalar will be built." OFF)
option(SLEEF_BUILD_TESTS "Tests will be built." ON)
option(SLEEF_BUILD_INLINE_HEADERS "Build header for inlining whole SLEEF functions" OFF)
option(SLEEF_TEST_ALL_IUT "Perform tests on implementations with all vector extensions" OFF)
option(SLEEF_SHOW_CONFIG "Show SLEEF configuration status messages." ON)
option(SLEEF_SHOW_ERROR_LOG "Show cmake error log." OFF)
option(SLEEF_ASAN "Enable address sanitizing on all targets." OFF)
option(SLEEF_ENFORCE_TESTER "Build fails if tester is not available" OFF)
option(SLEEF_ENFORCE_TESTER3 "Build fails if tester3 is not built" OFF)
option(SLEEF_ENABLE_ALTDIV "Enable alternative division method (aarch64 only)" OFF)
option(SLEEF_ENABLE_ALTSQRT "Enable alternative sqrt method (aarch64 only)" OFF)
option(SLEEF_DISABLE_FFTW "Disable testing the DFT library with FFTW" OFF)
option(SLEEF_DISABLE_MPFR "Disable testing with the MPFR library" OFF)
option(SLEEF_DISABLE_SSL "Disable testing with the SSL library" OFF)
option(SLEEF_ENABLE_CUDA "Enable CUDA" OFF)
option(SLEEF_ENABLE_CXX "Enable C++" OFF)
#
if (DEFINED SLEEF_BUILD_SHARED_LIBS)
set(BUILD_SHARED_LIBS ${SLEEF_BUILD_SHARED_LIBS})
endif ()
if (SLEEF_SHOW_CONFIG)
# Normalize the value of BUILD_SHARED_LIBS so that it displays nicely
# in the configuration display
if (BUILD_SHARED_LIBS)
set(BUILD_SHARED_LIBS ON)
else ()
set(BUILD_SHARED_LIBS OFF)
endif ()
endif ()
# Function used to generate safe command arguments for add_custom_command
function(command_arguments PROPNAME)
set(quoted_args "")
foreach(arg ${ARGN})
list(APPEND quoted_args "\"${arg}\"" )
endforeach()
set(${PROPNAME} ${quoted_args} PARENT_SCOPE)
endfunction()
# Helper function for concatenating several files
function(sleef_concat_files)
cmake_parse_arguments(concat_required "" "OUTPUT" "SOURCES" ${ARGN})
if("${concat_required_OUTPUT}" STREQUAL "")
message(FATAL_ERROR "Must pass OUTPUT to sleef_concat_files")
endif()
if(NOT concat_required_SOURCES)
message(FATAL_ERROR "sleef_concat_files not passed any SOURCES")
endif()
add_custom_command(
OUTPUT ${concat_required_OUTPUT}
COMMAND ${CMAKE_COMMAND} -E cat ${concat_required_SOURCES} > ${concat_required_OUTPUT}
DEPENDS ${concat_required_SOURCES}
COMMAND_EXPAND_LISTS)
endfunction()
# Settings
set(SLEEF_ALL_SUPPORTED_EXTENSIONS
AVX512FNOFMA AVX512F AVX2 AVX2128 FMA4 AVX SSE4 SSE2 # x86
SVENOFMA SVE ADVSIMDNOFMA ADVSIMD # Aarch64
NEON32 NEON32VFPV4 # Aarch32
VSX VSXNOFMA VSX3 VSX3NOFMA # PPC64
VXE VXENOFMA VXE2 VXE2NOFMA # IBM Z
RVVM1NOFMA RVVM1 RVVM2NOFMA RVVM2 # RISC-V Vectors
PUREC_SCALAR PURECFMA_SCALAR # Generic type
CACHE STRING "List of SIMD architectures supported by libsleef."
)
set(SLEEF_SUPPORTED_LIBM_EXTENSIONS
AVX512FNOFMA AVX512F AVX2 AVX2128 FMA4 AVX SSE4 SSE2 # x86
SVENOFMA SVE ADVSIMDNOFMA ADVSIMD # Aarch64
NEON32 NEON32VFPV4 # Aarch32
VSX VSXNOFMA VSX3 VSX3NOFMA # PPC64
VXE VXENOFMA VXE2 VXE2NOFMA # IBM Z
RVVM1NOFMA RVVM1 RVVM2NOFMA RVVM2 # RISC-V Vectors
PUREC_SCALAR PURECFMA_SCALAR # Generic type
CACHE STRING "List of SIMD architectures supported by libsleef."
)
set(SLEEF_SUPPORTED_GNUABI_EXTENSIONS
SSE2 AVX AVX2 AVX512F ADVSIMD SVE
CACHE STRING "List of SIMD architectures supported by libsleef for GNU ABI."
)
set(SLEEF_SUPPORTED_QUAD_EXTENSIONS
PUREC_SCALAR PURECFMA_SCALAR SSE2 AVX2128 AVX2 AVX512F ADVSIMD SVE VSX VSX3 VXE VXE2 RVVM1 RVVM2)
# MKMASKED_PARAMS
command_arguments(MKMASKED_PARAMS_GNUABI_AVX512F_dp avx512f e 8)
command_arguments(MKMASKED_PARAMS_GNUABI_AVX512F_sp avx512f e -16)
command_arguments(MKMASKED_PARAMS_GNUABI_SVE_dp sve s 2)
command_arguments(MKMASKED_PARAMS_GNUABI_SVE_sp sve s -4)
#
set(COSTOVERRIDE_AVX512F 10)
set(COSTOVERRIDE_AVX512FNOFMA 10)
set(COSTOVERRIDE_AVX2 2)
set(COSTOVERRIDE_AVX 2)
set(COSTOVERRIDE_NEON32 2)
set(COSTOVERRIDE_NEON32VFPV4 2)
set(COSTOVERRIDE_SVE 10)
set(COSTOVERRIDE_SVENOFMA 10)
set(COSTOVERRIDE_RVVM1 10)
set(COSTOVERRIDE_RVVM1NOFMA 10)
set(COSTOVERRIDE_RVVM2 20)
set(COSTOVERRIDE_RVVM2NOFMA 20)
#
enable_testing()
if (SLEEF_ENABLE_CXX)
enable_language(CXX)
endif()
if (SLEEF_ENABLE_CUDA)
enable_language(CUDA)
endif()
# For specifying installation directories
include(GNUInstallDirs)
if(NOT DEFINED sleef_SOURCE_DIR)
set(sleef_SOURCE_DIR ${CMAKE_SOURCE_DIR})
endif()
if(NOT DEFINED sleef_BINARY_DIR)
set(sleef_BINARY_DIR ${CMAKE_BINARY_DIR})
endif()
# Sanity check for in-source builds which we do not want to happen
if(sleef_SOURCE_DIR STREQUAL sleef_BINARY_DIR)
message(FATAL_ERROR "SLEEF does not allow in-source builds.
You can refer to docs/build-with-cmake.md for instructions on how to provide a \
separate build directory. Note: Please remove autogenerated file \
`CMakeCache.txt` and directory `CMakeFiles` in the current directory.")
endif()
if(SLEEF_ENABLE_LTO AND BUILD_SHARED_LIBS)
message(FATAL_ERROR "SLEEF_ENABLE_LTO and BUILD_SHARED_LIBS cannot be specified at the same time")
endif(SLEEF_ENABLE_LTO AND BUILD_SHARED_LIBS)
if(SLEEF_ENABLE_LTO)
include(CheckIPOSupported)
check_ipo_supported(RESULT supported OUTPUT error)
endif()
# Set output directories for the library files
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin)
foreach(CONFIG ${CMAKE_CONFIGURATION_TYPES})
string(TOUPPER ${CONFIG} CONFIG)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_${CONFIG} ${PROJECT_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY_${CONFIG} ${PROJECT_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_${CONFIG} ${PROJECT_BINARY_DIR}/bin)
endforeach(CONFIG CMAKE_CONFIGURATION_TYPES)
# Path for finding cmake modules
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules)
set(SLEEF_SCRIPT_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Scripts CACHE PATH
"Path for finding sleef specific cmake scripts")
if (CMAKE_C_COMPILER_ID MATCHES "Clang" AND "x${CMAKE_C_SIMULATE_ID}" STREQUAL "xMSVC")
message(STATUS "Building with Clang on Windows")
set(SLEEF_CLANG_ON_WINDOWS TRUE)
endif()
# sleef-config.h.in passes cmake settings to the source code
include(Configure.cmake)
configure_file(
${PROJECT_SOURCE_DIR}/sleef-config.h.in
${PROJECT_BINARY_DIR}/include/sleef-config.h @ONLY)
# We like to have a documented index of all targets in the project. The
# variables listed below carry the names of the targets defined throughout
# the project.
# Generates object file (shared library) `libsleef`
# Defined in src/libm/CMakeLists.txt via command add_library
set(TARGET_LIBSLEEF "sleef")
set(TARGET_LIBSLEEFGNUABI "sleefgnuabi")
# Generates the sleef.h headers and all the rename headers
# Defined in src/libm/CMakeLists.txt via custom commands and a custom target
set(TARGET_HEADERS "headers")
set(TARGET_INLINE_HEADERS "inline_headers")
set(TARGET_QINLINE_HEADERS "quad_inline_headers")
set(TARGET_LIBINLINE "sleefinline")
# Generates executable files for running the test suite
# Defined in src/libm-tester/CMakeLists.txt via command add_executable
set(TARGET_TESTER "tester")
set(TARGET_IUT "iut")
# The target to generate LLVM bitcode only, available when SLEEF_ENABLE_LLVM_BITCODE is passed to cmake
set(TARGET_LLVM_BITCODE "llvm-bitcode")
# Generates the helper executable file mkrename needed to write the sleef header
set(TARGET_MKRENAME "mkrename")
set(TARGET_MKRENAME_GNUABI "mkrename_gnuabi")
set(TARGET_MKMASKED_GNUABI "mkmasked_gnuabi")
# Generates the helper executable file mkdisp needed to write the sleef header
set(TARGET_MKDISP "mkdisp")
set(TARGET_MKALIAS "mkalias")
# Generates static library common
# Defined in src/common/CMakeLists.txt via command add_library
set(TARGET_LIBCOMMON_OBJ "common")
set(TARGET_LIBARRAYMAP_OBJ "arraymap")
# Function used to add an executable that is executed on host
function(add_host_executable TARGETNAME)
if (NOT CMAKE_CROSSCOMPILING)
add_executable(${TARGETNAME} ${ARGN})
# Ensure that Darwin host executable is built as universal binary
if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "^(x86_64|arm64)$")
target_compile_options(${TARGETNAME} PRIVATE -arch "${CMAKE_HOST_SYSTEM_PROCESSOR}")
target_link_options(${TARGETNAME} PRIVATE -arch "${CMAKE_HOST_SYSTEM_PROCESSOR}")
endif()
else()
add_executable(${TARGETNAME} IMPORTED GLOBAL)
set_property(TARGET ${TARGETNAME} PROPERTY IMPORTED_LOCATION ${NATIVE_BUILD_DIR}/bin/${TARGETNAME})
endif()
endfunction()
function(host_target_AAVPCS_definitions TARGETNAME)
if (NOT CMAKE_CROSSCOMPILING)
target_compile_definitions(${TARGETNAME} PRIVATE ENABLE_AAVPCS=1)
endif()
endfunction()
# Generates object file (shared library) `libsleefdft`
# Defined in src/dft/CMakeLists.txt via command add_library
set(TARGET_LIBDFT "sleefdft")
# Check subdirectories
add_subdirectory("src")
# Install the CMake package config
include(CMakePackageConfigHelpers)
write_basic_package_version_file(
sleefConfigVersion.cmake
COMPATIBILITY SameMajorVersion
)
set(
SLEEF_INSTALL_CMAKEDIR "${CMAKE_INSTALL_LIBDIR}/cmake/sleef"
CACHE STRING "CMake package config location relative to the install prefix"
)
mark_as_advanced(SLEEF_INSTALL_CMAKEDIR)
install(
FILES
"${PROJECT_SOURCE_DIR}/sleefConfig.cmake"
"${PROJECT_BINARY_DIR}/sleefConfigVersion.cmake"
DESTINATION "${SLEEF_INSTALL_CMAKEDIR}"
COMPONENT sleef_Development
)
install(
EXPORT sleefTargets
NAMESPACE sleef::
DESTINATION "${SLEEF_INSTALL_CMAKEDIR}"
COMPONENT sleef_Development
)
# Extra messages at configuration time. By default is active, it can be
# turned off by invoking cmake with "-DSLEEF_SHOW_CONFIG=OFF".
if(SLEEF_SHOW_CONFIG)
message(STATUS "Configuring build for ${PROJECT_NAME}-v${SLEEF_VERSION}")
message(" Target system: ${CMAKE_SYSTEM}")
if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "^(x86_64|arm64)$")
message(" Target processor: ${CMAKE_OSX_ARCHITECTURES}")
else()
message(" Target processor: ${CMAKE_SYSTEM_PROCESSOR}")
endif()
message(" Host system: ${CMAKE_HOST_SYSTEM}")
message(" Host processor: ${CMAKE_HOST_SYSTEM_PROCESSOR}")
message(" Detected C compiler: ${CMAKE_C_COMPILER_ID} @ ${CMAKE_C_COMPILER}")
message(" CMake: ${CMAKE_VERSION}")
message(" Make program: ${CMAKE_MAKE_PROGRAM}")
if(CMAKE_CROSSCOMPILING)
message(" Crosscompiling SLEEF.")
message(" Native build dir: ${NATIVE_BUILD_DIR}")
endif(CMAKE_CROSSCOMPILING)
message(STATUS "Using option `${SLEEF_C_FLAGS}` to compile libsleef")
message(STATUS "Building shared libs : " ${BUILD_SHARED_LIBS})
message(STATUS "Building static test bins: " ${SLEEF_BUILD_STATIC_TEST_BINS})
message(STATUS "MPFR : " ${LIB_MPFR})
if (MPFR_INCLUDE_DIR)
message(STATUS "MPFR header file in " ${MPFR_INCLUDE_DIR})
endif()
message(STATUS "GMP : " ${LIBGMP})
message(STATUS "RT : " ${LIBRT})
message(STATUS "FFTW3 : " ${LIBFFTW3})
message(STATUS "OPENSSL : " ${OPENSSL_VERSION})
message(STATUS "SDE : " ${SDE_COMMAND})
if (SLEEF_BUILD_INLINE_HEADERS)
message(STATUS "SED : " ${SED_COMMAND})
endif()
message(STATUS "COMPILER_SUPPORTS_OPENMP : " ${COMPILER_SUPPORTS_OPENMP})
if(ENABLE_GNUABI)
message(STATUS "A version of SLEEF compatible with libm and libmvec in GNU libc will be produced (${TARGET_LIBSLEEFGNUABI}.so)")
endif()
if (COMPILER_SUPPORTS_SVE)
message(STATUS "Building SLEEF with VLA SVE support")
if (ARMIE_COMMAND)
message(STATUS "Arm Instruction Emulator found at ${ARMIE_COMMAND}")
message(STATUS "SVE testing is done with ${SVE_VECTOR_BITS}-bits vectors.")
endif()
endif()
if(FORCE_AAVPCS)
message(STATUS "Building SLEEF with AArch64 Vector PCS support")
endif()
endif(SLEEF_SHOW_CONFIG)

View File

@@ -0,0 +1,27 @@
# List of contributors
These lists are not exhaustive and only provide the most relevant contact information.
For an exhaustive list of contributors please refer to the
[GitHub contributors section for SLEEF](https://github.com/shibatch/sleef/graphs/contributors).
## Maintainers
| Name | Affiliation | Github profile |
| -------------------- | ----------------------- | ---------------------------------- |
| Pierre Blanchard | Arm Ltd. | https://github.com/blapie |
| Joana Cruz | Arm Ltd. | https://github.com/joanaxcruz |
| Joe Ramsay | Arm Ltd. | https://github.com/joeramsay |
| Naoki Shibata | Nara Institute of Science and Technology | https://github.com/shibatch |
## Contributors
| Name | Affiliation | Github profile |
| -------------------- | ----------------------- | ---------------------------------- |
| Anonymous | | https://github.com/friendlyanon |
| Diana Bite | Former Arm Ltd. | https://github.com/diaena |
| Ludovic Henry | Rivos Inc. | https://github.com/luhenry |
| Martin Krastev | Chaos Group | https://github.com/blu |
| Jilayne Lovejoy | Former Arm Inc. | https://github.com/jlovejoy |
| Kerry McLaughlin | Arm Ltd. | https://github.com/kmclaughlin-arm |
| Alexandre Mutel | Unity Technologies | https://github.com/xoofx |
| Francesco Petrogalli | Former Arm Ltd. | https://github.com/fpetrogalli-arm |

View File

@@ -0,0 +1,860 @@
include(CheckCCompilerFlag)
include(CheckCSourceCompiles)
include(CheckTypeSize)
include(CheckLanguage)
#
if (SLEEF_BUILD_STATIC_TEST_BINS)
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
set(BUILD_SHARED_LIBS OFF)
set(CMAKE_EXE_LINKER_FLAGS "-static")
endif()
set(OPENSSL_EXTRA_LIBRARIES "" CACHE STRING "Extra libraries for openssl")
if (NOT CMAKE_CROSSCOMPILING AND NOT SLEEF_FORCE_FIND_PACKAGE_SSL)
if (SLEEF_BUILD_STATIC_TEST_BINS)
set(OPENSSL_USE_STATIC_LIBS TRUE)
endif()
find_package(OpenSSL)
if (OPENSSL_FOUND)
set(SLEEF_OPENSSL_FOUND TRUE)
set(SLEEF_OPENSSL_LIBRARIES ${OPENSSL_LIBRARIES})
# Workaround for a tester3 SIGSEGV when linking some versions of openssl (1.1.1) statically.
# This is a known issue https://github.com/openssl/openssl/issues/13872.
if (SLEEF_BUILD_STATIC_TEST_BINS)
string(REGEX REPLACE
"-lpthread" "-Wl,--whole-archive -lpthread -Wl,--no-whole-archive"
SLEEF_OPENSSL_LIBRARIES "${OPENSSL_LIBRARIES}")
endif()
set(SLEEF_OPENSSL_VERSION ${OPENSSL_VERSION})
set(SLEEF_OPENSSL_LIBRARIES ${SLEEF_OPENSSL_LIBRARIES} ${OPENSSL_EXTRA_LIBRARIES})
set(SLEEF_OPENSSL_INCLUDE_DIR ${OPENSSL_INCLUDE_DIR})
endif()
else()
# find_package cannot find OpenSSL when cross-compiling
find_library(LIBSSL ssl)
find_library(LIBCRYPTO crypto)
if (LIBSSL AND LIBCRYPTO)
set(SLEEF_OPENSSL_FOUND TRUE)
set(SLEEF_OPENSSL_LIBRARIES ${LIBSSL} ${LIBCRYPTO} ${OPENSSL_EXTRA_LIBRARIES})
set(SLEEF_OPENSSL_VERSION ${LIBSSL})
endif()
endif()
if (SLEEF_ENFORCE_TESTER3 AND NOT SLEEF_OPENSSL_FOUND)
message(FATAL_ERROR "SLEEF_ENFORCE_TESTER3 is specified and OpenSSL not found")
endif()
# Some toolchains require explicit linking of the following libraries.
find_library(LIB_MPFR mpfr)
find_library(LIBM m)
find_library(LIBGMP gmp)
find_library(LIBRT rt)
find_library(LIBFFTW3 fftw3)
if (LIB_MPFR)
find_path(MPFR_INCLUDE_DIR
NAMES mpfr.h
ONLY_CMAKE_FIND_ROOT_PATH)
endif(LIB_MPFR)
if (LIBFFTW3)
find_path(FFTW3_INCLUDE_DIR
NAMES fftw3.h
ONLY_CMAKE_FIND_ROOT_PATH)
endif(LIBFFTW3)
if (NOT LIBM)
set(LIBM "")
endif()
if (NOT LIBRT)
set(LIBRT "")
endif()
if (SLEEF_DISABLE_MPFR)
set(LIB_MPFR "")
endif()
if (SLEEF_DISABLE_SSL)
set(SLEEF_OPENSSL_FOUND FALSE)
endif()
# Force-set a default build type if none was specified
# Note: some sleef code requires optimisation flags to be turned on
if(NOT CMAKE_BUILD_TYPE)
message(STATUS "Setting build type to 'Release' (required for full support).")
set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
"Debug" "Release" "RelWithDebInfo" "MinSizeRel")
endif()
# Sanitizers
if(SLEEF_ASAN)
# Add address sanitizing to all targets
add_compile_options(-fno-omit-frame-pointer -fsanitize=address)
add_link_options(-fno-omit-frame-pointer -fsanitize=address)
endif()
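# An address-sanitized build can then be configured with, for example:
#   cmake -S . -B build -DSLEEF_ASAN=ON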
# TARGET PROCESSOR DETECTION
set(SLEEF_TARGET_PROCESSOR "${CMAKE_SYSTEM_PROCESSOR}")
if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "^(x86_64|arm64)$")
set(SLEEF_TARGET_PROCESSOR "${CMAKE_OSX_ARCHITECTURES}")
endif()
# PLATFORM DETECTION
if(CMAKE_SIZEOF_VOID_P EQUAL 4)
set(SLEEF_ARCH_32BIT ON CACHE INTERNAL "True for 32-bit architecture.")
endif()
if(SLEEF_TARGET_PROCESSOR MATCHES "(x86|AMD64|amd64|^i.86$)")
set(SLEEF_ARCH_X86 ON CACHE INTERNAL "True for x86 architecture.")
set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-mavx2;-mfma")
elseif(SLEEF_TARGET_PROCESSOR MATCHES "aarch64|arm64")
set(SLEEF_ARCH_AARCH64 ON CACHE INTERNAL "True for Aarch64 architecture.")
# Aarch64 requires support for advsimdfma4
set(COMPILER_SUPPORTS_ADVSIMD 1)
set(COMPILER_SUPPORTS_ADVSIMDNOFMA 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
set(SLEEF_ARCH_AARCH32 ON CACHE INTERNAL "True for Aarch32 architecture.")
set(COMPILER_SUPPORTS_NEON32 1)
set(COMPILER_SUPPORTS_NEON32VFPV4 1)
set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-mfpu=vfpv4")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64")
set(SLEEF_ARCH_PPC64 ON CACHE INTERNAL "True for PPC64 architecture.")
set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-mvsx")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x")
set(SLEEF_ARCH_S390X ON CACHE INTERNAL "True for IBM Z architecture.")
set(CLANG_FLAGS_ENABLE_PUREC_SCALAR "-march=z14;-mzvector")
set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-march=z14;-mzvector")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
set(SLEEF_ARCH_RISCV64 ON CACHE INTERNAL "True for RISCV64 architecture.")
endif()
set(COMPILER_SUPPORTS_PUREC_SCALAR 1)
set(COMPILER_SUPPORTS_PURECFMA_SCALAR 1)
# Compiler feature detection
# Detect CLANG executable path (on both Windows and Linux/OSX)
if(NOT CLANG_EXE_PATH)
# If the current compiler used by CMake is already clang, use it directly
if(CMAKE_C_COMPILER MATCHES "clang")
set(CLANG_EXE_PATH ${CMAKE_C_COMPILER})
else()
# Otherwise, try to find clang on the PATH
find_program(CLANG_EXE_PATH NAMES clang "clang-11" "clang-10" "clang-9" "clang-8" "clang-7" "clang-6.0" "clang-5.0" "clang-4.0" "clang-3.9")
endif()
endif()
# Allow defining the GCC/Clang flags here,
# as we might compile the lib with MSVC but generate bitcode with Clang
# Intel vector extensions.
set(CLANG_FLAGS_ENABLE_SSE2 "-msse2")
set(CLANG_FLAGS_ENABLE_SSE4 "-msse4.1")
set(CLANG_FLAGS_ENABLE_AVX "-mavx")
set(CLANG_FLAGS_ENABLE_FMA4 "-mfma4")
set(CLANG_FLAGS_ENABLE_AVX2 "-mavx2;-mfma")
set(CLANG_FLAGS_ENABLE_AVX2128 "-mavx2;-mfma")
set(CLANG_FLAGS_ENABLE_AVX512F "-mavx512f")
set(CLANG_FLAGS_ENABLE_AVX512FNOFMA "-mavx512f")
set(CLANG_FLAGS_ENABLE_NEON32 "--target=arm-linux-gnueabihf;-mcpu=cortex-a8")
set(CLANG_FLAGS_ENABLE_NEON32VFPV4 "-march=armv7-a;-mfpu=neon-vfpv4")
# Arm AArch64 vector extensions.
set(CLANG_FLAGS_ENABLE_SVE "-march=armv8-a+sve")
set(CLANG_FLAGS_ENABLE_SVENOFMA "-march=armv8-a+sve")
# PPC64
set(CLANG_FLAGS_ENABLE_VSX "-mcpu=power8")
set(CLANG_FLAGS_ENABLE_VSXNOFMA "-mcpu=power8")
set(CLANG_FLAGS_ENABLE_VSX3 "-mcpu=power9")
set(CLANG_FLAGS_ENABLE_VSX3NOFMA "-mcpu=power9")
# IBM z
set(CLANG_FLAGS_ENABLE_VXE "-march=z14;-mzvector")
set(CLANG_FLAGS_ENABLE_VXENOFMA "-march=z14;-mzvector")
set(CLANG_FLAGS_ENABLE_VXE2 "-march=z15;-mzvector")
set(CLANG_FLAGS_ENABLE_VXE2NOFMA "-march=z15;-mzvector")
# RISC-V
set(CLANG_FLAGS_ENABLE_RVVM1 "-march=rv64gcv_zba_zbb_zbs")
set(CLANG_FLAGS_ENABLE_RVVM1NOFMA "-march=rv64gcv_zba_zbb_zbs")
set(CLANG_FLAGS_ENABLE_RVVM2 "-march=rv64gcv_zba_zbb_zbs")
set(CLANG_FLAGS_ENABLE_RVVM2NOFMA "-march=rv64gcv_zba_zbb_zbs")
set(FLAGS_OTHERS "")
# All variables storing compiler flags should be prefixed with FLAGS_
if(CMAKE_C_COMPILER_ID MATCHES "(GNU|Clang)")
# Always compile sleef with -ffp-contract=off.
set(FLAGS_STRICTMATH "-ffp-contract=off")
set(FLAGS_FASTMATH "-ffast-math")
set(FLAGS_NOSTRICTALIASING "-fno-strict-aliasing")
if (SLEEF_ARCH_X86 AND SLEEF_ARCH_32BIT)
string(CONCAT FLAGS_STRICTMATH ${FLAGS_STRICTMATH} " -msse2 -mfpmath=sse")
string(CONCAT FLAGS_FASTMATH ${FLAGS_FASTMATH} " -msse2 -mfpmath=sse")
endif()
# Without the options below, gcc generates calls to libm
string(CONCAT FLAGS_OTHERS "-fno-math-errno -fno-trapping-math")
# Intel vector extensions.
foreach(SIMD ${SLEEF_ALL_SUPPORTED_EXTENSIONS})
set(FLAGS_ENABLE_${SIMD} ${CLANG_FLAGS_ENABLE_${SIMD}})
endforeach()
# Warning flags.
set(FLAGS_WALL "-Wall -Wno-unused-function -Wno-attributes -Wno-unused-result")
if(CMAKE_C_COMPILER_ID MATCHES "GNU")
# The following compiler option is needed to suppress the warning
# "AVX vector return without AVX enabled changes the ABI" at
# src/arch/helpervecext.h:88
string(CONCAT FLAGS_WALL ${FLAGS_WALL} " -Wno-psabi")
set(FLAGS_ENABLE_NEON32 "-mfpu=neon")
endif(CMAKE_C_COMPILER_ID MATCHES "GNU")
if(CMAKE_C_COMPILER_ID MATCHES "Clang" AND SLEEF_ENABLE_LTO)
if (NOT SLEEF_LLVM_AR_COMMAND)
find_program(SLEEF_LLVM_AR_COMMAND "llvm-ar")
endif()
if (SLEEF_LLVM_AR_COMMAND)
SET(CMAKE_AR ${SLEEF_LLVM_AR_COMMAND})
SET(CMAKE_C_ARCHIVE_CREATE "<CMAKE_AR> rcs <TARGET> <LINK_FLAGS> <OBJECTS>")
SET(CMAKE_C_ARCHIVE_FINISH "true")
endif(SLEEF_LLVM_AR_COMMAND)
string(CONCAT FLAGS_OTHERS "-flto=thin")
endif(CMAKE_C_COMPILER_ID MATCHES "Clang" AND SLEEF_ENABLE_LTO)
# Flags for generating inline headers
set(FLAG_PREPROCESS "-E")
set(FLAG_PRESERVE_COMMENTS "-C")
set(FLAG_INCLUDE "-I")
set(FLAG_DEFINE "-D")
if (SLEEF_CLANG_ON_WINDOWS)
# The following line is required to prevent clang from displaying
# many warnings. Clang on Windows references MSVC header files,
# which have deprecation and security attributes for many
# functions.
string(CONCAT FLAGS_WALL ${FLAGS_WALL} " -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_DEPRECATE -Wno-deprecated-declarations")
endif()
elseif(MSVC)
# Intel vector extensions.
if (CMAKE_CL_64)
set(FLAGS_ENABLE_SSE2 /D__SSE2__)
set(FLAGS_ENABLE_SSE4 /D__SSE2__ /D__SSE3__ /D__SSE4_1__)
else()
set(FLAGS_ENABLE_SSE2 /D__SSE2__ /arch:SSE2)
set(FLAGS_ENABLE_SSE4 /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /arch:SSE2)
endif()
set(FLAGS_ENABLE_AVX /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /arch:AVX)
set(FLAGS_ENABLE_FMA4 /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /D__AVX2__ /D__FMA4__ /arch:AVX2)
set(FLAGS_ENABLE_AVX2 /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /D__AVX2__ /arch:AVX2)
set(FLAGS_ENABLE_AVX2128 /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /D__AVX2__ /arch:AVX2)
set(FLAGS_ENABLE_AVX512F /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /D__AVX2__ /D__AVX512F__ /arch:AVX2)
set(FLAGS_ENABLE_AVX512FNOFMA /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /D__AVX2__ /D__AVX512F__ /arch:AVX2)
set(FLAGS_ENABLE_PURECFMA_SCALAR /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /D__AVX2__ /arch:AVX2)
set(FLAGS_WALL "/D_CRT_SECURE_NO_WARNINGS /D_CRT_NONSTDC_NO_DEPRECATE")
set(FLAGS_NO_ERRNO "")
set(FLAG_PREPROCESS "/E")
set(FLAG_PRESERVE_COMMENTS "/C")
set(FLAG_INCLUDE "/I")
set(FLAG_DEFINE "/D")
elseif(CMAKE_C_COMPILER_ID MATCHES "Intel")
set(FLAGS_ENABLE_SSE2 "-msse2")
set(FLAGS_ENABLE_SSE4 "-msse4.1")
set(FLAGS_ENABLE_AVX "-mavx")
set(FLAGS_ENABLE_AVX2 "-march=core-avx2")
set(FLAGS_ENABLE_AVX2128 "-march=core-avx2")
set(FLAGS_ENABLE_AVX512F "-xCOMMON-AVX512")
set(FLAGS_ENABLE_AVX512FNOFMA "-xCOMMON-AVX512")
set(FLAGS_ENABLE_PURECFMA_SCALAR "-march=core-avx2;-fno-strict-aliasing")
set(FLAGS_ENABLE_FMA4 "-msse2") # This is a dummy flag
if(CMAKE_C_COMPILER_ID MATCHES "IntelLLVM")
set(FLAGS_STRICTMATH "-fp-model strict -Qoption,cpp,--extended_float_types")
set(FLAGS_FASTMATH "-fp-model fast -Qoption,cpp,--extended_float_types")
else()
set(FLAGS_STRICTMATH "-fp-model strict -Qoption,cpp,--extended_float_type")
set(FLAGS_FASTMATH "-fp-model fast=2 -Qoption,cpp,--extended_float_type")
endif()
set(FLAGS_NOSTRICTALIASING "-fno-strict-aliasing")
set(FLAGS_WALL "-fmax-errors=3 -Wall -Wno-unused -Wno-attributes")
set(FLAGS_NO_ERRNO "")
set(FLAG_PREPROCESS "-E")
set(FLAG_PRESERVE_COMMENTS "-C")
set(FLAG_INCLUDE "-I")
set(FLAG_DEFINE "-D")
endif()
set(SLEEF_C_FLAGS "${FLAGS_WALL} ${FLAGS_STRICTMATH} ${FLAGS_OTHERS}")
if(CMAKE_C_COMPILER_ID MATCHES "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 6.99)
set(DFT_C_FLAGS "${FLAGS_WALL} ${FLAGS_NOSTRICTALIASING} ${FLAGS_OTHERS}")
else()
set(DFT_C_FLAGS "${FLAGS_WALL} ${FLAGS_NOSTRICTALIASING} ${FLAGS_FASTMATH} ${FLAGS_OTHERS}")
endif()
if(CMAKE_C_COMPILER_ID MATCHES "GNU")
set(FLAGS_ENABLE_SVE "${FLAGS_ENABLE_SVE};-fno-tree-vrp")
endif()
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^i.86$" AND CMAKE_C_COMPILER_ID MATCHES "GNU")
set(SLEEF_C_FLAGS "${SLEEF_C_FLAGS} -msse2 -mfpmath=sse")
set(DFT_C_FLAGS "${DFT_C_FLAGS} -msse2 -mfpmath=sse -m128bit-long-double")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^i.86$" AND CMAKE_C_COMPILER_ID MATCHES "Clang")
set(SLEEF_C_FLAGS "${SLEEF_C_FLAGS} -msse2 -mfpmath=sse")
set(DFT_C_FLAGS "${DFT_C_FLAGS} -msse2 -mfpmath=sse")
endif()
if(CYGWIN OR MINGW)
set(SLEEF_C_FLAGS "${SLEEF_C_FLAGS} -fno-asynchronous-unwind-tables")
set(DFT_C_FLAGS "${DFT_C_FLAGS} -fno-asynchronous-unwind-tables")
endif()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" AND CMAKE_C_COMPILER_ID MATCHES "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 9.3 AND CMAKE_C_COMPILER_VERSION VERSION_LESS 10.2)
set(SLEEF_C_FLAGS "${SLEEF_C_FLAGS} -fno-shrink-wrap -fno-tree-vrp")
set(DFT_C_FLAGS "${DFT_C_FLAGS} -fno-shrink-wrap -fno-tree-vrp")
endif()
# FEATURE DETECTION
# Long double
option(SLEEF_DISABLE_LONG_DOUBLE "Disable long double" OFF)
option(SLEEF_ENFORCE_LONG_DOUBLE "Build fails if long double is not supported by the compiler" OFF)
if(NOT SLEEF_DISABLE_LONG_DOUBLE)
CHECK_TYPE_SIZE("long double" LD_SIZE)
if(LD_SIZE GREATER "9")
# This check is needed since an internal compiler error occurs with gcc 4.x
CHECK_C_SOURCE_COMPILES("
typedef long double vlongdouble __attribute__((vector_size(sizeof(long double)*2)));
vlongdouble vcast_vl_l(long double d) { return (vlongdouble) { d, d }; }
int main() { vlongdouble vld = vcast_vl_l(0);
}" COMPILER_SUPPORTS_LONG_DOUBLE)
endif()
else()
message(STATUS "Support for long double disabled by CMake option")
endif()
if (SLEEF_ENFORCE_LONG_DOUBLE AND NOT COMPILER_SUPPORTS_LONG_DOUBLE)
message(FATAL_ERROR "SLEEF_ENFORCE_LONG_DOUBLE is specified and that feature is disabled or not supported by the compiler")
endif()
# float128
option(SLEEF_DISABLE_FLOAT128 "Disable float128" OFF)
option(SLEEF_ENFORCE_FLOAT128 "Build fails if float128 is not supported by the compiler" OFF)
if(NOT SLEEF_DISABLE_FLOAT128)
CHECK_C_SOURCE_COMPILES("
int main() { __float128 r = 1;
}" COMPILER_SUPPORTS_FLOAT128)
else()
message(STATUS "Support for float128 disabled by CMake option")
endif()
if (SLEEF_ENFORCE_FLOAT128 AND NOT COMPILER_SUPPORTS_FLOAT128)
message(FATAL_ERROR "SLEEF_ENFORCE_FLOAT128 is specified and that feature is disabled or not supported by the compiler")
endif()
if(COMPILER_SUPPORTS_FLOAT128)
CHECK_C_SOURCE_COMPILES("
#include <quadmath.h>
int main() { __float128 r = 1;
}" COMPILER_SUPPORTS_QUADMATH)
endif()
# SSE2
option(SLEEF_DISABLE_SSE2 "Disable SSE2" OFF)
option(SLEEF_ENFORCE_SSE2 "Build fails if SSE2 is not supported by the compiler" OFF)
if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_SSE2)
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_SSE2}")
CHECK_C_SOURCE_COMPILES("
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
int main() {
__m128d r = _mm_mul_pd(_mm_set1_pd(1), _mm_set1_pd(2)); }"
COMPILER_SUPPORTS_SSE2)
endif()
if (SLEEF_ENFORCE_SSE2 AND NOT COMPILER_SUPPORTS_SSE2)
message(FATAL_ERROR "SLEEF_ENFORCE_SSE2 is specified and that feature is disabled or not supported by the compiler")
endif()
# SSE 4.1
option(SLEEF_DISABLE_SSE4 "Disable SSE4" OFF)
option(SLEEF_ENFORCE_SSE4 "Build fails if SSE4 is not supported by the compiler" OFF)
if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_SSE4)
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_SSE4}")
CHECK_C_SOURCE_COMPILES("
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
int main() {
__m128d r = _mm_floor_sd(_mm_set1_pd(1), _mm_set1_pd(2)); }"
COMPILER_SUPPORTS_SSE4)
endif()
if (SLEEF_ENFORCE_SSE4 AND NOT COMPILER_SUPPORTS_SSE4)
message(FATAL_ERROR "SLEEF_ENFORCE_SSE4 is specified and that feature is disabled or not supported by the compiler")
endif()
# AVX
option(SLEEF_ENFORCE_AVX "Disable AVX" OFF)
option(SLEEF_ENFORCE_AVX "Build fails if AVX is not supported by the compiler" OFF)
if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_AVX)
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_AVX}")
CHECK_C_SOURCE_COMPILES("
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
int main() {
__m256d r = _mm256_add_pd(_mm256_set1_pd(1), _mm256_set1_pd(2));
}" COMPILER_SUPPORTS_AVX)
endif()
if (SLEEF_ENFORCE_AVX AND NOT COMPILER_SUPPORTS_AVX)
message(FATAL_ERROR "SLEEF_ENFORCE_AVX is specified and that feature is disabled or not supported by the compiler")
endif()
# FMA4
option(SLEEF_DISABLE_FMA4 "Disable FMA4" OFF)
option(SLEEF_ENFORCE_FMA4 "Build fails if FMA4 is not supported by the compiler" OFF)
if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_FMA4)
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_FMA4}")
CHECK_C_SOURCE_COMPILES("
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
int main() {
__m256d r = _mm256_macc_pd(_mm256_set1_pd(1), _mm256_set1_pd(2), _mm256_set1_pd(3)); }"
COMPILER_SUPPORTS_FMA4)
endif()
if (SLEEF_ENFORCE_FMA4 AND NOT COMPILER_SUPPORTS_FMA4)
message(FATAL_ERROR "SLEEF_ENFORCE_FMA4 is specified and that feature is disabled or not supported by the compiler")
endif()
# AVX2
option(SLEEF_DISABLE_AVX2 "Disable AVX2" OFF)
option(SLEEF_ENFORCE_AVX2 "Build fails if AVX2 is not supported by the compiler" OFF)
if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_AVX2)
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_AVX2}")
CHECK_C_SOURCE_COMPILES("
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
int main() {
__m256i r = _mm256_abs_epi32(_mm256_set1_epi32(1)); }"
COMPILER_SUPPORTS_AVX2)
# AVX2 implies AVX2128
if(COMPILER_SUPPORTS_AVX2)
set(COMPILER_SUPPORTS_AVX2128 1)
endif()
endif()
if (SLEEF_ENFORCE_AVX2 AND NOT COMPILER_SUPPORTS_AVX2)
message(FATAL_ERROR "SLEEF_ENFORCE_AVX2 is specified and that feature is disabled or not supported by the compiler")
endif()
# AVX512F
option(SLEEF_DISABLE_AVX512F "Disable AVX512F" OFF)
option(SLEEF_ENFORCE_AVX512F "Build fails if AVX512F is not supported by the compiler" OFF)
if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_AVX512F)
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_AVX512F}")
CHECK_C_SOURCE_COMPILES("
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
__m512 addConstant(__m512 arg) {
return _mm512_add_ps(arg, _mm512_set1_ps(1.f));
}
int main() {
__m512i a = _mm512_set1_epi32(1);
__m256i ymm = _mm512_extracti64x4_epi64(a, 0);
__mmask16 m = _mm512_cmp_epi32_mask(a, a, _MM_CMPINT_EQ);
__m512i r = _mm512_andnot_si512(a, a); }"
COMPILER_SUPPORTS_AVX512F)
if (COMPILER_SUPPORTS_AVX512F)
set(COMPILER_SUPPORTS_AVX512FNOFMA 1)
endif()
endif()
if (SLEEF_ENFORCE_AVX512F AND NOT COMPILER_SUPPORTS_AVX512F)
message(FATAL_ERROR "SLEEF_ENFORCE_AVX512F is specified and that feature is disabled or not supported by the compiler")
endif()
# SVE
option(SLEEF_DISABLE_SVE "Disable SVE" OFF)
option(SLEEF_ENFORCE_SVE "Build fails if SVE is not supported by the compiler" OFF)
# Darwin does not support SVE yet (see issue #474),
# therefore we disable SVE on Darwin systems.
if(SLEEF_ARCH_AARCH64 AND NOT SLEEF_DISABLE_SVE AND NOT CMAKE_SYSTEM_NAME STREQUAL "Darwin")
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_SVE}")
CHECK_C_SOURCE_COMPILES("
#include <arm_sve.h>
int main() {
svint32_t r = svdup_n_s32(1); }"
COMPILER_SUPPORTS_SVE)
if(COMPILER_SUPPORTS_SVE)
set(COMPILER_SUPPORTS_SVENOFMA 1)
endif()
endif()
if (SLEEF_ENFORCE_SVE AND NOT COMPILER_SUPPORTS_SVE)
message(FATAL_ERROR "SLEEF_ENFORCE_SVE is specified and that feature is disabled or not supported by the compiler")
endif()
# VSX
option(SLEEF_DISABLE_VSX "Disable VSX" OFF)
option(SLEEF_ENFORCE_VSX "Build fails if VSX is not supported by the compiler" OFF)
if(SLEEF_ARCH_PPC64 AND NOT SLEEF_DISABLE_VSX)
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_VSX}")
CHECK_C_SOURCE_COMPILES("
#include <altivec.h>
#ifndef __LITTLE_ENDIAN__
#error \"Only VSX(ISA2.07) little-endian mode is supported \"
#endif
int main() {
vector double d;
vector unsigned char p = {
4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11
};
d = vec_perm(d, d, p);
}"
COMPILER_SUPPORTS_VSX)
if (COMPILER_SUPPORTS_VSX)
set(COMPILER_SUPPORTS_VSXNOFMA 1)
endif()
endif()
if (SLEEF_ENFORCE_VSX AND NOT COMPILER_SUPPORTS_VSX)
message(FATAL_ERROR "SLEEF_ENFORCE_VSX is specified and that feature is disabled or not supported by the compiler")
endif()
# VSX3
option(SLEEF_DISABLE_VSX3 "Disable VSX3" OFF)
option(SLEEF_ENFORCE_VSX3 "Build fails if VSX3 is not supported by the compiler" OFF)
if(SLEEF_ARCH_PPC64 AND NOT SLEEF_DISABLE_VSX3)
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_VSX3}")
CHECK_C_SOURCE_COMPILES("
#include <altivec.h>
#ifndef __LITTLE_ENDIAN__
#error \"Only VSX3 little-endian mode is supported \"
#endif
int main() {
static vector double d;
static vector unsigned long long a, b;
d = vec_insert_exp(a, b);
}"
COMPILER_SUPPORTS_VSX3)
if (COMPILER_SUPPORTS_VSX3)
set(COMPILER_SUPPORTS_VSX3NOFMA 1)
endif()
endif()
if (SLEEF_ENFORCE_VSX3 AND NOT COMPILER_SUPPORTS_VSX3)
message(FATAL_ERROR "SLEEF_ENFORCE_VSX3 is specified and that feature is disabled or not supported by the compiler")
endif()
# IBM Z
option(SLEEF_DISABLE_VXE "Disable VXE" OFF)
option(SLEEF_ENFORCE_VXE "Build fails if VXE is not supported by the compiler" OFF)
if(SLEEF_ARCH_S390X AND NOT SLEEF_DISABLE_VXE)
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_VXE}")
CHECK_C_SOURCE_COMPILES("
#include <vecintrin.h>
int main() {
__vector float d;
d = vec_sqrt(d);
}"
COMPILER_SUPPORTS_VXE)
if(COMPILER_SUPPORTS_VXE)
set(COMPILER_SUPPORTS_VXENOFMA 1)
endif()
endif()
if (SLEEF_ENFORCE_VXE AND NOT COMPILER_SUPPORTS_VXE)
message(FATAL_ERROR "SLEEF_ENFORCE_VXE is specified and that feature is disabled or not supported by the compiler")
endif()
# VXE2
option(SLEEF_DISABLE_VXE2 "Disable VXE2" OFF)
option(SLEEF_ENFORCE_VXE2 "Build fails if VXE2 is not supported by the compiler" OFF)
if(SLEEF_ARCH_S390X AND NOT SLEEF_DISABLE_VXE2)
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_VXE2}")
CHECK_C_SOURCE_COMPILES("
#include <vecintrin.h>
int main() {
__vector float d;
d = vec_sqrt(d);
}"
COMPILER_SUPPORTS_VXE2)
if(COMPILER_SUPPORTS_VXE2)
set(COMPILER_SUPPORTS_VXE2NOFMA 1)
endif()
endif()
if (SLEEF_ENFORCE_VXE2 AND NOT COMPILER_SUPPORTS_VXE2)
message(FATAL_ERROR "SLEEF_ENFORCE_VXE2 is specified and that feature is disabled or not supported by the compiler")
endif()
# RVVM1
option(SLEEF_DISABLE_RVVM1 "Disable RVVM1" OFF)
option(SLEEF_ENFORCE_RVVM1 "Build fails if RVVM1 is not supported by the compiler" OFF)
if(SLEEF_ARCH_RISCV64 AND NOT SLEEF_DISABLE_RVVM1)
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_RVVM1}")
CHECK_C_SOURCE_COMPILES("
#include <riscv_vector.h>
int main() {
vint32m1_t r = __riscv_vmv_v_x_i32m1(1, __riscv_vlenb() * 8 / 32); }"
COMPILER_SUPPORTS_RVVM1)
if(COMPILER_SUPPORTS_RVVM1)
set(COMPILER_SUPPORTS_RVVM1NOFMA 1)
endif()
endif()
if (SLEEF_ENFORCE_RVVM1 AND NOT COMPILER_SUPPORTS_RVVM1)
message(FATAL_ERROR "SLEEF_ENFORCE_RVVM1 is specified and that feature is disabled or not supported by the compiler")
endif()
# RVVM2
option(SLEEF_DISABLE_RVVM2 "Disable RVVM2" OFF)
option(SLEEF_ENFORCE_RVVM2 "Build fails if RVVM2 is not supported by the compiler" OFF)
if(SLEEF_ARCH_RISCV64 AND NOT SLEEF_DISABLE_RVVM2)
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_RVVM2}")
CHECK_C_SOURCE_COMPILES("
#include <riscv_vector.h>
int main() {
vint32m2_t r = __riscv_vmv_v_x_i32m2(1, 2 * __riscv_vlenb() * 8 / 32); }"
COMPILER_SUPPORTS_RVVM2)
if(COMPILER_SUPPORTS_RVVM2)
set(COMPILER_SUPPORTS_RVVM2NOFMA 1)
endif()
endif()
if (SLEEF_ENFORCE_RVVM2 AND NOT COMPILER_SUPPORTS_RVVM2)
message(FATAL_ERROR "SLEEF_ENFORCE_RVVM2 is specified and that feature is disabled or not supported by the compiler")
endif()
# CUDA
option(SLEEF_ENFORCE_CUDA "Build fails if CUDA is not supported" OFF)
if (SLEEF_ENFORCE_CUDA AND NOT CMAKE_CUDA_COMPILER)
message(FATAL_ERROR "SLEEF_ENFORCE_CUDA is specified and that feature is disabled or not supported by the compiler")
endif()
# OpenMP
option(SLEEF_DISABLE_OPENMP "Disable OPENMP" OFF)
option(SLEEF_ENFORCE_OPENMP "Build fails if OPENMP is not supported by the compiler" OFF)
if(NOT SLEEF_DISABLE_OPENMP)
find_package(OpenMP)
# Check if compilation with OpenMP really succeeds
# It might not succeed even though find_package(OpenMP) succeeds.
if(OPENMP_FOUND)
set (CMAKE_REQUIRED_FLAGS "${OpenMP_C_FLAGS}")
CHECK_C_SOURCE_COMPILES("
#include <stdio.h>
int main() {
int i;
#pragma omp parallel for
for(i=0;i < 10;i++) { putchar(0); }
}"
COMPILER_SUPPORTS_OPENMP)
CHECK_C_SOURCE_COMPILES("
#pragma omp declare simd notinbranch
double func(double x) { return x + 1; }
double a[1024];
int main() {
#pragma omp parallel for simd
for (int i = 0; i < 1024; i++) a[i] = func(a[i]);
}
"
COMPILER_SUPPORTS_OMP_SIMD)
endif(OPENMP_FOUND)
else()
message(STATUS "Support for OpenMP disabled by CMake option")
endif()
if (SLEEF_ENFORCE_OPENMP AND NOT COMPILER_SUPPORTS_OPENMP)
message(FATAL_ERROR "SLEEF_ENFORCE_OPENMP is specified and that feature is disabled or not supported by the compiler")
endif()
# Weak aliases
CHECK_C_SOURCE_COMPILES("
#if defined(__CYGWIN__)
#define EXPORT __stdcall __declspec(dllexport)
#else
#define EXPORT
#endif
EXPORT int f(int a) {
return a + 2;
}
EXPORT int g(int a) __attribute__((weak, alias(\"f\")));
int main(void) {
return g(2);
}"
COMPILER_SUPPORTS_WEAK_ALIASES)
if (COMPILER_SUPPORTS_WEAK_ALIASES AND
NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm" AND
NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64" AND
NOT SLEEF_CLANG_ON_WINDOWS AND
NOT MINGW AND SLEEF_BUILD_GNUABI_LIBS)
set(ENABLE_GNUABI ${COMPILER_SUPPORTS_WEAK_ALIASES})
endif()
# Built-in math functions
CHECK_C_SOURCE_COMPILES("
int main(void) {
double a = __builtin_sqrt (2);
float b = __builtin_sqrtf(2);
}"
COMPILER_SUPPORTS_BUILTIN_MATH)
# SYS_getrandom
CHECK_C_SOURCE_COMPILES("
#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/random.h>
int main(void) {
int i;
syscall(SYS_getrandom, &i, sizeof(i), 0);
}"
COMPILER_SUPPORTS_SYS_GETRANDOM)
#
# Reset used flags
set(CMAKE_REQUIRED_FLAGS)
set(CMAKE_REQUIRED_LIBRARIES)
# Save the default C flags
set(ORG_CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
##
# Check if sde64 command is available
find_program(SDE_COMMAND sde64)
if (NOT SDE_COMMAND)
find_program(SDE_COMMAND sde)
endif()
# Check if armie command is available
find_program(ARMIE_COMMAND armie)
if (NOT SVE_VECTOR_BITS)
set(SVE_VECTOR_BITS 128)
endif()
#
find_program(FILECHECK_COMMAND NAMES FileCheck FileCheck-11 FileCheck-10 FileCheck-9)
#
find_program(SED_COMMAND sed)
##
if(SLEEF_SHOW_ERROR_LOG)
if (EXISTS ${PROJECT_BINARY_DIR}/CMakeFiles/CMakeError.log)
file(READ ${PROJECT_BINARY_DIR}/CMakeFiles/CMakeError.log FILE_CONTENT)
message("")
message("")
message("====== Content of CMakeError.log ======")
message("")
message("${FILE_CONTENT}")
message("")
message("======== End of CMakeError.log ========")
message("")
message("")
endif()
endif(SLEEF_SHOW_ERROR_LOG)
if (MSVC OR SLEEF_CLANG_ON_WINDOWS)
set(COMPILER_SUPPORTS_OPENMP FALSE) # At this time, OpenMP is not supported on MSVC
endif()
##
# Set common definitions
if (NOT BUILD_SHARED_LIBS)
set(COMMON_TARGET_DEFINITIONS SLEEF_STATIC_LIBS=1)
set(SLEEF_STATIC_LIBS 1)
endif()
if (COMPILER_SUPPORTS_WEAK_ALIASES)
set(COMMON_TARGET_DEFINITIONS ${COMMON_TARGET_DEFINITIONS} ENABLE_ALIAS=1)
endif()
if (COMPILER_SUPPORTS_SYS_GETRANDOM)
set(COMMON_TARGET_DEFINITIONS ${COMMON_TARGET_DEFINITIONS} ENABLE_SYS_getrandom=1)
endif()

View File

@@ -0,0 +1,23 @@
Boost Software License - Version 1.0 - August 17th, 2003
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

View File

@@ -0,0 +1,221 @@
# SLEEF
![Github Actions](https://github.com/shibatch/sleef/actions/workflows/build_and_test.yml/badge.svg?event=push&branch=master)
[![DOI:10.1109/TPDS.2019.2960333](http://img.shields.io/badge/DOI-10.1109/TPDS.2019.2960333-blue.svg)](https://ieeexplore.ieee.org/document/8936472)
[![License](https://img.shields.io/badge/License-Boost_1.0-lightblue.svg)](https://www.boost.org/LICENSE_1_0.txt)
![CMake](https://img.shields.io/badge/cmake-v3.18+-yellow.svg)
[![Spack](https://img.shields.io/spack/v/sleef)](https://spack.readthedocs.io/en/v0.16.2/package_list.html#sleef)
[![SourceForge Downloads](https://img.shields.io/sourceforge/dt/sleef)](https://sourceforge.net/projects/sleef/)
SLEEF is a library that implements vectorized versions of C standard math functions. This library also includes DFT subroutines.
- **Web Page:** [https://sleef.org/][webpage_url]
- **Sources:** [https://github.com/shibatch/sleef][repo_url]
## Supported environment
### Test matrix
The following table summarises the currently supported vector extensions, compilers and OSes.
:green_circle: : Tested extensively in CI.
:yellow_circle: : Tested partially in CI.
:x: : Currently failing some tests in CI.
:white_circle: : Not tested in CI. Might have passed tests in previous CI framework.
[This issue](https://github.com/shibatch/sleef/issues/481) tracks progress on improving test coverage.
Compilation of SLEEF on previously supported environments might still be safe; we just cannot verify it yet.
<table>
<tr>
<th colspan="2" rowspan="2"></th>
<th colspan="9">OS/Compiler</th>
</tr>
<tr>
<th colspan="3">Linux</th>
<th colspan="2">macOS</th>
<th colspan="4">Windows</th>
</tr>
<tr>
<th>Arch.</th>
<th>Vector Extensions</th>
<th>gcc</th><th>llvm</th><th>icc</th>
<th>gcc</th><th>llvm</th>
<th>gcc</th><th>llvm-gnu</th><th>llvm-msvc</th><th>msvc</th>
</tr>
<tr align="center"><th>x86_64</th><th>SSE2, SSE4,<br>AVX, AVX2, AVX512F</th>
<td>:green_circle:</td><td>:green_circle:</td><td>:white_circle:</td>
<td>:white_circle:</td><td>:green_circle:</td>
<td>:white_circle:</td><td>:yellow_circle:</td><td>:white_circle:</td><td>:white_circle:</td>
</tr>
<tr align="center"><th>x86 32bit<br>(i386)</th><th>SSE</th>
<td>:white_circle:</td><td>:white_circle:</td><td>:white_circle:</td>
<td colspan="2">N/A</td>
<td>:white_circle:</td><td>:white_circle:</td><td>:white_circle:</td><td>:white_circle:</td>
</tr>
<tr align="center"><th>AArch64<br>(arm)</th><th>Neon, SVE</th>
<td>:green_circle:</td><td>:green_circle:</td><td>N/A</td>
<td colspan="1">N/A</td><td>:green_circle:</td>
<td colspan="1">N/A</td><td>:white_circle:</td><td>:white_circle:</td><td>:white_circle:</td>
</tr>
<tr align="center"><th>AArch32<br>(armhf)</th><th>NEON</th>
<td>:green_circle:</td><td>:green_circle:</td><td>N/A</td>
<td colspan="2">N/A</td>
<td colspan="4">N/A</td>
</tr>
<tr align="center"><th>PowerPC<br>(ppc64el)</th><th>VSX, VSX3</th>
<td>:green_circle:</td><td>:green_circle:</td><td>N/A</td>
<td colspan="2">N/A</td>
<td colspan="4">N/A</td>
</tr>
<tr align="center"><th>IBM/Z<br>(s390x)</th><th>VXE, VXE2</th>
<td>:green_circle:</td><td>:green_circle:</td><td>N/A</td>
<td colspan="2">N/A</td>
<td colspan="4">N/A</td>
</tr>
<tr align="center"><th>RISC-V<br>(riscv64)</th><th>RVV1, RVV2</th>
<td>N/A (14+)</td><td>:green_circle:</td><td>N/A</td>
<td colspan="2">N/A</td>
<td colspan="4">N/A</td>
</tr>
</table>
### Component support
The above table is valid for libm in single, double and quadruple precision, as well as for the fast Discrete Fourier Transform (DFT).
Generation of inline headers is also supported for most vector extensions.
LTO is not tested in CI yet, except on Windows.
### Compiler support
Results are displayed for gcc 11 and llvm 17, the compiler versions used in CI tests with GitHub Actions.
Older versions should also be supported, while newer ones are either not tested or have known issues.
Some compiler versions simply do not support certain vector extensions; for instance, SVE is only supported from gcc version 9 onwards.
Similarly, the RISC-V interface in SLEEF is based on version 1.0 of the intrinsics, which is only supported from llvm version 17 and gcc version 14 onwards.
Toolchain files provide some information on supported compiler versions.
### OS support
Only Linux distributions and macOS are fully tested in CI and thus officially supported.
Building SLEEF for Windows on x86 machines has been officially supported (:white_circle:) since 3.5.1;
however, it is only partially tested due to [known limitations of the test suite with MinGW or MSYS2](https://github.com/shibatch/sleef/issues/544).
As a result, tests for Windows on x86 only include the DFT for now (other tests are disabled in the build system),
but all components are built.
Support for iOS and Android on AArch64 is only preliminary.
SVE is not supported on Darwin-based systems and is therefore automatically disabled by SLEEF on Darwin.
### More on supported environment
Refer to our web page for [more on supported environment][supported_env_url].
## Install SLEEF dependencies
The library itself does not have any additional dependencies.
However some tests require:
- libssl and libcrypto, which can be provided by installing openssl.
- libm, libgmp and libmpfr.
- libfftw3.
These tests can be disabled if necessary.
## How to build SLEEF
We recommend relying on CMake as much as possible in the build process to ensure portability.
**CMake 3.18+** is the minimum required.
1. Check out the source code from our GitHub repository
```
git clone https://github.com/shibatch/sleef
```
2. Make a separate directory to create an out-of-source build
```
cd sleef && mkdir build
```
3. Run cmake to configure the project
```
cmake -S . -B build
```
By default this will generate shared libraries. In order to generate static libraries, pass the option `-DBUILD_SHARED_LIBS=OFF`.
For more verbose output add the option `-DSLEEF_SHOW_CONFIG=ON`; a combined invocation is sketched after step 5 below.
4. Run cmake to build the project
```
cmake --build build -j --clean-first
```
5. Run the tests using ctest
```
ctest --test-dir build -j
```
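Putting these pieces together, a static build with verbose configuration output can be configured, built and tested as follows (a sketch that simply combines the commands and options above):
```
cmake -S . -B build -DBUILD_SHARED_LIBS=OFF -DSLEEF_SHOW_CONFIG=ON
cmake --build build -j --clean-first
ctest --test-dir build -j
```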
For more detailed build instructions please refer to the [dedicated section on CMake](./docs/build-with-cmake.md) or to [our web page][build_info_url].
## Install SLEEF
### From source
Assuming the build instructions above were followed:
6. Install to a specified directory `<prefix>`
```
cmake --install build --prefix=<prefix>
```
### Using Spack
SLEEF can also be directly installed using Spack.
```
spack install sleef@master
```
### Uninstall
In order to uninstall the SLEEF library and headers, run
```
sudo xargs rm -v < build/install_manifest.txt
```
## License
The software is distributed under the Boost Software License, Version 1.0.
See accompanying file [LICENSE.txt](./LICENSE.txt) or copy at [http://www.boost.org/LICENSE_1_0.txt][license_url].
Contributions to this project are accepted under the same license.
Copyright &copy; 2010-2024 SLEEF Project, Naoki Shibata and contributors.<br/>
<!-- Repository links -->
[webpage_url]: https://sleef.org/
[build_info_url]: https://sleef.org/compile.xhtml
[supported_env_url]: https://sleef.org/index.xhtml#environment
[repo_url]: https://github.com/shibatch/sleef
[repo_license_url]: https://github.com/shibatch/sleef/blob/main/LICENSE.txt
[license_url]: http://www.boost.org/LICENSE_1_0.txt

View File

@@ -0,0 +1,71 @@
#ifndef __SLEEFDFT_H__
#define __SLEEFDFT_H__
#ifdef __cplusplus
extern "C"
{
#endif
#include <stdlib.h>
#include <stdint.h>
#define SLEEF_MODE_FORWARD (0 << 0)
#define SLEEF_MODE_BACKWARD (1 << 0)
#define SLEEF_MODE_COMPLEX (0 << 1)
#define SLEEF_MODE_REAL (1 << 1)
#define SLEEF_MODE_ALT (1 << 2)
#define SLEEF_MODE_FFTWCOMPAT (1 << 3)
#define SLEEF_MODE_DEBUG (1 << 10)
#define SLEEF_MODE_VERBOSE (1 << 11)
#define SLEEF_MODE_NO_MT (1 << 12)
#define SLEEF_MODE_ESTIMATE (1 << 20)
#define SLEEF_MODE_MEASURE (2 << 20)
#if (defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) || defined(_MSC_VER)) && !defined(SLEEF_STATIC_LIBS)
#ifdef IMPORT_IS_EXPORT
#define IMPORT __declspec(dllexport)
#else // #ifdef IMPORT_IS_EXPORT
#define IMPORT __declspec(dllimport)
#if (defined(_MSC_VER))
#pragma comment(lib,"sleefdft.lib")
#endif // #if (defined(_MSC_VER))
#endif // #ifdef IMPORT_IS_EXPORT
#else // #if (defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) || defined(_MSC_VER)) && !defined(SLEEF_STATIC_LIBS)
#define IMPORT
#endif // #if (defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) || defined(_MSC_VER)) && !defined(SLEEF_STATIC_LIBS)
IMPORT struct SleefDFT *SleefDFT_double_init1d(uint32_t n, const double *in, double *out, uint64_t mode);
IMPORT struct SleefDFT *SleefDFT_double_init2d(uint32_t n, uint32_t m, const double *in, double *out, uint64_t mode);
IMPORT void SleefDFT_double_execute(struct SleefDFT *ptr, const double *in, double *out);
IMPORT struct SleefDFT *SleefDFT_float_init1d(uint32_t n, const float *in, float *out, uint64_t mode);
IMPORT struct SleefDFT *SleefDFT_float_init2d(uint32_t n, uint32_t m, const float *in, float *out, uint64_t mode);
IMPORT void SleefDFT_float_execute(struct SleefDFT *ptr, const float *in, float *out);
IMPORT void SleefDFT_dispose(struct SleefDFT *ptr);
IMPORT void SleefDFT_setPath(struct SleefDFT *ptr, char *pathStr);
//
IMPORT void SleefDFT_setPlanFilePath(const char *path, const char *arch, uint64_t mode);
#define SLEEF_PLAN_AUTOMATIC 0
#define SLEEF_PLAN_READONLY (1 << 0)
#define SLEEF_PLAN_RESET (1 << 1)
#define SLEEF_PLAN_BUILDALLPLAN (1 << 2)
#define SLEEF_PLAN_NOLOCK (1 << 3)
#define SLEEF_PLAN_MEASURE (1 << 29)
#define SLEEF_PLAN_REFERTOENVVAR (1 << 30)
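/*
  Minimal usage sketch (illustrative; error handling is omitted, and the
  interleaved re/im layout of the 2*N-element buffers is an assumption
  rather than something documented in this header):

    #define N 256
    double in[2 * N], out[2 * N];
    struct SleefDFT *p = SleefDFT_double_init1d(N, in, out,
        SLEEF_MODE_FORWARD | SLEEF_MODE_ESTIMATE);
    SleefDFT_double_execute(p, in, out);
    SleefDFT_dispose(p);
*/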
#undef IMPORT
#ifdef __cplusplus
}
#endif
#endif // #ifndef __SLEEFDFT_H__

View File

@@ -0,0 +1,11 @@
// Configuration of @PROJECT_NAME@ /////////////////////////////////////////////
#ifndef SLEEF_CONFIG_H
#define SLEEF_CONFIG_H
#define SLEEF_VERSION_MAJOR @SLEEF_VERSION_MAJOR@
#define SLEEF_VERSION_MINOR @SLEEF_VERSION_MINOR@
#cmakedefine SLEEF_STATIC_LIBS
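// At configure time, the #cmakedefine line above expands to either
// "#define SLEEF_STATIC_LIBS" or "/* #undef SLEEF_STATIC_LIBS */",
// depending on whether the corresponding CMake variable is set.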
#endif // SLEEF_CONFIG_H

View File

@@ -0,0 +1 @@
include("${CMAKE_CURRENT_LIST_DIR}/sleefTargets.cmake")

View File

@@ -0,0 +1,22 @@
include_directories("common")
include_directories("arch")
add_subdirectory("libm")
if (SLEEF_BUILD_TESTS AND NOT MINGW)
add_subdirectory("libm-tester")
endif()
add_subdirectory("common")
if (SLEEF_BUILD_DFT)
add_subdirectory("dft")
if (SLEEF_BUILD_TESTS)
add_subdirectory("dft-tester")
endif()
endif()
if (SLEEF_BUILD_QUAD)
add_subdirectory("quad")
if (SLEEF_BUILD_TESTS AND NOT MINGW)
add_subdirectory("quad-tester")
endif()
endif()

View File

@@ -0,0 +1,837 @@
/*********************************************************************/
/* Copyright ARM Ltd. 2010 - 2024. */
/* Distributed under the Boost Software License, Version 1.0. */
/* (See accompanying file LICENSE.txt or copy at */
/* http://www.boost.org/LICENSE_1_0.txt) */
/*********************************************************************/
#if !defined(__ARM_NEON) && !defined(SLEEF_GENHEADER)
#error Please specify advsimd flags.
#endif
#if !defined(SLEEF_GENHEADER)
#include <arm_neon.h>
#include <stdint.h>
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)
#define ENABLE_DP
//@#define ENABLE_DP
#define LOG2VECTLENDP 1
//@#define LOG2VECTLENDP 1
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)
#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP 2
//@#define LOG2VECTLENSP 2
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)
#if CONFIG == 1
#define ENABLE_FMA_DP
//@#define ENABLE_FMA_DP
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP
#endif
#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT
#define ISANAME "AArch64 AdvSIMD"
// Mask definition
typedef uint32x4_t vmask;
typedef uint32x4_t vopmask;
// Single precision definitions
typedef float32x4_t vfloat;
typedef int32x4_t vint2;
// Double precision definitions
typedef float64x2_t vdouble;
typedef int32x2_t vint;
typedef int64x2_t vint64;
typedef uint64x2_t vuint64;
typedef struct {
vmask x, y;
} vquad;
typedef vquad vargquad;
#define DFTPRIORITY 10
static INLINE int vavailability_i(int name) { return 3; }
static INLINE void vprefetch_v_p(const void *ptr) { }
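// Reduce an opmask to a scalar: AND the low and high 64-bit halves together,
// then take the pairwise minimum of the two remaining 32-bit lanes; the
// result is nonzero only if every lane of the mask was all-ones.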
static INLINE VECTOR_CC int vtestallones_i_vo32(vopmask g) {
uint32x2_t x0 = vand_u32(vget_low_u32(g), vget_high_u32(g));
uint32x2_t x1 = vpmin_u32(x0, x0);
return vget_lane_u32(x1, 0);
}
static INLINE VECTOR_CC int vtestallones_i_vo64(vopmask g) {
uint32x2_t x0 = vand_u32(vget_low_u32(g), vget_high_u32(g));
uint32x2_t x1 = vpmin_u32(x0, x0);
return vget_lane_u32(x1, 0);
}
// Vector load / store
static INLINE VECTOR_CC vdouble vload_vd_p(const double *ptr) { return vld1q_f64(ptr); }
static INLINE VECTOR_CC vdouble vloadu_vd_p(const double *ptr) { return vld1q_f64(ptr); }
static INLINE VECTOR_CC void vstore_v_p_vd(double *ptr, vdouble v) { vst1q_f64(ptr, v); }
static INLINE VECTOR_CC void vstoreu_v_p_vd(double *ptr, vdouble v) { vst1q_f64(ptr, v); }
static INLINE VECTOR_CC vfloat vload_vf_p(const float *ptr) { return vld1q_f32(ptr); }
static INLINE VECTOR_CC vfloat vloadu_vf_p(const float *ptr) { return vld1q_f32(ptr); }
static INLINE VECTOR_CC void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }
static INLINE VECTOR_CC void vstoreu_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }
static INLINE VECTOR_CC vint2 vloadu_vi2_p(int32_t *p) { return vld1q_s32(p); }
static INLINE VECTOR_CC void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
static INLINE VECTOR_CC vint vloadu_vi_p(int32_t *p) { return vld1_s32(p); }
static INLINE VECTOR_CC void vstoreu_v_p_vi(int32_t *p, vint v) { vst1_s32(p, v); }
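// Gathers: AdvSIMD has no gather instruction, so indexed loads are emulated
// with per-lane scalar loads.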
static INLINE VECTOR_CC vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
return ((vdouble) { ptr[vget_lane_s32(vi, 0)], ptr[vget_lane_s32(vi, 1)]} );
}
static INLINE VECTOR_CC vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
return ((vfloat) {
ptr[vgetq_lane_s32(vi2, 0)],
ptr[vgetq_lane_s32(vi2, 1)],
ptr[vgetq_lane_s32(vi2, 2)],
ptr[vgetq_lane_s32(vi2, 3)]
});
}
// Basic logical operations for mask
static INLINE VECTOR_CC vmask vand_vm_vm_vm(vmask x, vmask y) { return vandq_u32(x, y); }
static INLINE VECTOR_CC vmask vandnot_vm_vm_vm(vmask x, vmask y) {
return vbicq_u32(y, x);
}
static INLINE VECTOR_CC vmask vor_vm_vm_vm(vmask x, vmask y) { return vorrq_u32(x, y); }
static INLINE VECTOR_CC vmask vxor_vm_vm_vm(vmask x, vmask y) { return veorq_u32(x, y); }
// Mask <--> single precision reinterpret
static INLINE VECTOR_CC vmask vreinterpret_vm_vf(vfloat vf) {
return vreinterpretq_u32_f32(vf);
}
static INLINE VECTOR_CC vfloat vreinterpret_vf_vm(vmask vm) {
return vreinterpretq_f32_u32(vm);
}
static INLINE VECTOR_CC vint2 vcast_vi2_vm(vmask vm) { return vreinterpretq_s32_u32(vm); }
static INLINE VECTOR_CC vmask vcast_vm_vi2(vint2 vi) { return vreinterpretq_u32_s32(vi); }
// Mask <--> double precision reinterpret
static INLINE VECTOR_CC vmask vreinterpret_vm_vd(vdouble vd) {
return vreinterpretq_u32_f64(vd);
}
static INLINE VECTOR_CC vdouble vreinterpret_vd_vm(vmask vm) {
return vreinterpretq_f64_u32(vm);
}
static INLINE VECTOR_CC vfloat vreinterpret_vf_vi2(vint2 vm) {
return vreinterpretq_f32_s32(vm);
}
static INLINE VECTOR_CC vint2 vreinterpret_vi2_vf(vfloat vf) {
return vreinterpretq_s32_f32(vf);
}
/****************************************/
/* Single precision FP operations */
/****************************************/
// Broadcast
static INLINE VECTOR_CC vfloat vcast_vf_f(float f) { return vdupq_n_f32(f); }
// Add, Sub, Mul
static INLINE VECTOR_CC vfloat vadd_vf_vf_vf(vfloat x, vfloat y) {
return vaddq_f32(x, y);
}
static INLINE VECTOR_CC vfloat vsub_vf_vf_vf(vfloat x, vfloat y) {
return vsubq_f32(x, y);
}
static INLINE VECTOR_CC vfloat vmul_vf_vf_vf(vfloat x, vfloat y) {
return vmulq_f32(x, y);
}
// |x|, -x
static INLINE VECTOR_CC vfloat vabs_vf_vf(vfloat f) { return vabsq_f32(f); }
static INLINE VECTOR_CC vfloat vneg_vf_vf(vfloat f) { return vnegq_f32(f); }
#if CONFIG == 1
// Multiply accumulate: z = z + x * y
static INLINE VECTOR_CC vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {
return vfmaq_f32(z, x, y);
}
// Multiply subtract: z = z - x * y
static INLINE VECTOR_CC vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {
return vfmsq_f32(z, x, y);
}
// Multiply subtract: z = x * y - z
static INLINE VECTOR_CC vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {
return vneg_vf_vf(vfmsq_f32(z, x, y));
}
#else
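// Without FMA (CONFIG != 1), multiply-accumulate is emulated with a separate
// multiply and add, so results may differ in the last bit from the fused path.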
static INLINE VECTOR_CC vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
static INLINE VECTOR_CC vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }
static INLINE VECTOR_CC vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
#endif
static INLINE VECTOR_CC vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z + x * y
return vfmaq_f32(z, x, y);
}
static INLINE VECTOR_CC vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z - x * y
return vfmsq_f32(z, x, y);
}
static INLINE VECTOR_CC vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // x * y - z
return vfma_vf_vf_vf_vf(x, y, vneg_vf_vf(z));
}
// Reciprocal 1/x, Division, Square root
static INLINE VECTOR_CC vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) {
#ifndef SLEEF_ENABLE_ALTDIV
return vdivq_f32(n, d);
#else
// Finite numbers (including denormals) only; gives a mostly correctly rounded result
float32x4_t t, u, x, y;
uint32x4_t i0, i1;
i0 = vandq_u32(vreinterpretq_u32_f32(n), vdupq_n_u32(0x7c000000));
i1 = vandq_u32(vreinterpretq_u32_f32(d), vdupq_n_u32(0x7c000000));
i0 = vsubq_u32(vdupq_n_u32(0x7d000000), vshrq_n_u32(vaddq_u32(i0, i1), 1));
t = vreinterpretq_f32_u32(i0);
y = vmulq_f32(d, t);
x = vmulq_f32(n, t);
t = vrecpeq_f32(y);
t = vmulq_f32(t, vrecpsq_f32(y, t));
t = vmulq_f32(t, vrecpsq_f32(y, t));
u = vmulq_f32(x, t);
u = vfmaq_f32(u, vfmsq_f32(x, y, u), t);
return u;
#endif
}
static INLINE VECTOR_CC vfloat vrec_vf_vf(vfloat d) {
#ifndef SLEEF_ENABLE_ALTDIV
return vdiv_vf_vf_vf(vcast_vf_f(1.0f), d);
#else
return vbslq_f32(vceqq_f32(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)),
vcast_vf_f(0), vdiv_vf_vf_vf(vcast_vf_f(1.0f), d));
#endif
}
static INLINE VECTOR_CC vfloat vsqrt_vf_vf(vfloat d) {
#ifndef SLEEF_ENABLE_ALTSQRT
return vsqrtq_f32(d);
#else
// Gives a correctly rounded result for the whole input range
vfloat w, x, y, z;
y = vrsqrteq_f32(d);
x = vmul_vf_vf_vf(d, y); w = vmul_vf_vf_vf(vcast_vf_f(0.5), y);
y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(0.5));
x = vfma_vf_vf_vf_vf(x, y, x); w = vfma_vf_vf_vf_vf(w, y, w);
y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(1.5)); w = vadd_vf_vf_vf(w, w);
w = vmul_vf_vf_vf(w, y);
x = vmul_vf_vf_vf(w, d);
y = vfmapn_vf_vf_vf_vf(w, d, x); z = vfmanp_vf_vf_vf_vf(w, x, vcast_vf_f(1));
z = vfmanp_vf_vf_vf_vf(w, y, z); w = vmul_vf_vf_vf(vcast_vf_f(0.5), x);
w = vfma_vf_vf_vf_vf(w, z, y);
w = vadd_vf_vf_vf(w, x);
return vbslq_f32(vorrq_u32(vceqq_f32(d, vcast_vf_f(0)),
vceqq_f32(d, vcast_vf_f(SLEEF_INFINITYf))), d, w);
#endif
}
// max, min
static INLINE VECTOR_CC vfloat vmax_vf_vf_vf(vfloat x, vfloat y) {
return vmaxq_f32(x, y);
}
static INLINE VECTOR_CC vfloat vmin_vf_vf_vf(vfloat x, vfloat y) {
return vminq_f32(x, y);
}
// Comparisons
static INLINE VECTOR_CC vmask veq_vm_vf_vf(vfloat x, vfloat y) { return vceqq_f32(x, y); }
static INLINE VECTOR_CC vmask vneq_vm_vf_vf(vfloat x, vfloat y) {
return vmvnq_u32(vceqq_f32(x, y));
}
static INLINE VECTOR_CC vmask vlt_vm_vf_vf(vfloat x, vfloat y) { return vcltq_f32(x, y); }
static INLINE VECTOR_CC vmask vle_vm_vf_vf(vfloat x, vfloat y) { return vcleq_f32(x, y); }
static INLINE VECTOR_CC vmask vgt_vm_vf_vf(vfloat x, vfloat y) { return vcgtq_f32(x, y); }
static INLINE VECTOR_CC vmask vge_vm_vf_vf(vfloat x, vfloat y) { return vcgeq_f32(x, y); }
// Conditional select
static INLINE VECTOR_CC vfloat vsel_vf_vm_vf_vf(vmask mask, vfloat x, vfloat y) {
return vbslq_f32(mask, x, y);
}
// int <--> float conversions
static INLINE VECTOR_CC vint2 vtruncate_vi2_vf(vfloat vf) { return vcvtq_s32_f32(vf); }
static INLINE VECTOR_CC vfloat vcast_vf_vi2(vint2 vi) { return vcvtq_f32_s32(vi); }
static INLINE VECTOR_CC vint2 vcast_vi2_i(int i) { return vdupq_n_s32(i); }
static INLINE VECTOR_CC vint2 vrint_vi2_vf(vfloat d) {
return vcvtq_s32_f32(vrndnq_f32(d));
}
/***************************************/
/* Single precision integer operations */
/***************************************/
// Add, Sub, Neg (-x)
static INLINE VECTOR_CC vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) {
return vaddq_s32(x, y);
}
static INLINE VECTOR_CC vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) {
return vsubq_s32(x, y);
}
static INLINE VECTOR_CC vint2 vneg_vi2_vi2(vint2 e) { return vnegq_s32(e); }
// Logical operations
static INLINE VECTOR_CC vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) {
return vandq_s32(x, y);
}
static INLINE VECTOR_CC vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) {
return vbicq_s32(y, x);
}
static INLINE VECTOR_CC vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) {
return vorrq_s32(x, y);
}
static INLINE VECTOR_CC vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) {
return veorq_s32(x, y);
}
// Shifts
#define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c)
//@#define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c)
#define vsrl_vi2_vi2_i(x, c) \
vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c))
//@#define vsrl_vi2_vi2_i(x, c) vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c))
#define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c)
//@#define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c)
#define vsra_vi_vi_i(x, c) vshr_n_s32(x, c)
//@#define vsra_vi_vi_i(x, c) vshr_n_s32(x, c)
#define vsll_vi_vi_i(x, c) vshl_n_s32(x, c)
//@#define vsll_vi_vi_i(x, c) vshl_n_s32(x, c)
#define vsrl_vi_vi_i(x, c) \
vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(x), c))
//@#define vsrl_vi_vi_i(x, c) vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(x), c))
// Comparison returning masks
static INLINE VECTOR_CC vmask veq_vm_vi2_vi2(vint2 x, vint2 y) { return vceqq_s32(x, y); }
static INLINE VECTOR_CC vmask vgt_vm_vi2_vi2(vint2 x, vint2 y) { return vcgtq_s32(x, y); }
// Comparison returning integers
static INLINE VECTOR_CC vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) {
return vreinterpretq_s32_u32(vcgtq_s32(x, y));
}
static INLINE VECTOR_CC vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
return vreinterpretq_s32_u32(vceqq_s32(x, y));
}
// Conditional select
static INLINE VECTOR_CC vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) {
return vbslq_s32(m, x, y);
}
/* -------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------- */
/****************************************/
/* Double precision FP operations */
/****************************************/
// Broadcast
static INLINE VECTOR_CC vdouble vcast_vd_d(double f) { return vdupq_n_f64(f); }
// Add, Sub, Mul
static INLINE VECTOR_CC vdouble vadd_vd_vd_vd(vdouble x, vdouble y) {
return vaddq_f64(x, y);
}
static INLINE VECTOR_CC vdouble vsub_vd_vd_vd(vdouble x, vdouble y) {
return vsubq_f64(x, y);
}
static INLINE VECTOR_CC vdouble vmul_vd_vd_vd(vdouble x, vdouble y) {
return vmulq_f64(x, y);
}
// |x|, -x
static INLINE VECTOR_CC vdouble vabs_vd_vd(vdouble f) { return vabsq_f64(f); }
static INLINE VECTOR_CC vdouble vneg_vd_vd(vdouble f) { return vnegq_f64(f); }
// max, min
static INLINE VECTOR_CC vdouble vmax_vd_vd_vd(vdouble x, vdouble y) {
return vmaxq_f64(x, y);
}
static INLINE VECTOR_CC vdouble vmin_vd_vd_vd(vdouble x, vdouble y) {
return vminq_f64(x, y);
}
#if CONFIG == 1
// Multiply accumulate: z = z + x * y
static INLINE VECTOR_CC vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
return vfmaq_f64(z, x, y);
}
static INLINE VECTOR_CC vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
return vfmsq_f64(z, x, y);
}
// Multiply subtract: z = x * y - z
static INLINE VECTOR_CC vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
return vneg_vd_vd(vfmsq_f64(z, x, y));
}
#else
static INLINE VECTOR_CC vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE VECTOR_CC vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
#endif
static INLINE VECTOR_CC vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // z + x * y
return vfmaq_f64(z, x, y);
}
static INLINE VECTOR_CC vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // z - x * y
return vfmsq_f64(z, x, y);
}
static INLINE VECTOR_CC vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // x * y - z
return vfma_vd_vd_vd_vd(x, y, vneg_vd_vd(z));
}
// Reciprocal 1/x, Division, Square root
static INLINE VECTOR_CC vdouble vdiv_vd_vd_vd(vdouble n, vdouble d) {
#ifndef SLEEF_ENABLE_ALTDIV
return vdivq_f64(n, d);
#else
// Finite numbers (including denormals) only; gives a mostly correctly rounded result
float64x2_t t, u, x, y;
uint64x2_t i0, i1;
i0 = vandq_u64(vreinterpretq_u64_f64(n), vdupq_n_u64(0x7fc0000000000000L));
i1 = vandq_u64(vreinterpretq_u64_f64(d), vdupq_n_u64(0x7fc0000000000000L));
i0 = vsubq_u64(vdupq_n_u64(0x7fd0000000000000L), vshrq_n_u64(vaddq_u64(i0, i1), 1));
t = vreinterpretq_f64_u64(i0);
y = vmulq_f64(d, t);
x = vmulq_f64(n, t);
t = vrecpeq_f64(y);
t = vmulq_f64(t, vrecpsq_f64(y, t));
t = vmulq_f64(t, vrecpsq_f64(y, t));
t = vmulq_f64(t, vrecpsq_f64(y, t));
u = vmulq_f64(x, t);
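// Final residual correction with fused ops: u = u + (x - y*u) * t.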
u = vfmaq_f64(u, vfmsq_f64(x, y, u), t);
return u;
#endif
}
static INLINE VECTOR_CC vdouble vrec_vd_vd(vdouble d) {
#ifndef SLEEF_ENABLE_ALTDIV
return vdiv_vd_vd_vd(vcast_vd_d(1.0), d);
#else
return vbslq_f64(vceqq_f64(vabs_vd_vd(d), vcast_vd_d(SLEEF_INFINITY)),
vcast_vd_d(0), vdiv_vd_vd_vd(vcast_vd_d(1.0), d));
#endif
}
static INLINE VECTOR_CC vdouble vsqrt_vd_vd(vdouble d) {
#ifndef SLEEF_ENABLE_ALTSQRT
return vsqrtq_f64(d);
#else
// Gives a correctly rounded result over the whole input range
vdouble w, x, y, z;
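// Refine a reciprocal-sqrt estimate by Newton-Raphson (x -> sqrt(d), w -> 0.5/sqrt(d)), then apply a fused residual correction; 0 and infinity are passed through unchanged below.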
y = vrsqrteq_f64(d);
x = vmul_vd_vd_vd(d, y); w = vmul_vd_vd_vd(vcast_vd_d(0.5), y);
y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
x = vfma_vd_vd_vd_vd(x, y, x); w = vfma_vd_vd_vd_vd(w, y, w);
y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
x = vfma_vd_vd_vd_vd(x, y, x); w = vfma_vd_vd_vd_vd(w, y, w);
y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(1.5)); w = vadd_vd_vd_vd(w, w);
w = vmul_vd_vd_vd(w, y);
x = vmul_vd_vd_vd(w, d);
y = vfmapn_vd_vd_vd_vd(w, d, x); z = vfmanp_vd_vd_vd_vd(w, x, vcast_vd_d(1));
z = vfmanp_vd_vd_vd_vd(w, y, z); w = vmul_vd_vd_vd(vcast_vd_d(0.5), x);
w = vfma_vd_vd_vd_vd(w, z, y);
w = vadd_vd_vd_vd(w, x);
return vbslq_f64(vorrq_u64(vceqq_f64(d, vcast_vd_d(0)),
vceqq_f64(d, vcast_vd_d(SLEEF_INFINITY))), d, w);
#endif
}
/* Comparisons */
static INLINE VECTOR_CC vopmask veq_vo_vd_vd(vdouble x, vdouble y) {
return vreinterpretq_u32_u64(vceqq_f64(x, y));
}
static INLINE VECTOR_CC vopmask vneq_vo_vd_vd(vdouble x, vdouble y) {
return vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(x, y)));
}
static INLINE VECTOR_CC vopmask vlt_vo_vd_vd(vdouble x, vdouble y) {
return vreinterpretq_u32_u64(vcltq_f64(x, y));
}
static INLINE VECTOR_CC vopmask vgt_vo_vd_vd(vdouble x, vdouble y) {
return vreinterpretq_u32_u64(vcgtq_f64(x, y));
}
static INLINE VECTOR_CC vopmask vle_vo_vd_vd(vdouble x, vdouble y) {
return vreinterpretq_u32_u64(vcleq_f64(x, y));
}
static INLINE VECTOR_CC vopmask vge_vo_vd_vd(vdouble x, vdouble y) {
return vreinterpretq_u32_u64(vcgeq_f64(x, y));
}
// Conditional select
static INLINE VECTOR_CC vdouble vsel_vd_vo_vd_vd(vopmask mask, vdouble x, vdouble y) {
return vbslq_f64(vreinterpretq_u64_u32(mask), x, y);
}
#if 1
static INLINE CONST VECTOR_CC vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
}
static INLINE VECTOR_CC vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}
static INLINE VECTOR_CC vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}
#else
// This implementation is slower on current CPU models (as of May 2017).
// I (Naoki Shibata) expect that future CPU models with hardware similar to the Super Shuffle Engine will run this implementation faster.
static INLINE CONST VECTOR_CC vdouble vsel_vd_vo_d_d(vopmask o, double d0, double d1) {
uint8x16_t idx = vbslq_u8(vreinterpretq_u8_u32(o), (uint8x16_t) { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 },
(uint8x16_t) { 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15 });
uint8x16_t tab = (uint8x16_t) (float64x2_t) { d0, d1 };
return (vdouble) vqtbl1q_u8(tab, idx);
}
static INLINE VECTOR_CC vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
uint8x16_t idx = vbslq_u8(vreinterpretq_u8_u32(o0), (uint8x16_t) { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 },
vbslq_u8(vreinterpretq_u8_u32(o1), (uint8x16_t) { 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15 },
vbslq_u8(vreinterpretq_u8_u32(o2), (uint8x16_t) { 16, 17, 18, 19, 20, 21, 22, 23, 16, 17, 18, 19, 20, 21, 22, 23 },
(uint8x16_t) { 24, 25, 26, 27, 28, 29, 30, 31, 24, 25, 26, 27, 28, 29, 30, 31 })));
uint8x16x2_t tab = { { (uint8x16_t) (float64x2_t) { d0, d1 }, (uint8x16_t) (float64x2_t) { d2, d3 } } };
return (vdouble) vqtbl2q_u8(tab, idx);
}
static INLINE VECTOR_CC vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
return vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o1, d0, d1, d2, d2);
}
#endif
static INLINE VECTOR_CC vdouble vrint_vd_vd(vdouble d) { return vrndnq_f64(d); }
static INLINE VECTOR_CC vfloat vrint_vf_vf(vfloat d) { return vrndnq_f32(d); }
/****************************************/
/* int <--> FP conversions */
/****************************************/
static INLINE VECTOR_CC vint vtruncate_vi_vd(vdouble vf) {
return vmovn_s64(vcvtq_s64_f64(vf));
}
static INLINE VECTOR_CC vdouble vcast_vd_vi(vint vi) {
return vcvtq_f64_s64(vmovl_s32(vi));
}
static INLINE VECTOR_CC vint vcast_vi_i(int i) { return vdup_n_s32(i); }
static INLINE VECTOR_CC vint vrint_vi_vd(vdouble d) {
return vqmovn_s64(vcvtq_s64_f64(vrndnq_f64(d)));
}
/***************************************/
/* Integer operations */
/***************************************/
// Add, Sub, Neg (-x)
static INLINE VECTOR_CC vint vadd_vi_vi_vi(vint x, vint y) { return vadd_s32(x, y); }
static INLINE VECTOR_CC vint vsub_vi_vi_vi(vint x, vint y) { return vsub_s32(x, y); }
static INLINE VECTOR_CC vint vneg_vi_vi(vint e) { return vneg_s32(e); }
// Logical operations
static INLINE VECTOR_CC vint vand_vi_vi_vi(vint x, vint y) { return vand_s32(x, y); }
static INLINE VECTOR_CC vint vandnot_vi_vi_vi(vint x, vint y) { return vbic_s32(y, x); }
static INLINE VECTOR_CC vint vor_vi_vi_vi(vint x, vint y) { return vorr_s32(x, y); }
static INLINE VECTOR_CC vint vxor_vi_vi_vi(vint x, vint y) { return veor_s32(x, y); }
// Comparison returning masks
static INLINE VECTOR_CC vopmask veq_vo_vi_vi(vint x, vint y) {
return vcombine_u32(vceq_s32(x, y), vdup_n_u32(0));
}
// Conditional select
static INLINE VECTOR_CC vint vsel_vi_vm_vi_vi(vmask m, vint x, vint y) {
return vbsl_s32(vget_low_u32(m), x, y);
}
/***************************************/
/* Predicates */
/***************************************/
static INLINE VECTOR_CC vopmask visinf_vo_vd(vdouble d) {
const float64x2_t inf = vdupq_n_f64(SLEEF_INFINITY);
const float64x2_t neg_inf = vdupq_n_f64(-SLEEF_INFINITY);
uint64x2_t cmp = vorrq_u64(vceqq_f64(d, inf), vceqq_f64(d, neg_inf));
return vreinterpretq_u32_u64(cmp);
}
static INLINE VECTOR_CC vopmask visnan_vo_vd(vdouble d) {
return vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(d, d)));
}
static INLINE VECTOR_CC vopmask vispinf_vo_vd(vdouble d) {
return vreinterpretq_u32_u64(vceqq_f64(d, vdupq_n_f64(SLEEF_INFINITY)));
}
static INLINE VECTOR_CC vopmask visminf_vo_vd(vdouble d) {
return vreinterpretq_u32_u64(vceqq_f64(d, vdupq_n_f64(-SLEEF_INFINITY)));
}
static INLINE VECTOR_CC vfloat vsel_vf_vo_vf_vf(vopmask mask, vfloat x, vfloat y) {
return vbslq_f32(mask, x, y);
}
static INLINE CONST VECTOR_CC vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}
static INLINE VECTOR_CC vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}
static INLINE VECTOR_CC vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}
static INLINE VECTOR_CC vopmask veq_vo_vf_vf(vfloat x, vfloat y) {
return vceqq_f32(x, y);
}
static INLINE VECTOR_CC vopmask vneq_vo_vf_vf(vfloat x, vfloat y) {
return vmvnq_u32(vceqq_f32(x, y));
}
static INLINE VECTOR_CC vopmask vlt_vo_vf_vf(vfloat x, vfloat y) {
return vcltq_f32(x, y);
}
static INLINE VECTOR_CC vopmask vle_vo_vf_vf(vfloat x, vfloat y) {
return vcleq_f32(x, y);
}
static INLINE VECTOR_CC vopmask vgt_vo_vf_vf(vfloat x, vfloat y) {
return vcgtq_f32(x, y);
}
static INLINE VECTOR_CC vopmask vge_vo_vf_vf(vfloat x, vfloat y) {
return vcgeq_f32(x, y);
}
static INLINE VECTOR_CC vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) {
return vceqq_s32(x, y);
}
static INLINE VECTOR_CC vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) {
return vcgtq_s32(x, y);
}
static INLINE VECTOR_CC vopmask vgt_vo_vi_vi(vint x, vint y) {
return vcombine_u32(vcgt_s32(x, y), vdup_n_u32(0));
}
static INLINE VECTOR_CC vopmask visinf_vo_vf(vfloat d) {
return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf));
}
static INLINE VECTOR_CC vopmask vispinf_vo_vf(vfloat d) {
return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf));
}
static INLINE VECTOR_CC vopmask visminf_vo_vf(vfloat d) {
return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf));
}
static INLINE VECTOR_CC vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
static INLINE VECTOR_CC vopmask vcast_vo32_vo64(vopmask m) {
return vuzpq_u32(m, m).val[0];
}
static INLINE VECTOR_CC vopmask vcast_vo64_vo32(vopmask m) {
return vzipq_u32(m, m).val[0];
}
static INLINE VECTOR_CC vopmask vcast_vo_i(int i) {
return vreinterpretq_u32_u64(vdupq_n_u64((uint64_t)(i ? -1 : 0)));
}
static INLINE VECTOR_CC vopmask vand_vo_vo_vo(vopmask x, vopmask y) {
return vandq_u32(x, y);
}
static INLINE VECTOR_CC vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) {
return vbicq_u32(y, x);
}
static INLINE VECTOR_CC vopmask vor_vo_vo_vo(vopmask x, vopmask y) {
return vorrq_u32(x, y);
}
static INLINE VECTOR_CC vopmask vxor_vo_vo_vo(vopmask x, vopmask y) {
return veorq_u32(x, y);
}
static INLINE VECTOR_CC vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
return vbslq_s32(m, x, y);
}
static INLINE VECTOR_CC vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) {
return vandq_s32(vreinterpretq_s32_u32(x), y);
}
static INLINE VECTOR_CC vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) {
return vbicq_s32(y, vreinterpretq_s32_u32(x));
}
static INLINE VECTOR_CC vint vandnot_vi_vo_vi(vopmask x, vint y) {
return vbic_s32(y, vget_low_s32(vreinterpretq_s32_u32(x)));
}
static INLINE VECTOR_CC vmask vand_vm_vo32_vm(vopmask x, vmask y) {
return vandq_u32(x, y);
}
static INLINE VECTOR_CC vmask vand_vm_vo64_vm(vopmask x, vmask y) {
return vandq_u32(x, y);
}
static INLINE VECTOR_CC vmask vandnot_vm_vo32_vm(vopmask x, vmask y) {
return vbicq_u32(y, x);
}
static INLINE VECTOR_CC vmask vandnot_vm_vo64_vm(vopmask x, vmask y) {
return vbicq_u32(y, x);
}
static INLINE VECTOR_CC vmask vor_vm_vo32_vm(vopmask x, vmask y) {
return vorrq_u32(x, y);
}
static INLINE VECTOR_CC vmask vor_vm_vo64_vm(vopmask x, vmask y) {
return vorrq_u32(x, y);
}
static INLINE VECTOR_CC vmask vxor_vm_vo32_vm(vopmask x, vmask y) {
return veorq_u32(x, y);
}
static INLINE VECTOR_CC vfloat vtruncate_vf_vf(vfloat vd) { return vrndq_f32(vd); }
static INLINE VECTOR_CC vmask vcast_vm_i_i(int i0, int i1) {
return vreinterpretq_u32_u64(vdupq_n_u64((0xffffffff & (uint64_t)i1) | (((uint64_t)i0) << 32)));
}
static INLINE vmask vcast_vm_i64(int64_t i) {
return vreinterpretq_u32_u64(vdupq_n_u64((uint64_t)i));
}
static INLINE vmask vcast_vm_u64(uint64_t i) {
return vreinterpretq_u32_u64(vdupq_n_u64(i));
}
static INLINE VECTOR_CC vopmask veq64_vo_vm_vm(vmask x, vmask y) {
return vreinterpretq_u32_u64(vceqq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));
}
static INLINE VECTOR_CC vmask vadd64_vm_vm_vm(vmask x, vmask y) {
return vreinterpretq_u32_s64(vaddq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));
}
static INLINE VECTOR_CC vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
return vbsl_s32(vget_low_u32(m), x, y);
}
// Logical operations
static INLINE VECTOR_CC vint vand_vi_vo_vi(vopmask x, vint y) {
return vand_s32(vreinterpret_s32_u32(vget_low_u32(x)), y);
}
static INLINE VECTOR_CC vmask vcastu_vm_vi(vint vi) {
return vrev64q_u32(vreinterpretq_u32_u64(vmovl_u32(vreinterpret_u32_s32(vi))));
}
static INLINE VECTOR_CC vint vcastu_vi_vm(vmask vi2) {
return vreinterpret_s32_u32(vmovn_u64(vreinterpretq_u64_u32(vrev64q_u32(vi2))));
}
static INLINE VECTOR_CC vdouble vtruncate_vd_vd(vdouble vd) { return vrndq_f64(vd); }
//
#define PNMASK ((vdouble) { +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0 })
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
static INLINE VECTOR_CC vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
static INLINE VECTOR_CC vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
static INLINE VECTOR_CC vfloat vposneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)PNMASKf); }
static INLINE VECTOR_CC vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)NPMASKf); }
static INLINE VECTOR_CC vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
static INLINE VECTOR_CC vfloat vsubadd_vf_vf_vf(vfloat d0, vfloat d1) { return vadd_vf_vf_vf(d0, vnegpos_vf_vf(d1)); }
static INLINE VECTOR_CC vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE VECTOR_CC vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
static INLINE VECTOR_CC vdouble vrev21_vd_vd(vdouble d0) { return (float64x2_t)vcombine_u64(vget_high_u64((uint64x2_t)d0), vget_low_u64((uint64x2_t)d0)); }
static INLINE VECTOR_CC vdouble vreva2_vd_vd(vdouble vd) { return vd; }
static INLINE VECTOR_CC void vstream_v_p_vd(double *ptr, vdouble v) { vstore_v_p_vd(ptr, v); }
static INLINE VECTOR_CC void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
static INLINE VECTOR_CC void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
static INLINE VECTOR_CC vfloat vrev21_vf_vf(vfloat d0) { return vrev64q_f32(d0); }
static INLINE VECTOR_CC vfloat vreva2_vf_vf(vfloat d0) { return vcombine_f32(vget_high_f32(d0), vget_low_f32(d0)); }
static INLINE VECTOR_CC void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); }
static INLINE VECTOR_CC void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));
vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));
}
static INLINE VECTOR_CC void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));
vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));
}
//
static vquad loadu_vq_p(void *p) {
vquad vq;
memcpy(&vq, p, VECTLENDP * 16);
return vq;
}
static INLINE vquad cast_vq_aq(vargquad aq) {
vquad vq;
memcpy(&vq, &aq, VECTLENDP * 16);
return vq;
}
static INLINE vargquad cast_aq_vq(vquad vq) {
vargquad aq;
memcpy(&aq, &vq, VECTLENDP * 16);
return aq;
}
static INLINE int vtestallzeros_i_vo64(vopmask g) {
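// OR-reduce the lanes; for a canonical opmask (each lane 0 or ~0), the bitwise NOT is nonzero exactly when every lane was zero.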
uint32x2_t x0 = vorr_u32(vget_low_u32(g), vget_high_u32(g));
uint32x2_t x1 = vpmax_u32(x0, x0);
return ~vget_lane_u32(x1, 0);
}
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask m, vmask x, vmask y) { return vbslq_u32(m, x, y); }
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) {
return vreinterpretq_u32_s64(vsubq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));
}
static INLINE vmask vneg64_vm_vm(vmask x) {
return vreinterpretq_u32_s64(vnegq_s64(vreinterpretq_s64_u32(x)));
}
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
return vreinterpretq_u32_u64(vcgtq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));
}
#define vsll64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshlq_n_u64(vreinterpretq_u64_u32(x), c))
//@#define vsll64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshlq_n_u64(vreinterpretq_u64_u32(x), c))
#define vsrl64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(x), c))
//@#define vsrl64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(x), c))
static INLINE vmask vcast_vm_vi(vint vi) {
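// Sign-extend each 32-bit lane to 64 bits: zero-extend first, then OR all-ones into the upper half of the negative lanes.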
vmask m = vreinterpretq_u32_u64(vmovl_u32(vreinterpret_u32_s32(vi)));
return vor_vm_vm_vm(vcastu_vm_vi(vreinterpret_s32_u32(vget_low_u32(vgt_vo_vi_vi(vcast_vi_i(0), vi)))), m);
}
static INLINE vint vcast_vi_vm(vmask vm) { return vreinterpret_s32_u32(vmovn_u64(vreinterpretq_u64_u32(vm))); }
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return vreinterpretq_u32_s64(v); }
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return vreinterpretq_s64_u32(m); }
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return vreinterpretq_u32_u64(v); }
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return vreinterpretq_u64_u32(m); }
View File
@@ -0,0 +1,638 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#if CONFIG == 1
#if !defined(__AVX__) && !defined(SLEEF_GENHEADER)
#error Please specify -mavx.
#endif
#elif CONFIG == 4
#if (!defined(__AVX__) || !defined(__FMA4__)) && !defined(SLEEF_GENHEADER)
#error Please specify -mavx and -mfma4.
#endif
#else
#error CONFIG macro invalid or not defined
#endif
#define ENABLE_DP
//@#define ENABLE_DP
#define LOG2VECTLENDP 2
//@#define LOG2VECTLENDP 2
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)
#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)
#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT
#if !defined(SLEEF_GENHEADER)
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#include <stdint.h>
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)
typedef __m256i vmask;
typedef __m256i vopmask;
typedef __m256d vdouble;
typedef __m128i vint;
typedef __m256 vfloat;
typedef struct { __m128i x, y; } vint2;
typedef __m256i vint64;
typedef __m256i vuint64;
typedef struct {
vmask x, y;
} vquad;
typedef vquad vargquad;
//
#if !defined(SLEEF_GENHEADER)
#ifndef __SLEEF_H__
void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx);
#endif
static INLINE int cpuSupportsAVX() {
int32_t reg[4];
Sleef_x86CpuID(reg, 1, 0);
return (reg[2] & (1 << 28)) != 0;
}
static INLINE int cpuSupportsFMA4() {
int32_t reg[4];
Sleef_x86CpuID(reg, 0x80000001, 0);
return (reg[2] & (1 << 16)) != 0;
}
#if CONFIG == 4 && defined(__AVX__) && defined(__FMA4__)
static INLINE int vavailability_i(int name) {
int d = cpuSupportsAVX() && cpuSupportsFMA4();
return d ? 3 : 0;
}
#define ENABLE_FMA_DP
#define ENABLE_FMA_SP
#define ISANAME "AVX + AMD FMA4"
#define DFTPRIORITY 21
#else
static INLINE int vavailability_i(int name) {
int d = cpuSupportsAVX();
return d ? 3 : 0;
}
#define ISANAME "AVX"
#define DFTPRIORITY 20
#endif
#endif // #if !defined(SLEEF_GENHEADER)
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }
static INLINE int vtestallones_i_vo32(vopmask g) {
return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1)));
}
static INLINE int vtestallones_i_vo64(vopmask g) {
return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1)));
}
//
static INLINE vdouble vcast_vd_d(double d) { return _mm256_set1_pd(d); }
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm256_castpd_si256(vd); }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm256_castsi256_pd(vm); }
//
static vint2 vloadu_vi2_p(int32_t *p) {
vint2 r;
r.x = _mm_loadu_si128((__m128i *) p );
r.y = _mm_loadu_si128((__m128i *)(p + 4));
return r;
}
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) {
_mm_storeu_si128((__m128i *) p , v.x);
_mm_storeu_si128((__m128i *)(p + 4), v.y);
}
static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }
static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); }
//
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vcast_vo32_vo64(vopmask o) {
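// Narrow 64-bit mask lanes to 32 bits: keep -1.0 where the mask is set, then convert the doubles to int32 lanes of -1/0 (AVX has no cross-lane integer shuffle).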
return _mm256_castsi128_si256(_mm256_cvtpd_epi32(_mm256_and_pd(vreinterpret_vd_vm(o), _mm256_set1_pd(-1.0))));
}
static INLINE vopmask vcast_vo64_vo32(vopmask o) {
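// Widen 32-bit mask lanes to 64 bits: convert the int32 lanes to doubles and rebuild the mask by comparing against -1.0.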
return vreinterpret_vm_vd(_mm256_cmp_pd(_mm256_cvtepi32_pd(_mm256_castsi256_si128(o)), _mm256_set1_pd(-1.0), _CMP_EQ_OQ));
}
static INLINE vopmask vcast_vo_i(int i) { return _mm256_set1_epi64x(i ? -1 : 0); }
//
static INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); }
static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); }
static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm256_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm256_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); }
static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); }
static INLINE vmask vcastu_vm_vi(vint vi) {
__m256i m = _mm256_castsi128_si256(_mm_and_si128(_mm_shuffle_epi32(vi, 0x40), _mm_set_epi32(-1, 0, -1, 0)));
return _mm256_insertf128_si256(m, _mm_and_si128(_mm_shuffle_epi32(vi, 0xc8), _mm_set_epi32(-1, 0, -1, 0)), 1);
}
static INLINE vint vcastu_vi_vm(vmask vi) {
return _mm_or_si128(_mm_and_si128(_mm_shuffle_epi32(_mm256_castsi256_si128(vi) , 0x0d), _mm_set_epi32( 0, 0, -1, -1)),
_mm_and_si128(_mm_shuffle_epi32(_mm256_extractf128_si256(vi, 1), 0xd0), _mm_set_epi32(-1, -1, 0, 0)));
}
static INLINE vmask vcast_vm_i_i(int i0, int i1) {
return _mm256_set_epi32(i0, i1, i0, i1, i0, i1, i0, i1);
}
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
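// AVX has no 64-bit integer compare: x == y exactly when (x ^ y) ^ bits(1.0) reinterprets to 1.0.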
return vreinterpret_vm_vd(_mm256_cmp_pd(vreinterpret_vd_vm(vxor_vm_vm_vm(vxor_vm_vm_vm(x, y), vreinterpret_vm_vd(_mm256_set1_pd(1.0)))), _mm256_set1_pd(1.0), _CMP_EQ_OQ));
}
static INLINE vmask vcast_vm_i64(int64_t i) { return _mm256_set1_epi64x(i); }
static INLINE vmask vcast_vm_u64(uint64_t i) { return _mm256_set1_epi64x((uint64_t)i); }
//
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm256_div_pd(x, y); }
static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm256_set1_pd(1), x); }
static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); }
static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm256_andnot_pd(_mm256_set1_pd(-0.0), d); }
static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm256_xor_pd(_mm256_set1_pd(-0.0), d); }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm256_max_pd(x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm256_min_pd(x, y); }
#if CONFIG == 1
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(z, vmul_vd_vd_vd(x, y)); }
#else
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_msub_pd(x, y, z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmacc_pd(x, y, z); }
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); }
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); }
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_msub_pd(x, y, z); }
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmacc_pd(x, y, z); }
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmsub_pd(x, y, z); }
#endif
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_EQ_OQ)); }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_NEQ_UQ)); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LT_OQ)); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LE_OQ)); }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GT_OQ)); }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GE_OQ)); }
//
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); }
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); }
static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); }
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); }
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); }
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); }
static INLINE vint vandnot_vi_vo_vi(vopmask m, vint y) { return _mm_andnot_si128(_mm256_castsi256_si128(m), y); }
static INLINE vint vand_vi_vo_vi(vopmask m, vint y) { return _mm_and_si128(_mm256_castsi256_si128(m), y); }
static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); }
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); }
static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); }
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpeq_epi32(x, y)); }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpgt_epi32(x, y)); }
static INLINE vint vsel_vi_vo_vi_vi(vopmask o, vint x, vint y) { return _mm_blendv_epi8(y, x, _mm256_castsi256_si128(o)); }
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return _mm256_blendv_pd(y, x, _mm256_castsi256_pd(o)); }
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
}
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}
static INLINE vopmask visinf_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm256_cmp_pd(vabs_vd_vd(d), _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));
}
static INLINE vopmask vispinf_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));
}
static INLINE vopmask visminf_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(-SLEEF_INFINITY), _CMP_EQ_OQ));
}
static INLINE vopmask visnan_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm256_cmp_pd(d, d, _CMP_NEQ_UQ));
}
static INLINE vdouble vload_vd_p(const double *ptr) { return _mm256_load_pd(ptr); }
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(ptr); }
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
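// AVX provides no gather instruction; emulate it with scalar indexed loads.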
int a[VECTLENDP];
vstoreu_v_p_vi(a, vi);
return _mm256_set_pd(ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);
}
#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
static INLINE double vcast_d_vd(vdouble v) {
double a[VECTLENDP];
vstoreu_v_p_vd(a, v);
return a[0];
}
#endif
//
static INLINE vint2 vcast_vi2_vm(vmask vm) {
vint2 r;
r.x = _mm256_castsi256_si128(vm);
r.y = _mm256_extractf128_si256(vm, 1);
return r;
}
static INLINE vmask vcast_vm_vi2(vint2 vi) {
vmask m = _mm256_castsi128_si256(vi.x);
m = _mm256_insertf128_si256(m, vi.y, 1);
return m;
}
static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvtps_epi32(vf)); }
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvttps_epi32(vf)); }
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm256_cvtepi32_ps(vcast_vm_vi2(vi)); }
static INLINE vfloat vcast_vf_f(float f) { return _mm256_set1_ps(f); }
static INLINE vint2 vcast_vi2_i(int i) { vint2 r; r.x = r.y = _mm_set1_epi32(i); return r; }
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm256_castps_si256(vf); }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm256_castsi256_ps(vm); }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return vreinterpret_vf_vm(vcast_vm_vi2(vi)); }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return vcast_vi2_vm(vreinterpret_vm_vf(vf)); }
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); }
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); }
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); }
#if CONFIG == 1
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
#else
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmacc_ps(x, y, z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_msub_ps(x, y, z); }
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); }
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); }
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_msub_ps(x, y, z); }
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmacc_ps(x, y, z); }
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmsub_ps(x, y, z); }
#endif
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_EQ_OQ)); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_NEQ_UQ)); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LT_OQ)); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LE_OQ)); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GT_OQ)); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GE_OQ)); }
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) {
vint2 vi = { _mm_add_epi32(x.x, y.x), _mm_add_epi32(x.y, y.y) };
return vi;
}
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) {
vint2 vi = { _mm_sub_epi32(x.x, y.x), _mm_sub_epi32(x.y, y.y) };
return vi;
}
static INLINE vint2 vneg_vi2_vi2(vint2 e) {
vint2 vi = { _mm_sub_epi32(_mm_set1_epi32(0), e.x), _mm_sub_epi32(_mm_set1_epi32(0), e.y) };
return vi;
}
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) {
vint2 vi = { _mm_and_si128(x.x, y.x), _mm_and_si128(x.y, y.y) };
return vi;
}
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) {
vint2 vi = { _mm_andnot_si128(x.x, y.x), _mm_andnot_si128(x.y, y.y) };
return vi;
}
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) {
vint2 vi = { _mm_or_si128(x.x, y.x), _mm_or_si128(x.y, y.y) };
return vi;
}
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) {
vint2 vi = { _mm_xor_si128(x.x, y.x), _mm_xor_si128(x.y, y.y) };
return vi;
}
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) {
vint2 vi = { _mm_slli_epi32(x.x, c), _mm_slli_epi32(x.y, c) };
return vi;
}
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) {
vint2 vi = { _mm_srli_epi32(x.x, c), _mm_srli_epi32(x.y, c) };
return vi;
}
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) {
vint2 vi = { _mm_srai_epi32(x.x, c), _mm_srai_epi32(x.y, c) };
return vi;
}
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) {
vint2 r;
r.x = _mm_cmpeq_epi32(x.x, y.x);
r.y = _mm_cmpeq_epi32(x.y, y.y);
return vcast_vm_vi2(r);
}
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) {
vint2 r;
r.x = _mm_cmpgt_epi32(x.x, y.x);
r.y = _mm_cmpgt_epi32(x.y, y.y);
return vcast_vm_vi2(r);
}
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
vint2 r;
r.x = _mm_cmpeq_epi32(x.x, y.x);
r.y = _mm_cmpeq_epi32(x.y, y.y);
return r;
}
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) {
vint2 r;
r.x = _mm_cmpgt_epi32(x.x, y.x);
r.y = _mm_cmpgt_epi32(x.y, y.y);
return r;
}
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
vint2 n = vcast_vi2_vm(m);
vint2 r = { _mm_blendv_epi8(y.x, x.x, n.x), _mm_blendv_epi8(y.y, x.y, n.y) };
return r;
}
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) {
vint2 ix = vcast_vi2_vm(x), iy = vcast_vi2_vm(y), iz;
iz.x = _mm_add_epi64(ix.x, iy.x);
iz.y = _mm_add_epi64(ix.y, iy.y);
return vcast_vm_vi2(iz);
}
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return _mm256_blendv_ps(y, x, _mm256_castsi256_ps(o)); }
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}
static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
//
static INLINE vfloat vload_vf_p(const float *ptr) { return _mm256_load_ps(ptr); }
static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
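// AVX provides no gather instruction; emulate it with scalar indexed loads.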
int a[VECTLENSP];
vstoreu_v_p_vi2(a, vi2);
return _mm256_set_ps(ptr[a[7]], ptr[a[6]], ptr[a[5]], ptr[a[4]],
ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);
}
#ifdef _MSC_VER
// This function is needed when debugging on MSVC.
static INLINE float vcast_f_vf(vfloat v) {
float a[VECTLENSP];
vstoreu_v_p_vf(a, v);
return a[0];
}
#endif
//
#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0 })
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })
static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_addsub_pd(x, y); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_addsub_ps(x, y); }
#if CONFIG == 1
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
#else
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }
#endif
static INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm256_shuffle_pd(d0, d0, (0 << 3) | (1 << 2) | (0 << 1) | (1 << 0)); }
static INLINE vdouble vreva2_vd_vd(vdouble d0) { d0 = _mm256_permute2f128_pd(d0, d0, 1); return _mm256_shuffle_pd(d0, d0, (1 << 3) | (0 << 2) | (1 << 1) | (0 << 0)); }
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm256_stream_pd(ptr, v); }
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
_mm_store_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));
_mm_store_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));
}
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
_mm_stream_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));
_mm_stream_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));
}
//
static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm256_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); }
static INLINE vfloat vreva2_vf_vf(vfloat d0) { d0 = _mm256_permute2f128_ps(d0, d0, 1); return _mm256_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); }
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm256_stream_ps(ptr, v); }
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
_mm_storel_pd((double *)(ptr+(offset + step * 0)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));
_mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));
_mm_storel_pd((double *)(ptr+(offset + step * 2)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));
_mm_storeh_pd((double *)(ptr+(offset + step * 3)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));
}
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
//
static vquad loadu_vq_p(void *p) {
vquad vq;
memcpy(&vq, p, VECTLENDP * 16);
return vq;
}
static INLINE vquad cast_vq_aq(vargquad aq) {
vquad vq;
memcpy(&vq, &aq, VECTLENDP * 16);
return vq;
}
static INLINE vargquad cast_aq_vq(vquad vq) {
vargquad aq;
memcpy(&aq, &vq, VECTLENDP * 16);
return aq;
}
static INLINE int vtestallzeros_i_vo64(vopmask g) {
return _mm_movemask_epi8(_mm_or_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))) == 0;
}
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) {
return vreinterpret_vm_vd(_mm256_blendv_pd(vreinterpret_vd_vm(y), vreinterpret_vd_vm(x), vreinterpret_vd_vm(o)));
}
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) {
__m128i xh = _mm256_extractf128_si256(x, 1), xl = _mm256_extractf128_si256(x, 0);
__m128i yh = _mm256_extractf128_si256(y, 1), yl = _mm256_extractf128_si256(y, 0);
vmask r = _mm256_castsi128_si256(_mm_sub_epi64(xl, yl));
return _mm256_insertf128_si256(r, _mm_sub_epi64(xh, yh), 1);
}
static INLINE vmask vneg64_vm_vm(vmask x) { return vsub64_vm_vm_vm(vcast_vm_i_i(0, 0), x); }
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
__m128i xh = _mm256_extractf128_si256(x, 1), xl = _mm256_extractf128_si256(x, 0);
__m128i yh = _mm256_extractf128_si256(y, 1), yl = _mm256_extractf128_si256(y, 0);
vmask r = _mm256_castsi128_si256(_mm_cmpgt_epi64(xl, yl));
return _mm256_insertf128_si256(r, _mm_cmpgt_epi64(xh, yh), 1);
}
#define vsll64_vm_vm_i(x, c) \
_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi64(_mm256_extractf128_si256(x, 0), c)), \
_mm_slli_epi64(_mm256_extractf128_si256(x, 1), c), 1)
#define vsrl64_vm_vm_i(x, c) \
_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_srli_epi64(_mm256_extractf128_si256(x, 0), c)), \
_mm_srli_epi64(_mm256_extractf128_si256(x, 1), c), 1)
//@#define vsll64_vm_vm_i(x, c) _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi64(_mm256_extractf128_si256(x, 0), c)), _mm_slli_epi64(_mm256_extractf128_si256(x, 1), c), 1)
//@#define vsrl64_vm_vm_i(x, c) _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_srli_epi64(_mm256_extractf128_si256(x, 0), c)), _mm_srli_epi64(_mm256_extractf128_si256(x, 1), c), 1)
static INLINE vmask vcast_vm_vi(vint vi) {
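// Sign-extend each 32-bit lane to 64 bits: spread the lanes into the low halves of 64-bit slots, then OR all-ones into the high halves of the negative lanes.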
vint vi0 = _mm_and_si128(_mm_shuffle_epi32(vi, (1 << 4) | (1 << 6)), _mm_set_epi32(0, -1, 0, -1));
vint vi1 = _mm_and_si128(_mm_shuffle_epi32(vi, (2 << 0) | (2 << 2) | (3 << 4) | (3 << 6)), _mm_set_epi32(0, -1, 0, -1));
vmask m = _mm256_insertf128_si256(_mm256_castsi128_si256(vi0), vi1, 1);
return vor_vm_vm_vm(vcastu_vm_vi(vand_vi_vo_vi(vgt_vo_vi_vi(vcast_vi_i(0), vi), vcast_vi_i(-1))), m);
}
static INLINE vint vcast_vi_vm(vmask vm) {
return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vm)), _mm_set1_ps(0), 0x08)),
_mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vm, 1)), 0x80)));
}
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; }
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return m; }
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return v; }
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return m; }
View File
@@ -0,0 +1,485 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#if CONFIG == 1
#if !defined(__AVX2__) && !defined(SLEEF_GENHEADER)
#error Please specify -mavx2.
#endif
#else
#error CONFIG macro invalid or not defined
#endif
#define ENABLE_DP
//@#define ENABLE_DP
#define LOG2VECTLENDP 2
//@#define LOG2VECTLENDP 2
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)
#define ENABLE_FMA_DP
//@#define ENABLE_FMA_DP
#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP
#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT
#if !defined(SLEEF_GENHEADER)
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#include <stdint.h>
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)
typedef __m256i vmask;
typedef __m256i vopmask;
typedef __m256d vdouble;
typedef __m128i vint;
typedef __m256 vfloat;
typedef __m256i vint2;
typedef __m256i vint64;
typedef __m256i vuint64;
typedef struct {
vmask x, y;
} vquad;
typedef vquad vargquad;
//
#if !defined(SLEEF_GENHEADER)
#ifndef __SLEEF_H__
void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx);
#endif
static INLINE int cpuSupportsAVX2() {
int32_t reg[4];
Sleef_x86CpuID(reg, 7, 0);
return (reg[1] & (1 << 5)) != 0;
}
static INLINE int cpuSupportsFMA() {
int32_t reg[4];
Sleef_x86CpuID(reg, 1, 0);
return (reg[2] & (1 << 12)) != 0;
}
#if CONFIG == 1 && defined(__AVX2__)
static INLINE int vavailability_i(int name) {
int d = cpuSupportsAVX2() && cpuSupportsFMA();
return d ? 3 : 0;
}
#define ISANAME "AVX2"
#define DFTPRIORITY 25
#endif
#endif // #if !defined(SLEEF_GENHEADER)
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }
static INLINE int vtestallones_i_vo32(vopmask g) {
return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1)));
}
static INLINE int vtestallones_i_vo64(vopmask g) {
return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1)));
}
//
static INLINE vdouble vcast_vd_d(double d) { return _mm256_set1_pd(d); }
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm256_castpd_si256(vd); }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm256_castsi256_pd(vm); }
//
static vint2 vloadu_vi2_p(int32_t *p) { return _mm256_loadu_si256((__m256i const *)p); }
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm256_storeu_si256((__m256i *)p, v); }
static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }
static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); }
//
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vcast_vo32_vo64(vopmask o) {
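// Compact the even 32-bit halves of the 64-bit mask lanes into the low 128 bits.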
return _mm256_permutevar8x32_epi32(o, _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0));
}
static INLINE vopmask vcast_vo64_vo32(vopmask o) {
return _mm256_permutevar8x32_epi32(o, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0));
}
static INLINE vopmask vcast_vo_i(int i) { return _mm256_set1_epi64x(i ? -1 : 0); }
//
static INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); }
static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); }
static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm256_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm256_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); }
static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); }
static INLINE vmask vcastu_vm_vi(vint vi) {
return _mm256_slli_epi64(_mm256_cvtepi32_epi64(vi), 32);
}
static INLINE vint vcastu_vi_vm(vmask vi) {
return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vi)), _mm_set1_ps(0), 0x0d)),
_mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vi, 1)), 0xd0)));
}
static INLINE vmask vcast_vm_i_i(int i0, int i1) {
return _mm256_set_epi32(i0, i1, i0, i1, i0, i1, i0, i1);
}
static INLINE vmask vcast_vm_i64(int64_t i) { return _mm256_set1_epi64x(i); }
static INLINE vmask vcast_vm_u64(uint64_t i) { return _mm256_set1_epi64x((uint64_t)i); }
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm256_cmpeq_epi64(x, y); }
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm256_add_epi64(x, y); }
//
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm256_div_pd(x, y); }
static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm256_set1_pd(1), x); }
static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); }
static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm256_andnot_pd(_mm256_set1_pd(-0.0), d); }
static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm256_xor_pd(_mm256_set1_pd(-0.0), d); }
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmsub_pd(x, y, z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmadd_pd(x, y, z); }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm256_max_pd(x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm256_min_pd(x, y); }
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); }
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); }
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmsub_pd(x, y, z); }
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmadd_pd(x, y, z); }
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmsub_pd(x, y, z); }
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_EQ_OQ)); }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_NEQ_UQ)); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LT_OQ)); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LE_OQ)); }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GT_OQ)); }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GE_OQ)); }
//
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); }
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); }
static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); }
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); }
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); }
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); }
static INLINE vint vandnot_vi_vo_vi(vopmask m, vint y) { return _mm_andnot_si128(_mm256_castsi256_si128(m), y); }
static INLINE vint vand_vi_vo_vi(vopmask m, vint y) { return _mm_and_si128(_mm256_castsi256_si128(m), y); }
static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); }
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); }
static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); }
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpeq_epi32(x, y)); }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpgt_epi32(x, y)); }
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return _mm_blendv_epi8(y, x, _mm256_castsi256_si128(m)); }
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return _mm256_blendv_pd(y, x, _mm256_castsi256_pd(o)); }
static INLINE vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) { return _mm256_permutevar_pd(_mm256_set_pd(v1, v0, v1, v0), o); }
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
__m256i v = _mm256_castpd_si256(vsel_vd_vo_vd_vd(o0, _mm256_castsi256_pd(_mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0)),
vsel_vd_vo_vd_vd(o1, _mm256_castsi256_pd(_mm256_set_epi32(3, 2, 3, 2, 3, 2, 3, 2)),
vsel_vd_vo_vd_vd(o2, _mm256_castsi256_pd(_mm256_set_epi32(5, 4, 5, 4, 5, 4, 5, 4)),
_mm256_castsi256_pd(_mm256_set_epi32(7, 6, 7, 6, 7, 6, 7, 6))))));
return _mm256_castsi256_pd(_mm256_permutevar8x32_epi32(_mm256_castpd_si256(_mm256_set_pd(d3, d2, d1, d0)), v));
}
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
return vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o1, d0, d1, d2, d2);
}
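// The two selectors above avoid a chain of blends: the masks are folded into
// a per-lane dword index pair (0-1 selects d0, 2-3 selects d1, 4-5 selects d2,
// 6-7 selects d3), and a single _mm256_permutevar8x32_epi32 then gathers the
// chosen double from the packed constants. Per lane this is roughly
//   r[i] = o0[i] ? d0 : (o1[i] ? d1 : (o2[i] ? d2 : d3));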
static INLINE vopmask visinf_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm256_cmp_pd(vabs_vd_vd(d), _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));
}
static INLINE vopmask vispinf_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));
}
static INLINE vopmask visminf_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(-SLEEF_INFINITY), _CMP_EQ_OQ));
}
static INLINE vopmask visnan_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm256_cmp_pd(d, d, _CMP_NEQ_UQ));
}
#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
static INLINE double vcast_d_vd(vdouble v) {
double s[4];
_mm256_storeu_pd(s, v);
return s[0];
}
#endif
static INLINE vdouble vload_vd_p(const double *ptr) { return _mm256_load_pd(ptr); }
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(ptr); }
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm256_i32gather_pd(ptr, vi, 8); }
//
static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }
static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvtps_epi32(vf)); }
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvttps_epi32(vf)); }
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm256_cvtepi32_ps(vcast_vm_vi2(vi)); }
static INLINE vfloat vcast_vf_f(float f) { return _mm256_set1_ps(f); }
static INLINE vint2 vcast_vi2_i(int i) { return _mm256_set1_epi32(i); }
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm256_castps_si256(vf); }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm256_castsi256_ps(vm); }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return vreinterpret_vf_vm(vcast_vm_vi2(vi)); }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return vcast_vi2_vm(vreinterpret_vm_vf(vf)); }
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); }
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); }
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmsub_ps(x, y, z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmadd_ps(x, y, z); }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); }
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); }
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); }
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmsub_ps(x, y, z); }
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmadd_ps(x, y, z); }
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmsub_ps(x, y, z); }
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_EQ_OQ)); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_NEQ_UQ)); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LT_OQ)); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LE_OQ)); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GT_OQ)); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GE_OQ)); }
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_add_epi32(x, y); }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_sub_epi32(x, y); }
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); }
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_and_si256(x, y); }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_andnot_si256(x, y); }
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_or_si256(x, y); }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_xor_si256(x, y); }
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return _mm256_slli_epi32(x, c); }
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return _mm256_srli_epi32(x, c); }
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return _mm256_srai_epi32(x, c); }
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpeq_epi32(x, y); }
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpgt_epi32(x, y); }
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpeq_epi32(x, y); }
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpgt_epi32(x, y); }
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
return _mm256_blendv_epi8(y, x, m);
}
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return _mm256_blendv_ps(y, x, _mm256_castsi256_ps(o)); }
// At this point, the following three functions are implemented in a generic way,
// but I will try target-specific optimization later on.
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}
static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
#ifdef _MSC_VER
// This function is needed when debugging on MSVC.
static INLINE float vcast_f_vf(vfloat v) {
float s[8];
_mm256_storeu_ps(s, v);
return s[0];
}
#endif
static INLINE vfloat vload_vf_p(const float *ptr) { return _mm256_load_ps(ptr); }
static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm256_i32gather_ps(ptr, vi2, 4); }
//
#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0 })
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })
static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_addsub_pd(x, y); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_addsub_ps(x, y); }
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }
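// vposneg negates the odd-indexed lanes and vnegpos the even-indexed ones;
// vsubadd maps to the addsub instructions (subtract in even lanes, add in odd
// lanes). vmlsubadd fuses a multiply with that pattern; roughly
//   r[2k] = x[2k] * y[2k] - z[2k];  r[2k+1] = x[2k+1] * y[2k+1] + z[2k+1];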
static INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm256_shuffle_pd(d0, d0, (0 << 3) | (1 << 2) | (0 << 1) | (1 << 0)); }
static INLINE vdouble vreva2_vd_vd(vdouble d0) { d0 = _mm256_permute2f128_pd(d0, d0, 1); return _mm256_shuffle_pd(d0, d0, (1 << 3) | (0 << 2) | (1 << 1) | (0 << 0)); }
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm256_stream_pd(ptr, v); }
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
_mm_store_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));
_mm_store_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));
}
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
_mm_stream_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));
_mm_stream_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));
}
//
static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm256_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); }
static INLINE vfloat vreva2_vf_vf(vfloat d0) { d0 = _mm256_permute2f128_ps(d0, d0, 1); return _mm256_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); }
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm256_stream_ps(ptr, v); }
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
_mm_storel_pd((double *)(ptr+(offset + step * 0)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));
_mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));
_mm_storel_pd((double *)(ptr+(offset + step * 2)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));
_mm_storeh_pd((double *)(ptr+(offset + step * 3)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));
}
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
//
static vquad loadu_vq_p(void *p) {
vquad vq;
memcpy(&vq, p, VECTLENDP * 16);
return vq;
}
static INLINE vquad cast_vq_aq(vargquad aq) {
vquad vq;
memcpy(&vq, &aq, VECTLENDP * 16);
return vq;
}
static INLINE vargquad cast_aq_vq(vquad vq) {
vargquad aq;
memcpy(&aq, &vq, VECTLENDP * 16);
return aq;
}
static INLINE int vtestallzeros_i_vo64(vopmask g) {
return _mm_movemask_epi8(_mm_or_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))) == 0;
}
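// The two 128-bit halves of the mask are ORed together first; the byte-wise
// movemask of that fold is zero exactly when every bit of the 256-bit mask
// was zero.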
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) { return _mm256_blendv_epi8(y, x, o); }
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return _mm256_sub_epi64(x, y); }
static INLINE vmask vneg64_vm_vm(vmask x) { return _mm256_sub_epi64(vcast_vm_i_i(0, 0), x); }
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return _mm256_cmpgt_epi64(x, y); } // signed compare
#define vsll64_vm_vm_i(x, c) _mm256_slli_epi64(x, c)
#define vsrl64_vm_vm_i(x, c) _mm256_srli_epi64(x, c)
//@#define vsll64_vm_vm_i(x, c) _mm256_slli_epi64(x, c)
//@#define vsrl64_vm_vm_i(x, c) _mm256_srli_epi64(x, c)
static INLINE vmask vcast_vm_vi(vint vi) { return _mm256_cvtepi32_epi64(vi); } // signed 32-bit => 64-bit
static INLINE vint vcast_vi_vm(vmask vm) { // signed 32-bit <= 64-bit
return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vm)), _mm_set1_ps(0), 0x08)),
_mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vm, 1)), 0x80)));
}
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; }
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return m; }
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return v; }
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return m; }


@@ -0,0 +1,463 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#if CONFIG == 1
#if !defined(__AVX2__) && !defined(SLEEF_GENHEADER)
#error Please specify -mavx2.
#endif
#else
#error CONFIG macro invalid or not defined
#endif
#define ENABLE_DP
//@#define ENABLE_DP
#define LOG2VECTLENDP 1
//@#define LOG2VECTLENDP 1
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)
#define ENABLE_FMA_DP
//@#define ENABLE_FMA_DP
#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP
#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT
#if !defined(SLEEF_GENHEADER)
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#include <stdint.h>
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)
typedef __m128i vmask;
typedef __m128i vopmask;
typedef __m128d vdouble;
typedef __m128i vint;
typedef __m128 vfloat;
typedef __m128i vint2;
typedef __m128i vint64;
typedef __m128i vuint64;
typedef struct {
vmask x, y;
} vquad;
typedef vquad vargquad;
//
#if !defined(SLEEF_GENHEADER)
#ifndef __SLEEF_H__
void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx);
#endif
static INLINE int cpuSupportsAVX2() {
int32_t reg[4];
Sleef_x86CpuID(reg, 7, 0);
return (reg[1] & (1 << 5)) != 0;
}
static INLINE int cpuSupportsFMA() {
int32_t reg[4];
Sleef_x86CpuID(reg, 1, 0);
return (reg[2] & (1 << 12)) != 0;
}
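// CPUID leaf 7 (subleaf 0) reports AVX2 support in EBX bit 5, and leaf 1
// reports FMA support in ECX bit 12; both must be set for this helper to be
// usable at runtime.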
#if CONFIG == 1 && defined(__AVX2__)
static INLINE int vavailability_i(int name) {
int d = cpuSupportsAVX2() && cpuSupportsFMA();
return d ? 3 : 0;
}
#define ISANAME "AVX2"
#define DFTPRIORITY 25
#endif
#endif // #if !defined(SLEEF_GENHEADER)
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }
static INLINE int vtestallones_i_vo32(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; }
static INLINE int vtestallones_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; }
//
static INLINE vdouble vcast_vd_d(double d) { return _mm_set1_pd(d); }
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm_castpd_si128(vd); }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm_castsi128_pd(vm); }
//
static vint2 vloadu_vi2_p(int32_t *p) { return _mm_loadu_si128((__m128i const *)p); }
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm_storeu_si128((__m128i *)p, v); }
static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }
static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); }
//
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return _mm_shuffle_epi32(m, 0x08); }
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return _mm_shuffle_epi32(m, 0x50); }
static INLINE vopmask vcast_vo_i(int i) { return _mm_set1_epi64x(i ? -1 : 0); }
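// Here the opmask is a full-width vector mask. vcast_vo32_vo64 compresses the
// two 64-bit lane masks into the low two 32-bit lanes (shuffle 0x08 picks
// dwords 0 and 2), and vcast_vo64_vo32 widens them back by duplicating each
// 32-bit mask across a full 64-bit lane (shuffle 0x50).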
//
static INLINE vint vrint_vi_vd(vdouble vd) { return _mm_cvtpd_epi32(vd); }
static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm_cvttpd_epi32(vd); }
static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vdouble vcast_vd_vi(vint vi) { return _mm_cvtepi32_pd(vi); }
static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); }
static INLINE vmask vcastu_vm_vi(vint vi) { return _mm_and_si128(_mm_shuffle_epi32(vi, 0x73), _mm_set_epi32(-1, 0, -1, 0)); }
static INLINE vint vcastu_vi_vm(vmask vi) { return _mm_shuffle_epi32(vi, 0x0d); }
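// vcastu_vm_vi parks each 32-bit integer in the upper half of a 64-bit lane
// (lower halves zeroed), and vcastu_vi_vm extracts those upper halves back
// into packed 32-bit lanes. The upper half of a double lane is where the sign
// and exponent bits live, which is what makes this layout useful.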
static INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm_set_epi32(i0, i1, i0, i1); }
static INLINE vmask vcast_vm_i64(int64_t i) { return _mm_set1_epi64x(i); }
static INLINE vmask vcast_vm_u64(uint64_t i) { return _mm_set1_epi64x((uint64_t)i); }
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm_cmpeq_epi64(x, y); }
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm_add_epi64(x, y); }
//
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_add_pd(x, y); }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm_sub_pd(x, y); }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm_mul_pd(x, y); }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm_div_pd(x, y); }
static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm_div_pd(_mm_set1_pd(1), x); }
static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm_sqrt_pd(x); }
static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm_andnot_pd(_mm_set1_pd(-0.0), d); }
static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm_xor_pd(_mm_set1_pd(-0.0), d); }
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fmadd_pd(x, y, z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fmsub_pd(x, y, z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fnmadd_pd(x, y, z); }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm_max_pd(x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm_min_pd(x, y); }
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fmadd_pd(x, y, z); }
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fmadd_pd(x, y, z); }
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fmsub_pd(x, y, z); }
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fnmadd_pd(x, y, z); }
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fnmsub_pd(x, y, z); }
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_EQ_OQ)); }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_NEQ_UQ)); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_LT_OQ)); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_LE_OQ)); }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_GT_OQ)); }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_GE_OQ)); }
//
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); }
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); }
static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); }
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); }
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); }
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); }
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return _mm_and_si128(x, y); }
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return _mm_andnot_si128(x, y); }
static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); }
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); }
static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); }
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return _mm_blendv_epi8(y, x, m); }
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return _mm_blendv_pd(y, x, _mm_castsi128_pd(o)); }
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
}
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}
static INLINE vopmask visinf_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm_cmp_pd(vabs_vd_vd(d), _mm_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));
}
static INLINE vopmask vispinf_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm_cmp_pd(d, _mm_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));
}
static INLINE vopmask visminf_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm_cmp_pd(d, _mm_set1_pd(-SLEEF_INFINITY), _CMP_EQ_OQ));
}
static INLINE vopmask visnan_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm_cmp_pd(d, d, _CMP_NEQ_UQ));
}
static INLINE vdouble vload_vd_p(const double *ptr) { return _mm_load_pd(ptr); }
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm_loadu_pd(ptr); }
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm_storeu_pd(ptr, v); }
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm_i32gather_pd(ptr, vi, 8); }
#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
static INLINE double vcast_d_vd(vdouble v) {
double a[VECTLENDP];
vstoreu_v_p_vd(a, v);
return a[0];
}
#endif
//
static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }
static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm_cvtps_epi32(vf)); }
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm_cvttps_epi32(vf)); }
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm_cvtepi32_ps(vcast_vm_vi2(vi)); }
static INLINE vfloat vcast_vf_f(float f) { return _mm_set1_ps(f); }
static INLINE vint2 vcast_vi2_i(int i) { return _mm_set1_epi32(i); }
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm_castps_si128(vf); }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm_castsi128_ps(vm); }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return vreinterpret_vf_vm(vcast_vm_vi2(vi)); }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return vcast_vi2_vm(vreinterpret_vm_vf(vf)); }
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_add_ps(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm_sub_ps(x, y); }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm_mul_ps(x, y); }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm_div_ps(x, y); }
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm_sqrt_ps(x); }
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fmadd_ps(x, y, z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fmsub_ps(x, y, z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fnmadd_ps(x, y, z); }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm_max_ps(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm_min_ps(x, y); }
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fmadd_ps(x, y, z); }
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fmadd_ps(x, y, z); }
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fmsub_ps(x, y, z); }
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fnmadd_ps(x, y, z); }
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fnmsub_ps(x, y, z); }
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_EQ_OQ)); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_NEQ_UQ)); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_LT_OQ)); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_LE_OQ)); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_GT_OQ)); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_GE_OQ)); }
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_add_epi32(x, y); }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_sub_epi32(x, y); }
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); }
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_and_si128(x, y); }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_andnot_si128(x, y); }
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_or_si128(x, y); }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_xor_si128(x, y); }
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return _mm_slli_epi32(x, c); }
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return _mm_srli_epi32(x, c); }
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return _mm_srai_epi32(x, c); }
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); }
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); }
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
return _mm_blendv_epi8(y, x, m);
}
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return _mm_blendv_ps(y, x, _mm_castsi128_ps(o)); }
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}
static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
static INLINE vfloat vload_vf_p(const float *ptr) { return _mm_load_ps(ptr); }
static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm_loadu_ps(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm_storeu_ps(ptr, v); }
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm_i32gather_ps(ptr, vi2, 4); }
#ifdef _MSC_VER
// This function is needed when debugging on MSVC.
static INLINE float vcast_f_vf(vfloat v) {
float a[VECTLENSP];
vstoreu_v_p_vf(a, v);
return a[0];
}
#endif
//
#define PNMASK ((vdouble) { +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0 })
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_addsub_pd(x, y); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_addsub_ps(x, y); }
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }
static INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm_shuffle_pd(d0, d0, 1); }
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vd; }
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm_stream_pd(ptr, v); }
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { _mm_stream_pd((double *)(&ptr[2*offset]), v); }
//
static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); }
static INLINE vfloat vreva2_vf_vf(vfloat d0) { return _mm_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); }
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm_stream_ps(ptr, v); }
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
_mm_storel_pd((double *)(ptr+(offset + step * 0)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
_mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
}
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
_mm_storel_pd((double *)(ptr+(offset + step * 0)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
_mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
}
//
static vquad loadu_vq_p(void *p) {
vquad vq = {
vloadu_vi2_p((int32_t *)p),
vloadu_vi2_p((int32_t *)((uint8_t *)p + sizeof(vmask)))
};
return vq;
}
static INLINE vquad cast_vq_aq(vargquad aq) {
vquad vq;
memcpy(&vq, &aq, VECTLENDP * 16);
return vq;
}
static INLINE vargquad cast_aq_vq(vquad vq) {
vargquad aq;
memcpy(&aq, &vq, VECTLENDP * 16);
return aq;
}
static void vstoreu_v_p_vq(void *p, vquad vq) {
vstoreu_v_p_vi2((int32_t *)p, vcast_vi2_vm(vq.x));
vstoreu_v_p_vi2((int32_t *)((uint8_t *)p + sizeof(vmask)), vcast_vi2_vm(vq.y));
}
static INLINE int vtestallzeros_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0; }
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) { return _mm_blendv_epi8(y, x, o); }
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return _mm_sub_epi64(x, y); }
static INLINE vmask vneg64_vm_vm(vmask x) { return _mm_sub_epi64(vcast_vm_i_i(0, 0), x); }
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return _mm_cmpgt_epi64(x, y); } // signed compare
#define vsll64_vm_vm_i(x, c) _mm_slli_epi64(x, c)
#define vsrl64_vm_vm_i(x, c) _mm_srli_epi64(x, c)
//@#define vsll64_vm_vm_i(x, c) _mm_slli_epi64(x, c)
//@#define vsrl64_vm_vm_i(x, c) _mm_srli_epi64(x, c)
static INLINE vmask vcast_vm_vi(vint vi) {
vmask m = _mm_and_si128(_mm_shuffle_epi32(vi, (0 << 6) | (1 << 4) | (0 << 2) | (0 << 0)), _mm_set_epi32(0, -1, 0, -1));
return vor_vm_vm_vm(vcastu_vm_vi(vgt_vo_vi_vi(vcast_vi_i(0), vi)), m);
}
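// Manual 32- to 64-bit sign extension: the shuffle places each int in the low
// half of a 64-bit lane, and vcastu of the (0 > vi) comparison fills the
// upper half with copies of the sign bit; ORing the two yields (int64_t)vi[i]
// per lane.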
static INLINE vint vcast_vi_vm(vmask vm) { return _mm_shuffle_epi32(vm, 0x08); }
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; }
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return m; }
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return v; }
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return m; }


@@ -0,0 +1,600 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#if CONFIG == 1 || CONFIG == 2
#if !defined(__AVX512F__) && !defined(SLEEF_GENHEADER)
#error Please specify -mavx512f.
#endif
#else
#error CONFIG macro invalid or not defined
#endif
#define ENABLE_DP
//@#define ENABLE_DP
#define LOG2VECTLENDP 3
//@#define LOG2VECTLENDP 3
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)
#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)
#if CONFIG == 1
#define ENABLE_FMA_DP
//@#define ENABLE_FMA_DP
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP
#endif
#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT
#if !defined(SLEEF_GENHEADER)
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#include <stdint.h>
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)
typedef __m512i vmask;
typedef __mmask16 vopmask;
typedef __m512d vdouble;
typedef __m256i vint;
typedef __m512 vfloat;
typedef __m512i vint2;
typedef __m512i vint64;
typedef __m512i vuint64;
typedef struct {
vmask x, y;
} vquad;
typedef vquad vargquad;
//
#if !defined(SLEEF_GENHEADER)
#ifndef __SLEEF_H__
void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx);
#endif
static INLINE int cpuSupportsAVX512F() {
int32_t reg[4];
Sleef_x86CpuID(reg, 7, 0);
return (reg[1] & (1 << 16)) != 0;
}
#if CONFIG == 1 && defined(__AVX512F__)
static INLINE int vavailability_i(int name) {
int d = cpuSupportsAVX512F();
return d ? 3 : 0;
}
#define ISANAME "AVX512F"
#define DFTPRIORITY 30
#endif
#if CONFIG == 2 && defined(__AVX512F__)
static INLINE int vavailability_i(int name) {
int d = cpuSupportsAVX512F();
return d ? 3 : 0;
}
#define ISANAME "AVX512FNOFMA"
#define DFTPRIORITY 0
#endif
#endif // #if !defined(SLEEF_GENHEADER)
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }
#ifdef __INTEL_COMPILER
static INLINE int vtestallones_i_vo64(vopmask g) { return _mm512_mask2int(g) == 0xff; }
static INLINE int vtestallones_i_vo32(vopmask g) { return _mm512_mask2int(g) == 0xffff; }
#else
static INLINE int vtestallones_i_vo64(vopmask g) { return g == 0xff; }
static INLINE int vtestallones_i_vo32(vopmask g) { return g == 0xffff; }
#endif
//
static vint2 vloadu_vi2_p(int32_t *p) { return _mm512_loadu_si512((__m512i const *)p); }
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm512_storeu_si512((__m512i *)p, v); }
static vint vloadu_vi_p(int32_t *p) { return _mm256_loadu_si256((__m256i const *)p); }
static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm256_storeu_si256((__m256i *)p, v); }
//
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return _mm512_and_si512(x, y); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return _mm512_andnot_si512(x, y); }
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return _mm512_or_si512(x, y); }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return _mm512_xor_si512(x, y); }
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kand(x, y); }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kandn(x, y); }
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kor(x, y); }
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kxor(x, y); }
static INLINE vmask vand_vm_vo64_vm(vopmask o, vmask m) { return _mm512_mask_and_epi64(_mm512_set1_epi32(0), o, m, m); }
static INLINE vmask vandnot_vm_vo64_vm(vopmask o, vmask m) { return _mm512_mask_and_epi64(m, o, _mm512_set1_epi32(0), _mm512_set1_epi32(0)); }
static INLINE vmask vor_vm_vo64_vm(vopmask o, vmask m) { return _mm512_mask_or_epi64(m, o, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1)); }
static INLINE vmask vand_vm_vo32_vm(vopmask o, vmask m) { return _mm512_mask_and_epi32(_mm512_set1_epi32(0), o, m, m); }
static INLINE vmask vandnot_vm_vo32_vm(vopmask o, vmask m) { return _mm512_mask_and_epi32(m, o, _mm512_set1_epi32(0), _mm512_set1_epi32(0)); }
static INLINE vmask vor_vm_vo32_vm(vopmask o, vmask m) { return _mm512_mask_or_epi32(m, o, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1)); }
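// On AVX-512 the opmask is a real predicate register (__mmask16), so the
// mask/vector combinations above become masked integer ops: keep-or-zero for
// "and", write-zero-where-set for "andnot", and write-all-ones-where-set for
// "or".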
static INLINE vopmask vcast_vo32_vo64(vopmask o) { return o; }
static INLINE vopmask vcast_vo64_vo32(vopmask o) { return o; }
static INLINE vopmask vcast_vo_i(int i) { return i ? -1 : 0; }
//
static INLINE vint vrint_vi_vd(vdouble vd) {
return _mm512_cvt_roundpd_epi32(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC);
}
static INLINE vint vtruncate_vi_vd(vdouble vd) {
return _mm512_cvt_roundpd_epi32(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC);
}
static INLINE vdouble vcast_vd_vi(vint vi) { return _mm512_cvtepi32_pd(vi); }
static INLINE vint vcast_vi_i(int i) { return _mm256_set1_epi32(i); }
static INLINE vdouble vtruncate_vd_vd(vdouble vd) {
return _mm512_roundscale_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC);
}
static INLINE vdouble vrint_vd_vd(vdouble vd) {
return _mm512_roundscale_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC);
}
static INLINE vmask vcastu_vm_vi(vint vi) {
return _mm512_maskz_permutexvar_epi32(0xaaaa, _mm512_set_epi32(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), _mm512_castsi256_si512(vi));
}
static INLINE vint vcastu_vi_vm(vmask vi) {
return _mm512_castsi512_si256(_mm512_maskz_permutexvar_epi32(0x00ff, _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 7, 5, 3, 1), vi));
}
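// The permutation index vector duplicates every source dword, and the 0xaaaa
// zeroing mask writes only the odd dword lanes, so vcastu_vm_vi lands each
// 32-bit value in the upper half of a 64-bit lane with the lower half zeroed.
// vcastu_vi_vm inverts this by gathering the odd dwords (indices 1, 3, ..., 15)
// into the low 256 bits.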
static INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm512_set_epi32(i0, i1, i0, i1, i0, i1, i0, i1, i0, i1, i0, i1, i0, i1, i0, i1); }
static INLINE vmask vcast_vm_i64(int64_t i) { return _mm512_set1_epi64(i); }
static INLINE vmask vcast_vm_u64(uint64_t i) { return _mm512_set1_epi64((uint64_t)i); }
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_EQ); }
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm512_add_epi64(x, y); }
//
static INLINE vdouble vcast_vd_d(double d) { return _mm512_set1_pd(d); }
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm512_castpd_si512(vd); }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm512_castsi512_pd(vm); }
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm512_add_pd(x, y); }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm512_sub_pd(x, y); }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm512_mul_pd(x, y); }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm512_div_pd(x, y); }
static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm512_div_pd(_mm512_set1_pd(1), x); }
static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm512_sqrt_pd(x); }
static INLINE vdouble vabs_vd_vd(vdouble d) { return vreinterpret_vd_vm(_mm512_andnot_si512(vreinterpret_vm_vd(_mm512_set1_pd(-0.0)), vreinterpret_vm_vd(d))); }
static INLINE vdouble vneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(_mm512_xor_si512(vreinterpret_vm_vd(_mm512_set1_pd(-0.0)), vreinterpret_vm_vd(d))); }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm512_max_pd(x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm512_min_pd(x, y); }
#if CONFIG == 1
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmadd_pd(x, y, z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmsub_pd(x, y, z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fnmadd_pd(x, y, z); }
#else
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
#endif
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmadd_pd(x, y, z); }
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmadd_pd(x, y, z); }
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmsub_pd(x, y, z); }
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fnmadd_pd(x, y, z); }
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fnmsub_pd(x, y, z); }
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_EQ_OQ); }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_NEQ_UQ); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_LT_OQ); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_LE_OQ); }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_GT_OQ); }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_GE_OQ); }
//
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm256_add_epi32(x, y); }
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm256_sub_epi32(x, y); }
static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm256_and_si256(x, y); }
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm256_andnot_si256(x, y); }
static INLINE vint vandnot_vi_vo_vi(vopmask o, vint y) {
return _mm512_castsi512_si256(_mm512_mask_and_epi32(_mm512_castsi256_si512(y), o, _mm512_set1_epi32(0), _mm512_set1_epi32(0)));
}
static INLINE vint vand_vi_vo_vi(vopmask o, vint y) {
return _mm512_castsi512_si256(_mm512_mask_and_epi32(_mm512_set1_epi32(0), o, _mm512_castsi256_si512(y), _mm512_castsi256_si512(y)));
}
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm256_or_si256(x, y); }
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm256_xor_si256(x, y); }
#define vsll_vi_vi_i(x, c) _mm256_slli_epi32(x, c)
#define vsrl_vi_vi_i(x, c) _mm256_srli_epi32(x, c)
#define vsra_vi_vi_i(x, c) _mm256_srai_epi32(x, c)
//@#define vsll_vi_vi_i(x, c) _mm256_slli_epi32(x, c)
//@#define vsrl_vi_vi_i(x, c) _mm256_srli_epi32(x, c)
//@#define vsra_vi_vi_i(x, c) _mm256_srai_epi32(x, c)
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm256_cmpeq_epi32(x, y); }
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm256_cmpgt_epi32(x, y); }
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) {
return _mm512_cmp_epi32_mask(_mm512_castsi256_si512(x), _mm512_castsi256_si512(y), _MM_CMPINT_EQ);
}
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) {
return _mm512_cmp_epi32_mask(_mm512_castsi256_si512(y), _mm512_castsi256_si512(x), _MM_CMPINT_LT);
}
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask mask, vdouble x, vdouble y) {
return _mm512_mask_blend_pd(mask, y, x);
}
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
}
#if 1
// The permute-based select below is likely faster than the chained blends in the #else branch.
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
__m512i v = _mm512_castpd_si512(vsel_vd_vo_vd_vd(o0, _mm512_castsi512_pd(_mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, 0)),
vsel_vd_vo_vd_vd(o1, _mm512_castsi512_pd(_mm512_set_epi64(1, 1, 1, 1, 1, 1, 1, 1)),
vsel_vd_vo_vd_vd(o2, _mm512_castsi512_pd(_mm512_set_epi64(2, 2, 2, 2, 2, 2, 2, 2)),
_mm512_castsi512_pd(_mm512_set_epi64(3, 3, 3, 3, 3, 3, 3, 3))))));
return _mm512_permutexvar_pd(v, _mm512_castpd256_pd512(_mm256_set_pd(d3, d2, d1, d0)));
}
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
return vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o1, d0, d1, d2, d2);
}
#else
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}
#endif
static INLINE vopmask visinf_vo_vd(vdouble d) {
return _mm512_cmp_pd_mask(vabs_vd_vd(d), _mm512_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ);
}
static INLINE vopmask vispinf_vo_vd(vdouble d) {
return _mm512_cmp_pd_mask(d, _mm512_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ);
}
static INLINE vopmask visminf_vo_vd(vdouble d) {
return _mm512_cmp_pd_mask(d, _mm512_set1_pd(-SLEEF_INFINITY), _CMP_EQ_OQ);
}
static INLINE vopmask visnan_vo_vd(vdouble d) {
return _mm512_cmp_pd_mask(d, d, _CMP_NEQ_UQ);
}
static INLINE vint vilogbk_vi_vd(vdouble d) { return vrint_vi_vd(_mm512_getexp_pd(d)); }
// vilogb2k_vi_vd is similar to vilogbk_vi_vd, but the argument has to
// be a normalized FP value.
static INLINE vint vilogb2k_vi_vd(vdouble d) { return vrint_vi_vd(_mm512_getexp_pd(d)); }
static INLINE vdouble vgetexp_vd_vd(vdouble d) { return _mm512_getexp_pd(d); }
static INLINE vfloat vgetexp_vf_vf(vfloat d) { return _mm512_getexp_ps(d); }
static INLINE vdouble vgetmant_vd_vd(vdouble d) { return _mm512_getmant_pd(d, _MM_MANT_NORM_p75_1p5, _MM_MANT_SIGN_nan); }
static INLINE vfloat vgetmant_vf_vf(vfloat d) { return _mm512_getmant_ps(d, _MM_MANT_NORM_p75_1p5, _MM_MANT_SIGN_nan); }
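// AVX-512 can decompose a float directly: vgetexp returns the exponent of its
// argument as a floating-point value (essentially floor(log2|x|), hence the
// vrint in vilogbk above), and vgetmant with _MM_MANT_NORM_p75_1p5 returns
// the mantissa normalized into [0.75, 1.5). The vfixup macros below classify
// each lane of their second operand and substitute special-case results
// (NaN, infinities, zero) chosen from a lookup table in the third operand.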
#define vfixup_vd_vd_vd_vi2_i(a, b, c, imm) _mm512_fixupimm_pd((a), (b), (c), (imm))
#define vfixup_vf_vf_vf_vi2_i(a, b, c, imm) _mm512_fixupimm_ps((a), (b), (c), (imm))
//@#define vfixup_vd_vd_vd_vi2_i(a, b, c, imm) _mm512_fixupimm_pd((a), (b), (c), (imm))
//@#define vfixup_vf_vf_vf_vi2_i(a, b, c, imm) _mm512_fixupimm_ps((a), (b), (c), (imm))
#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
static INLINE double vcast_d_vd(vdouble v) {
double s[VECTLENDP];
_mm512_storeu_pd(s, v);
return s[0];
}
#endif
static INLINE vdouble vload_vd_p(const double *ptr) { return _mm512_load_pd(ptr); }
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm512_loadu_pd(ptr); }
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm512_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm512_storeu_pd(ptr, v); }
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm512_i32gather_pd(vi, ptr, 8); }
//
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
return _mm512_castsi512_si256(_mm512_mask_blend_epi32(m, _mm512_castsi256_si512(y), _mm512_castsi256_si512(x)));
}
//
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm512_castps_si512(vf); }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm512_castsi512_ps(vm); }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return _mm512_castsi512_ps(vi); }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return _mm512_castps_si512(vf); }
static INLINE vdouble vreinterpret_vd_vf(vfloat vf) { return _mm512_castps_pd(vf); }
static INLINE vfloat vreinterpret_vf_vd(vdouble vd) { return _mm512_castpd_ps(vd); }
static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm512_cvtepi32_ps(vcast_vm_vi2(vi)); }
static INLINE vfloat vcast_vf_f(float f) { return _mm512_set1_ps(f); }
static INLINE vint2 vcast_vi2_i(int i) { return _mm512_set1_epi32(i); }
static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm512_cvtps_epi32(vf)); }
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm512_cvttps_epi32(vf)); }
static INLINE vfloat vtruncate_vf_vf(vfloat vd) {
return _mm512_roundscale_ps(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC);
}
static INLINE vfloat vrint_vf_vf(vfloat vd) {
return _mm512_roundscale_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC);
}
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm512_add_ps(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm512_sub_ps(x, y); }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm512_mul_ps(x, y); }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm512_div_ps(x, y); }
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm512_sqrt_ps(x); }
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm512_max_ps(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm512_min_ps(x, y); }
#if CONFIG == 1
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmadd_ps(x, y, z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmsub_ps(x, y, z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fnmadd_ps(x, y, z); }
#else
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
#endif
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmadd_ps(x, y, z); }
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmadd_ps(x, y, z); }
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmsub_ps(x, y, z); }
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fnmadd_ps(x, y, z); }
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fnmsub_ps(x, y, z); }
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_EQ_OQ); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_NEQ_UQ); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_LT_OQ); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_LE_OQ); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_GT_OQ); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ); }
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_add_epi32(x, y); }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_sub_epi32(x, y); }
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); }
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_and_si512(x, y); }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_andnot_si512(x, y); }
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_or_si512(x, y); }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_xor_si512(x, y); }
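// AVX-512 has no direct opmask-and-vector AND, so emulate it with masked
// operations: vand keeps m where the opmask is set (zero elsewhere), and
// vandnot zeroes the lanes where the opmask is set.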
static INLINE vint2 vand_vi2_vo_vi2(vopmask o, vint2 m) {
return _mm512_mask_and_epi32(_mm512_set1_epi32(0), o, m, m);
}
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask o, vint2 m) {
return _mm512_mask_and_epi32(m, o, _mm512_set1_epi32(0), _mm512_set1_epi32(0));
}
#define vsll_vi2_vi2_i(x, c) _mm512_slli_epi32(x, c)
#define vsrl_vi2_vi2_i(x, c) _mm512_srli_epi32(x, c)
#define vsra_vi2_vi2_i(x, c) _mm512_srai_epi32(x, c)
//@#define vsll_vi2_vi2_i(x, c) _mm512_slli_epi32(x, c)
//@#define vsrl_vi2_vi2_i(x, c) _mm512_srli_epi32(x, c)
//@#define vsra_vi2_vi2_i(x, c) _mm512_srai_epi32(x, c)
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm512_cmpeq_epi32_mask(x, y); }
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm512_cmpgt_epi32_mask(x, y); }
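// Materialize a comparison result as a full-width vector: lanes selected by
// the opmask become all-ones (-1 & -1), the rest stay zero.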
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
__mmask16 m = _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_EQ);
return _mm512_mask_and_epi32(_mm512_set1_epi32(0), m, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1));
}
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) {
__mmask16 m = _mm512_cmp_epi32_mask(y, x, _MM_CMPINT_LT);
return _mm512_mask_and_epi32(_mm512_set1_epi32(0), m, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1));
}
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
return _mm512_mask_blend_epi32(m, y, x);
}
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask m, vfloat x, vfloat y) {
return _mm512_mask_blend_ps(m, y, x);
}
// At this point, the following three functions are implemented in a generic way,
// but I will try target-specific optimization later on.
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}
static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
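// _mm512_getexp_ps extracts the unbiased exponent directly, so both ilogb
// variants reduce to a rounding conversion.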
static INLINE vint2 vilogbk_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm512_getexp_ps(d)); }
static INLINE vint2 vilogb2k_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm512_getexp_ps(d)); }
#ifdef _MSC_VER
// This function is needed when debugging on MSVC.
static INLINE float vcast_f_vf(vfloat v) {
float s[VECTLENSP];
_mm512_storeu_ps(s, v);
return s[0];
}
#endif
static INLINE vfloat vload_vf_p(const float *ptr) { return _mm512_load_ps(ptr); }
static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm512_loadu_ps(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm512_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm512_storeu_ps(ptr, v); }
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm512_i32gather_ps(vi2, ptr, 4); }
//
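// Flip the sign of alternating lanes by XORing -0.0 under a 16-bit write mask
// over 32-bit elements: 0xcccc/0x3333 select the odd/even doubles (each double
// spans two 32-bit elements), 0xaaaa/0x5555 the odd/even floats.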
static INLINE vdouble vposneg_vd_vd(vdouble d) {
return vreinterpret_vd_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vd(d), 0xcccc, vreinterpret_vm_vd(d), vreinterpret_vm_vd(_mm512_set1_pd(-0.0))));
}
static INLINE vdouble vnegpos_vd_vd(vdouble d) {
return vreinterpret_vd_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vd(d), 0x3333, vreinterpret_vm_vd(d), vreinterpret_vm_vd(_mm512_set1_pd(-0.0))));
}
static INLINE vfloat vposneg_vf_vf(vfloat d) {
return vreinterpret_vf_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vf(d), 0xaaaa, vreinterpret_vm_vf(d), vreinterpret_vm_vf(_mm512_set1_ps(-0.0f))));
}
static INLINE vfloat vnegpos_vf_vf(vfloat d) {
return vreinterpret_vf_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vf(d), 0x5555, vreinterpret_vm_vf(d), vreinterpret_vm_vf(_mm512_set1_ps(-0.0f))));
}
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vadd_vf_vf_vf(x, vnegpos_vf_vf(y)); }
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmaddsub_pd(x, y, z); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmaddsub_ps(x, y, z); }
static INLINE vdouble vrev21_vd_vd(vdouble vd) { return _mm512_permute_pd(vd, 0x55); }
static INLINE vdouble vreva2_vd_vd(vdouble vd) {
return vreinterpret_vd_vm(_mm512_permutexvar_epi32(_mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12), vreinterpret_vm_vd(vd)));
}
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm512_stream_pd(ptr, v); }
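// Scatter the vector as 128-bit chunks (pairs of doubles): chunk k is stored
// at ptr[(offset + step * k) * 2].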
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
_mm_store_pd(&ptr[(offset + step * 0)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 0)));
_mm_store_pd(&ptr[(offset + step * 1)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 1)));
_mm_store_pd(&ptr[(offset + step * 2)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 2)));
_mm_store_pd(&ptr[(offset + step * 3)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 3)));
}
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
_mm_stream_pd(&ptr[(offset + step * 0)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 0)));
_mm_stream_pd(&ptr[(offset + step * 1)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 1)));
_mm_stream_pd(&ptr[(offset + step * 2)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 2)));
_mm_stream_pd(&ptr[(offset + step * 3)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 3)));
}
//
static INLINE vfloat vrev21_vf_vf(vfloat vf) { return _mm512_permute_ps(vf, 0xb1); }
static INLINE vfloat vreva2_vf_vf(vfloat vf) {
return vreinterpret_vf_vm(_mm512_permutexvar_epi32(_mm512_set_epi32(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14), vreinterpret_vm_vf(vf)));
}
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm512_stream_ps(ptr, v); }
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
_mm_storel_pd((double *)(ptr+(offset + step * 0)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 0)));
_mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 0)));
_mm_storel_pd((double *)(ptr+(offset + step * 2)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 1)));
_mm_storeh_pd((double *)(ptr+(offset + step * 3)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 1)));
_mm_storel_pd((double *)(ptr+(offset + step * 4)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 2)));
_mm_storeh_pd((double *)(ptr+(offset + step * 5)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 2)));
_mm_storel_pd((double *)(ptr+(offset + step * 6)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 3)));
_mm_storeh_pd((double *)(ptr+(offset + step * 7)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 3)));
}
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
//
static vquad loadu_vq_p(void *p) {
vquad vq;
memcpy(&vq, p, VECTLENDP * 16);
return vq;
}
static INLINE vquad cast_vq_aq(vargquad aq) {
vquad vq;
memcpy(&vq, &aq, VECTLENDP * 16);
return vq;
}
static INLINE vargquad cast_aq_vq(vquad vq) {
vargquad aq;
memcpy(&aq, &vq, VECTLENDP * 16);
return aq;
}
#ifdef __INTEL_COMPILER
static INLINE int vtestallzeros_i_vo64(vopmask g) { return _mm512_mask2int(g) == 0; }
#else
static INLINE int vtestallzeros_i_vo64(vopmask g) { return g == 0; }
#endif
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask m, vmask x, vmask y) { return _mm512_mask_blend_epi64(m, y, x); }
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return _mm512_sub_epi64(x, y); }
static INLINE vmask vneg64_vm_vm(vmask x) { return _mm512_sub_epi64(vcast_vm_i_i(0, 0), x); }
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return _mm512_cmp_epi64_mask(y, x, _MM_CMPINT_LT); } // signed compare
#define vsll64_vm_vm_i(x, c) _mm512_slli_epi64(x, c)
#define vsrl64_vm_vm_i(x, c) _mm512_srli_epi64(x, c)
//@#define vsll64_vm_vm_i(x, c) _mm512_slli_epi64(x, c)
//@#define vsrl64_vm_vm_i(x, c) _mm512_srli_epi64(x, c)
static INLINE vmask vcast_vm_vi(vint vi) {
return _mm512_cvtepi32_epi64(vi);
}
static INLINE vint vcast_vi_vm(vmask vm) {
return _mm512_cvtepi64_epi32(vm);
}
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; }
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return m; }
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return v; }
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return m; }


@@ -0,0 +1,297 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#if !defined(__ARM_NEON) && !defined(SLEEF_GENHEADER)
#error Please specify -mfpu=neon.
#endif
#ifdef __aarch64__
#warning This implementation is for AARCH32.
#endif
#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP 2
//@#define LOG2VECTLENSP 2
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)
#if CONFIG == 4
#define ISANAME "AARCH32 NEON-VFPV4"
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP
#else
#define ISANAME "AARCH32 NEON"
#endif
#define DFTPRIORITY 10
#define ENABLE_RECSQRT_SP
//@#define ENABLE_RECSQRT_SP
#include <arm_neon.h>
#include <stdint.h>
#include "misc.h"
typedef uint32x4_t vmask;
typedef uint32x4_t vopmask;
//typedef int32x4_t vint;
typedef float32x4_t vfloat;
typedef int32x4_t vint2;
//
static INLINE void vprefetch_v_p(const void *ptr) { }
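// All-ones test: AND the two halves together, then take a pairwise minimum;
// the extracted lane is nonzero only if every lane was all-ones.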
static INLINE int vtestallones_i_vo32(vopmask g) {
uint32x2_t x0 = vand_u32(vget_low_u32(g), vget_high_u32(g));
uint32x2_t x1 = vpmin_u32(x0, x0);
return vget_lane_u32(x1, 0);
}
static vfloat vloaduf(float *p) { return vld1q_f32(p); }
static void vstoreuf(float *p, vfloat v) { vst1q_f32(p, v); }
static vint2 vloadu_vi2_p(int32_t *p) { return vld1q_s32(p); }
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
//
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vandq_u32(x, y); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vbicq_u32(y, x); }
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vorrq_u32(x, y); }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return veorq_u32(x, y); }
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vandq_u32(x, y); }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vbicq_u32(y, x); }
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vorrq_u32(x, y); }
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return veorq_u32(x, y); }
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vandq_u32(x, y); }
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vbicq_u32(y, x); }
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vorrq_u32(x, y); }
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return veorq_u32(x, y); }
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vandq_u32(x, y); }
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vbicq_u32(y, x); }
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vorrq_u32(x, y); }
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return veorq_u32(x, y); }
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return vuzpq_u32(m, m).val[0]; }
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return vzipq_u32(m, m).val[0]; }
//
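// Splat a 64-bit pattern built from two 32-bit halves across both doubleword
// lanes.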
static INLINE vmask vcast_vm_i_i(int i0, int i1) { return (vmask)vdupq_n_u64((uint64_t)i0 | (((uint64_t)i1) << 32)); }
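// 64-bit equality from 32-bit compares: AND each comparison result with its
// word-swapped neighbour, so a doubleword is all-ones only if both words match.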
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
uint32x4_t t = vceqq_u32(x, y);
return vandq_u32(t, vrev64q_u32(t));
}
//
static INLINE vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { return (vmask)vi; }
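// Round to nearest (ties away from zero): add 0.5 with the sign copied from
// the input, then let the convert instruction truncate toward zero.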
static INLINE vint2 vrint_vi2_vf(vfloat d) {
return vcvtq_s32_f32(vaddq_f32(d, (float32x4_t)vorrq_u32(vandq_u32((uint32x4_t)d, (uint32x4_t)vdupq_n_f32(-0.0f)), (uint32x4_t)vdupq_n_f32(0.5f))));
}
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcvtq_s32_f32(vf); }
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return vcvtq_f32_s32(vi); }
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
static INLINE vfloat vrint_vf_vf(vfloat vd) { return vcast_vf_vi2(vrint_vi2_vf(vd)); }
static INLINE vfloat vcast_vf_f(float f) { return vdupq_n_f32(f); }
static INLINE vint2 vcast_vi2_i(int i) { return vdupq_n_s32(i); }
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (vfloat)vm; }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vm) { return (vfloat)vm; }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return (vint2)vf; }
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return vaddq_f32(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return vsubq_f32(x, y); }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return vmulq_f32(x, y); }
static INLINE vfloat vabs_vf_vf(vfloat f) { return vabsq_f32(f); }
static INLINE vfloat vneg_vf_vf(vfloat f) { return vnegq_f32(f); }
#if CONFIG == 4
static INLINE vfloat vmla_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { return vfmaq_f32(z, x, y); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vfmsq_f32(z, x, y); }
static INLINE vfloat vfma_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { return vfmaq_f32(z, x, y); }
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vfmsq_f32(z, x, y); }
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vneg_vf_vf(vfmanp_vf_vf_vf_vf(x, y, z)); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vneg_vf_vf(vfmanp_vf_vf_vf_vf(x, y, z)); }
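// Division via reciprocal estimate: vrecpsq supplies the Newton-Raphson factor
// (2 - y*t); after two refinements of t, a fused step corrects the quotient
// with the residual x - y*u.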
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) {
float32x4_t t = vrecpeq_f32(y), u;
t = vmulq_f32(t, vrecpsq_f32(y, t));
t = vfmaq_f32(t, vfmsq_f32(vdupq_n_f32(1.0f), y, t), t);
u = vmulq_f32(x, t);
return vfmaq_f32(u, vfmsq_f32(x, y, u), t);
}
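// Square root via rsqrt estimate: two vrsqrtsq Newton-Raphson refinements,
// a fused correction u += (d - u*u) * (x/2), and a final mask so that
// sqrt(0) returns 0 instead of NaN.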
static INLINE vfloat vsqrt_vf_vf(vfloat d) {
float32x4_t x = vrsqrteq_f32(d);
x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));
x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));
float32x4_t u = vmulq_f32(x, d);
u = vfmaq_f32(u, vfmsq_f32(d, u, u), vmulq_f32(x, vdupq_n_f32(0.5)));
return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(u), vceqq_f32(d, vdupq_n_f32(0.0f))));
}
static INLINE vfloat vrec_vf_vf(vfloat y) {
float32x4_t t = vrecpeq_f32(y);
t = vmulq_f32(t, vrecpsq_f32(y, t));
t = vfmaq_f32(t, vfmsq_f32(vdupq_n_f32(1.0f), y, t), t);
return vfmaq_f32(t, vfmsq_f32(vdupq_n_f32(1.0f), y, t), t);
}
static INLINE vfloat vrecsqrt_vf_vf(vfloat d) {
float32x4_t x = vrsqrteq_f32(d);
x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));
return vfmaq_f32(x, vfmsq_f32(vdupq_n_f32(1), x, vmulq_f32(x, d)), vmulq_f32(x, vdupq_n_f32(0.5)));
}
#else // #if CONFIG == 4
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmlaq_f32(z, x, y); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmlsq_f32(z, x, y); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vneg_vf_vf(vmlsq_f32(z, x, y)); }
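// Non-FMA path: refine the reciprocal estimate once with vrecpsq, then apply
// one Newton-Raphson step directly to the quotient: t*(2 - x*d).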
static INLINE vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) {
float32x4_t x = vrecpeq_f32(d);
x = vmulq_f32(x, vrecpsq_f32(d, x));
float32x4_t t = vmulq_f32(n, x);
return vmlsq_f32(vaddq_f32(t, t), vmulq_f32(t, x), d);
}
static INLINE vfloat vsqrt_vf_vf(vfloat d) {
float32x4_t x = vrsqrteq_f32(d);
x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));
float32x4_t u = vmulq_f32(x, d);
u = vmlaq_f32(u, vmlsq_f32(d, u, u), vmulq_f32(x, vdupq_n_f32(0.5)));
return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(u), vceqq_f32(d, vdupq_n_f32(0.0f))));
}
static INLINE vfloat vrec_vf_vf(vfloat d) {
float32x4_t x = vrecpeq_f32(d);
x = vmulq_f32(x, vrecpsq_f32(d, x));
return vmlsq_f32(vaddq_f32(x, x), vmulq_f32(x, x), d);
}
static INLINE vfloat vrecsqrt_vf_vf(vfloat d) {
float32x4_t x = vrsqrteq_f32(d);
x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));
return vmlaq_f32(x, vmlsq_f32(vdupq_n_f32(1), x, vmulq_f32(x, d)), vmulq_f32(x, vdupq_n_f32(0.5)));
}
#endif // #if CONFIG == 4
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return vmaxq_f32(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return vminq_f32(x, y); }
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vceqq_f32(x, y); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vmvnq_u32(vceqq_f32(x, y)); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vcltq_f32(x, y); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vcleq_f32(x, y); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vcgtq_f32(x, y); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vcgeq_f32(x, y); }
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return vaddq_s32(x, y); }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return vsubq_s32(x, y); }
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vnegq_s32(e); }
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return vandq_s32(x, y); }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return vbicq_s32(y, x); }
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return vorrq_s32(x, y); }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return veorq_s32(x, y); }
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)vandq_u32(x, (vopmask)y); }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)vbicq_u32((vopmask)y, x); }
#define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c)
#define vsrl_vi2_vi2_i(x, c) vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c))
#define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c)
//@#define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c)
//@#define vsrl_vi2_vi2_i(x, c) vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c))
//@#define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c)
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return vceqq_s32(x, y); }
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return vcgtq_s32(x, y); }
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return (vint2)vceqq_s32(x, y); }
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return (vint2)vcgtq_s32(x, y); }
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { return (vint2)vbslq_u32(m, (vmask)x, (vmask)y); }
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask mask, vfloat x, vfloat y) {
return (vfloat)vbslq_u32(mask, (vmask)x, (vmask)y);
}
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}
static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
// This function is needed when debugging on MSVC.
static INLINE float vcast_f_vf(vfloat v) {
float p[4];
vst1q_f32 (p, v);
return p[0];
}
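// Availability probe: executes an actual NEON add so the check exercises the
// instruction set itself.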
static INLINE int vavailability_i(int name) {
if (name != 2) return 0;
return vcast_f_vf(vadd_vf_vf_vf(vcast_vf_f(name), vcast_vf_f(name))) != 0.0;
}
static INLINE vfloat vload_vf_p(const float *ptr) { return vld1q_f32(__builtin_assume_aligned(ptr, 16)); }
static INLINE vfloat vloadu_vf_p(const float *ptr) { return vld1q_f32(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32(__builtin_assume_aligned(ptr, 16), v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
return ((vfloat) {
ptr[vgetq_lane_s32(vi2, 0)],
ptr[vgetq_lane_s32(vi2, 1)],
ptr[vgetq_lane_s32(vi2, 2)],
ptr[vgetq_lane_s32(vi2, 3)]
});
}
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)PNMASKf); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)NPMASKf); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat d0, vfloat d1) { return vadd_vf_vf_vf(d0, vnegpos_vf_vf(d1)); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
static INLINE vfloat vrev21_vf_vf(vfloat d0) { return vrev64q_f32(d0); }
static INLINE vfloat vreva2_vf_vf(vfloat d0) { return vcombine_f32(vget_high_f32(d0), vget_low_f32(d0)); }
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); }
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));
vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));
}
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));
vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));
}


@@ -0,0 +1,873 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#if CONFIG == 1 || CONFIG == 2 || CONFIG == 3 || CONFIG == 4
#ifndef __VSX__
#error Please specify -mcpu=power8 or -mcpu=power9
#endif
#else
#error CONFIG macro invalid or not defined
#endif
#define ENABLE_DP
//@#define ENABLE_DP
#define LOG2VECTLENDP 1
//@#define LOG2VECTLENDP 1
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)
#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)
#if CONFIG == 1 || CONFIG == 3
#define ENABLE_FMA_DP
//@#define ENABLE_FMA_DP
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP
#endif
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT
#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#if !defined(SLEEF_GENHEADER)
#include <altivec.h>
// undef altivec types since CPP and C99 use them as compiler tokens
// use __vector and __bool instead
#undef vector
#undef bool
#include <stdint.h>
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)
#if CONFIG == 1 || CONFIG == 2
#define ISANAME "VSX"
#else
#define ISANAME "VSX-3"
#endif
#define DFTPRIORITY 25
static INLINE int vavailability_i(int name) { return 3; }
static INLINE void vprefetch_v_p(const void *ptr) { }
/**********************************************
** Types
***********************************************/
typedef __vector unsigned int vmask;
// using __bool with typedef may cause ambiguous errors
#define vopmask __vector __bool int
//@#define vopmask __vector __bool int
typedef __vector signed int vint;
typedef __vector signed int vint2;
typedef __vector float vfloat;
typedef __vector double vdouble;
// internal use types
typedef __vector unsigned int v__u32;
typedef __vector unsigned char v__u8;
typedef __vector signed long long v__i64;
typedef __vector unsigned long long v__u64;
#define v__b64 __vector __bool long long
typedef __vector long long vint64;
typedef __vector unsigned long long vuint64;
typedef struct {
vmask x, y;
} vquad;
typedef vquad vargquad;
/**********************************************
** Utilities
***********************************************/
#define vset__vi(v0, v1) ((vint) {v0, v1, v0, v1})
#define vset__vi2(...) ((vint2) {__VA_ARGS__})
#define vset__vm(...) ((vmask) {__VA_ARGS__})
#define vset__vo(...) ((vopmask) {__VA_ARGS__})
#define vset__vf(...) ((vfloat) {__VA_ARGS__})
#define vset__vd(...) ((vdouble) {__VA_ARGS__})
#define vset__u8(...) ((v__u8) {__VA_ARGS__})
#define vset__u32(...) ((v__u32) {__VA_ARGS__})
#define vset__s64(...) ((v__i64) {__VA_ARGS__})
#define vset__u64(...) ((v__u64) {__VA_ARGS__})
#define vsetall__vi(v) vset__vi(v, v)
#define vsetall__vi2(v) vset__vi2(v, v, v, v)
#define vsetall__vm(v) vset__vm(v, v, v, v)
#define vsetall__vo(v) vset__vo(v, v, v, v)
#define vsetall__vf(v) vset__vf(v, v, v, v)
#define vsetall__vd(v) vset__vd(v, v)
#define vsetall__u8(v) vset__u8(v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v)
#define vsetall__u32(v) vset__u32(v, v, v, v)
#define vsetall__s64(v) vset__s64(v, v)
#define vsetall__u64(v) vset__u64(v, v)
#define vzero__vi() vsetall__vi(0)
#define vzero__vi2() vsetall__vi2(0)
#define vzero__vm() vsetall__vm(0)
#define vzero__vo() vsetall__vo(0)
#define vzero__vf() vsetall__vf(0)
#define vzero__vd() vsetall__vd(0)
#define vzero__u8() vsetall__u8(0)
#define vzero__u32() vsetall__u32(0)
#define vzero__s64() vsetall__s64(0)
#define vzero__u64() vsetall__u64(0)
//// Swap doubleword elements
#if defined(__clang__) || __GNUC__ >= 7
static INLINE v__u64 v__swapd_u64(v__u64 v)
{ return vec_xxpermdi(v, v, 2); }
#else
static INLINE v__u64 v__swapd_u64(v__u64 v)
{
__asm__ __volatile__("xxswapd %x0,%x1" : "=wa" (v) : "wa" (v));
return v;
}
#endif
/**********************************************
** Memory
***********************************************/
////////////// Unaligned memory access //////////////
/**
* It's not safe to use vector assignment via (cast & dereference) for unaligned
* memory access with almost all clang versions and with GCC 8 when VSX3 isn't
* enabled; these compilers tend to generate 'lvx/stvx' instructions instead of
* 'lxvd2x/lxvw4x/stxvd2x/stxvw4x'. For more information, check
* https://github.com/seiko2plus/vsx_mem_test
*
* TODO: check GCC(9, 10)
*/
//// load
#if defined(__POWER9_VECTOR__) || (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8)
static vint vloadu_vi_p(const int32_t *ptr)
{ return *((vint*)ptr); }
static INLINE vint2 vloadu_vi2_p(const int32_t *ptr)
{ return *((vint2*)ptr); }
static INLINE vfloat vloadu_vf_p(const float *ptr)
{ return *((vfloat*)ptr); }
static INLINE vdouble vloadu_vd_p(const double *ptr)
{ return *((vdouble*)ptr); }
#else
static vint vloadu_vi_p(const int32_t *ptr)
{ return vec_vsx_ld(0, ptr); }
static INLINE vint2 vloadu_vi2_p(const int32_t *ptr)
{ return vec_vsx_ld(0, ptr); }
static INLINE vfloat vloadu_vf_p(const float *ptr)
{ return vec_vsx_ld(0, ptr); }
static INLINE vdouble vloadu_vd_p(const double *ptr)
{ return vec_vsx_ld(0, ptr); }
#endif
//// store
#if defined(__POWER9_VECTOR__) || (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8)
static void vstoreu_v_p_vi(int32_t *ptr, vint v)
{ *((vint*)ptr) = v; }
static void vstoreu_v_p_vi2(int32_t *ptr, vint2 v)
{ *((vint2*)ptr) = v; }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v)
{ *((vfloat*)ptr) = v; }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v)
{ *((vdouble*)ptr) = v; }
#else
static void vstoreu_v_p_vi(int32_t *ptr, vint v)
{ vec_vsx_st(v, 0, ptr); }
static void vstoreu_v_p_vi2(int32_t *ptr, vint2 v)
{ vec_vsx_st(v, 0, ptr); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v)
{ vec_vsx_st(v, 0, ptr); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v)
{ vec_vsx_st(v, 0, ptr); }
#endif
////////////// aligned memory access //////////////
//// load
static INLINE vfloat vload_vf_p(const float *ptr)
{ return vec_ld(0, ptr); }
static INLINE vdouble vload_vd_p(const double *ptr)
{ return *((vdouble*)ptr); }
//// store
static INLINE void vstore_v_p_vf(float *ptr, vfloat v)
{ vec_st(v, 0, ptr); }
static INLINE void vstore_v_p_vd(double *ptr, vdouble v)
{ *((vdouble*)ptr) = v; }
////////////// non-temporal memory access //////////////
//// store
static INLINE void vstream_v_p_vf(float *ptr, vfloat v)
{ vstore_v_p_vf(ptr, v); }
static INLINE void vstream_v_p_vd(double *ptr, vdouble v)
{ vstore_v_p_vd(ptr, v); }
////////////// LUT //////////////
//// load
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi)
{ return vset__vd(ptr[vec_extract(vi, 0)], ptr[vec_extract(vi, 1)]); }
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2)
{
return vset__vf(
ptr[vec_extract(vi2, 0)], ptr[vec_extract(vi2, 1)],
ptr[vec_extract(vi2, 2)], ptr[vec_extract(vi2, 3)]
);
}
//// store
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v)
{
const v__u64 vll = (v__u64)v;
float *ptr_low = ptr + offset*2;
float *ptr_high = ptr + (offset + step)*2;
*((uint64_t*)ptr_low) = vec_extract(vll, 0);
*((uint64_t*)ptr_high) = vec_extract(vll, 1);
}
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v)
{ vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v)
{ vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v)
{ vscatter2_v_p_i_i_vd(ptr, offset, step, v); }
/**********************************************
** Misc
**********************************************/
// vector with a specific value set to all lanes (Vector Splat)
static INLINE vint vcast_vi_i(int i)
{ return vsetall__vi(i); }
static INLINE vint2 vcast_vi2_i(int i)
{ return vsetall__vi2(i); }
static INLINE vfloat vcast_vf_f(float f)
{ return vsetall__vf(f); }
static INLINE vdouble vcast_vd_d(double d)
{ return vsetall__vd(d); }
// cast
static INLINE vint2 vcast_vi2_vm(vmask vm)
{ return (vint2)vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi)
{ return (vmask)vi; }
// get the first element
static INLINE float vcast_f_vf(vfloat v)
{ return vec_extract(v, 0); }
static INLINE double vcast_d_vd(vdouble v)
{ return vec_extract(v, 0); }
static INLINE vmask vreinterpret_vm_vd(vdouble vd)
{ return (vmask)vd; }
static INLINE vdouble vreinterpret_vd_vm(vmask vm)
{ return (vdouble)vm; }
static INLINE vmask vreinterpret_vm_vf(vfloat vf)
{ return (vmask)vf; }
static INLINE vfloat vreinterpret_vf_vm(vmask vm)
{ return (vfloat)vm; }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi)
{ return (vfloat)vi; }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf)
{ return (vint2)vf; }
// per element select via mask (blend)
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y)
{ return vec_sel(y, x, (v__b64)o); }
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y)
{ return vec_sel(y, x, o); }
static INLINE vint vsel_vi_vo_vi_vi(vopmask o, vint x, vint y)
{ return vec_sel(y, x, o); }
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y)
{ return vec_sel(y, x, o); }
static INLINE vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0)
{
return vsel_vf_vo_vf_vf(o, vsetall__vf(v1), vsetall__vf(v0));
}
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2)
{
return vsel_vf_vo_vf_vf(o0, vsetall__vf(d0), vsel_vf_vo_f_f(o1, d1, d2));
}
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3)
{
return vsel_vf_vo_vf_vf(o0, vsetall__vf(d0), vsel_vf_vo_vf_vf(o1, vsetall__vf(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}
static INLINE vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0)
{
return vsel_vd_vo_vd_vd(o, vsetall__vd(v1), vsetall__vd(v0));
}
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2)
{
return vsel_vd_vo_vd_vd(o0, vsetall__vd(d0), vsel_vd_vo_d_d(o1, d1, d2));
}
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3)
{
return vsel_vd_vo_vd_vd(o0, vsetall__vd(d0), vsel_vd_vo_vd_vd(o1, vsetall__vd(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}
static INLINE int vtestallones_i_vo32(vopmask g)
{ return vec_all_ne((vint2)g, vzero__vi2()); }
static INLINE int vtestallones_i_vo64(vopmask g)
{ return vec_all_ne((v__i64)g, vzero__s64()); }
/**********************************************
** Conversions
**********************************************/
////////////// Numeric //////////////
// pack 64-bit mask to 32-bit
static INLINE vopmask vcast_vo32_vo64(vopmask m)
{ return (vopmask)vec_pack((v__u64)m, (v__u64)m); }
// narrow 64-bit lanes to 32-bit, taking the upper word of each lane
static INLINE vint vcastu_vi_vi2(vint2 vi2)
{ return vec_mergeo(vi2, vec_splat(vi2, 3)); }
static INLINE vint vcastu_vi_vm(vmask vi2)
{ return vec_mergeo((vint2)vi2, vec_splat((vint2)vi2, 3)); }
// expand lower 32-bit mask
static INLINE vopmask vcast_vo64_vo32(vopmask m)
{ return vec_mergeh(m, m); }
// zero-extend 32-bit integers into the upper half of each 64-bit lane
static INLINE vint2 vcastu_vi2_vi(vint vi)
{ return vec_mergeh(vzero__vi(), vi); }
static INLINE vmask vcastu_vm_vi(vint vi)
{ return (vmask)vec_mergeh(vzero__vi(), vi); }
static INLINE vopmask vcast_vo_i(int i) {
i = i ? -1 : 0;
return (vopmask) { i, i, i, i };
}
// signed int to single-precision
static INLINE vfloat vcast_vf_vi2(vint2 vi)
{
vfloat ret;
#if defined(__clang__) || __GNUC__ >= 9
ret = __builtin_convertvector(vi, vfloat);
#else
__asm__ __volatile__("xvcvsxwsp %x0,%x1" : "=wa" (ret) : "wa" (vi));
#endif
return ret;
}
// lower signed int to double-precision
static INLINE vdouble vcast_vd_vi(vint vi)
{
vdouble ret;
vint swap = vec_mergeh(vi, vi);
#if defined(__clang__) || __GNUC__ >= 7
ret = __builtin_vsx_xvcvsxwdp(swap);
#else
__asm__ __volatile__("xvcvsxwdp %x0,%x1" : "=wa" (ret) : "wa" (swap));
#endif
return ret;
}
// zip two scalars
static INLINE vmask vcast_vm_i_i(int l, int h)
{ return (vmask)vec_mergeh(vsetall__vi2(h), vsetall__vi2(l)); }
static INLINE vmask vcast_vm_i64(int64_t i) {
return (vmask)vsetall__s64(i);
}
static INLINE vmask vcast_vm_u64(uint64_t i) {
return (vmask)vsetall__u64(i);
}
////////////// Truncation //////////////
static INLINE vint2 vtruncate_vi2_vf(vfloat vf)
{
vint2 ret;
#if defined(__clang__) || __GNUC__ >= 9
ret = __builtin_convertvector(vf, vint2);
#else
__asm__ __volatile__("xvcvspsxws %x0,%x1" : "=wa" (ret) : "wa" (vf));
#endif
return ret;
}
static INLINE vint vtruncate_vi_vd(vdouble vd)
{
vint ret;
#if defined(__clang__) || __GNUC__ >= 7
ret = __builtin_vsx_xvcvdpsxws(vd);
#else
__asm__ __volatile__("xvcvdpsxws %x0,%x1" : "=wa" (ret) : "wa" (vd));
#endif
return vec_mergeo(ret, vec_splat(ret, 3));
}
static INLINE vdouble vtruncate_vd_vd(vdouble vd)
{ return vec_trunc(vd); }
static INLINE vfloat vtruncate_vf_vf(vfloat vf)
{ return vec_trunc(vf); }
////////////// Rounding //////////////
// towards the nearest even
static INLINE vint vrint_vi_vd(vdouble vd)
{ return vtruncate_vi_vd(vec_rint(vd)); }
static INLINE vint2 vrint_vi2_vf(vfloat vf)
{ return vtruncate_vi2_vf(vec_rint(vf)); }
static INLINE vdouble vrint_vd_vd(vdouble vd)
{ return vec_rint(vd); }
static INLINE vfloat vrint_vf_vf(vfloat vf)
{ return vec_rint(vf); }
/**********************************************
** Logical
**********************************************/
////////////// And //////////////
static INLINE vint vand_vi_vi_vi(vint x, vint y)
{ return vec_and(x, y); }
static INLINE vint vand_vi_vo_vi(vopmask x, vint y)
{ return vec_and((vint)x, y); }
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y)
{ return vec_and(x, y); }
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y)
{ return (vint2)vec_and((vint2)x, y); }
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y)
{ return vec_and(x, y); }
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y)
{ return vec_and((vmask)x, y); }
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y)
{ return vec_and((vmask)x, y); }
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y)
{ return vec_and(x, y); }
////////////// Or //////////////
static INLINE vint vor_vi_vi_vi(vint x, vint y)
{ return vec_or(x, y); }
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y)
{ return vec_or(x, y); }
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y)
{ return vec_or(x, y); }
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y)
{ return vec_or((vmask)x, y); }
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y)
{ return vec_or((vmask)x, y); }
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y)
{ return vec_or(x, y); }
////////////// Xor //////////////
static INLINE vint vxor_vi_vi_vi(vint x, vint y)
{ return vec_xor(x, y); }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y)
{ return vec_xor(x, y); }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y)
{ return vec_xor(x, y); }
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y)
{ return vec_xor((vmask)x, y); }
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y)
{ return vec_xor((vmask)x, y); }
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y)
{ return vec_xor(x, y); }
////////////// Not //////////////
static INLINE vopmask vnot_vo_vo(vopmask o)
{ return vec_nor(o, o); }
////////////// And Not ((~x) & y) //////////////
static INLINE vint vandnot_vi_vi_vi(vint x, vint y)
{ return vec_andc(y, x); }
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y)
{ return vec_andc(y, (vint)x); }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y)
{ return vec_andc(y, x); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y)
{ return vec_andc(y, x); }
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y)
{ return vec_andc(y, x); }
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y)
{ return vec_andc(y, x); }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y)
{ return vec_andc(y, x); }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y)
{ return vec_andc(y, (vint2)x); }
/**********************************************
** Comparison
**********************************************/
////////////// Equal //////////////
static INLINE vint veq_vi_vi_vi(vint x, vint y)
{ return (vint)vec_cmpeq(x, y); }
static INLINE vopmask veq_vo_vi_vi(vint x, vint y)
{ return vec_cmpeq(x, y); }
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y)
{ return vec_cmpeq(x, y); }
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y)
{ return (vint2)vec_cmpeq(x, y); }
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y)
{ return (vopmask)vec_cmpeq((v__u64)x, (v__u64)y); }
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y)
{ return vec_cmpeq(x, y); }
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y)
{ return (vopmask)vec_cmpeq(x, y); }
////////////// Not Equal //////////////
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y)
{ return vnot_vo_vo(vec_cmpeq(x, y)); }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y)
{ return vnot_vo_vo((vopmask)vec_cmpeq(x, y)); }
////////////// Less Than //////////////
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y)
{ return vec_cmplt(x, y); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y)
{ return (vopmask)vec_cmplt(x, y); }
////////////// Greater Than //////////////
static INLINE vint vgt_vi_vi_vi(vint x, vint y)
{ return (vint)vec_cmpgt(x, y); }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y)
{ return vec_cmpgt(x, y);}
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y)
{ return (vint2)vec_cmpgt(x, y); }
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y)
{ return vec_cmpgt(x, y); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y)
{ return vec_cmpgt(x, y); }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y)
{ return (vopmask)vec_cmpgt(x, y); }
////////////// Less Than Or Equal //////////////
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y)
{ return vec_cmple(x, y); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y)
{ return (vopmask)vec_cmple(x, y); }
////////////// Greater Than Or Equal //////////////
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y)
{ return vec_cmpge(x, y); }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y)
{ return (vopmask)vec_cmpge(x, y); }
////////////// Special Cases //////////////
static INLINE vopmask visinf_vo_vf(vfloat d)
{ return vec_cmpeq(vec_abs(d), vsetall__vf(SLEEF_INFINITYf)); }
static INLINE vopmask visinf_vo_vd(vdouble d)
{ return (vopmask)vec_cmpeq(vec_abs(d), vsetall__vd(SLEEF_INFINITY)); }
static INLINE vopmask vispinf_vo_vf(vfloat d)
{ return vec_cmpeq(d, vsetall__vf(SLEEF_INFINITYf)); }
static INLINE vopmask vispinf_vo_vd(vdouble d)
{ return (vopmask)vec_cmpeq(d, vsetall__vd(SLEEF_INFINITY)); }
static INLINE vopmask visminf_vo_vf(vfloat d)
{ return vec_cmpeq(d, vsetall__vf(-SLEEF_INFINITYf)); }
static INLINE vopmask visminf_vo_vd(vdouble d)
{ return (vopmask)vec_cmpeq(d, vsetall__vd(-SLEEF_INFINITY)); }
static INLINE vopmask visnan_vo_vf(vfloat d)
{ return vnot_vo_vo(vec_cmpeq(d, d)); }
static INLINE vopmask visnan_vo_vd(vdouble d)
{ return vnot_vo_vo((vopmask)vec_cmpeq(d, d)); }
/**********************************************
** Shift
**********************************************/
////////////// Left //////////////
static INLINE vint vsll_vi_vi_i(vint x, int c)
{ return vec_sl (x, vsetall__u32(c)); }
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c)
{ return vec_sl(x, vsetall__u32(c)); }
////////////// Right //////////////
static INLINE vint vsrl_vi_vi_i(vint x, int c)
{ return vec_sr(x, vsetall__u32(c)); }
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c)
{ return vec_sr(x, vsetall__u32(c)); }
////////////// Algebraic Right //////////////
static INLINE vint vsra_vi_vi_i(vint x, int c)
{ return vec_sra(x, vsetall__u32(c)); }
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c)
{ return vec_sra(x, vsetall__u32(c)); }
/**********************************************
** Reorder
**********************************************/
////////////// Reverse //////////////
// Reverse elements order inside the lower and higher parts
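// vec_mergeo duplicates the odd elements; vec_mergee then interleaves them
// with the even ones, swapping each adjacent pair of elements.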
static INLINE vint2 vrev21_vi2_vi2(vint2 vi)
{ return vec_mergee(vec_mergeo(vi, vi), vi); }
static INLINE vfloat vrev21_vf_vf(vfloat vf)
{ return (vfloat)vrev21_vi2_vi2((vint2)vf); }
// Swap the lower and higher parts
static INLINE vfloat vreva2_vf_vf(vfloat vf)
{ return (vfloat)v__swapd_u64((v__u64)vf); }
static INLINE vdouble vrev21_vd_vd(vdouble vd)
{ return (vdouble)v__swapd_u64((v__u64)vd); }
static INLINE vdouble vreva2_vd_vd(vdouble vd)
{ return vd; }
/**********************************************
** Arithmetic
**********************************************/
////////////// Negation //////////////
static INLINE vint vneg_vi_vi(vint e) {
#if defined(__clang__) || __GNUC__ >= 9
return vec_neg(e);
#else
return vec_sub(vzero__vi(), e);
#endif
}
static INLINE vint2 vneg_vi2_vi2(vint2 e)
{ return vneg_vi_vi(e); }
static INLINE vfloat vneg_vf_vf(vfloat d)
{
vfloat ret;
#if defined(__clang__) || __GNUC__ >= 9
ret = vec_neg(d);
#else
__asm__ __volatile__("xvnegsp %x0,%x1" : "=wa" (ret) : "wa" (d));
#endif
return ret;
}
static INLINE vdouble vneg_vd_vd(vdouble d)
{
vdouble ret;
#if defined(__clang__) || __GNUC__ >= 9
ret = vec_neg(d);
#else
__asm__ __volatile__("xvnegdp %x0,%x1" : "=wa" (ret) : "wa" (d));
#endif
return ret;
}
static INLINE vfloat vposneg_vf_vf(vfloat d)
{ return vec_xor(d, vset__vf(+0.0f, -0.0f, +0.0f, -0.0f)); }
static INLINE vdouble vposneg_vd_vd(vdouble d)
{ return vec_xor(d, vset__vd(+0.0, -0.0)); }
static INLINE vfloat vnegpos_vf_vf(vfloat d)
{ return vec_xor(d, vset__vf(-0.0f, +0.0f, -0.0f, +0.0f)); }
static INLINE vdouble vnegpos_vd_vd(vdouble d)
{ return vec_xor(d, vset__vd(-0.0, +0.0)); }
////////////// Addition //////////////
static INLINE vint vadd_vi_vi_vi(vint x, vint y)
{ return vec_add(x, y); }
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y)
{ return vec_add(x, y); }
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y)
{ return vec_add(x, y); }
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y)
{ return vec_add(x, y); }
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y)
{ return (vmask)vec_add((v__i64)x, (v__i64)y); }
////////////// Subtraction //////////////
static INLINE vint vsub_vi_vi_vi(vint x, vint y)
{ return vec_sub(x, y); }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y)
{ return vec_sub(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y)
{ return vec_sub(x, y); }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y)
{ return vec_sub(x, y); }
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y)
{ return vec_add(x, vnegpos_vd_vd(y)); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y)
{ return vec_add(x, vnegpos_vf_vf(y)); }
////////////// Multiplication //////////////
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y)
{ return vec_mul(x, y); }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y)
{ return vec_mul(x, y); }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y)
{ return vec_div(x, y); }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y)
{ return vec_div(x, y); }
static INLINE vfloat vrec_vf_vf(vfloat x)
{ return vec_div(vsetall__vf(1.0f), x); }
static INLINE vdouble vrec_vd_vd(vdouble x)
{ return vec_div(vsetall__vd(1.0), x); }
/**********************************************
** Math
**********************************************/
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y)
{ return vec_max(x, y); }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y)
{ return vec_max(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y)
{ return vec_min(x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y)
{ return vec_min(x, y); }
static INLINE vfloat vabs_vf_vf(vfloat f)
{ return vec_abs(f); }
static INLINE vdouble vabs_vd_vd(vdouble d)
{ return vec_abs(d); }
static INLINE vfloat vsqrt_vf_vf(vfloat f)
{ return vec_sqrt(f); }
static INLINE vdouble vsqrt_vd_vd(vdouble d)
{ return vec_sqrt(d); }
/**********************************************
** FMA3
**********************************************/
#if CONFIG == 1 || CONFIG == 3
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
{ return vec_madd(x, y, z); }
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
{ return vec_madd(x, y, z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
{ return vec_msub(x, y, z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
{ return vec_msub(x, y, z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
{ return vec_nmsub(x, y, z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
{ return vec_nmsub(x, y, z); }
#else
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
{ return vec_add(vec_mul(x, y), z); }
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
{ return vec_add(vec_mul(x, y), z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
{ return vec_sub(vec_mul(x, y), z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
{ return vec_sub(vec_mul(x, y), z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
{ return vec_sub(z, vec_mul(x, y)); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
{ return vec_sub(z, vec_mul(x, y)); }
#endif
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
{ return vec_madd(x, y, z); }
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
{ return vec_madd(x, y, z); }
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
{ return vec_madd(x, y, z); }
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
{ return vec_madd(x, y, z); }
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
{ return vec_msub(x, y, z); }
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
{ return vec_msub(x, y, z); }
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
{ return vec_nmsub(x, y, z); }
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
{ return vec_nmsub(x, y, z); }
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
{ return vec_nmadd(x, y, z); }
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
{ return vec_nmadd(x, y, z); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
{ return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
{ return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
//
static vquad loadu_vq_p(void *p) {
vquad vq;
memcpy(&vq, p, VECTLENDP * 16);
return vq;
}
static INLINE vquad cast_vq_aq(vargquad aq) {
vquad vq;
memcpy(&vq, &aq, VECTLENDP * 16);
return vq;
}
static INLINE vargquad cast_aq_vq(vquad vq) {
vargquad aq;
memcpy(&aq, &vq, VECTLENDP * 16);
return aq;
}
static INLINE int vtestallzeros_i_vo64(vopmask g) {
return vec_all_eq((__vector signed long long)g, vzero__s64());
}
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) {
return (vmask)vec_sel((__vector signed long long)y, (__vector signed long long)x, (v__b64)o);
}
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) {
return (vmask)vec_sub((__vector signed long long)x, (__vector signed long long)y);
}
static INLINE vmask vneg64_vm_vm(vmask x) {
return (vmask)vec_sub((__vector signed long long) {0, 0}, (__vector signed long long)x);
}
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
return (vopmask)vec_cmpgt((__vector signed long long)x, (__vector signed long long)y);
}
#define vsll64_vm_vm_i(x, c) ((vmask)vec_sl((__vector signed long long)x, (__vector unsigned long long)vsetall__vm(c)))
#define vsrl64_vm_vm_i(x, c) ((vmask)vec_sr((__vector signed long long)x, (__vector unsigned long long)vsetall__vm(c)))
static INLINE vint vcast_vi_vm(vmask vm) {
return (vint) { vm[0], vm[2] };
}
static INLINE vmask vcast_vm_vi(vint vi) {
return (vmask) (__vector signed long long) { vi[0], vi[1] };
}
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return (vmask)v; }
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return (vint64)m; }
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return (vmask)v; }
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return (vuint64)m; }


@@ -0,0 +1,561 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdint.h>
#include <math.h>
#include "misc.h"
#ifndef CONFIG
#error CONFIG macro not defined
#endif
#define ENABLE_DP
//@#define ENABLE_DP
#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENDP CONFIG
//@#define LOG2VECTLENDP CONFIG
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT
#define DFTPRIORITY LOG2VECTLENDP
#define ISANAME "Pure C Array"
typedef union {
uint32_t u[VECTLENDP*2];
uint64_t x[VECTLENDP];
double d[VECTLENDP];
float f[VECTLENDP*2];
int32_t i[VECTLENDP*2];
} versatileVector;
typedef versatileVector vmask;
typedef versatileVector vopmask;
typedef versatileVector vdouble;
typedef versatileVector vint;
typedef versatileVector vfloat;
typedef versatileVector vint2;
typedef union {
uint8_t u[sizeof(long double)*VECTLENDP];
long double ld[VECTLENDP];
} longdoubleVector;
typedef longdoubleVector vmaskl;
typedef longdoubleVector vlongdouble;
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
typedef union {
uint8_t u[sizeof(Sleef_quad)*VECTLENDP];
Sleef_quad q[VECTLENDP];
} quadVector;
typedef quadVector vmaskq;
typedef quadVector vquad;
#endif
//
static INLINE int vavailability_i(int name) { return -1; }
static INLINE void vprefetch_v_p(const void *ptr) { }
static INLINE int vtestallones_i_vo64(vopmask g) {
int ret = 1; for(int i=0;i<VECTLENDP;i++) ret = ret && g.x[i]; return ret;
}
static INLINE int vtestallones_i_vo32(vopmask g) {
int ret = 1; for(int i=0;i<VECTLENSP;i++) ret = ret && g.u[i]; return ret;
}
//
static vint2 vloadu_vi2_p(int32_t *p) {
vint2 vi;
for(int i=0;i<VECTLENSP;i++) vi.i[i] = p[i];
return vi;
}
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) {
for(int i=0;i<VECTLENSP;i++) p[i] = v.i[i];
}
static vint vloadu_vi_p(int32_t *p) {
vint vi;
for(int i=0;i<VECTLENDP;i++) vi.i[i] = p[i];
return vi;
}
static void vstoreu_v_p_vi(int32_t *p, vint v) {
for(int i=0;i<VECTLENDP;i++) p[i] = v.i[i];
}
//
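// Pack a 64-bit lane mask into 32-bit lanes: keep one word per 64-bit lane
// and zero the upper half of the vector.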
static INLINE vopmask vcast_vo32_vo64(vopmask m) {
vopmask ret;
for(int i=0;i<VECTLENDP;i++) ret.u[i] = m.u[i*2+1];
for(int i=VECTLENDP;i<VECTLENDP*2;i++) ret.u[i] = 0;
return ret;
}
static INLINE vopmask vcast_vo64_vo32(vopmask m) {
vopmask ret;
for(int i=0;i<VECTLENDP;i++) ret.u[i*2] = ret.u[i*2+1] = m.u[i];
return ret;
}
static INLINE vmask vcast_vm_i_i(int h, int l) {
vmask ret;
for(int i=0;i<VECTLENDP;i++) {
ret.u[i*2+0] = l;
ret.u[i*2+1] = h;
}
return ret;
}
static INLINE vint2 vcastu_vi2_vi(vint vi) {
vint2 ret;
for(int i=0;i<VECTLENDP;i++) {
ret.i[i*2+0] = 0;
ret.i[i*2+1] = vi.i[i];
}
return ret;
}
static INLINE vint vcastu_vi_vi2(vint2 vi2) {
vint ret;
for(int i=0;i<VECTLENDP;i++) ret.i[i] = vi2.i[i*2+1];
return ret;
}
static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) {
vint ret;
for(int i=0;i<VECTLENDP;i++) ret.i[i] = vi2.i[i];
return ret;
}
static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) {
vint2 ret;
for(int i=0;i<VECTLENDP;i++) ret.i[i] = vi.i[i];
for(int i=VECTLENDP;i<VECTLENDP*2;i++) ret.i[i] = 0;
return ret;
}
static INLINE vdouble vrev21_vd_vd(vdouble d0) {
vdouble r;
for(int i=0;i<VECTLENDP/2;i++) {
r.d[i*2+0] = d0.d[i*2+1];
r.d[i*2+1] = d0.d[i*2+0];
}
return r;
}
static INLINE vdouble vreva2_vd_vd(vdouble d0) {
vdouble r;
for(int i=0;i<VECTLENDP/2;i++) {
r.d[i*2+0] = d0.d[(VECTLENDP/2-1-i)*2+0];
r.d[i*2+1] = d0.d[(VECTLENDP/2-1-i)*2+1];
}
return r;
}
static INLINE vfloat vrev21_vf_vf(vfloat d0) {
vfloat r;
for(int i=0;i<VECTLENSP/2;i++) {
r.f[i*2+0] = d0.f[i*2+1];
r.f[i*2+1] = d0.f[i*2+0];
}
return r;
}
static INLINE vfloat vreva2_vf_vf(vfloat d0) {
vfloat r;
for(int i=0;i<VECTLENSP/2;i++) {
r.f[i*2+0] = d0.f[(VECTLENSP/2-1-i)*2+0];
r.f[i*2+1] = d0.f[(VECTLENSP/2-1-i)*2+1];
}
return r;
}
static INLINE vdouble vcast_vd_d(double d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = d; return ret; }
//
static INLINE vopmask vand_vo_vo_vo (vopmask x, vopmask y) { vopmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] & y.u[i]; return ret; }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { vopmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = y.u[i] & ~x.u[i]; return ret; }
static INLINE vopmask vor_vo_vo_vo (vopmask x, vopmask y) { vopmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] | y.u[i]; return ret; }
static INLINE vopmask vxor_vo_vo_vo (vopmask x, vopmask y) { vopmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] ^ y.u[i]; return ret; }
static INLINE vmask vand_vm_vm_vm (vmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] & y.u[i]; return ret; }
static INLINE vmask vandnot_vm_vm_vm (vmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = y.u[i] & ~x.u[i]; return ret; }
static INLINE vmask vor_vm_vm_vm (vmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] | y.u[i]; return ret; }
static INLINE vmask vxor_vm_vm_vm (vmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] ^ y.u[i]; return ret; }
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] & y.u[i]; return ret; }
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = y.u[i] & ~x.u[i]; return ret; }
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] | y.u[i]; return ret; }
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] ^ y.u[i]; return ret; }
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] & y.u[i]; return ret; }
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = y.u[i] & ~x.u[i]; return ret; }
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] | y.u[i]; return ret; }
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] ^ y.u[i]; return ret; }
//
static INLINE vdouble vsel_vd_vo_vd_vd (vopmask o, vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = (o.u[i] & x.u[i]) | (y.u[i] & ~o.u[i]); return ret; }
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y) { vint2 ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = (o.u[i] & x.u[i]) | (y.u[i] & ~o.u[i]); return ret; }
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
}
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}
static INLINE vdouble vcast_vd_vi(vint vi) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = vi.i[i]; return ret; }
static INLINE vint vtruncate_vi_vd(vdouble vd) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = (int)vd.d[i]; return ret; }
static INLINE vint vrint_vi_vd(vdouble vd) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = vd.d[i] > 0 ? (int)(vd.d[i] + 0.5) : (int)(vd.d[i] - 0.5); return ret; }
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); }
static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); }
static INLINE vint vcast_vi_i(int j) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = j; return ret; }
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.x[i] == y.x[i] ? -1 : 0; return ret; }
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.x[i] + y.x[i]; return ret; }
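// Editor's note: vrint_vi_vd above rounds half away from zero (add +/-0.5,
// then truncate), so 2.5 -> 3 and -2.5 -> -3; note that rint() and the
// FULL_FP_ROUNDING paths elsewhere round ties to even instead. Sketch
// (illustrative only, not part of upstream SLEEF):
#if 0
static int demo_vrint(void) {
  vint r = vrint_vi_vd(vcast_vd_d(2.5));
  return r.i[0] == 3;  // half away from zero, not the even neighbor 2
}
#endif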
//
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { union { vdouble vd; vmask vm; } cnv; cnv.vd = vd; return cnv.vm; }
static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { union { vdouble vd; vint2 vi2; } cnv; cnv.vd = vd; return cnv.vi2; }
static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { union { vint2 vi2; vdouble vd; } cnv; cnv.vi2 = vi; return cnv.vd; }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { union { vmask vm; vdouble vd; } cnv; cnv.vm = vm; return cnv.vd; }
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] + y.d[i]; return ret; }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] - y.d[i]; return ret; }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] * y.d[i]; return ret; }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] / y.d[i]; return ret; }
static INLINE vdouble vrec_vd_vd(vdouble x) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = 1.0 / x.d[i]; return ret; }
static INLINE vdouble vabs_vd_vd(vdouble d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = d.x[i] & 0x7fffffffffffffffULL; return ret; }
static INLINE vdouble vneg_vd_vd(vdouble d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = -d.d[i]; return ret; }
static INLINE vdouble vmla_vd_vd_vd_vd (vdouble x, vdouble y, vdouble z) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] * y.d[i] + z.d[i]; return ret; }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] * y.d[i] - z.d[i]; return ret; }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] > y.d[i] ? x.d[i] : y.d[i]; return ret; }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] < y.d[i] ? x.d[i] : y.d[i]; return ret; }
static INLINE vdouble vposneg_vd_vd(vdouble d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = (i & 1) == 0 ? d.d[i] : -d.d[i]; return ret; }
static INLINE vdouble vnegpos_vd_vd(vdouble d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = (i & 1) == 0 ? -d.d[i] : d.d[i]; return ret; }
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = (i & 1) == 0 ? x.d[i] - y.d[i] : x.d[i] + y.d[i]; return ret; }
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] == y.d[i] ? -1 : 0; return ret; }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] != y.d[i] ? -1 : 0; return ret; }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] < y.d[i] ? -1 : 0; return ret; }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] <= y.d[i] ? -1 : 0; return ret; }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] > y.d[i] ? -1 : 0; return ret; }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] >= y.d[i] ? -1 : 0; return ret; }
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] + y.i[i]; return ret; }
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] - y.i[i]; return ret; }
static INLINE vint vneg_vi_vi (vint x) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = -x.i[i]; return ret; }
static INLINE vint vand_vi_vi_vi(vint x, vint y) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] & y.i[i]; return ret; }
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = y.i[i] & ~x.i[i]; return ret; }
static INLINE vint vor_vi_vi_vi(vint x, vint y) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] | y.i[i]; return ret; }
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] ^ y.i[i]; return ret; }
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return vand_vi_vi_vi(vreinterpretFirstHalf_vi_vi2(x), y); }
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return vandnot_vi_vi_vi(vreinterpretFirstHalf_vi_vi2(x), y); }
static INLINE vint vsll_vi_vi_i(vint x, int c) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] << c; return ret; }
static INLINE vint vsrl_vi_vi_i(vint x, int c) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = ((uint32_t)x.i[i]) >> c; return ret; }
static INLINE vint vsra_vi_vi_i(vint x, int c) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] >> c; return ret; }
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.u[i] = x.i[i] == y.i[i] ? -1 : 0; return ret; }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.u[i] = x.i[i] > y.i[i] ? -1 : 0; return ret; }
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
union { vopmask vo; vint2 vi2; } cnv;
cnv.vo = m;
return vor_vi_vi_vi(vand_vi_vi_vi(vreinterpretFirstHalf_vi_vi2(cnv.vi2), x),
vandnot_vi_vi_vi(vreinterpretFirstHalf_vi_vi2(cnv.vi2), y));
}
static INLINE vopmask visinf_vo_vd(vdouble d) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = (d.d[i] == SLEEF_INFINITY || d.d[i] == -SLEEF_INFINITY) ? -1 : 0; return ret; }
static INLINE vopmask vispinf_vo_vd(vdouble d) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = d.d[i] == SLEEF_INFINITY ? -1 : 0; return ret; }
static INLINE vopmask visminf_vo_vd(vdouble d) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = d.d[i] == -SLEEF_INFINITY ? -1 : 0; return ret; }
static INLINE vopmask visnan_vo_vd(vdouble d) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = d.d[i] != d.d[i] ? -1 : 0; return ret; }
static INLINE vdouble vsqrt_vd_vd(vdouble d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = sqrt(d.d[i]); return ret; }
#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
static INLINE double vcast_d_vd(vdouble v) { return v.d[0]; }
#endif
static INLINE vdouble vload_vd_p(const double *ptr) { return *(vdouble *)ptr; }
static INLINE vdouble vloadu_vd_p(const double *ptr) { vdouble vd; for(int i=0;i<VECTLENDP;i++) vd.d[i] = ptr[i]; return vd; }
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
vdouble vd;
for(int i=0;i<VECTLENDP;i++) vd.d[i] = ptr[vi.i[i]];
return vd;
}
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { for(int i=0;i<VECTLENDP;i++) ptr[i] = v.d[i]; }
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
for(int i=0;i<VECTLENDP/2;i++) {
*(ptr+(offset + step * i)*2 + 0) = v.d[i*2+0];
*(ptr+(offset + step * i)*2 + 1) = v.d[i*2+1];
}
}
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vscatter2_v_p_i_i_vd(ptr, offset, step, v); }
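// Editor's note: vscatter2_v_p_i_i_vd writes the vector as consecutive
// element pairs with a stride: pair i lands at ptr[(offset + step*i)*2] and
// ptr[(offset + step*i)*2 + 1]. With offset = 0 and step = 1 it degenerates
// to a plain contiguous store of VECTLENDP doubles. Sketch (illustrative
// only, not part of upstream SLEEF):
#if 0
static void demo_vscatter2(double *buf) {  // buf must hold >= VECTLENDP doubles
  vscatter2_v_p_i_i_vd(buf, 0, 1, vcast_vd_d(42.0));  // buf[0..VECTLENDP-1] = 42
}
#endif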
//
static INLINE vint2 vcast_vi2_vm(vmask vm) { union { vint2 vi2; vmask vm; } cnv; cnv.vm = vm; return cnv.vi2; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { union { vint2 vi2; vmask vm; } cnv; cnv.vi2 = vi; return cnv.vm; }
static INLINE vfloat vcast_vf_vi2(vint2 vi) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = vi.i[i]; return ret; }
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = (int)vf.f[i]; return ret; }
static INLINE vint2 vrint_vi2_vf(vfloat vf) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = vf.f[i] > 0 ? (int)(vf.f[i] + 0.5) : (int)(vf.f[i] - 0.5); return ret; }
static INLINE vint2 vcast_vi2_i(int j) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = j; return ret; }
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
static INLINE vfloat vrint_vf_vf(vfloat vd) { return vcast_vf_vi2(vrint_vi2_vf(vd)); }
static INLINE vfloat vcast_vf_f(float f) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = f; return ret; }
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { union { vfloat vf; vmask vm; } cnv; cnv.vf = vf; return cnv.vm; }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { union { vfloat vf; vmask vm; } cnv; cnv.vm = vm; return cnv.vf; }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { union { vfloat vf; vint2 vi2; } cnv; cnv.vi2 = vi; return cnv.vf; }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { union { vfloat vf; vint2 vi2; } cnv; cnv.vf = vf; return cnv.vi2; }
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] + y.f[i]; return ret; }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] - y.f[i]; return ret; }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] * y.f[i]; return ret; }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] / y.f[i]; return ret; }
static INLINE vfloat vrec_vf_vf (vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = 1.0 / x.f[i]; return ret; }
static INLINE vfloat vabs_vf_vf(vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] & 0x7fffffff; return ret; }
static INLINE vfloat vneg_vf_vf(vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = -x.f[i]; return ret; }
static INLINE vfloat vmla_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] * y.f[i] + z.f[i]; return ret; }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] * y.f[i] - z.f[i]; return ret; }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] > y.f[i] ? x.f[i] : y.f[i]; return ret; }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] < y.f[i] ? x.f[i] : y.f[i]; return ret; }
static INLINE vfloat vposneg_vf_vf(vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = (i & 1) == 0 ? x.f[i] : -x.f[i]; return ret; }
static INLINE vfloat vnegpos_vf_vf(vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = (i & 1) == 0 ? -x.f[i] : x.f[i]; return ret; }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = (i & 1) == 0 ? x.f[i] - y.f[i] : x.f[i] + y.f[i]; return ret; }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] == y.f[i]) ? -1 : 0); return ret; }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] != y.f[i]) ? -1 : 0); return ret; }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] < y.f[i]) ? -1 : 0); return ret; }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] <= y.f[i]) ? -1 : 0); return ret; }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] > y.f[i]) ? -1 : 0); return ret; }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] >= y.f[i]) ? -1 : 0); return ret; }
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] + y.i[i]; return ret; }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] - y.i[i]; return ret; }
static INLINE vint2 vneg_vi2_vi2(vint2 x) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = -x.i[i]; return ret; }
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] & y.i[i]; return ret; }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = y.i[i] & ~x.i[i]; return ret; }
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] | y.i[i]; return ret; }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] ^ y.i[i]; return ret; }
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = (o.u[i] & x.u[i]) | (y.u[i] & ~o.u[i]); return ret; }
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) {
union { vopmask vo; vint2 vi2; } cnv;
cnv.vo = x;
return vand_vi2_vi2_vi2(cnv.vi2, y);
}
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) {
  union { vopmask vo; vint2 vi2; } cnv;
  cnv.vo = x;
  return vandnot_vi2_vi2_vi2(cnv.vi2, y);
}
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] << c; return ret; }
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = ((uint32_t)x.i[i]) >> c; return ret; }
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] >> c; return ret; }
static INLINE vopmask visinf_vo_vf (vfloat d) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = (d.f[i] == SLEEF_INFINITYf || d.f[i] == -SLEEF_INFINITYf) ? -1 : 0; return ret; }
static INLINE vopmask vispinf_vo_vf(vfloat d) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = d.f[i] == SLEEF_INFINITYf ? -1 : 0; return ret; }
static INLINE vopmask visminf_vo_vf(vfloat d) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = d.f[i] == -SLEEF_INFINITYf ? -1 : 0; return ret; }
static INLINE vopmask visnan_vo_vf (vfloat d) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = d.f[i] != d.f[i] ? -1 : 0; return ret; }
static INLINE vopmask veq_vo_vi2_vi2 (vint2 x, vint2 y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = x.i[i] == y.i[i] ? -1 : 0; return ret; }
static INLINE vopmask vgt_vo_vi2_vi2 (vint2 x, vint2 y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = x.i[i] > y.i[i] ? -1 : 0; return ret; }
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] == y.i[i] ? -1 : 0; return ret; }
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] > y.i[i] ? -1 : 0; return ret; }
static INLINE vfloat vsqrt_vf_vf(vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = sqrtf(x.f[i]); return ret; }
#ifdef _MSC_VER
// This function is needed when debugging on MSVC.
static INLINE float vcast_f_vf(vfloat v) { return v.f[0]; }
#endif
static INLINE vfloat vload_vf_p(const float *ptr) { return *(vfloat *)ptr; }
static INLINE vfloat vloadu_vf_p(const float *ptr) {
vfloat vf;
for(int i=0;i<VECTLENSP;i++) vf.f[i] = ptr[i];
return vf;
}
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
vfloat vf;
for(int i=0;i<VECTLENSP;i++) vf.f[i] = ptr[vi2.i[i]];
return vf;
}
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) {
for(int i=0;i<VECTLENSP;i++) ptr[i] = v.f[i];
}
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
for(int i=0;i<VECTLENSP/2;i++) {
*(ptr+(offset + step * i)*2 + 0) = v.f[i*2+0];
*(ptr+(offset + step * i)*2 + 1) = v.f[i*2+1];
}
}
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
//
static INLINE vlongdouble vcast_vl_l(long double d) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = d; return ret; }
static INLINE vlongdouble vrev21_vl_vl(vlongdouble d0) {
vlongdouble r;
for(int i=0;i<VECTLENDP/2;i++) {
r.ld[i*2+0] = d0.ld[i*2+1];
r.ld[i*2+1] = d0.ld[i*2+0];
}
return r;
}
static INLINE vlongdouble vreva2_vl_vl(vlongdouble d0) {
vlongdouble r;
for(int i=0;i<VECTLENDP/2;i++) {
r.ld[i*2+0] = d0.ld[(VECTLENDP/2-1-i)*2+0];
r.ld[i*2+1] = d0.ld[(VECTLENDP/2-1-i)*2+1];
}
return r;
}
static INLINE vlongdouble vadd_vl_vl_vl(vlongdouble x, vlongdouble y) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = x.ld[i] + y.ld[i]; return ret; }
static INLINE vlongdouble vsub_vl_vl_vl(vlongdouble x, vlongdouble y) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = x.ld[i] - y.ld[i]; return ret; }
static INLINE vlongdouble vmul_vl_vl_vl(vlongdouble x, vlongdouble y) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = x.ld[i] * y.ld[i]; return ret; }
static INLINE vlongdouble vneg_vl_vl(vlongdouble x) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = -x.ld[i]; return ret; }
static INLINE vlongdouble vsubadd_vl_vl_vl(vlongdouble x, vlongdouble y) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = (i & 1) == 0 ? x.ld[i] - y.ld[i] : x.ld[i] + y.ld[i]; return ret; }
static INLINE vlongdouble vmlsubadd_vl_vl_vl_vl(vlongdouble x, vlongdouble y, vlongdouble z) { return vsubadd_vl_vl_vl(vmul_vl_vl_vl(x, y), z); }
static INLINE vlongdouble vposneg_vl_vl(vlongdouble x) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = (i & 1) == 0 ? x.ld[i] : -x.ld[i]; return ret; }
static INLINE vlongdouble vnegpos_vl_vl(vlongdouble x) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = (i & 1) == 0 ? -x.ld[i] : x.ld[i]; return ret; }
static INLINE vlongdouble vload_vl_p(const long double *ptr) { return *(vlongdouble *)ptr; }
static INLINE vlongdouble vloadu_vl_p(const long double *ptr) {
vlongdouble vd;
for(int i=0;i<VECTLENDP;i++) vd.ld[i] = ptr[i];
return vd;
}
static INLINE void vstore_v_p_vl(long double *ptr, vlongdouble v) { *(vlongdouble *)ptr = v; }
static INLINE void vstoreu_v_p_vl(long double *ptr, vlongdouble v) {
for(int i=0;i<VECTLENDP;i++) ptr[i] = v.ld[i];
}
static INLINE void vstream_v_p_vl(long double *ptr, vlongdouble v) { *(vlongdouble *)ptr = v; }
static INLINE void vscatter2_v_p_i_i_vl(long double *ptr, int offset, int step, vlongdouble v) {
for(int i=0;i<VECTLENDP/2;i++) {
*(ptr+(offset + step * i)*2 + 0) = v.ld[i*2+0];
*(ptr+(offset + step * i)*2 + 1) = v.ld[i*2+1];
}
}
static INLINE void vsscatter2_v_p_i_i_vl(long double *ptr, int offset, int step, vlongdouble v) { vscatter2_v_p_i_i_vl(ptr, offset, step, v); }
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
static INLINE vquad vcast_vq_q(Sleef_quad d) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = d; return ret; }
static INLINE vquad vrev21_vq_vq(vquad d0) {
vquad r;
for(int i=0;i<VECTLENDP/2;i++) {
r.q[i*2+0] = d0.q[i*2+1];
r.q[i*2+1] = d0.q[i*2+0];
}
return r;
}
static INLINE vquad vreva2_vq_vq(vquad d0) {
vquad r;
for(int i=0;i<VECTLENDP/2;i++) {
r.q[i*2+0] = d0.q[(VECTLENDP/2-1-i)*2+0];
r.q[i*2+1] = d0.q[(VECTLENDP/2-1-i)*2+1];
}
return r;
}
static INLINE vquad vadd_vq_vq_vq(vquad x, vquad y) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = x.q[i] + y.q[i]; return ret; }
static INLINE vquad vsub_vq_vq_vq(vquad x, vquad y) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = x.q[i] - y.q[i]; return ret; }
static INLINE vquad vmul_vq_vq_vq(vquad x, vquad y) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = x.q[i] * y.q[i]; return ret; }
static INLINE vquad vneg_vq_vq(vquad x) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = -x.q[i]; return ret; }
static INLINE vquad vsubadd_vq_vq_vq(vquad x, vquad y) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = (i & 1) == 0 ? x.q[i] - y.q[i] : x.q[i] + y.q[i]; return ret; }
static INLINE vquad vmlsubadd_vq_vq_vq_vq(vquad x, vquad y, vquad z) { return vsubadd_vq_vq_vq(vmul_vq_vq_vq(x, y), z); }
static INLINE vquad vposneg_vq_vq(vquad x) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = (i & 1) == 0 ? x.q[i] : -x.q[i]; return ret; }
static INLINE vquad vnegpos_vq_vq(vquad x) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = (i & 1) == 0 ? -x.q[i] : x.q[i]; return ret; }
static INLINE vquad vload_vq_p(const Sleef_quad *ptr) { return *(vquad *)ptr; }
static INLINE vquad vloadu_vq_p(const Sleef_quad *ptr) {
vquad vd;
for(int i=0;i<VECTLENDP;i++) vd.q[i] = ptr[i];
return vd;
}
static INLINE void vstore_v_p_vq(Sleef_quad *ptr, vquad v) { *(vquad *)ptr = v; }
static INLINE void vstoreu_v_p_vq(Sleef_quad *ptr, vquad v) {
for(int i=0;i<VECTLENDP;i++) ptr[i] = v.q[i];
}
static INLINE void vstream_v_p_vq(Sleef_quad *ptr, vquad v) { *(vquad *)ptr = v; }
static INLINE void vscatter2_v_p_i_i_vq(Sleef_quad *ptr, int offset, int step, vquad v) {
for(int i=0;i<VECTLENDP/2;i++) {
*(ptr+(offset + step * i)*2 + 0) = v.q[i*2+0];
*(ptr+(offset + step * i)*2 + 1) = v.q[i*2+1];
}
}
static INLINE void vsscatter2_v_p_i_i_vq(Sleef_quad *ptr, int offset, int step, vquad v) { vscatter2_v_p_i_i_vq(ptr, offset, step, v); }
#endif
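// Editor's note: in this portable array helper, the vstream_* and
// vsscatter2_* variants are plain stores -- standard C has no non-temporal
// ("streaming") store hint, so the streaming entry points only gain distinct
// behavior in the ISA-specific helpers.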

View File

@@ -0,0 +1,487 @@
// Copyright Naoki Shibata and contributors 2010 - 2023.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#if !defined(SLEEF_GENHEADER)
#include <stdint.h>
#endif
#ifndef ENABLE_BUILTIN_MATH
#if !defined(SLEEF_GENHEADER)
#include <math.h>
#endif
#define SQRT sqrt
#define SQRTF sqrtf
#define FMA fma
#define FMAF fmaf
#define RINT rint
#define RINTF rintf
#define TRUNC trunc
#define TRUNCF truncf
#else
#define SQRT __builtin_sqrt
#define SQRTF __builtin_sqrtf
#define FMA __builtin_fma
#define FMAF __builtin_fmaf
#define RINT __builtin_rint
#define RINTF __builtin_rintf
#define TRUNC __builtin_trunc
#define TRUNCF __builtin_truncf
#endif
#if !defined(SLEEF_GENHEADER)
#include "misc.h"
#endif
#ifndef CONFIG
#error CONFIG macro not defined
#endif
#define ENABLE_DP
//@#define ENABLE_DP
#define ENABLE_SP
//@#define ENABLE_SP
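// Editor's note: lines beginning with "//@#" appear to be directives for
// SLEEF's inline-header generator (the SLEEF_GENHEADER mode): the generator
// re-emits them as real preprocessor lines in the generated sleefinline_*.h
// headers, which is why each #define here is duplicated in commented form.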
#if CONFIG == 2 || CONFIG == 3
#define ENABLE_FMA_DP
//@#define ENABLE_FMA_DP
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP
#if defined(__AVX2__) || defined(__aarch64__) || defined(__arm__) || defined(__powerpc64__) || defined(__zarch__) || defined(__riscv) || CONFIG == 3
#ifndef FP_FAST_FMA
//@#ifndef FP_FAST_FMA
#define FP_FAST_FMA
//@#define FP_FAST_FMA
#endif
//@#endif
#ifndef FP_FAST_FMAF
//@#ifndef FP_FAST_FMAF
#define FP_FAST_FMAF
//@#define FP_FAST_FMAF
#endif
//@#endif
#endif
#if (!defined(FP_FAST_FMA) || !defined(FP_FAST_FMAF)) && !defined(SLEEF_GENHEADER)
#error FP_FAST_FMA or FP_FAST_FMAF not defined
#endif
#define ISANAME "Pure C scalar with FMA"
#else // #if CONFIG == 2 || CONFIG == 3
#define ISANAME "Pure C scalar"
#endif // #if CONFIG == 2 || CONFIG == 3
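// Editor's note: CONFIG selects the scalar variant at compile time:
// CONFIG == 2 (and 3) require a usable FMA and define ENABLE_FMA_DP/SP,
// while other values build the non-FMA "Pure C scalar" path. A hypothetical
// compile line (file name and flags illustrative only):
//
//     cc -O2 -DCONFIG=2 -mfma -c sleef_scalar_unit.c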
#define LOG2VECTLENDP 0
//@#define LOG2VECTLENDP 0
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)
#define LOG2VECTLENSP 0
//@#define LOG2VECTLENSP 0
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT
#if defined(__SSE4_1__) || defined(__aarch64__) || CONFIG == 3
#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#endif
#define DFTPRIORITY LOG2VECTLENDP
typedef uint64_t vmask;
typedef uint32_t vopmask;
typedef double vdouble;
typedef int32_t vint;
typedef float vfloat;
typedef int32_t vint2;
typedef int64_t vint64;
typedef uint64_t vuint64;
typedef Sleef_uint64_2t vquad;
#if CONFIG != 3
typedef Sleef_quad vargquad;
#else
typedef Sleef_uint64_2t vargquad;
#endif
//
static INLINE int vavailability_i(int name) { return -1; }
static INLINE void vprefetch_v_p(const void *ptr) {}
static INLINE int vtestallones_i_vo64(vopmask g) { return g; }
static INLINE int vtestallones_i_vo32(vopmask g) { return g; }
//
static vint2 vloadu_vi2_p(int32_t *p) { return *p; }
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { *p = v; }
static vint vloadu_vi_p(int32_t *p) { return *p; }
static void vstoreu_v_p_vi(int32_t *p, vint v) { *p = v; }
//
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return m; }
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return m; }
static INLINE vopmask vcast_vo_i(int i) { return i ? -1 : 0; }
static INLINE vmask vcast_vm_i_i(int h, int l) { return (((uint64_t)h) << 32) | (uint32_t)l; }
static INLINE vmask vcast_vm_i64(int64_t i) { return (int64_t)i; }
static INLINE vmask vcast_vm_u64(uint64_t i) { return i; }
static INLINE vmask vcastu_vm_vi(vint vi) { return ((uint64_t)vi) << 32; }
static INLINE vint vcastu_vi_vm(vmask vm) { return (int32_t)(vm >> 32); }
static INLINE vdouble vcast_vd_d(double d) { return d; }
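// Editor's note: on this scalar path a vmask is a single uint64_t, so
// vcast_vm_i_i(h, l) packs two 32-bit halves, e.g.
// vcast_vm_i_i(0x12345678, 0x9abcdef0) == 0x123456789abcdef0, and
// vcastu_vm_vi / vcastu_vi_vm move a 32-bit int to / from the upper half.
// Sketch (illustrative only, not part of upstream SLEEF):
#if 0
static int demo_vcast_vm(void) {
  vmask m = vcast_vm_i_i(0x12345678, (int)0x9abcdef0);
  return m == UINT64_C(0x123456789abcdef0) && vcastu_vi_vm(m) == 0x12345678;
}
#endif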
//
static INLINE vopmask vand_vo_vo_vo (vopmask x, vopmask y) { return x & y; }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return y & ~x; }
static INLINE vopmask vor_vo_vo_vo (vopmask x, vopmask y) { return x | y; }
static INLINE vopmask vxor_vo_vo_vo (vopmask x, vopmask y) { return x ^ y; }
static INLINE vmask vand_vm_vm_vm (vmask x, vmask y) { return x & y; }
static INLINE vmask vandnot_vm_vm_vm (vmask x, vmask y) { return y & ~x; }
static INLINE vmask vor_vm_vm_vm (vmask x, vmask y) { return x | y; }
static INLINE vmask vxor_vm_vm_vm (vmask x, vmask y) { return x ^ y; }
static INLINE vmask vcast_vm_vo(vopmask o) { return (vmask)o | (((vmask)o) << 32); }
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vcast_vm_vo(x) & y; }
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return y & ~vcast_vm_vo(x); }
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vcast_vm_vo(x) | y; }
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vcast_vm_vo(x) ^ y; }
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vcast_vm_vo(x) & y; }
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return y & ~vcast_vm_vo(x); }
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vcast_vm_vo(x) | y; }
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vcast_vm_vo(x) ^ y; }
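// Editor's note: a vopmask here is a 32-bit all-ones/all-zeros value while
// vmask is 64-bit, so vcast_vm_vo(o) duplicates the opmask into both 32-bit
// halves ((vmask)o | ((vmask)o << 32)); that lets the vo32 and vo64 logic
// ops above share one implementation.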
//
static INLINE vdouble vsel_vd_vo_vd_vd (vopmask o, vdouble x, vdouble y) { return o ? x : y; }
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y) { return o ? x : y; }
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) { return o ? v1 : v0; }
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}
static INLINE vdouble vcast_vd_vi(vint vi) { return vi; }
static INLINE vint vcast_vi_i(int j) { return j; }
#ifdef FULL_FP_ROUNDING
static INLINE vint vrint_vi_vd(vdouble d) { return (int32_t)RINT(d); }
static INLINE vdouble vrint_vd_vd(vdouble vd) { return RINT(vd); }
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return TRUNC(vd); }
static INLINE vint vtruncate_vi_vd(vdouble vd) { return (int32_t)TRUNC(vd); }
#else
static INLINE vint vrint_vi_vd(vdouble a) {
a += a > 0 ? 0.5 : -0.5;
uint64_t vx;
memcpy(&vx, &a, sizeof(vx));
vx -= 1 & (int)a;
memcpy(&a, &vx, sizeof(a));
return a;
}
static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); }
static INLINE vint vtruncate_vi_vd(vdouble vd) { return vd; }
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); }
#endif
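// Editor's note: the fallback vrint_vi_vd above implements round-to-nearest,
// ties to even, without touching the FPU rounding mode: adding +/-0.5 rounds
// ties away from zero, and "vx -= 1 & (int)a" then subtracts one ulp from the
// bit pattern exactly when the intermediate integer is odd, pulling .5 ties
// back to the even neighbor before the final truncating conversion.
// Worked example: 2.5 -> 3.0 (odd) -> minus one ulp -> truncates to 2;
// 3.5 -> 4.0 (even) -> unchanged -> 4. The single-precision vrint_vi2_vf
// below uses the same trick.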
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return x == y ? ~(uint32_t)0 : 0; }
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return x + y; }
//
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { vmask vm; memcpy(&vm, &vd, sizeof(vm)); return vm; }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { vdouble vd; memcpy(&vd, &vm, sizeof(vd)); return vd; }
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return x + y; }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return x - y; }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return x * y; }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return x / y; }
static INLINE vdouble vrec_vd_vd(vdouble x) { return 1 / x; }
static INLINE vdouble vabs_vd_vd(vdouble d) {
uint64_t vx;
memcpy(&vx, &d, sizeof(vx));
vx &= UINT64_C(0x7fffffffffffffff);
memcpy(&d, &vx, sizeof(d));
return d;
}
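// Editor's note: vabs_vd_vd clears bit 63, the IEEE 754 sign bit, directly on
// the bit pattern, so it also maps -0.0 to +0.0 and preserves NaN payloads.
// Sketch (illustrative only, not part of upstream SLEEF):
#if 0
static int demo_vabs(void) {
  return vreinterpret_vm_vd(vabs_vd_vd(-0.0)) == 0;  // +0.0 has an all-zero pattern
}
#endif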
static INLINE vdouble vneg_vd_vd(vdouble d) { return -d; }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return x > y ? x : y; }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return x < y ? x : y; }
#ifndef ENABLE_FMA_DP
static INLINE vdouble vmla_vd_vd_vd_vd (vdouble x, vdouble y, vdouble z) { return x * y + z; }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return x * y - z; }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return -x * y + z; }
#else
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(x, y, z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(x, y, -z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(-x, y, z); }
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(x, y, z); }
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(x, y, z); }
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(x, y, -z); }
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(-x, y, z); }
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(-x, y, -z); }
#endif
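// Editor's note on the naming: in vfmaXY the two letters give the signs
// applied to the product and to the addend, p = plus, n = minus, so
// vfmapn(x, y, z) = fma(x, y, -z) = x*y - z and vfmanp(x, y, z) =
// fma(-x, y, z) = -x*y + z. The vmla* entry points compute the same
// expressions but are only contracted into a single fused operation when
// ENABLE_FMA_DP is defined.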
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return x == y ? ~(uint32_t)0 : 0; }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return x != y ? ~(uint32_t)0 : 0; }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return x < y ? ~(uint32_t)0 : 0; }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return x <= y ? ~(uint32_t)0 : 0; }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return x > y ? ~(uint32_t)0 : 0; }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return x >= y ? ~(uint32_t)0 : 0; }
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return x + y; }
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return x - y; }
static INLINE vint vneg_vi_vi (vint x) { return - x; }
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return x & y; }
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return y & ~x; }
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return x | y; }
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return x ^ y; }
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return x & y; }
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return y & ~x; }
static INLINE vint vsll_vi_vi_i(vint x, int c) { return (uint32_t)x << c; }
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return (uint32_t)x >> c; }
static INLINE vint vsra_vi_vi_i(vint x, int c) { return x >> c; }
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return x == y ? ~(uint32_t)0 : 0; }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return x > y ? ~(uint32_t)0 : 0; }
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return m ? x : y; }
static INLINE vopmask visinf_vo_vd(vdouble d) { return (d == SLEEF_INFINITY || d == -SLEEF_INFINITY) ? ~(uint32_t)0 : 0; }
static INLINE vopmask vispinf_vo_vd(vdouble d) { return d == SLEEF_INFINITY ? ~(uint32_t)0 : 0; }
static INLINE vopmask visminf_vo_vd(vdouble d) { return d == -SLEEF_INFINITY ? ~(uint32_t)0 : 0; }
static INLINE vopmask visnan_vo_vd(vdouble d) { return d != d ? ~(uint32_t)0 : 0; }
static INLINE vdouble vsqrt_vd_vd(vdouble d) { return SQRT(d); }
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return SQRTF(x); }
static INLINE double vcast_d_vd(vdouble v) { return v; }
static INLINE vdouble vload_vd_p(const double *ptr) { return *ptr; }
static INLINE vdouble vloadu_vd_p(const double *ptr) { return *ptr; }
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return ptr[vi]; }
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { *ptr = v; }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { *ptr = v; }
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { *ptr = v; }
//
static INLINE vint2 vcast_vi2_vm(vmask vm) { return (int32_t)vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { return (uint32_t)vi; }
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return (int32_t)vi; }
static INLINE vint2 vcast_vi2_i(int j) { return j; }
#ifdef FULL_FP_ROUNDING
static INLINE vint2 vrint_vi2_vf(vfloat d) { return (int)RINTF(d); }
static INLINE vfloat vrint_vf_vf(vfloat vd) { return RINTF(vd); }
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return TRUNCF(vd); }
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return (int32_t)TRUNCF(vf); }
#else
static INLINE vint2 vrint_vi2_vf(vfloat a) {
a += a > 0 ? 0.5f : -0.5f;
uint32_t vu[1];
memcpy(vu, &a, sizeof(vu));
vu[0] -= 1 & (int)a;
memcpy(&a, vu, sizeof(a));
return (int32_t)a;
}
static INLINE vfloat vrint_vf_vf(vfloat vd) { return vcast_vf_vi2(vrint_vi2_vf(vd)); }
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vf; }
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
#endif
static INLINE vfloat vcast_vf_f(float f) { return f; }
static INLINE vmask vreinterpret_vm_vf(vfloat f) { vfloat vf[2] = { f, 0 }; vmask vm; memcpy(&vm, &vf, sizeof(vm)); return vm; }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { vfloat vf[2]; memcpy(&vf, &vm, sizeof(vf)); return vf[0]; }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { vfloat vf; memcpy(&vf, &vi, sizeof(vf)); return vf; }
static INLINE vint2 vreinterpret_vi2_vf(vfloat f) { vint2 vi2; memcpy(&vi2, &f, sizeof(vi2)); return vi2; }
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return x + y; }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return x - y; }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return x * y; }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return x / y; }
static INLINE vfloat vrec_vf_vf (vfloat x) { return 1 / x; }
static INLINE vfloat vabs_vf_vf(vfloat x) {
int32_t vi[1];
memcpy(vi, &x, sizeof(vi));
vi[0] &= 0x7fffffff;
memcpy(&x, vi, sizeof(x));
return x;
}
static INLINE vfloat vneg_vf_vf(vfloat x) { return -x; }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return x > y ? x : y; }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return x < y ? x : y; }
#ifndef ENABLE_FMA_SP
static INLINE vfloat vmla_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { return x * y + z; }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return - x * y + z; }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return x * y - z; }
#else
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(x, y, z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(x, y, -z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(-x, y, z); }
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(x, y, z); }
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(x, y, z); }
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(x, y, -z); }
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(-x, y, z); }
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(-x, y, -z); }
#endif
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return x == y ? ~(uint32_t)0 : 0; }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return x != y ? ~(uint32_t)0 : 0; }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return x < y ? ~(uint32_t)0 : 0; }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return x <= y ? ~(uint32_t)0 : 0; }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return x > y ? ~(uint32_t)0 : 0; }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return x >= y ? ~(uint32_t)0 : 0; }
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return x + y; }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return x - y; }
static INLINE vint2 vneg_vi2_vi2(vint2 x) { return -x; }
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return x & y; }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return y & ~x; }
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return x | y; }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return x ^ y; }
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return o ? x : y; }
static INLINE vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) { return o ? v1 : v0; }
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vcast_vm_vo(x) & y; }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return y & ~vcast_vm_vo(x); }
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) {
return x << c;
}
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) {
return ((uint32_t)x) >> c;
}
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) {
return x >> c;
}
static INLINE vopmask visinf_vo_vf (vfloat d) { return (d == SLEEF_INFINITYf || d == -SLEEF_INFINITYf) ? ~(uint32_t)0 : 0; }
static INLINE vopmask vispinf_vo_vf(vfloat d) { return d == SLEEF_INFINITYf ? ~(uint32_t)0 : 0; }
static INLINE vopmask visminf_vo_vf(vfloat d) { return d == -SLEEF_INFINITYf ? ~(uint32_t)0 : 0; }
static INLINE vopmask visnan_vo_vf (vfloat d) { return d != d ? ~(uint32_t)0 : 0; }
static INLINE vopmask veq_vo_vi2_vi2 (vint2 x, vint2 y) { return (int32_t)x == (int32_t)y ? ~(uint32_t)0 : 0; }
static INLINE vopmask vgt_vo_vi2_vi2 (vint2 x, vint2 y) { return (int32_t)x > (int32_t)y ? ~(uint32_t)0 : 0; }
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return (int32_t)x == (int32_t)y ? ~(uint32_t)0 : 0; }
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return (int32_t)x > (int32_t)y ? ~(uint32_t)0 : 0; }
static INLINE float vcast_f_vf(vfloat v) { return v; }
static INLINE vfloat vload_vf_p(const float *ptr) { return *ptr; }
static INLINE vfloat vloadu_vf_p(const float *ptr) { return *ptr; }
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi) { return ptr[vi]; }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { *ptr = v; }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { *ptr = v; }
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { *ptr = v; }
//
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
static vquad loadu_vq_p(void *p) {
vquad vq;
memcpy(8 + (char *)&vq, p, 8);
memcpy((char *)&vq, 8 + (char *)p, 8);
return vq;
}
static INLINE vquad cast_vq_aq(vargquad aq) {
vquad vq;
memcpy(8 + (char *)&vq, (char *)&aq, 8);
memcpy((char *)&vq, 8 + (char *)&aq, 8);
return vq;
}
static INLINE vargquad cast_aq_vq(vquad vq) {
vargquad aq;
memcpy(8 + (char *)&aq, (char *)&vq, 8);
memcpy((char *)&aq, 8 + (char *)&vq, 8);
return aq;
}
#else
static vquad loadu_vq_p(void *p) {
vquad vq;
memcpy(&vq, p, sizeof(vq));
return vq;
}
static INLINE vquad cast_vq_aq(vargquad aq) {
vquad vq;
memcpy(&vq, &aq, sizeof(vq));
return vq;
}
static INLINE vargquad cast_aq_vq(vquad vq) {
vargquad aq;
memcpy(&aq, &vq, sizeof(aq));
return aq;
}
#endif
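// Editor's note: on big-endian targets the two 8-byte halves of the 16-byte
// quad are swapped by the memcpy pairs above, so that the in-memory vargquad
// and the internal vquad agree on which half holds which 64-bit word; on
// little-endian targets a single memcpy of the whole 16 bytes suffices.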
//
static INLINE int vtestallzeros_i_vo64(vopmask g) { return !g ? ~(uint32_t)0 : 0; }
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) { return o ? x : y; }
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return (int64_t)x - (int64_t)y; }
static INLINE vmask vneg64_vm_vm(vmask x) { return -(int64_t)x; }
#define vsll64_vm_vm_i(x, c) ((uint64_t)(x) << (c))
#define vsrl64_vm_vm_i(x, c) ((uint64_t)(x) >> (c))
//@#define vsll64_vm_vm_i(x, c) ((uint64_t)(x) << (c))
//@#define vsrl64_vm_vm_i(x, c) ((uint64_t)(x) >> (c))
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return (int64_t)x > (int64_t)y ? ~(uint32_t)0 : 0; }
static INLINE vmask vcast_vm_vi(vint vi) { return vi; }
static INLINE vint vcast_vi_vm(vmask vm) { return vm; }
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; }
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return m; }
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return v; }
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return m; }

File diff suppressed because it is too large

View File

@@ -0,0 +1,462 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#if CONFIG == 140 || CONFIG == 141 || CONFIG == 150 || CONFIG == 151
#if !defined(__VX__) && !defined(SLEEF_GENHEADER)
#error This helper is for IBM s390x.
#endif
#if __ARCH__ < 12 && !defined(SLEEF_GENHEADER)
#error Please specify -march=z14 or higher.
#endif
#else
#error CONFIG macro invalid or not defined
#endif
#define ENABLE_DP
//@#define ENABLE_DP
#define LOG2VECTLENDP 1
//@#define LOG2VECTLENDP 1
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)
#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)
#if CONFIG == 140 || CONFIG == 150
#define ENABLE_FMA_DP
//@#define ENABLE_FMA_DP
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP
#endif
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT
#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#if !defined(SLEEF_GENHEADER)
#ifndef SLEEF_VECINTRIN_H_INCLUDED
#include <vecintrin.h>
#define SLEEF_VECINTRIN_H_INCLUDED
#endif
#include <stdint.h>
#include <math.h>
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)
typedef __vector unsigned long long vmask;
typedef __vector unsigned long long vopmask;
typedef __vector double vdouble;
typedef __vector int vint;
typedef __vector float vfloat;
typedef __vector int vint2;
typedef __vector long long vint64;
typedef __vector unsigned long long vuint64;
typedef struct {
vmask x, y;
} vquad;
typedef vquad vargquad;
//
#if !defined(SLEEF_GENHEADER)
static INLINE int vavailability_i(int n) {
if (n == 1 || n == 2) {
return vec_max((vdouble) {n, n}, (vdouble) {n, n})[0] != 0;
}
return 0;
}
#if CONFIG == 140 || CONFIG == 141
#define ISANAME "VXE"
#else
#define ISANAME "VXE2"
#endif
#define DFTPRIORITY 14
#endif // #if !defined(SLEEF_GENHEADER)
static INLINE void vprefetch_v_p(const void *ptr) { }
static vint2 vloadu_vi2_p(int32_t *p) { return (vint2) { p[0], p[1], p[2], p[3] }; }
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { p[0] = v[0]; p[1] = v[1]; p[2] = v[2]; p[3] = v[3]; }
static vint vloadu_vi_p(int32_t *p) { return (vint) { p[0], p[1] }; }
static void vstoreu_v_p_vi(int32_t *p, vint v) { p[0] = v[0]; p[1] = v[1]; }
static INLINE vdouble vload_vd_p(const double *p) { return (vdouble) { p[0], p[1] }; }
static INLINE void vstore_v_p_vd(double *p, vdouble v) { p[0] = v[0]; p[1] = v[1]; }
static INLINE vdouble vloadu_vd_p(const double *p) { return (vdouble) { p[0], p[1] }; }
static INLINE void vstoreu_v_p_vd(double *p, vdouble v) { p[0] = v[0]; p[1] = v[1]; }
static INLINE vfloat vload_vf_p(const float *p) { return (vfloat) { p[0], p[1], p[2], p[3] }; }
static INLINE void vstore_v_p_vf(float *p, vfloat v) { p[0] = v[0]; p[1] = v[1]; p[2] = v[2]; p[3] = v[3]; }
static INLINE void vscatter2_v_p_i_i_vf(float *p, int offset, int step, vfloat v) {
*(p+(offset + step * 0)*2 + 0) = v[0];
*(p+(offset + step * 0)*2 + 1) = v[1];
*(p+(offset + step * 1)*2 + 0) = v[2];
*(p+(offset + step * 1)*2 + 1) = v[3];
}
static INLINE vfloat vloadu_vf_p(const float *p) { return (vfloat) { p[0], p[1], p[2], p[3] }; }
static INLINE void vstoreu_v_p_vf(float *p, vfloat v) { p[0] = v[0]; p[1] = v[1]; p[2] = v[2]; p[3] = v[3]; }
static INLINE void vscatter2_v_p_i_i_vd(double *p, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&p[2*offset]), v); }
static INLINE vdouble vgather_vd_p_vi(const double *p, vint vi) {
return ((vdouble) { p[vi[0]], p[vi[1]] });
}
static INLINE vfloat vgather_vf_p_vi2(const float *p, vint2 vi2) {
return ((vfloat) { p[vi2[0]], p[vi2[1]], p[vi2[2]], p[vi2[3]] });
}
static INLINE vopmask vcast_vo_i(int i) { return (vopmask) { i ? (long long)-1 : 0, i ? (long long)-1 : 0 }; }
static INLINE vint vcast_vi_i(int i) { return (vint) { i, i }; }
static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i }; }
static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f }; }
static INLINE vdouble vcast_vd_d(double d) { return (vdouble) { d, d }; }
static INLINE vdouble vcast_vd_vi(vint vi) { return (vdouble) { vi[0], vi[1] }; }
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return (vfloat) { vi[0], vi[1], vi[2], vi[3] }; }
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return __builtin_s390_vfidb(vd, 4, 5); }
static INLINE vdouble vrint_vd_vd(vdouble vd) { return __builtin_s390_vfidb(vd, 4, 4); }
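// Editor's note: __builtin_s390_vfidb is the z/Architecture VECTOR FP
// INTEGER operation; the last argument selects the rounding mode -- 5 rounds
// toward zero (truncate) and 4 rounds to nearest with ties to even -- while
// the 4 in the second argument suppresses inexact exceptions.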
static INLINE vint vrint_vi_vd(vdouble vd) {
vd = vrint_vd_vd(vd);
return (vint) { vd[0], vd[1] };
}
static INLINE vint vtruncate_vi_vd(vdouble vd) { return (vint) { vd[0], vd[1] }; }
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return (vint2) { vf[0], vf[1], vf[2], vf[3] }; }
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return (vmask)vd; }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return (vdouble)vm; }
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (vfloat)vm; }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return (vfloat)vi; }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return (vint2)vf; }
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return x + y; }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return x - y; }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return x * y; }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return x / y; }
static INLINE vdouble vrec_vd_vd(vdouble x) { return 1 / x; }
static INLINE vdouble vneg_vd_vd(vdouble d) { return -d; }
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return x + y; }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return x - y; }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return x * y; }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return x / y; }
static INLINE vfloat vrec_vf_vf(vfloat x) { return 1 / x; }
static INLINE vfloat vneg_vf_vf(vfloat d) { return -d; }
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return x & y; }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return y & ~x; }
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return x | y; }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return x ^ y; }
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return x & y; }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return y & ~x; }
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return x | y; }
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return x ^ y; }
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return x & y; }
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return y & ~x; }
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return x | y; }
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return x ^ y; }
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return x & y; }
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return y & ~x; }
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return x | y; }
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return x ^ y; }
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return vec_sel(y, x, o); }
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return vec_sel(y, x, (__vector unsigned int)o); }
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y) { return vec_sel(y, x, (__vector unsigned int)o); }
static INLINE int vtestallones_i_vo32(vopmask g) { return vec_all_ne((vint2)g, (vint2 ) { 0, 0, 0, 0 }); }
static INLINE int vtestallones_i_vo64(vopmask g) { return vec_all_ne((__vector unsigned long long)g, (__vector unsigned long long) { 0, 0 }); }
static INLINE vopmask vcast_vo32_vo64(vopmask g) { return (vopmask)(vint) { g[0] != 0 ? -1 : 0, g[1] != 0 ? -1 : 0, 0, 0 }; }
static INLINE vopmask vcast_vo64_vo32(vopmask g) { return (vopmask) { ((vint)g)[0] != 0 ? 0xffffffffffffffffLL : 0, ((vint)g)[1] != 0 ? 0xffffffffffffffffLL : 0 }; }
static INLINE vmask vcast_vm_i_i(int h, int l) { return (vmask)(vint){ h, l, h, l }; }
static INLINE vmask vcast_vm_i64(int64_t i) { return (vmask)(vint64){ i, i }; }
static INLINE vmask vcast_vm_u64(uint64_t i) { return (vmask)(vuint64){ i, i }; }
static INLINE vmask vcastu_vm_vi(vint vi) { return (vmask)(vint2){ vi[0], 0, vi[1], 0 }; }
static INLINE vint vcastu_vi_vm(vmask vi2) { return (vint){ vi2[0] >> 32, vi2[1] >> 32 }; }
static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) { return (vint){ vi2[0], vi2[1] }; }
static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) { return (vint2){ vi[0], vi[1], 0, 0 }; }
static INLINE vdouble vrev21_vd_vd(vdouble vd) { return (vdouble) { vd[1], vd[0] }; }
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vd; }
static INLINE vfloat vrev21_vf_vf(vfloat vd) { return (vfloat) { vd[1], vd[0], vd[3], vd[2] }; }
static INLINE vfloat vreva2_vf_vf(vfloat vd) { return (vfloat) { vd[2], vd[3], vd[0], vd[1] }; }
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
return (vopmask) { x[0] == y[0] ? 0xffffffffffffffffLL : 0, x[1] == y[1] ? 0xffffffffffffffffLL : 0 };
}
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) {
return (vmask)((__vector long long)x + (__vector long long)y);
}
//
#define PNMASK ((vdouble) { +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0 })
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }
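// Editor's note: vnegpos/vposneg flip the sign of alternate lanes by XORing
// in the sign bits of NPMASK/PNMASK, e.g. vnegpos_vd_vd({a, b}) == {-a, b};
// vsubadd (defined just below) builds on this to compute {x0 - y0, x1 + y1}.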
//
static INLINE vdouble vabs_vd_vd(vdouble d) { return vec_abs(d); }
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
#if CONFIG == 140 || CONFIG == 150
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_madd(x, y, z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_msub(x, y, z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_nmsub(x, y, z); }
#else
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
#endif
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_madd(x, y, z); }
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_madd(x, y, z); }
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_msub(x, y, z); }
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_nmsub(x, y, z); }
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_nmadd(x, y, z); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vadd_vf_vf_vf(x, vnegpos_vf_vf(y)); }
#if CONFIG == 140 || CONFIG == 150
static INLINE vfloat vmla_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { return __builtin_s390_vfmasb(x, y, z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_nmsub(x, y, z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return __builtin_s390_vfmssb(x, y, z); }
static INLINE vfloat vfma_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { return __builtin_s390_vfmasb(x, y, z); }
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return __builtin_s390_vfmasb(x, y, z); }
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return __builtin_s390_vfmssb(x, y, z); }
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_nmsub(x, y, z); }
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_nmadd(x, y, z); }
#else
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
#endif
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }
//
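// Multi-condition selects are built by cascading two-way selects: the first
// mask that is set determines which constant is returned.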
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
}
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}
//
static INLINE vopmask vnot_vo_vo(vopmask o) { return ~o; }
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmpeq(x, y); }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vnot_vo_vo(vec_cmpeq(x, y)); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmplt(x, y); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmple(x, y); }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmpgt(x, y); }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmpge(x, y); }
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return x + y; }
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return x - y; }
static INLINE vint vneg_vi_vi(vint e) { return -e; }
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return x & y; }
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return y & ~x; }
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return x | y; }
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return x ^ y; }
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return vreinterpretFirstHalf_vi_vi2((vint2)x) & y; }
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return vec_andc(y, vreinterpretFirstHalf_vi_vi2((vint2)x)); }
static INLINE vint vsll_vi_vi_i(vint x, int c) { return (vint)(((__vector unsigned int)x) << (__vector unsigned int){c, c, c, c}); }
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return (vint)(((__vector unsigned int)x) >> (__vector unsigned int){c, c, c, c}); }
static INLINE vint vsra_vi_vi_i(vint x, int c) { return x >> (__vector int){c, c, c, c}; }
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return vec_cmpeq(x, y); }
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return vec_cmpgt(x, y); }
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return (vopmask)vreinterpretFirstHalf_vi2_vi(vec_cmpeq(x, y)); }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return (vopmask)vreinterpretFirstHalf_vi2_vi(vec_cmpgt(x, y)); }
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
return vor_vi_vi_vi(vand_vi_vi_vi(vreinterpretFirstHalf_vi_vi2((vint2)m), x),
vandnot_vi_vi_vi(vreinterpretFirstHalf_vi_vi2((vint2)m), y));
}
static INLINE vopmask visinf_vo_vd(vdouble d) { return (vopmask)(vec_cmpeq(vabs_vd_vd(d), vcast_vd_d(SLEEF_INFINITY))); }
static INLINE vopmask vispinf_vo_vd(vdouble d) { return (vopmask)(vec_cmpeq(d, vcast_vd_d(SLEEF_INFINITY))); }
static INLINE vopmask visminf_vo_vd(vdouble d) { return (vopmask)(vec_cmpeq(d, vcast_vd_d(-SLEEF_INFINITY))); }
static INLINE vopmask visnan_vo_vd(vdouble d) { return (vopmask)(vnot_vo_vo(vec_cmpeq(d, d))); }
static INLINE double vcast_d_vd(vdouble v) { return v[0]; }
static INLINE float vcast_f_vf(vfloat v) { return v[0]; }
static INLINE void vstream_v_p_vd(double *p, vdouble v) { vstore_v_p_vd(p, v); }
static INLINE void vsscatter2_v_p_i_i_vd(double *p, int offset, int step, vdouble v) { vscatter2_v_p_i_i_vd(p, offset, step, v); }
//
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}
static INLINE vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { return (vmask)vi; }
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return x + y; }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return x - y; }
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return -e; }
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return x & y; }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return y & ~x; }
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return x | y; }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return x ^ y; }
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)x & y; }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return y & ~(vint2)x; }
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return (vint2)(((__vector unsigned int)x) << (__vector unsigned int){c, c, c, c}); }
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return (vint2)(((__vector unsigned int)x) >> (__vector unsigned int){c, c, c, c}); }
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return x >> (__vector int){c, c, c, c}; }
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return (vopmask)vec_cmpeq(x, y); }
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return (vopmask)vec_cmpgt(x, y); }
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return vec_cmpeq(x, y); }
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return vec_cmpgt(x, y); }
static INLINE void vsscatter2_v_p_i_i_vf(float *p, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(p, offset, step, v); }
static INLINE void vstream_v_p_vf(float *p, vfloat v) { vstore_v_p_vf(p, v); }
//
static INLINE vdouble vsqrt_vd_vd(vdouble d) { return vec_sqrt(d); }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return vec_max(x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return vec_min(x, y); }
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)vec_cmpeq(x, y); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)vnot_vo_vo(vec_cmpeq(x, y)); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)vec_cmplt(x, y); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)vec_cmple(x, y); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)vec_cmpgt(x, y); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)vec_cmpge(x, y); }
static INLINE vfloat vabs_vf_vf(vfloat f) { return vec_abs(f); }
static INLINE vfloat vrint_vf_vf(vfloat vf) { return __builtin_s390_vfisb(vf, 4, 4); }
static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return __builtin_s390_vfisb(vf, 4, 5); }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return vec_max(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return vec_min(x, y); }
static INLINE vfloat vsqrt_vf_vf(vfloat d) { return vec_sqrt(d); }
static INLINE vopmask visinf_vo_vf (vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
static INLINE vopmask visnan_vo_vf (vfloat d) { return vneq_vo_vf_vf(d, d); }
static INLINE vint2 vrint_vi2_vf(vfloat vf) {
vf = vrint_vf_vf(vf);
return (vint) { vf[0], vf[1], vf[2], vf[3] };
}
//
static vquad loadu_vq_p(void *p) {
vquad vq;
memcpy(&vq, p, VECTLENDP * 16);
return vq;
}
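// The argument struct and the internal quad representation hold their two
// 128-bit halves in opposite order, so the casts below swap x and y.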
static INLINE vquad cast_vq_aq(vargquad aq) {
vquad m = { aq.y, aq.x };
return m;
}
static INLINE vargquad cast_aq_vq(vquad vq) {
vargquad a = { vq.y, vq.x };
return a;
}
static INLINE int vtestallzeros_i_vo64(vopmask g) {
return vec_all_eq((__vector signed long long)g, (__vector signed long long){ 0, 0 });
}
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) {
return (vmask)vec_sel((__vector signed long long)y, (__vector signed long long)x, (__vector __bool long long)o);
}
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) {
return (vmask)((__vector signed long long)x - (__vector signed long long)y);
}
static INLINE vmask vneg64_vm_vm(vmask x) {
return (vmask)((__vector signed long long) {0, 0} - (__vector signed long long)x);
}
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
return (vopmask)vec_cmpgt((__vector signed long long)x, (__vector signed long long)y);
}
#define vsll64_vm_vm_i(x, c) ((vmask)((__vector unsigned long long)x << (__vector unsigned long long) { c, c }))
#define vsrl64_vm_vm_i(x, c) ((vmask)((__vector unsigned long long)x >> (__vector unsigned long long) { c, c }))
static INLINE vint vcast_vi_vm(vmask vm) {
return (vint) { vm[0], vm[1] };
}
static INLINE vmask vcast_vm_vi(vint vi) {
return (vmask) (__vector signed long long) { vi[0], vi[1] };
}
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return (vmask)v; }
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return (vint64)m; }
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return (vmask)v; }
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return (vuint64)m; }

View File

@@ -0,0 +1,517 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
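// CONFIG selects the x86 ISA level for this helper: 2 requires SSE2,
// 3 additionally requires SSE3, and 4 additionally requires SSE4.1.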
#if CONFIG == 2
#if !defined(__SSE2__) && !defined(SLEEF_GENHEADER)
#error Please specify -msse2.
#endif
#elif CONFIG == 3
#if (!defined(__SSE2__) || !defined(__SSE3__)) && !defined(SLEEF_GENHEADER)
#error Please specify -msse2 and -msse3
#endif
#elif CONFIG == 4
#if (!defined(__SSE2__) || !defined(__SSE3__) || !defined(__SSE4_1__)) && !defined(SLEEF_GENHEADER)
#error Please specify -msse2, -msse3 and -msse4.1
#endif
#else
#error CONFIG macro invalid or not defined
#endif
#define ENABLE_DP
//@#define ENABLE_DP
#define LOG2VECTLENDP 1
//@#define LOG2VECTLENDP 1
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)
#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT
#if !defined(SLEEF_GENHEADER)
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#include <stdint.h>
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)
typedef __m128i vmask;
typedef __m128i vopmask;
typedef __m128d vdouble;
typedef __m128i vint;
typedef __m128 vfloat;
typedef __m128i vint2;
typedef __m128i vint64;
typedef __m128i vuint64;
typedef struct {
vmask x, y;
} vquad;
typedef vquad vargquad;
//
#if !defined(SLEEF_GENHEADER)
#ifndef __SLEEF_H__
void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx);
#endif
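// Runtime feature detection via CPUID leaf 1: SSE2 is EDX bit 26,
// SSE3 is ECX bit 0, and SSE4.1 is ECX bit 19.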
static INLINE int cpuSupportsSSE2() {
int32_t reg[4];
Sleef_x86CpuID(reg, 1, 0);
return (reg[3] & (1 << 26)) != 0;
}
static INLINE int cpuSupportsSSE3() {
int32_t reg[4];
Sleef_x86CpuID(reg, 1, 0);
return (reg[2] & (1 << 0)) != 0;
}
static INLINE int cpuSupportsSSE4_1() {
int32_t reg[4];
Sleef_x86CpuID(reg, 1, 0);
return (reg[2] & (1 << 19)) != 0;
}
#if defined(__SSE2__) && defined(__SSE3__) && defined(__SSE4_1__)
static INLINE int vavailability_i(int name) {
//int d = __builtin_cpu_supports("sse2") && __builtin_cpu_supports("sse3") && __builtin_cpu_supports("sse4.1");
int d = cpuSupportsSSE2() && cpuSupportsSSE3() && cpuSupportsSSE4_1();
return d ? 3 : 0;
}
#define ISANAME "SSE4.1"
#define DFTPRIORITY 12
#elif defined(__SSE2__) && defined(__SSE3__)
static INLINE int vavailability_i(int name) {
//int d = __builtin_cpu_supports("sse2") && __builtin_cpu_supports("sse3");
int d = cpuSupportsSSE2() && cpuSupportsSSE3();
return d ? 3 : 0;
}
#define ISANAME "SSE3"
#define DFTPRIORITY 11
#else
static INLINE int vavailability_i(int name) {
int d = cpuSupportsSSE2();
return d ? 3 : 0;
}
#define ISANAME "SSE2"
#define DFTPRIORITY 10
#endif
#endif // #if !defined(SLEEF_GENHEADER)
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }
static INLINE int vtestallones_i_vo32(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; }
static INLINE int vtestallones_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; }
//
static vint2 vloadu_vi2_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm_storeu_si128((__m128i *)p, v); }
static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }
static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); }
//
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return _mm_and_si128(x, y); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return _mm_andnot_si128(x, y); }
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return _mm_or_si128(x, y); }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return _mm_xor_si128(x, y); }
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return _mm_and_si128(x, y); }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return _mm_andnot_si128(x, y); }
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return _mm_or_si128(x, y); }
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return _mm_xor_si128(x, y); }
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return _mm_and_si128(x, y); }
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return _mm_or_si128(x, y); }
static INLINE vmask vandnot_vm_vo64_vm(vmask x, vmask y) { return _mm_andnot_si128(x, y); }
static INLINE vmask vxor_vm_vo64_vm(vmask x, vmask y) { return _mm_xor_si128(x, y); }
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return _mm_and_si128(x, y); }
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return _mm_or_si128(x, y); }
static INLINE vmask vandnot_vm_vo32_vm(vmask x, vmask y) { return _mm_andnot_si128(x, y); }
static INLINE vmask vxor_vm_vo32_vm(vmask x, vmask y) { return _mm_xor_si128(x, y); }
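// Mask width conversion via _mm_shuffle_epi32: immediate 0x08 packs the low
// 32 bits of each 64-bit lane into the lower half, and 0x50 duplicates each
// 32-bit element into a full 64-bit lane.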
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return _mm_shuffle_epi32(m, 0x08); }
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return _mm_shuffle_epi32(m, 0x50); }
static INLINE vopmask vcast_vo_i(int i) { return _mm_set1_epi64x(i ? -1 : 0); }
//
static INLINE vint vrint_vi_vd(vdouble vd) { return _mm_cvtpd_epi32(vd); }
static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm_cvttpd_epi32(vd); }
static INLINE vdouble vcast_vd_vi(vint vi) { return _mm_cvtepi32_pd(vi); }
static INLINE vint vcast_vi_i(int i) { return _mm_set_epi32(0, 0, i, i); }
static INLINE vint2 vcastu_vm_vi(vint vi) { return _mm_and_si128(_mm_shuffle_epi32(vi, 0x73), _mm_set_epi32(-1, 0, -1, 0)); }
static INLINE vint vcastu_vi_vm(vint2 vi) { return _mm_shuffle_epi32(vi, 0x0d); }
#if CONFIG == 4
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm_cmpeq_epi64(x, y); }
#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#else
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); }
static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); }
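// SSE2 has no 64-bit compare: compare the 32-bit halves, then AND with the
// pair-swapped result (shuffle 0xb1) so that both halves must match.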
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
vmask t = _mm_cmpeq_epi32(x, y);
return vand_vm_vm_vm(t, _mm_shuffle_epi32(t, 0xb1));
}
#endif
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm_add_epi64(x, y); }
static INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm_set_epi32(i0, i1, i0, i1); }
static INLINE vmask vcast_vm_i64(int64_t i) { return _mm_set1_epi64x(i); }
static INLINE vmask vcast_vm_u64(uint64_t i) { return _mm_set1_epi64x((uint64_t)i); }
//
static INLINE vdouble vcast_vd_d(double d) { return _mm_set1_pd(d); }
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm_castpd_si128(vd); }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm_castsi128_pd(vm); }
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_add_pd(x, y); }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm_sub_pd(x, y); }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm_mul_pd(x, y); }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm_div_pd(x, y); }
static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm_div_pd(_mm_set1_pd(1), x); }
static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm_sqrt_pd(x); }
static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm_andnot_pd(_mm_set1_pd(-0.0), d); }
static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm_xor_pd(_mm_set1_pd(-0.0), d); }
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(z, vmul_vd_vd_vd(x, y)); }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm_max_pd(x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm_min_pd(x, y); }
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpeq_pd(x, y)); }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpneq_pd(x, y)); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmplt_pd(x, y)); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmple_pd(x, y)); }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpgt_pd(x, y)); }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpge_pd(x, y)); }
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); }
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); }
static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); }
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); }
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); }
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); }
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return _mm_and_si128(x, y); }
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return _mm_andnot_si128(x, y); }
static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); }
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); }
static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); }
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }
#if CONFIG == 4
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return _mm_blendv_epi8(y, x, m); }
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask m, vdouble x, vdouble y) { return _mm_blendv_pd(y, x, _mm_castsi128_pd(m)); }
#else
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return vor_vm_vm_vm(vand_vm_vm_vm(m, x), vandnot_vm_vm_vm(m, y)); }
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask opmask, vdouble x, vdouble y) {
return _mm_or_pd(_mm_and_pd(_mm_castsi128_pd(opmask), x), _mm_andnot_pd(_mm_castsi128_pd(opmask), y));
}
#endif
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
}
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}
static INLINE vopmask visinf_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm_cmpeq_pd(vabs_vd_vd(d), _mm_set1_pd(SLEEF_INFINITY)));
}
static INLINE vopmask vispinf_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm_cmpeq_pd(d, _mm_set1_pd(SLEEF_INFINITY)));
}
static INLINE vopmask visminf_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm_cmpeq_pd(d, _mm_set1_pd(-SLEEF_INFINITY)));
}
static INLINE vopmask visnan_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm_cmpneq_pd(d, d));
}
//
static INLINE vdouble vload_vd_p(const double *ptr) { return _mm_load_pd(ptr); }
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm_loadu_pd(ptr); }
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm_storeu_pd(ptr, v); }
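// There is no gather instruction at this ISA level, so the indices are
// spilled to memory and the elements are loaded one at a time.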
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
int a[sizeof(vint)/sizeof(int)];
vstoreu_v_p_vi(a, vi);
return _mm_set_pd(ptr[a[1]], ptr[a[0]]);
}
// This function is for debugging
static INLINE double vcast_d_vd(vdouble v) {
double a[VECTLENDP];
vstoreu_v_p_vd(a, v);
return a[0];
}
//
static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }
static INLINE vint2 vrint_vi2_vf(vfloat vf) { return _mm_cvtps_epi32(vf); }
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return _mm_cvttps_epi32(vf); }
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm_cvtepi32_ps(vcast_vm_vi2(vi)); }
static INLINE vfloat vcast_vf_f(float f) { return _mm_set1_ps(f); }
static INLINE vint2 vcast_vi2_i(int i) { return _mm_set1_epi32(i); }
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm_castps_si128(vf); }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm_castsi128_ps(vm); }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vm) { return _mm_castsi128_ps(vm); }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return _mm_castps_si128(vf); }
#if CONFIG != 4
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
static INLINE vfloat vrint_vf_vf(vfloat vf) { return vcast_vf_vi2(vrint_vi2_vf(vf)); }
#endif
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_add_ps(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm_sub_ps(x, y); }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm_mul_ps(x, y); }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm_div_ps(x, y); }
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm_sqrt_ps(x); }
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm_max_ps(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm_min_ps(x, y); }
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpeq_ps(x, y)); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpneq_ps(x, y)); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmplt_ps(x, y)); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmple_ps(x, y)); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpgt_ps(x, y)); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpge_ps(x, y)); }
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return vadd_vi_vi_vi(x, y); }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return vsub_vi_vi_vi(x, y); }
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); }
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return vand_vi_vi_vi(x, y); }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return vandnot_vi_vi_vi(x, y); }
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return vor_vi_vi_vi(x, y); }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return vxor_vi_vi_vi(x, y); }
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi_vo_vi(x, y); }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi_vo_vi(x, y); }
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return vsll_vi_vi_i(x, c); }
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return vsrl_vi_vi_i(x, c); }
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return vsra_vi_vi_i(x, c); }
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); }
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); }
#if CONFIG == 4
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { return _mm_blendv_epi8(y, x, m); }
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask m, vfloat x, vfloat y) { return _mm_blendv_ps(y, x, _mm_castsi128_ps(m)); }
#else
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
return vor_vi2_vi2_vi2(vand_vi2_vi2_vi2(m, x), vandnot_vi2_vi2_vi2(m, y));
}
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask opmask, vfloat x, vfloat y) {
return _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(opmask), x), _mm_andnot_ps(_mm_castsi128_ps(opmask), y));
}
#endif
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}
static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
static INLINE vfloat vload_vf_p(const float *ptr) { return _mm_load_ps(ptr); }
static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm_loadu_ps(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm_storeu_ps(ptr, v); }
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi) {
int a[VECTLENSP];
vstoreu_v_p_vi2(a, vi);
return _mm_set_ps(ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);
}
// This function is for debugging
static INLINE float vcast_f_vf(vfloat v) {
float a[VECTLENSP];
vstoreu_v_p_vf(a, v);
return a[0];
}
//
#define PNMASK ((vdouble) { +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0 })
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }
#if CONFIG >= 3
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_addsub_pd(x, y); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_addsub_ps(x, y); }
#else
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vadd_vf_vf_vf(x, vnegpos_vf_vf(y)); }
#endif
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
static INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm_shuffle_pd(d0, d0, 1); }
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vd; }
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm_stream_pd(ptr, v); }
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { _mm_stream_pd((double *)(&ptr[2*offset]), v); }
//
static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); }
static INLINE vfloat vreva2_vf_vf(vfloat d0) { return _mm_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); }
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm_stream_ps(ptr, v); }
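// Each pair of adjacent floats is scattered with a single 64-bit store by
// viewing the register as doubles (_mm_storel_pd/_mm_storeh_pd).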
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
_mm_storel_pd((double *)(ptr+(offset + step * 0)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
_mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
}
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
_mm_storel_pd((double *)(ptr+(offset + step * 0)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
_mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
}
//
static vquad loadu_vq_p(void *p) {
vquad vq;
memcpy(&vq, p, VECTLENDP * 16);
return vq;
}
static INLINE vquad cast_vq_aq(vargquad aq) {
vquad vq;
memcpy(&vq, &aq, VECTLENDP * 16);
return vq;
}
static INLINE vargquad cast_aq_vq(vquad vq) {
vargquad aq;
memcpy(&aq, &vq, VECTLENDP * 16);
return aq;
}
static INLINE int vtestallzeros_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0; }
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) {
return vor_vm_vm_vm(vand_vm_vm_vm(o, x), vandnot_vm_vm_vm(o, y));
}
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return _mm_sub_epi64(x, y); }
static INLINE vmask vneg64_vm_vm(vmask x) { return _mm_sub_epi64(vcast_vm_i_i(0, 0), x); }
#define vsll64_vm_vm_i(x, c) _mm_slli_epi64(x, c)
#define vsrl64_vm_vm_i(x, c) _mm_srli_epi64(x, c)
//@#define vsll64_vm_vm_i(x, c) _mm_slli_epi64(x, c)
//@#define vsrl64_vm_vm_i(x, c) _mm_srli_epi64(x, c)
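// SSE2 lacks a 64-bit signed compare-greater, so it is emulated with scalar
// comparisons on spilled values.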
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
int64_t ax[2], ay[2];
_mm_storeu_si128((__m128i *)ax, x);
_mm_storeu_si128((__m128i *)ay, y);
return _mm_set_epi64x(ax[1] > ay[1] ? -1 : 0, ax[0] > ay[0] ? -1 : 0);
}
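// Sign-extend each 32-bit element to 64 bits: place the values in the low
// words, then OR in all-ones high words for negative inputs.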
static INLINE vmask vcast_vm_vi(vint vi) {
vmask m = _mm_and_si128(_mm_shuffle_epi32(vi, (0 << 6) | (1 << 4) | (0 << 2) | (0 << 0)), _mm_set_epi32(0, -1, 0, -1));
return vor_vm_vm_vm(vcastu_vm_vi(vgt_vo_vi_vi(vcast_vi_i(0), vi)), m);
}
static INLINE vint vcast_vi_vm(vmask vm) { return _mm_shuffle_epi32(vm, 0x08); }
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; }
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return m; }
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return v; }
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return m; }

File diff suppressed because it is too large

View File

@@ -0,0 +1,871 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdint.h>
#include "misc.h"
#ifndef CONFIG
#error CONFIG macro not defined
#endif
#define ENABLE_DP
#define ENABLE_SP
#define LOG2VECTLENDP CONFIG
#define VECTLENDP (1 << LOG2VECTLENDP)
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
#define DFTPRIORITY LOG2VECTLENDP
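// The vector types are declared with Clang's ext_vector_type extension when
// available, and with GCC's vector_size attribute otherwise; element counts
// are derived from VECTLENDP.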
#if defined(__clang__)
#define ISANAME "Clang Vector Extension"
typedef uint32_t vmask __attribute__((ext_vector_type(VECTLENDP*2)));
typedef uint32_t vopmask __attribute__((ext_vector_type(VECTLENDP*2)));
typedef double vdouble __attribute__((ext_vector_type(VECTLENDP)));
typedef int32_t vint __attribute__((ext_vector_type(VECTLENDP)));
typedef float vfloat __attribute__((ext_vector_type(VECTLENDP*2)));
typedef int32_t vint2 __attribute__((ext_vector_type(VECTLENDP*2)));
#ifdef ENABLE_LONGDOUBLE
typedef uint8_t vmaskl __attribute__((ext_vector_type(sizeof(long double)*VECTLENDP)));
typedef long double vlongdouble __attribute__((ext_vector_type(VECTLENDP)));
#endif
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
typedef uint8_t vmaskq __attribute__((ext_vector_type(sizeof(Sleef_quad)*VECTLENDP)));
#ifdef ENABLE_LONGDOUBLE
typedef Sleef_quad vquad __attribute__((ext_vector_type(VECTLENDP)));
#endif
#endif
#elif defined(__GNUC__)
#define ISANAME "GCC Vector Extension"
typedef uint32_t vmask __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP*2)));
typedef uint32_t vopmask __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP*2)));
typedef double vdouble __attribute__((vector_size(sizeof(double)*VECTLENDP)));
typedef int32_t vint __attribute__((vector_size(sizeof(int32_t)*VECTLENDP)));
typedef float vfloat __attribute__((vector_size(sizeof(float)*VECTLENDP*2)));
typedef int32_t vint2 __attribute__((vector_size(sizeof(int32_t)*VECTLENDP*2)));
#ifdef ENABLE_LONGDOUBLE
typedef uint8_t vmaskl __attribute__((vector_size(sizeof(long double)*VECTLENDP)));
typedef long double vlongdouble __attribute__((vector_size(sizeof(long double)*VECTLENDP)));
#endif
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
typedef uint8_t vmaskq __attribute__((vector_size(sizeof(Sleef_quad)*VECTLENDP)));
typedef Sleef_quad vquad __attribute__((vector_size(sizeof(Sleef_quad)*VECTLENDP)));
#endif
#endif
//
#if VECTLENDP == 2
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return (vopmask){ m[1], m[3], 0, 0 }; }
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return (vopmask){ m[0], m[0], m[1], m[1] }; }
static INLINE vint vcast_vi_i(int i) { return (vint) { i, i }; }
static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i }; }
static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f }; }
static INLINE vdouble vcast_vd_d(double d) { return (vdouble) { d, d }; }
#ifdef ENABLE_LONGDOUBLE
static INLINE vlongdouble vcast_vl_l(long double d) { return (vlongdouble) { d, d }; }
#endif
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
static INLINE vquad vcast_vq_q(Sleef_quad d) { return (vquad) { d, d }; }
#endif
static INLINE vmask vcast_vm_i_i(int h, int l) { return (vmask){ l, h, l, h }; }
static INLINE vint2 vcastu_vi2_vi(vint vi) { return (vint2){ 0, vi[0], 0, vi[1] }; }
static INLINE vint vcastu_vi_vi2(vint2 vi2) { return (vint){ vi2[1], vi2[3] }; }
static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) { return (vint){ vi2[0], vi2[1] }; }
static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) { return (vint2){ vi[0], vi[1], 0, 0 }; }
static INLINE vdouble vrev21_vd_vd(vdouble vd) { return (vdouble) { vd[1], vd[0] }; }
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vd; }
static INLINE vfloat vrev21_vf_vf(vfloat vd) { return (vfloat) { vd[1], vd[0], vd[3], vd[2] }; }
static INLINE vfloat vreva2_vf_vf(vfloat vd) { return (vfloat) { vd[2], vd[3], vd[0], vd[1] }; }
#ifdef ENABLE_LONGDOUBLE
static INLINE vlongdouble vrev21_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[1], vd[0] }; }
static INLINE vlongdouble vreva2_vl_vl(vlongdouble vd) { return vd; }
static INLINE vlongdouble vposneg_vl_vl(vlongdouble vd) { return (vlongdouble) { +vd[0], -vd[1] }; }
static INLINE vlongdouble vnegpos_vl_vl(vlongdouble vd) { return (vlongdouble) { -vd[0], +vd[1] }; }
#endif
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
static INLINE vquad vrev21_vq_vq(vquad vd) { return (vquad) { vd[1], vd[0] }; }
static INLINE vquad vreva2_vq_vq(vquad vd) { return vd; }
static INLINE vquad vposneg_vq_vq(vquad vd) { return (vquad) { +vd[0], -vd[1] }; }
static INLINE vquad vnegpos_vq_vq(vquad vd) { return (vquad) { -vd[0], +vd[1] }; }
#endif
#define PNMASK ((vdouble) { +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0 })
static INLINE vdouble vposneg_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)PNMASK); }
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)NPMASK); }
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)PNMASKf); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)NPMASKf); }
#elif VECTLENDP == 4
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return (vopmask){ m[1], m[3], m[5], m[7], 0, 0, 0, 0 }; }
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return (vopmask){ m[0], m[0], m[1], m[1], m[2], m[2], m[3], m[3] }; }
static INLINE vint vcast_vi_i(int i) { return (vint) { i, i, i, i }; }
static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i, i, i, i, i }; }
static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f, f, f, f, f }; }
static INLINE vdouble vcast_vd_d(double d) { return (vdouble) { d, d, d, d }; }
#ifdef ENABLE_LONGDOUBLE
static INLINE vlongdouble vcast_vl_l(long double d) { return (vlongdouble) { d, d, d, d }; }
#endif
static INLINE vmask vcast_vm_i_i(int h, int l) { return (vmask){ l, h, l, h, l, h, l, h }; }
static INLINE vint2 vcastu_vi2_vi(vint vi) { return (vint2){ 0, vi[0], 0, vi[1], 0, vi[2], 0, vi[3] }; }
static INLINE vint vcastu_vi_vi2(vint2 vi2) { return (vint){ vi2[1], vi2[3], vi2[5], vi2[7] }; }
static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) { return (vint){ vi2[0], vi2[1], vi2[2], vi2[3] }; }
static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) { return (vint2){ vi[0], vi[1], vi[2], vi[3], 0, 0, 0, 0 }; }
#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0 })
static INLINE vdouble vposneg_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)PNMASK); }
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)NPMASK); }
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })
static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)PNMASKf); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)NPMASKf); }
static INLINE vdouble vrev21_vd_vd(vdouble vd) { return (vdouble) { vd[1], vd[0], vd[3], vd[2] }; }
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return (vdouble) { vd[2], vd[3], vd[0], vd[1] }; }
static INLINE vfloat vrev21_vf_vf(vfloat vd) { return (vfloat) { vd[1], vd[0], vd[3], vd[2], vd[5], vd[4], vd[7], vd[6] }; }
static INLINE vfloat vreva2_vf_vf(vfloat vd) { return (vfloat) { vd[6], vd[7], vd[4], vd[5], vd[2], vd[3], vd[0], vd[1] }; }
#ifdef ENABLE_LONGDOUBLE
static INLINE vlongdouble vrev21_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[1], vd[0], vd[3], vd[2] }; }
static INLINE vlongdouble vreva2_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[2], vd[3], vd[0], vd[1] }; }
static INLINE vlongdouble vposneg_vl_vl(vlongdouble vd) { return (vlongdouble) { +vd[0], -vd[1], +vd[2], -vd[3] }; }
static INLINE vlongdouble vnegpos_vl_vl(vlongdouble vd) { return (vlongdouble) { -vd[0], +vd[1], -vd[2], +vd[3] }; }
#endif
#elif VECTLENDP == 8
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return (vopmask){ m[1], m[3], m[5], m[7], m[9], m[11], m[13], m[15], 0, 0, 0, 0, 0, 0, 0, 0 }; }
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return (vopmask){ m[0], m[0], m[1], m[1], m[2], m[2], m[3], m[3], m[4], m[4], m[5], m[5], m[6], m[6], m[7], m[7] }; }
static INLINE vint vcast_vi_i(int i) { return (vint) { i, i, i, i, i, i, i, i }; }
static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i }; }
static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f, f, f, f, f, f, f, f, f, f, f, f, f }; }
static INLINE vdouble vcast_vd_d(double d) { return (vdouble) { d, d, d, d, d, d, d, d }; }
#ifdef ENABLE_LONGDOUBLE
static INLINE vlongdouble vcast_vl_l(long double d) { return (vlongdouble) { d, d, d, d, d, d, d, d }; }
#endif
static INLINE vmask vcast_vm_i_i(int h, int l) { return (vmask){ l, h, l, h, l, h, l, h, l, h, l, h, l, h, l, h }; }
static INLINE vint2 vcastu_vi2_vi(vint vi) { return (vint2){ 0, vi[0], 0, vi[1], 0, vi[2], 0, vi[3], 0, vi[4], 0, vi[5], 0, vi[6], 0, vi[7] }; }
static INLINE vint vcastu_vi_vi2(vint2 vi2) { return (vint){ vi2[1], vi2[3], vi2[5], vi2[7], vi2[9], vi2[11], vi2[13], vi2[15] }; }
static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) { return (vint){ vi2[0], vi2[1], vi2[2], vi2[3], vi2[4], vi2[5], vi2[6], vi2[7] }; }
static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) { return (vint2){ vi[0], vi[1], vi[2], vi[3], vi[4], vi[5], vi[6], vi[7], 0, 0, 0, 0, 0, 0, 0, 0 }; }
#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0, +0.0, -0.0, +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0, -0.0, +0.0, -0.0, +0.0 })
static INLINE vdouble vposneg_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)PNMASK); }
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)NPMASK); }
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })
static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)PNMASKf); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)NPMASKf); }
static INLINE vdouble vrev21_vd_vd(vdouble vd) { return (vdouble) { vd[1], vd[0], vd[3], vd[2], vd[5], vd[4], vd[7], vd[6] }; }
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return (vdouble) { vd[6], vd[7], vd[4], vd[5], vd[2], vd[3], vd[0], vd[1] }; }
static INLINE vfloat vrev21_vf_vf(vfloat vd) {
return (vfloat) {
vd[1], vd[0], vd[3], vd[2], vd[5], vd[4], vd[7], vd[6],
vd[9], vd[8], vd[11], vd[10], vd[13], vd[12], vd[15], vd[14] };
}
static INLINE vfloat vreva2_vf_vf(vfloat vd) {
return (vfloat) {
vd[14], vd[15], vd[12], vd[13], vd[10], vd[11], vd[8], vd[9],
vd[6], vd[7], vd[4], vd[5], vd[2], vd[3], vd[0], vd[1]};
}
#ifdef ENABLE_LONGDOUBLE
static INLINE vlongdouble vrev21_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[1], vd[0], vd[3], vd[2], vd[5], vd[4], vd[7], vd[6] }; }
static INLINE vlongdouble vreva2_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[6], vd[7], vd[4], vd[5], vd[2], vd[3], vd[0], vd[1] }; }
static INLINE vlongdouble vposneg_vl_vl(vlongdouble vd) { return (vlongdouble) { +vd[0], -vd[1], +vd[2], -vd[3], +vd[4], -vd[5], +vd[6], -vd[7] }; }
static INLINE vlongdouble vnegpos_vl_vl(vlongdouble vd) { return (vlongdouble) { -vd[0], +vd[1], -vd[2], +vd[3], -vd[4], +vd[5], -vd[6], +vd[7] }; }
#endif
#else
static INLINE vint vcast_vi_i(int k) {
vint ret;
for(int i=0;i<VECTLENDP;i++) ret[i] = k;
return ret;
}
static INLINE vint2 vcast_vi2_i(int k) {
vint2 ret;
for(int i=0;i<VECTLENSP;i++) ret[i] = k;
return ret;
}
static INLINE vdouble vcast_vd_d(double d) {
vdouble ret;
for(int i=0;i<VECTLENDP;i++) ret[i] = d;
return ret;
}
static INLINE vfloat vcast_vf_f(float f) {
vfloat ret;
for(int i=0;i<VECTLENSP;i++) ret[i] = f;
return ret;
}
#ifdef ENABLE_LONGDOUBLE
static INLINE vlongdouble vcast_vl_l(long double d) {
vlongdouble ret;
for(int i=0;i<VECTLENDP;i++) ret[i] = d;
return ret;
}
#endif
static INLINE vopmask vcast_vo32_vo64(vopmask m) {
vopmask ret;
for(int i=0;i<VECTLENDP;i++) ret[i] = m[i*2+1];
for(int i=VECTLENDP;i<VECTLENDP*2;i++) ret[i] = 0;
return ret;
}
static INLINE vopmask vcast_vo64_vo32(vopmask m) {
vopmask ret;
for(int i=0;i<VECTLENDP;i++) ret[i*2] = ret[i*2+1] = m[i];
return ret;
}
static INLINE vmask vcast_vm_i_i(int h, int l) {
vmask ret;
for(int i=0;i<VECTLENDP;i++) {
ret[i*2+0] = l;
ret[i*2+1] = h;
}
return ret;
}
static INLINE vint2 vcastu_vi2_vi(vint vi) {
vint2 ret;
for(int i=0;i<VECTLENDP;i++) {
ret[i*2+0] = 0;
ret[i*2+1] = vi[i];
}
return ret;
}
static INLINE vint vcastu_vi_vi2(vint2 vi2) {
vint ret;
for(int i=0;i<VECTLENDP;i++) ret[i] = vi2[i*2+1];
return ret;
}
static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) {
vint ret;
for(int i=0;i<VECTLENDP;i++) ret[i] = vi2[i];
return ret;
}
static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) {
vint2 ret;
for(int i=0;i<VECTLENDP;i++) ret[i] = vi[i];
for(int i=VECTLENDP;i<VECTLENDP*2;i++) ret[i] = 0;
return ret;
}
static INLINE vdouble vrev21_vd_vd(vdouble d0) {
vdouble r;
for(int i=0;i<VECTLENDP/2;i++) {
r[i*2+0] = d0[i*2+1];
r[i*2+1] = d0[i*2+0];
}
return r;
}
static INLINE vdouble vreva2_vd_vd(vdouble d0) {
vdouble r;
for(int i=0;i<VECTLENDP/2;i++) {
r[i*2+0] = d0[(VECTLENDP/2-1-i)*2+0];
r[i*2+1] = d0[(VECTLENDP/2-1-i)*2+1];
}
return r;
}
static INLINE vfloat vrev21_vf_vf(vfloat d0) {
vfloat r;
for(int i=0;i<VECTLENSP/2;i++) {
r[i*2+0] = d0[i*2+1];
r[i*2+1] = d0[i*2+0];
}
return r;
}
static INLINE vfloat vreva2_vf_vf(vfloat d0) {
vfloat r;
for(int i=0;i<VECTLENSP/2;i++) {
r[i*2+0] = d0[(VECTLENSP/2-1-i)*2+0];
r[i*2+1] = d0[(VECTLENSP/2-1-i)*2+1];
}
return r;
}
#ifdef ENABLE_LONGDOUBLE
static INLINE vlongdouble vrev21_vl_vl(vlongdouble d0) {
vlongdouble r;
for(int i=0;i<VECTLENDP/2;i++) {
r[i*2+0] = d0[i*2+1];
r[i*2+1] = d0[i*2+0];
}
return r;
}
static INLINE vlongdouble vreva2_vl_vl(vlongdouble d0) {
vlongdouble r;
for(int i=0;i<VECTLENDP/2;i++) {
r[i*2+0] = d0[(VECTLENDP/2-1-i)*2+0];
r[i*2+1] = d0[(VECTLENDP/2-1-i)*2+1];
}
return r;
}
#endif
static INLINE vdouble vposneg_vd_vd(vdouble d0) {
vdouble r;
for(int i=0;i<VECTLENDP/2;i++) {
r[i*2+0] = +d0[i*2+0];
r[i*2+1] = -d0[i*2+1];
}
return r;
}
static INLINE vdouble vnegpos_vd_vd(vdouble d0) {
vdouble r;
for(int i=0;i<VECTLENDP/2;i++) {
r[i*2+0] = -d0[i*2+0];
r[i*2+1] = +d0[i*2+1];
}
return r;
}
static INLINE vfloat vposneg_vf_vf(vfloat d0) {
vfloat r;
for(int i=0;i<VECTLENSP/2;i++) {
r[i*2+0] = +d0[i*2+0];
r[i*2+1] = -d0[i*2+1];
}
return r;
}
static INLINE vfloat vnegpos_vf_vf(vfloat d0) {
vfloat r;
for(int i=0;i<VECTLENSP/2;i++) {
r[i*2+0] = -d0[i*2+0];
r[i*2+1] = +d0[i*2+1];
}
return r;
}
#ifdef ENABLE_LONGDOUBLE
static INLINE vlongdouble vposneg_vl_vl(vlongdouble d0) {
vlongdouble r;
for(int i=0;i<VECTLENDP/2;i++) {
r[i*2+0] = +d0[i*2+0];
r[i*2+1] = -d0[i*2+1];
}
return r;
}
static INLINE vlongdouble vnegpos_vl_vl(vlongdouble d0) {
vlongdouble r;
for(int i=0;i<VECTLENDP/2;i++) {
r[i*2+0] = -d0[i*2+0];
r[i*2+1] = +d0[i*2+1];
}
return r;
}
#endif
#endif
//
static INLINE int vavailability_i(int name) { return -1; }
static INLINE void vprefetch_v_p(const void *ptr) { }
static INLINE int vtestallones_i_vo64(vopmask g) {
int ret = 1; for(int i=0;i<VECTLENDP*2;i++) ret = ret && g[i]; return ret;
}
static INLINE int vtestallones_i_vo32(vopmask g) {
int ret = 1; for(int i=0;i<VECTLENDP*2;i++) ret = ret && g[i]; return ret;
}
//
static vint2 vloadu_vi2_p(int32_t *p) {
vint2 vi;
for(int i=0;i<VECTLENSP;i++) vi[i] = p[i];
return vi;
}
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) {
for(int i=0;i<VECTLENSP;i++) p[i] = v[i];
}
static vint vloadu_vi_p(int32_t *p) {
vint vi;
for(int i=0;i<VECTLENDP;i++) vi[i] = p[i];
return vi;
}
static void vstoreu_v_p_vi(int32_t *p, vint v) {
for(int i=0;i<VECTLENDP;i++) p[i] = v[i];
}
//
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return x & y; }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return y & ~x; }
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return x | y; }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return x ^ y; }
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return x & y; }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return y & ~x; }
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return x | y; }
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return x ^ y; }
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return x & y; }
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return y & ~x; }
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return x | y; }
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return x ^ y; }
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return x & y; }
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return y & ~x; }
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return x | y; }
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return x ^ y; }
//
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return (vdouble)(((vmask)o & (vmask)x) | ((vmask)y & ~(vmask)o)); }
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y) { return (vint2)(((vmask)o & (vmask)x) | ((vmask)y & ~(vmask)o)); }
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
}
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}
static INLINE vdouble vcast_vd_vi(vint vi) {
#if defined(__clang__)
return __builtin_convertvector(vi, vdouble);
#else
vdouble vd;
for(int i=0;i<VECTLENDP;i++) vd[i] = vi[i];
return vd;
#endif
}
static INLINE vint vtruncate_vi_vd(vdouble vd) {
#if defined(__clang__)
return __builtin_convertvector(vd, vint);
#else
vint vi;
for(int i=0;i<VECTLENDP;i++) vi[i] = vd[i];
return vi;
#endif
}
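// Round to nearest, ties away from zero: add or subtract 0.5 depending on
// the sign, then truncate.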
static INLINE vint vrint_vi_vd(vdouble vd) { return vtruncate_vi_vd(vsel_vd_vo_vd_vd((vopmask)(vd < 0.0), vd - 0.5, vd + 0.5)); }
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); }
static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); }
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
#if defined(__clang__)
typedef int64_t vi64 __attribute__((ext_vector_type(VECTLENDP)));
#else
typedef int64_t vi64 __attribute__((vector_size(sizeof(int64_t)*VECTLENDP)));
#endif
return (vopmask)((vi64)x == (vi64)y);
}
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) {
#if defined(__clang__)
typedef int64_t vi64 __attribute__((ext_vector_type(VECTLENDP)));
#else
typedef int64_t vi64 __attribute__((vector_size(sizeof(int64_t)*VECTLENDP)));
#endif
return (vmask)((vi64)x + (vi64)y);
}
//
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return (vmask)vd; }
static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { return (vint2)vd; }
static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { return (vdouble)vi; }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return (vdouble)vm; }
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return x + y; }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return x - y; }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return x * y; }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return x / y; }
static INLINE vdouble vrec_vd_vd(vdouble x) { return 1.0 / x; }
static INLINE vdouble vabs_vd_vd(vdouble d) { return (vdouble)((vmask)d & ~(vmask)vcast_vd_d(-0.0)); }
static INLINE vdouble vneg_vd_vd(vdouble d) { return -d; }
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return x * y + z; }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return x * y - z; }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return vsel_vd_vo_vd_vd((vopmask)(x > y), x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return vsel_vd_vo_vd_vd((vopmask)(x < y), x, y); }
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x == y); }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x != y); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x < y); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x <= y); }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x > y); }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x >= y); }
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return x + y; }
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return x - y; }
static INLINE vint vneg_vi_vi(vint e) { return -e; }
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return x & y; }
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return y & ~x; }
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return x | y; }
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return x ^ y; }
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return vreinterpretFirstHalf_vi_vi2((vint2)x) & y; }
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return y & ~vreinterpretFirstHalf_vi_vi2((vint2)x); }
static INLINE vint vsll_vi_vi_i(vint x, int c) {
#if defined(__clang__)
typedef uint32_t vu __attribute__((ext_vector_type(VECTLENDP)));
#else
typedef uint32_t vu __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP)));
#endif
return (vint)(((vu)x) << c);
}
static INLINE vint vsrl_vi_vi_i(vint x, int c) {
#if defined(__clang__)
typedef uint32_t vu __attribute__((ext_vector_type(VECTLENDP)));
#else
typedef uint32_t vu __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP)));
#endif
return (vint)(((vu)x) >> c);
}
static INLINE vint vsra_vi_vi_i(vint x, int c) { return x >> c; }
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return x == y; }
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return x > y; }
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return (vopmask)vreinterpretFirstHalf_vi2_vi(x == y); }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return (vopmask)vreinterpretFirstHalf_vi2_vi(x > y); }
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
return vor_vi_vi_vi(vand_vi_vi_vi(vreinterpretFirstHalf_vi_vi2((vint2)m), x),
vandnot_vi_vi_vi(vreinterpretFirstHalf_vi_vi2((vint2)m), y));
}
static INLINE vopmask visinf_vo_vd(vdouble d) { return (vopmask)(vabs_vd_vd(d) == SLEEF_INFINITY); }
static INLINE vopmask vispinf_vo_vd(vdouble d) { return (vopmask)(d == SLEEF_INFINITY); }
static INLINE vopmask visminf_vo_vd(vdouble d) { return (vopmask)(d == -SLEEF_INFINITY); }
static INLINE vopmask visnan_vo_vd(vdouble d) { return (vopmask)(d != d); }
static INLINE vdouble vsqrt_vd_vd(vdouble d) {
#if defined(__clang__)
typedef int64_t vi64 __attribute__((ext_vector_type(VECTLENDP)));
#else
typedef int64_t vi64 __attribute__((vector_size(sizeof(int64_t)*VECTLENDP)));
#endif
vdouble q = vcast_vd_d(1);
vopmask o = (vopmask)(d < 8.636168555094445E-78);
d = (vdouble)((o & (vmask)(d * 1.157920892373162E77)) | (~o & (vmask)d));
q = (vdouble)((o & (vmask)vcast_vd_d(2.9387358770557188E-39)) | (~o & (vmask)vcast_vd_d(1)));
q = (vdouble)vor_vm_vm_vm(vlt_vo_vd_vd(d, vcast_vd_d(0)), (vmask)q);
vdouble x = (vdouble)(0x5fe6ec85e7de30daLL - ((vi64)(d + 1e-320) >> 1));
x = x * ( 3 - d * x * x);
x = x * ( 12 - d * x * x);
x = x * (768 - d * x * x);
x *= 1.0 / (1 << 13);
x = (d - (d * x) * (d * x)) * (x * 0.5) + d * x;
return x * q;
}
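// The portable sqrt above: inputs below 2^-256 are prescaled by 2^256
// (with q = 2^-128 compensating the result), and the OR with the d < 0
// mask turns q into a NaN pattern so negative inputs yield NaN. The bit
// trick 0x5fe6ec85e7de30da - (bits >> 1) seeds an initial 1/sqrt(d)
// estimate (the 1e-320 offset appears to keep the seed finite for zero),
// three Newton-Raphson refinements follow with their 1/2 factors folded
// into the single 1/8192 scale, and the last line converts the refined
// 1/sqrt(d) into sqrt(d). A scalar sketch of that final step
// (illustrative only):
//   double s = d * x;                       // s ~ sqrt(d), x ~ 1/sqrt(d)
//   double r = (d - s * s) * (x * 0.5) + s; // one last Newton correction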
static INLINE double vcast_d_vd(vdouble v) { return v[0]; }
static INLINE float vcast_f_vf(vfloat v) { return v[0]; }
static INLINE vdouble vload_vd_p(const double *ptr) { return *(vdouble *)ptr; }
static INLINE vdouble vloadu_vd_p(const double *ptr) {
vdouble vd;
for(int i=0;i<VECTLENDP;i++) vd[i] = ptr[i];
return vd;
}
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
vdouble vd;
for(int i=0;i<VECTLENDP;i++) vd[i] = ptr[vi[i]];
return vd;
}
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) {
for(int i=0;i<VECTLENDP;i++) ptr[i] = v[i];
}
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
for(int i=0;i<VECTLENDP/2;i++) {
*(ptr+(offset + step * i)*2 + 0) = v[i*2+0];
*(ptr+(offset + step * i)*2 + 1) = v[i*2+1];
}
}
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vscatter2_v_p_i_i_vd(ptr, offset, step, v); }
//
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return (vfloat)(((vmask)o & (vmask)x) | (~(vmask)o & (vmask)y)); }
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}
static INLINE vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { return (vmask)vi; }
static INLINE vfloat vcast_vf_vi2(vint2 vi) {
#if defined(__clang__)
return __builtin_convertvector(vi, vfloat);
#else
vfloat vf;
for(int i=0;i<VECTLENDP*2;i++) vf[i] = vi[i];
return vf;
#endif
}
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) {
#if defined(__clang__)
return __builtin_convertvector(vf, vint2);
#else
vint2 vi;
for(int i=0;i<VECTLENDP*2;i++) vi[i] = vf[i];
return vi;
#endif
}
static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vtruncate_vi2_vf(vsel_vf_vo_vf_vf((vopmask)(vf < 0), vf - 0.5f, vf + 0.5f)); }
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
static INLINE vfloat vrint_vf_vf(vfloat vd) { return vcast_vf_vi2(vrint_vi2_vf(vd)); }
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (vfloat)vm; }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return (vfloat)vi; }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return (vint2)vf; }
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return x + y; }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return x - y; }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return x * y; }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return x / y; }
static INLINE vfloat vrec_vf_vf(vfloat x) { return 1.0f / x; }
static INLINE vfloat vabs_vf_vf(vfloat f) { return (vfloat)vandnot_vm_vm_vm((vmask)vcast_vf_f(-0.0f), (vmask)f); }
static INLINE vfloat vneg_vf_vf(vfloat d) { return -d; }
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return x*y+z; }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return z-x*y; }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return vsel_vf_vo_vf_vf((vopmask)(x > y), x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return vsel_vf_vo_vf_vf((vopmask)(x < y), x, y); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vadd_vf_vf_vf(x, vnegpos_vf_vf(y)); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x == y); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x != y); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x < y); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x <= y); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x > y); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x >= y); }
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return x + y; }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return x - y; }
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return -e; }
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return x & y; }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return y & ~x; }
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return x | y; }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return x ^ y; }
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)x & y; }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return y & ~(vint2)x; }
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) {
#if defined(__clang__)
typedef uint32_t vu __attribute__((ext_vector_type(VECTLENDP*2)));
#else
typedef uint32_t vu __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP*2)));
#endif
return (vint2)(((vu)x) << c);
}
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) {
#if defined(__clang__)
typedef uint32_t vu __attribute__((ext_vector_type(VECTLENDP*2)));
#else
typedef uint32_t vu __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP*2)));
#endif
return (vint2)(((vu)x) >> c);
}
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return x >> c; }
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return (vopmask)(x == y); }
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return (vopmask)(x > y); }
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return x == y; }
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return x > y; }
static INLINE vopmask visinf_vo_vf(vfloat d) { return (vopmask)(vabs_vf_vf(d) == SLEEF_INFINITYf); }
static INLINE vopmask vispinf_vo_vf(vfloat d) { return (vopmask)(d == SLEEF_INFINITYf); }
static INLINE vopmask visminf_vo_vf(vfloat d) { return (vopmask)(d == -SLEEF_INFINITYf); }
static INLINE vopmask visnan_vo_vf(vfloat d) { return (vopmask)(d != d); }
static INLINE vfloat vsqrt_vf_vf(vfloat d) {
vfloat q = vcast_vf_f(1);
vopmask o = (vopmask)(d < 5.4210108624275221700372640043497e-20f); // 2^-64
d = (vfloat)((o & (vmask)(d * vcast_vf_f(18446744073709551616.0f))) | (~o & (vmask)d)); // 2^64
q = (vfloat)((o & (vmask)vcast_vf_f(0.00000000023283064365386962890625f)) | (~o & (vmask)vcast_vf_f(1))); // 2^-32
q = (vfloat)vor_vm_vm_vm(vlt_vo_vf_vf(d, vcast_vf_f(0)), (vmask)q);
vfloat x = (vfloat)(0x5f330de2 - (((vint2)d) >> 1));
x = x * ( 3.0f - d * x * x);
x = x * (12.0f - d * x * x);
x *= 0.0625f;
x = (d - (d * x) * (d * x)) * (x * 0.5f) + d * x;
return x * q;
}
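// The same scheme in single precision: inputs below 2^-64 are prescaled
// by 2^64 (q = 2^-32 compensates), 0x5f330de2 is a variant of the
// familiar "fast inverse square root" magic constant, and two Newton
// refinements (scale factors folded into 0.0625 = 1/16) precede the same
// final correction step as in the double-precision version above.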
static INLINE vfloat vload_vf_p(const float *ptr) { return *(vfloat *)ptr; }
static INLINE vfloat vloadu_vf_p(const float *ptr) {
vfloat vf;
for(int i=0;i<VECTLENSP;i++) vf[i] = ptr[i];
return vf;
}
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
vfloat vf;
for(int i=0;i<VECTLENSP;i++) vf[i] = ptr[vi2[i]];
return vf;
}
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) {
for(int i=0;i<VECTLENSP;i++) ptr[i] = v[i];
}
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
for(int i=0;i<VECTLENSP/2;i++) {
*(ptr+(offset + step * i)*2 + 0) = v[i*2+0];
*(ptr+(offset + step * i)*2 + 1) = v[i*2+1];
}
}
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
//
#ifdef ENABLE_LONGDOUBLE
static INLINE vlongdouble vadd_vl_vl_vl(vlongdouble x, vlongdouble y) { return x + y; }
static INLINE vlongdouble vsub_vl_vl_vl(vlongdouble x, vlongdouble y) { return x - y; }
static INLINE vlongdouble vmul_vl_vl_vl(vlongdouble x, vlongdouble y) { return x * y; }
static INLINE vlongdouble vneg_vl_vl(vlongdouble d) { return -d; }
static INLINE vlongdouble vsubadd_vl_vl_vl(vlongdouble x, vlongdouble y) { return vadd_vl_vl_vl(x, vnegpos_vl_vl(y)); }
static INLINE vlongdouble vmlsubadd_vl_vl_vl_vl(vlongdouble x, vlongdouble y, vlongdouble z) { return vsubadd_vl_vl_vl(vmul_vl_vl_vl(x, y), z); }
static INLINE vlongdouble vload_vl_p(const long double *ptr) { return *(vlongdouble *)ptr; }
static INLINE vlongdouble vloadu_vl_p(const long double *ptr) {
vlongdouble vd;
for(int i=0;i<VECTLENDP;i++) vd[i] = ptr[i];
return vd;
}
static INLINE void vstore_v_p_vl(long double *ptr, vlongdouble v) { *(vlongdouble *)ptr = v; }
static INLINE void vstoreu_v_p_vl(long double *ptr, vlongdouble v) {
for(int i=0;i<VECTLENDP;i++) ptr[i] = v[i];
}
static INLINE void vstream_v_p_vl(long double *ptr, vlongdouble v) { *(vlongdouble *)ptr = v; }
static INLINE void vscatter2_v_p_i_i_vl(long double *ptr, int offset, int step, vlongdouble v) {
for(int i=0;i<VECTLENDP/2;i++) {
*(ptr+(offset + step * i)*2 + 0) = v[i*2+0];
*(ptr+(offset + step * i)*2 + 1) = v[i*2+1];
}
}
static INLINE void vsscatter2_v_p_i_i_vl(long double *ptr, int offset, int step, vlongdouble v) { vscatter2_v_p_i_i_vl(ptr, offset, step, v); }
#endif
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
static INLINE vquad vadd_vq_vq_vq(vquad x, vquad y) { return x + y; }
static INLINE vquad vsub_vq_vq_vq(vquad x, vquad y) { return x - y; }
static INLINE vquad vmul_vq_vq_vq(vquad x, vquad y) { return x * y; }
static INLINE vquad vneg_vq_vq(vquad d) { return -d; }
static INLINE vquad vsubadd_vq_vq_vq(vquad x, vquad y) { return vadd_vq_vq_vq(x, vnegpos_vq_vq(y)); }
static INLINE vquad vmlsubadd_vq_vq_vq_vq(vquad x, vquad y, vquad z) { return vsubadd_vq_vq_vq(vmul_vq_vq_vq(x, y), z); }
static INLINE vquad vload_vq_p(const Sleef_quad *ptr) { return *(vquad *)ptr; }
static INLINE vquad vloadu_vq_p(const Sleef_quad *ptr) {
vquad vd;
for(int i=0;i<VECTLENDP;i++) vd[i] = ptr[i];
return vd;
}
static INLINE void vstore_v_p_vq(Sleef_quad *ptr, vquad v) { *(vquad *)ptr = v; }
static INLINE void vstoreu_v_p_vq(Sleef_quad *ptr, vquad v) {
for(int i=0;i<VECTLENDP;i++) ptr[i] = v[i];
}
static INLINE void vstream_v_p_vq(Sleef_quad *ptr, vquad v) { *(vquad *)ptr = v; }
static INLINE void vscatter2_v_p_i_i_vq(Sleef_quad *ptr, int offset, int step, vquad v) {
for(int i=0;i<VECTLENDP/2;i++) {
*(ptr+(offset + step * i)*2 + 0) = v[i*2+0];
*(ptr+(offset + step * i)*2 + 1) = v[i*2+1];
}
}
static INLINE void vsscatter2_v_p_i_i_vq(Sleef_quad *ptr, int offset, int step, vquad v) { vscatter2_v_p_i_i_vq(ptr, offset, step, v); }
#endif


@@ -0,0 +1,25 @@
# Compiler properties
set(COMMON_TARGET_PROPERTIES
C_STANDARD 99 # -std=gnu99
)
if (BUILD_SHARED_LIBS)
list(APPEND COMMON_TARGET_PROPERTIES POSITION_INDEPENDENT_CODE ON) # -fPIC
endif()
# This is a workaround for an AppVeyor bug
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SLEEF_C_FLAGS}")
# Target TARGET_LIBCOMMON_OBJ
add_library(${TARGET_LIBCOMMON_OBJ} OBJECT common.c)
set_target_properties(${TARGET_LIBCOMMON_OBJ} PROPERTIES ${COMMON_TARGET_PROPERTIES})
# Target TARGET_LIBARRAYMAP_OBJ
add_library(${TARGET_LIBARRAYMAP_OBJ} OBJECT arraymap.c)
set_target_properties(${TARGET_LIBARRAYMAP_OBJ} PROPERTIES ${COMMON_TARGET_PROPERTIES})
add_host_executable("addSuffix" addSuffix.c)
set_target_properties("addSuffix" PROPERTIES C_STANDARD 99)


@@ -0,0 +1,234 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <stdbool.h>
#define N 1000
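// fopen() wrapper: on MinGW the path may be a Cygwin-style path that the
// C runtime cannot open directly, so after a failed fopen() the path is
// converted with `cygpath -m` and the open is retried. On other platforms
// this is plain fopen().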
FILE *cygopen(const char *path, const char *mode) {
#if defined(__MINGW64__) || defined(__MINGW32__)
FILE *fp = fopen(path, mode);
if (fp != NULL) return fp;
char *buf = malloc(strlen(path) + N + 1);
snprintf(buf, strlen(path) + N, "cygpath -m '%s'", path);
FILE *pfp = popen(buf, "r");
if (pfp == NULL || fgets(buf, N, pfp) == NULL) {
if (pfp != NULL) pclose(pfp);
free(buf);
return NULL;
}
pclose(pfp);
int len = strlen(buf);
if (0 < len && len < N && buf[len-1] == '\n') buf[len-1] = '\0';
fp = fopen(buf, mode);
free(buf);
return fp;
#else
return fopen(path, mode);
#endif
}
int nkeywords = 0, nalloc = 0;
char **keywords = NULL, *suffix = NULL;
int nIgnore = 0;
char **ignore = NULL;
void insert(char *buf) {
for(int i=0;i<nIgnore;i++) if (strcmp(ignore[i], buf) == 0) return;
for(int i=0;i<nkeywords;i++) {
if (strcmp(keywords[i], buf) == 0) printf("%s", suffix);
}
}
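// doit() echoes the input verbatim while running a small state machine:
// state 0 scans ordinary text and detects comments, preprocessor lines,
// and string/character literals; state 1 accumulates an identifier; and
// states 2-4 pass strings, character constants and /* */ comments through
// untouched. Each time a complete identifier has been echoed, insert()
// appends the suffix if the identifier is in the keywords list and not in
// the ignore list.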
void doit(FILE *fp) {
int state = 0;
bool nl = true;
char buf[N+10], *p = buf;
for(;;) {
int c = getc(fp);
if (c == EOF) break;
switch(state) {
case 0:
if (isalnum(c) || c == '_') {
ungetc(c, fp);
p = buf;
state = 1;
break;
}
if (c == '/') {
int c2 = getc(fp);
if (c2 == '*') {
putc(c, stdout);
putc(c2, stdout);
state = 4;
break;
} else if (c2 == '/') {
putc(c, stdout);
putc(c2, stdout);
do {
c = getc(fp);
if (c == EOF) break; // guard against a // comment ending at EOF
putc(c, stdout);
} while(c != '\n');
break;
}
ungetc(c2, fp);
}
if (nl && c == '#') {
putc(c, stdout);
do {
c = getc(fp);
if (c == EOF) break; // guard against a file ending without a newline
putc(c, stdout);
} while(c != '\n');
break;
}
putc(c, stdout);
if (!isspace(c)) nl = false;
if (c == '\n') nl = true;
if (c == '\"') state = 2;
if (c == '\'') state = 3;
break;
case 1: // Identifier
if (isalnum(c) || c == '_') {
if (p - buf < N) { *p++ = c; *p = '\0'; }
putc(c, stdout);
} else if (c == '\"') {
insert(buf);
putc(c, stdout);
state = 2;
} else if (c == '\'') {
insert(buf);
putc(c, stdout);
state = 3;
} else {
insert(buf);
putc(c, stdout);
state = 0;
}
break;
case 2: // String
if (c == '\\') {
putc(c, stdout);
putc(getc(fp), stdout);
} else if (c == '\"') {
putc(c, stdout);
state = 0;
} else {
putc(c, stdout);
}
break;
case 3: // Character
if (c == '\\') {
putc(c, stdout);
putc(getc(fp), stdout);
} else if (c == '\'') {
putc(c, stdout);
state = 0;
} else {
putc(c, stdout);
}
break;
case 4: // Comment
if (c == '*') {
int c2 = getc(fp);
if (c2 == '/') {
putc(c, stdout);
putc(c2, stdout);
state = 0;
break;
}
ungetc(c2, fp);
}
putc(c, stdout);
break;
}
}
}
int main(int argc, char **argv) {
nalloc = 1;
keywords = malloc(sizeof(char *) * nalloc);
if (argc < 2) {
fprintf(stderr, "%s <input file>\n", argv[0]);
fprintf(stderr, "Print the file on the standard output\n");
fprintf(stderr, "\n");
fprintf(stderr, "%s <input file> <keywords file> <suffix> [<keywords to ignore> ... ]\n", argv[0]);
fprintf(stderr, "Add the suffix to keywords\n");
exit(-1);
}
char buf[N];
if (argc == 2) {
FILE *fp = cygopen(argv[1], "r");
if (fp == NULL) {
fprintf(stderr, "Cannot open %s\n", argv[1]);
exit(-1);
}
while(fgets(buf, N, fp) != NULL) {
fputs(buf, stdout);
}
fclose(fp);
exit(0);
}
FILE *fp = cygopen(argv[2], "r");
if (fp == NULL) {
fprintf(stderr, "Cannot open %s\n", argv[2]);
exit(-1);
}
while(fgets(buf, N, fp) != NULL) {
if (strlen(buf) >= 1 && buf[strlen(buf)-1] == '\n') buf[strlen(buf)-1] = '\0'; // strip only a trailing newline
keywords[nkeywords] = malloc(sizeof(char) * (strlen(buf) + 1));
strcpy(keywords[nkeywords], buf);
nkeywords++;
if (nkeywords >= nalloc) {
nalloc *= 2;
keywords = realloc(keywords, sizeof(char *) * nalloc);
}
}
fclose(fp);
nIgnore = argc - 4;
ignore = argv + 4;
suffix = argv[3];
fp = cygopen(argv[1], "r");
if (fp == NULL) {
fprintf(stderr, "Cannot open %s\n", argv[1]);
exit(-1);
}
doit(fp);
fclose(fp);
exit(0);
}
// cat sleef*inline*.h | egrep -o '[a-zA-Z_][0-9a-zA-Z_]*' | sort | uniq > cand.txt
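// Illustrative invocation (file names hypothetical): append a suffix to
// every listed identifier in a generated header, e.g.
//   addSuffix sleefinline_purec_scalar.h keywords.txt _purec_scalar > out.h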


@@ -0,0 +1,347 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <ctype.h>
#include <inttypes.h>
#include <assert.h>
//
#if !(defined(__MINGW32__) || defined(__MINGW64__) || defined(_MSC_VER))
#include <unistd.h>
#include <sys/types.h>
#include <sys/file.h>
static void FLOCK(FILE *fp) { flock(fileno(fp), LOCK_EX); }
static void FUNLOCK(FILE *fp) { flock(fileno(fp), LOCK_UN); }
static void FTRUNCATE(FILE *fp, off_t z) {
if (ftruncate(fileno(fp), z))
;
}
static FILE *OPENTMPFILE() { return tmpfile(); }
static void CLOSETMPFILE(FILE *fp) { fclose(fp); }
#else
#include <windows.h>
#include <io.h>
static void FLOCK(FILE *fp) { }
static void FUNLOCK(FILE *fp) { }
static void FTRUNCATE(FILE *fp, long z) {
fseek(fp, 0, SEEK_SET);
SetEndOfFile((HANDLE)_get_osfhandle(_fileno(fp)));
}
static FILE *OPENTMPFILE() { return fopen("tmpfile.txt", "w+"); }
static void CLOSETMPFILE(FILE *fp) {
fclose(fp);
remove("tmpfile.txt");
}
#endif
//
#define MAGIC_ARRAYMAPNODE 0xf73130fa
#define MAGIC_ARRAYMAP 0x8693bd21
#define LOGNBUCKETS 8
#define NBUCKETS (1 << LOGNBUCKETS)
static int hash(uint64_t key) {
return (key ^ (key >> LOGNBUCKETS) ^ (key >> (LOGNBUCKETS*2)) ^ (key >> (LOGNBUCKETS*3))) & (NBUCKETS-1);
}
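// Bucket index: XOR-folds the key with copies shifted right by 8, 16 and
// 24 bits and keeps the low LOGNBUCKETS bits, spreading the low 32 bits
// of the key across the 256 buckets.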
static void String_trim(char *str) {
char *dst = str, *src = str, *pterm = src;
while(*src != '\0' && isspace((int)*src)) src++;
for(;*src != '\0';src++) {
*dst++ = *src;
if (!isspace((int)*src)) pterm = dst;
}
*pterm = '\0';
}
typedef struct ArrayMapNode {
uint32_t magic;
uint64_t key;
void *value;
} ArrayMapNode;
typedef struct ArrayMap {
uint32_t magic;
ArrayMapNode *array[NBUCKETS];
int size[NBUCKETS], capacity[NBUCKETS], totalSize;
} ArrayMap;
ArrayMap *initArrayMap() {
ArrayMap *thiz = (ArrayMap *)calloc(1, sizeof(ArrayMap));
thiz->magic = MAGIC_ARRAYMAP;
for(int i=0;i<NBUCKETS;i++) {
thiz->capacity[i] = 8;
thiz->array[i] = (ArrayMapNode *)malloc(thiz->capacity[i] * sizeof(ArrayMapNode));
thiz->size[i] = 0;
}
thiz->totalSize = 0;
return thiz;
}
void ArrayMap_dispose(ArrayMap *thiz) {
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
for(int j=0;j<NBUCKETS;j++) {
for(int i=0;i<thiz->size[j];i++) {
assert(thiz->array[j][i].magic == MAGIC_ARRAYMAPNODE);
thiz->array[j][i].magic = 0;
}
free(thiz->array[j]);
}
thiz->magic = 0;
free(thiz);
}
int ArrayMap_size(ArrayMap *thiz) {
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
return thiz->totalSize;
}
uint64_t *ArrayMap_keyArray(ArrayMap *thiz) {
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
uint64_t *a = (uint64_t *)malloc(sizeof(uint64_t) * thiz->totalSize);
int p = 0;
for(int j=0;j<NBUCKETS;j++) {
for(int i=0;i<thiz->size[j];i++) {
assert(thiz->array[j][i].magic == MAGIC_ARRAYMAPNODE);
a[p++] = thiz->array[j][i].key;
}
}
return a;
}
void **ArrayMap_valueArray(ArrayMap *thiz) {
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
void **a = (void **)malloc(sizeof(void *) * thiz->totalSize);
int p = 0;
for(int j=0;j<NBUCKETS;j++) {
for(int i=0;i<thiz->size[j];i++) {
assert(thiz->array[j][i].magic == MAGIC_ARRAYMAPNODE);
a[p++] = thiz->array[j][i].value;
}
}
return a;
}
void *ArrayMap_remove(ArrayMap *thiz, uint64_t key) {
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
int h = hash(key);
for(int i=0;i<thiz->size[h];i++) {
assert(thiz->array[h][i].magic == MAGIC_ARRAYMAPNODE);
if (thiz->array[h][i].key == key) {
void *old = thiz->array[h][i].value;
thiz->array[h][i].key = thiz->array[h][thiz->size[h]-1].key;
thiz->array[h][i].value = thiz->array[h][thiz->size[h]-1].value;
thiz->array[h][thiz->size[h]-1].magic = 0;
thiz->size[h]--;
thiz->totalSize--;
return old;
}
}
return NULL;
}
void *ArrayMap_put(ArrayMap *thiz, uint64_t key, void *value) {
if (value == NULL) return ArrayMap_remove(thiz, key);
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
int h = hash(key);
for(int i=0;i<thiz->size[h];i++) {
assert(thiz->array[h][i].magic == MAGIC_ARRAYMAPNODE);
if (thiz->array[h][i].key == key) {
void *old = thiz->array[h][i].value;
thiz->array[h][i].value = value;
return old;
}
}
if (thiz->size[h] >= thiz->capacity[h]) {
thiz->capacity[h] *= 2;
thiz->array[h] = (ArrayMapNode *)realloc(thiz->array[h], thiz->capacity[h] * sizeof(ArrayMapNode));
}
ArrayMapNode *n = &(thiz->array[h][thiz->size[h]++]);
n->magic = MAGIC_ARRAYMAPNODE;
n->key = key;
n->value = value;
thiz->totalSize++;
return NULL;
}
void *ArrayMap_get(ArrayMap *thiz, uint64_t key) {
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
int h = hash(key);
for(int i=0;i<thiz->size[h];i++) {
assert(thiz->array[h][i].magic == MAGIC_ARRAYMAPNODE);
if (thiz->array[h][i].key == key) {
return thiz->array[h][i].value;
}
}
return NULL;
}
#define LINELEN (1024*1024)
ArrayMap *ArrayMap_load(const char *fn, const char *prefix, const char *idstr, int doLock) {
const int idstrlen = (int)strlen(idstr);
int prefixLen = (int)strlen(prefix) + 3;
if (prefixLen >= LINELEN-10 || idstrlen >= LINELEN-10) return NULL;
FILE *fp = fopen(fn, "r");
if (fp == NULL) return NULL;
if (doLock) FLOCK(fp);
ArrayMap *thiz = initArrayMap();
char *prefix2 = malloc(prefixLen+10);
strcpy(prefix2, prefix);
String_trim(prefix2);
for(char *p = prefix2;*p != '\0';p++) {
if (*p == ':') *p = ';';
if (*p == ' ') *p = '_';
}
strcat(prefix2, " : ");
prefixLen = (int)strlen(prefix2);
char *line = malloc(sizeof(char) * (LINELEN+10));
line[idstrlen] = '\0';
if (fread(line, sizeof(char), idstrlen, fp) != idstrlen ||
strcmp(idstr, line) != 0) {
if (doLock) FUNLOCK(fp);
fclose(fp);
free(prefix2);
free(line);
return NULL;
}
for(;;) {
line[LINELEN] = '\0';
if (fgets(line, LINELEN, fp) == NULL) break;
if (strncmp(line, prefix2, prefixLen) != 0) continue;
uint64_t key;
char *value = malloc(sizeof(char) * LINELEN);
if (sscanf(line + prefixLen, "%" SCNx64 " : %s\n", &key, value) == 2) {
ArrayMap_put(thiz, (uint64_t)key, (void *)value);
} else {
free(value);
}
}
if (doLock) FUNLOCK(fp);
fclose(fp);
free(prefix2);
free(line);
return thiz;
}
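// On-disk format, as also written by ArrayMap_save() below: the file
// starts with the verbatim idstr, followed by one record per line of
// roughly the form
//   <sanitized prefix> :  <key in hex> : <value>
// where ':' and ' ' inside the prefix have been replaced by ';' and '_'.
// Loading filters on the prefix, so several maps can share one file.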
int ArrayMap_save(ArrayMap *thiz, const char *fn, const char *prefix, const char *idstr) {
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
const int idstrlen = (int)strlen(idstr);
int prefixLen = (int)strlen(prefix) + 3;
if (prefixLen >= LINELEN-10 || idstrlen >= LINELEN-10) return -1;
// Generate prefix2
char *prefix2 = malloc(prefixLen+10);
strcpy(prefix2, prefix);
String_trim(prefix2);
for(char *p = prefix2;*p != '\0';p++) {
if (*p == ':') *p = ';';
if (*p == ' ') *p = '_';
}
strcat(prefix2, " : ");
prefixLen = (int)strlen(prefix2);
//
FILE *fp = fopen(fn, "a+");
if (fp == NULL) return -1;
FLOCK(fp);
fseek(fp, 0, SEEK_SET);
// Copy the file specified by fn to tmpfile
FILE *tmpfp = OPENTMPFILE();
if (tmpfp == NULL) {
FUNLOCK(fp);
fclose(fp);
return -1;
}
char *line = malloc(sizeof(char) * (LINELEN+10));
line[idstrlen] = '\0';
if (fread(line, sizeof(char), idstrlen, fp) == idstrlen && strcmp(idstr, line) == 0) {
for(;;) {
line[LINELEN] = '\0';
if (fgets(line, LINELEN, fp) == NULL) break;
if (strncmp(line, prefix2, prefixLen) != 0) fputs(line, tmpfp);
}
}
// Write the contents in the map into tmpfile
uint64_t *keys = ArrayMap_keyArray(thiz);
int s = ArrayMap_size(thiz);
for(int i=0;i<s;i++) {
char *value = ArrayMap_get(thiz, keys[i]);
if (strlen(value) + prefixLen >= LINELEN-10) continue;
fprintf(tmpfp, "%s %" PRIx64 " : %s\n", prefix2, keys[i], value);
}
free(keys);
fseek(fp, 0, SEEK_SET);
FTRUNCATE(fp, 0);
fwrite(idstr, sizeof(char), strlen(idstr), fp);
fseek(tmpfp, 0, SEEK_SET);
for(;;) {
size_t s = fread(line, 1, LINELEN, tmpfp);
if (s == 0) break;
fwrite(line, 1, s, fp);
}
FUNLOCK(fp);
fclose(fp);
CLOSETMPFILE(tmpfp);
free(prefix2);
free(line);
return 0;
}


@@ -0,0 +1,21 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#ifndef __ARRAYMAP_H__
#define __ARRAYMAP_H__
typedef struct ArrayMap ArrayMap;
ArrayMap *initArrayMap();
void ArrayMap_dispose(ArrayMap *thiz);
int ArrayMap_size(ArrayMap *thiz);
void *ArrayMap_remove(ArrayMap *thiz, uint64_t key);
void *ArrayMap_put(ArrayMap *thiz, uint64_t key, void *value);
void *ArrayMap_get(ArrayMap *thiz, uint64_t key);
uint64_t *ArrayMap_keyArray(ArrayMap *thiz);
void **ArrayMap_valueArray(ArrayMap *thiz);
int ArrayMap_save(ArrayMap *thiz, const char *fn, const char *prefix, const char *idstr);
ArrayMap *ArrayMap_load(const char *fn, const char *prefix, const char *idstr, int doLock);
#endif


@@ -0,0 +1,98 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include "misc.h"
#if defined(__MINGW32__) || defined(__MINGW64__) || defined(_MSC_VER)
#include <sys/timeb.h>
EXPORT void *Sleef_malloc(size_t z) { return _aligned_malloc(z, 256); }
EXPORT void Sleef_free(void *ptr) { _aligned_free(ptr); }
EXPORT uint64_t Sleef_currentTimeMicros() {
struct __timeb64 t;
_ftime64(&t);
return t.time * INT64_C(1000000) + t.millitm*1000;
}
#elif defined(__APPLE__)
#include <sys/time.h>
EXPORT void *Sleef_malloc(size_t z) { void *ptr = NULL; posix_memalign(&ptr, 256, z); return ptr; }
EXPORT void Sleef_free(void *ptr) { free(ptr); }
EXPORT uint64_t Sleef_currentTimeMicros() {
struct timeval time;
gettimeofday(&time, NULL);
return (uint64_t)((time.tv_sec * INT64_C(1000000)) + time.tv_usec);
}
#else // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(_MSC_VER)
#include <time.h>
#include <unistd.h>
#if defined(__FreeBSD__) || defined(__OpenBSD__)
#include <stdlib.h>
#else
#include <malloc.h>
#endif
EXPORT void *Sleef_malloc(size_t z) { void *ptr = NULL; posix_memalign(&ptr, 4096, z); return ptr; }
EXPORT void Sleef_free(void *ptr) { free(ptr); }
EXPORT uint64_t Sleef_currentTimeMicros() {
struct timespec tp;
clock_gettime(CLOCK_MONOTONIC, &tp);
return (uint64_t)tp.tv_sec * INT64_C(1000000) + ((uint64_t)tp.tv_nsec/1000);
}
#endif // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(_MSC_VER)
#ifdef _MSC_VER
#include <intrin.h>
EXPORT void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) {
__cpuidex(out, eax, ecx);
}
#else
#if defined(__x86_64__) || defined(__i386__)
EXPORT void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) {
uint32_t a, b, c, d;
__asm__ __volatile__ ("cpuid" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "a" (eax), "c"(ecx));
out[0] = a; out[1] = b; out[2] = c; out[3] = d;
}
#endif
#endif
#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER)
static char x86BrandString[256];
EXPORT char *Sleef_getCpuIdString() {
union {
int32_t info[4];
uint8_t str[16];
} u;
int i,j;
char *p;
p = x86BrandString;
for(i=0;i<3;i++) {
Sleef_x86CpuID(u.info, i + 0x80000002, 0);
for(j=0;j<16;j++) {
*p++ = u.str[j];
}
}
*p++ = '\n';
return x86BrandString;
}
#else
EXPORT char *Sleef_getCpuIdString() {
return "Unknown architecture";
}
#endif
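// Usage sketch (illustrative only): Sleef_malloc returns 256- or
// 4096-byte aligned memory, depending on the platform branch above, and
// must be released with Sleef_free rather than free():
//   double *buf = (double *)Sleef_malloc(n * sizeof(double));
//   /* ... use buf as a vector-aligned buffer ... */
//   Sleef_free(buf);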


@@ -0,0 +1,9 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#ifndef __COMMON_H__
#define __COMMON_H__
char *Sleef_getCpuIdString();
#endif


@@ -0,0 +1,438 @@
// Copyright Naoki Shibata and contributors 2010 - 2023.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA))
typedef struct {
vdouble x, y, z;
} vdouble3;
static INLINE CONST VECTOR_CC vdouble vd3getx_vd_vd3(vdouble3 v) { return v.x; }
static INLINE CONST VECTOR_CC vdouble vd3gety_vd_vd3(vdouble3 v) { return v.y; }
static INLINE CONST VECTOR_CC vdouble vd3getz_vd_vd3(vdouble3 v) { return v.z; }
static INLINE CONST VECTOR_CC vdouble3 vd3setxyz_vd3_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
vdouble3 v = { x, y, z };
return v;
}
static INLINE CONST VECTOR_CC vdouble3 vd3setx_vd3_vd3_vd(vdouble3 v, vdouble d) { v.x = d; return v; }
static INLINE CONST VECTOR_CC vdouble3 vd3sety_vd3_vd3_vd(vdouble3 v, vdouble d) { v.y = d; return v; }
static INLINE CONST VECTOR_CC vdouble3 vd3setz_vd3_vd3_vd(vdouble3 v, vdouble d) { v.z = d; return v; }
//
typedef struct {
vdouble2 a, b;
} dd2;
static dd2 dd2setab_dd2_vd2_vd2(vdouble2 a, vdouble2 b) {
dd2 r = { a, b };
return r;
}
static vdouble2 dd2geta_vd2_dd2(dd2 d) { return d.a; }
static vdouble2 dd2getb_vd2_dd2(dd2 d) { return d.b; }
//
typedef struct {
vmask e;
vdouble3 d3;
} tdx;
static INLINE CONST VECTOR_CC vmask tdxgete_vm_tdx(tdx t) { return t.e; }
static INLINE CONST VECTOR_CC vdouble3 tdxgetd3_vd3_tdx(tdx t) { return t.d3; }
static INLINE CONST VECTOR_CC vdouble tdxgetd3x_vd_tdx(tdx t) { return t.d3.x; }
static INLINE CONST VECTOR_CC vdouble tdxgetd3y_vd_tdx(tdx t) { return t.d3.y; }
static INLINE CONST VECTOR_CC vdouble tdxgetd3z_vd_tdx(tdx t) { return t.d3.z; }
static INLINE CONST VECTOR_CC tdx tdxsete_tdx_tdx_vm(tdx t, vmask e) { t.e = e; return t; }
static INLINE CONST VECTOR_CC tdx tdxsetd3_tdx_tdx_vd3(tdx t, vdouble3 d3) { t.d3 = d3; return t; }
static INLINE CONST VECTOR_CC tdx tdxsetx_tdx_tdx_vd(tdx t, vdouble x) { t.d3.x = x; return t; }
static INLINE CONST VECTOR_CC tdx tdxsety_tdx_tdx_vd(tdx t, vdouble y) { t.d3.y = y; return t; }
static INLINE CONST VECTOR_CC tdx tdxsetz_tdx_tdx_vd(tdx t, vdouble z) { t.d3.z = z; return t; }
static INLINE CONST VECTOR_CC tdx tdxsetxyz_tdx_tdx_vd_vd_vd(tdx t, vdouble x, vdouble y, vdouble z) {
t.d3 = (vdouble3) { x, y, z };
return t;
}
static INLINE CONST VECTOR_CC tdx tdxseted3_tdx_vm_vd3(vmask e, vdouble3 d3) { return (tdx) { e, d3 }; }
static INLINE CONST VECTOR_CC tdx tdxsetexyz_tdx_vm_vd_vd_vd(vmask e, vdouble x, vdouble y, vdouble z) {
return (tdx) { e, (vdouble3) { x, y, z } };
}
static INLINE CONST VECTOR_CC vmask vqgetx_vm_vq(vquad v) { return v.x; }
static INLINE CONST VECTOR_CC vmask vqgety_vm_vq(vquad v) { return v.y; }
static INLINE CONST VECTOR_CC vquad vqsetxy_vq_vm_vm(vmask x, vmask y) { return (vquad) { x, y }; }
static INLINE CONST VECTOR_CC vquad vqsetx_vq_vq_vm(vquad v, vmask x) { v.x = x; return v; }
static INLINE CONST VECTOR_CC vquad vqsety_vq_vq_vm(vquad v, vmask y) { v.y = y; return v; }
//
typedef struct {
vdouble d;
vint i;
} di_t;
static INLINE CONST VECTOR_CC vdouble digetd_vd_di(di_t d) { return d.d; }
static INLINE CONST VECTOR_CC vint digeti_vi_di(di_t d) { return d.i; }
static INLINE CONST VECTOR_CC di_t disetdi_di_vd_vi(vdouble d, vint i) {
di_t r = { d, i };
return r;
}
//
typedef struct {
vdouble2 dd;
vint i;
} ddi_t;
static INLINE CONST VECTOR_CC vdouble2 ddigetdd_vd2_ddi(ddi_t d) { return d.dd; }
static INLINE CONST VECTOR_CC vint ddigeti_vi_ddi(ddi_t d) { return d.i; }
static INLINE CONST VECTOR_CC ddi_t ddisetddi_ddi_vd2_vi(vdouble2 v, vint i) {
ddi_t r = { v, i };
return r;
}
static INLINE CONST VECTOR_CC ddi_t ddisetdd_ddi_ddi_vd2(ddi_t ddi, vdouble2 v) {
ddi.dd = v;
return ddi;
}
//
typedef struct {
vdouble3 td;
vint i;
} tdi_t;
static INLINE CONST VECTOR_CC vdouble3 tdigettd_vd3_tdi(tdi_t d) { return d.td; }
static INLINE CONST VECTOR_CC vdouble tdigetx_vd_tdi(tdi_t d) { return d.td.x; }
static INLINE CONST VECTOR_CC vint tdigeti_vi_tdi(tdi_t d) { return d.i; }
static INLINE CONST VECTOR_CC tdi_t tdisettdi_tdi_vd3_vi(vdouble3 v, vint i) {
tdi_t r = { v, i };
return r;
}
#endif
#if defined(ENABLE_MAIN)
// Functions for debugging
#include <stdio.h>
#include <wchar.h>
static void printvmask(char *mes, vmask g) {
uint64_t u[VECTLENDP];
vstoreu_v_p_vd((double *)u, vreinterpret_vd_vm(g));
printf("%s ", mes);
for(int i=0;i<VECTLENDP;i++) printf("%016lx : ", (unsigned long)u[i]);
printf("\n");
}
#if !defined(ENABLE_SVE)
static void printvopmask(char *mes, vopmask g) {
union {
vopmask g;
uint8_t u[sizeof(vopmask)];
} cnv = { .g = g };
printf("%s ", mes);
for(int i=0;i<sizeof(vopmask);i++) printf("%02x", cnv.u[i]);
printf("\n");
}
#else
static void printvopmask(char *mes, vopmask g) {
vmask m = vand_vm_vo64_vm(g, vcast_vm_i64(-1));
printvmask(mes, m);
}
#endif
static void printvdouble(char *mes, vdouble vd) {
double u[VECTLENDP];
vstoreu_v_p_vd((double *)u, vd);
printf("%s ", mes);
for(int i=0;i<VECTLENDP;i++) printf("%.20g : ", u[i]);
printf("\n");
}
static void printvint(char *mes, vint vi) {
uint32_t u[VECTLENDP];
vstoreu_v_p_vi((int32_t *)u, vi);
printf("%s ", mes);
for(int i=0;i<VECTLENDP;i++) printf("%08x : ", (unsigned)u[i]);
printf("\n");
}
static void printvint64(char *mes, vint64 vi) {
uint64_t u[VECTLENDP*2];
vstoreu_v_p_vd((double *)u, vreinterpret_vd_vm(vreinterpret_vm_vi64(vi)));
printf("%s ", mes);
for(int i=0;i<VECTLENDP;i++) printf("%016lx : ", (unsigned long)u[i]);
printf("\n");
}
static void printvquad(char *mes, vquad g) {
uint64_t u[VECTLENDP*2];
vstoreu_v_p_vd((double *)u, vreinterpret_vd_vm(vqgetx_vm_vq(g)));
vstoreu_v_p_vd((double *)&u[VECTLENDP], vreinterpret_vd_vm(vqgety_vm_vq(g)));
printf("%s ", mes);
for(int i=0;i<VECTLENDP*2;i++) printf("%016lx : ", (unsigned long)(u[i]));
printf("\n");
}
#endif // #if defined(ENABLE_MAIN)
///////////////////////////////////////////////////////////////////////////////////
// vdouble functions
static INLINE CONST VECTOR_CC vopmask visnegzero_vo_vd(vdouble d) {
return veq64_vo_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0)));
}
static INLINE CONST VECTOR_CC vopmask visnumber_vo_vd(vdouble x) {
return vandnot_vo_vo_vo(visinf_vo_vd(x), veq_vo_vd_vd(x, x));
}
static INLINE CONST vopmask visnonfinite_vo_vd(vdouble x) {
return veq64_vo_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vd(x), vcast_vm_i64(INT64_C(0x7ff0000000000000))), vcast_vm_i64(INT64_C(0x7ff0000000000000)));
}
static INLINE CONST vmask vsignbit_vm_vd(vdouble d) {
return vand_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0)));
}
static INLINE CONST vopmask vsignbit_vo_vd(vdouble d) {
return veq64_vo_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vcast_vd_d(-0.0)));
}
static INLINE CONST vdouble vclearlsb_vd_vd_i(vdouble d, int n) {
return vreinterpret_vd_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vcast_vm_u64((~UINT64_C(0)) << n)));
}
static INLINE CONST VECTOR_CC vdouble vtoward0_vd_vd(vdouble x) { // returns nextafter(x, 0)
vdouble t = vreinterpret_vd_vm(vadd64_vm_vm_vm(vreinterpret_vm_vd(x), vcast_vm_i64(-1)));
return vsel_vd_vo_vd_vd(veq_vo_vd_vd(x, vcast_vd_d(0)), vcast_vd_d(0), t);
}
#if !(defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA))
static INLINE CONST vdouble vmulsign_vd_vd_vd(vdouble x, vdouble y) {
return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(x), vsignbit_vm_vd(y)));
}
#endif
static INLINE CONST VECTOR_CC vdouble vsign_vd_vd(vdouble d) {
return vmulsign_vd_vd_vd(vcast_vd_d(1.0), d);
}
#if !(defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA))
static INLINE CONST VECTOR_CC vdouble vorsign_vd_vd_vd(vdouble x, vdouble y) {
return vreinterpret_vd_vm(vor_vm_vm_vm(vreinterpret_vm_vd(x), vsignbit_vm_vd(y)));
}
static INLINE CONST VECTOR_CC vdouble vcopysign_vd_vd_vd(vdouble x, vdouble y) {
return vreinterpret_vd_vm(vxor_vm_vm_vm(vandnot_vm_vm_vm(vreinterpret_vm_vd(vcast_vd_d(-0.0)), vreinterpret_vm_vd(x)),
vand_vm_vm_vm (vreinterpret_vm_vd(vcast_vd_d(-0.0)), vreinterpret_vm_vd(y))));
}
#endif
static INLINE CONST VECTOR_CC vdouble vtruncate2_vd_vd(vdouble x) {
#ifdef FULL_FP_ROUNDING
return vtruncate_vd_vd(x);
#else
vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52))), x, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x));
#endif
}
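// Without FULL_FP_ROUNDING, truncation is emulated with the 32-bit
// double-to-int conversion: the value is split around 2^31, the
// fractional part fr is removed via vtruncate_vi_vd, values that are
// already integral (|x| >= 2^52) or infinite pass through unchanged, and
// the final copysign preserves the sign of zero. The floor/ceil/round
// variants below use the same splitting trick.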
static INLINE CONST VECTOR_CC vdouble vfloor2_vd_vd(vdouble x) {
vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
fr = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(fr, vcast_vd_d(0)), vadd_vd_vd_vd(fr, vcast_vd_d(1.0)), fr);
return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52))), x, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x));
}
static INLINE CONST VECTOR_CC vdouble vceil2_vd_vd(vdouble x) {
vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
fr = vsel_vd_vo_vd_vd(vle_vo_vd_vd(fr, vcast_vd_d(0)), fr, vsub_vd_vd_vd(fr, vcast_vd_d(1.0)));
return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52))), x, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x));
}
static INLINE CONST VECTOR_CC vdouble vround2_vd_vd(vdouble d) {
vdouble x = vadd_vd_vd_vd(d, vcast_vd_d(0.5));
vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
x = vsel_vd_vo_vd_vd(vand_vo_vo_vo(vle_vo_vd_vd(x, vcast_vd_d(0)), veq_vo_vd_vd(fr, vcast_vd_d(0))), vsub_vd_vd_vd(x, vcast_vd_d(1.0)), x);
fr = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(fr, vcast_vd_d(0)), vadd_vd_vd_vd(fr, vcast_vd_d(1.0)), fr);
x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0.49999999999999994449)), vcast_vd_d(0), x);
return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(d), vge_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(INT64_C(1) << 52))), d, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), d));
}
static INLINE CONST VECTOR_CC vdouble vrint2_vd_vd(vdouble d) {
#ifdef FULL_FP_ROUNDING
return vrint_vd_vd(d);
#else
vdouble c = vmulsign_vd_vd_vd(vcast_vd_d(INT64_C(1) << 52), d);
return vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(INT64_C(1) << 52)),
d, vorsign_vd_vd_vd(vsub_vd_vd_vd(vadd_vd_vd_vd(d, c), c), d));
#endif
}
static INLINE CONST VECTOR_CC vopmask visint_vo_vd(vdouble d) {
return veq_vo_vd_vd(vrint2_vd_vd(d), d);
}
static INLINE CONST VECTOR_CC vopmask visodd_vo_vd(vdouble d) {
vdouble x = vmul_vd_vd_vd(d, vcast_vd_d(0.5));
return vneq_vo_vd_vd(vrint2_vd_vd(x), x);
}
// ilogb
#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
static INLINE CONST VECTOR_CC vint vilogbk_vi_vd(vdouble d) {
vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(4.9090934652977266E-91));
d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(vcast_vd_d(2.037035976334486E90), d), d);
vint q = vcastu_vi_vm(vreinterpret_vm_vd(d));
q = vand_vi_vi_vi(q, vcast_vi_i((int)(((1U << 12) - 1) << 20)));
q = vsrl_vi_vi_i(q, 20);
q = vsub_vi_vi_vi(q, vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vcast_vi_i(300 + 0x3ff), vcast_vi_i(0x3ff)));
return q;
}
static INLINE CONST VECTOR_CC vint vilogb2k_vi_vd(vdouble d) {
vint q = vcastu_vi_vm(vreinterpret_vm_vd(d));
q = vsrl_vi_vi_i(q, 20);
q = vand_vi_vi_vi(q, vcast_vi_i(0x7ff));
q = vsub_vi_vi_vi(q, vcast_vi_i(0x3ff));
return q;
}
#endif
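// The ilogb helpers above read the biased exponent directly from the
// upper bits (shift by 20 within the high word, mask with 0x7ff, unbias
// by 0x3ff); vilogbk additionally rescales inputs below 2^-300 by 2^300
// first, so subnormal values report their true exponent (hence the
// 300 + 0x3ff correction). The vmask variants below follow the same
// scheme on 64-bit lanes.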
static INLINE CONST vmask vilogb2k_vm_vd(vdouble d) {
vmask m = vreinterpret_vm_vd(d);
m = vsrl64_vm_vm_i(m, 20 + 32);
m = vand_vm_vm_vm(m, vcast_vm_i64(0x7ff));
m = vsub64_vm_vm_vm(m, vcast_vm_i64(0x3ff));
return m;
}
static INLINE CONST vmask vilogb3k_vm_vd(vdouble d) {
vmask m = vreinterpret_vm_vd(d);
m = vsrl64_vm_vm_i(m, 20 + 32);
m = vand_vm_vm_vm(m, vcast_vm_i64(0x7ff));
return m;
}
// ldexp
static INLINE CONST VECTOR_CC vdouble vpow2i_vd_vi(vint q) {
q = vadd_vi_vi_vi(vcast_vi_i(0x3ff), q);
vmask r = vcastu_vm_vi(vsll_vi_vi_i(q, 20));
return vreinterpret_vd_vm(r);
}
static INLINE CONST VECTOR_CC vdouble vpow2i_vd_vm(vmask q) {
q = vadd64_vm_vm_vm(vcast_vm_i64(0x3ff), q);
return vreinterpret_vd_vm(vsll64_vm_vm_i(q, 52));
}
static INLINE CONST VECTOR_CC vdouble vldexp_vd_vd_vi(vdouble x, vint q) {
vint m = vsra_vi_vi_i(q, 31);
m = vsll_vi_vi_i(vsub_vi_vi_vi(vsra_vi_vi_i(vadd_vi_vi_vi(m, q), 9), m), 7);
q = vsub_vi_vi_vi(q, vsll_vi_vi_i(m, 2));
m = vadd_vi_vi_vi(vcast_vi_i(0x3ff), m);
m = vandnot_vi_vo_vi(vgt_vo_vi_vi(vcast_vi_i(0), m), m);
m = vsel_vi_vo_vi_vi(vgt_vo_vi_vi(m, vcast_vi_i(0x7ff)), vcast_vi_i(0x7ff), m);
vmask r = vcastu_vm_vi(vsll_vi_vi_i(m, 20));
vdouble y = vreinterpret_vd_vm(r);
return vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(x, y), y), y), y), vpow2i_vd_vi(q));
}
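// vldexp multiplies by 2^q without overflowing the exponent field of any
// intermediate factor: q is split into four equal chunks (applied via the
// factor y) plus a remainder handled by vpow2i, so every partial power of
// two stays representable even for extreme q.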
static INLINE CONST VECTOR_CC vdouble vldexp2_vd_vd_vi(vdouble d, vint e) {
return vmul_vd_vd_vd(vmul_vd_vd_vd(d, vpow2i_vd_vi(vsra_vi_vi_i(e, 1))), vpow2i_vd_vi(vsub_vi_vi_vi(e, vsra_vi_vi_i(e, 1))));
}
static INLINE CONST VECTOR_CC vdouble vldexp3_vd_vd_vi(vdouble d, vint q) {
return vreinterpret_vd_vm(vadd64_vm_vm_vm(vreinterpret_vm_vd(d), vcastu_vm_vi(vsll_vi_vi_i(q, 20))));
}
static INLINE CONST vdouble vldexp1_vd_vd_vm(vdouble d, vmask e) {
vmask m = vsrl64_vm_vm_i(e, 2);
e = vsub64_vm_vm_vm(vsub64_vm_vm_vm(vsub64_vm_vm_vm(e, m), m), m);
d = vmul_vd_vd_vd(d, vpow2i_vd_vm(m));
d = vmul_vd_vd_vd(d, vpow2i_vd_vm(m));
d = vmul_vd_vd_vd(d, vpow2i_vd_vm(m));
d = vmul_vd_vd_vd(d, vpow2i_vd_vm(e));
return d;
}
static INLINE CONST vdouble vldexp2_vd_vd_vm(vdouble d, vmask e) {
return vmul_vd_vd_vd(vmul_vd_vd_vd(d, vpow2i_vd_vm(vsrl64_vm_vm_i(e, 1))), vpow2i_vd_vm(vsub64_vm_vm_vm(e, vsrl64_vm_vm_i(e, 1))));
}
static INLINE CONST vdouble vldexp3_vd_vd_vm(vdouble d, vmask q) {
return vreinterpret_vd_vm(vadd64_vm_vm_vm(vreinterpret_vm_vd(d), vsll64_vm_vm_i(q, 52)));
}
// vmask functions
static INLINE CONST vdouble vcast_vd_vm(vmask m) { return vcast_vd_vi(vcast_vi_vm(m)); } // 32 bit only
static INLINE CONST vmask vtruncate_vm_vd(vdouble d) { return vcast_vm_vi(vtruncate_vi_vd(d)); }
static INLINE CONST vopmask vlt64_vo_vm_vm(vmask x, vmask y) { return vgt64_vo_vm_vm(y, x); }
static INLINE CONST vopmask vnot_vo64_vo64(vopmask x) {
return vxor_vo_vo_vo(x, veq64_vo_vm_vm(vcast_vm_i64(0), vcast_vm_i64(0)));
}
static INLINE CONST vopmask vugt64_vo_vm_vm(vmask x, vmask y) { // unsigned compare
x = vxor_vm_vm_vm(vcast_vm_u64(UINT64_C(0x8000000000000000)), x);
y = vxor_vm_vm_vm(vcast_vm_u64(UINT64_C(0x8000000000000000)), y);
return vgt64_vo_vm_vm(x, y);
}
static INLINE CONST vmask vilogbk_vm_vd(vdouble d) {
vopmask o = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(4.9090934652977266E-91));
d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(vcast_vd_d(2.037035976334486E90), d), d);
vmask q = vreinterpret_vm_vd(d);
q = vsrl64_vm_vm_i(q, 20 + 32);
q = vand_vm_vm_vm(q, vcast_vm_i64(0x7ff));
q = vsub64_vm_vm_vm(q, vsel_vm_vo64_vm_vm(o, vcast_vm_i64(300 + 0x3ff), vcast_vm_i64(0x3ff)));
return q;
}
// vquad functions
static INLINE CONST vquad sel_vq_vo_vq_vq(vopmask o, vquad x, vquad y) {
return vqsetxy_vq_vm_vm(vsel_vm_vo64_vm_vm(o, vqgetx_vm_vq(x), vqgetx_vm_vq(y)), vsel_vm_vo64_vm_vm(o, vqgety_vm_vq(x), vqgety_vm_vq(y)));
}
static INLINE CONST vquad add128_vq_vq_vq(vquad x, vquad y) {
vquad r = vqsetxy_vq_vm_vm(vadd64_vm_vm_vm(vqgetx_vm_vq(x), vqgetx_vm_vq(y)), vadd64_vm_vm_vm(vqgety_vm_vq(x), vqgety_vm_vq(y)));
r = vqsety_vq_vq_vm(r, vadd64_vm_vm_vm(vqgety_vm_vq(r), vand_vm_vo64_vm(vugt64_vo_vm_vm(vqgetx_vm_vq(x), vqgetx_vm_vq(r)), vcast_vm_i64(1))));
return r;
}
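// 128-bit addition from two 64-bit lanes: the carry out of the low word
// is detected with the unsigned wrap-around test (x.low > result.low) and
// added into the high word.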
static INLINE CONST vquad imdvq_vq_vm_vm(vmask x, vmask y) { vquad r = vqsetxy_vq_vm_vm(x, y); return r; }
// imm must be smaller than 64
#define srl128_vq_vq_i(m, imm) \
imdvq_vq_vm_vm(vor_vm_vm_vm(vsrl64_vm_vm_i(vqgetx_vm_vq(m), imm), vsll64_vm_vm_i(vqgety_vm_vq(m), 64-imm)), vsrl64_vm_vm_i(vqgety_vm_vq(m), imm))
// This function is equivalent to:
// di_t ret = { x - rint(4 * x) * 0.25, (int32_t)(rint(4 * x) - rint(x) * 4) };
static INLINE CONST di_t rempisub(vdouble x) {
#ifdef FULL_FP_ROUNDING
vdouble y = vrint_vd_vd(vmul_vd_vd_vd(x, vcast_vd_d(4)));
vint vi = vtruncate_vi_vd(vsub_vd_vd_vd(y, vmul_vd_vd_vd(vrint_vd_vd(x), vcast_vd_d(4))));
return disetdi_di_vd_vi(vsub_vd_vd_vd(x, vmul_vd_vd_vd(y, vcast_vd_d(0.25))), vi);
#else
vdouble c = vmulsign_vd_vd_vd(vcast_vd_d(INT64_C(1) << 52), x);
vdouble rint4x = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(vmul_vd_vd_vd(vcast_vd_d(4), x)), vcast_vd_d(INT64_C(1) << 52)),
vmul_vd_vd_vd(vcast_vd_d(4), x),
vorsign_vd_vd_vd(vsub_vd_vd_vd(vmla_vd_vd_vd_vd(vcast_vd_d(4), x, c), c), x));
vdouble rintx = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52)),
x, vorsign_vd_vd_vd(vsub_vd_vd_vd(vadd_vd_vd_vd(x, c), c), x));
return disetdi_di_vd_vi(vmla_vd_vd_vd_vd(vcast_vd_d(-0.25), rint4x, x),
vtruncate_vi_vd(vmla_vd_vd_vd_vd(vcast_vd_d(-4), rintx, rint4x)));
#endif
}


@@ -0,0 +1,324 @@
// Copyright Naoki Shibata and contributors 2010 - 2024.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA))
#if !defined(SLEEF_ENABLE_CUDA)
typedef struct {
vdouble x, y;
} vdouble2;
#else
typedef double2 vdouble2;
#endif
static INLINE CONST VECTOR_CC vdouble vd2getx_vd_vd2(vdouble2 v) { return v.x; }
static INLINE CONST VECTOR_CC vdouble vd2gety_vd_vd2(vdouble2 v) { return v.y; }
static INLINE CONST VECTOR_CC vdouble2 vd2setxy_vd2_vd_vd(vdouble x, vdouble y) { vdouble2 v; v.x = x; v.y = y; return v; }
static INLINE CONST VECTOR_CC vdouble2 vd2setx_vd2_vd2_vd(vdouble2 v, vdouble d) { v.x = d; return v; }
static INLINE CONST VECTOR_CC vdouble2 vd2sety_vd2_vd2_vd(vdouble2 v, vdouble d) { v.y = d; return v; }
#endif
#if !defined(SLEEF_ENABLE_CUDA)
typedef struct {
double x, y;
} double2;
#endif
static INLINE CONST VECTOR_CC double2 dd(double h, double l) {
double2 ret = { h, l };
return ret;
}
static INLINE CONST VECTOR_CC vdouble vupper_vd_vd(vdouble d) {
return vreinterpret_vd_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vcast_vm_i_i(0xffffffff, 0xf8000000)));
}
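// Dekker-style splitting: clearing the low 27 significand bits leaves a
// "head" with at most 26 significant bits, so the product of two heads is
// exact in double precision. The non-FMA double-double routines later in
// this file rely on this to recover the rounding error of each
// multiplication.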
static INLINE CONST VECTOR_CC vdouble2 vcast_vd2_vd_vd(vdouble h, vdouble l) {
return vd2setxy_vd2_vd_vd(h, l);
}
static INLINE CONST VECTOR_CC vdouble2 vcast_vd2_d_d(double h, double l) {
return vd2setxy_vd2_vd_vd(vcast_vd_d(h), vcast_vd_d(l));
}
static INLINE CONST VECTOR_CC vdouble2 vcast_vd2_d2(double2 dd) {
return vd2setxy_vd2_vd_vd(vcast_vd_d(dd.x), vcast_vd_d(dd.y));
}
static INLINE CONST VECTOR_CC vdouble2 vsel_vd2_vo_vd2_vd2(vopmask m, vdouble2 x, vdouble2 y) {
return vd2setxy_vd2_vd_vd(vsel_vd_vo_vd_vd(m, vd2getx_vd_vd2(x), vd2getx_vd_vd2(y)),
vsel_vd_vo_vd_vd(m, vd2gety_vd_vd2(x), vd2gety_vd_vd2(y)));
}
static INLINE CONST VECTOR_CC vdouble2 vsel_vd2_vo_d_d_d_d(vopmask o, double x1, double y1, double x0, double y0) {
return vd2setxy_vd2_vd_vd(vsel_vd_vo_d_d(o, x1, x0),
vsel_vd_vo_d_d(o, y1, y0));
}
static INLINE CONST VECTOR_CC vdouble vadd_vd_3vd(vdouble v0, vdouble v1, vdouble v2) {
return vadd_vd_vd_vd(vadd_vd_vd_vd(v0, v1), v2);
}
static INLINE CONST VECTOR_CC vdouble vadd_vd_4vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
return vadd_vd_3vd(vadd_vd_vd_vd(v0, v1), v2, v3);
}
static INLINE CONST VECTOR_CC vdouble vadd_vd_5vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
return vadd_vd_4vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4);
}
static INLINE CONST VECTOR_CC vdouble vadd_vd_6vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) {
return vadd_vd_5vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4, v5);
}
static INLINE CONST VECTOR_CC vdouble vadd_vd_7vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5, vdouble v6) {
return vadd_vd_6vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4, v5, v6);
}
static INLINE CONST VECTOR_CC vdouble vsub_vd_3vd(vdouble v0, vdouble v1, vdouble v2) {
return vsub_vd_vd_vd(vsub_vd_vd_vd(v0, v1), v2);
}
static INLINE CONST VECTOR_CC vdouble vsub_vd_4vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
return vsub_vd_3vd(vsub_vd_vd_vd(v0, v1), v2, v3);
}
static INLINE CONST VECTOR_CC vdouble vsub_vd_5vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
return vsub_vd_4vd(vsub_vd_vd_vd(v0, v1), v2, v3, v4);
}
static INLINE CONST VECTOR_CC vdouble vsub_vd_6vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) {
return vsub_vd_5vd(vsub_vd_vd_vd(v0, v1), v2, v3, v4, v5);
}
//
static INLINE CONST VECTOR_CC vdouble2 ddneg_vd2_vd2(vdouble2 x) {
return vcast_vd2_vd_vd(vneg_vd_vd(vd2getx_vd_vd2(x)), vneg_vd_vd(vd2gety_vd_vd2(x)));
}
static INLINE CONST VECTOR_CC vdouble2 ddabs_vd2_vd2(vdouble2 x) {
return vcast_vd2_vd_vd(vabs_vd_vd(vd2getx_vd_vd2(x)),
vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(vd2gety_vd_vd2(x)),
vand_vm_vm_vm(vreinterpret_vm_vd(vd2getx_vd_vd2(x)),
vreinterpret_vm_vd(vcast_vd_d(-0.0))))));
}
static INLINE CONST VECTOR_CC vdouble2 ddnormalize_vd2_vd2(vdouble2 t) {
vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(t), vd2gety_vd_vd2(t));
return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(t), s), vd2gety_vd_vd2(t)));
}
static INLINE CONST VECTOR_CC vdouble2 ddscale_vd2_vd2_vd(vdouble2 d, vdouble s) {
return vd2setxy_vd2_vd_vd(vmul_vd_vd_vd(vd2getx_vd_vd2(d), s), vmul_vd_vd_vd(vd2gety_vd_vd2(d), s));
}
static INLINE CONST VECTOR_CC vdouble2 ddscale_vd2_vd2_d(vdouble2 d, double s) { return ddscale_vd2_vd2_vd(d, vcast_vd_d(s)); }
static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd_vd(vdouble x, vdouble y) {
vdouble s = vadd_vd_vd_vd(x, y);
return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(x, s), y));
}
static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd_vd(vdouble x, vdouble y) {
vdouble s = vadd_vd_vd_vd(x, y);
vdouble v = vsub_vd_vd_vd(s, x);
return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(x, vsub_vd_vd_vd(s, v)), vsub_vd_vd_vd(y, v)));
}
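// ddadd_* are Fast2Sum variants and assume |x| >= |y| (see the comments
// further down); ddadd2_* use the branch-free 2Sum, which is correct for
// any ordering at the cost of a few extra operations. A scalar sketch
// (illustrative only):
//   s = x + y; e = (x - s) + y;          // Fast2Sum, requires |x| >= |y|
//   s = x + y; v = s - x;
//   e = (x - (s - v)) + (y - v);         // 2Sum, any ordering
// The pair (s, e) then represents the sum exactly, e being the rounding
// error of s.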
static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd2_vd(vdouble2 x, vdouble y) {
vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), y);
return vd2setxy_vd2_vd_vd(s, vadd_vd_3vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), s), y, vd2gety_vd_vd2(x)));
}
static INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd2_vd(vdouble2 x, vdouble y) {
vdouble s = vsub_vd_vd_vd(vd2getx_vd_vd2(x), y);
return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), s), y), vd2gety_vd_vd2(x)));
}
static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd2_vd(vdouble2 x, vdouble y) {
vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), y);
vdouble v = vsub_vd_vd_vd(s, vd2getx_vd_vd2(x));
vdouble w = vadd_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), vsub_vd_vd_vd(s, v)), vsub_vd_vd_vd(y, v));
return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(w, vd2gety_vd_vd2(x)));
}
static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd_vd2(vdouble x, vdouble2 y) {
vdouble s = vadd_vd_vd_vd(x, vd2getx_vd_vd2(y));
return vd2setxy_vd2_vd_vd(s, vadd_vd_3vd(vsub_vd_vd_vd(x, s), vd2getx_vd_vd2(y), vd2gety_vd_vd2(y)));
}
static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd_vd2(vdouble x, vdouble2 y) {
vdouble s = vadd_vd_vd_vd(x, vd2getx_vd_vd2(y));
vdouble v = vsub_vd_vd_vd(s, x);
return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vadd_vd_vd_vd(vsub_vd_vd_vd(x, vsub_vd_vd_vd(s, v)),
vsub_vd_vd_vd(vd2getx_vd_vd2(y), v)), vd2gety_vd_vd2(y)));
}
static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
// |x| >= |y|
vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
return vd2setxy_vd2_vd_vd(s, vadd_vd_4vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), s), vd2getx_vd_vd2(y), vd2gety_vd_vd2(x), vd2gety_vd_vd2(y)));
}
static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
vdouble v = vsub_vd_vd_vd(s, vd2getx_vd_vd2(x));
vdouble t = vadd_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), vsub_vd_vd_vd(s, v)), vsub_vd_vd_vd(vd2getx_vd_vd2(y), v));
return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(t, vadd_vd_vd_vd(vd2gety_vd_vd2(x), vd2gety_vd_vd2(y))));
}
static INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd_vd(vdouble x, vdouble y) {
// |x| >= |y|
vdouble s = vsub_vd_vd_vd(x, y);
return vd2setxy_vd2_vd_vd(s, vsub_vd_vd_vd(vsub_vd_vd_vd(x, s), y));
}
static INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
// |x| >= |y|
vdouble s = vsub_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
vdouble t = vsub_vd_vd_vd(vd2getx_vd_vd2(x), s);
t = vsub_vd_vd_vd(t, vd2getx_vd_vd2(y));
t = vadd_vd_vd_vd(t, vd2gety_vd_vd2(x));
return vd2setxy_vd2_vd_vd(s, vsub_vd_vd_vd(t, vd2gety_vd_vd2(y)));
}
#ifdef ENABLE_FMA_DP
static INLINE CONST VECTOR_CC vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) {
vdouble t = vrec_vd_vd(vd2getx_vd_vd2(d));
vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(n), t);
vdouble u = vfmapn_vd_vd_vd_vd(t, vd2getx_vd_vd2(n), s);
vdouble v = vfmanp_vd_vd_vd_vd(vd2gety_vd_vd2(d), t, vfmanp_vd_vd_vd_vd(vd2getx_vd_vd2(d), t, vcast_vd_d(1)));
return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(s, v, vfma_vd_vd_vd_vd(vd2gety_vd_vd2(n), t, u)));
}
static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd_vd(vdouble x, vdouble y) {
vdouble s = vmul_vd_vd_vd(x, y);
return vd2setxy_vd2_vd_vd(s, vfmapn_vd_vd_vd_vd(x, y, s));
}
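// With FMA, the rounding error of a product is exact in one instruction:
// vfmapn_vd_vd_vd_vd(x, y, s) evaluates x*y - s with a single rounding,
// which for s = round(x*y) is precisely the error term. A scalar sketch
// (illustrative only):
//   double s = x * y;
//   double e = fma(x, y, -s); // s + e == x*y exactly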
static INLINE CONST VECTOR_CC vdouble2 ddsqu_vd2_vd2(vdouble2 x) {
vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x));
return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x)), vd2gety_vd_vd2(x), vfmapn_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x), s)));
}
static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(y), vfma_vd_vd_vd_vd(vd2gety_vd_vd2(x), vd2getx_vd_vd2(y), vfmapn_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y), s))));
}
static INLINE CONST VECTOR_CC vdouble ddmul_vd_vd2_vd2(vdouble2 x, vdouble2 y) {
return vfma_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y), vfma_vd_vd_vd_vd(vd2gety_vd_vd2(x), vd2getx_vd_vd2(y), vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(y))));
}
static INLINE CONST VECTOR_CC vdouble ddsqu_vd_vd2(vdouble2 x) {
return vfma_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x), vadd_vd_vd_vd(vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x)), vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x))));
}
static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd(vdouble2 x, vdouble y) {
vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), y);
return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(vd2gety_vd_vd2(x), y, vfmapn_vd_vd_vd_vd(vd2getx_vd_vd2(x), y, s)));
}
static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd(vdouble d) {
vdouble s = vrec_vd_vd(d);
return vd2setxy_vd2_vd_vd(s, vmul_vd_vd_vd(s, vfmanp_vd_vd_vd_vd(d, s, vcast_vd_d(1))));
}
static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd2(vdouble2 d) {
vdouble s = vrec_vd_vd(vd2getx_vd_vd2(d));
return vd2setxy_vd2_vd_vd(s, vmul_vd_vd_vd(s, vfmanp_vd_vd_vd_vd(vd2gety_vd_vd2(d), s, vfmanp_vd_vd_vd_vd(vd2getx_vd_vd2(d), s, vcast_vd_d(1)))));
}
#else // #ifdef ENABLE_FMA_DP
static INLINE CONST VECTOR_CC vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) {
vdouble t = vrec_vd_vd(vd2getx_vd_vd2(d));
vdouble dh = vupper_vd_vd(vd2getx_vd_vd2(d)), dl = vsub_vd_vd_vd(vd2getx_vd_vd2(d), dh);
vdouble th = vupper_vd_vd(t ), tl = vsub_vd_vd_vd(t , th);
vdouble nhh = vupper_vd_vd(vd2getx_vd_vd2(n)), nhl = vsub_vd_vd_vd(vd2getx_vd_vd2(n), nhh);
vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(n), t);
vdouble u = vadd_vd_5vd(vsub_vd_vd_vd(vmul_vd_vd_vd(nhh, th), s), vmul_vd_vd_vd(nhh, tl), vmul_vd_vd_vd(nhl, th), vmul_vd_vd_vd(nhl, tl),
vmul_vd_vd_vd(s, vsub_vd_5vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th), vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th), vmul_vd_vd_vd(dl, tl))));
return vd2setxy_vd2_vd_vd(s, vmla_vd_vd_vd_vd(t, vsub_vd_vd_vd(vd2gety_vd_vd2(n), vmul_vd_vd_vd(s, vd2gety_vd_vd2(d))), u));
}
static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd_vd(vdouble x, vdouble y) {
vdouble xh = vupper_vd_vd(x), xl = vsub_vd_vd_vd(x, xh);
vdouble yh = vupper_vd_vd(y), yl = vsub_vd_vd_vd(y, yh);
vdouble s = vmul_vd_vd_vd(x, y);
return vd2setxy_vd2_vd_vd(s, vadd_vd_5vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(s), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl)));
}
static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd(vdouble2 x, vdouble y) {
vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);
vdouble yh = vupper_vd_vd(y ), yl = vsub_vd_vd_vd(y , yh);
vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), y);
return vd2setxy_vd2_vd_vd(s, vadd_vd_6vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(s), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl), vmul_vd_vd_vd(vd2gety_vd_vd2(x), y)));
}
static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);
vdouble yh = vupper_vd_vd(vd2getx_vd_vd2(y)), yl = vsub_vd_vd_vd(vd2getx_vd_vd2(y), yh);
vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
return vd2setxy_vd2_vd_vd(s, vadd_vd_7vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(s), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl), vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(y)), vmul_vd_vd_vd(vd2gety_vd_vd2(x), vd2getx_vd_vd2(y))));
}
static INLINE CONST VECTOR_CC vdouble ddmul_vd_vd2_vd2(vdouble2 x, vdouble2 y) {
vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);
vdouble yh = vupper_vd_vd(vd2getx_vd_vd2(y)), yl = vsub_vd_vd_vd(vd2getx_vd_vd2(y), yh);
return vadd_vd_6vd(vmul_vd_vd_vd(vd2gety_vd_vd2(x), yh), vmul_vd_vd_vd(xh, vd2gety_vd_vd2(y)), vmul_vd_vd_vd(xl, yl), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yh));
}
static INLINE CONST VECTOR_CC vdouble2 ddsqu_vd2_vd2(vdouble2 x) {
vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);
vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x));
return vd2setxy_vd2_vd_vd(s, vadd_vd_5vd(vmul_vd_vd_vd(xh, xh), vneg_vd_vd(s), vmul_vd_vd_vd(vadd_vd_vd_vd(xh, xh), xl), vmul_vd_vd_vd(xl, xl), vmul_vd_vd_vd(vd2getx_vd_vd2(x), vadd_vd_vd_vd(vd2gety_vd_vd2(x), vd2gety_vd_vd2(x)))));
}
static INLINE CONST VECTOR_CC vdouble ddsqu_vd_vd2(vdouble2 x) {
vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);
return vadd_vd_5vd(vmul_vd_vd_vd(xh, vd2gety_vd_vd2(x)), vmul_vd_vd_vd(xh, vd2gety_vd_vd2(x)), vmul_vd_vd_vd(xl, xl), vadd_vd_vd_vd(vmul_vd_vd_vd(xh, xl), vmul_vd_vd_vd(xh, xl)), vmul_vd_vd_vd(xh, xh));
}
static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd(vdouble d) {
vdouble t = vrec_vd_vd(d);
vdouble dh = vupper_vd_vd(d), dl = vsub_vd_vd_vd(d, dh);
vdouble th = vupper_vd_vd(t), tl = vsub_vd_vd_vd(t, th);
return vd2setxy_vd2_vd_vd(t, vmul_vd_vd_vd(t, vsub_vd_5vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th), vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th), vmul_vd_vd_vd(dl, tl))));
}
static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd2(vdouble2 d) {
vdouble t = vrec_vd_vd(vd2getx_vd_vd2(d));
vdouble dh = vupper_vd_vd(vd2getx_vd_vd2(d)), dl = vsub_vd_vd_vd(vd2getx_vd_vd2(d), dh);
vdouble th = vupper_vd_vd(t ), tl = vsub_vd_vd_vd(t , th);
return vd2setxy_vd2_vd_vd(t, vmul_vd_vd_vd(t, vsub_vd_6vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th), vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th), vmul_vd_vd_vd(dl, tl), vmul_vd_vd_vd(vd2gety_vd_vd2(d), t))));
}
#endif // #ifdef ENABLE_FMA_DP
static INLINE CONST VECTOR_CC vdouble2 ddsqrt_vd2_vd2(vdouble2 d) {
vdouble t = vsqrt_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)));
return ddscale_vd2_vd2_vd(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd2(d, ddmul_vd2_vd_vd(t, t)), ddrec_vd2_vd(t)), vcast_vd_d(0.5));
}
static INLINE CONST VECTOR_CC vdouble2 ddsqrt_vd2_vd(vdouble d) {
vdouble t = vsqrt_vd_vd(d);
return ddscale_vd2_vd2_vd(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd_vd2(d, ddmul_vd2_vd_vd(t, t)), ddrec_vd2_vd(t)), vcast_vd_d(0.5));
}
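// Editor's note (not part of the upstream file): both ddsqrt variants above
// refine t = sqrt(d) with one Heron step carried out in double-double
// arithmetic: sqrt(d) ~= 0.5 * (d + t*t) / t, the average of t and d/t,
// which roughly doubles the number of correct bits in the estimate t.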
static INLINE CONST VECTOR_CC vdouble2 ddmla_vd2_vd2_vd2_vd2(vdouble2 x, vdouble2 y, vdouble2 z) {
return ddadd_vd2_vd2_vd2(z, ddmul_vd2_vd2_vd2(x, y));
}
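// Editor's note (illustrative addition): the ddadd/ddadd2 pairs above are
// vector forms of the classic error-free transforms. A scalar sketch, with
// hypothetical names:
//
//   static void fast2sum(double x, double y, double *hi, double *lo) {
//     double s = x + y;                    // requires |x| >= |y|
//     *hi = s; *lo = (x - s) + y;          // exact rounding error of the sum
//   }
//   static void twosum(double x, double y, double *hi, double *lo) {
//     double s = x + y;                    // no magnitude precondition
//     double v = s - x;
//     *hi = s; *lo = (x - (s - v)) + (y - v);
//   }
//
// ddadd_* (with the |x| >= |y| comment) is Fast2Sum; ddadd2_* is the
// branch-free Knuth TwoSum, safe for operands in any order.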

View File

@@ -0,0 +1,369 @@
// Copyright Naoki Shibata and contributors 2010 - 2024.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA))
#if !defined(SLEEF_ENABLE_CUDA)
typedef struct {
vfloat x, y;
} vfloat2;
#else
typedef float2 vfloat2;
#endif
static INLINE CONST VECTOR_CC vfloat vf2getx_vf_vf2(vfloat2 v) { return v.x; }
static INLINE CONST VECTOR_CC vfloat vf2gety_vf_vf2(vfloat2 v) { return v.y; }
static INLINE CONST VECTOR_CC vfloat2 vf2setxy_vf2_vf_vf(vfloat x, vfloat y) { vfloat2 v; v.x = x; v.y = y; return v; }
static INLINE CONST VECTOR_CC vfloat2 vf2setx_vf2_vf2_vf(vfloat2 v, vfloat d) { v.x = d; return v; }
static INLINE CONST VECTOR_CC vfloat2 vf2sety_vf2_vf2_vf(vfloat2 v, vfloat d) { v.y = d; return v; }
#endif
static INLINE CONST VECTOR_CC vfloat vupper_vf_vf(vfloat d) {
return vreinterpret_vf_vi2(vand_vi2_vi2_vi2(vreinterpret_vi2_vf(d), vcast_vi2_i(0xfffff000)));
}
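// Editor's note (illustrative addition): vupper_vf_vf is a Dekker-style split;
// masking with 0xfffff000 clears the low 12 mantissa bits, so an "upper" half
// carries at most 12 significant bits and the product of two such halves is
// exact in single precision (12 + 12 <= 24). A hypothetical scalar equivalent,
// assuming <stdint.h> and <string.h>:
//
//   static float upperf(float d) {
//     uint32_t u; memcpy(&u, &d, sizeof u);
//     u &= 0xfffff000u;                    // keep sign, exponent, top bits
//     memcpy(&d, &u, sizeof d);
//     return d;
//   }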
static INLINE CONST VECTOR_CC vfloat2 vcast_vf2_vf_vf(vfloat h, vfloat l) {
return vf2setxy_vf2_vf_vf(h, l);
}
static INLINE CONST VECTOR_CC vfloat2 vcast_vf2_f_f(float h, float l) {
return vf2setxy_vf2_vf_vf(vcast_vf_f(h), vcast_vf_f(l));
}
static INLINE CONST VECTOR_CC vfloat2 vcast_vf2_d(double d) {
return vf2setxy_vf2_vf_vf(vcast_vf_f(d), vcast_vf_f(d - (float)d));
}
static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_vf2_vf2(vopmask m, vfloat2 x, vfloat2 y) {
return vf2setxy_vf2_vf_vf(vsel_vf_vo_vf_vf(m, vf2getx_vf_vf2(x), vf2getx_vf_vf2(y)), vsel_vf_vo_vf_vf(m, vf2gety_vf_vf2(x), vf2gety_vf_vf2(y)));
}
static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_f_f_f_f(vopmask o, float x1, float y1, float x0, float y0) {
return vf2setxy_vf2_vf_vf(vsel_vf_vo_f_f(o, x1, x0), vsel_vf_vo_f_f(o, y1, y0));
}
static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
return vsel_vf2_vo_vf2_vf2(o0, vcast_vf2_d(d0), vsel_vf2_vo_vf2_vf2(o1, vcast_vf2_d(d1), vcast_vf2_d(d2)));
}
static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
return vsel_vf2_vo_vf2_vf2(o0, vcast_vf2_d(d0), vsel_vf2_vo_vf2_vf2(o1, vcast_vf2_d(d1), vsel_vf2_vo_vf2_vf2(o2, vcast_vf2_d(d2), vcast_vf2_d(d3))));
}
static INLINE CONST VECTOR_CC vfloat2 vabs_vf2_vf2(vfloat2 x) {
return vcast_vf2_vf_vf(vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0)), vreinterpret_vm_vf(vf2getx_vf_vf2(x))), vreinterpret_vm_vf(vf2getx_vf_vf2(x)))),
vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0)), vreinterpret_vm_vf(vf2getx_vf_vf2(x))), vreinterpret_vm_vf(vf2gety_vf_vf2(x)))));
}
static INLINE CONST VECTOR_CC vfloat vadd_vf_3vf(vfloat v0, vfloat v1, vfloat v2) {
return vadd_vf_vf_vf(vadd_vf_vf_vf(v0, v1), v2);
}
static INLINE CONST VECTOR_CC vfloat vadd_vf_4vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3) {
return vadd_vf_3vf(vadd_vf_vf_vf(v0, v1), v2, v3);
}
static INLINE CONST VECTOR_CC vfloat vadd_vf_5vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4) {
return vadd_vf_4vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4);
}
static INLINE CONST VECTOR_CC vfloat vadd_vf_6vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4, vfloat v5) {
return vadd_vf_5vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4, v5);
}
static INLINE CONST VECTOR_CC vfloat vadd_vf_7vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4, vfloat v5, vfloat v6) {
return vadd_vf_6vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4, v5, v6);
}
static INLINE CONST VECTOR_CC vfloat vsub_vf_3vf(vfloat v0, vfloat v1, vfloat v2) {
return vsub_vf_vf_vf(vsub_vf_vf_vf(v0, v1), v2);
}
static INLINE CONST VECTOR_CC vfloat vsub_vf_4vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3) {
return vsub_vf_3vf(vsub_vf_vf_vf(v0, v1), v2, v3);
}
static INLINE CONST VECTOR_CC vfloat vsub_vf_5vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4) {
return vsub_vf_4vf(vsub_vf_vf_vf(v0, v1), v2, v3, v4);
}
//
static INLINE CONST VECTOR_CC vfloat2 dfneg_vf2_vf2(vfloat2 x) {
return vcast_vf2_vf_vf(vneg_vf_vf(vf2getx_vf_vf2(x)), vneg_vf_vf(vf2gety_vf_vf2(x)));
}
static INLINE CONST VECTOR_CC vfloat2 dfabs_vf2_vf2(vfloat2 x) {
return vcast_vf2_vf_vf(vabs_vf_vf(vf2getx_vf_vf2(x)),
vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vf2gety_vf_vf2(x)), vand_vm_vm_vm(vreinterpret_vm_vf(vf2getx_vf_vf2(x)), vreinterpret_vm_vf(vcast_vf_f(-0.0f))))));
}
static INLINE CONST VECTOR_CC vfloat2 dfnormalize_vf2_vf2(vfloat2 t) {
vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(t), vf2gety_vf_vf2(t));
return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(t), s), vf2gety_vf_vf2(t)));
}
static INLINE CONST VECTOR_CC vfloat2 dfscale_vf2_vf2_vf(vfloat2 d, vfloat s) {
return vf2setxy_vf2_vf_vf(vmul_vf_vf_vf(vf2getx_vf_vf2(d), s), vmul_vf_vf_vf(vf2gety_vf_vf2(d), s));
}
static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf_vf(vfloat x, vfloat y) {
vfloat s = vadd_vf_vf_vf(x, y);
return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(x, s), y));
}
static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf_vf(vfloat x, vfloat y) {
vfloat s = vadd_vf_vf_vf(x, y);
vfloat v = vsub_vf_vf_vf(s, x);
return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(x, vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(y, v)));
}
static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf_vf2(vfloat x, vfloat2 y) {
vfloat s = vadd_vf_vf_vf(x, vf2getx_vf_vf2(y));
vfloat v = vsub_vf_vf_vf(s, x);
return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vadd_vf_vf_vf(vsub_vf_vf_vf(x, vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(vf2getx_vf_vf2(y), v)), vf2gety_vf_vf2(y)));
}
static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf2_vf(vfloat2 x, vfloat y) {
vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), y);
return vf2setxy_vf2_vf_vf(s, vadd_vf_3vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), s), y, vf2gety_vf_vf2(x)));
}
static INLINE CONST VECTOR_CC vfloat2 dfsub_vf2_vf2_vf(vfloat2 x, vfloat y) {
vfloat s = vsub_vf_vf_vf(vf2getx_vf_vf2(x), y);
return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), s), y), vf2gety_vf_vf2(x)));
}
static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf2_vf(vfloat2 x, vfloat y) {
vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), y);
vfloat v = vsub_vf_vf_vf(s, vf2getx_vf_vf2(x));
vfloat t = vadd_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(y, v));
return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(t, vf2gety_vf_vf2(x)));
}
static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf_vf2(vfloat x, vfloat2 y) {
vfloat s = vadd_vf_vf_vf(x, vf2getx_vf_vf2(y));
return vf2setxy_vf2_vf_vf(s, vadd_vf_3vf(vsub_vf_vf_vf(x, s), vf2getx_vf_vf2(y), vf2gety_vf_vf2(y)));
}
static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
// |x| >= |y|
vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));
return vf2setxy_vf2_vf_vf(s, vadd_vf_4vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), s), vf2getx_vf_vf2(y), vf2gety_vf_vf2(x), vf2gety_vf_vf2(y)));
}
static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));
vfloat v = vsub_vf_vf_vf(s, vf2getx_vf_vf2(x));
vfloat t = vadd_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(vf2getx_vf_vf2(y), v));
return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(t, vadd_vf_vf_vf(vf2gety_vf_vf2(x), vf2gety_vf_vf2(y))));
}
static INLINE CONST VECTOR_CC vfloat2 dfsub_vf2_vf_vf(vfloat x, vfloat y) {
// |x| >= |y|
vfloat s = vsub_vf_vf_vf(x, y);
return vf2setxy_vf2_vf_vf(s, vsub_vf_vf_vf(vsub_vf_vf_vf(x, s), y));
}
static INLINE CONST VECTOR_CC vfloat2 dfsub_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
// |x| >= |y|
vfloat s = vsub_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));
vfloat t = vsub_vf_vf_vf(vf2getx_vf_vf2(x), s);
t = vsub_vf_vf_vf(t, vf2getx_vf_vf2(y));
t = vadd_vf_vf_vf(t, vf2gety_vf_vf2(x));
return vf2setxy_vf2_vf_vf(s, vsub_vf_vf_vf(t, vf2gety_vf_vf2(y)));
}
#ifdef ENABLE_FMA_SP
static INLINE CONST VECTOR_CC vfloat2 dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) {
vfloat t = vrec_vf_vf(vf2getx_vf_vf2(d));
vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(n), t);
vfloat u = vfmapn_vf_vf_vf_vf(t, vf2getx_vf_vf2(n), s);
vfloat v = vfmanp_vf_vf_vf_vf(vf2gety_vf_vf2(d), t, vfmanp_vf_vf_vf_vf(vf2getx_vf_vf2(d), t, vcast_vf_f(1)));
return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(s, v, vfma_vf_vf_vf_vf(vf2gety_vf_vf2(n), t, u)));
}
static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf_vf(vfloat x, vfloat y) {
vfloat s = vmul_vf_vf_vf(x, y);
return vf2setxy_vf2_vf_vf(s, vfmapn_vf_vf_vf_vf(x, y, s));
}
static INLINE CONST VECTOR_CC vfloat2 dfsqu_vf2_vf2(vfloat2 x) {
vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x));
return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x)), vf2gety_vf_vf2(x), vfmapn_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x), s)));
}
static INLINE CONST VECTOR_CC vfloat dfsqu_vf_vf2(vfloat2 x) {
return vfma_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x), vadd_vf_vf_vf(vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x)), vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x))));
}
static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));
return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(y), vfma_vf_vf_vf_vf(vf2gety_vf_vf2(x), vf2getx_vf_vf2(y), vfmapn_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y), s))));
}
static INLINE CONST VECTOR_CC vfloat dfmul_vf_vf2_vf2(vfloat2 x, vfloat2 y) {
return vfma_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y), vfma_vf_vf_vf_vf(vf2gety_vf_vf2(x), vf2getx_vf_vf2(y), vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(y))));
}
static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf(vfloat2 x, vfloat y) {
vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), y);
return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(vf2gety_vf_vf2(x), y, vfmapn_vf_vf_vf_vf(vf2getx_vf_vf2(x), y, s)));
}
static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf(vfloat d) {
vfloat s = vrec_vf_vf(d);
return vf2setxy_vf2_vf_vf(s, vmul_vf_vf_vf(s, vfmanp_vf_vf_vf_vf(d, s, vcast_vf_f(1))));
}
static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf2(vfloat2 d) {
vfloat s = vrec_vf_vf(vf2getx_vf_vf2(d));
return vf2setxy_vf2_vf_vf(s, vmul_vf_vf_vf(s, vfmanp_vf_vf_vf_vf(vf2gety_vf_vf2(d), s, vfmanp_vf_vf_vf_vf(vf2getx_vf_vf2(d), s, vcast_vf_f(1)))));
}
#else // #ifdef ENABLE_FMA_SP
static INLINE CONST VECTOR_CC vfloat2 dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) {
vfloat t = vrec_vf_vf(vf2getx_vf_vf2(d));
vfloat dh = vupper_vf_vf(vf2getx_vf_vf2(d)), dl = vsub_vf_vf_vf(vf2getx_vf_vf2(d), dh);
vfloat th = vupper_vf_vf(t ), tl = vsub_vf_vf_vf(t , th);
vfloat nhh = vupper_vf_vf(vf2getx_vf_vf2(n)), nhl = vsub_vf_vf_vf(vf2getx_vf_vf2(n), nhh);
vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(n), t);
vfloat u, w;
w = vcast_vf_f(-1);
w = vmla_vf_vf_vf_vf(dh, th, w);
w = vmla_vf_vf_vf_vf(dh, tl, w);
w = vmla_vf_vf_vf_vf(dl, th, w);
w = vmla_vf_vf_vf_vf(dl, tl, w);
w = vneg_vf_vf(w);
u = vmla_vf_vf_vf_vf(nhh, th, vneg_vf_vf(s));
u = vmla_vf_vf_vf_vf(nhh, tl, u);
u = vmla_vf_vf_vf_vf(nhl, th, u);
u = vmla_vf_vf_vf_vf(nhl, tl, u);
u = vmla_vf_vf_vf_vf(s, w, u);
return vf2setxy_vf2_vf_vf(s, vmla_vf_vf_vf_vf(t, vsub_vf_vf_vf(vf2gety_vf_vf2(n), vmul_vf_vf_vf(s, vf2gety_vf_vf2(d))), u));
}
static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf_vf(vfloat x, vfloat y) {
vfloat xh = vupper_vf_vf(x), xl = vsub_vf_vf_vf(x, xh);
vfloat yh = vupper_vf_vf(y), yl = vsub_vf_vf_vf(y, yh);
vfloat s = vmul_vf_vf_vf(x, y), t;
t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(s));
t = vmla_vf_vf_vf_vf(xl, yh, t);
t = vmla_vf_vf_vf_vf(xh, yl, t);
t = vmla_vf_vf_vf_vf(xl, yl, t);
return vf2setxy_vf2_vf_vf(s, t);
}
static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf(vfloat2 x, vfloat y) {
vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);
vfloat yh = vupper_vf_vf(y ), yl = vsub_vf_vf_vf(y , yh);
vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), y), t;
t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(s));
t = vmla_vf_vf_vf_vf(xl, yh, t);
t = vmla_vf_vf_vf_vf(xh, yl, t);
t = vmla_vf_vf_vf_vf(xl, yl, t);
t = vmla_vf_vf_vf_vf(vf2gety_vf_vf2(x), y, t);
return vf2setxy_vf2_vf_vf(s, t);
}
static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);
vfloat yh = vupper_vf_vf(vf2getx_vf_vf2(y)), yl = vsub_vf_vf_vf(vf2getx_vf_vf2(y), yh);
vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y)), t;
t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(s));
t = vmla_vf_vf_vf_vf(xl, yh, t);
t = vmla_vf_vf_vf_vf(xh, yl, t);
t = vmla_vf_vf_vf_vf(xl, yl, t);
t = vmla_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(y), t);
t = vmla_vf_vf_vf_vf(vf2gety_vf_vf2(x), vf2getx_vf_vf2(y), t);
return vf2setxy_vf2_vf_vf(s, t);
}
static INLINE CONST VECTOR_CC vfloat dfmul_vf_vf2_vf2(vfloat2 x, vfloat2 y) {
vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);
vfloat yh = vupper_vf_vf(vf2getx_vf_vf2(y)), yl = vsub_vf_vf_vf(vf2getx_vf_vf2(y), yh);
return vadd_vf_6vf(vmul_vf_vf_vf(vf2gety_vf_vf2(x), yh), vmul_vf_vf_vf(xh, vf2gety_vf_vf2(y)), vmul_vf_vf_vf(xl, yl), vmul_vf_vf_vf(xh, yl), vmul_vf_vf_vf(xl, yh), vmul_vf_vf_vf(xh, yh));
}
static INLINE CONST VECTOR_CC vfloat2 dfsqu_vf2_vf2(vfloat2 x) {
vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);
vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x)), t;
t = vmla_vf_vf_vf_vf(xh, xh, vneg_vf_vf(s));
t = vmla_vf_vf_vf_vf(vadd_vf_vf_vf(xh, xh), xl, t);
t = vmla_vf_vf_vf_vf(xl, xl, t);
t = vmla_vf_vf_vf_vf(vf2getx_vf_vf2(x), vadd_vf_vf_vf(vf2gety_vf_vf2(x), vf2gety_vf_vf2(x)), t);
return vf2setxy_vf2_vf_vf(s, t);
}
static INLINE CONST VECTOR_CC vfloat dfsqu_vf_vf2(vfloat2 x) {
vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);
return vadd_vf_5vf(vmul_vf_vf_vf(xh, vf2gety_vf_vf2(x)), vmul_vf_vf_vf(xh, vf2gety_vf_vf2(x)), vmul_vf_vf_vf(xl, xl), vadd_vf_vf_vf(vmul_vf_vf_vf(xh, xl), vmul_vf_vf_vf(xh, xl)), vmul_vf_vf_vf(xh, xh));
}
static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf(vfloat d) {
vfloat t = vrec_vf_vf(d);
vfloat dh = vupper_vf_vf(d), dl = vsub_vf_vf_vf(d, dh);
vfloat th = vupper_vf_vf(t), tl = vsub_vf_vf_vf(t, th);
vfloat u = vcast_vf_f(-1);
u = vmla_vf_vf_vf_vf(dh, th, u);
u = vmla_vf_vf_vf_vf(dh, tl, u);
u = vmla_vf_vf_vf_vf(dl, th, u);
u = vmla_vf_vf_vf_vf(dl, tl, u);
return vf2setxy_vf2_vf_vf(t, vmul_vf_vf_vf(vneg_vf_vf(t), u));
}
static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf2(vfloat2 d) {
vfloat t = vrec_vf_vf(vf2getx_vf_vf2(d));
vfloat dh = vupper_vf_vf(vf2getx_vf_vf2(d)), dl = vsub_vf_vf_vf(vf2getx_vf_vf2(d), dh);
vfloat th = vupper_vf_vf(t ), tl = vsub_vf_vf_vf(t , th);
vfloat u = vcast_vf_f(-1);
u = vmla_vf_vf_vf_vf(dh, th, u);
u = vmla_vf_vf_vf_vf(dh, tl, u);
u = vmla_vf_vf_vf_vf(dl, th, u);
u = vmla_vf_vf_vf_vf(dl, tl, u);
u = vmla_vf_vf_vf_vf(vf2gety_vf_vf2(d), t, u);
return vf2setxy_vf2_vf_vf(t, vmul_vf_vf_vf(vneg_vf_vf(t), u));
}
#endif // #ifdef ENABLE_FMA_SP
static INLINE CONST VECTOR_CC vfloat2 dfsqrt_vf2_vf2(vfloat2 d) {
#ifdef ENABLE_RECSQRT_SP
vfloat x = vrecsqrt_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)));
vfloat2 r = dfmul_vf2_vf2_vf(d, x);
return dfscale_vf2_vf2_vf(dfmul_vf2_vf2_vf2(r, dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf(r, x), vcast_vf_f(-3.0))), vcast_vf_f(-0.5));
#else
vfloat t = vsqrt_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)));
return dfscale_vf2_vf2_vf(dfmul_vf2_vf2_vf2(dfadd2_vf2_vf2_vf2(d, dfmul_vf2_vf_vf(t, t)), dfrec_vf2_vf(t)), vcast_vf_f(0.5));
#endif
}
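// Editor's note (not part of the upstream file): the ENABLE_RECSQRT_SP branch
// above is one Newton-Raphson step built on a reciprocal-sqrt estimate
// x ~= 1/sqrt(d): with r = d*x, the update -0.5 * r * (r*x - 3) equals
// r * (3 - d*x*x) / 2, the standard refinement toward sqrt(d).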
static INLINE CONST VECTOR_CC vfloat2 dfsqrt_vf2_vf(vfloat d) {
vfloat t = vsqrt_vf_vf(d);
return dfscale_vf2_vf2_vf(dfmul_vf2_vf2_vf2(dfadd2_vf2_vf_vf2(d, dfmul_vf2_vf_vf(t, t)), dfrec_vf2_vf(t)), vcast_vf_f(0.5f));
}

View File

@@ -0,0 +1,40 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
// These are macros for evaluating polynomials using Estrin's method
#define POLY2(x, c1, c0) MLA(x, C2V(c1), C2V(c0))
#define POLY3(x, x2, c2, c1, c0) MLA(x2, C2V(c2), MLA(x, C2V(c1), C2V(c0)))
#define POLY4(x, x2, c3, c2, c1, c0) MLA(x2, MLA(x, C2V(c3), C2V(c2)), MLA(x, C2V(c1), C2V(c0)))
#define POLY5(x, x2, x4, c4, c3, c2, c1, c0) MLA(x4, C2V(c4), POLY4(x, x2, c3, c2, c1, c0))
#define POLY6(x, x2, x4, c5, c4, c3, c2, c1, c0) MLA(x4, POLY2(x, c5, c4), POLY4(x, x2, c3, c2, c1, c0))
#define POLY7(x, x2, x4, c6, c5, c4, c3, c2, c1, c0) MLA(x4, POLY3(x, x2, c6, c5, c4), POLY4(x, x2, c3, c2, c1, c0))
#define POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0) MLA(x4, POLY4(x, x2, c7, c6, c5, c4), POLY4(x, x2, c3, c2, c1, c0))
#define POLY9(x, x2, x4, x8, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
MLA(x8, C2V(c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY10(x, x2, x4, x8, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
MLA(x8, POLY2(x, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY11(x, x2, x4, x8, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
MLA(x8, POLY3(x, x2, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY12(x, x2, x4, x8, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
MLA(x8, POLY4(x, x2, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY13(x, x2, x4, x8, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
MLA(x8, POLY5(x, x2, x4, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY14(x, x2, x4, x8, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
MLA(x8, POLY6(x, x2, x4, cd, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY15(x, x2, x4, x8, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
MLA(x8, POLY7(x, x2, x4, ce, cd, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
MLA(x8, POLY8(x, x2, x4, cf, ce, cd, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY17(x, x2, x4, x8, x16, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
MLA(x16, C2V(d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY18(x, x2, x4, x8, x16, d1, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
MLA(x16, POLY2(x, d1, d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY19(x, x2, x4, x8, x16, d2, d1, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
MLA(x16, POLY3(x, x2, d2, d1, d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY20(x, x2, x4, x8, x16, d3, d2, d1, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
MLA(x16, POLY4(x, x2, d3, d2, d1, d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY21(x, x2, x4, x8, x16, d4, d3, d2, d1, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
MLA(x16, POLY5(x, x2, x4, d4, d3, d2, d1, d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))
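// Editor's note (illustrative addition): Estrin's scheme pairs coefficients so
// the multiply-adds form a balanced tree and can issue in parallel, at the
// cost of precomputing x2 = x*x, x4 = x2*x2, ... once per call site. For
// example, POLY4 evaluates c3*x^3 + c2*x^2 + c1*x + c0 as
// x2*(c3*x + c2) + (c1*x + c0); a scalar sketch, assuming <math.h>:
//
//   static double poly4(double x, double x2,
//                       double c3, double c2, double c1, double c0) {
//     return fma(x2, fma(x, c3, c2), fma(x, c1, c0));
//   }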

View File

@@ -0,0 +1,92 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <quadmath.h>
#include <inttypes.h>
static __float128 mpfr_get_f128(mpfr_t m, mpfr_rnd_t rnd) {
if (isnan(mpfr_get_d(m, GMP_RNDN))) return __builtin_nan("");
mpfr_t frr, frd;
mpfr_inits(frr, frd, NULL);
mpfr_exp_t e;
mpfr_frexp(&e, frr, m, GMP_RNDN);
double d0 = mpfr_get_d(frr, GMP_RNDN);
mpfr_set_d(frd, d0, GMP_RNDN);
mpfr_sub(frr, frr, frd, GMP_RNDN);
double d1 = mpfr_get_d(frr, GMP_RNDN);
mpfr_set_d(frd, d1, GMP_RNDN);
mpfr_sub(frr, frr, frd, GMP_RNDN);
double d2 = mpfr_get_d(frr, GMP_RNDN);
mpfr_clears(frr, frd, NULL);
return ldexpq((__float128)d2 + (__float128)d1 + (__float128)d0, e);
}
static void mpfr_set_f128(mpfr_t frx, __float128 f, mpfr_rnd_t rnd) {
char s[128];
quadmath_snprintf(s, 120, "%.50Qg", f);
mpfr_set_str(frx, s, 10, rnd);
}
static void printf128(__float128 f) {
char s[128];
quadmath_snprintf(s, 120, "%.50Qg", f);
printf("%s", s);
}
static char frstr[16][1000];
static int frstrcnt = 0;
static char *toBC(double d) {
union {
double d;
uint64_t u64;
int64_t i64;
} cnv;
cnv.d = d;
int64_t l = cnv.i64;
int e = (int)((l >> 52) & ~(-1L << 11));
int s = (int)(l >> 63);
l = d == 0 ? 0 : ((l & ~((-1L) << 52)) | (1L << 52));
char *ptr = frstr[(frstrcnt++) & 15];
sprintf(ptr, "%s%lld*2^%d", s != 0 ? "-" : "", (long long int)l, (e-0x3ff-52));
return ptr;
}
static char *toBCq(__float128 d) {
union {
__float128 d;
__uint128_t u128;
} cnv;
cnv.d = d;
__uint128_t m = cnv.u128;
int e = (int)((m >> 112) & ~(-1L << 15));
int s = (int)(m >> 127);
m = d == 0 ? 0 : ((m & ((((__uint128_t)1) << 112)-1)) | ((__uint128_t)1 << 112));
uint64_t h = m / UINT64_C(10000000000000000000);
uint64_t l = m % UINT64_C(10000000000000000000);
char *ptr = frstr[(frstrcnt++) & 15];
sprintf(ptr, "%s%" PRIu64 "%019" PRIu64 "*2^%d", s != 0 ? "-" : "", h, l, (e-0x3fff-112));
return ptr;
}
static int xisnanq(Sleef_quad x) { return x != x; }
static int xisinfq(Sleef_quad x) { return x == (Sleef_quad)__builtin_inf() || x == -(Sleef_quad)__builtin_inf(); }
static int xisfiniteq(Sleef_quad x) { return !xisnanq(x) && !xisinfq(x); }
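// Editor's note (illustrative addition): toBC/toBCq above print a value
// exactly as "m*2^e" so it can be pasted into an arbitrary-precision
// calculator such as bc for reference comparison. Hypothetical usage:
//
//   printf("%s\n", toBC(0.1));   // prints 7205759403792794*2^-56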

View File

@@ -0,0 +1,683 @@
double2
double3
float2
atan2k
atan2kf
atan2kf_u1
atan2k_u1
cospik
cospifk
dd
dd2
dd2geta_vd2_dd2
dd2getb_vd2_dd2
dd2setab_dd2_vd2_vd2
ddabs_vd2_vd2
ddadd2_vd2_vd2_vd
ddadd2_vd2_vd2_vd2
ddadd2_vd2_vd_vd
ddadd2_vd2_vd_vd2
ddadd_vd2_vd2_vd
ddadd_vd2_vd2_vd2
ddadd_vd2_vd_vd
ddadd_vd2_vd_vd2
dddiv_vd2_vd2_vd2
ddi
ddi_t
ddigetdd_vd2_ddi
ddigeti_vi_ddi
ddisetdd_ddi_ddi_vd2
ddisetddi_ddi_vd2_vi
ddmla_vd2_vd2_vd2_vd2
ddmla_vd2_vd_vd2_vd2
ddmul_vd2_vd2_vd
ddmul_vd2_vd2_vd2
ddmul_vd2_vd_vd
ddmul_vd_vd2_vd2
ddneg_vd2_vd2
ddnormalize_vd2_vd2
ddrec_vd2_vd
ddrec_vd2_vd2
ddscale_vd2_vd2_d
ddscale_vd2_vd2_vd
ddsqrt_vd2_vd
ddsqrt_vd2_vd2
ddsqu_vd2_vd2
ddsqu_vd_vd2
ddsub_vd2_vd2_vd
ddsub_vd2_vd2_vd2
ddsub_vd2_vd_vd
df
df2
df2geta_vf2_df2
df2getb_vf2_df2
df2setab_df2_vf2_vf2
dfabs_vf2_vf2
dfadd2_vf2_vf2_vf
dfadd2_vf2_vf2_vf2
dfadd2_vf2_vf_vf
dfadd2_vf2_vf_vf2
dfadd_vf2_vf2_vf
dfadd_vf2_vf2_vf2
dfadd_vf2_vf_vf
dfadd_vf2_vf_vf2
dfdiv_vf2_vf2_vf2
dfi
dfi_t
dfigetdf_vf2_dfi
dfigeti_vi2_dfi
dfisetdf_dfi_dfi_vf2
dfisetdfi_dfi_vf2_vi2
dfmla_vf2_vf_vf2_vf2
dfmul_vf2_vf2_vf
dfmul_vf2_vf2_vf2
dfmul_vf2_vf_vf
dfmul_vf_vf2_vf2
dfneg_vf2_vf2
dfnormalize_vf2_vf2
dfrec_vf2_vf
dfrec_vf2_vf2
dfscale_vf2_vf2_vf
dfsqrt_vf2_vf
dfsqrt_vf2_vf2
dfsqu_vf2_vf2
dfsqu_vf_vf2
dfsub_vf2_vf2_vf
dfsub_vf2_vf2_vf2
dfsub_vf2_vf_vf
di_t
digetd_vd_di
digeti_vi_di
disetdi_di_vd_vi
expk
expk2
expk2f
expk3f
expkf
expm1fk
expm1k
fi_t
figetd_vf_di
figeti_vi2_di
fisetdi_fi_vf_vi2
gammafk
gammak
imdvq_vq_vm_vm
logk
logk2
logk2f
logk3f
logkf
poly2dd
poly2dd_b
poly2df
poly2df_b
poly4dd
poly4df
pragma
rempi
rempif
rempisub
rempisubf
sinpifk
sinpik
td
tdi_t
tdigeti_vi_tdi
tdigettd_vd3_tdi
tdigetx_vd_tdi
tdisettdi_tdi_vd3_vi
tdx
tdxgetd3_vd3_tdx
tdxgetd3x_vd_tdx
tdxgetd3y_vd_tdx
tdxgetd3z_vd_tdx
tdxgete_vm_tdx
tdxsetd3_tdx_tdx_vd3
tdxsete_tdx_tdx_vm
tdxseted3_tdx_vm_vd3
tdxsetexyz_tdx_vm_vd_vd_vd
tdxsetx_tdx_tdx_vd
tdxsetxyz_tdx_tdx_vd_vd_vd
tdxsety_tdx_tdx_vd
tdxsetz_tdx_tdx_vd
vabs_vd_vd
vabs_vf2_vf2
vabs_vf_vf
add128_vq_vq_vq
vadd64_vm_vm_vm
vadd_vd_3vd
vadd_vd_4vd
vadd_vd_5vd
vadd_vd_6vd
vadd_vd_7vd
vadd_vd_vd_vd
vadd_vf_3vf
vadd_vf_4vf
vadd_vf_5vf
vadd_vf_6vf
vadd_vf_7vf
vadd_vf_vf_vf
vadd_vi2_vi2_vi2
vadd_vi_vi_vi
vand_vi2_vi2_vi2
vand_vi2_vo_vi2
vand_vi_vi_vi
vand_vi_vo_vi
vand_vm_vm_vm
vand_vm_vo32_vm
vand_vm_vo64_vm
vand_vo_vo_vo
vandnot_vi2_vi2_vi2
vandnot_vi2_vo_vi2
vandnot_vi_vi_vi
vandnot_vi_vo_vi
vandnot_vm_vm_vm
vandnot_vm_vo32_vm
vandnot_vm_vo64_vm
vandnot_vo_vo_vo
vargquad
vavailability_i
cast_aq_vq
vcast_d_vd
vcast_f_vf
vcast_vd2_d2
vcast_vd2_d_d
vcast_vd2_vd_vd
vcast_vd_d
vcast_vd_vi
vcast_vd_vm
vcast_vf2_d
vcast_vf2_f_f
vcast_vf2_vf_vf
vcast_vf_f
vcast_vf_vi2
vcast_vi2_i
vcast_vi2_i_i
vcast_vi2_vm
vcast_vi_i
vcast_vi_vm
vcast_vm_i64
vcast_vm_i_i
vcast_vm_u64
vcast_vm_vi
vcast_vm_vi2
vcast_vm_vo
vcast_vo_i
vcast_vo32_vo64
vcast_vo64_vo32
cast_vq_aq
vclearlsb_vd_vd_i
vcopysign_vd_vd_vd
vcopysign_vf_vf_vf
vd
vd2getx_vd_vd2
vd2gety_vd_vd2
vd2setx_vd2_vd2_vd
vd2setxy_vd2_vd_vd
vd2sety_vd2_vd2_vd
vd3getx_vd_vd3
vd3gety_vd_vd3
vd3getz_vd_vd3
vd3setx_vd3_vd3_vd
vd3setxyz_vd3_vd_vd_vd
vd3sety_vd3_vd3_vd
vd3setz_vd3_vd3_vd
vdiv_vd_vd_vd
vdiv_vf_vf_vf
vdouble
vdouble2
vdouble3
veq64_vo_vm_vm
veq_vi2_vi2_vi2
veq_vi_vi_vi
veq_vo_vd_vd
veq_vo_vf_vf
veq_vo_vi2_vi2
veq_vo_vi_vi
versatileVector
vf2getx_vf_vf2
vf2gety_vf_vf2
vf2setx_vf2_vf2_vf
vf2setxy_vf2_vf_vf
vf2sety_vf2_vf2_vf
vfloat
vfloat2
vfma_vd_vd_vd_vd
vfma_vf_vf_vf_vf
vfmann_vd_vd_vd_vd
vfmann_vf_vf_vf_vf
vfmanp_vd_vd_vd_vd
vfmanp_vf_vf_vf_vf
vfmapn_vd_vd_vd_vd
vfmapn_vf_vf_vf_vf
vfmapp_vd_vd_vd_vd
vfmapp_vf_vf_vf_vf
vgather_vd_p_vi
vgather_vf_p_vi2
vge_vo_vd_vd
vge_vo_vf_vf
vgetexp_vd_vd
vgetexp_vf_vf
vgetmant_vd_vd
vgetmant_vf_vf
vgt64_vo_vm_vm
vgt_vi2_vi2_vi2
vgt_vi_vi_vi
vgt_vo_vd_vd
vgt_vo_vf_vf
vgt_vo_vi2_vi2
vgt_vo_vi_vi
vilogb2k_vi2_vf
vilogb2k_vi_vd
vilogb2k_vm_vd
vilogb3k_vm_vd
vilogbk_vi2_vf
vilogbk_vi_vd
vilogbk_vm_vd
vint
vint2
vint64
visinf2_vd_vd_vd
visinf2_vf_vf_vf
visinf_vo_vd
visinf_vo_vf
visint_vo_vd
visint_vo_vf
visminf_vo_vd
visminf_vo_vf
visnan_vo_vd
visnan_vo_vf
visnegzero_vo_vd
visnegzero_vo_vf
visnonfinite_vo_vd
visnumber_vo_vd
visnumber_vo_vf
visodd_vo_vd
vispinf_vo_vd
vispinf_vo_vf
vldexp1_vd_vd_vm
vldexp2_vd_vd_vi
vldexp2_vd_vd_vm
vldexp2_vf_vf_vi2
vldexp3_vd_vd_vi
vldexp3_vd_vd_vm
vldexp3_vf_vf_vi2
vldexp_vd_vd_vi
vldexp_vf_vf_vi2
vle_vo_vd_vd
vle_vo_vf_vf
vload_vd_p
vload_vf_p
vloadu_vd_p
vloadu_vf_p
vloadu_vi2_p
vloadu_vi_p
loadu_vq_p
vlt64_vo_vm_vm
vlt_vo_vd_vd
vlt_vo_vf_vf
vmask
vmax_vd_vd_vd
vmax_vf_vf_vf
vmin_vd_vd_vd
vmin_vf_vf_vf
vmla_vd_vd_vd_vd
vmla_vf_vf_vf_vf
vmlanp_vd_vd_vd_vd
vmlanp_vf_vf_vf_vf
vmlapn_vd_vd_vd_vd
vmlapn_vf_vf_vf_vf
vmlsubadd_vd_vd_vd_vd
vmlsubadd_vf_vf_vf_vf
vmul_vd_vd_vd
vmul_vf_vf_vf
vmulsign_vd_vd_vd
vmulsign_vf_vf_vf
vneg64_vm_vm
vneg_vd_vd
vneg_vf_vf
vneg_vi2_vi2
vneg_vi_vi
vnegpos_vd_vd
vnegpos_vf_vf
vneq_vo_vd_vd
vneq_vo_vf_vf
vnot_vo32_vo32
vnot_vo64_vo64
vopmask
vor_vi2_vi2_vi2
vor_vi_vi_vi
vor_vm_vm_vm
vor_vm_vo32_vm
vor_vm_vo64_vm
vor_vo_vo_vo
vorsign_vd_vd_vd
vorsign_vf_vf_vf
vposneg_vd_vd
vposneg_vf_vf
vpow2i_vd_vi
vpow2i_vd_vm
vpow2i_vf_vi2
vprefetch_v_p
vptrunc_vd_vd
vptrunc_vf_vf
vqgetx_vm_vq
vqgety_vm_vq
vqsetx_vq_vq_vm
vqsetxy_vq_vm_vm
vqsety_vq_vq_vm
vquad
vrec_vd_vd
vrec_vf_vf
vreinterpret_vd_vf
vreinterpret_vd_vm
vreinterpret_vf_vd
vreinterpret_vf_vi2
vreinterpret_vf_vm
vreinterpret_vi2_vf
vreinterpret_vi64_vm
vreinterpret_vm_vd
vreinterpret_vm_vf
vreinterpret_vm_vi64
vreinterpret_vm_vu64
vreinterpret_vu64_vm
vrev21_vd_vd
vrev21_vf_vf
vreva2_vd_vd
vreva2_vf_vf
vrint_vd_vd
vrint2_vd_vd
vrint_vf_vf
vrint_vi2_vf
vrint_vi_vd
vrintfk2_vf_vf
vrintk2_vd_vd
vscatter2_v_p_i_i_vd
vscatter2_v_p_i_i_vf
vsel_vd2_vo_d_d_d_d
vsel_vd2_vo_vd2_vd2
vsel_vd_vo_d_d
vsel_vd_vo_vd_vd
vsel_vd_vo_vo_d_d_d
vsel_vd_vo_vo_vo_d_d_d_d
vsel_vf2_vo_f_f_f_f
vsel_vf2_vo_vf2_vf2
vsel_vf2_vo_vo_d_d_d
vsel_vf2_vo_vo_vo_d_d_d_d
vsel_vf_vo_f_f
vsel_vf_vo_vf_vf
vsel_vf_vo_vo_f_f_f
vsel_vf_vo_vo_vo_f_f_f_f
vsel_vi2_vf_vf_vi2_vi2
vsel_vi2_vf_vi2
vsel_vi2_vo_vi2_vi2
vsel_vi_vd_vd_vi_vi
vsel_vi_vd_vi
vsel_vi_vo_vi_vi
vsel_vm_vo64_vm_vm
sel_vq_vo_vq_vq
vsign_vd_vd
vsign_vf_vf
vsignbit_vm_vd
vsignbit_vm_vf
vsignbit_vo_vd
vsignbit_vo_vf
vsll_vi2_vi2_i
vsll_vi_vi_i
vsqrt_vd_vd
vsqrt_vf_vf
vsra_vi2_vi2_i
vsra_vi_vi_i
vsrl_vi2_vi2_i
vsrl_vi_vi_i
vsscatter2_v_p_i_i_vd
vsscatter2_v_p_i_i_vf
vstore_v_p_vd
vstore_v_p_vf
vstoreu_v_p_vd
vstoreu_v_p_vf
vstoreu_v_p_vi
vstoreu_v_p_vi2
storeu_v_p_vq
vstream_v_p_vd
vstream_v_p_vf
vsub64_vm_vm_vm
vsub_vd_3vd
vsub_vd_4vd
vsub_vd_5vd
vsub_vd_6vd
vsub_vd_vd_vd
vsub_vf_3vf
vsub_vf_4vf
vsub_vf_5vf
vsub_vf_vf_vf
vsub_vi2_vi2_vi2
vsub_vi_vi_vi
vsubadd_vd_vd_vd
vsubadd_vf_vf_vf
vtestallones_i_vo32
vtestallones_i_vo64
vtestallzeros_i_vo64
vtoward0_vd_vd
vtoward0_vf_vf
vtruncate_vd_vd
vtruncate2_vd_vd
vtruncate_vf_vf
vtruncate_vi2_vf
vtruncate_vi_vd
vtruncate_vm_vd
vugt64_vo_vm_vm
vuint64
vupper_vd_vd
vupper_vf_vf
vxor_vi2_vi2_vi2
vxor_vi_vi_vi
vxor_vm_vm_vm
vxor_vm_vo32_vm
vxor_vm_vo64_vm
vxor_vo_vo_vo
#
abs_tdx_tdx
abs_vd3_vd3
acos_tdx_tdx
acosh_tdx_tdx
add2_vd3_vd2_vd3
add2_vd3_vd3_vd3
add2_vd3_vd_vd3
add_tdx_tdx_tdx
add_vd3_vd2_vd3
add_vd3_vd_vd3
asin_tdx_tdx
asinh_tdx_tdx
atan2_tdx_tdx_tdx
atan_tdx_tdx
atanh_tdx_tdx
cast_tdx_d
cast_tdx_d_d_d
cast_tdx_vd
cast_tdx_vd3
cast_tdx_vq
cast_vd3_d3
cast_vd3_d_d_d
cast_vd3_tdx
cast_vd3_vd_vd_vd
cast_vd_tdx
cast_vq_tdx
cmp_vm_tdx_tdx
cmpcnv_vq_vq
cos_tdx_tdx
cosh_tdx_tdx
div2_vd3_vd3_vd3
div_tdx_tdx_tdx
div_vd3_vd3_vd3
eq_vo_tdx_tdx
exp10_tdx_tdx
exp10i
exp10tab
exp2_tdx_tdx
exp_tdx_tdx
expm1_tdx_tdx
fastcast_tdx_vd3
fastcast_tdx_vq
fastcast_vq_tdx
ge_vo_tdx_tdx
gt_vo_tdx_tdx
ilogb_vm_tdx
isinf_vo_vq
isint_vo_tdx
isminf_vo_vq
isnan_vo_tdx
isnan_vo_vq
isnonfinite_vo_vq
isnonfinite_vo_vq_vq
isnonfinite_vo_vq_vq_vq
isodd_vo_tdx
ispinf_vo_vq
iszero_vo_tdx
iszero_vo_vq
le_vo_tdx_tdx
log10_tdx_tdx
log1p_tdx_tdx
log2_tdx_tdx
log_tdx_tdx
logk_tdx_tdx
lt_vo_tdx_tdx
mla_vd3_vd3_vd3_vd3
modf_tdx_tdx_ptdx
mul2_vd3_vd3_vd3
mul_tdx_tdx_tdx
mul_vd3_vd2_vd2
mul_vd3_vd2_vd3
mul_vd3_vd3_vd
mul_vd3_vd3_vd2
mul_vd3_vd3_vd3
mulsign_tdx_tdx_vd
mulsign_vd3_vd3_vd
mulsign_vq_vq_vq
neg_tdx_tdx
neg_vd3_vd3
neq_vo_tdx_tdx
normalize_vd3_vd3
poly10dd
poly10dd_b
poly11dd
poly11dd_b
poly12dd
poly12dd_b
poly13dd
poly13dd_b
poly14dd
poly14dd_b
poly15dd
poly15dd_b
poly16dd
poly16dd_b
poly17dd
poly17dd_b
poly18dd
poly18dd_b
poly19dd
poly19dd_b
poly20dd
poly20dd_b
poly21dd
poly21dd_b
poly22dd
poly22dd_b
poly23dd
poly23dd_b
poly24dd
poly24dd_b
poly25dd
poly25dd_b
poly26dd
poly26dd_b
poly27dd
poly27dd_b
poly2d
poly2td
poly2td_b
poly3d
poly3dd
poly3dd_b
poly3td
poly3td_b
poly4d
poly4dd_b
poly4td
poly4td_b
poly5d
poly5dd
poly5dd_b
poly5td
poly5td_b
poly6d
poly6dd
poly6dd_b
poly6td
poly6td_b
poly7d
poly7dd
poly7dd_b
poly7td
poly7td_b
poly8d
poly8dd
poly8dd_b
poly8td
poly8td_b
poly9dd
poly9dd_b
pow_tdx_tdx_tdx
quickrenormalize_vd3_vd3
quicktwosum_vd2_vd_vd
rec_vd3_vd2
rec_vd3_vd3
rempio2q
scale_vd3_vd3_d
scale_vd3_vd3_vd
scaleadd2_vd3_vd3_vd3_vd
scalesub2_vd3_vd3_vd3_vd
sel_tdx_vo_tdx_tdx
sel_vd3_vo_vd3_vd3
signbit_vo_tdx
sin_tdx_tdx
sinh_tdx_tdx
slowcast_vq_tdx
snprintquad
snprintquadhex
sqrt_tdx_tdx
sqrt_vd3_vd3
squ_vd3_vd3
sub2_vd3_vd3_vd3
sub_tdx_tdx_tdx
tan_tdx_tdx
tanh_tdx_tdx
twoprod_vd2_vd_vd
twosub_vd2_vd_vd
twosubx_vd2_vd_vd_vd
twosum_vd2_vd_vd
twosumx_vd2_vd_vd_vd
vtruncate2_vd_vd
vfloor2_vd_vd
vceil2_vd_vd
vround2_vd_vd
isinf_vo_tdx
trunc_tdx_tdx
rint_tdx_tdx
fmod_tdx_tdx_tdx
remainder_tdx_tdx_tdx
cbrt_tdx_tdx
frexp_tdx_tdx_pvi
fma_tdx_tdx_tdx_tdx
hypot_tdx_tdx_tdx
ilogb_vi_tdx
ldexp_tdx_tdx_vi
Sleef_rempitabsp
Sleef_rempitabdp
Sleef_rempitabqp
vcastu_vm_vi
vcastu_vi_vm
rvv_sp_vopmask
rvv_dp_vopmask

View File

@@ -0,0 +1,50 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <signal.h>
#include <setjmp.h>
#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__)
static jmp_buf sigjmp;
#define SETJMP(x) setjmp(x)
#define LONGJMP longjmp
#else
static sigjmp_buf sigjmp;
#define SETJMP(x) sigsetjmp(x, 1)
#define LONGJMP siglongjmp
#endif
int main2(int argc, char **argv);
int check_feature(double, float);
static void sighandler(int signum) {
LONGJMP(sigjmp, 1);
}
int detectFeature() {
signal(SIGILL, sighandler);
if (SETJMP(sigjmp) == 0) {
int r = check_feature(1.0, 1.0f);
signal(SIGILL, SIG_DFL);
return r;
} else {
signal(SIGILL, SIG_DFL);
return 0;
}
}
int main(int argc, char **argv) {
if (!detectFeature()) {
printf("0\n");
fclose(stdout);
exit(0);
}
return main2(argc, argv);
}
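// Editor's note (illustrative addition): check_feature is expected to execute
// at least one instruction from the tested ISA extension; on a CPU without it,
// the resulting SIGILL is caught by the handler above and detectFeature
// reports 0 ("unsupported"). A hypothetical AVX probe, not part of this file:
//
//   #include <immintrin.h>
//   int check_feature(double d, float f) {
//     __m256d v = _mm256_set1_pd(d);       // executes an AVX instruction
//     return _mm256_cvtsd_f64(v) == d;
//   }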

View File

@@ -0,0 +1,332 @@
// Copyright Naoki Shibata and contributors 2010 - 2024.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
//
#ifndef __MISC_H__
#define __MISC_H__
#if !defined(SLEEF_GENHEADER)
#include <stdint.h>
#include <string.h>
#endif
#ifndef M_PI
#define M_PI 3.141592653589793238462643383279502884
#endif
#ifndef M_PIl
#define M_PIl 3.141592653589793238462643383279502884L
#endif
#ifndef M_1_PI
#define M_1_PI 0.318309886183790671537767526745028724
#endif
#ifndef M_1_PIl
#define M_1_PIl 0.318309886183790671537767526745028724L
#endif
#ifndef M_2_PI
#define M_2_PI 0.636619772367581343075535053490057448
#endif
#ifndef M_2_PIl
#define M_2_PIl 0.636619772367581343075535053490057448L
#endif
#if !defined(SLEEF_GENHEADER)
#ifndef SLEEF_FP_ILOGB0
#define SLEEF_FP_ILOGB0 ((int)0x80000000)
#endif
#ifndef SLEEF_FP_ILOGBNAN
#define SLEEF_FP_ILOGBNAN ((int)2147483647)
#endif
#endif
#define SLEEF_SNAN (((union { long long int i; double d; }) { .i = INT64_C(0x7ff0000000000001) }).d)
#define SLEEF_SNANf (((union { long int i; float f; }) { .i = 0xff800001 }).f)
#define SLEEF_FLT_MIN 0x1p-126
#define SLEEF_DBL_MIN 0x1p-1022
#define SLEEF_INT_MAX 2147483647
#define SLEEF_DBL_DENORM_MIN 4.9406564584124654e-324
#define SLEEF_FLT_DENORM_MIN 1.40129846e-45F
//
/*
PI_A to PI_D are constants that satisfy the following two conditions.
* For PI_A, PI_B and PI_C, the last 28 bits are zero.
* PI_A + PI_B + PI_C + PI_D is as close to PI as possible.
The argument of a trig function is multiplied by 1/PI, and the
integral part is divided into two parts, each of which has at most 28
bits. So the maximum argument that can be correctly reduced should be
2^(28*2-1) PI = 1.1e+17. However, due to the internal double-precision
calculation, the actual maximum argument that can be correctly reduced
is around 2^47.
*/
#define PI_A 3.1415926218032836914
#define PI_B 3.1786509424591713469e-08
#define PI_C 1.2246467864107188502e-16
#define PI_D 1.2736634327021899816e-24
#define TRIGRANGEMAX 1e+14
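// Editor's note (illustrative addition): a scalar sketch of the Cody-Waite
// style reduction these constants support, assuming <math.h>. For quotients q
// below 2^28 each product q*PI_x is exact because PI_A..PI_C end in 28 zero
// bits; larger quotients are split into two 28-bit parts as described above.
//
//   double q = nearbyint(x * M_1_PI);
//   double r = x - q * PI_A;
//   r = r - q * PI_B;
//   r = r - q * PI_C;
//   r = r - q * PI_D;                      // reduced argument in [-pi/2, pi/2]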
/*
PI_A2 and PI_B2 are constants that satisfy the following two conditions.
* The last 3 bits of PI_A2 are zero.
* PI_A2 + PI_B2 is as close to PI as possible.
The argument of a trig function is multiplied by 1/PI, and the
integral part is multiplied by PI_A2. So the maximum argument that
can be correctly reduced should be 2^(3-1) PI = 12.6. By testing, we
confirmed that it correctly reduces arguments up to around 15.
*/
#define PI_A2 3.141592653589793116
#define PI_B2 1.2246467991473532072e-16
#define TRIGRANGEMAX2 15
#define M_2_PI_H 0.63661977236758138243
#define M_2_PI_L -3.9357353350364971764e-17
#define SQRT_DBL_MAX 1.3407807929942596355e+154
#define TRIGRANGEMAX3 1e+9
#define M_4_PI 1.273239544735162542821171882678754627704620361328125
#define L2U .69314718055966295651160180568695068359375
#define L2L .28235290563031577122588448175013436025525412068e-12
#define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931
#define L10U 0.30102999566383914498 // log 2 / log 10
#define L10L 1.4205023227266099418e-13
#define LOG10_2 3.3219280948873623478703194294893901758648313930
#define L10Uf 0.3010253906f
#define L10Lf 4.605038981e-06f
//
#define PI_Af 3.140625f
#define PI_Bf 0.0009670257568359375f
#define PI_Cf 6.2771141529083251953e-07f
#define PI_Df 1.2154201256553420762e-10f
#define TRIGRANGEMAXf 39000
#define PI_A2f 3.1414794921875f
#define PI_B2f 0.00011315941810607910156f
#define PI_C2f 1.9841872589410058936e-09f
#define TRIGRANGEMAX2f 125.0f
#define TRIGRANGEMAX4f 8e+6f
#define SQRT_FLT_MAX 18446743523953729536.0
#define L2Uf 0.693145751953125f
#define L2Lf 1.428606765330187045e-06f
#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f
#ifndef M_PIf
# define M_PIf ((float)M_PI)
#endif
//
#ifndef MIN
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#endif
#ifndef MAX
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#endif
#ifndef ABS
#define ABS(x) ((x) < 0 ? -(x) : (x))
#endif
#define stringify(s) stringify_(s)
#define stringify_(s) #s
#if !defined(SLEEF_GENHEADER)
typedef long double longdouble;
#endif
#if !defined(Sleef_double2_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_double2_DEFINED
typedef struct {
double x, y;
} Sleef_double2;
#endif
#if !defined(Sleef_float2_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_float2_DEFINED
typedef struct {
float x, y;
} Sleef_float2;
#endif
#if !defined(Sleef_longdouble2_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_longdouble2_DEFINED
typedef struct {
long double x, y;
} Sleef_longdouble2;
#endif
#if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
#define LIKELY(condition) __builtin_expect(!!(condition), 1)
#define UNLIKELY(condition) __builtin_expect(!!(condition), 0)
#define RESTRICT __restrict__
#ifndef __arm__
#define ALIGNED(x) __attribute__((aligned(x)))
#else
#define ALIGNED(x)
#endif
#if defined(SLEEF_GENHEADER)
#define INLINE SLEEF_ALWAYS_INLINE
#define EXPORT SLEEF_INLINE
#define CONST SLEEF_CONST
#define NOEXPORT
#else // #if defined(SLEEF_GENHEADER)
#define CONST __attribute__((const))
#define INLINE __attribute__((always_inline))
#if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)
#ifndef SLEEF_STATIC_LIBS
#define EXPORT __stdcall __declspec(dllexport)
#define NOEXPORT
#else // #ifndef SLEEF_STATIC_LIBS
#define EXPORT
#define NOEXPORT
#endif // #ifndef SLEEF_STATIC_LIBS
#else // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)
#define EXPORT __attribute__((visibility("default")))
#define NOEXPORT __attribute__ ((visibility ("hidden")))
#endif // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)
#endif // #if defined(SLEEF_GENHEADER)
#define SLEEF_NAN __builtin_nan("")
#define SLEEF_NANf __builtin_nanf("")
#define SLEEF_NANl __builtin_nanl("")
#define SLEEF_INFINITY __builtin_inf()
#define SLEEF_INFINITYf __builtin_inff()
#define SLEEF_INFINITYl __builtin_infl()
#if defined(__INTEL_COMPILER) || defined (__clang__)
#define SLEEF_INFINITYq __builtin_inf()
#define SLEEF_NANq __builtin_nan("")
#else
#define SLEEF_INFINITYq __builtin_infq()
#define SLEEF_NANq (SLEEF_INFINITYq - SLEEF_INFINITYq)
#endif
#elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
#if defined(SLEEF_GENHEADER)
#define INLINE SLEEF_ALWAYS_INLINE
#define CONST SLEEF_CONST
#define EXPORT SLEEF_INLINE
#define NOEXPORT
#else // #if defined(SLEEF_GENHEADER)
#define INLINE __forceinline
#define CONST
#ifndef SLEEF_STATIC_LIBS
#define EXPORT __declspec(dllexport)
#define NOEXPORT
#else
#define EXPORT
#define NOEXPORT
#endif
#endif // #if defined(SLEEF_GENHEADER)
#define RESTRICT
#define ALIGNED(x)
#define LIKELY(condition) (condition)
#define UNLIKELY(condition) (condition)
#if (defined(__GNUC__) || defined(__CLANG__)) && (defined(__i386__) || defined(__x86_64__)) && !defined(SLEEF_GENHEADER)
#include <x86intrin.h>
#endif
#define SLEEF_INFINITY (1e+300 * 1e+300)
#define SLEEF_NAN (SLEEF_INFINITY - SLEEF_INFINITY)
#define SLEEF_INFINITYf ((float)SLEEF_INFINITY)
#define SLEEF_NANf ((float)SLEEF_NAN)
#define SLEEF_INFINITYl ((long double)SLEEF_INFINITY)
#define SLEEF_NANl ((long double)SLEEF_NAN)
#if (defined(_M_AMD64) || defined(_M_X64))
#ifndef __SSE2__
#define __SSE2__
#define __SSE3__
#define __SSE4_1__
#endif
#elif _M_IX86_FP == 2
#ifndef __SSE2__
#define __SSE2__
#define __SSE3__
#define __SSE4_1__
#endif
#elif _M_IX86_FP == 1
#ifndef __SSE__
#define __SSE__
#endif
#endif
#endif // #elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
#if !defined(__linux__)
#define isinff(x) ((x) == SLEEF_INFINITYf || (x) == -SLEEF_INFINITYf)
#define isinfl(x) ((x) == SLEEF_INFINITYl || (x) == -SLEEF_INFINITYl)
#define isnanf(x) ((x) != (x))
#define isnanl(x) ((x) != (x))
#endif
#endif // #ifndef __MISC_H__
#ifdef ENABLE_AAVPCS
#define VECTOR_CC __attribute__((aarch64_vector_pcs))
#else
#define VECTOR_CC
#endif
//
#if defined (__GNUC__) && !defined(__INTEL_COMPILER)
#pragma GCC diagnostic ignored "-Wpragmas"
#pragma GCC diagnostic ignored "-Wunknown-pragmas"
#if !defined (__clang__)
#pragma GCC diagnostic ignored "-Wattribute-alias"
#pragma GCC diagnostic ignored "-Wlto-type-mismatch"
#pragma GCC diagnostic ignored "-Wstringop-overflow"
#endif
#endif
#if defined(_MSC_VER)
#pragma warning(disable:4101) // warning C4101: 'v': unreferenced local variable
#pragma warning(disable:4116) // warning C4116: unnamed type definition in parentheses
#pragma warning(disable:4244) // warning C4244: 'function': conversion from 'vopmask' to '__mmask8', possible loss of data
#pragma warning(disable:4267) // warning C4267: 'initializing': conversion from 'size_t' to 'const int', possible loss of data
#pragma warning(disable:4305) // warning C4305: 'function': truncation from 'double' to 'float'
#endif

View File

@@ -0,0 +1,99 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#if !defined(SLEEF_GENHEADER)
#if (defined(__SIZEOF_FLOAT128__) && __SIZEOF_FLOAT128__ == 16) || (defined(__linux__) && defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || (defined(__PPC64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 8)
#define SLEEF_FLOAT128_IS_IEEEQP
#endif
#if !defined(SLEEF_FLOAT128_IS_IEEEQP) && defined(__SIZEOF_LONG_DOUBLE__) && __SIZEOF_LONG_DOUBLE__ == 16 && (defined(__aarch64__) || defined(__zarch__))
#define SLEEF_LONGDOUBLE_IS_IEEEQP
#endif
#if !defined(Sleef_quad_DEFINED)
#define Sleef_quad_DEFINED
typedef struct { uint64_t x, y; } Sleef_uint64_2t;
#if defined(SLEEF_FLOAT128_IS_IEEEQP) || defined(ENABLEFLOAT128)
typedef __float128 Sleef_quad;
#define SLEEF_QUAD_C(x) (x ## Q)
#elif defined(SLEEF_LONGDOUBLE_IS_IEEEQP)
typedef long double Sleef_quad;
#define SLEEF_QUAD_C(x) (x ## L)
#else
typedef Sleef_uint64_2t Sleef_quad;
#endif
#endif
#if !defined(Sleef_quad1_DEFINED)
#define Sleef_quad1_DEFINED
typedef union {
struct {
Sleef_quad x;
};
Sleef_quad s[1];
} Sleef_quad1;
#endif
#if !defined(Sleef_quad2_DEFINED)
#define Sleef_quad2_DEFINED
typedef union {
struct {
Sleef_quad x, y;
};
Sleef_quad s[2];
} Sleef_quad2;
#endif
#if !defined(Sleef_quad4_DEFINED)
#define Sleef_quad4_DEFINED
typedef union {
struct {
Sleef_quad x, y, z, w;
};
Sleef_quad s[4];
} Sleef_quad4;
#endif
#if !defined(Sleef_quad8_DEFINED)
#define Sleef_quad8_DEFINED
typedef union {
Sleef_quad s[8];
} Sleef_quad8;
#endif
#if defined(__ARM_FEATURE_SVE) && !defined(Sleef_quadx_DEFINED)
#define Sleef_quadx_DEFINED
typedef union {
Sleef_quad s[32];
} Sleef_quadx;
#endif
#else // #if !defined(SLEEF_GENHEADER)
SLEEFSHARPif !defined(SLEEFXXX__NVCC__) && ((defined(SLEEFXXX__SIZEOF_FLOAT128__) && SLEEFXXX__SIZEOF_FLOAT128__ == 16) || (defined(SLEEFXXX__linux__) && defined(SLEEFXXX__GNUC__) && (defined(SLEEFXXX__i386__) || defined(SLEEFXXX__x86_64__))) || (defined(SLEEFXXX__PPC64__) && defined(SLEEFXXX__GNUC__) && !defined(SLEEFXXX__clang__) && SLEEFXXX__GNUC__ >= 8))
SLEEFSHARPdefine SLEEFXXXSLEEF_FLOAT128_IS_IEEEQP
SLEEFSHARPendif
SLEEFSHARPif !defined(SLEEFXXXSLEEF_FLOAT128_IS_IEEEQP) && !defined(SLEEFXXX__NVCC__) && defined(SLEEFXXX__SIZEOF_LONG_DOUBLE__) && SLEEFXXX__SIZEOF_LONG_DOUBLE__ == 16 && (defined(SLEEFXXX__aarch64__) || defined(SLEEFXXX__zarch__))
SLEEFSHARPdefine SLEEFXXXSLEEF_LONGDOUBLE_IS_IEEEQP
SLEEFSHARPendif
SLEEFSHARPif !defined(SLEEFXXXSleef_quad_DEFINED)
SLEEFSHARPdefine SLEEFXXXSleef_quad_DEFINED
typedef struct { uint64_t x, y; } Sleef_uint64_2t;
SLEEFSHARPif defined(SLEEFXXXSLEEF_FLOAT128_IS_IEEEQP)
typedef __float128 Sleef_quad;
SLEEFSHARPdefine SLEEFXXXSLEEF_QUAD_C(x) (x ## Q)
SLEEFSHARPelif defined(SLEEFXXXSLEEF_LONGDOUBLE_IS_IEEEQP)
typedef long double Sleef_quad;
SLEEFSHARPdefine SLEEFXXXSLEEF_QUAD_C(x) (x ## L)
SLEEFSHARPelse
typedef Sleef_uint64_2t Sleef_quad;
SLEEFSHARPendif
SLEEFSHARPendif
#endif // #if !defined(SLEEF_GENHEADER)
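// Editor's note (illustrative addition): SLEEF_QUAD_C attaches the literal
// suffix matching whichever representation was selected above, e.g. a
// hypothetical constant
//
//   static const Sleef_quad Q_PI = SLEEF_QUAD_C(3.141592653589793238462643383279503);
//
// expands with a Q suffix for __float128 and an L suffix for an IEEE-quad
// long double; the macro is not defined for the two-word fallback type.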

View File

@@ -0,0 +1,201 @@
# Compiler properties
set(CMAKE_C_FLAGS "${ORG_CMAKE_C_FLAGS} ${DFT_C_FLAGS}")
set(COMMON_TARGET_PROPERTIES
C_STANDARD 99 # -std=gnu99
)
#
function(add_test_dft TESTNAME)
if (ARMIE_COMMAND)
add_test(NAME ${TESTNAME} COMMAND ${ARMIE_COMMAND} -msve-vector-bits=${SVE_VECTOR_BITS} ${ARGN})
elseif (NOT EMULATOR AND NOT SDE_COMMAND)
add_test(NAME ${TESTNAME} COMMAND ${ARGN})
elseif(NOT EMULATOR)
add_test(NAME ${TESTNAME} COMMAND ${SDE_COMMAND} "--" ${ARGN})
else()
add_test(NAME ${TESTNAME} COMMAND ${EMULATOR} ${ARGN})
endif()
set_tests_properties(${TESTNAME} PROPERTIES COST 0.1)
endfunction()
# Include directories
include_directories(${PROJECT_SOURCE_DIR}/include) # sleefdft.h
include_directories(${sleef_BINARY_DIR}/include) # sleef.h
if (FFTW3_INCLUDE_DIR)
include_directories(${FFTW3_INCLUDE_DIR}) # fftw3.h
endif()
# Link directories
link_directories(${sleef_BINARY_DIR}/lib) # libsleef, libsleefdft
# Link libraries
set(COMMON_LINK_LIBRARIES ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
if (COMPILER_SUPPORTS_OPENMP)
set(COMMON_LINK_LIBRARIES ${COMMON_LINK_LIBRARIES} ${OpenMP_C_FLAGS})
endif()
if((NOT MSVC) AND NOT SLEEF_CLANG_ON_WINDOWS)
# Target executable naivetestdp
set(TARGET_NAIVETESTDP "naivetestdp")
add_executable(${TARGET_NAIVETESTDP} naivetest.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
add_dependencies(${TARGET_NAIVETESTDP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
target_compile_definitions(${TARGET_NAIVETESTDP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=1)
target_link_libraries(${TARGET_NAIVETESTDP} ${COMMON_LINK_LIBRARIES})
set_target_properties(${TARGET_NAIVETESTDP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
# Target executable naivetestsp
set(TARGET_NAIVETESTSP "naivetestsp")
add_executable(${TARGET_NAIVETESTSP} naivetest.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
add_dependencies(${TARGET_NAIVETESTSP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
target_compile_definitions(${TARGET_NAIVETESTSP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=2)
target_link_libraries(${TARGET_NAIVETESTSP} ${COMMON_LINK_LIBRARIES})
set_target_properties(${TARGET_NAIVETESTSP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
# Test naivetestdp
add_test_dft(${TARGET_NAIVETESTDP}_1 $<TARGET_FILE:${TARGET_NAIVETESTDP}> 1)
add_test_dft(${TARGET_NAIVETESTDP}_2 $<TARGET_FILE:${TARGET_NAIVETESTDP}> 2)
add_test_dft(${TARGET_NAIVETESTDP}_3 $<TARGET_FILE:${TARGET_NAIVETESTDP}> 3)
add_test_dft(${TARGET_NAIVETESTDP}_4 $<TARGET_FILE:${TARGET_NAIVETESTDP}> 4)
add_test_dft(${TARGET_NAIVETESTDP}_5 $<TARGET_FILE:${TARGET_NAIVETESTDP}> 5)
add_test_dft(${TARGET_NAIVETESTDP}_10 $<TARGET_FILE:${TARGET_NAIVETESTDP}> 10)
# Test naivetestsp
add_test_dft(${TARGET_NAIVETESTSP}_1 $<TARGET_FILE:${TARGET_NAIVETESTSP}> 1)
add_test_dft(${TARGET_NAIVETESTSP}_2 $<TARGET_FILE:${TARGET_NAIVETESTSP}> 2)
add_test_dft(${TARGET_NAIVETESTSP}_3 $<TARGET_FILE:${TARGET_NAIVETESTSP}> 3)
add_test_dft(${TARGET_NAIVETESTSP}_4 $<TARGET_FILE:${TARGET_NAIVETESTSP}> 4)
add_test_dft(${TARGET_NAIVETESTSP}_5 $<TARGET_FILE:${TARGET_NAIVETESTSP}> 5)
add_test_dft(${TARGET_NAIVETESTSP}_10 $<TARGET_FILE:${TARGET_NAIVETESTSP}> 10)
endif()
# Target executable roundtriptest1ddp
set(TARGET_ROUNDTRIPTEST1DDP "roundtriptest1ddp")
add_executable(${TARGET_ROUNDTRIPTEST1DDP} roundtriptest1d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
add_dependencies(${TARGET_ROUNDTRIPTEST1DDP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
target_compile_definitions(${TARGET_ROUNDTRIPTEST1DDP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=1)
target_link_libraries(${TARGET_ROUNDTRIPTEST1DDP} ${COMMON_LINK_LIBRARIES})
set_target_properties(${TARGET_ROUNDTRIPTEST1DDP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
# Target executable roundtriptest1dsp
set(TARGET_ROUNDTRIPTEST1DSP "roundtriptest1dsp")
add_executable(${TARGET_ROUNDTRIPTEST1DSP} roundtriptest1d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
add_dependencies(${TARGET_ROUNDTRIPTEST1DSP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
target_compile_definitions(${TARGET_ROUNDTRIPTEST1DSP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=2)
target_link_libraries(${TARGET_ROUNDTRIPTEST1DSP} ${COMMON_LINK_LIBRARIES})
set_target_properties(${TARGET_ROUNDTRIPTEST1DSP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
# Target executable roundtriptest2ddp
set(TARGET_ROUNDTRIPTEST2DDP "roundtriptest2ddp")
add_executable(${TARGET_ROUNDTRIPTEST2DDP} roundtriptest2d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
add_dependencies(${TARGET_ROUNDTRIPTEST2DDP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
target_compile_definitions(${TARGET_ROUNDTRIPTEST2DDP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=1)
target_link_libraries(${TARGET_ROUNDTRIPTEST2DDP} ${COMMON_LINK_LIBRARIES})
set_target_properties(${TARGET_ROUNDTRIPTEST2DDP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
# Target executable roundtriptest2dsp
set(TARGET_ROUNDTRIPTEST2DSP "roundtriptest2dsp")
add_executable(${TARGET_ROUNDTRIPTEST2DSP} roundtriptest2d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
add_dependencies(${TARGET_ROUNDTRIPTEST2DSP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
target_compile_definitions(${TARGET_ROUNDTRIPTEST2DSP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=2)
target_link_libraries(${TARGET_ROUNDTRIPTEST2DSP} ${COMMON_LINK_LIBRARIES})
set_target_properties(${TARGET_ROUNDTRIPTEST2DSP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
if (LIBFFTW3 AND NOT SLEEF_DISABLE_FFTW)
# Target executable fftwtest1ddp
set(TARGET_FFTWTEST1DDP "fftwtest1ddp")
add_executable(${TARGET_FFTWTEST1DDP} fftwtest1d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
add_dependencies(${TARGET_FFTWTEST1DDP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
target_compile_definitions(${TARGET_FFTWTEST1DDP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=1)
target_link_libraries(${TARGET_FFTWTEST1DDP} ${COMMON_LINK_LIBRARIES} ${LIBFFTW3})
set_target_properties(${TARGET_FFTWTEST1DDP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
# Target executable fftwtest1dsp
set(TARGET_FFTWTEST1DSP "fftwtest1dsp")
add_executable(${TARGET_FFTWTEST1DSP} fftwtest1d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
add_dependencies(${TARGET_FFTWTEST1DSP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
target_compile_definitions(${TARGET_FFTWTEST1DSP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=2)
target_link_libraries(${TARGET_FFTWTEST1DSP} ${COMMON_LINK_LIBRARIES} ${LIBFFTW3})
set_target_properties(${TARGET_FFTWTEST1DSP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
# Target executable fftwtest2ddp
set(TARGET_FFTWTEST2DDP "fftwtest2ddp")
add_executable(${TARGET_FFTWTEST2DDP} fftwtest2d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
add_dependencies(${TARGET_FFTWTEST2DDP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
target_compile_definitions(${TARGET_FFTWTEST2DDP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=1)
target_link_libraries(${TARGET_FFTWTEST2DDP} ${COMMON_LINK_LIBRARIES} ${LIBFFTW3})
set_target_properties(${TARGET_FFTWTEST2DDP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
# Target executable fftwtest2dsp
set(TARGET_FFTWTEST2DSP "fftwtest2dsp")
add_executable(${TARGET_FFTWTEST2DSP} fftwtest2d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
add_dependencies(${TARGET_FFTWTEST2DSP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
target_compile_definitions(${TARGET_FFTWTEST2DSP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=2)
target_link_libraries(${TARGET_FFTWTEST2DSP} ${COMMON_LINK_LIBRARIES} ${LIBFFTW3})
set_target_properties(${TARGET_FFTWTEST2DSP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
# Test fftwtest1ddp
add_test_dft(${TARGET_FFTWTEST1DDP}_12 $<TARGET_FILE:${TARGET_FFTWTEST1DDP}> 12)
add_test_dft(${TARGET_FFTWTEST1DDP}_16 $<TARGET_FILE:${TARGET_FFTWTEST1DDP}> 16)
# Test fftwtest1dsp
add_test_dft(${TARGET_FFTWTEST1DSP}_12 $<TARGET_FILE:${TARGET_FFTWTEST1DSP}> 12)
add_test_dft(${TARGET_FFTWTEST1DSP}_16 $<TARGET_FILE:${TARGET_FFTWTEST1DSP}> 16)
# Test fftwtest2ddp
add_test_dft(${TARGET_FFTWTEST2DDP}_2_2 $<TARGET_FILE:${TARGET_FFTWTEST2DDP}> 2 2)
add_test_dft(${TARGET_FFTWTEST2DDP}_4_4 $<TARGET_FILE:${TARGET_FFTWTEST2DDP}> 4 4)
add_test_dft(${TARGET_FFTWTEST2DDP}_8_8 $<TARGET_FILE:${TARGET_FFTWTEST2DDP}> 8 8)
add_test_dft(${TARGET_FFTWTEST2DDP}_10_10 $<TARGET_FILE:${TARGET_FFTWTEST2DDP}> 10 10)
add_test_dft(${TARGET_FFTWTEST2DDP}_5_15 $<TARGET_FILE:${TARGET_FFTWTEST2DDP}> 5 15)
# Test fftwtest2dsp
add_test_dft(${TARGET_FFTWTEST2DSP}_2_2 $<TARGET_FILE:${TARGET_FFTWTEST2DSP}> 2 2)
add_test_dft(${TARGET_FFTWTEST2DSP}_4_4 $<TARGET_FILE:${TARGET_FFTWTEST2DSP}> 4 4)
add_test_dft(${TARGET_FFTWTEST2DSP}_8_8 $<TARGET_FILE:${TARGET_FFTWTEST2DSP}> 8 8)
add_test_dft(${TARGET_FFTWTEST2DSP}_10_10 $<TARGET_FILE:${TARGET_FFTWTEST2DSP}> 10 10)
add_test_dft(${TARGET_FFTWTEST2DSP}_5_15 $<TARGET_FILE:${TARGET_FFTWTEST2DSP}> 5 15)
else(LIBFFTW3 AND NOT SLEEF_DISABLE_FFTW)
if(MSVC OR SLEEF_CLANG_ON_WINDOWS)
# Test roundtriptest1ddp
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_1 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 1 10)
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_2 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 2 10)
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_3 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 3 10)
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_4 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 4 10)
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_5 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 5 10)
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_10 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 10 10)
# Test roundtriptest1dsp
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_1 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 1 10)
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_2 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 2 10)
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_3 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 3 10)
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_4 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 4 10)
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_5 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 5 10)
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_10 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 10 10)
endif()
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_12 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 12 10)
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_16 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 16 10)
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_12 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 12 10)
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_16 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 16 10)
# Test roundtriptest2ddp
add_test_dft(${TARGET_ROUNDTRIPTEST2DDP}_2_2 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DDP}> 2 2 10)
add_test_dft(${TARGET_ROUNDTRIPTEST2DDP}_4_4 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DDP}> 4 4 10)
add_test_dft(${TARGET_ROUNDTRIPTEST2DDP}_8_8 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DDP}> 8 8 10)
add_test_dft(${TARGET_ROUNDTRIPTEST2DDP}_10_10 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DDP}> 10 10 2)
add_test_dft(${TARGET_ROUNDTRIPTEST2DDP}_5_15 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DDP}> 5 15 2)
# Test roundtriptest2dsp
add_test_dft(${TARGET_ROUNDTRIPTEST2DSP}_2_2 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DSP}> 2 2 10)
add_test_dft(${TARGET_ROUNDTRIPTEST2DSP}_4_4 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DSP}> 4 4 10)
add_test_dft(${TARGET_ROUNDTRIPTEST2DSP}_8_8 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DSP}> 8 8 10)
add_test_dft(${TARGET_ROUNDTRIPTEST2DSP}_10_10 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DSP}> 10 10 2)
add_test_dft(${TARGET_ROUNDTRIPTEST2DSP}_5_15 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DSP}> 5 15 2)
endif(LIBFFTW3 AND NOT SLEEF_DISABLE_FFTW)

View File

@@ -0,0 +1,116 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#define _DEFAULT_SOURCE
#define _XOPEN_SOURCE 700
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <math.h>
#include <complex.h>
#include <time.h>
#include <unistd.h>
#include <sys/time.h>
#ifdef USEFFTW
#include <fftw3.h>
#include <omp.h>
#else
#include "sleef.h"
#include "sleefdft.h"
#endif
typedef double real;
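// Monotonic clock in nanoseconds, used for the timing loops below.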
static uint64_t gettime() {
struct timespec tp;
clock_gettime(CLOCK_MONOTONIC, &tp);
return (uint64_t)tp.tv_sec * 1000000000 + ((uint64_t)tp.tv_nsec);
}
#define REPEAT 8
int main(int argc, char **argv) {
if (argc == 1) {
fprintf(stderr, "%s <log2n>\n", argv[0]);
exit(-1);
}
int backward = 0;
int log2n = atoi(argv[1]);
if (log2n < 0) {
backward = 1;
log2n = -log2n;
}
const int n = 1 << log2n;
const int64_t niter = (int64_t)(100000000000.0 / n / log2n);
printf("Number of iterations = %lld\n", (long long int)niter);
#ifdef USEFFTW
fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
#if 0
int fftw_init_threads(void);
fftw_plan_with_nthreads(omp_get_max_threads());
#endif
fftw_plan w = fftw_plan_dft_1d(n, in, out, backward ? FFTW_BACKWARD : FFTW_FORWARD, FFTW_MEASURE);
//fftw_plan w = fftw_plan_dft_1d(n, in, out, backward ? FFTW_BACKWARD : FFTW_FORWARD, FFTW_PATIENT);
for(int i=0;i<n;i++) {
in[i] = (2.0 * (rand() / (double)RAND_MAX) - 1) + (2.0 * (rand() / (double)RAND_MAX) - 1) * _Complex_I;
}
for(int64_t i=0;i<niter/2;i++) fftw_execute(w);
#else
SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET);
real *in = (real *)Sleef_malloc(n*2 * sizeof(real));
real *out = (real *)Sleef_malloc(n*2 * sizeof(real));
int mode = SLEEF_MODE_MEASURE | SLEEF_MODE_VERBOSE; // | SLEEF_MODE_NO_MT;
if (argc >= 3) mode = SLEEF_MODE_VERBOSE | SLEEF_MODE_ESTIMATE;
if (backward) mode |= SLEEF_MODE_BACKWARD;
struct SleefDFT *p = SleefDFT_double_init1d(n, in, out, mode);
if (argc >= 3) SleefDFT_setPath(p, argv[2]);
for(int i=0;i<n*2;i++) {
in[i] = (2.0 * (rand() / (double)RAND_MAX) - 1);
}
for(int64_t i=0;i<niter/2;i++) SleefDFT_double_execute(p, in, out);
#endif
for(int rep=0;rep<REPEAT;rep++) {
uint64_t tm0 = gettime();
for(int64_t i=0;i<niter;i++) {
#ifdef USEFFTW
fftw_execute(w);
#else
SleefDFT_double_execute(p, in, out);
#endif
}
uint64_t tm1 = gettime();
printf("Actual time = %g ns\n", (double)(tm1 - tm0) / niter);
double timeus = (tm1 - tm0) / ((double)niter * 1000);
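// Conventional 5*n*log2(n) flop estimate for a complex FFT of length n;
// timeus is the per-transform time in microseconds.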
double mflops = 5 * n * log2n / timeus;
printf("%g Mflops\n", mflops);
}
//
exit(0);
}

View File

@@ -0,0 +1,230 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <math.h>
#include <complex.h>
#include "sleef.h"
#include "sleefdft.h"
#include <fftw3.h>
#ifndef MODE
#define MODE SLEEF_MODE_DEBUG
#endif
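// BASETYPEID selects the precision under test; THRES bounds the relative
// squared error rmsn/rmsd measured against FFTW below.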
#if BASETYPEID == 1
#define THRES 1e-30
#define SleefDFT_init1d SleefDFT_double_init1d
#define SleefDFT_execute SleefDFT_double_execute
typedef double real;
#elif BASETYPEID == 2
#define THRES 1e-13
#define SleefDFT_init1d SleefDFT_float_init1d
#define SleefDFT_execute SleefDFT_float_execute
typedef float real;
#else
#error BASETYPEID not set
#endif
static double squ(double x) { return x * x; }
// complex forward
double check_cf(int n) {
fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
fftw_plan w = fftw_plan_dft_1d(n, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
real *sx = (real *)Sleef_malloc(n*2*sizeof(real));
real *sy = (real *)Sleef_malloc(n*2*sizeof(real));
struct SleefDFT *p = SleefDFT_init1d(n, sx, sy, MODE);
for(int i=0;i<n;i++) {
real re = (2.0 * random() - 1) / (real)RAND_MAX;
real im = (2.0 * random() - 1) / (real)RAND_MAX;
sx[(i*2+0)] = re;
sx[(i*2+1)] = im;
in[i] = re + im * _Complex_I;
}
SleefDFT_execute(p, NULL, NULL);
fftw_execute(w);
double rmsn = 0, rmsd = 0;
for(int i=0;i<n;i++) {
rmsn += squ(sy[i*2+0] - creal(out[i])) + squ(sy[i*2+1] - cimag(out[i]));
rmsd += squ( creal(out[i])) + squ( cimag(out[i]));
}
fftw_destroy_plan(w);
fftw_free(in);
fftw_free(out);
Sleef_free(sx);
Sleef_free(sy);
SleefDFT_dispose(p);
return rmsn / rmsd;
}
// complex backward
double check_cb(int n) {
fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
fftw_plan w = fftw_plan_dft_1d(n, in, out, FFTW_BACKWARD, FFTW_ESTIMATE);
real *sx = (real *)Sleef_malloc(n*2*sizeof(real));
real *sy = (real *)Sleef_malloc(n*2*sizeof(real));
struct SleefDFT *p = SleefDFT_init1d(n, sx, sy, SLEEF_MODE_BACKWARD | MODE);
for(int i=0;i<n;i++) {
real re = (2.0 * random() - 1) / (real)RAND_MAX;
real im = (2.0 * random() - 1) / (real)RAND_MAX;
sx[(i*2+0)] = re;
sx[(i*2+1)] = im;
in[i] = re + im * _Complex_I;
}
SleefDFT_execute(p, NULL, NULL);
fftw_execute(w);
double rmsn = 0, rmsd = 0;
for(int i=0;i<n;i++) {
rmsn += squ(sy[i*2+0] - creal(out[i])) + squ(sy[i*2+1] - cimag(out[i]));
rmsd += squ( creal(out[i])) + squ( cimag(out[i]));
}
fftw_destroy_plan(w);
fftw_free(in);
fftw_free(out);
Sleef_free(sx);
Sleef_free(sy);
SleefDFT_dispose(p);
return rmsn / rmsd;
}
// real forward
double check_rf(int n) {
double *in = (double *) fftw_malloc(sizeof(double) * n);
fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * (n/2+1));
fftw_plan w = fftw_plan_dft_r2c_1d(n, in, out, FFTW_ESTIMATE);
real *sx = (real *)Sleef_malloc(n*sizeof(real));
real *sy = (real *)Sleef_malloc((n/2+1)*sizeof(real)*2);
struct SleefDFT *p = SleefDFT_init1d(n, sx, sy, SLEEF_MODE_REAL | MODE);
for(int i=0;i<n;i++) {
real re = (2.0 * random() - 1) / (real)RAND_MAX;
sx[i] = re;
in[i] = re;
}
SleefDFT_execute(p, NULL, NULL);
fftw_execute(w);
double rmsn = 0, rmsd = 0;
for(int i=0;i<n/2+1;i++) {
rmsn += squ(sy[i*2+0] - creal(out[i])) + squ(sy[i*2+1] - cimag(out[i]));
rmsd += squ( creal(out[i])) + squ( cimag(out[i]));
}
fftw_destroy_plan(w);
fftw_free(in);
fftw_free(out);
Sleef_free(sx);
Sleef_free(sy);
SleefDFT_dispose(p);
return rmsn / rmsd;
}
// real backward
double check_rb(int n) {
fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * (n/2+1));
double *out = (double *) fftw_malloc(sizeof(double) * n);
fftw_plan w = fftw_plan_dft_c2r_1d(n, in, out, FFTW_ESTIMATE);
real *sx = (real *)Sleef_malloc((n/2+1) * sizeof(real)*2);
real *sy = (real *)Sleef_malloc(sizeof(real)*n);
struct SleefDFT *p = SleefDFT_init1d(n, sx, sy, SLEEF_MODE_REAL | SLEEF_MODE_BACKWARD | MODE);
for(int i=0;i<n/2;i++) {
if (i == 0) {
in[0 ] = (2.0 * (rand() / (real)RAND_MAX) - 1);
in[n/2] = (2.0 * (rand() / (real)RAND_MAX) - 1);
} else {
in[i ] = (2.0 * (rand() / (real)RAND_MAX) - 1) + (2.0 * (rand() / (real)RAND_MAX) - 1) * _Complex_I;
}
}
for(int i=0;i<n/2+1;i++) {
sx[2*i+0] = creal(in[i]);
sx[2*i+1] = cimag(in[i]);
}
SleefDFT_execute(p, NULL, NULL);
fftw_execute(w);
double rmsn = 0, rmsd = 0;
for(int i=0;i<n;i++) {
rmsn += squ(sy[i] - out[i]);
rmsd += squ( out[i]);
}
fftw_destroy_plan(w);
fftw_free(in);
fftw_free(out);
Sleef_free(sx);
Sleef_free(sy);
SleefDFT_dispose(p);
return rmsn / rmsd;
}
int main(int argc, char **argv) {
if (argc != 2) {
fprintf(stderr, "%s <log2n>\n", argv[0]);
exit(-1);
}
const int n = 1 << atoi(argv[1]);
srand((unsigned int)time(NULL));
SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET | SLEEF_PLAN_READONLY);
//
int success = 1;
double e;
e = check_cf(n);
success = success && e < THRES;
printf("complex forward : %s (%g)\n", e < THRES ? "OK" : "NG", e);
e = check_cb(n);
success = success && e < THRES;
printf("complex backward : %s (%g)\n", e < THRES ? "OK" : "NG", e);
e = check_rf(n);
success = success && e < THRES;
printf("real forward : %s (%g)\n", e < THRES ? "OK" : "NG", e);
e = check_rb(n);
success = success && e < THRES;
printf("real backward : %s (%g)\n", e < THRES ? "OK" : "NG", e);
exit(success ? 0 : -1);
}

View File

@@ -0,0 +1,143 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <math.h>
#include <complex.h>
#include "sleef.h"
#include "sleefdft.h"
#include <fftw3.h>
#ifndef MODE
#define MODE SLEEF_MODE_DEBUG
#endif
#if BASETYPEID == 1
#define THRES 1e-30
#define SleefDFT_init2d SleefDFT_double_init2d
#define SleefDFT_execute SleefDFT_double_execute
typedef double real;
#elif BASETYPEID == 2
#define THRES 1e-13
#define SleefDFT_init2d SleefDFT_float_init2d
#define SleefDFT_execute SleefDFT_float_execute
typedef float real;
#else
#error BASETYPEID not set
#endif
static double squ(double x) { return x * x; }
// complex forward
double check_cf(int n, int m) {
fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n * m);
fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n * m);
fftw_plan w = fftw_plan_dft_2d(n, m, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
real *sx = (real *)Sleef_malloc(n*m*2*sizeof(real));
real *sy = (real *)Sleef_malloc(n*m*2*sizeof(real));
struct SleefDFT *p = SleefDFT_init2d(n, m, sx, sy, MODE);
for(int i=0;i<n*m;i++) {
double re = (2.0 * random() - 1) / (double)RAND_MAX;
double im = (2.0 * random() - 1) / (double)RAND_MAX;
sx[(i*2+0)] = re;
sx[(i*2+1)] = im;
in[i] = re + im * _Complex_I;
}
SleefDFT_execute(p, NULL, NULL);
fftw_execute(w);
double rmsn = 0, rmsd = 0;
for(int i=0;i<n*m;i++) {
rmsn += squ(sy[i*2+0] - creal(out[i])) + squ(sy[i*2+1] - cimag(out[i]));
rmsd += squ( creal(out[i])) + squ( cimag(out[i]));
}
fftw_destroy_plan(w);
fftw_free(in);
fftw_free(out);
Sleef_free(sx);
Sleef_free(sy);
SleefDFT_dispose(p);
return rmsn / rmsd;
}
// complex backward
double check_cb(int n, int m) {
fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n * m);
fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n * m);
fftw_plan w = fftw_plan_dft_2d(n, m, in, out, FFTW_BACKWARD, FFTW_ESTIMATE);
real *sx = (real *)Sleef_malloc(n*m*2*sizeof(real));
real *sy = (real *)Sleef_malloc(n*m*2*sizeof(real));
struct SleefDFT *p = SleefDFT_init2d(n, m, sx, sy, SLEEF_MODE_BACKWARD | MODE);
for(int i=0;i<n*m;i++) {
double re = (2.0 * random() - 1) / (double)RAND_MAX;
double im = (2.0 * random() - 1) / (double)RAND_MAX;
sx[(i*2+0)] = re;
sx[(i*2+1)] = im;
in[i] = re + im * _Complex_I;
}
SleefDFT_execute(p, NULL, NULL);
fftw_execute(w);
double rmsn = 0, rmsd = 0;
for(int i=0;i<n*m;i++) {
rmsn += squ(sy[i*2+0] - creal(out[i])) + squ(sy[i*2+1] - cimag(out[i]));
rmsd += squ( creal(out[i])) + squ( cimag(out[i]));
}
fftw_destroy_plan(w);
fftw_free(in);
fftw_free(out);
Sleef_free(sx);
Sleef_free(sy);
SleefDFT_dispose(p);
return rmsn / rmsd;
}
int main(int argc, char **argv) {
if (argc != 3) {
fprintf(stderr, "%s <log2n> <log2m>\n", argv[0]);
exit(-1);
}
const int n = 1 << atoi(argv[1]);
const int m = 1 << atoi(argv[2]);
srand((unsigned int)time(NULL));
SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET | SLEEF_PLAN_READONLY);
//
int success = 1;
double e;
e = check_cf(n, m);
success = success && e < THRES;
printf("complex forward : %s (%g)\n", e < THRES ? "OK" : "NG", e);
e = check_cb(n, m);
success = success && e < THRES;
printf("complex backward : %s (%g)\n", e < THRES ? "OK" : "NG", e);
exit(success ? 0 : -1);
}

View File

@@ -0,0 +1,175 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#define _DEFAULT_SOURCE
#define _XOPEN_SOURCE 700
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <time.h>
#include <unistd.h>
#include <sys/time.h>
#include <math.h>
#include <complex.h>
#include "sleef.h"
#include "sleefdft.h"
static uint64_t gettime() {
struct timespec tp;
clock_gettime(CLOCK_MONOTONIC, &tp);
return (uint64_t)tp.tv_sec * 1000000000 + ((uint64_t)tp.tv_nsec);
}
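// Each transform size is benchmarked twice per precision: single-threaded
// (SLEEF_MODE_NO_MT) and multi-threaded.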
int mode[] = { SLEEF_MODE_MEASURE | SLEEF_MODE_NO_MT, SLEEF_MODE_MEASURE};
#define ENABLE_SP
//#define ROUNDTRIP
#define REPEAT 2
//#define ENABLE_SLEEP
//#define WARMUP
int main(int argc, char **argv) {
int start = 1, end = 18;
if (argc > 1) start = atoi(argv[1]);
if (argc > 2) end = atoi(argv[2]);
if (end > 18) end = 18; // the buffers below are sized for log2n <= 18
double *din = (double *)Sleef_malloc((1 << 18)*2 * sizeof(double));
double *dout = (double *)Sleef_malloc((1 << 18)*2 * sizeof(double));
float *sin = (float *)Sleef_malloc((1 << 18)*2 * sizeof(float));
float *sout = (float *)Sleef_malloc((1 << 18)*2 * sizeof(float));
SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET | SLEEF_PLAN_READONLY);
for(int log2n=start;log2n<=end;log2n++) {
const int n = 1 << log2n;
int64_t niter = (int64_t)(1000000000.0 / REPEAT / n / log2n);
printf("%d ", n);
for(int m=0;m<2;m++) {
#ifdef ENABLE_SLEEP
sleep(1);
#endif
struct SleefDFT *pf = SleefDFT_double_init1d(n, NULL, NULL, mode[m]);
#ifdef ROUNDTRIP
struct SleefDFT *pb = SleefDFT_double_init1d(n, NULL, NULL, mode[m] | SLEEF_MODE_BACKWARD);
#endif
for(int i=0;i<n*2;i++) {
din[i] = 0;
}
#ifdef ENABLE_SLEEP
sleep(1);
#endif
#ifdef WARMUP
for(int64_t i=0;i<niter/2;i++) {
SleefDFT_double_execute(pf, din, dout);
#ifdef ROUNDTRIP
SleefDFT_double_execute(pb, dout, din);
#endif
}
#endif
uint64_t best = 1LL << 62;
//printf("\n");
for(int rep=0;rep<REPEAT;rep++) {
uint64_t tm0 = gettime();
for(int64_t i=0;i<niter;i++) {
SleefDFT_double_execute(pf, din, dout);
#ifdef ROUNDTRIP
SleefDFT_double_execute(pb, dout, din);
#endif
}
uint64_t tm1 = gettime();
if (tm1 - tm0 < best) best = tm1 - tm0;
//printf("%g\n", (double)(tm1 - tm0));
}
SleefDFT_dispose(pf);
#ifdef ROUNDTRIP
SleefDFT_dispose(pb);
#endif
double timeus = best / ((double)niter * 1000);
#ifdef ROUNDTRIP
double mflops = 10 * n * log2n / timeus;
#else
double mflops = 5 * n * log2n / timeus;
#endif
printf("%g ", mflops);
}
#ifdef ENABLE_SP
for(int m=0;m<2;m++) {
#ifdef ENABLE_SLEEP
sleep(1);
#endif
struct SleefDFT *pf = SleefDFT_float_init1d(n, NULL, NULL, mode[m]);
#ifdef ROUNDTRIP
struct SleefDFT *pb = SleefDFT_float_init1d(n, NULL, NULL, mode[m] | SLEEF_MODE_BACKWARD);
#endif
for(int i=0;i<n*2;i++) {
sin[i] = 0;
}
#ifdef ENABLE_SLEEP
sleep(1);
#endif
#ifdef WARMUP
for(int64_t i=0;i<niter/2;i++) {
SleefDFT_float_execute(pf, sin, sout);
#ifdef ROUNDTRIP
SleefDFT_float_execute(pb, sout, sin);
#endif
}
#endif
uint64_t best = 1LL << 62;
for(int rep=0;rep<REPEAT;rep++) {
uint64_t tm0 = gettime();
for(int64_t i=0;i<niter;i++) {
SleefDFT_float_execute(pf, sin, sout);
#ifdef ROUNDTRIP
SleefDFT_float_execute(pb, sout, sin);
#endif
}
uint64_t tm1 = gettime();
if (tm1 - tm0 < best) best = tm1 - tm0;
}
SleefDFT_dispose(pf);
#ifdef ROUNDTRIP
SleefDFT_dispose(pb);
#endif
double timeus = best / ((double)niter * 1000);
#ifdef ROUNDTRIP
double mflops = 10 * n * log2n / timeus;
#else
double mflops = 5 * n * log2n / timeus;
#endif
printf("%g ", mflops);
}
#endif
printf("\n");
}
}

View File

@@ -0,0 +1,484 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <time.h>
#include <math.h>
#include <complex.h>
#include "sleef.h"
#include "sleefdft.h"
#include "misc.h"
#ifndef MODE
#define MODE SLEEF_MODE_DEBUG
#endif
#define THRES 1e-4
#if BASETYPEID == 1
#define SleefDFT_init SleefDFT_double_init1d
#define SleefDFT_execute SleefDFT_double_execute
typedef double real;
typedef double complex cmpl;
cmpl omega(double n, double kn) {
return cexp((-2 * M_PIl * _Complex_I / n) * kn);
}
#elif BASETYPEID == 2
#define SleefDFT_init SleefDFT_float_init1d
#define SleefDFT_execute SleefDFT_float_execute
typedef float real;
typedef double complex cmpl;
cmpl omega(double n, double kn) {
return cexp((-2 * M_PIl * _Complex_I / n) * kn);
}
#elif BASETYPEID == 3
#define SleefDFT_init SleefDFT_longdouble_init1d
#define SleefDFT_execute SleefDFT_longdouble_execute
typedef long double real;
typedef double complex cmpl;
cmpl omega(double n, double kn) {
return cexp((-2 * M_PIl * _Complex_I / n) * kn);
}
#elif BASETYPEID == 4
#include <quadmath.h>
#define SleefDFT_init SleefDFT_quad_init1d
#define SleefDFT_execute SleefDFT_quad_execute
typedef Sleef_quad real;
typedef double complex cmpl;
cmpl omega(double n, double kn) {
return cexp((-2 * M_PIl * _Complex_I / n) * kn);
}
#else
#error No BASETYPEID specified
#endif
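// Naive O(n^2) reference DFT and inverse, used as ground truth for the
// checks below.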
void forward(cmpl *ts, cmpl *fs, int len) {
int k, n;
for(k=0;k<len;k++) {
fs[k] = 0;
for(n=0;n<len;n++) {
fs[k] += ts[n] * omega(len, n*k);
}
}
}
void backward(cmpl *fs, cmpl *ts, int len) {
int k, n;
for(k=0;k<len;k++) {
ts[k] = 0;
for(n=0;n<len;n++) {
ts[k] += fs[n] * omega(-len, n*k);
}
}
}
// complex forward
int check_cf(int n) {
int i;
real *sx = (real *)Sleef_malloc(n*2 * sizeof(real));
real *sy = (real *)Sleef_malloc(n*2 * sizeof(real));
cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);
//
for(i=0;i<n;i++) {
ts[i] = 0.5 * ((2.0 * (rand() / (double)RAND_MAX) - 1) + (2.0 * (rand() / (double)RAND_MAX) - 1) * _Complex_I);
sx[(i*2+0)] = creal(ts[i]);
sx[(i*2+1)] = cimag(ts[i]);
}
//
forward(ts, fs, n);
struct SleefDFT *p = SleefDFT_init(n, NULL, NULL, MODE | SLEEF_MODE_VERBOSE);
if (p == NULL) {
printf("SleefDFT initialization failed\n");
return 0;
}
SleefDFT_execute(p, sx, sy);
//
int success = 1;
double rmsn = 0, rmsd = 0;
for(i=0;i<n;i++) {
if ((fabs(sy[(i*2+0)] - creal(fs[i])) > THRES) ||
(fabs(sy[(i*2+1)] - cimag(fs[i])) > THRES)) {
success = 0;
}
double t;
t = (sy[(i*2+0)] - creal(fs[i]));
rmsn += t*t;
t = (sy[(i*2+1)] - cimag(fs[i]));
rmsn += t*t;
rmsd += creal(fs[i]) * creal(fs[i]) + cimag(fs[i]) * cimag(fs[i]);
}
//
free(fs);
free(ts);
Sleef_free(sx);
Sleef_free(sy);
SleefDFT_dispose(p);
//
return success;
}
// complex backward
int check_cb(int n) {
int i;
real *sx = (real *)Sleef_malloc(sizeof(real)*n*2);
real *sy = (real *)Sleef_malloc(sizeof(real)*n*2);
cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);
//
for(i=0;i<n;i++) {
fs[i] = (2.0 * (rand() / (double)RAND_MAX) - 1) + (2.0 * (rand() / (double)RAND_MAX) - 1) * _Complex_I;
sx[(i*2+0)] = creal(fs[i]);
sx[(i*2+1)] = cimag(fs[i]);
}
backward(fs, ts, n);
struct SleefDFT *p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_BACKWARD | MODE);
if (p == NULL) {
printf("SleefDFT initialization failed\n");
return 0;
}
SleefDFT_execute(p, sx, sy);
//
int success = 1;
for(i=0;i<n;i++) {
if ((fabs(sy[(i*2+0)] - creal(ts[i])) > THRES) ||
(fabs(sy[(i*2+1)] - cimag(ts[i])) > THRES)) {
success = 0;
}
}
//
free(fs);
free(ts);
Sleef_free(sx);
Sleef_free(sy);
SleefDFT_dispose(p);
//
return success;
}
// real forward
int check_rf(int n) {
int i;
real *sx = (real *)Sleef_malloc(n * sizeof(real));
real *sy = (real *)Sleef_malloc((n/2+1)*sizeof(real)*2);
cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);
//
for(i=0;i<n;i++) {
ts[i] = (2.0 * (rand() / (double)RAND_MAX) - 1);
sx[i] = creal(ts[i]);
}
//
forward(ts, fs, n);
struct SleefDFT *p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_NO_MT | SLEEF_MODE_REAL | MODE);
if (p == NULL) {
printf("SleefDFT initialization failed\n");
return 0;
}
SleefDFT_execute(p, sx, sy);
//
int success = 1;
for(i=0;i<n/2+1;i++) {
if (fabs(sy[(2*i+0)] - creal(fs[i])) > THRES) success = 0;
if (fabs(sy[(2*i+1)] - cimag(fs[i])) > THRES) success = 0;
}
//
free(fs);
free(ts);
Sleef_free(sx);
Sleef_free(sy);
SleefDFT_dispose(p);
//
return success;
}
// real backward
int check_rb(int n) {
int i;
cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);
//
for(i=0;i<n/2;i++) {
if (i == 0) {
fs[0 ] = (2.0 * (rand() / (double)RAND_MAX) - 1);
fs[n/2] = (2.0 * (rand() / (double)RAND_MAX) - 1);
} else {
fs[i ] = (2.0 * (rand() / (double)RAND_MAX) - 1) + (2.0 * (rand() / (double)RAND_MAX) - 1) * _Complex_I;
fs[n-i] = conj(fs[i]);
}
}
real *sx = (real *)Sleef_malloc((n/2+1) * sizeof(real)*2);
real *sy = (real *)Sleef_malloc(sizeof(real)*n);
for(i=0;i<n/2+1;i++) {
sx[2*i+0] = creal(fs[i]);
sx[2*i+1] = cimag(fs[i]);
}
//
backward(fs, ts, n);
struct SleefDFT *p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_REAL | SLEEF_MODE_BACKWARD | MODE);
if (p == NULL) {
printf("SleefDFT initialization failed\n");
return 0;
}
SleefDFT_execute(p, sx, sy);
//
int success = 1;
for(i=0;i<n;i++) {
if (fabs(cimag(ts[i])) > THRES) {
success = 0;
}
if ((fabs(sy[i] - creal(ts[i])) > THRES)) {
success = 0;
}
}
//
free(fs);
free(ts);
Sleef_free(sx);
Sleef_free(sy);
SleefDFT_dispose(p);
//
return success;
}
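// The "alt" real-format tests below: with SLEEF_MODE_ALT the DC and Nyquist
// terms are packed into elements 0 and 1, as the index handling shows.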
int check_arf(int n) {
int i;
real *sx = (real *)Sleef_malloc(n * sizeof(real));
real *sy = (real *)Sleef_malloc(n * sizeof(real));
cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);
//
for(i=0;i<n;i++) {
ts[i] = 2 * (rand() / (real)RAND_MAX) - 1;
sx[i] = creal(ts[i]);
}
//
backward(ts, fs, n);
struct SleefDFT *p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_REAL | SLEEF_MODE_ALT | MODE);
if (p == NULL) {
printf("SleefDFT initialization failed\n");
return 0;
}
SleefDFT_execute(p, sx, sy);
//
int success = 1;
for(i=0;i<n/2;i++) {
if (i == 0) {
if (fabs(sy[(2*0+0)] - creal(fs[0 ])) > THRES) success = 0;
if (fabs(sy[(2*0+1)] - creal(fs[n/2])) > THRES) success = 0;
} else {
if (fabs(sy[(2*i+0)] - creal(fs[i])) > THRES) success = 0;
if (fabs(sy[(2*i+1)] - cimag(fs[i])) > THRES) success = 0;
}
}
//
free(fs);
free(ts);
Sleef_free(sx);
Sleef_free(sy);
SleefDFT_dispose(p);
//
return success;
}
int check_arb(int n) {
int i;
real *sx = (real *)Sleef_malloc(n * sizeof(real));
real *sy = (real *)Sleef_malloc(n * sizeof(real));
cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);
//
for(i=0;i<n/2;i++) {
if (i == 0) {
fs[0 ] = (2.0 * (rand() / (double)RAND_MAX) - 1);
fs[n/2] = (2.0 * (rand() / (double)RAND_MAX) - 1);
} else {
fs[i ] = (2.0 * (rand() / (double)RAND_MAX) - 1) + (2.0 * (rand() / (double)RAND_MAX) - 1) * _Complex_I;
fs[n-i] = conj(fs[i]);
}
}
for(i=0;i<n/2;i++) {
if (i == 0) {
sx[2*0+0] = creal(fs[0 ]);
sx[2*0+1] = creal(fs[n/2]);
} else {
sx[2*i+0] = creal(fs[i]);
sx[2*i+1] = cimag(fs[i]);
}
}
//
forward(fs, ts, n);
struct SleefDFT *p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_REAL | SLEEF_MODE_BACKWARD | SLEEF_MODE_ALT | MODE);
if (p == NULL) {
printf("SleefDFT initialization failed\n");
return 0;
}
SleefDFT_execute(p, sx, sy);
//
int success = 1;
for(i=0;i<n;i++) {
if (fabs(cimag(ts[i])) > THRES) {
success = 0;
}
if ((fabs(sy[i]*2 - creal(ts[i])) > THRES)) {
success = 0;
}
}
//
free(fs);
free(ts);
Sleef_free(sx);
Sleef_free(sy);
SleefDFT_dispose(p);
//
return success;
}
int main(int argc, char **argv) {
if (argc != 2) {
fprintf(stderr, "%s <log2n>\n", argv[0]);
exit(-1);
}
const int n = 1 << atoi(argv[1]);
srand((unsigned int)time(NULL));
SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET | SLEEF_PLAN_READONLY);
//
int success = 1;
printf("complex forward : %s\n", (success &= check_cf(n)) ? "OK" : "NG");
printf("complex backward : %s\n", (success &= check_cb(n)) ? "OK" : "NG");
printf("real forward : %s\n", (success &= check_rf(n)) ? "OK" : "NG");
printf("real backward : %s\n", (success &= check_rb(n)) ? "OK" : "NG");
printf("real alt forward : %s\n", (success &= check_arf(n)) ? "OK" : "NG");
printf("real alt backward : %s\n", (success &= check_arb(n)) ? "OK" : "NG");
exit(!success);
}

View File

@@ -0,0 +1,174 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <time.h>
#include <math.h>
#include <complex.h>
#include "sleef.h"
#include "sleefdft.h"
#ifndef MODE
#define MODE (SLEEF_MODE_DEBUG | SLEEF_MODE_VERBOSE)
#endif
#if BASETYPEID == 1
#define THRES 1e-30
#define SleefDFT_init SleefDFT_double_init1d
#define SleefDFT_execute SleefDFT_double_execute
typedef double real;
#elif BASETYPEID == 2
#define THRES 1e-13
#define SleefDFT_init SleefDFT_float_init1d
#define SleefDFT_execute SleefDFT_float_execute
typedef float real;
#else
#error BASETYPEID not set
#endif
static double squ(double x) { return x * x; }
// complex transforms
double check_c(int n) {
struct SleefDFT *p;
real *sx = (real *)Sleef_malloc(n*2 * sizeof(real));
real *sy = (real *)Sleef_malloc(n*2 * sizeof(real));
real *sz = (real *)Sleef_malloc(n*2 * sizeof(real));
for(int i=0;i<n*2;i++) sx[i] = (real)(2.0 * (rand() / (double)RAND_MAX) - 1);
//
p = SleefDFT_init(n, NULL, NULL, MODE);
if (p == NULL) {
printf("SleefDFT initialization failed\n");
exit(-1);
}
SleefDFT_execute(p, sx, sy);
SleefDFT_dispose(p);
//
p = SleefDFT_init(n, NULL, NULL, MODE | SLEEF_MODE_BACKWARD);
if (p == NULL) {
printf("SleefDFT initialization failed\n");
exit(-1);
}
SleefDFT_execute(p, sy, sz);
SleefDFT_dispose(p);
//
double rmsn = 0, rmsd = 0, scale = 1 / (double)n;
for(int i=0;i<n;i++) {
rmsn += squ(scale * sz[i*2+0] - sx[i*2+0]) + squ(scale * sz[i*2+1] - sx[i*2+1]);
rmsd += squ( sx[i*2+0]) + squ( sx[i*2+1]);
}
//
Sleef_free(sx);
Sleef_free(sy);
Sleef_free(sz);
//
return rmsn / rmsd;
}
// real transforms
double check_r(int n) {
struct SleefDFT *p;
real *sx = (real *)Sleef_malloc(n * sizeof(real));
real *sy = (real *)Sleef_malloc((n/2+1)*sizeof(real)*2);
real *sz = (real *)Sleef_malloc(n * sizeof(real));
for(int i=0;i<n;i++) sx[i] = (real)(2.0 * (rand() / (double)RAND_MAX) - 1);
//
p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_REAL | MODE);
if (p == NULL) {
printf("SleefDFT initialization failed\n");
return 0;
}
SleefDFT_execute(p, sx, sy);
SleefDFT_dispose(p);
//
p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_REAL | SLEEF_MODE_BACKWARD | MODE);
if (p == NULL) {
printf("SleefDFT initialization failed\n");
return 0;
}
SleefDFT_execute(p, sy, sz);
SleefDFT_dispose(p);
//
double rmsn = 0, rmsd = 0, scale = 1 / (double)n;
for(int i=0;i<n;i++) {
rmsn += squ(scale * sz[i] - sx[i]);
rmsd += squ( sx[i]);
}
//
Sleef_free(sx);
Sleef_free(sy);
Sleef_free(sz);
//
return rmsn / rmsd;
}
int main(int argc, char **argv) {
if (argc < 2) {
fprintf(stderr, "%s <log2n> [<nloop>]\n", argv[0]);
exit(-1);
}
const int n = 1 << atoi(argv[1]);
const int nloop = argc >= 3 ? atoi(argv[2]) : 1;
srand((unsigned int)time(NULL));
SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET | SLEEF_PLAN_READONLY);
//
int success = 1;
double e;
for(int i=0;(nloop < 0 || i < nloop) && success;i++) {
e = check_c(n);
success = success && e < THRES;
printf("complex : %s (%g)\n", e < THRES ? "OK" : "NG", e);
e = check_r(n);
success = success && e < THRES;
printf("real : %s (%g)\n", e < THRES ? "OK" : "NG", e);
}
exit(!success);
}

View File

@@ -0,0 +1,118 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <time.h>
#include <math.h>
#include <complex.h>
#include "sleef.h"
#include "sleefdft.h"
#ifndef MODE
#define MODE (SLEEF_MODE_DEBUG | SLEEF_MODE_VERBOSE)
#endif
#if BASETYPEID == 1
#define THRES 1e-30
#define SleefDFT_init2d SleefDFT_double_init2d
#define SleefDFT_execute SleefDFT_double_execute
typedef double real;
#elif BASETYPEID == 2
#define THRES 1e-13
#define SleefDFT_init2d SleefDFT_float_init2d
#define SleefDFT_execute SleefDFT_float_execute
typedef float real;
#else
#error BASETYPEID not set
#endif
static double squ(double x) { return x * x; }
// complex transforms
double check_c(int n, int m) {
struct SleefDFT *p;
real *sx = (real *)Sleef_malloc(n*m*2 * sizeof(real));
real *sy = (real *)Sleef_malloc(n*m*2 * sizeof(real));
real *sz = (real *)Sleef_malloc(n*m*2 * sizeof(real));
for(int i=0;i<n*m*2;i++) sx[i] = (real)(2.0 * (rand() / (double)RAND_MAX) - 1);
//
p = SleefDFT_init2d(n, m, NULL, NULL, MODE);
if (p == NULL) {
printf("SleefDFT initialization failed\n");
exit(-1);
}
SleefDFT_execute(p, sx, sy);
SleefDFT_dispose(p);
//
p = SleefDFT_init2d(n, m, NULL, NULL, MODE | SLEEF_MODE_BACKWARD);
if (p == NULL) {
printf("SleefDFT initialization failed\n");
exit(-1);
}
SleefDFT_execute(p, sy, sz);
SleefDFT_dispose(p);
//
double rmsn = 0, rmsd = 0, scale = 1 / (n*(double)m);
for(int i=0;i<n*m;i++) {
rmsn += squ(scale * sz[i*2+0] - sx[i*2+0]) + squ(scale * sz[i*2+1] - sx[i*2+1]);
rmsd += squ( sx[i*2+0]) + squ( sx[i*2+1]);
}
//
Sleef_free(sx);
Sleef_free(sy);
Sleef_free(sz);
//
return rmsn / rmsd;
}
int main(int argc, char **argv) {
if (argc < 3) {
fprintf(stderr, "%s <log2n> <log2m> [<nloop>]\n", argv[0]);
exit(-1);
}
const int n = 1 << atoi(argv[1]);
const int m = 1 << atoi(argv[2]);
const int nloop = argc >= 4 ? atoi(argv[3]) : 1;
srand((unsigned int)time(NULL));
SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET | SLEEF_PLAN_READONLY);
//
int success = 1;
double e;
for(int i=0;(nloop < 0 || i < nloop) && success;i++) {
e = check_c(n, m);
success = success && e < THRES;
printf("complex : %s (%g)\n", e < THRES ? "OK" : "NG", e);
}
exit(!success);
}

View File

@@ -0,0 +1,80 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
// gcc tutorial.c -lsleef -lsleefdft -lm
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <complex.h>
#include "sleef.h"
#include "sleefdft.h"
#define THRES 1e-4
typedef double complex cmpl;
cmpl omega(double n, double kn) {
return cexp((-2 * M_PI * _Complex_I / n) * kn);
}
void forward(cmpl *ts, cmpl *fs, int len) {
for(int k=0;k<len;k++) {
fs[k] = 0;
for(int n=0;n<len;n++) fs[k] += ts[n] * omega(len, n*k);
}
}
int main(int argc, char **argv) {
int n = 256;
if (argc == 2) n = 1 << atoi(argv[1]);
SleefDFT_setPlanFilePath("plan.txt", NULL, SLEEF_PLAN_AUTOMATIC);
double *sx = (double *)Sleef_malloc(n*2 * sizeof(double));
double *sy = (double *)Sleef_malloc(n*2 * sizeof(double));
struct SleefDFT *p = SleefDFT_double_init1d(n, sx, sy, SLEEF_MODE_FORWARD);
if (p == NULL) {
printf("SleefDFT initialization failed\n");
exit(-1);
}
cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);
for(int i=0;i<n;i++) {
ts[i] =
(2.0 * (rand() / (double)RAND_MAX) - 1) * 1.0 +
(2.0 * (rand() / (double)RAND_MAX) - 1) * _Complex_I;
sx[(i*2+0)] = creal(ts[i]);
sx[(i*2+1)] = cimag(ts[i]);
}
forward(ts, fs, n);
SleefDFT_double_execute(p, NULL, NULL);
int success = 1;
for(int i=0;i<n;i++) {
if ((fabs(sy[(i*2+0)] - creal(fs[i])) > THRES) ||
(fabs(sy[(i*2+1)] - cimag(fs[i])) > THRES)) {
success = 0;
}
}
printf("%s\n", success ? "OK" : "NG");
free(fs); free(ts);
Sleef_free(sy); Sleef_free(sx);
SleefDFT_dispose(p);
exit(success ? 0 : -1);
}

View File

@@ -0,0 +1,425 @@
# Options
if (COMPILER_SUPPORTS_SVE)
set(SLEEFDFT_MAXBUTWIDTH 6 CACHE STRING "Log_2 (Maximum butterfly length) of butterflies")
else()
set(SLEEFDFT_MAXBUTWIDTH 4 CACHE STRING "Log_2 (Maximum butterfly length) of butterflies")
endif()
if (SLEEFDFT_MAXBUTWIDTH GREATER 7)
message(FATAL_ERROR "SLEEFDFT_MAXBUTWIDTH has to be smaller than 8." )
endif()
option(SLEEFDFT_ENABLE_STREAM "Streaming instructions are utilized in DFT." OFF)
# Settings
# Constants definition
set(LISTSHORTTYPENAME "dp" "sp")
set(LISTLONGTYPENAME "double" "float")
set(LISTTYPEID "1" "2")
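# For each kernel variant <isa><type>, MACRODEF_<isa><type> holds its compile
# definitions and CFLAGS_<isa><type> its compiler flags; <type> is dp/sp/ld/qp
# for double, single, long double and quad precision respectively.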
set(MACRODEF_vecextdp BASETYPEID=1 ENABLE_VECEXT CONFIG=1)
set(CFLAGS_vecextdp ${FLAGS_ENABLE_VECEXT})
set(MACRODEF_vecextsp BASETYPEID=2 ENABLE_VECEXT CONFIG=1)
set(CFLAGS_vecextsp ${FLAGS_ENABLE_VECEXT})
set(MACRODEF_vecextld BASETYPEID=3 ENABLE_VECEXT CONFIG=1)
set(CFLAGS_vecextld ${FLAGS_ENABLE_VECEXT})
set(MACRODEF_vecextqp BASETYPEID=4 ENABLE_VECEXT CONFIG=1)
set(CFLAGS_vecextqp ${FLAGS_ENABLE_VECEXT})
set(MACRODEF_purecdp BASETYPEID=1 ENABLE_PUREC CONFIG=1)
set(CFLAGS_purecdp ${FLAGS_ENABLE_PUREC})
set(MACRODEF_purecsp BASETYPEID=2 ENABLE_PUREC CONFIG=1)
set(CFLAGS_purecsp ${FLAGS_ENABLE_PUREC})
set(MACRODEF_purecld BASETYPEID=3 ENABLE_PUREC CONFIG=1)
set(CFLAGS_purecld ${FLAGS_ENABLE_PUREC})
set(MACRODEF_purecqp BASETYPEID=4 ENABLE_PUREC CONFIG=1)
set(CFLAGS_purecqp ${FLAGS_ENABLE_PUREC})
set(MACRODEF_sse2dp BASETYPEID=1 ENABLE_SSE2 CONFIG=4)
set(CFLAGS_sse2dp ${FLAGS_ENABLE_SSE4})
set(MACRODEF_sse2sp BASETYPEID=2 ENABLE_SSE2 CONFIG=4)
set(CFLAGS_sse2sp ${FLAGS_ENABLE_SSE4})
set(MACRODEF_avxdp BASETYPEID=1 ENABLE_AVX CONFIG=1)
set(CFLAGS_avxdp ${FLAGS_ENABLE_AVX})
set(MACRODEF_avxsp BASETYPEID=2 ENABLE_AVX CONFIG=1)
set(CFLAGS_avxsp ${FLAGS_ENABLE_AVX})
set(MACRODEF_avx2dp BASETYPEID=1 ENABLE_AVX2 CONFIG=1)
set(CFLAGS_avx2dp ${FLAGS_ENABLE_AVX2})
set(MACRODEF_avx2sp BASETYPEID=2 ENABLE_AVX2 CONFIG=1)
set(CFLAGS_avx2sp ${FLAGS_ENABLE_AVX2})
set(MACRODEF_avx512fdp BASETYPEID=1 ENABLE_AVX512F CONFIG=1)
set(CFLAGS_avx512fdp ${FLAGS_ENABLE_AVX512F})
set(MACRODEF_avx512fsp BASETYPEID=2 ENABLE_AVX512F CONFIG=1)
set(CFLAGS_avx512fsp ${FLAGS_ENABLE_AVX512F})
set(MACRODEF_advsimddp BASETYPEID=1 ENABLE_ADVSIMD CONFIG=1)
set(CFLAGS_advsimddp ${FLAGS_ENABLE_ADVSIMD})
set(MACRODEF_advsimdsp BASETYPEID=2 ENABLE_ADVSIMD CONFIG=1)
set(CFLAGS_advsimdsp ${FLAGS_ENABLE_ADVSIMD})
set(MACRODEF_neon32sp BASETYPEID=2 ENABLE_NEON32 CONFIG=1)
set(CFLAGS_neon32sp ${FLAGS_ENABLE_NEON32})
set(MACRODEF_sve256dp BASETYPEID=1 ENABLE_SVE CONFIG=8)
set(CFLAGS_sve256dp ${FLAGS_ENABLE_SVE})
set(MACRODEF_sve256sp BASETYPEID=2 ENABLE_SVE CONFIG=8)
set(CFLAGS_sve256sp ${FLAGS_ENABLE_SVE})
set(MACRODEF_sve512dp BASETYPEID=1 ENABLE_SVE CONFIG=9)
set(CFLAGS_sve512dp ${FLAGS_ENABLE_SVE})
set(MACRODEF_sve512sp BASETYPEID=2 ENABLE_SVE CONFIG=9)
set(CFLAGS_sve512sp ${FLAGS_ENABLE_SVE})
set(MACRODEF_sve1024dp BASETYPEID=1 ENABLE_SVE CONFIG=10)
set(CFLAGS_sve1024dp ${FLAGS_ENABLE_SVE})
set(MACRODEF_sve1024sp BASETYPEID=2 ENABLE_SVE CONFIG=10)
set(CFLAGS_sve1024sp ${FLAGS_ENABLE_SVE})
set(MACRODEF_sve2048dp BASETYPEID=1 ENABLE_SVE CONFIG=11)
set(CFLAGS_sve2048dp ${FLAGS_ENABLE_SVE})
set(MACRODEF_sve2048sp BASETYPEID=2 ENABLE_SVE CONFIG=11)
set(CFLAGS_sve2048sp ${FLAGS_ENABLE_SVE})
set(MACRODEF_rvvm1128dp BASETYPEID=1 ENABLE_RVVM1 CONFIG=7)
set(CFLAGS_rvvm1128dp ${FLAGS_ENABLE_RVVM1})
set(MACRODEF_rvvm1128sp BASETYPEID=2 ENABLE_RVVM1 CONFIG=7)
set(CFLAGS_rvvm1128sp ${FLAGS_ENABLE_RVVM1})
set(MACRODEF_rvvm1256dp BASETYPEID=1 ENABLE_RVVM1 CONFIG=8)
set(CFLAGS_rvvm1256dp ${FLAGS_ENABLE_RVVM1})
set(MACRODEF_rvvm1256sp BASETYPEID=2 ENABLE_RVVM1 CONFIG=8)
set(CFLAGS_rvvm1256sp ${FLAGS_ENABLE_RVVM1})
set(MACRODEF_rvvm1512dp BASETYPEID=1 ENABLE_RVVM1 CONFIG=9)
set(CFLAGS_rvvm1512dp ${FLAGS_ENABLE_RVVM1})
set(MACRODEF_rvvm1512sp BASETYPEID=2 ENABLE_RVVM1 CONFIG=9)
set(CFLAGS_rvvm1512sp ${FLAGS_ENABLE_RVVM1})
set(MACRODEF_rvvm11024dp BASETYPEID=1 ENABLE_RVVM1 CONFIG=10)
set(CFLAGS_rvvm11024dp ${FLAGS_ENABLE_RVVM1})
set(MACRODEF_rvvm11024sp BASETYPEID=2 ENABLE_RVVM1 CONFIG=10)
set(CFLAGS_rvvm11024sp ${FLAGS_ENABLE_RVVM1})
set(MACRODEF_rvvm12048dp BASETYPEID=1 ENABLE_RVVM1 CONFIG=11)
set(CFLAGS_rvvm12048dp ${FLAGS_ENABLE_RVVM1})
set(MACRODEF_rvvm12048sp BASETYPEID=2 ENABLE_RVVM1 CONFIG=11)
set(CFLAGS_rvvm12048sp ${FLAGS_ENABLE_RVVM1})
set(MACRODEF_rvvm2128dp BASETYPEID=1 ENABLE_RVVM2 CONFIG=7)
set(CFLAGS_rvvm2128dp ${FLAGS_ENABLE_RVVM2})
set(MACRODEF_rvvm2128sp BASETYPEID=2 ENABLE_RVVM2 CONFIG=7)
set(CFLAGS_rvvm2128sp ${FLAGS_ENABLE_RVVM2})
set(MACRODEF_rvvm2256dp BASETYPEID=1 ENABLE_RVVM2 CONFIG=8)
set(CFLAGS_rvvm2256dp ${FLAGS_ENABLE_RVVM2})
set(MACRODEF_rvvm2256sp BASETYPEID=2 ENABLE_RVVM2 CONFIG=8)
set(CFLAGS_rvvm2256sp ${FLAGS_ENABLE_RVVM2})
set(MACRODEF_rvvm2512dp BASETYPEID=1 ENABLE_RVVM2 CONFIG=9)
set(CFLAGS_rvvm2512dp ${FLAGS_ENABLE_RVVM2})
set(MACRODEF_rvvm2512sp BASETYPEID=2 ENABLE_RVVM2 CONFIG=9)
set(CFLAGS_rvvm2512sp ${FLAGS_ENABLE_RVVM2})
set(MACRODEF_rvvm21024dp BASETYPEID=1 ENABLE_RVVM2 CONFIG=10)
set(CFLAGS_rvvm21024dp ${FLAGS_ENABLE_RVVM2})
set(MACRODEF_rvvm21024sp BASETYPEID=2 ENABLE_RVVM2 CONFIG=10)
set(CFLAGS_rvvm21024sp ${FLAGS_ENABLE_RVVM2})
set(MACRODEF_rvvm22048dp BASETYPEID=1 ENABLE_RVVM2 CONFIG=11)
set(CFLAGS_rvvm22048dp ${FLAGS_ENABLE_RVVM2})
set(MACRODEF_rvvm22048sp BASETYPEID=2 ENABLE_RVVM2 CONFIG=11)
set(CFLAGS_rvvm22048sp ${FLAGS_ENABLE_RVVM2})
set(MACRODEF_vsxdp BASETYPEID=1 ENABLE_VSX CONFIG=1)
set(CFLAGS_vsxdp ${FLAGS_ENABLE_VSX})
set(MACRODEF_vsxsp BASETYPEID=2 ENABLE_VSX CONFIG=1)
set(CFLAGS_vsxsp ${FLAGS_ENABLE_VSX})
set(MACRODEF_vsx3dp BASETYPEID=1 ENABLE_VSX3 CONFIG=1)
set(CFLAGS_vsx3dp ${FLAGS_ENABLE_VSX3})
set(MACRODEF_vsx3sp BASETYPEID=2 ENABLE_VSX3 CONFIG=1)
set(CFLAGS_vsx3sp ${FLAGS_ENABLE_VSX3})
set(MACRODEF_vxedp BASETYPEID=1 ENABLE_VXE CONFIG=140)
set(CFLAGS_vxedp ${FLAGS_ENABLE_VXE})
set(MACRODEF_vxesp BASETYPEID=2 ENABLE_VXE CONFIG=140)
set(CFLAGS_vxesp ${FLAGS_ENABLE_VXE})
set(MACRODEF_vxe2dp BASETYPEID=1 ENABLE_VXE2 CONFIG=150)
set(CFLAGS_vxe2dp ${FLAGS_ENABLE_VXE2})
set(MACRODEF_vxe2sp BASETYPEID=2 ENABLE_VXE2 CONFIG=150)
set(CFLAGS_vxe2sp ${FLAGS_ENABLE_VXE2})
# List all available scalar data types
set(ISALIST_SP purecsp)
set(ISALIST_DP purecdp)
set(LIST_SUPPORTED_FPTYPE 0 1)
if(CMAKE_C_COMPILER_ID MATCHES "(GNU|Clang)")
set(ISALIST_SP vecextsp)
set(ISALIST_DP vecextdp)
endif(CMAKE_C_COMPILER_ID MATCHES "(GNU|Clang)")
# List all available vector data types
if (COMPILER_SUPPORTS_SSE4)
set(ISALIST_SP ${ISALIST_SP} sse2sp)
set(ISALIST_DP ${ISALIST_DP} sse2dp)
endif(COMPILER_SUPPORTS_SSE4)
if (COMPILER_SUPPORTS_AVX)
set(ISALIST_SP ${ISALIST_SP} avxsp)
set(ISALIST_DP ${ISALIST_DP} avxdp)
endif(COMPILER_SUPPORTS_AVX)
if (COMPILER_SUPPORTS_AVX2)
set(ISALIST_SP ${ISALIST_SP} avx2sp)
set(ISALIST_DP ${ISALIST_DP} avx2dp)
endif(COMPILER_SUPPORTS_AVX2)
if (COMPILER_SUPPORTS_AVX512F)
set(ISALIST_SP ${ISALIST_SP} avx512fsp)
set(ISALIST_DP ${ISALIST_DP} avx512fdp)
endif(COMPILER_SUPPORTS_AVX512F)
if (COMPILER_SUPPORTS_ADVSIMD)
set(ISALIST_SP ${ISALIST_SP} advsimdsp)
set(ISALIST_DP ${ISALIST_DP} advsimddp)
endif(COMPILER_SUPPORTS_ADVSIMD)
if (COMPILER_SUPPORTS_SVE)
set(ISALIST_SP ${ISALIST_SP} sve256sp sve512sp sve1024sp sve2048sp)
set(ISALIST_DP ${ISALIST_DP} sve256dp sve512dp sve1024dp sve2048dp)
endif(COMPILER_SUPPORTS_SVE)
if (COMPILER_SUPPORTS_NEON32)
set(ISALIST_SP ${ISALIST_SP} neon32sp)
endif(COMPILER_SUPPORTS_NEON32)
if (COMPILER_SUPPORTS_RVVM1)
set(ISALIST_SP ${ISALIST_SP} rvvm1128sp rvvm1256sp rvvm1512sp rvvm11024sp rvvm12048sp)
set(ISALIST_DP ${ISALIST_DP} rvvm1128dp rvvm1256dp rvvm1512dp rvvm11024dp rvvm12048dp)
endif(COMPILER_SUPPORTS_RVVM1)
if (COMPILER_SUPPORTS_RVVM2)
set(ISALIST_SP ${ISALIST_SP} rvvm2128sp rvvm2256sp rvvm2512sp rvvm21024sp rvvm22048sp)
set(ISALIST_DP ${ISALIST_DP} rvvm2128dp rvvm2256dp rvvm2512dp rvvm21024dp rvvm22048dp)
endif(COMPILER_SUPPORTS_RVVM2)
if (COMPILER_SUPPORTS_VSX)
set(ISALIST_SP ${ISALIST_SP} vsxsp)
set(ISALIST_DP ${ISALIST_DP} vsxdp)
endif(COMPILER_SUPPORTS_VSX)
if (COMPILER_SUPPORTS_VSX3)
set(ISALIST_SP ${ISALIST_SP} vsx3sp)
set(ISALIST_DP ${ISALIST_DP} vsx3dp)
endif(COMPILER_SUPPORTS_VSX3)
if (COMPILER_SUPPORTS_VXE)
set(ISALIST_SP ${ISALIST_SP} vxesp)
set(ISALIST_DP ${ISALIST_DP} vxedp)
endif(COMPILER_SUPPORTS_VXE)
if (COMPILER_SUPPORTS_VXE2)
set(ISALIST_SP ${ISALIST_SP} vxe2sp)
set(ISALIST_DP ${ISALIST_DP} vxe2dp)
endif(COMPILER_SUPPORTS_VXE2)
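# NLIST enumerates the butterfly variants to generate; variants 1 and 3 appear
# to correspond to the "stream" configurations (cf. configStr in dftcommon.c)
# and are only generated when SLEEFDFT_ENABLE_STREAM is ON.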
if(SLEEFDFT_ENABLE_STREAM)
set(NLIST 0 1 2 3)
else()
set(NLIST 0 2)
endif()
#
# Compiler properties
set(CMAKE_C_FLAGS "${ORG_CMAKE_C_FLAGS} ${DFT_C_FLAGS}")
set(COMMON_TARGET_PROPERTIES
C_STANDARD 99 # -std=gnu99
)
if (BUILD_SHARED_LIBS)
list(APPEND COMMON_TARGET_PROPERTIES POSITION_INDEPENDENT_CODE ON) # -fPIC
endif()
set(COMMON_TARGET_DEFINITIONS ${COMMON_TARGET_DEFINITIONS} MAXBUTWIDTH=${SLEEFDFT_MAXBUTWIDTH})
if (SLEEFDFT_ENABLE_STREAM)
set(COMMON_TARGET_DEFINITIONS ${COMMON_TARGET_DEFINITIONS} ENABLE_STREAM=1)
else()
set(COMMON_TARGET_DEFINITIONS ${COMMON_TARGET_DEFINITIONS} ENABLE_STREAM=0)
endif()
if(COMPILER_SUPPORTS_OPENMP)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
endif(COMPILER_SUPPORTS_OPENMP)
# Include directories
include_directories(${PROJECT_SOURCE_DIR}/include)
include_directories(${PROJECT_BINARY_DIR}/include)
include_directories(${CMAKE_CURRENT_BINARY_DIR})
# Target mkunroll
set(TARGET_MKUNROLL "mkunroll")
add_host_executable(${TARGET_MKUNROLL} mkunroll.c)
set_target_properties(${TARGET_MKUNROLL} PROPERTIES ${COMMON_TARGET_PROPERTIES})
if (NOT CMAKE_CROSSCOMPILING)
target_compile_definitions(${TARGET_MKUNROLL} PRIVATE ${COMMON_TARGET_DEFINITIONS})
endif()
# Target mkdispatch
set(TARGET_MKDISPATCH "mkdispatch")
add_host_executable(${TARGET_MKDISPATCH} mkdispatch.c)
set_target_properties(${TARGET_MKDISPATCH} PROPERTIES ${COMMON_TARGET_PROPERTIES})
if (NOT CMAKE_CROSSCOMPILING)
target_compile_definitions(${TARGET_MKDISPATCH} PRIVATE ${COMMON_TARGET_DEFINITIONS})
endif()
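# mkdispatch runs on the build host and generates the dispatch headers that
# select among the ISA-specific kernels listed above.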
# Target dispatchparam.h
add_custom_command(OUTPUT dispatchparam.h
COMMENT "Generating dispatchparam.h"
COMMAND $<TARGET_FILE:${TARGET_MKDISPATCH}> paramonly ${SLEEFDFT_MAXBUTWIDTH} ${ISALIST_DP} > ${CMAKE_CURRENT_BINARY_DIR}/dispatchparam.h
DEPENDS ${TARGET_MKDISPATCH}
)
add_custom_target(dispatchparam.h_generated SOURCES ${CMAKE_CURRENT_BINARY_DIR}/dispatchparam.h)
# Target dispatch*.h
foreach(T ${LIST_SUPPORTED_FPTYPE})
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
string(TOUPPER ${ST} CST) # CST is "DP"
list(GET LISTLONGTYPENAME ${T} LT) # LT is "double"
list(GET LISTTYPEID ${T} ID) # ID is 1
string(CONCAT S "dispatch" ${ST} ".h") # S is dispatchdp.h
add_custom_command(OUTPUT ${S}
COMMENT "Generating ${S}"
COMMAND $<TARGET_FILE:${TARGET_MKDISPATCH}> ${LT} ${SLEEFDFT_MAXBUTWIDTH} ${ISALIST_${CST}} > ${S}
DEPENDS ${TARGET_MKDISPATCH}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)
string(CONCAT G ${S} "_generated") # G is dispatchdp.h_generated
add_custom_target(${G} SOURCES ${S})
endforeach()
# Target dftcommon.o
add_library(dftcommon_obj OBJECT dftcommon.c dftcommon.h ${CMAKE_CURRENT_BINARY_DIR}/dispatchparam.h ${sleef_BINARY_DIR}/include/sleef.h)
add_dependencies(dftcommon_obj ${TARGET_HEADERS} dispatchparam.h_generated)
set_source_files_properties(${sleef_BINARY_DIR}/include/sleef.h PROPERTIES GENERATED TRUE)
set_target_properties(dftcommon_obj PROPERTIES ${COMMON_TARGET_PROPERTIES})
target_compile_definitions(dftcommon_obj PRIVATE ${COMMON_TARGET_DEFINITIONS})
# Target dft*.o
foreach(T ${LIST_SUPPORTED_FPTYPE})
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
string(CONCAT G "dft" ${ST} "_obj") # G is "dftdp_obj"
string(CONCAT S "dispatch" ${ST} ".h") # S is "dispatchdp.h"
add_library(${G} OBJECT dft.c dftcommon.h ${S})
string(CONCAT SG ${S} "_generated") # SG is "dispatchdp.h_generated"
add_dependencies(${G} ${SG} ${TARGET_HEADERS})
set_target_properties(${G} PROPERTIES ${COMMON_TARGET_PROPERTIES})
list(GET LISTTYPEID ${T} ID) # ID is 1
target_compile_definitions(${G} PRIVATE BASETYPEID=${ID} ${COMMON_TARGET_DEFINITIONS})
endforeach()
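# mkunroll reads unroll0.org (copied below) and expands it into one
# unroll_<N>_<isa>.c source per ISA/variant combination.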
# Copy unroll0.org to ${CMAKE_CURRENT_BINARY_DIR}
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/unroll0.org
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/unroll0.org ${CMAKE_CURRENT_BINARY_DIR}
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/unroll0.org)
add_custom_target(unroll0.org.copied DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/unroll0.org)
# Target unroll*.c
foreach(T ${LIST_SUPPORTED_FPTYPE})
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
string(TOUPPER ${ST} CST) # CST is "DP"
list(GET LISTLONGTYPENAME ${T} LT) # LT is "double"
foreach(E ${ISALIST_${CST}}) # E is "sse2dp"
foreach(N ${NLIST})
string(CONCAT UC unroll_ ${N} _ ${E} ".c") # UC is "unroll_0_sse2dp.c"
set(UNROLL_TARGET_${CST} ${UNROLL_TARGET_${CST}} ${UC})
endforeach()
endforeach()
message(STATUS "Unroll target for ${CST} : ${UNROLL_TARGET_${CST}}")
if(UNROLL_TARGET_${CST})
add_custom_command(OUTPUT ${UNROLL_TARGET_${CST}}
COMMENT "Generating ${UNROLL_TARGET_${CST}}"
COMMAND $<TARGET_FILE:${TARGET_MKUNROLL}> ${LT} ${ISALIST_${CST}}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
DEPENDS ${TARGET_MKUNROLL} unroll0.org.copied
)
add_custom_target(unroll_target_${ST} DEPENDS ${UNROLL_TARGET_${CST}})
endif()
endforeach()
# Target unroll*.o
foreach(T ${LIST_SUPPORTED_FPTYPE})
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
string(TOUPPER ${ST} CST) # CST is "DP"
list(GET LISTLONGTYPENAME ${T} LT) # LT is "double"
foreach(E ${ISALIST_${CST}}) # E is "sse2dp"
foreach(N ${NLIST})
string(CONCAT U unroll_ ${N} _ ${E}) # U is "unroll_0_sse2dp"
string(CONCAT UG ${U} "_obj") # UG is "unroll_0_sse2dp_obj"
string(CONCAT UC ${U} ".c") # UC is "unroll_0_sse2dp.c"
add_library(${UG} OBJECT ${UC})
set_target_properties(${UG} PROPERTIES ${COMMON_TARGET_PROPERTIES})
target_include_directories(${UG} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
target_compile_definitions(${UG} PRIVATE ${COMMON_TARGET_DEFINITIONS} ${MACRODEF_${E}})
target_compile_options(${UG} PRIVATE ${CFLAGS_${E}})
add_dependencies(${UG} ${TARGET_HEADERS} unroll_target_${ST})
endforeach()
endforeach()
endforeach()
# Target libdft
add_library(${TARGET_LIBDFT} $<TARGET_OBJECTS:dftcommon_obj> $<TARGET_OBJECTS:${TARGET_LIBARRAYMAP_OBJ}>)
target_link_libraries(${TARGET_LIBDFT} ${TARGET_LIBSLEEF} ${LIBM})
foreach(T ${LIST_SUPPORTED_FPTYPE})
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
string(CONCAT G "dft" ${ST} "_obj") # G is "dftdp_obj"
target_sources(${TARGET_LIBDFT} PRIVATE $<TARGET_OBJECTS:${G}>)
endforeach()
foreach(T ${LIST_SUPPORTED_FPTYPE})
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
string(TOUPPER ${ST} CST) # CST is "DP"
foreach(E ${ISALIST_${CST}}) # E is "sse2dp"
foreach(N ${NLIST})
string(CONCAT UG unroll_ ${N} _ ${E} "_obj") # UG is "unroll_0_sse2dp_obj"
target_sources(${TARGET_LIBDFT} PRIVATE $<TARGET_OBJECTS:${UG}>)
endforeach()
endforeach()
endforeach()
set_target_properties(${TARGET_LIBDFT} PROPERTIES
VERSION ${SLEEF_VERSION}
SOVERSION ${SLEEF_SOVERSION}
PUBLIC_HEADER ${PROJECT_SOURCE_DIR}/include/sleefdft.h
${COMMON_TARGET_PROPERTIES}
)
# Install
install(
TARGETS ${TARGET_LIBDFT}
EXPORT sleefTargets
PUBLIC_HEADER #
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
COMPONENT sleef_Development
LIBRARY #
DESTINATION "${CMAKE_INSTALL_LIBDIR}"
COMPONENT sleef_Runtime
NAMELINK_COMPONENT sleef_Development
ARCHIVE #
DESTINATION "${CMAKE_INSTALL_LIBDIR}"
COMPONENT sleef_Development
RUNTIME #
DESTINATION "${CMAKE_INSTALL_BINDIR}"
COMPONENT sleef_Runtime
INCLUDES #
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
)

File diff suppressed because it is too large

View File

@@ -0,0 +1,423 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <ctype.h>
#include <inttypes.h>
#include <assert.h>
#include <math.h>
#ifdef _OPENMP
#include <omp.h>
#endif
#include "misc.h"
#include "sleef.h"
#define IMPORT_IS_EXPORT
#include "sleefdft.h"
#include "dispatchparam.h"
#include "dftcommon.h"
#include "common.h"
#include "arraymap.h"
#define MAGIC_FLOAT 0x31415926
#define MAGIC_DOUBLE 0x27182818
#define MAGIC2D_FLOAT 0x22360679
#define MAGIC2D_DOUBLE 0x17320508
const char *configStr[] = { "ST", "ST stream", "MT", "MT stream" };
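// Parses a human-readable transform path such as "4(MT) 4 2": each number is
// a butterfly width (the widths must sum to log2len), optionally followed by
// one of the configStr entries above in parentheses. The example string here
// is illustrative, not taken from an actual plan file.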
static int parsePathStr(char *p, int *path, int *config, int pathLenMax, int log2len) {
int pathLen = 0, l2l = 0;
for(;;) {
while(*p == ' ') p++;
if (*p == '\0') break;
if (!isdigit((int)*p)) return -1;
pathLen++;
if (pathLen >= pathLenMax) return -2;
int n = 0;
while(isdigit((int)*p)) n = n * 10 + *p++ - '0';
if (n > MAXBUTWIDTH) return -6;
path[pathLen-1] = n;
l2l += n;
config[pathLen-1] = 0;
if (*p != '(') continue;
int c;
for(c=3;c>=0;c--) if (strncmp(p+1, configStr[c], strlen(configStr[c])) == 0) break;
if (c == -1) return -3;
p += strlen(configStr[c]) + 1;
if (*p != ')') return -4;
p++;
config[pathLen-1] = c;
}
if (l2l != log2len) return -5;
return pathLen;
}
EXPORT void SleefDFT_setPath(SleefDFT *p, char *pathStr) {
assert(p != NULL && (p->magic == MAGIC_FLOAT || p->magic == MAGIC_DOUBLE));
int path[32], config[32];
int pathLen = parsePathStr(pathStr, path, config, 31, p->log2len);
if (pathLen < 0) {
if ((p->mode & SLEEF_MODE_VERBOSE) != 0) printf("Error %d in parsing path string : %s\n", pathLen, pathStr);
return;
}
for(uint32_t j = 0;j <= p->log2len;j++) p->bestPath[j] = 0;
for(int level = p->log2len, j=0;level > 0 && j < pathLen;) {
p->bestPath[level] = path[j];
p->bestPathConfig[level] = config[j];
level -= path[j];
j++;
}
p->pathLen = 0;
for(int j = p->log2len;j >= 0;j--) if (p->bestPath[j] != 0) p->pathLen++;
if ((p->mode & SLEEF_MODE_VERBOSE) != 0) {
printf("Set path : ");
for(int j = p->log2len;j >= 0;j--) if (p->bestPath[j] != 0) printf("%d(%s) ", p->bestPath[j], configStr[p->bestPathConfig[j]]);
printf("\n");
}
}
void freeTables(SleefDFT *p) {
for(int N=1;N<=MAXBUTWIDTH;N++) {
for(uint32_t level=N;level<=p->log2len;level++) {
Sleef_free(p->tbl[N][level]);
}
free(p->tbl[N]);
p->tbl[N] = NULL;
}
}
EXPORT void SleefDFT_dispose(SleefDFT *p) {
if (p != NULL && (p->magic == MAGIC2D_FLOAT || p->magic == MAGIC2D_DOUBLE)) {
Sleef_free(p->tBuf);
SleefDFT_dispose(p->instH);
if (p->hlen != p->vlen) SleefDFT_dispose(p->instV);
p->magic = 0;
free(p);
return;
}
assert(p != NULL && (p->magic == MAGIC_FLOAT || p->magic == MAGIC_DOUBLE));
if (p->log2len <= 1) {
p->magic = 0;
free(p);
return;
}
if ((p->mode & SLEEF_MODE_REAL) != 0) {
Sleef_free(p->rtCoef1);
Sleef_free(p->rtCoef0);
p->rtCoef0 = p->rtCoef1 = NULL;
}
for(int level = p->log2len;level >= 1;level--) {
Sleef_free(p->perm[level]);
}
free(p->perm);
p->perm = NULL;
freeTables(p);
p->magic = 0;
free(p);
}
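// Returns floor(log2(q)) for q >= 1, via a 4-bit lookup table.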
uint32_t ilog2(uint32_t q) {
static const uint32_t tab[] = {0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4};
uint32_t r = 0,qq;
if (q & 0xffff0000) r = 16;
q >>= r;
qq = q | (q >> 1);
qq |= (qq >> 2);
qq = ((qq & 0x10) >> 4) | ((qq & 0x100) >> 7) | ((qq & 0x1000) >> 10);
return r + tab[qq] * 4 + tab[q >> (tab[qq] * 4)] - 1;
}
//
char *dftPlanFilePath = NULL;
char *archID = NULL;
uint64_t planMode = SLEEF_PLAN_REFERTOENVVAR;
ArrayMap *planMap = NULL;
int planFilePathSet = 0, planFileLoaded = 0;
#ifdef _OPENMP
omp_lock_t planMapLock;
int planMapLockInitialized = 0;
#endif
static void initPlanMapLock() {
#ifdef _OPENMP
#pragma omp critical
{
if (!planMapLockInitialized) {
planMapLockInitialized = 1;
omp_init_lock(&planMapLock);
}
}
#endif
}
static void planMap_clear() {
if (planMap != NULL) ArrayMap_dispose(planMap);
planMap = NULL;
}
EXPORT void SleefDFT_setPlanFilePath(const char *path, const char *arch, uint64_t mode) {
initPlanMapLock();
if ((mode & SLEEF_PLAN_RESET) != 0) {
planMap_clear();
planFileLoaded = 0;
planFilePathSet = 0;
}
if (dftPlanFilePath != NULL) free(dftPlanFilePath);
if (path != NULL) {
dftPlanFilePath = malloc(strlen(path)+10);
strcpy(dftPlanFilePath, path);
} else {
dftPlanFilePath = NULL;
}
if (archID != NULL) free(archID);
if (arch == NULL) arch = Sleef_getCpuIdString();
archID = malloc(strlen(arch)+10);
strcpy(archID, arch);
planMode = mode;
planFilePathSet = 1;
}
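// The plan cache is loaded lazily from dftPlanFilePath (falling back to the
// ENVVAR environment variable) and written back by savePlanToFile() unless
// SLEEF_PLAN_READONLY is set.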
static void loadPlanFromFile() {
if (planFilePathSet == 0 && (planMode & SLEEF_PLAN_REFERTOENVVAR) != 0) {
char *s = getenv(ENVVAR);
if (s != NULL) SleefDFT_setPlanFilePath(s, NULL, planMode);
}
if (planMap != NULL) ArrayMap_dispose(planMap);
if (dftPlanFilePath != NULL && (planMode & SLEEF_PLAN_RESET) == 0) {
planMap = ArrayMap_load(dftPlanFilePath, archID, PLANFILEID, (planMode & SLEEF_PLAN_NOLOCK) == 0);
}
if (planMap == NULL) planMap = initArrayMap();
planFileLoaded = 1;
}
static void savePlanToFile() {
assert(planFileLoaded);
if ((planMode & SLEEF_PLAN_READONLY) == 0 && dftPlanFilePath != NULL) {
ArrayMap_save(planMap, dftPlanFilePath, archID, PLANFILEID);
}
}
#define CATBIT 8
#define BASETYPEIDBIT 2
#define LOG2LENBIT 8
#define DIRBIT 1
#define BUTSTATBIT 16
static uint64_t keyButStat(int baseTypeID, int log2len, int dir, int butStat) {
dir = (dir & SLEEF_MODE_BACKWARD) == 0;
int cat = 0;
uint64_t k = 0;
k = (k << BUTSTATBIT) | (butStat & ~(~(uint64_t)0 << BUTSTATBIT));
k = (k << LOG2LENBIT) | (log2len & ~(~(uint64_t)0 << LOG2LENBIT));
k = (k << DIRBIT) | (dir & ~(~(uint64_t)0 << DIRBIT));
k = (k << BASETYPEIDBIT) | (baseTypeID & ~(~(uint64_t)0 << BASETYPEIDBIT));
k = (k << CATBIT) | (cat & ~(~(uint64_t)0 << CATBIT));
return k;
}
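/* keyButStat above packs a plan-file key into a 64-bit integer; its layout,
   from high to low bits (upper bits unused), is
   [butStat:16][log2len:8][dir:1][baseTypeID:2][cat:8] with cat = 0.
   keyTrans, keyPath and keyPathConfig below use the same scheme with
   cat = 2, 3 and 4 respectively, so the different kinds of entries can
   never collide in the plan map. */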
#define LEVELBIT LOG2LENBIT
#define BUTCONFIGBIT 8
#define TRANSCONFIGBIT 8
static uint64_t keyTrans(int baseTypeID, int hlen, int vlen, int transConfig) {
int max = MAX(hlen, vlen), min = MIN(hlen, vlen);
int cat = 2;
uint64_t k = 0;
k = (k << TRANSCONFIGBIT) | (transConfig & ~(~(uint64_t)0 << TRANSCONFIGBIT));
k = (k << LOG2LENBIT) | (max & ~(~(uint64_t)0 << LOG2LENBIT));
k = (k << LOG2LENBIT) | (min & ~(~(uint64_t)0 << LOG2LENBIT));
k = (k << BASETYPEIDBIT) | (baseTypeID & ~(~(uint64_t)0 << BASETYPEIDBIT));
k = (k << CATBIT) | (cat & ~(~(uint64_t)0 << CATBIT));
return k;
}
static uint64_t keyPath(int baseTypeID, int log2len, int dir, int level, int config) {
dir = (dir & SLEEF_MODE_BACKWARD) == 0;
int cat = 3;
uint64_t k = 0;
k = (k << BUTCONFIGBIT) | (config & ~(~(uint64_t)0 << BUTCONFIGBIT));
k = (k << LEVELBIT) | (level & ~(~(uint64_t)0 << LEVELBIT));
k = (k << LOG2LENBIT) | (log2len & ~(~(uint64_t)0 << LOG2LENBIT));
k = (k << DIRBIT) | (dir & ~(~(uint64_t)0 << DIRBIT));
k = (k << BASETYPEIDBIT) | (baseTypeID & ~(~(uint64_t)0 << BASETYPEIDBIT));
k = (k << CATBIT) | (cat & ~(~(uint64_t)0 << CATBIT));
return k;
}
static uint64_t keyPathConfig(int baseTypeID, int log2len, int dir, int level, int config) {
dir = (dir & SLEEF_MODE_BACKWARD) == 0;
int cat = 4;
uint64_t k = 0;
k = (k << BUTCONFIGBIT) | (config & ~(~(uint64_t)0 << BUTCONFIGBIT));
k = (k << LEVELBIT) | (level & ~(~(uint64_t)0 << LEVELBIT));
k = (k << LOG2LENBIT) | (log2len & ~(~(uint64_t)0 << LOG2LENBIT));
k = (k << DIRBIT) | (dir & ~(~(uint64_t)0 << DIRBIT));
k = (k << BASETYPEIDBIT) | (baseTypeID & ~(~(uint64_t)0 << BASETYPEIDBIT));
k = (k << CATBIT) | (cat & ~(~(uint64_t)0 << CATBIT));
return k;
}
static uint64_t planMap_getU64(uint64_t key) {
char *s = ArrayMap_get(planMap, key);
if (s == NULL) return 0;
uint64_t ret;
if (sscanf(s, "%" SCNx64, &ret) != 1) return 0;
return ret;
}
static void planMap_putU64(uint64_t key, uint64_t value) {
char *s = malloc(100);
sprintf(s, "%" PRIx64, value);
s = ArrayMap_put(planMap, key, s);
if (s != NULL) free(s);
}
int PlanManager_loadMeasurementResultsP(SleefDFT *p, int pathCat) {
assert(p != NULL && (p->magic == MAGIC_FLOAT || p->magic == MAGIC_DOUBLE));
initPlanMapLock();
#ifdef _OPENMP
omp_set_lock(&planMapLock);
#endif
if (!planFileLoaded) loadPlanFromFile();
int stat = planMap_getU64(keyButStat(p->baseTypeID, p->log2len, p->mode, pathCat+10));
if (stat == 0) {
#ifdef _OPENMP
omp_unset_lock(&planMapLock);
#endif
return 0;
}
int ret = 1;
for(int j = p->log2len;j >= 0;j--) {
p->bestPath[j] = planMap_getU64(keyPath(p->baseTypeID, p->log2len, p->mode, j, pathCat));
p->bestPathConfig[j] = planMap_getU64(keyPathConfig(p->baseTypeID, p->log2len, p->mode, j, pathCat));
if (p->bestPath[j] > MAXBUTWIDTH) ret = 0;
}
p->pathLen = 0;
for(int j = p->log2len;j >= 0;j--) if (p->bestPath[j] != 0) p->pathLen++;
#ifdef _OPENMP
omp_unset_lock(&planMapLock);
#endif
return ret;
}
void PlanManager_saveMeasurementResultsP(SleefDFT *p, int pathCat) {
assert(p != NULL && (p->magic == MAGIC_FLOAT || p->magic == MAGIC_DOUBLE));
initPlanMapLock();
#ifdef _OPENMP
omp_set_lock(&planMapLock);
#endif
if (!planFileLoaded) loadPlanFromFile();
if (planMap_getU64(keyButStat(p->baseTypeID, p->log2len, p->mode, pathCat+10)) != 0) {
#ifdef _OPENMP
omp_unset_lock(&planMapLock);
#endif
return;
}
for(int j = p->log2len;j >= 0;j--) {
planMap_putU64(keyPath(p->baseTypeID, p->log2len, p->mode, j, pathCat), p->bestPath[j]);
planMap_putU64(keyPathConfig(p->baseTypeID, p->log2len, p->mode, j, pathCat), p->bestPathConfig[j]);
}
planMap_putU64(keyButStat(p->baseTypeID, p->log2len, p->mode, pathCat+10), 1);
if ((planMode & SLEEF_PLAN_READONLY) == 0) savePlanToFile();
#ifdef _OPENMP
omp_unset_lock(&planMapLock);
#endif
}
int PlanManager_loadMeasurementResultsT(SleefDFT *p) {
assert(p != NULL && (p->magic == MAGIC2D_FLOAT || p->magic == MAGIC2D_DOUBLE));
initPlanMapLock();
#ifdef _OPENMP
omp_set_lock(&planMapLock);
#endif
if (!planFileLoaded) loadPlanFromFile();
p->tmNoMT = planMap_getU64(keyTrans(p->baseTypeID, p->log2hlen, p->log2vlen, 0));
p->tmMT = planMap_getU64(keyTrans(p->baseTypeID, p->log2hlen, p->log2vlen, 1));
#ifdef _OPENMP
omp_unset_lock(&planMapLock);
#endif
return p->tmNoMT != 0;
}
void PlanManager_saveMeasurementResultsT(SleefDFT *p) {
assert(p != NULL && (p->magic == MAGIC2D_FLOAT || p->magic == MAGIC2D_DOUBLE));
initPlanMapLock();
#ifdef _OPENMP
omp_set_lock(&planMapLock);
#endif
if (!planFileLoaded) loadPlanFromFile();
planMap_putU64(keyTrans(p->baseTypeID, p->log2hlen, p->log2vlen, 0), p->tmNoMT);
planMap_putU64(keyTrans(p->baseTypeID, p->log2hlen, p->log2vlen, 1), p->tmMT );
if ((planMode & SLEEF_PLAN_READONLY) == 0) savePlanToFile();
#ifdef _OPENMP
omp_unset_lock(&planMapLock);
#endif
}
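/* A minimal usage sketch (the file name is hypothetical; the constants are
   the ones used above): an application that wants DFT plans persisted
   across runs could call, before building any SleefDFT object,
     SleefDFT_setPlanFilePath("./myapp.sleefplan", NULL, SLEEF_PLAN_REFERTOENVVAR);
   Passing NULL for arch selects Sleef_getCpuIdString(), and when no path
   has been set at all, loadPlanFromFile falls back to the SLEEFDFTPLAN
   environment variable. */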

View File

@@ -0,0 +1,69 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#define CONFIGMAX 4
#define CONFIG_STREAM 1
#define CONFIG_MT 2
#define MAXLOG2LEN 32
typedef struct SleefDFT {
uint32_t magic;
uint64_t mode, mode2, mode3;
int baseTypeID;
const void *in;
void *out;
union {
struct {
uint32_t log2len;
void **tbl[MAXBUTWIDTH+1];
void *rtCoef0, *rtCoef1;
uint32_t **perm;
void **x0, **x1;
int isa;
int planMode;
int vecwidth, log2vecwidth;
int nThread;
uint64_t tm[CONFIGMAX][(MAXBUTWIDTH+1)*32];
uint64_t bestTime;
int16_t bestPath[32], bestPathConfig[32], pathLen;
};
struct {
int32_t hlen, vlen;
int32_t log2hlen, log2vlen;
uint64_t tmNoMT, tmMT;
struct SleefDFT *instH, *instV;
void *tBuf;
};
};
} SleefDFT;
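/* The anonymous union overlays the state of a 1-D transform (first member
   struct) with that of a 2-D transform (second member struct); the magic
   field tells which interpretation is active (MAGIC_FLOAT/MAGIC_DOUBLE for
   1-D, MAGIC2D_FLOAT/MAGIC2D_DOUBLE for 2-D). */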
#define SLEEF_MODE2_MT1D (1 << 0)
#define SLEEF_MODE3_MT2D (1 << 0)
#define PLANFILEID "SLEEFDFT0\n"
#define ENVVAR "SLEEFDFTPLAN"
#define SLEEF_MODE_MEASUREBITS (3 << 20)
void freeTables(SleefDFT *p);
uint32_t ilog2(uint32_t q);
//int PlanManager_loadMeasurementResultsB(SleefDFT *p);
//void PlanManager_saveMeasurementResultsB(SleefDFT *p, int butStat);
int PlanManager_loadMeasurementResultsT(SleefDFT *p);
void PlanManager_saveMeasurementResultsT(SleefDFT *p);
int PlanManager_loadMeasurementResultsP(SleefDFT *p, int pathCat);
void PlanManager_saveMeasurementResultsP(SleefDFT *p, int pathCat);
#define GETINT_VECWIDTH 100
#define GETINT_DFTPRIORITY 101

View File

@@ -0,0 +1,193 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#ifndef ENABLE_STREAM
#error ENABLE_STREAM not defined
#endif
int main(int argc, char **argv) {
if (argc < 3) {
fprintf(stderr, "Usage : %s <basetype> <unrollmax> <unrollmax2> <maxbutwidth> <isa> ...\n", argv[0]);
exit(-1);
}
const char *basetype = argv[1];
const int maxbutwidth = atoi(argv[2]);
const int isastart = 3;
const int isamax = argc - isastart;
#if ENABLE_STREAM == 1
const int enable_stream = 1;
#else
const int enable_stream = 0;
#endif
printf("#define MAXBUTWIDTH %d\n", maxbutwidth);
printf("\n");
if (strcmp(basetype, "paramonly") == 0) exit(0);
printf("#define ISAMAX %d\n", isamax);
printf("#define CONFIGMAX 4\n");
for(int k=isastart;k<argc;k++) {
for(int config=0;config<4;config++) {
#if ENABLE_STREAM == 0
if ((config & 1) != 0) continue;
#endif
for(int j=1;j<=maxbutwidth;j++) {
printf("void dft%df_%d_%s(real *, const real *, const int);\n", 1 << j, config, argv[k]);
printf("void dft%db_%d_%s(real *, const real *, const int);\n", 1 << j, config, argv[k]);
printf("void tbut%df_%d_%s(real *, uint32_t *, const real *, const int, const real *, const int);\n", 1 << j, config, argv[k]);
printf("void tbut%db_%d_%s(real *, uint32_t *, const real *, const int, const real *, const int);\n", 1 << j, config, argv[k]);
printf("void but%df_%d_%s(real *, uint32_t *, const int, const real *, const int, const real *, const int);\n", 1 << j, config, argv[k]);
printf("void but%db_%d_%s(real *, uint32_t *, const int, const real *, const int, const real *, const int);\n", 1 << j, config, argv[k]);
}
}
printf("void realSub0_%s(real *, const real *, const int, const real *, const real *);\n", argv[k]);
printf("void realSub1_%s(real *, const real *, const int, const real *, const real *, const int);\n", argv[k]);
printf("int getInt_%s(int);\n", argv[k]);
printf("const void *getPtr_%s(int);\n", argv[k]);
}
printf("\n");
printf("void (*dftf_%s[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int) = {\n", basetype);
for(int config=0;config<4;config++) {
printf(" {\n");
for(int k=isastart;k<argc;k++) {
printf(" {NULL, ");
for(int i=1;i<=maxbutwidth;i++) {
if (enable_stream || (config & 1) == 0) {
printf("dft%df_%d_%s, ", 1 << i, config, argv[k]);
} else {
printf("NULL, ");
}
}
printf("},\n");
}
printf("},\n");
}
printf("};\n\n");
printf("void (*dftb_%s[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int) = {\n", basetype);
for(int config=0;config<4;config++) {
printf(" {\n");
for(int k=isastart;k<argc;k++) {
printf(" {NULL, ");
for(int i=1;i<=maxbutwidth;i++) {
if (enable_stream || (config & 1) == 0) {
if (i == 1) {
// a 2-point butterfly is its own inverse, so the forward kernel doubles as the backward one
printf("dft%df_%d_%s, ", 1 << i, config, argv[k]);
} else {
printf("dft%db_%d_%s, ", 1 << i, config, argv[k]);
}
} else {
printf("NULL, ");
}
}
printf("},\n");
}
printf("},\n");
}
printf("};\n\n");
printf("void (*tbutf_%s[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int) = {\n", basetype);
for(int config=0;config<4;config++) {
printf(" {\n");
for(int k=isastart;k<argc;k++) {
printf(" {NULL, ");
for(int i=1;i<=maxbutwidth;i++) {
if (enable_stream || (config & 1) == 0) {
printf("tbut%df_%d_%s, ", 1 << i, config, argv[k]);
} else {
printf("NULL, ");
}
}
printf("},\n");
}
printf("},\n");
}
printf("};\n\n");
printf("void (*tbutb_%s[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int) = {\n", basetype);
for(int config=0;config<4;config++) {
printf(" {\n");
for(int k=isastart;k<argc;k++) {
printf(" {NULL, ");
for(int i=1;i<=maxbutwidth;i++) {
if (enable_stream || (config & 1) == 0) {
printf("tbut%db_%d_%s, ", 1 << i, config, argv[k]);
} else {
printf("NULL, ");
}
}
printf("},\n");
}
printf("},\n");
}
printf("};\n\n");
printf("void (*butf_%s[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const int, const real *, const int, const real *, const int) = {\n", basetype);
for(int config=0;config<4;config++) {
printf(" {\n");
for(int k=isastart;k<argc;k++) {
printf(" {NULL, ");
for(int i=1;i<=maxbutwidth;i++) {
if (enable_stream || (config & 1) == 0) {
printf("but%df_%d_%s, ", 1 << i, config, argv[k]);
} else {
printf("NULL, ");
}
}
printf("},\n");
}
printf("},\n");
}
printf("};\n\n");
printf("void (*butb_%s[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const int, const real *, const int, const real *, const int) = {\n", basetype);
for(int config=0;config<4;config++) {
printf(" {\n");
for(int k=isastart;k<argc;k++) {
printf(" {NULL, ");
for(int i=1;i<=maxbutwidth;i++) {
if (enable_stream || (config & 1) == 0) {
printf("but%db_%d_%s, ", 1 << i, config, argv[k]);
} else {
printf("NULL, ");
}
}
printf("},\n");
}
printf("},\n");
}
printf("};\n\n");
//
printf("void (*realSub0_%s[ISAMAX])(real *, const real *, const int, const real *, const real *) = {\n ", basetype);
for(int k=isastart;k<argc;k++) printf("realSub0_%s, ", argv[k]);
printf("\n};\n\n");
printf("void (*realSub1_%s[ISAMAX])(real *, const real *, const int, const real *, const real *, const int) = {\n ", basetype);
for(int k=isastart;k<argc;k++) printf("realSub1_%s, ", argv[k]);
printf("\n};\n\n");
printf("int (*getInt_%s[16])(int) = {\n ", basetype);
for(int k=isastart;k<argc;k++) printf("getInt_%s, ", argv[k]);
for(int k=0;k<16-(argc-isastart);k++) printf("NULL, ");
printf("\n};\n\n");
printf("const void *(*getPtr_%s[16])(int) = {\n ", basetype);
for(int k=isastart;k<argc;k++) printf("getPtr_%s, ", argv[k]);
for(int k=0;k<16-(argc-isastart);k++) printf("NULL, ");
printf("\n};\n\n");
}
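/* For illustration (hypothetical invocation, not part of the build): running
   "mkdisp double 5 sse2 avx2" emits prototypes such as
     void dft2f_0_sse2(real *, const real *, const int);
   followed by the dispatch tables dftf_double, dftb_double, tbutf_double,
   tbutb_double, butf_double and butb_double, each indexed by
   [config][isa][butterfly width], with NULL entries where a configuration
   (e.g. streaming stores with ENABLE_STREAM == 0) is unavailable. */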

View File

@@ -0,0 +1,104 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#define CONFIGMAX 4
char *replaceAll(const char *in, const char *pat, const char *replace) {
const int replaceLen = (int)strlen(replace);
const int patLen = (int)strlen(pat);
char *str = malloc(strlen(in)+1);
strcpy(str, in);
for(;;) {
char *p = strstr(str, pat);
if (p == NULL) return str;
int replace_pos = (int)(p - str);
int tail_len = (int)strlen(p + patLen);
char *newstr = malloc(strlen(str) + (replaceLen - patLen) + 1);
memcpy(newstr, str, replace_pos);
memcpy(newstr + replace_pos, replace, replaceLen);
memcpy(newstr + replace_pos + replaceLen, str + replace_pos + patLen, tail_len+1);
free(str);
str = newstr;
}
return str;
}
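/* Example: replaceAll("store(a); store(b);", "store(", "stream(") returns a
   freshly malloc'ed "stream(a); stream(b);"; the caller owns the result and
   must free it. */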
#define LEN 1024
char line[LEN+10];
int main(int argc, char **argv) {
if (argc < 2) {
fprintf(stderr, "Usage : %s <Base type> <ISA> ...\n", argv[0]);
exit(-1);
}
const char *baseType = argv[1];
const int isastart = 2;
for(int config=0;config<CONFIGMAX;config++) {
#if ENABLE_STREAM == 0
if ((config & 1) != 0) continue;
#endif
for(int isa=isastart;isa<argc;isa++) {
char *isaString = argv[isa];
char configString[100];
sprintf(configString, "%d", config);
FILE *fpin = fopen("unroll0.org", "r");
if (fpin == NULL) { fprintf(stderr, "Cannot open unroll0.org\n"); exit(-1); }
sprintf(line, "unroll_%d_%s.c", config, isaString);
FILE *fpout = fopen(line, "w");
if (fpout == NULL) { fprintf(stderr, "Cannot open %s\n", line); exit(-1); }
fputs("#include \"vectortype.h\"\n\n", fpout);
fprintf(fpout, "extern %s ctbl_%s[];\n", baseType, baseType);
fprintf(fpout, "#define ctbl ctbl_%s\n\n", baseType);
for(;;) {
if (fgets(line, LEN, fpin) == NULL) break;
char *s;
if ((config & 1) == 0) {
char *s0 = replaceAll(line, "%ISA%", isaString);
s = replaceAll(s0, "%CONFIG%", configString);
free(s0);
} else {
char *s0 = replaceAll(line, "%ISA%", isaString);
char *s1 = replaceAll(s0, "%CONFIG%", configString);
char *s2 = replaceAll(s1, "store(", "stream(");
s = replaceAll(s2, "scatter(", "scstream(");
free(s0); free(s1); free(s2);
}
if ((config & 2) == 0) {
char *s0 = replaceAll(s, "#pragma", "//");
free(s);
s = s0;
}
if (config == 0) {
char *s0 = replaceAll(s, "#undef EMITREALSUB", "#define EMITREALSUB");
free(s);
s = s0;
}
fputs(s, fpout);
free(s);
}
fclose(fpin);
fclose(fpout);
}
}
}
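/* In short: for every (config, ISA) pair the tool reads the template
   unroll0.org, substitutes %ISA% and %CONFIG%, and writes
   unroll_<config>_<ISA>.c. For odd configs (CONFIG_STREAM set) plain
   store()/scatter() calls become the streaming stream()/scstream() variants,
   configs without the CONFIG_MT bit get their OpenMP #pragma lines commented
   out, and the realSub kernels are emitted only for config 0. */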

File diff suppressed because it is too large

View File

@@ -0,0 +1,145 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#ifndef __VECTORTYPE_H__
#define __VECTORTYPE_H__
#include <math.h>
#include "sleef.h"
#ifdef ENABLE_SSE2
#include "helpersse2.h"
#endif
#ifdef ENABLE_AVX
#include "helperavx.h"
#endif
#ifdef ENABLE_AVX2
#include "helperavx2.h"
#endif
#ifdef ENABLE_AVX512F
#include "helperavx512f.h"
#endif
#ifdef ENABLE_NEON32
#include "helperneon32.h"
#endif
#ifdef ENABLE_ADVSIMD
#include "helperadvsimd.h"
#endif
#ifdef ENABLE_SVE
#include "helpersve.h"
#endif
#if defined(ENABLE_RVVM1) || defined(ENABLE_RVVM2)
#include "helperrvv.h"
#endif
#ifdef ENABLE_VSX
#include "helperpower_128.h"
#endif
#ifdef ENABLE_VSX3
#include "helperpower_128.h"
#endif
#ifdef ENABLE_VXE
#include "helpers390x_128.h"
#endif
#ifdef ENABLE_VXE2
#include "helpers390x_128.h"
#endif
#ifdef ENABLE_VECEXT
#include "helpervecext.h"
#endif
#ifdef ENABLE_PUREC
#include "helperpurec.h"
#endif
#define IMPORT_IS_EXPORT
#include "sleefdft.h"
#if BASETYPEID == 1
#define LOG2VECWIDTH (LOG2VECTLENDP-1)
#define VECWIDTH (1 << LOG2VECWIDTH)
typedef double real;
typedef vdouble real2;
static int available(int name) { return vavailability_i(name); }
static INLINE real2 uminus(real2 d0) { return vneg_vd_vd(d0); }
static INLINE real2 uplusminus(real2 d0) { return vposneg_vd_vd(d0); }
static INLINE real2 uminusplus(real2 d0) { return vnegpos_vd_vd(d0); }
static INLINE real2 plus(real2 d0, real2 d1) { return vadd_vd_vd_vd(d0, d1); }
static INLINE real2 minus(real2 d0, real2 d1) { return vsub_vd_vd_vd(d0, d1); }
static INLINE real2 minusplus(real2 d0, real2 d1) { return vsubadd_vd_vd_vd(d0, d1); }
static INLINE real2 times(real2 d0, real2 d1) { return vmul_vd_vd_vd(d0, d1); }
static INLINE real2 timesminusplus(real2 d0, real2 d2, real2 d1) { return vmlsubadd_vd_vd_vd_vd(d0, d2, d1); }
static INLINE real2 ctimes(real2 d0, real d) { return vmul_vd_vd_vd(d0, vcast_vd_d(d)); }
static INLINE real2 ctimesminusplus(real2 d0, real c, real2 d1) { return vmlsubadd_vd_vd_vd_vd(d0, vcast_vd_d(c), d1); }
static INLINE real2 reverse(real2 d0) { return vrev21_vd_vd(d0); }
static INLINE real2 reverse2(real2 d0) { return vreva2_vd_vd(d0); }
static INLINE real2 loadc(real c) { return vcast_vd_d(c); }
static INLINE real2 load(const real *ptr, int offset) { return vload_vd_p(&ptr[2*offset]); }
static INLINE real2 loadu(const real *ptr, int offset) { return vloadu_vd_p(&ptr[2*offset]); }
static INLINE void store(real *ptr, int offset, real2 v) { vstore_v_p_vd(&ptr[2*offset], v); }
static INLINE void storeu(real *ptr, int offset, real2 v) { vstoreu_v_p_vd(&ptr[2*offset], v); }
static INLINE void stream(real *ptr, int offset, real2 v) { vstream_v_p_vd(&ptr[2*offset], v); }
static INLINE void scatter(real *ptr, int offset, int step, real2 v) { vscatter2_v_p_i_i_vd(ptr, offset, step, v); }
static INLINE void scstream(real *ptr, int offset, int step, real2 v) { vsscatter2_v_p_i_i_vd(ptr, offset, step, v); }
static INLINE void prefetch(real *ptr, int offset) { vprefetch_v_p(&ptr[2*offset]); }
#elif BASETYPEID == 2
#define LOG2VECWIDTH (LOG2VECTLENSP-1)
#define VECWIDTH (1 << LOG2VECWIDTH)
typedef float real;
typedef vfloat real2;
static int available(int name) { return vavailability_i(name); }
static INLINE real2 uminus(real2 d0) { return vneg_vf_vf(d0); }
static INLINE real2 uplusminus(real2 d0) { return vposneg_vf_vf(d0); }
static INLINE real2 uminusplus(real2 d0) { return vnegpos_vf_vf(d0); }
static INLINE real2 plus(real2 d0, real2 d1) { return vadd_vf_vf_vf(d0, d1); }
static INLINE real2 minus(real2 d0, real2 d1) { return vsub_vf_vf_vf(d0, d1); }
static INLINE real2 minusplus(real2 d0, real2 d1) { return vsubadd_vf_vf_vf(d0, d1); }
static INLINE real2 times(real2 d0, real2 d1) { return vmul_vf_vf_vf(d0, d1); }
static INLINE real2 ctimes(real2 d0, real d) { return vmul_vf_vf_vf(d0, vcast_vf_f(d)); }
static INLINE real2 timesminusplus(real2 d0, real2 d2, real2 d1) { return vmlsubadd_vf_vf_vf_vf(d0, d2, d1); }
static INLINE real2 ctimesminusplus(real2 d0, real c, real2 d1) { return vmlsubadd_vf_vf_vf_vf(d0, vcast_vf_f(c), d1); }
static INLINE real2 reverse(real2 d0) { return vrev21_vf_vf(d0); }
static INLINE real2 reverse2(real2 d0) { return vreva2_vf_vf(d0); }
static INLINE real2 loadc(real c) { return vcast_vf_f(c); }
static INLINE real2 load(const real *ptr, int offset) { return vload_vf_p(&ptr[2*offset]); }
static INLINE real2 loadu(const real *ptr, int offset) { return vloadu_vf_p(&ptr[2*offset]); }
static INLINE void store(real *ptr, int offset, real2 v) { vstore_v_p_vf(&ptr[2*offset], v); }
static INLINE void storeu(real *ptr, int offset, real2 v) { vstoreu_v_p_vf(&ptr[2*offset], v); }
static INLINE void stream(real *ptr, int offset, real2 v) { vstream_v_p_vf(&ptr[2*offset], v); }
static INLINE void scatter(real *ptr, int offset, int step, real2 v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
static INLINE void scstream(real *ptr, int offset, int step, real2 v) { vsscatter2_v_p_i_i_vf(ptr, offset, step, v); }
static INLINE void prefetch(real *ptr, int offset) { vprefetch_v_p(&ptr[2*offset]); }
#else
#error No BASETYPEID specified
#endif
#endif
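/* Illustrative use of the wrapper layer above (a sketch, not code from the
   library): with BASETYPEID == 1,
     real2 a = load(x, 0);
     store(y, 0, plus(a, a));
   loads one vector of packed complex doubles from x and stores its doubled
   value to y, compiled for whichever ISA the ENABLE_* macro selected. */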

View File

@@ -0,0 +1,16 @@
.PHONY: all
all : gencoef mkrempitab mkrempitabqp
gencoef : gencoef.c simplexfr.c sp.h dp.h ld.h qp.h
gcc -O gencoef.c simplexfr.c -o gencoef -lmpfr -lm
mkrempitab : mkrempitab.c
gcc -O mkrempitab.c -o mkrempitab -lmpfr
mkrempitabqp : mkrempitabqp.c
gcc -O mkrempitabqp.c -o mkrempitabqp -lmpfr
.PHONY: clean
clean :
rm -f gencoef gencoefdp gencoefld mkrempitab mkrempitabqp a.out *~
rm -f *.obj *.lib *.dll *.exp *.exe

View File

@@ -0,0 +1,196 @@
// This is part of SLEEF, written by Naoki
// Shibata. http://shibatch.sourceforge.net
// The code in this file is distributed under the Creative Commons
// Attribution 4.0 International License.
#define PREC_TARGET 53
#if 0
#define N 8 // Degree of equation
#define S 40 // Number of samples for phase 1
#define L 4 // Number of high precision coefficients
#define MIN 0.0 // Min argument
#define MAX (M_PI/4) // Max argument
#define PMUL 2 // The form of polynomial is y = x^(PADD+PMUL*0) + x^(PADD+PMUL*1) + ...
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_sin(ret, a, GMP_RNDN); } // The function to approximate
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0 // Fix coef 0 to 1.0
#endif
#if 0
#define N 10
#define S 40
#define L 2
#define MIN 0.0
#define MAX (M_PI/4)
void TARGET(mpfr_t ret, mpfr_t a) { // cos(x) - 1
mpfr_t x;
mpfr_init(x);
mpfr_cos(ret, a, GMP_RNDN);
mpfr_set_ld(x, 1, GMP_RNDN);
mpfr_sub(ret, ret, x, GMP_RNDN);
mpfr_clear(x);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define PMUL 2
#define PADD 2
#define FIXCOEF0 (-0.5)
#endif
#if 0 // for xsincospi4_u05
#define S 40
#define N 8
#define L 2
#define MIN 0.0
#define MAX 1.0
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) {
mpfr_t x, y;
mpfr_inits(x, y, NULL);
mpfr_const_pi(x, GMP_RNDN);
mpfr_set_d(y, 1.0/4, GMP_RNDN);
mpfr_mul(x, x, y, GMP_RNDN);
mpfr_mul(x, x, a, GMP_RNDN);
mpfr_sin(ret, x, GMP_RNDN);
mpfr_clears(x, y, NULL);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#endif
#if 0 // for xsincospi4_u05
#define N 8
#define S 40
#define L 2
#define MIN 0.0
#define MAX 1.0
void TARGET(mpfr_t ret, mpfr_t a) {
mpfr_t x, y;
mpfr_inits(x, y, NULL);
mpfr_const_pi(x, GMP_RNDN);
mpfr_set_d(y, 1.0/4, GMP_RNDN);
mpfr_mul(x, x, y, GMP_RNDN);
mpfr_mul(x, x, a, GMP_RNDN);
mpfr_cos(ret, x, GMP_RNDN);
mpfr_set_ld(x, 1, GMP_RNDN);
mpfr_sub(ret, ret, x, GMP_RNDN);
mpfr_clears(x, y, NULL);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define PMUL 2
#define PADD 2
#endif
#if 0 // for xsincospi4
#define N 7
#define S 40
#define L 0
#define MIN 0.0
#define MAX 1.0
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) {
mpfr_t x, y;
mpfr_inits(x, y, NULL);
mpfr_const_pi(x, GMP_RNDN);
mpfr_set_d(y, 1.0/4, GMP_RNDN);
mpfr_mul(x, x, y, GMP_RNDN);
mpfr_mul(x, x, a, GMP_RNDN);
mpfr_sin(ret, x, GMP_RNDN);
mpfr_clears(x, y, NULL);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#endif
#if 0
#define N 17
#define S 60
#define L 0
#define MIN 0.0
#define MAX (M_PI/4)
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_tan(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0
#endif
#if 0
#define N 11
#define S 35
#define L 2
#define MIN 1 //0.75
#define MAX 1.5
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_log(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t frd, mpfr_t fra) {
mpfr_t tmp, one;
mpfr_inits(tmp, one, NULL);
mpfr_set_d(one, 1, GMP_RNDN);
mpfr_add(tmp, fra, one, GMP_RNDN);
mpfr_sub(frd, fra, one, GMP_RNDN);
mpfr_div(frd, frd, tmp, GMP_RNDN);
mpfr_clears(tmp, one, NULL);
}
#define FIXCOEF0 2.0
#endif
#if 1
#define N 12
#define S 50
#define L 2
#define MIN -0.347
#define MAX 0.347 // 0.5 log 2
#define PMUL 1
#define PADD 0
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_exp(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0
#define FIXCOEF1 1.0
//#define FIXCOEF2 0.5
#endif
#if 0
#define N 21
#define S 100
#define L 1
#define P 1.1
#define MIN 0.0
#define MAX 1.0
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_atan(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0
#endif
#if 0
#define N 20
#define S 100
#define L 0
#define P 1.54
#define MIN 0.0
#define MAX 0.708
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_asin(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0
#endif

View File

@@ -0,0 +1,375 @@
// This is part of SLEEF, written by Naoki Shibata. http://shibatch.sourceforge.net
// Since the original code for simplex algorithm is developed by Haruhiko Okumura and
// the code is distributed under the Creative Commons Attribution 4.0 International License,
// the contents under this directory are also distributed under the same license.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
#include <float.h>
#include <time.h>
#include <mpfr.h>
//#include "sp.h"
#include "dp.h"
//#include "ld.h"
//#include "qp.h"
#undef VERBOSE
#define PREC 4096
#define EPS 1e-50
#define PREC2 (PREC_TARGET*4)
#ifndef P
#define P 1
#endif
#ifndef Q
#define Q 10000
#endif
void mpfr_zinit(mpfr_t m);
void regressMinRelError_fr(int n, int m, mpfr_t **x, mpfr_t *result);
char *mpfrToStr(mpfr_t m) {
mpfr_t fra;
mpfr_init2(fra, mpfr_get_prec(m));
mpfr_abs(fra, m, GMP_RNDN);
mpfr_exp_t e;
char *s = mpfr_get_str(NULL, &e, 10, 0, fra, GMP_RNDN);
char *ret = malloc(strlen(s) + 20);
if (mpfr_sgn(m) == -1) ret[0] = '-'; else ret[0] = '+';
ret[1] = '0';
ret[2] = '.';
strcpy(&ret[3], s);
mpfr_free_str(s);
char estr[10];
sprintf(estr, "e%+d", (int)e);
strcat(ret, estr);
mpfr_clears(fra, NULL);
return ret;
}
double countULP(mpfr_t d, mpfr_t c) {
mpfr_t fry, frw;
mpfr_inits(fry, frw, NULL);
double c2 = mpfr_get_d(c, GMP_RNDN);
if (c2 == 0 && mpfr_cmp_d(d, 0) != 0) { mpfr_clears(fry, frw, NULL); return 10000; }
long e;
mpfr_get_d_2exp(&e, c, GMP_RNDN);
mpfr_set_ui_2exp(frw, 1, e-PREC_TARGET, GMP_RNDN);
mpfr_sub(fry, d, c, GMP_RNDN);
mpfr_div(fry, fry, frw, GMP_RNDN);
double u = fabs(mpfr_get_d(fry, GMP_RNDN));
mpfr_clears(fry, frw, NULL);
return u;
}
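/* countULP above measures |d - c| in units of the last place of c at
   PREC_TARGET bits: frw is set to 2^(e - PREC_TARGET), one ULP of c, and the
   scaled difference is returned. With PREC_TARGET == 53 (dp.h), two adjacent
   doubles near 1.0 are reported as exactly 1 ULP apart. */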
void func(mpfr_t s, mpfr_t x, mpfr_t *coef, int n) {
mpfr_set_prec(s, PREC_TARGET);
mpfr_set(s, coef[n-1], GMP_RNDN);
for(int i=n-1;i>0;i--) {
if (i == L-1) {
mpfr_t t;
mpfr_init2(t, PREC2);
mpfr_set(t, s, GMP_RNDN);
mpfr_set_prec(s, PREC2);
mpfr_set(s, t, GMP_RNDN);
mpfr_clear(t);
}
mpfr_mul(s, s, x, GMP_RNDN);
mpfr_add(s, s, coef[i-1], GMP_RNDN);
}
}
int main(int argc, char **argv)
{
int i, j;
int n, m;
double p;
mpfr_set_default_prec(PREC);
#if 0
{
mpfr_t a, b;
mpfr_inits(a, b, NULL);
float x = M_PI;
mpfr_set_d(a, x, GMP_RNDN);
x = nexttowardf(x, 100);
x = nexttowardf(x, 100);
x = nexttowardf(x, 100);
mpfr_set_d(b, x, GMP_RNDN);
printf("%g\n", countULP(b, a));
mpfr_clears(a, b, NULL);
exit(0);
}
#endif
#if 0
{
mpfr_t a, b;
mpfr_inits(a, b, NULL);
double x = M_PI;
mpfr_set_d(a, x, GMP_RNDN);
x = nexttoward(x, 100);
x = nexttoward(x, 100);
x = nexttoward(x, 100);
mpfr_set_d(b, x, GMP_RNDN);
printf("%g\n", countULP(b, a));
mpfr_clears(a, b, NULL);
exit(0);
}
#endif
#if 0
{
mpfr_t a, b;
mpfr_inits(a, b, NULL);
long double x = M_PI;
mpfr_set_ld(a, x, GMP_RNDN);
x = nexttowardl(x, 100);
x = nexttowardl(x, 100);
x = nexttowardl(x, 100);
mpfr_set_ld(b, x, GMP_RNDN);
printf("%g\n", countULP(b, a));
mpfr_clears(a, b, NULL);
exit(0);
}
#endif
#if 0
{
mpfr_t a, b;
mpfr_inits(a, b, NULL);
__float128 x = M_PI;
mpfr_set_f128(a, x, GMP_RNDN);
x = nextafterq(x, 100);
x = nextafterq(x, 100);
x = nextafterq(x, 100);
mpfr_set_f128(b, x, GMP_RNDN);
printf("%g\n", countULP(b, a));
mpfr_clears(a, b, NULL);
exit(0);
}
#endif
m = N+1;
n = argc >= 2 ? atoi(argv[1]) : S;
p = argc >= 3 ? atof(argv[2]) : P;
mpfr_t **x, *result; // x[m][n], result[m]
x = calloc(sizeof(mpfr_t *), m);
result = calloc(sizeof(mpfr_t), m);
for(i=0;i<m;i++) {
x[i] = calloc(sizeof(mpfr_t), n);
for(j=0;j<n;j++) mpfr_zinit(x[i][j]);
mpfr_zinit(result[i]);
}
mpfr_t fra, frb, frc, frd, fre;
mpfr_zinit(fra);
mpfr_zinit(frb);
mpfr_zinit(frc);
mpfr_zinit(frd);
mpfr_zinit(fre);
for(i=0;i<n;i++) {
double b = 1.0 - pow((double)i / (n-1), p);
double a = ((double)MAX - MIN) * b + MIN;
mpfr_set_d(fra, a, GMP_RNDN);
CFUNC(frd, fra);
for(j=0;j<m-1;j++) {
mpfr_set_d(frb, (double)j*PMUL+PADD, GMP_RNDN);
mpfr_pow(x[j][i], frd, frb, GMP_RNDN);
//printf("%g ", mpfr_get_d(x[j][i], GMP_RNDN));
}
TARGET(x[m-1][i], fra);
//printf(" : %g\n", mpfr_get_d(x[m-1][i], GMP_RNDN));
}
for(i=0;i<m-1;i++) mpfr_set_d(result[i], 0, GMP_RNDN);
regressMinRelError_fr(n, m-1, x, result);
for(i=m-2;i>=0;i--) {
mpfr_set_prec(fra, PREC_TARGET+4);
mpfr_set(fra, result[i], GMP_RNDN);
char *s;
printf("%s, \n", s = mpfrToStr(fra));
free(s);
}
printf("\n");
mpfr_set_prec(fra, PREC);
double emax = 0;
for(i=0;i<=n*10;i++) {
double a = i * (double)(MAX - MIN) / (n*10.0) + MIN;
mpfr_set_d(fra, a, GMP_RNDN);
CFUNC(frd, fra);
mpfr_set_d(frb, 0, GMP_RNDN);
for(j=m-1;j>=0;j--) {
mpfr_set_d(frc, (double)j*PMUL+PADD, GMP_RNDN);
mpfr_pow(frc, frd, frc, GMP_RNDN);
mpfr_mul(frc, frc, result[j], GMP_RNDN);
mpfr_add(frb, frb, frc, GMP_RNDN);
}
TARGET(frc, fra);
double u = countULP(frb, frc);
if (u > emax) emax = u;
}
printf("Phase 1 : Max error = %g ULP\n\n", emax);
fflush(stdout);
//
mpfr_t bestcoef[N], curcoef[N];
for(i=0;i<N;i++) {
mpfr_init2(bestcoef[i], i >= L ? PREC_TARGET : PREC2);
mpfr_set(bestcoef[i], result[i], GMP_RNDN);
mpfr_init2(curcoef[i], i >= L ? PREC_TARGET : PREC2);
mpfr_set(curcoef[i], result[i], GMP_RNDN);
}
srandom(time(NULL));
mpfr_set_default_prec(PREC2);
static mpfr_t a[Q], v[Q], am[Q], aa[Q];
for(i=0;i<Q;i++) {
mpfr_inits(a[i], v[i], am[i], aa[i], NULL);
mpfr_set_d(fra, ((double)MAX - (double)MIN) * i / (double)(Q-1) + (double)MIN, GMP_RNDN);
TARGET(v[i], fra);
CFUNC(a[i], fra);
mpfr_set_d(frb, PMUL, GMP_RNDN);
mpfr_pow(am[i], a[i], frb, GMP_RNDN);
mpfr_set_d(frb, PADD, GMP_RNDN);
mpfr_pow(aa[i], a[i], frb, GMP_RNDN);
}
double best = 1e+100, bestsum = 1e+100, bestworstx;
for(int k=0;k<10000;k++) {
double emax = 0, esum = 0, worstx = 0;
#ifdef FIXCOEF0
mpfr_set_d(curcoef[0], FIXCOEF0, GMP_RNDN);
#endif
#ifdef FIXCOEF1
mpfr_set_d(curcoef[1], FIXCOEF1, GMP_RNDN);
#endif
#ifdef FIXCOEF2
mpfr_set_d(curcoef[2], FIXCOEF2, GMP_RNDN);
#endif
for(i=0;i<Q;i++) {
if (mpfr_cmp_d(v[i], 0) == 0) continue;
mpfr_set_d(frb, 0, GMP_RNDN);
for(j=N-1;j>=0;j--) {
mpfr_set_d(frc, (double)j*PMUL+PADD, GMP_RNDN);
mpfr_pow(frc, a[i], frc, GMP_RNDN);
mpfr_mul(frc, frc, curcoef[j], GMP_RNDN);
mpfr_add(frb, frb, frc, GMP_RNDN);
}
double e = countULP(frb, v[i]);
//printf("c = %.20g, t = %.20g, ulp = %g\n", mpfr_get_d(v[i], GMP_RNDN), mpfr_get_d(frb, GMP_RNDN), e);
if (!isfinite(e)) continue;
if (e > emax) { emax = e; worstx = mpfr_get_d(a[i], GMP_RNDN); }
esum += e;
}
mpfr_set_prec(frb, PREC);
//printf("emax = %g\n", emax);
if (emax < best || (emax == best && esum < bestsum)) {
for(i=0;i<N;i++) {
mpfr_set(bestcoef[i], curcoef[i], GMP_RNDN);
}
if (best == 1e+100 || k > 10) printf("Max error = %g ULP, Sum error = %g (Max error at %g)\n", emax, esum, worstx);
if ((best - emax) / best > 0.0001) k = 0;
best = emax;
bestsum = esum;
bestworstx = worstx;
}
for(i=0;i<N;i++) {
mpfr_set(curcoef[i], bestcoef[i], GMP_RNDN);
}
for(i=0;i<N;i++) {
static int tab[] = {0, 0, 0, 0, 0, 0, 1, -1};
//static int tab[] = {0, 0, 0, 0, 2, -2, 1, -1};
int r = tab[random() & 7];
if (r > 0) {
for(int j=0;j<r;j++) mpfr_nextabove(curcoef[i]);
} else if (r < 0) {
for(int j=0;j>r;j--) mpfr_nextbelow(curcoef[i]);
}
}
}
printf("\n");
for(i=N-1;i>=0;i--) {
mpfr_set_prec(fra, i >= L ? PREC_TARGET+4 : PREC2);
mpfr_set(fra, bestcoef[i], GMP_RNDN);
char *s;
printf("%s, \n", s = mpfrToStr(fra));
free(s);
}
printf("\nPhase 2 : max error = %g ULP at %g\n", best, bestworstx);
exit(0);
}

View File

@@ -0,0 +1,43 @@
With this small tool, the coefficients for polynomial approximation
used in kernels can be generated.
Usage
Edit the parameter header for the precision you need: dp.h for double
precision, sp.h for single precision, ld.h for long double and qp.h
for quad precision (gencoef.c includes the chosen header). At the
beginning of each header, specifications of the parameters for
generating coefficients are listed. Enable one of them by changing
#if. Then run make to compile the source code. Run gencoef, and it
will show the generated coefficients in a few minutes.
How it works
The program runs in two phases.
The first phase is a regression that minimizes the maximum relative
error. This problem can be reduced to a linear programming problem,
which this implementation solves with the Simplex method. This
requires multi-precision arithmetic, for which the implementation
uses the MPFR library. In this phase, only a small number of sample
values of the function to approximate (specified by the S macro,
usually 40 or so) are taken within the argument range. The function
to approximate is given by the TARGET function. Specifying a higher
value for S does not always give better results.
The second phase optimizes the coefficients so that the polynomial
gives good accuracy in double precision arithmetic. In this phase,
the program checks points within the specified argument range, whose
number is given by the Q macro (10000 by default), to see whether
the polynomial gives a good error bound. In some cases, the last few
terms have to be calculated in higher precision in order to achieve
1 ULP overall accuracy, and this implementation can take care of
that. The L parameter specifies the number of high precision
coefficients.
In some cases, it is desirable to fix the last few coefficients to
values like 1. This can be done by defining the FIXCOEF0 macro. This
sometimes does not work, however; in that case, you need to adjust
the definition of the function to approximate, as shown in the
definition for cos.
Finding a set of good parameters is not a straightforward process. You
usually need many iterations of trial and error.
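As a sketch of the phase-1 formulation (notation ours, not taken from
the source): with sample points x_i and target function f, the linear
program seeks coefficients minimizing the worst relative error over
the samples,

  \min_{c_0,\dots,c_{N-1}} \max_i \left| \frac{P(x_i) - f(x_i)}{f(x_i)} \right|,
  \qquad P(x) = \sum_{j=0}^{N-1} c_j \, x^{\mathrm{PADD} + \mathrm{PMUL} \cdot j},

which becomes linear once each absolute value is replaced by a pair
of nonnegative slack variables, one pair per sample.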

View File

@@ -0,0 +1,178 @@
// This is part of SLEEF, written by Naoki
// Shibata. http://shibatch.sourceforge.net
// The code in this file is distributed under the Creative Commons
// Attribution 4.0 International License.
#define PREC_TARGET 64
#if 0
#define N 8 // Degree of equation
#define S 40 // Number of samples for phase 1
#define L 4 // Number of high precision coefficients
#define MIN 0.0 // Min argument
#define MAX (M_PI/4) // Max argument
#define PMUL 2 // The form of polynomial is y = x^(PADD+PMUL*0) + x^(PADD+PMUL*1) + ...
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_sin(ret, a, GMP_RNDN); } // The function to approximate
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0 // Fix coef 0 to 1.0
#endif
#if 0
#define N 10
#define S 40
#define L 2
#define MIN 0.0
#define MAX (M_PI/4)
void TARGET(mpfr_t ret, mpfr_t a) { // cos(x) - 1
mpfr_t x;
mpfr_init(x);
mpfr_cos(ret, a, GMP_RNDN);
mpfr_set_ld(x, 1, GMP_RNDN);
mpfr_sub(ret, ret, x, GMP_RNDN);
mpfr_clear(x);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define PMUL 2
#define PADD 2
#define FIXCOEF0 (-0.5)
#endif
#if 0 // for xsincospi4_u05
#define N 9
#define S 40
#define L 2
#define MIN 0.0
#define MAX 1.0
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) {
mpfr_t x, y;
mpfr_inits(x, y, NULL);
mpfr_const_pi(x, GMP_RNDN);
mpfr_set_d(y, 1.0/4, GMP_RNDN);
mpfr_mul(x, x, y, GMP_RNDN);
mpfr_mul(x, x, a, GMP_RNDN);
mpfr_sin(ret, x, GMP_RNDN);
mpfr_clears(x, y, NULL);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#endif
#if 0 // for xsincospi4_u05
#define N 9
#define S 40
#define L 2
#define MIN 0.0
#define MAX 1.0
void TARGET(mpfr_t ret, mpfr_t a) { // cos(x) - 1
mpfr_t x, y;
mpfr_inits(x, y, NULL);
mpfr_const_pi(x, GMP_RNDN);
mpfr_set_d(y, 1.0/4, GMP_RNDN);
mpfr_mul(x, x, y, GMP_RNDN);
mpfr_mul(x, x, a, GMP_RNDN);
mpfr_cos(ret, x, GMP_RNDN);
mpfr_set_ld(x, 1, GMP_RNDN);
mpfr_sub(ret, ret, x, GMP_RNDN);
mpfr_clears(x, y, NULL);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define PMUL 2
#define PADD 2
#endif
#if 0 // for xsincospi4
#define N 7
#define S 40
#define L 0
#define MIN 0.0
#define MAX 1.0
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) {
mpfr_t x, y;
mpfr_inits(x, y, NULL);
mpfr_const_pi(x, GMP_RNDN);
mpfr_set_d(y, 1.0/4, GMP_RNDN);
mpfr_mul(x, x, y, GMP_RNDN);
mpfr_mul(x, x, a, GMP_RNDN);
mpfr_sin(ret, x, GMP_RNDN);
mpfr_clears(x, y, NULL);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#endif
#if 0
#define N 17
#define S 40
#define L 0
#define MIN 0.0
#define MAX (M_PI/4)
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_tan(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0
#endif
#if 0
#define N 9
#define S 40
#define L 2
#define MIN 1 //0.75
#define MAX 1.5
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_log(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t frd, mpfr_t fra) {
mpfr_t tmp, one;
mpfr_inits(tmp, one, NULL);
mpfr_set_d(one, 1, GMP_RNDN);
mpfr_add(tmp, fra, one, GMP_RNDN);
mpfr_sub(frd, fra, one, GMP_RNDN);
mpfr_div(frd, frd, tmp, GMP_RNDN);
mpfr_clears(tmp, one, NULL);
}
#define FIXCOEF0 2.0
#endif
#if 0
#define N 12
#define S 50
#define L 0
#define MIN -0.347
#define MAX 0.347 // 0.5 log 2
#define PMUL 1
#define PADD 0
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_exp(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0
#define FIXCOEF1 1.0
#define FIXCOEF2 0.5
#endif
#if 0
#define N 22
#define S 100
#define L 2
#define MIN 0.0
#define MAX 1.0
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_atan(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0
#endif

View File

@@ -0,0 +1,121 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <mpfr.h>
static int64_t doubleToRawLongBits(double d) {
union {
double f;
int64_t i;
} tmp;
tmp.f = d;
return tmp.i;
}
static double longBitsToDouble(int64_t i) {
union {
double f;
int64_t i;
} tmp;
tmp.i = i;
return tmp.f;
}
static double removelsb(double d) {
return longBitsToDouble(doubleToRawLongBits(d) & 0xfffffffffffffffeLL);
}
static int32_t floatToRawIntBits(float d) {
union {
float f;
int32_t i;
} tmp;
tmp.f = d;
return tmp.i;
}
static float intBitsToFloat(int32_t i) {
union {
float f;
int32_t i;
} tmp;
tmp.i = i;
return tmp.f;
}
static float removelsbf(float x) {
return intBitsToFloat(0xfffffffc & floatToRawIntBits(x));
}
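/* A reading of the helpers above (our interpretation, not stated in the
   source): removelsb/removelsbf zero the low mantissa bit(s) of the leading
   table chunks, so rpi0 + rpi1 + rpi2 + rpi3 reconstructs the stored value
   exactly and multiply-and-subtract steps against each chunk remain exact,
   as needed for Payne-Hanek style argument reduction. */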
int main(int argc, char **argv) {
mpfr_set_default_prec(2048);
mpfr_t pi, rpi, xrpi, x, y, z, r;
mpfr_inits(pi, rpi, xrpi, x, y, z, r, NULL);
mpfr_const_pi(pi, GMP_RNDN);
mpfr_set_d(x, 0.5, GMP_RNDN);
mpfr_div(rpi, x, pi, GMP_RNDN);
printf("NOEXPORT ALIGNED(64) const double rempitabdp[] = {\n");
for(int i=55;i<1024;i++) {
int M = i > 700 ? -64 : 0;
int ex = i - 53;
if (ex < -52) ex = -52;
mpfr_set_d(x, ldexp(1, ex), GMP_RNDN);
mpfr_mul(y, x, rpi, GMP_RNDN);
mpfr_frac(xrpi, y, GMP_RNDN);
mpfr_div(xrpi, xrpi, x, GMP_RNDN);
mpfr_set_exp(xrpi, mpfr_get_exp(xrpi) - M);
mpfr_set(x, xrpi, GMP_RNDN);
double rpi0 = removelsb(mpfr_get_d(x, GMP_RNDN));
mpfr_set_d(y, rpi0, GMP_RNDN);
mpfr_sub(x, x, y, GMP_RNDN);
double rpi1 = removelsb(mpfr_get_d(x, GMP_RNDN));
mpfr_set_d(y, rpi1, GMP_RNDN);
mpfr_sub(x, x, y, GMP_RNDN);
double rpi2 = removelsb(mpfr_get_d(x, GMP_RNDN));
mpfr_set_d(y, rpi2, GMP_RNDN);
mpfr_sub(x, x, y, GMP_RNDN);
double rpi3 = mpfr_get_d(x, GMP_RNDN);
printf(" %.20g, %.20g, %.20g, %.20g,\n", rpi0, rpi1, rpi2, rpi3);
}
printf("};\n\n");
printf("NOEXPORT ALIGNED(64) const float rempitabsp[] = {\n");
for(int i=25;i<128;i++) {
int M = i > 90 ? -64 : 0;
int ex = i - 23;
mpfr_set_d(x, ldexp(1, ex), GMP_RNDN);
mpfr_mul(y, x, rpi, GMP_RNDN);
mpfr_frac(xrpi, y, GMP_RNDN);
mpfr_div(xrpi, xrpi, x, GMP_RNDN);
mpfr_set_exp(xrpi, mpfr_get_exp(xrpi) - M);
mpfr_set(x, xrpi, GMP_RNDN);
float rpi20 = removelsbf(mpfr_get_d(x, GMP_RNDN));
mpfr_set_d(y, rpi20, GMP_RNDN);
mpfr_sub(x, x, y, GMP_RNDN);
float rpi21 = removelsbf(mpfr_get_d(x, GMP_RNDN));
mpfr_set_d(y, rpi21, GMP_RNDN);
mpfr_sub(x, x, y, GMP_RNDN);
float rpi22 = removelsbf(mpfr_get_d(x, GMP_RNDN));
mpfr_set_d(y, rpi22, GMP_RNDN);
mpfr_sub(x, x, y, GMP_RNDN);
float rpi23 = mpfr_get_d(x, GMP_RNDN);
printf(" %.10g, %.10g, %.10g, %.10g,\n", rpi20, rpi21, rpi22, rpi23);
}
printf("};\n");
}
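/* A hedged summary of the generator above: for every possible input exponent
   ex, the loop stores frac(2^ex / (2*pi)) / 2^ex, i.e. 1/(2*pi) with its
   leading bits removed (the discarded integer part contributes only whole
   periods), split into four double chunks per table row. */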

View File

@@ -0,0 +1,63 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stdint.h>
#include <math.h>
#include <mpfr.h>
#include <quadmath.h>
#define N 8
#define B 8
#define NCOL (53-B)
#define NROW ((16385+(53-B)*N-106)/NCOL+1)
static double *rempitabqp = NULL;
void generateRempitabqp() {
rempitabqp = calloc(16385-106+(53-B)*(N+1), sizeof(double));
int orgprec = mpfr_get_default_prec();
mpfr_set_default_prec(18000);
mpfr_t pi, m, n, o;
mpfr_inits(pi, m, n, o, NULL);
mpfr_const_pi(pi, GMP_RNDN);
mpfr_d_div(n, 0.5, pi, GMP_RNDN);
for(int e=106;e<16385+(53-B)*N;e++) {
mpfr_set(m, n, GMP_RNDN);
mpfr_set_ui_2exp(o, 1, -(113 - e), GMP_RNDN);
mpfr_mul(m, m, o, GMP_RNDN);
mpfr_frac(m, m, GMP_RNDN);
mpfr_set_ui_2exp(o, 1, (53-B), GMP_RNDN);
mpfr_mul(m, m, o, GMP_RNDN);
mpfr_trunc(m, m);
mpfr_set_ui_2exp(o, 1, 7-(53-B), GMP_RNDN);
mpfr_mul(m, m, o, GMP_RNDN);
int col = (e - 106) % NCOL;
int row = (e - 106) / NCOL;
rempitabqp[col * NROW + row] = mpfr_get_d(m, GMP_RNDN);
}
mpfr_clears(pi, m, n, o, NULL);
mpfr_set_default_prec(orgprec);
}
int main(int argc, char **argv) {
generateRempitabqp();
printf("NOEXPORT const double Sleef_rempitabqp[] = {\n ");
for(int i=0;i<16385-106+(53-B)*(N+1);i++) {
printf("%.20g, ", rempitabqp[i]);
if ((i & 3) == 3) printf("\n ");
}
printf("\n};\n");
}

View File

@@ -0,0 +1,161 @@
// This is part of SLEEF, written by Naoki
// Shibata. http://shibatch.sourceforge.net
// The code in this file is distributed under the Creative Commons
// Attribution 4.0 International License.
#define PREC_TARGET 113
//
#if 0
#define N 15 // Degree of equation
#define S 150 // Number of samples for phase 1
#define L 0 // Number of high precision coefficients
#define P 0.37
#define MIN 0.0 // Min argument
#define MAX (M_PI/2) // Max argument
#define PMUL 2 // The form of polynomial is y = x^(PADD+PMUL*0) + x^(PADD+PMUL*1) + ...
#define PADD 3
void TARGET(mpfr_t ret, mpfr_t a) { // The function to approximate
mpfr_sin(ret, a, GMP_RNDN);
mpfr_sub(ret, ret, a, GMP_RNDN); // ret = sin(a) - a
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#endif
#if 0
#define N 15
#define S 150
#define L 0
#define MIN 0.0
#define MAX (M_PI/2)
void TARGET(mpfr_t ret, mpfr_t a) { // cos(x) - 1
mpfr_t x;
mpfr_init(x);
mpfr_cos(ret, a, GMP_RNDN);
mpfr_set_ld(x, 1, GMP_RNDN);
mpfr_sub(ret, ret, x, GMP_RNDN);
mpfr_clear(x);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define PMUL 2
#define PADD 2
//#define FIXCOEF0 (-0.5)
#endif
#if 0 // for xsincospi4_u05
#define N 13
#define S 150
#define L 2
#define P 0.9
#define MIN 0.0
#define MAX 1.0
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) {
mpfr_t x, y;
mpfr_inits(x, y, NULL);
mpfr_const_pi(x, GMP_RNDN);
mpfr_set_d(y, 1.0/4, GMP_RNDN);
mpfr_mul(x, x, y, GMP_RNDN);
mpfr_mul(x, x, a, GMP_RNDN);
mpfr_sin(ret, x, GMP_RNDN);
mpfr_clears(x, y, NULL);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#endif
#if 0 // for xsincospi4_u05
#define N 13
#define S 150
#define L 2
#define MIN 0.0
#define MAX 1.0
void TARGET(mpfr_t ret, mpfr_t a) { // cos(x) - 1
mpfr_t x, y;
mpfr_inits(x, y, NULL);
mpfr_const_pi(x, GMP_RNDN);
mpfr_set_d(y, 1.0/4, GMP_RNDN);
mpfr_mul(x, x, y, GMP_RNDN);
mpfr_mul(x, x, a, GMP_RNDN);
mpfr_cos(ret, x, GMP_RNDN);
mpfr_set_ld(x, 1, GMP_RNDN);
mpfr_sub(ret, ret, x, GMP_RNDN);
mpfr_clears(x, y, NULL);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define PMUL 2
#define PADD 2
#endif
#if 0 // running
#define N 31
#define S 100
#define P 1.7
#define L 0
#define MIN 0.0
#define MAX (M_PI/4)
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_tan(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0
#endif
#if 0 // running
#define N 20
#define S 110
#define L 2
#define MIN 1 //0.75
#define MAX 1.5
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_log(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t frd, mpfr_t fra) {
mpfr_t tmp, one;
mpfr_inits(tmp, one, NULL);
mpfr_set_d(one, 1, GMP_RNDN);
mpfr_add(tmp, fra, one, GMP_RNDN);
mpfr_sub(frd, fra, one, GMP_RNDN);
mpfr_div(frd, frd, tmp, GMP_RNDN);
mpfr_clears(tmp, one, NULL);
}
#define FIXCOEF0 2.0
#endif
#if 1
#define N 22
#define S 140
#define L 2
#define MIN -0.347
#define MAX 0.347 // 0.5 log 2
#define PMUL 1
#define PADD 0
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_exp(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0
#define FIXCOEF1 1.0
//#define FIXCOEF2 0.5
#endif
#if 0 // running
#define N 45
#define S 100
#define P 1.55
#define L 2
#define MIN 0.0
#define MAX 1.0
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_atan(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0
#endif

View File

@@ -0,0 +1,459 @@
// The original code for simplex algorithm is taken from Haruhiko Okumura's book.
// https://oku.edu.mie-u.ac.jp/~okumura/algo/
// The code is distributed under the Creative Commons Attribution 4.0 International License.
// https://creativecommons.org/licenses/by/4.0/
// The code is modified by Naoki Shibata to process arbitrary precision numbers.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <float.h>
#include <time.h>
#include <mpfr.h>
#define PREC 4096
#define EPS 1e-50
#define OK 0
#define MAXIMIZABLE_TO_INFINITY 1
#define NOT_FEASIBLE 2
#define ERROR (-1)
#define NOP (-1)
#define EQU (0)
#define LEQ 1
#define GEQ 2
static int m, n, n1, n2, n3, jmax;
static int *col, *row, *nonzero_row, *inequality;
static mpfr_t **a, *c, **q, *pivotcolumn;
static mpfr_t zero, one, eps, minuseps, large;
void mpfr_zinit(mpfr_t m) {
mpfr_init(m);
mpfr_set_d(m, 0, GMP_RNDN);
}
static void init(int n0, int m0) {
int i, j;
m = m0; n = n0;
mpfr_init(zero); mpfr_set_d(zero, 0, GMP_RNDN);
mpfr_init(one); mpfr_set_d(one, 1, GMP_RNDN);
mpfr_init(eps);
mpfr_set_d(eps, EPS, GMP_RNDN);
mpfr_init(minuseps);
mpfr_set_d(minuseps, -EPS, GMP_RNDN);
mpfr_init(large);
mpfr_set_d(large, 1.0 / EPS, GMP_RNDN);
a = malloc(sizeof(mpfr_t *) * (m + 1));
for(i=0;i < m+1;i++) {
a[i] = malloc(sizeof(mpfr_t) * (n + 1));
for(j=0;j < (n+1);j++) {
mpfr_zinit(a[i][j]);
}
}
q = malloc(sizeof(mpfr_t *) * (m + 1));
for(i=0;i < m+1;i++) {
q[i] = malloc(sizeof(mpfr_t) * (m + 1));
for(j=0;j < m+1;j++) {
mpfr_zinit(q[i][j]);
}
}
c = malloc(sizeof(mpfr_t) * (n + 1));
for(j=0;j < (n+1);j++) {
mpfr_zinit(c[j]);
}
pivotcolumn = malloc(sizeof(mpfr_t) * (m + 1));
for(j=0;j < (m+1);j++) {
mpfr_zinit(pivotcolumn[j]);
}
col = calloc(m+1, sizeof(int));
row = calloc(n+2*m+1, sizeof(int));
nonzero_row = calloc(n+2*m+1, sizeof(int));
inequality = calloc(m+1, sizeof(int));
}
static void dispose() {
mpfr_clears(zero, one, eps, minuseps, large, (mpfr_ptr)0);
int i, j;
for(i=0;i < m+1;i++) {
for(j=0;j < m+1;j++) {
mpfr_clear(q[i][j]);
}
free(q[i]);
}
free(q);
for(i=0;i < m+1;i++) {
for(j=0;j < n+1;j++) {
mpfr_clear(a[i][j]);
}
free(a[i]);
}
free(a);
for(j=0;j < n+1;j++) {
mpfr_clear(c[j]);
}
free(c);
for(j=0;j < m+1;j++) {
mpfr_clear(pivotcolumn[j]);
}
free(pivotcolumn);
free(col);
free(row);
free(nonzero_row);
free(inequality);
}
static void prepare() {
int i;
n1 = n;
for (i = 1; i <= m; i++)
if (inequality[i] == GEQ) {
n1++; nonzero_row[n1] = i;
}
n2 = n1;
for (i = 1; i <= m; i++)
if (inequality[i] == LEQ) {
n2++; col[i] = n2;
nonzero_row[n2] = row[n2] = i;
}
n3 = n2;
for (i = 1; i <= m; i++)
if (inequality[i] != LEQ) {
n3++; col[i] = n3;
nonzero_row[n3] = row[n3] = i;
}
for (i = 0; i <= m; i++) {
mpfr_set_d(q[i][i], 1, GMP_RNDN);
}
}
static void tableau(mpfr_t ret, int i, int j) {
int k;
if (col[i] < 0) { mpfr_set_d(ret, 0, GMP_RNDN); return; }
if (j <= n) {
mpfr_t s;
mpfr_zinit(s);
mpfr_set_d(s, 0, GMP_RNDN);
mpfr_t *tab = malloc(sizeof(mpfr_t) * (m + 1));
mpfr_ptr *ptab = malloc(sizeof(mpfr_ptr) * (m + 1));
for (k = 0; k <= m; k++) {
mpfr_zinit(tab[k]);
ptab[k] = (mpfr_ptr)&tab[k];
mpfr_mul(tab[k], q[i][k], a[k][j], GMP_RNDN);
}
mpfr_sum(s, ptab, m+1, GMP_RNDN);
for (k = 0; k <= m; k++) {
mpfr_clear(tab[k]);
}
free(ptab);
free(tab);
mpfr_set(ret, s, GMP_RNDN);
mpfr_clear(s);
return;
}
mpfr_set(ret, q[i][nonzero_row[j]], GMP_RNDN);
if (j <= n1) { mpfr_neg(ret, ret, GMP_RNDN); return; }
if (j <= n2 || i != 0) return;
mpfr_add(ret, ret, one, GMP_RNDN);
return;
}
static void pivot(int ipivot, int jpivot) {
int i, j;
mpfr_t u;
mpfr_zinit(u);
mpfr_set(u, pivotcolumn[ipivot], GMP_RNDN);
for (j = 1; j <= m; j++) {
mpfr_div(q[ipivot][j], q[ipivot][j], u, GMP_RNDN);
}
for (i = 0; i <= m; i++)
if (i != ipivot) {
mpfr_set(u, pivotcolumn[i], GMP_RNDN);
for (j = 1; j <= m; j++) {
mpfr_fms(q[i][j], q[ipivot][j], u, q[i][j], GMP_RNDN);
mpfr_neg(q[i][j], q[i][j], GMP_RNDN);
}
}
row[col[ipivot]] = 0;
col[ipivot] = jpivot; row[jpivot] = ipivot;
mpfr_clear(u);
}
static int minimize() {
int i, ipivot, jpivot;
mpfr_t t, u;
mpfr_inits(t, u, (mpfr_ptr)0);
for (;;) {
for (jpivot = 1; jpivot <= jmax; jpivot++) {
if (row[jpivot] == 0) {
tableau(pivotcolumn[0], 0, jpivot);
if (mpfr_cmp(pivotcolumn[0], minuseps) < 0) break;
}
}
if (jpivot > jmax) {
mpfr_clears(t, u, (mpfr_ptr)0);
return 1;
}
mpfr_set(u, large, GMP_RNDN);
ipivot = 0;
for (i = 1; i <= m; i++) {
tableau(pivotcolumn[i], i, jpivot);
if (mpfr_cmp(pivotcolumn[i], eps) > 0) {
tableau(t, i, 0);
mpfr_div(t, t, pivotcolumn[i], GMP_RNDN);
if (mpfr_cmp(t, u) < 0) { ipivot = i; mpfr_set(u, t, GMP_RNDN); }
}
}
if (ipivot == 0) {
mpfr_clears(t, u, (mpfr_ptr)0);
return 0; // the objective function can be minimized to -infinity
}
pivot(ipivot, jpivot);
}
}
static int phase1() {
int i, j;
mpfr_t u;
mpfr_zinit(u);
jmax = n3;
for (i = 0; i <= m; i++) {
if (col[i] > n2) mpfr_set_d(q[0][i], -1, GMP_RNDN);
}
minimize();
tableau(u, 0, 0);
if (mpfr_cmp(u, minuseps) < 0) {
mpfr_clear(u);
return 0;
}
for (i = 1; i <= m; i++) {
if (col[i] > n2) {
col[i] = -1;
}
}
mpfr_set_d(q[0][0], 1, GMP_RNDN);
for (j = 1; j <= m; j++) mpfr_set_d(q[0][j], 0, GMP_RNDN);
for (i = 1; i <= m; i++) {
if ((j = col[i]) > 0 && j <= n && mpfr_cmp_d(c[j], 0) != 0) {
mpfr_set(u, c[j], GMP_RNDN);
for (j = 1; j <= m; j++) {
mpfr_fms(q[0][j], q[i][j], u, q[0][j], GMP_RNDN);
mpfr_neg(q[0][j], q[0][j], GMP_RNDN);
}
}
}
mpfr_clear(u);
return 1;
}
static int phase2() {
int j;
jmax = n2;
for (j = 0; j <= n; j++) {
mpfr_set(a[0][j], c[j], GMP_RNDN);
}
return minimize();
}
int solve_fr(mpfr_t *result, int n0, int m0, mpfr_t **a0, int *ineq0, mpfr_t *c0) {
int i,j;
m = m0; // number of inequations
n = n0+1; // number of variables
init(n, m);
mpfr_t csum;
mpfr_zinit(csum);
for(j=0;j<n0+1;j++) {
mpfr_set(c[j], c0[j], GMP_RNDN);
}
for(j=1;j<n0+1;j++) {
mpfr_add(csum, csum, c0[j], GMP_RNDN);
}
mpfr_set(c[n], csum, GMP_RNDN);
mpfr_neg(c[n], c[n], GMP_RNDN);
for(i=0;i<m;i++) {
mpfr_set_d(csum, 0, GMP_RNDN);
for(j=0;j<n0+1;j++) mpfr_set(a[i+1][j], a0[i][j], GMP_RNDN);
mpfr_neg(a[i+1][0], a[i+1][0], GMP_RNDN);
for(j=1;j<n0+1;j++) {
mpfr_add(csum, csum, a0[i][j], GMP_RNDN);
}
mpfr_set(a[i+1][n], csum, GMP_RNDN);
mpfr_neg(a[i+1][n], a[i+1][n], GMP_RNDN);
inequality[i+1] = ineq0[i];
if (mpfr_cmp_d(a[i+1][0], 0) < 0) {
if (inequality[i+1] == GEQ) inequality[i+1] = LEQ;
else if (inequality[i+1] == LEQ) inequality[i+1] = GEQ;
for (j = 0; j <= n; j++) mpfr_neg(a[i+1][j], a[i+1][j], GMP_RNDN);
} else if (mpfr_cmp_d(a[i+1][0], 0) == 0 && inequality[i+1] == GEQ) {
inequality[i+1] = LEQ;
for (j = 1; j <= n; j++) mpfr_neg(a[i+1][j], a[i+1][j], GMP_RNDN);
}
}
int p1r = 1;
prepare();
if (n3 != n2) p1r = phase1();
if (!p1r) {
dispose();
return NOT_FEASIBLE;
}
int b = phase2();
mpfr_t *s = calloc(sizeof(mpfr_t), n);
for(j=0;j<n;j++) {
mpfr_zinit(s[j]);
}
for (j = 1; j < n; j++) {
if ((i = row[j]) != 0) {
tableau(s[j], i, 0);
}
}
mpfr_t cs;
mpfr_zinit(cs);
if (row[n] != 0) tableau(cs, row[n], 0);
for (j = 1; j < n; j++) {
mpfr_sub(s[j], s[j], cs, GMP_RNDN);
}
for(j=0;j<n;j++) {
mpfr_set(result[j], s[j], GMP_RNDN);
}
mpfr_clear(cs);
for(j=0;j<n;j++) mpfr_clear(s[j]);
free(s);
dispose();
return b ? OK : MAXIMIZABLE_TO_INFINITY;
}
void regressMinRelError_fr(int n, int m, mpfr_t **x, mpfr_t *result) {
int m0 = n * 3, n0 = m + 2 * n, i, j;
mpfr_t **a0, *c0, *result0;
int in0[m0];
a0 = malloc(sizeof(mpfr_t *) * m0);
for(i=0;i<m0;i++) {
a0[i] = calloc(n0+1, sizeof(mpfr_t));
for(j=0;j<n0+1;j++) mpfr_zinit(a0[i][j]);
}
c0 = calloc(n0+1, sizeof(mpfr_t));
result0 = calloc(n0+1, sizeof(mpfr_t));
for(j=0;j<n0+1;j++) {
mpfr_zinit(c0[j]);
mpfr_zinit(result0[j]);
}
for(i=0;i<n;i++) {
long double ld = mpfr_get_ld(x[m][i], GMP_RNDN);
if (fabsl(ld) < DBL_MIN) ld = 1;
#if 1
mpfr_set_ld(c0[m+i +1], 1.0/fabsl(ld), GMP_RNDN);
mpfr_set_ld(c0[m+n+i+1], 1.0/fabsl(ld), GMP_RNDN);
#else
int e;
frexpl(ld, &e);
ld = 1.0 / ldexpl(1.0, e);
mpfr_set_ld(c0[m+i +1], ld, GMP_RNDN);
mpfr_set_ld(c0[m+n+i+1], ld, GMP_RNDN);
#endif
mpfr_set_d(a0[i*3+0][m+i+1], 1, GMP_RNDN);
in0[i*3+0] = GEQ;
mpfr_set_d(a0[i*3+1][m+n+i+1], 1, GMP_RNDN);
in0[i*3+1] = GEQ;
for(j=0;j<m;j++) {
mpfr_set(a0[i*3+2][j+1], x[j][i], GMP_RNDN);
}
mpfr_set_d(a0[i*3+2][m+i+1], 1, GMP_RNDN);
mpfr_set_d(a0[i*3+2][m+n+i+1], -1, GMP_RNDN);
in0[i*3+2] = EQU;
mpfr_set(a0[i*3+2][0], x[m][i], GMP_RNDN);
mpfr_neg(a0[i*3+2][0], a0[i*3+2][0], GMP_RNDN);
}
int status = solve_fr(result0, n0, m0, a0, in0, c0);
if (status == NOT_FEASIBLE) {
printf("not feasible\n");
} else {
if (status == MAXIMIZABLE_TO_INFINITY) printf("maximizable to inf\n");
}
for(i=0;i<m;i++) {
mpfr_set(result[i], result0[i+1], GMP_RNDN);
}
free(result0);
free(c0);
}
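/* Sketch of the LP encoding used above: each of the n sample points gets two
   slack variables (columns m+i+1 and m+n+i+1) with nonnegativity rows, plus
   one equality row  sum_j coef_j * x[j][i] + ep_i - em_i = f(x_i);  the
   objective weights both slacks by 1/|f(x_i)|, so minimizing it minimizes a
   relative-error measure of the fit aggregated over the samples. */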

View File

@@ -0,0 +1,159 @@
// This is part of SLEEF, written by Naoki
// Shibata. http://shibatch.sourceforge.net
// The code in this file is distributed under the Creative Commons
// Attribution 4.0 International License.
#define PREC_TARGET 24
#if 1
#define N 5 // Degree of equation
#define S 81 // Number of samples for phase 1
#define L 0 // Number of high precision coefficients
#define P 0.37
#define MIN 0.0 // Min argument
#define MAX (M_PI/2) // Max argument
#define PMUL 2 // The form of polynomial is y = x^(PADD+PMUL*0) + x^(PADD+PMUL*1) + ...
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_sin(ret, a, GMP_RNDN); } // The function to approximate
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0 // Fix coef 0 to 1.0
#endif
#if 0
#define N 5
#define S 40
#define L 0
#define MIN 0.0
#define MAX (M_PI/2)
void TARGET(mpfr_t ret, mpfr_t a) { // cos(x) - 1
mpfr_t x;
mpfr_init(x);
mpfr_cos(ret, a, GMP_RNDN);
mpfr_set_ld(x, 1, GMP_RNDN);
mpfr_sub(ret, ret, x, GMP_RNDN);
mpfr_clear(x);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define PMUL 2
#define PADD 2
#define FIXCOEF0 (-0.5)
#endif
#if 0
// xsincospi4
#define N 5
#define S 30
#define P 0.69
#define L 2
#define MIN 0.0
#define MAX 1.0
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) {
mpfr_t x, y;
mpfr_inits(x, y, NULL);
mpfr_const_pi(x, GMP_RNDN);
mpfr_set_d(y, 1.0/4, GMP_RNDN);
mpfr_mul(x, x, y, GMP_RNDN);
mpfr_mul(x, x, a, GMP_RNDN);
mpfr_sin(ret, x, GMP_RNDN);
mpfr_clears(x, y, NULL);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#endif
#if 0
// xsincospi4
#define N 5
#define S 60
#define P 0.7
#define L 1
#define MIN 0.0
#define MAX 1.0
void TARGET(mpfr_t ret, mpfr_t a) {
mpfr_t x, y;
mpfr_inits(x, y, NULL);
mpfr_const_pi(x, GMP_RNDN);
mpfr_set_d(y, 1.0/4, GMP_RNDN);
mpfr_mul(x, x, y, GMP_RNDN);
mpfr_mul(x, x, a, GMP_RNDN);
mpfr_cos(ret, x, GMP_RNDN);
mpfr_set_ld(x, 1, GMP_RNDN);
mpfr_sub(ret, ret, x, GMP_RNDN);
mpfr_clears(x, y, NULL);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define PMUL 2
#define PADD 2
#endif
#if 0
#define N 7
#define S 40
#define L 2
#define MIN 0.0
#define MAX (M_PI/4)
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_tan(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0
#endif
#if 0
#define N 5
#define S 40
#define L 2
#define MIN 1 //0.75
#define MAX 1.5
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_log(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t frd, mpfr_t fra) {
mpfr_t tmp, one;
mpfr_inits(tmp, one, NULL);
mpfr_set_d(one, 1, GMP_RNDN);
mpfr_add(tmp, fra, one, GMP_RNDN);
mpfr_sub(frd, fra, one, GMP_RNDN);
mpfr_div(frd, frd, tmp, GMP_RNDN);
mpfr_clears(tmp, one, NULL);
}
#define FIXCOEF0 2.0
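// Note: CFUNC maps x to s = (x-1)/(x+1). Since log(x) = 2*atanh(s)
// = 2s + (2/3)s^3 + (2/5)s^5 + ..., the odd-power fit (PADD=1, PMUL=2)
// with coefficient 0 fixed to 2.0 pins the leading term of that series.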
#endif
#if 0
#define N 7
#define S 50
#define L 0
#define MIN -0.347
#define MAX 0.347 // 0.5 log 2
#define PMUL 1
#define PADD 0
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_exp(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0
#define FIXCOEF1 1.0
//#define FIXCOEF2 0.5
#endif
#if 0
#define N 10
#define S 100
#define L 2
#define MIN 0.0
#define MAX 1.0
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_atan(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0
#endif

View File

@@ -0,0 +1,153 @@
ICCAVAILABLE := $(shell command -v icc 2> /dev/null)
ARCH := $(shell uname -p)
all :
ifndef BUILDDIR
@echo
@echo Please set the build directory in the BUILDDIR environment variable and run make again.
@echo e.g. export BUILDDIR='`pwd`'/../../build
@echo
else
@echo
@echo You can start measurement by "'"make measure"'".
ifdef ICCAVAILABLE
@echo You can start measurement with SVML by "'"make measureSVML"'".
endif
@echo Then, you can plot the results of measurement by "'"make plot"'".
@echo
@echo You have to install java and gnuplot to do plotting.
@echo Stop all tasks on the computer before starting measurement.
@echo
endif
benchsvml128_10.o : benchsvml128.c bench.h
-command -v icc >/dev/null 2>&1 && icc benchsvml128.c -Wall -I.. -DSVMLULP=1 -fimf-max-error=1.0 -fimf-domain-exclusion=0 -march=core-avx2 -O0 -lm -c -o benchsvml128_10.o
benchsvml128_40.o : benchsvml128.c bench.h
-command -v icc >/dev/null 2>&1 && icc benchsvml128.c -Wall -I.. -DSVMLULP=4 -fimf-max-error=4.0 -fimf-domain-exclusion=0 -march=core-avx2 -O0 -lm -c -o benchsvml128_40.o
benchsvml256_10.o : benchsvml256.c bench.h
-command -v icc >/dev/null 2>&1 && icc benchsvml256.c -Wall -I.. -DSVMLULP=1 -fimf-max-error=1.0 -fimf-domain-exclusion=0 -march=core-avx2 -O0 -lm -c -o benchsvml256_10.o
benchsvml256_40.o : benchsvml256.c bench.h
-command -v icc >/dev/null 2>&1 && icc benchsvml256.c -Wall -I.. -DSVMLULP=4 -fimf-max-error=4.0 -fimf-domain-exclusion=0 -march=core-avx2 -O0 -lm -c -o benchsvml256_40.o
benchsvml512_10.o : benchsvml512.c bench.h
-command -v icc >/dev/null 2>&1 && icc benchsvml512.c -Wall -I.. -DSVMLULP=1 -fimf-max-error=1.0 -fimf-domain-exclusion=0 -xCOMMON-AVX512 -O0 -lm -c -o benchsvml512_10.o
benchsvml512_40.o : benchsvml512.c bench.h
-command -v icc >/dev/null 2>&1 && icc benchsvml512.c -Wall -I.. -DSVMLULP=4 -fimf-max-error=4.0 -fimf-domain-exclusion=0 -xCOMMON-AVX512 -O0 -lm -c -o benchsvml512_40.o
benchsvml_10 : benchsvml.c benchsvml128_10.o benchsvml256_10.o benchsvml512_10.o bench.h
-command -v icc >/dev/null 2>&1 && icc benchsvml.c benchsvml128_10.o benchsvml256_10.o benchsvml512_10.o -Wall -I.. -DSVMLULP=1 -fimf-max-error=1.0 -fimf-domain-exclusion=0 -O0 -march=native -lm -o benchsvml_10
benchsvml_40 : benchsvml.c benchsvml128_40.o benchsvml256_40.o benchsvml512_40.o bench.h
-command -v icc >/dev/null 2>&1 && icc benchsvml.c benchsvml128_40.o benchsvml256_40.o benchsvml512_40.o -Wall -I.. -DSVMLULP=4 -fimf-max-error=4.0 -fimf-domain-exclusion=0 -O0 -march=native -lm -o benchsvml_40
#
ifeq ($(ARCH),aarch64)
benchsleef : benchsleef.c benchsleef128.o bench.h
$(CC) benchsleef.c benchsleef128.o -Wall -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -lsleef -lm -o benchsleef
benchsleef128.o : benchsleef128.c bench.h
$(CC) benchsleef128.c -Wall -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
else ifeq ($(ARCH),s390x)
benchsleef : benchsleef.c benchsleef128.o bench.h
$(CC) benchsleef.c benchsleef128.o -Wall -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -lsleef -lm -o benchsleef
benchsleef128.o : benchsleef128.c bench.h
$(CC) benchsleef128.c -Wall -mzvector -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
else ifeq ($(ARCH),ppc64le)
benchsleef : benchsleef.c benchsleef128.o bench.h
$(CC) benchsleef.c benchsleef128.o -Wall -mcpu=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -lsleef -lm -o benchsleef
benchsleef128.o : benchsleef128.c bench.h
$(CC) benchsleef128.c -Wall -mcpu=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
else
benchsleef : benchsleef.c benchsleef128.o benchsleef256.o benchsleef512.o bench.h
$(CC) benchsleef.c benchsleef128.o benchsleef256.o benchsleef512.o -Wall -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -lsleef -lm -o benchsleef
benchsleef128.o : benchsleef128.c bench.h
$(CC) benchsleef128.c -Wall -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
benchsleef256.o : benchsleef256.c bench.h
$(CC) benchsleef256.c -Wall -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
benchsleef512.o : benchsleef512.c bench.h
$(CC) benchsleef512.c -Wall -mavx512f -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
endif
#
ProcessData.class : ProcessData.java
javac ProcessData.java
#
ifndef BUILDDIR
measure :
@echo
@echo Please set the build directory in the BUILDDIR environment variable and run make again.
@echo e.g. export BUILDDIR='`pwd`'/../../build
@echo
else
measure : benchsleef
chmod +x ./measure.sh
LD_LIBRARY_PATH=$(BUILDDIR)/lib ./measure.sh ./benchsleef
@echo
@echo Now, you can plot the results of measurement by "'"make plot"'".
@echo You can do another measurement by "'"make measure"'".
ifdef ICCAVAILABLE
@echo You can start another measurement with SVML by "'"make measureSVML"'".
endif
@echo You can start over by "'"make restart"'".
@echo
endif
measureSVML : all benchsvml_10 benchsvml_40
chmod +x ./measure.sh
./measure.sh ./benchsvml_10 ./benchsvml_40
@echo
@echo Now, you can plot the results of measurement by "'"make plot"'".
@echo You can do another measurement by "'"make measure"'".
ifdef ICCAVAILABLE
@echo You can start another measurement with SVML by "'"make measureSVML"'".
endif
@echo You can start over by "'"make restart"'".
@echo
plot : ProcessData.class counter.txt
java ProcessData *dptrig*.out
gnuplot script.out
mv output.png trigdp.png
java ProcessData *dpnontrig*.out
gnuplot script.out
mv output.png nontrigdp.png
java ProcessData *sptrig*.out
gnuplot script.out
mv output.png trigsp.png
java ProcessData *spnontrig*.out
gnuplot script.out
mv output.png nontrigsp.png
@echo
@echo Plotted results are in trigdp.png, nontrigdp.png, trigsp.png and nontrigsp.png.
@echo
clean :
rm -f *~ a.out *.so *.so.* *.a *.s *.o
rm -rf *.dSYM *.dylib
rm -f *.obj *.lib *.dll *.exp *.exe *.stackdump
rm -f *.class *.png benchsleef benchsvml_10 benchsvml_40 *.out counter.txt
restart :
rm -f *.out counter.txt
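#
# Typical session (an illustrative sketch; the path is an example):
#   export BUILDDIR=`pwd`/../../build
#   make measure
#   make plot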

View File

@@ -0,0 +1,193 @@
import java.util.*;
import java.io.*;
public class ProcessData {
static final int DP = 64, SP = 32;
static LinkedHashMap<String, Integer> funcNameOrder = new LinkedHashMap<String, Integer>();
static class Key {
final String funcName;
final int prec, bits;
final ArrayList<Double> range = new ArrayList<Double>();
final double ulps;
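// Parses one line of measurement output, e.g. (a hypothetical sample
// matching the fprintf format in bench.h):
//   sin, DP, 128, 0, 6.28, 1ulps, 0.00512
// fields: function name, precision, vector bits, range bounds, ULP bound, time.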
Key(String s) {
String[] a = s.split(",");
funcName = a[0].trim();
if (funcNameOrder.get(funcName) == null) {
funcNameOrder.put(funcName, funcNameOrder.size());
}
prec =
a[1].trim().equals("DP") ? DP :
a[1].trim().equals("SP") ? SP :
0;
bits = Integer.parseInt(a[2].trim());
int c;
for(c = 3;;c++) {
if (a[c].trim().endsWith("ulps")) break;
range.add(Double.parseDouble(a[c]));
}
ulps = Double.parseDouble(a[c].trim().replace("ulps", ""));
}
public int hashCode() {
int h = funcName.hashCode();
h ^= prec ^ bits;
return h;
}
public boolean equals(Object o) {
if (this == o) return true;
if (!(o instanceof Key)) return false;
Key k = (Key) o;
if (funcName.compareTo(k.funcName) != 0) return false;
if (prec != k.prec) return false;
if (bits != k.bits) return false;
if (range.size() != k.range.size()) return false;
for(int i=0;i<range.size();i++) {
if ((double)range.get(i) != (double)k.range.get(i)) return false;
}
if (ulps != k.ulps) return false;
return true;
}
public String toString() {
String s = funcName + " ";
s += prec == DP ? "DP " : "SP ";
s += bits + "bit ";
s += String.format(" %.0fulp ", ulps);
for(int i=0;i<range.size();i+=2) {
s += "[" + String.format("%.3g", range.get(i)) + ", " + String.format("%.3g", range.get(i+1)) + "]";
if (i + 2 < range.size()) s += " ";
}
return s;
}
}
static class KeyComparator implements Comparator<Key> {
public int compare(Key d0, Key d1) {
if (d0 == d1) return 0;
if (d0.prec < d1.prec) return 1;
if (d0.prec > d1.prec) return -1;
if (d0.ulps > d1.ulps) return 1;
if (d0.ulps < d1.ulps) return -1;
int fc = (int)funcNameOrder.get(d0.funcName) - (int)funcNameOrder.get(d1.funcName);
if (fc != 0) return fc;
if (d0.bits > d1.bits) return 1;
if (d0.bits < d1.bits) return -1;
if (d0.range.size() > d1.range.size()) return 1;
if (d0.range.size() < d1.range.size()) return -1;
for(int i=0;i<d0.range.size();i++) {
if (d0.range.get(i) > d1.range.get(i)) return 1;
if (d0.range.get(i) < d1.range.get(i)) return -1;
}
return 0;
}
}
public static void main(String[] args) throws Exception {
LinkedHashMap<Key, LinkedHashMap<String, Double>> allData = new LinkedHashMap<Key, LinkedHashMap<String, Double>>();
TreeSet<Key> allKeys = new TreeSet<Key>(new KeyComparator());
LinkedHashSet<String> allColumnTitles = new LinkedHashSet<String>();
double maximum = 0;
for(int i=0;i<args.length;i++) {
LineNumberReader lnr = new LineNumberReader(new FileReader(args[i]));
String columnTitle = lnr.readLine();
allColumnTitles.add(columnTitle);
for(;;) {
String s = lnr.readLine();
if (s == null) break;
Key key = new Key(s);
allKeys.add(key);
LinkedHashMap<String, Double> v = allData.get(key);
if (v == null) {
v = new LinkedHashMap<String, Double>();
allData.put(key, v);
}
String[] a = s.split(",");
double time = Double.parseDouble(a[a.length-1]);
v.put(columnTitle, time);
maximum = Math.max(maximum, time);
}
lnr.close();
}
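// Write the merged table: one row per key, one column per input file.
// A missing measurement is emitted as 0 so the histogram clusters stay aligned.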
PrintStream ps = new PrintStream("data.out");
for(Key k : allKeys) {
ps.print("\"" + k + "\" ");
LinkedHashMap<String, Double> v = allData.get(k);
for(String s : allColumnTitles) {
Double d = v.get(s);
ps.print(d != null ? d.toString() : "0");
ps.print("\t");
}
ps.println();
}
ps.close();
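// Emit a gnuplot script drawing one clustered-histogram bar per input file
// (column) for every function/precision/range key in data.out.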
ps = new PrintStream("script.out");
ps.println("set terminal pngcairo size 1280, 800 font \",10\"");
ps.println("set output \"output.png\"");
ps.println("color00 = \"#FF5050\";"); // red
ps.println("color01 = \"#0066FF\";"); // blue
ps.println("color02 = \"#00FF00\";"); // green
ps.println("color03 = \"#FF9900\";"); // orange
ps.println("color04 = \"#CC00CC\";"); // purple
ps.println("color05 = \"#880000\";"); // brown
ps.println("color06 = \"#003300\";"); // dark green
ps.println("color07 = \"#000066\";"); // dark blue
ps.println("set style data histogram");
ps.println("set style histogram cluster gap 1");
ps.println("set style fill solid 1.00");
ps.println("set boxwidth 0.9");
ps.println("set xtics format \"\"");
ps.println("set xtics rotate by -90");
ps.println("set grid ytics");
ps.println("set ylabel \"Execution time in micro sec.\"");
ps.println("set yrange [0:*]");
ps.println("set bmargin 24");
ps.println("set title \"Single execution time in micro sec.\"");
ps.print("plot");
int i = 0;
for(String s : allColumnTitles) {
ps.print("\"data.out\" using " + (i+2) + ":xtic(1) title \"" + s +
"\" linecolor rgb color" + String.format("%02d", i));
if (i != allColumnTitles.size()-1) ps.print(", ");
i++;
}
ps.println();
ps.close();
}
}

View File

@@ -0,0 +1,58 @@
#define NITER1 100000
#define NITER2 10000
#define NITER (NITER1 * NITER2)
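// NITER1 * NITER2 = 1e9 calls per benchmarked function; each macro below
// reports the mean time of one call in microseconds.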
#define callFuncSLEEF1_1(funcName, name, xmin, xmax, ulp, arg, type) ({ \
printf("%s\n", #funcName); \
uint64_t t = Sleef_currentTimeMicros(); \
for(int j=0;j<NITER2;j++) { \
type *p = (type *)(arg); \
for(int i=0;i<NITER1;i++) funcName(*p++); \
} \
fprintf(fp, name ", %.3g, %.3g, %gulps, %g\n", \
(double)xmin, (double)xmax, ulp, (double)(Sleef_currentTimeMicros() - t) / NITER); \
})
#define callFuncSLEEF1_2(funcName, name, xmin, xmax, ymin, ymax, ulp, arg1, arg2, type) ({ \
printf("%s\n", #funcName); \
uint64_t t = Sleef_currentTimeMicros(); \
for(int j=0;j<NITER2;j++) { \
type *p1 = (type *)(arg1), *p2 = (type *)(arg2); \
for(int i=0;i<NITER1;i++) funcName(*p1++, *p2++); \
} \
fprintf(fp, name ", %.3g, %.3g, %.3g, %.3g, %gulps, %g\n", \
(double)xmin, (double)xmax, (double)ymin, (double)ymax, ulp, (double)(Sleef_currentTimeMicros() - t) / NITER); \
})
#define callFuncSVML1_1(funcName, name, xmin, xmax, arg, type) ({ \
printf("%s\n", #funcName); \
uint64_t t = Sleef_currentTimeMicros(); \
for(int j=0;j<NITER2;j++) { \
type *p = (type *)(arg); \
for(int i=0;i<NITER1;i++) funcName(*p++); \
} \
fprintf(fp, name ", %.3g, %.3g, %gulps, %g\n", \
(double)xmin, (double)xmax, (double)SVMLULP, (double)(Sleef_currentTimeMicros() - t) / NITER); \
})
#define callFuncSVML2_1(funcName, name, xmin, xmax, arg, type) ({ \
printf("%s\n", #funcName); \
uint64_t t = Sleef_currentTimeMicros(); \
for(int j=0;j<NITER2;j++) { \
type *p = (type *)(arg), c; \
for(int i=0;i<NITER1;i++) funcName(&c, *p++); \
} \
fprintf(fp, name ", %.3g, %.3g, %gulps, %g\n", \
(double)xmin, (double)xmax, (double)SVMLULP, (double)(Sleef_currentTimeMicros() - t) / NITER); \
})
#define callFuncSVML1_2(funcName, name, xmin, xmax, ymin, ymax, arg1, arg2, type) ({ \
printf("%s\n", #funcName); \
uint64_t t = Sleef_currentTimeMicros(); \
for(int j=0;j<NITER2;j++) { \
type *p1 = (type *)(arg1), *p2 = (type *)(arg2); \
for(int i=0;i<NITER1;i++) funcName(*p1++, *p2++); \
} \
fprintf(fp, name ", %.3g, %.3g, %.3g, %.3g, %gulps, %g\n", \
(double)xmin, (double)xmax, (double)ymin, (double)ymax, (double)SVMLULP, (double)(Sleef_currentTimeMicros() - t) / NITER); \
})

View File

@@ -0,0 +1,144 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <math.h>
#include <time.h>
#include <sleef.h>
#include "bench.h"
int veclen = 16;
double *abufdp, *bbufdp;
float *abufsp, *bbufsp;
FILE *fp;
#if defined(__i386__) || defined(__x86_64__)
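// CPUID leaf 1 reports AVX in ECX bit 28; leaf 7 (subleaf 0) reports
// AVX-512 Foundation in EBX bit 16. The helpers below test exactly those bits.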
void x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) {
uint32_t a, b, c, d;
__asm__ __volatile__ ("cpuid" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "a" (eax), "c"(ecx));
out[0] = a; out[1] = b; out[2] = c; out[3] = d;
}
int cpuSupportsAVX() {
int32_t reg[4];
x86CpuID(reg, 1, 0);
return (reg[2] & (1 << 28)) != 0;
}
int cpuSupportsAVX512F() {
int32_t reg[4];
x86CpuID(reg, 7, 0);
return (reg[1] & (1 << 16)) != 0;
}
#endif
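// Fill buf with uniform random values spanning [min, max]; two random()
// draws are combined so the mantissa receives more than the 31 bits a
// single draw provides.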
void fillDP(double *buf, double min, double max) {
for(int i=0;i<NITER1*veclen;i++) {
double r = ((double)random() + RAND_MAX * (double)random()) / (RAND_MAX * (double)RAND_MAX);
buf[i] = r * (max - min) + min;
}
}
void fillSP(float *buf, double min, double max) {
for(int i=0;i<NITER1*veclen;i++) {
double r = ((double)random() + RAND_MAX * (double)random()) / (RAND_MAX * (double)RAND_MAX);
buf[i] = r * (max - min) + min;
}
}
void benchSleef128_DPTrig();
void benchSleef256_DPTrig();
void benchSleef512_DPTrig();
void benchSleef128_DPNontrig();
void benchSleef256_DPNontrig();
void benchSleef512_DPNontrig();
void benchSleef128_SPTrig();
void benchSleef256_SPTrig();
void benchSleef512_SPTrig();
void benchSleef128_SPNontrig();
void benchSleef256_SPNontrig();
void benchSleef512_SPNontrig();
//
int main(int argc, char **argv) {
char *columnTitle = "SLEEF", *fnBase = "sleef";
char fn[1024];
if (argc != 1) columnTitle = argv[1];
if (argc >= 3) fnBase = argv[2];
srandom(time(NULL));
#if defined(__i386__) || defined(__x86_64__)
int do128bit = 1;
int do256bit = cpuSupportsAVX();
int do512bit = cpuSupportsAVX512F();
#elif defined(__ARM_NEON) || defined(__VSX__) || defined(__VX__)
int do128bit = 1;
#else
#error Unsupported architecture
#endif
posix_memalign((void **)&abufdp, veclen*sizeof(double), NITER1*veclen*sizeof(double));
posix_memalign((void **)&bbufdp, veclen*sizeof(double), NITER1*veclen*sizeof(double));
abufsp = (float *)abufdp;
bbufsp = (float *)bbufdp;
sprintf(fn, "%sdptrig.out", fnBase);
fp = fopen(fn, "w");
fprintf(fp, "%s\n", columnTitle);
if (do128bit) benchSleef128_DPTrig();
#if defined(__i386__) || defined(__x86_64__)
if (do256bit) benchSleef256_DPTrig();
if (do512bit) benchSleef512_DPTrig();
#endif
fclose(fp);
sprintf(fn, "%sdpnontrig.out", fnBase);
fp = fopen(fn, "w");
fprintf(fp, "%s\n", columnTitle);
if (do128bit) benchSleef128_DPNontrig();
#if defined(__i386__) || defined(__x86_64__)
if (do256bit) benchSleef256_DPNontrig();
if (do512bit) benchSleef512_DPNontrig();
#endif
fclose(fp);
sprintf(fn, "%ssptrig.out", fnBase);
fp = fopen(fn, "w");
fprintf(fp, "%s\n", columnTitle);
if (do128bit) benchSleef128_SPTrig();
#if defined(__i386__) || defined(__x86_64__)
if (do256bit) benchSleef256_SPTrig();
if (do512bit) benchSleef512_SPTrig();
#endif
fclose(fp);
sprintf(fn, "%sspnontrig.out", fnBase);
fp = fopen(fn, "w");
fprintf(fp, "%s\n", columnTitle);
if (do128bit) benchSleef128_SPNontrig();
#if defined(__i386__) || defined(__x86_64__)
if (do256bit) benchSleef256_SPNontrig();
if (do512bit) benchSleef512_SPNontrig();
#endif
fclose(fp);
exit(0);
}
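// Usage sketch (both arguments are optional and positional):
//   ./benchsleef                  -> column title "SLEEF", output files sleef*.out
//   ./benchsleef "SLEEF 3.6" s36  -> custom column title and s36*.out file prefix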

View File

@@ -0,0 +1,195 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <math.h>
#include <time.h>
#include <sleef.h>
void fillDP(double *buf, double min, double max);
void fillSP(float *buf, double min, double max);
extern char x86BrandString[256], versionString[1024];
extern int veclen;
extern double *abufdp, *bbufdp;
extern float *abufsp, *bbufsp;
extern FILE *fp;
#include "bench.h"
#ifdef __SSE2__
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
typedef __m128d vdouble;
typedef __m128 vfloat;
#define ENABLED
#elif defined(__ARM_NEON)
#include <arm_neon.h>
typedef float64x2_t vdouble;
typedef float32x4_t vfloat;
#define ENABLED
#elif defined(__VSX__)
#include <altivec.h>
typedef __vector double vdouble;
typedef __vector float vfloat;
#define ENABLED
#elif defined(__VX__)
#include <vecintrin.h>
typedef __vector double vdouble;
typedef __vector float vfloat;
#define ENABLED
#endif
#ifdef ENABLED
void benchSleef128_DPTrig() {
fillDP(abufdp, 0, 6.28);
callFuncSLEEF1_1(Sleef_sind2_u10 , "sin, DP, 128", 0, 6.28, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd2_u10 , "cos, DP, 128", 0, 6.28, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand2_u10 , "tan, DP, 128", 0, 6.28, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd2_u10, "sincos, DP, 128", 0, 6.28, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sind2_u35 , "sin, DP, 128", 0, 6.28, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd2_u35 , "cos, DP, 128", 0, 6.28, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand2_u35 , "tan, DP, 128", 0, 6.28, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd2_u35, "sincos, DP, 128", 0, 6.28, 4.0, abufdp, vdouble);
fillDP(abufdp, 0, 1e+6);
callFuncSLEEF1_1(Sleef_sind2_u10 , "sin, DP, 128", 0, 1e+6, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd2_u10 , "cos, DP, 128", 0, 1e+6, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand2_u10 , "tan, DP, 128", 0, 1e+6, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd2_u10, "sincos, DP, 128", 0, 1e+6, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sind2_u35 , "sin, DP, 128", 0, 1e+6, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd2_u35 , "cos, DP, 128", 0, 1e+6, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand2_u35 , "tan, DP, 128", 0, 1e+6, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd2_u35, "sincos, DP, 128", 0, 1e+6, 4.0, abufdp, vdouble);
fillDP(abufdp, 0, 1e+100);
callFuncSLEEF1_1(Sleef_sind2_u10 , "sin, DP, 128", 0, 1e+100, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd2_u10 , "cos, DP, 128", 0, 1e+100, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand2_u10 , "tan, DP, 128", 0, 1e+100, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd2_u10, "sincos, DP, 128", 0, 1e+100, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sind2_u35 , "sin, DP, 128", 0, 1e+100, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd2_u35 , "cos, DP, 128", 0, 1e+100, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand2_u35 , "tan, DP, 128", 0, 1e+100, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd2_u35, "sincos, DP, 128", 0, 1e+100, 4.0, abufdp, vdouble);
}
void benchSleef128_DPNontrig() {
fillDP(abufdp, 0, 1e+300);
callFuncSLEEF1_1(Sleef_logd2_u10 , "log, DP, 128", 0, 1e+300, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_log10d2_u10, "log10, DP, 128", 0, 1e+300, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_log1pd2_u10, "log1p, DP, 128", 0, 1e+300, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_logd2_u35 , "log, DP, 128", 0, 1e+300, 4.0, abufdp, vdouble);
fillDP(abufdp, -700, 700);
callFuncSLEEF1_1(Sleef_expd2_u10 , "exp, DP, 128", -700, 700, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_exp2d2_u10 , "exp2, DP, 128", -700, 700, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_exp10d2_u10, "exp10, DP, 128", -700, 700, 1.0, abufdp, vdouble);
fillDP(abufdp, -30, 30);
fillDP(bbufdp, -30, 30);
callFuncSLEEF1_2(Sleef_powd2_u10, "pow, DP, 128", -30, 30, -30, 30, 1.0, abufdp, bbufdp, vdouble);
fillDP(abufdp, -1.0, 1.0);
callFuncSLEEF1_1(Sleef_asind2_u10, "asin, DP, 128", -1.0, 1.0, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_acosd2_u10, "acos, DP, 128", -1.0, 1.0, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_asind2_u35, "asin, DP, 128", -1.0, 1.0, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_acosd2_u35, "acos, DP, 128", -1.0, 1.0, 4.0, abufdp, vdouble);
fillDP(abufdp, -10, 10);
fillDP(bbufdp, -10, 10);
callFuncSLEEF1_1(Sleef_atand2_u10, "atan, DP, 128", -10, 10, 1.0, abufdp, vdouble);
callFuncSLEEF1_2(Sleef_atan2d2_u10, "atan2, DP, 128", -10, 10, -10, 10, 1.0, abufdp, bbufdp, vdouble);
callFuncSLEEF1_1(Sleef_atand2_u35, "atan, DP, 128", -10, 10, 4.0, abufdp, vdouble);
callFuncSLEEF1_2(Sleef_atan2d2_u35, "atan2, DP, 128", -10, 10, -10, 10, 4.0, abufdp, bbufdp, vdouble);
}
void benchSleef128_SPTrig() {
fillSP(abufsp, 0, 6.28);
callFuncSLEEF1_1(Sleef_sinf4_u10 , "sin, SP, 128", 0, 6.28, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_cosf4_u10 , "cos, SP, 128", 0, 6.28, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_tanf4_u10 , "tan, SP, 128", 0, 6.28, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sincosf4_u10, "sincos, SP, 128", 0, 6.28, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sinf4_u35 , "sin, SP, 128", 0, 6.28, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_cosf4_u35 , "cos, SP, 128", 0, 6.28, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_tanf4_u35 , "tan, SP, 128", 0, 6.28, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sincosf4_u35, "sincos, SP, 128", 0, 6.28, 4.0, abufsp, vfloat);
fillSP(abufsp, 0, 1e+20);
callFuncSLEEF1_1(Sleef_sinf4_u10 , "sin, SP, 128", 0, 1e+20, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_cosf4_u10 , "cos, SP, 128", 0, 1e+20, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_tanf4_u10 , "tan, SP, 128", 0, 1e+20, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sincosf4_u10, "sincos, SP, 128", 0, 1e+20, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sinf4_u35 , "sin, SP, 128", 0, 1e+20, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_cosf4_u35 , "cos, SP, 128", 0, 1e+20, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_tanf4_u35 , "tan, SP, 128", 0, 1e+20, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sincosf4_u35, "sincos, SP, 128", 0, 1e+20, 4.0, abufsp, vfloat);
}
void benchSleef128_SPNontrig() {
fillSP(abufsp, 0, 1e+38);
callFuncSLEEF1_1(Sleef_logf4_u10 , "log, SP, 128", 0, 1e+38, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_log10f4_u10, "log10, SP, 128", 0, 1e+38, 1.0, abufsp, vfloat);
//callFuncSLEEF1_1(Sleef_log1pf4_u10, "log1p, SP, 128", 0, 1e+38, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_logf4_u35 , "log, SP, 128", 0, 1e+38, 4.0, abufsp, vfloat);
//callFuncSLEEF1_1(Sleef_log10f4_u35, "log10, SP, 128", 0, 1e+38, 4.0, abufsp, vfloat);
//callFuncSLEEF1_1(Sleef_log1pf4_u35, "log1p, SP, 128", 0, 1e+38, 4.0, abufsp, vfloat);
fillSP(abufsp, -100, 100);
callFuncSLEEF1_1(Sleef_expf4_u10 , "exp, SP, 128", -100, 100, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_exp2f4_u10 , "exp2, SP, 128", -100, 100, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_exp10f4_u10, "exp10, SP, 128", -100, 100, 1.0, abufsp, vfloat);
fillSP(abufsp, -30, 30);
fillSP(bbufsp, -30, 30);
callFuncSLEEF1_2(Sleef_powf4_u10, "pow, SP, 128", -30, 30, -30, 30, 1.0, abufsp, bbufsp, vfloat);
fillSP(abufsp, -1.0, 1.0);
callFuncSLEEF1_1(Sleef_asinf4_u10, "asin, SP, 128", -1.0, 1, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_acosf4_u10, "acos, SP, 128", -1.0, 1, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_asinf4_u35, "asin, SP, 128", -1.0, 1.0, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_acosf4_u35, "acos, SP, 128", -1.0, 1.0, 4.0, abufsp, vfloat);
fillSP(abufsp, -10, 10);
fillSP(bbufsp, -10, 10);
callFuncSLEEF1_1(Sleef_atanf4_u10, "atan, SP, 128", -10, 10, 1.0, abufsp, vfloat);
callFuncSLEEF1_2(Sleef_atan2f4_u10, "atan2, SP, 128", -10, 10, -10, 10, 1.0, abufsp, bbufsp, vfloat);
callFuncSLEEF1_1(Sleef_atanf4_u35, "atan, SP, 128", -10, 10, 4.0, abufsp, vfloat);
callFuncSLEEF1_2(Sleef_atan2f4_u35, "atan2, SP, 128", -10, 10, -10, 10, 4.0, abufsp, bbufsp, vfloat);
}
#else // #ifdef ENABLED
void benchSleef128_DPTrig() {}
void benchSleef128_DPNontrig() {}
void benchSleef128_SPTrig() {}
void benchSleef128_SPNontrig() {}
#endif // #ifdef ENABLED

View File

@@ -0,0 +1,181 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <math.h>
#include <time.h>
#include <sleef.h>
void fillDP(double *buf, double min, double max);
void fillSP(float *buf, double min, double max);
extern char x86BrandString[256], versionString[1024];
extern int veclen;
extern double *abufdp, *bbufdp;
extern float *abufsp, *bbufsp;
extern FILE *fp;
#include "bench.h"
#ifdef __AVX__
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
typedef __m256d vdouble;
typedef __m256 vfloat;
#define ENABLED
#endif
#ifdef ENABLED
void benchSleef256_DPTrig() {
fillDP(abufdp, 0, 6.28);
callFuncSLEEF1_1(Sleef_sind4_u10 , "sin, DP, 256", 0, 6.28, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd4_u10 , "cos, DP, 256", 0, 6.28, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand4_u10 , "tan, DP, 256", 0, 6.28, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd4_u10, "sincos, DP, 256", 0, 6.28, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sind4_u35 , "sin, DP, 256", 0, 6.28, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd4_u35 , "cos, DP, 256", 0, 6.28, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand4_u35 , "tan, DP, 256", 0, 6.28, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd4_u35, "sincos, DP, 256", 0, 6.28, 4.0, abufdp, vdouble);
fillDP(abufdp, 0, 1e+6);
callFuncSLEEF1_1(Sleef_sind4_u10 , "sin, DP, 256", 0, 1e+6, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd4_u10 , "cos, DP, 256", 0, 1e+6, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand4_u10 , "tan, DP, 256", 0, 1e+6, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd4_u10, "sincos, DP, 256", 0, 1e+6, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sind4_u35 , "sin, DP, 256", 0, 1e+6, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd4_u35 , "cos, DP, 256", 0, 1e+6, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand4_u35 , "tan, DP, 256", 0, 1e+6, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd4_u35, "sincos, DP, 256", 0, 1e+6, 4.0, abufdp, vdouble);
fillDP(abufdp, 0, 1e+100);
callFuncSLEEF1_1(Sleef_sind4_u10 , "sin, DP, 256", 0, 1e+100, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd4_u10 , "cos, DP, 256", 0, 1e+100, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand4_u10 , "tan, DP, 256", 0, 1e+100, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd4_u10, "sincos, DP, 256", 0, 1e+100, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sind4_u35 , "sin, DP, 256", 0, 1e+100, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd4_u35 , "cos, DP, 256", 0, 1e+100, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand4_u35 , "tan, DP, 256", 0, 1e+100, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd4_u35, "sincos, DP, 256", 0, 1e+100, 4.0, abufdp, vdouble);
}
void benchSleef256_DPNontrig() {
fillDP(abufdp, 0, 1e+300);
callFuncSLEEF1_1(Sleef_logd4_u10 , "log, DP, 256", 0, 1e+300, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_log10d4_u10, "log10, DP, 256", 0, 1e+300, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_log1pd4_u10, "log1p, DP, 256", 0, 1e+300, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_logd4_u35 , "log, DP, 256", 0, 1e+300, 4.0, abufdp, vdouble);
fillDP(abufdp, -700, 700);
callFuncSLEEF1_1(Sleef_expd4_u10 , "exp, DP, 256", -700, 700, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_exp2d4_u10 , "exp2, DP, 256", -700, 700, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_exp10d4_u10, "exp10, DP, 256", -700, 700, 1.0, abufdp, vdouble);
fillDP(abufdp, -30, 30);
fillDP(bbufdp, -30, 30);
callFuncSLEEF1_2(Sleef_powd4_u10, "pow, DP, 256", -30, 30, -30, 30, 1.0, abufdp, bbufdp, vdouble);
fillDP(abufdp, -1.0, 1.0);
callFuncSLEEF1_1(Sleef_asind4_u10, "asin, DP, 256", -1.0, 1.0, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_acosd4_u10, "acos, DP, 256", -1.0, 1.0, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_asind4_u35, "asin, DP, 256", -1.0, 1.0, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_acosd4_u35, "acos, DP, 256", -1.0, 1.0, 4.0, abufdp, vdouble);
fillDP(abufdp, -10, 10);
fillDP(bbufdp, -10, 10);
callFuncSLEEF1_1(Sleef_atand4_u10, "atan, DP, 256", -10, 10, 1.0, abufdp, vdouble);
callFuncSLEEF1_2(Sleef_atan2d4_u10, "atan2, DP, 256", -10, 10, -10, 10, 1.0, abufdp, bbufdp, vdouble);
callFuncSLEEF1_1(Sleef_atand4_u35, "atan, DP, 256", -10, 10, 4.0, abufdp, vdouble);
callFuncSLEEF1_2(Sleef_atan2d4_u35, "atan2, DP, 256", -10, 10, -10, 10, 4.0, abufdp, bbufdp, vdouble);
}
void benchSleef256_SPTrig() {
fillSP(abufsp, 0, 6.28);
callFuncSLEEF1_1(Sleef_sinf8_u10 , "sin, SP, 256", 0, 6.28, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_cosf8_u10 , "cos, SP, 256", 0, 6.28, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_tanf8_u10 , "tan, SP, 256", 0, 6.28, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sincosf8_u10, "sincos, SP, 256", 0, 6.28, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sinf8_u35 , "sin, SP, 256", 0, 6.28, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_cosf8_u35 , "cos, SP, 256", 0, 6.28, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_tanf8_u35 , "tan, SP, 256", 0, 6.28, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sincosf8_u35, "sincos, SP, 256", 0, 6.28, 4.0, abufsp, vfloat);
fillSP(abufsp, 0, 1e+20);
callFuncSLEEF1_1(Sleef_sinf8_u10 , "sin, SP, 256", 0, 1e+20, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_cosf8_u10 , "cos, SP, 256", 0, 1e+20, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_tanf8_u10 , "tan, SP, 256", 0, 1e+20, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sincosf8_u10, "sincos, SP, 256", 0, 1e+20, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sinf8_u35 , "sin, SP, 256", 0, 1e+20, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_cosf8_u35 , "cos, SP, 256", 0, 1e+20, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_tanf8_u35 , "tan, SP, 256", 0, 1e+20, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sincosf8_u35, "sincos, SP, 256", 0, 1e+20, 4.0, abufsp, vfloat);
}
void benchSleef256_SPNontrig() {
fillSP(abufsp, 0, 1e+38);
callFuncSLEEF1_1(Sleef_logf8_u10 , "log, SP, 256", 0, 1e+38, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_log10f8_u10, "log10, SP, 256", 0, 1e+38, 1.0, abufsp, vfloat);
//callFuncSLEEF1_1(Sleef_log1pf8_u10, "log1p, SP, 256", 0, 1e+38, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_logf8_u35 , "log, SP, 256", 0, 1e+38, 4.0, abufsp, vfloat);
//callFuncSLEEF1_1(Sleef_log10f8_u35, "log10, SP, 256", 0, 1e+38, 4.0, abufsp, vfloat);
//callFuncSLEEF1_1(Sleef_log1pf8_u35, "log1p, SP, 256", 0, 1e+38, 4.0, abufsp, vfloat);
fillSP(abufsp, -100, 100);
callFuncSLEEF1_1(Sleef_expf8_u10 , "exp, SP, 256", -100, 100, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_exp2f8_u10 , "exp2, SP, 256", -100, 100, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_exp10f8_u10, "exp10, SP, 256", -100, 100, 1.0, abufsp, vfloat);
fillSP(abufsp, -30, 30);
fillSP(bbufsp, -30, 30);
callFuncSLEEF1_2(Sleef_powf8_u10, "pow, SP, 256", -30, 30, -30, 30, 1.0, abufsp, bbufsp, vfloat);
fillSP(abufsp, -1.0, 1.0);
callFuncSLEEF1_1(Sleef_asinf8_u10, "asin, SP, 256", -1.0, 1, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_acosf8_u10, "acos, SP, 256", -1.0, 1, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_asinf8_u35, "asin, SP, 256", -1.0, 1.0, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_acosf8_u35, "acos, SP, 256", -1.0, 1.0, 4.0, abufsp, vfloat);
fillSP(abufsp, -10, 10);
fillSP(bbufsp, -10, 10);
callFuncSLEEF1_1(Sleef_atanf8_u10, "atan, SP, 256", -10, 10, 1.0, abufsp, vfloat);
callFuncSLEEF1_2(Sleef_atan2f8_u10, "atan2, SP, 256", -10, 10, -10, 10, 1.0, abufsp, bbufsp, vfloat);
callFuncSLEEF1_1(Sleef_atanf8_u35, "atan, SP, 256", -10, 10, 4.0, abufsp, vfloat);
callFuncSLEEF1_2(Sleef_atan2f8_u35, "atan2, SP, 256", -10, 10, -10, 10, 4.0, abufsp, bbufsp, vfloat);
}
#else // #ifdef ENABLED
void zeroupper256() {}
void benchSleef256_DPTrig() {}
void benchSleef256_DPNontrig() {}
void benchSleef256_SPTrig() {}
void benchSleef256_SPNontrig() {}
#endif // #ifdef ENABLED

View File

@@ -0,0 +1,180 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <math.h>
#include <time.h>
#include <sleef.h>
void fillDP(double *buf, double min, double max);
void fillSP(float *buf, double min, double max);
extern char x86BrandString[256], versionString[1024];
extern int veclen;
extern double *abufdp, *bbufdp;
extern float *abufsp, *bbufsp;
extern FILE *fp;
#include "bench.h"
#ifdef __AVX512F__
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
typedef __m512d vdouble;
typedef __m512 vfloat;
#define ENABLED
#endif
#ifdef ENABLED
void benchSleef512_DPTrig() {
fillDP(abufdp, 0, 6.28);
callFuncSLEEF1_1(Sleef_sind8_u10 , "sin, DP, 512", 0, 6.28, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd8_u10 , "cos, DP, 512", 0, 6.28, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand8_u10 , "tan, DP, 512", 0, 6.28, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd8_u10, "sincos, DP, 512", 0, 6.28, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sind8_u35 , "sin, DP, 512", 0, 6.28, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd8_u35 , "cos, DP, 512", 0, 6.28, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand8_u35 , "tan, DP, 512", 0, 6.28, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd8_u35, "sincos, DP, 512", 0, 6.28, 4.0, abufdp, vdouble);
fillDP(abufdp, 0, 1e+6);
callFuncSLEEF1_1(Sleef_sind8_u10 , "sin, DP, 512", 0, 1e+6, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd8_u10 , "cos, DP, 512", 0, 1e+6, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand8_u10 , "tan, DP, 512", 0, 1e+6, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd8_u10, "sincos, DP, 512", 0, 1e+6, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sind8_u35 , "sin, DP, 512", 0, 1e+6, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd8_u35 , "cos, DP, 512", 0, 1e+6, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand8_u35 , "tan, DP, 512", 0, 1e+6, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd8_u35, "sincos, DP, 512", 0, 1e+6, 4.0, abufdp, vdouble);
fillDP(abufdp, 0, 1e+100);
callFuncSLEEF1_1(Sleef_sind8_u10 , "sin, DP, 512", 0, 1e+100, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd8_u10 , "cos, DP, 512", 0, 1e+100, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand8_u10 , "tan, DP, 512", 0, 1e+100, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd8_u10, "sincos, DP, 512", 0, 1e+100, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sind8_u35 , "sin, DP, 512", 0, 1e+100, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd8_u35 , "cos, DP, 512", 0, 1e+100, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand8_u35 , "tan, DP, 512", 0, 1e+100, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd8_u35, "sincos, DP, 512", 0, 1e+100, 4.0, abufdp, vdouble);
}
void benchSleef512_DPNontrig() {
fillDP(abufdp, 0, 1e+300);
callFuncSLEEF1_1(Sleef_logd8_u10 , "log, DP, 512", 0, 1e+300, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_log10d8_u10, "log10, DP, 512", 0, 1e+300, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_log1pd8_u10, "log1p, DP, 512", 0, 1e+300, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_logd8_u35 , "log, DP, 512", 0, 1e+300, 4.0, abufdp, vdouble);
fillDP(abufdp, -700, 700);
callFuncSLEEF1_1(Sleef_expd8_u10 , "exp, DP, 512", -700, 700, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_exp2d8_u10 , "exp2, DP, 512", -700, 700, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_exp10d8_u10, "exp10, DP, 512", -700, 700, 1.0, abufdp, vdouble);
fillDP(abufdp, -30, 30);
fillDP(bbufdp, -30, 30);
callFuncSLEEF1_2(Sleef_powd8_u10, "pow, DP, 512", -30, 30, -30, 30, 1.0, abufdp, bbufdp, vdouble);
fillDP(abufdp, -1.0, 1.0);
callFuncSLEEF1_1(Sleef_asind8_u10, "asin, DP, 512", -1.0, 1.0, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_acosd8_u10, "acos, DP, 512", -1.0, 1.0, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_asind8_u35, "asin, DP, 512", -1.0, 1.0, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_acosd8_u35, "acos, DP, 512", -1.0, 1.0, 4.0, abufdp, vdouble);
fillDP(abufdp, -10, 10);
fillDP(bbufdp, -10, 10);
callFuncSLEEF1_1(Sleef_atand8_u10, "atan, DP, 512", -10, 10, 1.0, abufdp, vdouble);
callFuncSLEEF1_2(Sleef_atan2d8_u10, "atan2, DP, 512", -10, 10, -10, 10, 1.0, abufdp, bbufdp, vdouble);
callFuncSLEEF1_1(Sleef_atand8_u35, "atan, DP, 512", -10, 10, 4.0, abufdp, vdouble);
callFuncSLEEF1_2(Sleef_atan2d8_u35, "atan2, DP, 512", -10, 10, -10, 10, 4.0, abufdp, bbufdp, vdouble);
}
void benchSleef512_SPTrig() {
fillSP(abufsp, 0, 6.28);
callFuncSLEEF1_1(Sleef_sinf16_u10 , "sin, SP, 512", 0, 6.28, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_cosf16_u10 , "cos, SP, 512", 0, 6.28, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_tanf16_u10 , "tan, SP, 512", 0, 6.28, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sincosf16_u10, "sincos, SP, 512", 0, 6.28, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sinf16_u35 , "sin, SP, 512", 0, 6.28, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_cosf16_u35 , "cos, SP, 512", 0, 6.28, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_tanf16_u35 , "tan, SP, 512", 0, 6.28, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sincosf16_u35, "sincos, SP, 512", 0, 6.28, 4.0, abufsp, vfloat);
fillSP(abufsp, 0, 1e+20);
callFuncSLEEF1_1(Sleef_sinf16_u10 , "sin, SP, 512", 0, 1e+20, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_cosf16_u10 , "cos, SP, 512", 0, 1e+20, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_tanf16_u10 , "tan, SP, 512", 0, 1e+20, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sincosf16_u10, "sincos, SP, 512", 0, 1e+20, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sinf16_u35 , "sin, SP, 512", 0, 1e+20, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_cosf16_u35 , "cos, SP, 512", 0, 1e+20, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_tanf16_u35 , "tan, SP, 512", 0, 1e+20, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sincosf16_u35, "sincos, SP, 512", 0, 1e+20, 4.0, abufsp, vfloat);
}
void benchSleef512_SPNontrig() {
fillSP(abufsp, 0, 1e+38);
callFuncSLEEF1_1(Sleef_logf16_u10 , "log, SP, 512", 0, 1e+38, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_log10f16_u10, "log10, SP, 512", 0, 1e+38, 1.0, abufsp, vfloat);
//callFuncSLEEF1_1(Sleef_log1pf16_u10, "log1p, SP, 512", 0, 1e+38, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_logf16_u35 , "log, SP, 512", 0, 1e+38, 4.0, abufsp, vfloat);
//callFuncSLEEF1_1(Sleef_log10f16_u35, "log10, SP, 512", 0, 1e+38, 4.0, abufsp, vfloat);
//callFuncSLEEF1_1(Sleef_log1pf16_u35, "log1p, SP, 512", 0, 1e+38, 4.0, abufsp, vfloat);
fillSP(abufsp, -100, 100);
callFuncSLEEF1_1(Sleef_expf16_u10 , "exp, SP, 512", -100, 100, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_exp2f16_u10 , "exp2, SP, 512", -100, 100, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_exp10f16_u10, "exp10, SP, 512", -100, 100, 1.0, abufsp, vfloat);
fillSP(abufsp, -30, 30);
fillSP(bbufsp, -30, 30);
callFuncSLEEF1_2(Sleef_powf16_u10, "pow, SP, 512", -30, 30, -30, 30, 1.0, abufsp, bbufsp, vfloat);
fillSP(abufsp, -1.0, 1.0);
callFuncSLEEF1_1(Sleef_asinf16_u10, "asin, SP, 512", -1.0, 1, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_acosf16_u10, "acos, SP, 512", -1.0, 1, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_asinf16_u35, "asin, SP, 512", -1.0, 1.0, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_acosf16_u35, "acos, SP, 512", -1.0, 1.0, 4.0, abufsp, vfloat);
fillSP(abufsp, -10, 10);
fillSP(bbufsp, -10, 10);
callFuncSLEEF1_1(Sleef_atanf16_u10, "atan, SP, 512", -10, 10, 1.0, abufsp, vfloat);
callFuncSLEEF1_2(Sleef_atan2f16_u10, "atan2, SP, 512", -10, 10, -10, 10, 1.0, abufsp, bbufsp, vfloat);
callFuncSLEEF1_1(Sleef_atanf16_u35, "atan, SP, 512", -10, 10, 4.0, abufsp, vfloat);
callFuncSLEEF1_2(Sleef_atan2f16_u35, "atan2, SP, 512", -10, 10, -10, 10, 4.0, abufsp, bbufsp, vfloat);
}
#else // #ifdef ENABLED
void benchSleef512_DPTrig() {}
void benchSleef512_DPNontrig() {}
void benchSleef512_SPTrig() {}
void benchSleef512_SPNontrig() {}
#endif // #ifdef ENABLED

View File

@@ -0,0 +1,153 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <math.h>
#include <time.h>
#include <unistd.h>
#include <x86intrin.h>
#include "bench.h"
int veclen = 16;
int enableLogExp;
double *abufdp, *bbufdp;
float *abufsp, *bbufsp;
FILE *fp;
#if defined(__i386__) || defined(__x86_64__)
void x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) {
uint32_t a, b, c, d;
__asm__ __volatile__ ("cpuid" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "a" (eax), "c"(ecx));
out[0] = a; out[1] = b; out[2] = c; out[3] = d;
}
int cpuSupportsAVX() {
int32_t reg[4];
x86CpuID(reg, 1, 0);
return (reg[2] & (1 << 28)) != 0;
}
int cpuSupportsAVX512F() {
int32_t reg[4];
x86CpuID(reg, 7, 0);
return (reg[1] & (1 << 16)) != 0;
}
#endif
uint64_t Sleef_currentTimeMicros() {
struct timespec tp;
clock_gettime(CLOCK_MONOTONIC, &tp);
return (uint64_t)tp.tv_sec * 1000000LL + ((uint64_t)tp.tv_nsec/1000);
}
void fillDP(double *buf, double min, double max) {
for(int i=0;i<NITER1*veclen;i++) {
double r = ((double)random() + RAND_MAX * (double)random()) / (RAND_MAX * (double)RAND_MAX);
buf[i] = r * (max - min) + min;
}
}
void fillSP(float *buf, double min, double max) {
for(int i=0;i<NITER1*veclen;i++) {
double r = ((double)random() + RAND_MAX * (double)random()) / (RAND_MAX * (double)RAND_MAX);
buf[i] = r * (max - min) + min;
}
}
void zeroupper256();
void benchSVML128_DPTrig();
void benchSVML256_DPTrig();
void benchSVML512_DPTrig();
void benchSVML128_DPNontrig();
void benchSVML256_DPNontrig();
void benchSVML512_DPNontrig();
void benchSVML128_SPTrig();
void benchSVML256_SPTrig();
void benchSVML512_SPTrig();
void benchSVML128_SPNontrig();
void benchSVML256_SPNontrig();
void benchSVML512_SPNontrig();
//
int main(int argc, char **argv) {
char *columnTitle = "SVML", *fnBase = "svml";
char fn[1024];
if (argc != 1) columnTitle = argv[1];
if (argc >= 3) fnBase = argv[2];
srandom(time(NULL));
#if defined(__i386__) || defined(__x86_64__)
int do128bit = 1;
int do256bit = cpuSupportsAVX();
int do512bit = cpuSupportsAVX512F();
#elif defined(__ARM_NEON)
int do128bit = 1;
int do256bit = 0;
int do512bit = 0;
#else
#error Unsupported architecture
#endif
posix_memalign((void **)&abufdp, veclen*sizeof(double), NITER1*veclen*sizeof(double));
posix_memalign((void **)&bbufdp, veclen*sizeof(double), NITER1*veclen*sizeof(double));
abufsp = (float *)abufdp;
bbufsp = (float *)bbufdp;
enableLogExp = SVMLULP < 2;
sprintf(fn, "%sdptrig%gulp.out", fnBase, (double)SVMLULP);
fp = fopen(fn, "w");
fprintf(fp, "%s\n", columnTitle);
if (do256bit) zeroupper256();
if (do128bit) benchSVML128_DPTrig();
if (do256bit) benchSVML256_DPTrig();
if (do512bit) benchSVML512_DPTrig();
fclose(fp);
sprintf(fn, "%sdpnontrig%gulp.out", fnBase, (double)SVMLULP);
fp = fopen(fn, "w");
fprintf(fp, "%s\n", columnTitle);
if (do256bit) zeroupper256();
if (do128bit) benchSVML128_DPNontrig();
if (do256bit) benchSVML256_DPNontrig();
if (do512bit) benchSVML512_DPNontrig();
fclose(fp);
sprintf(fn, "%ssptrig%gulp.out", fnBase, (double)SVMLULP);
fp = fopen(fn, "w");
fprintf(fp, "%s\n", columnTitle);
if (do256bit) zeroupper256();
if (do128bit) benchSVML128_SPTrig();
if (do256bit) benchSVML256_SPTrig();
if (do512bit) benchSVML512_SPTrig();
fclose(fp);
sprintf(fn, "%sspnontrig%gulp.out", fnBase, (double)SVMLULP);
fp = fopen(fn, "w");
fprintf(fp, "%s\n", columnTitle);
if (do256bit) zeroupper256();
if (do128bit) benchSVML128_SPNontrig();
if (do256bit) benchSVML256_SPNontrig();
if (do512bit) benchSVML512_SPNontrig();
fclose(fp);
exit(0);
}

View File

@@ -0,0 +1,144 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <math.h>
#include <time.h>
#include <unistd.h>
#include <x86intrin.h>
uint64_t Sleef_currentTimeMicros();
void fillDP(double *buf, double min, double max);
void fillSP(float *buf, double min, double max);
extern char x86BrandString[256], versionString[1024];
extern int veclen;
extern int enableLogExp;
extern double *abufdp, *bbufdp;
extern float *abufsp, *bbufsp;
extern FILE *fp;
#include "bench.h"
#ifdef __SSE2__
typedef __m128d vdouble;
typedef __m128 vfloat;
#define ENABLED
#endif
#ifdef ENABLED
void benchSVML128_DPTrig() {
fillDP(abufdp, 0, 6.28);
callFuncSVML1_1(_mm_sin_pd , "sin, DP, 128", 0, 6.28, abufdp, vdouble);
callFuncSVML1_1(_mm_cos_pd , "cos, DP, 128", 0, 6.28, abufdp, vdouble);
callFuncSVML1_1(_mm_tan_pd , "tan, DP, 128", 0, 6.28, abufdp, vdouble);
callFuncSVML2_1(_mm_sincos_pd, "sincos, DP, 128", 0, 6.28, abufdp, vdouble);
fillDP(abufdp, 0, 1e+6);
callFuncSVML1_1(_mm_sin_pd , "sin, DP, 128", 0, 1e+6, abufdp, vdouble);
callFuncSVML1_1(_mm_cos_pd , "cos, DP, 128", 0, 1e+6, abufdp, vdouble);
callFuncSVML1_1(_mm_tan_pd , "tan, DP, 128", 0, 1e+6, abufdp, vdouble);
callFuncSVML2_1(_mm_sincos_pd, "sincos, DP, 128", 0, 1e+6, abufdp, vdouble);
fillDP(abufdp, 0, 1e+100);
callFuncSVML1_1(_mm_sin_pd , "sin, DP, 128", 0, 1e+100, abufdp, vdouble);
callFuncSVML1_1(_mm_cos_pd , "cos, DP, 128", 0, 1e+100, abufdp, vdouble);
callFuncSVML1_1(_mm_tan_pd , "tan, DP, 128", 0, 1e+100, abufdp, vdouble);
callFuncSVML2_1(_mm_sincos_pd, "sincos, DP, 128", 0, 1e+100, abufdp, vdouble);
}
void benchSVML128_DPNontrig() {
fillDP(abufdp, 0, 1e+300);
callFuncSVML1_1(_mm_log_pd , "log, DP, 128", 0, 1e+300, abufdp, vdouble);
if (enableLogExp) {
callFuncSVML1_1(_mm_log10_pd, "log10, DP, 128", 0, 1e+300, abufdp, vdouble);
callFuncSVML1_1(_mm_log1p_pd, "log1p, DP, 128", 0, 1e+300, abufdp, vdouble);
fillDP(abufdp, -700, 700);
callFuncSVML1_1(_mm_exp_pd , "exp, DP, 128", -700, 700, abufdp, vdouble);
callFuncSVML1_1(_mm_exp2_pd , "exp2, DP, 128", -700, 700, abufdp, vdouble);
callFuncSVML1_1(_mm_exp10_pd, "exp10, DP, 128", -700, 700, abufdp, vdouble);
fillDP(abufdp, -30, 30);
fillDP(bbufdp, -30, 30);
callFuncSVML1_2(_mm_pow_pd, "pow, DP, 128", -30, 30, -30, 30, abufdp, bbufdp, vdouble);
}
fillDP(abufdp, -1.0, 1.0);
callFuncSVML1_1(_mm_asin_pd, "asin, DP, 128", -1.0, 1.0, abufdp, vdouble);
callFuncSVML1_1(_mm_acos_pd, "acos, DP, 128", -1.0, 1.0, abufdp, vdouble);
fillDP(abufdp, -10, 10);
fillDP(bbufdp, -10, 10);
callFuncSVML1_1(_mm_atan_pd, "atan, DP, 128", -10, 10, abufdp, vdouble);
callFuncSVML1_2(_mm_atan2_pd, "atan2, DP, 128", -10, 10, -10, 10, abufdp, bbufdp, vdouble);
}
void benchSVML128_SPTrig() {
fillSP(abufsp, 0, 6.28);
callFuncSVML1_1(_mm_sin_ps , "sin, SP, 128", 0, 6.28, abufsp, vfloat);
callFuncSVML1_1(_mm_cos_ps , "cos, SP, 128", 0, 6.28, abufsp, vfloat);
callFuncSVML1_1(_mm_tan_ps , "tan, SP, 128", 0, 6.28, abufsp, vfloat);
callFuncSVML2_1(_mm_sincos_ps, "sincos, SP, 128", 0, 6.28, abufsp, vfloat);
fillSP(abufsp, 0, 1e+20);
callFuncSVML1_1(_mm_sin_ps , "sin, SP, 128", 0, 1e+20, abufsp, vfloat);
callFuncSVML1_1(_mm_cos_ps , "cos, SP, 128", 0, 1e+20, abufsp, vfloat);
callFuncSVML1_1(_mm_tan_ps , "tan, SP, 128", 0, 1e+20, abufsp, vfloat);
callFuncSVML2_1(_mm_sincos_ps, "sincos, SP, 128", 0, 1e+20, abufsp, vfloat);
}
void benchSVML128_SPNontrig() {
fillSP(abufsp, 0, 1e+38);
callFuncSVML1_1(_mm_log_ps , "log, SP, 128", 0, 1e+38, abufsp, vfloat);
if (enableLogExp) {
callFuncSVML1_1(_mm_log10_ps, "log10, SP, 128", 0, 1e+38, abufsp, vfloat);
//callFuncSVML1_1(_mm_log1p_ps, "log1p, SP, 128", 0, 1e+38, abufsp, vfloat);
fillSP(abufsp, -100, 100);
callFuncSVML1_1(_mm_exp_ps , "exp, SP, 128", -100, 100, abufsp, vfloat);
callFuncSVML1_1(_mm_exp2_ps , "exp2, SP, 128", -100, 100, abufsp, vfloat);
callFuncSVML1_1(_mm_exp10_ps, "exp10, SP, 128", -100, 100, abufsp, vfloat);
fillSP(abufsp, -30, 30);
fillSP(bbufsp, -30, 30);
callFuncSVML1_2(_mm_pow_ps, "pow, SP, 128", -30, 30, -30, 30, abufsp, bbufsp, vfloat);
}
fillSP(abufsp, -1.0, 1.0);
callFuncSVML1_1(_mm_asin_ps, "asin, SP, 128", -1.0, 1, abufsp, vfloat);
callFuncSVML1_1(_mm_acos_ps, "acos, SP, 128", -1.0, 1, abufsp, vfloat);
fillSP(abufsp, -10, 10);
fillSP(bbufsp, -10, 10);
callFuncSVML1_1(_mm_atan_ps, "atan, SP, 128", -10, 10, abufsp, vfloat);
callFuncSVML1_2(_mm_atan2_ps, "atan2, SP, 128", -10, 10, -10, 10, abufsp, bbufsp, vfloat);
}
#else // #ifdef ENABLED
void benchSVML128_DPTrig() {}
void benchSVML128_DPNontrig() {}
void benchSVML128_SPTrig() {}
void benchSVML128_SPNontrig() {}
#endif // #ifdef ENABLED

View File

@@ -0,0 +1,147 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <math.h>
#include <time.h>
#include <unistd.h>
#include <x86intrin.h>
uint64_t Sleef_currentTimeMicros();
void fillDP(double *buf, double min, double max);
void fillSP(float *buf, double min, double max);
extern char x86BrandString[256], versionString[1024];
extern int veclen;
extern int enableLogExp;
extern double *abufdp, *bbufdp;
extern float *abufsp, *bbufsp;
extern FILE *fp;
#include "bench.h"
#ifdef __AVX__
typedef __m256d vdouble;
typedef __m256 vfloat;
#define ENABLED
#endif
#ifdef ENABLED
void zeroupper256() { _mm256_zeroupper(); }
void benchSVML256_DPTrig() {
fillDP(abufdp, 0, 6.28);
callFuncSVML1_1(_mm256_sin_pd , "sin, DP, 256", 0, 6.28, abufdp, vdouble);
callFuncSVML1_1(_mm256_cos_pd , "cos, DP, 256", 0, 6.28, abufdp, vdouble);
callFuncSVML1_1(_mm256_tan_pd , "tan, DP, 256", 0, 6.28, abufdp, vdouble);
callFuncSVML2_1(_mm256_sincos_pd, "sincos, DP, 256", 0, 6.28, abufdp, vdouble);
fillDP(abufdp, 0, 1e+6);
callFuncSVML1_1(_mm256_sin_pd , "sin, DP, 256", 0, 1e+6, abufdp, vdouble);
callFuncSVML1_1(_mm256_cos_pd , "cos, DP, 256", 0, 1e+6, abufdp, vdouble);
callFuncSVML1_1(_mm256_tan_pd , "tan, DP, 256", 0, 1e+6, abufdp, vdouble);
callFuncSVML2_1(_mm256_sincos_pd, "sincos, DP, 256", 0, 1e+6, abufdp, vdouble);
fillDP(abufdp, 0, 1e+100);
callFuncSVML1_1(_mm256_sin_pd , "sin, DP, 256", 0, 1e+100, abufdp, vdouble);
callFuncSVML1_1(_mm256_cos_pd , "cos, DP, 256", 0, 1e+100, abufdp, vdouble);
callFuncSVML1_1(_mm256_tan_pd , "tan, DP, 256", 0, 1e+100, abufdp, vdouble);
callFuncSVML2_1(_mm256_sincos_pd, "sincos, DP, 256", 0, 1e+100, abufdp, vdouble);
}
void benchSVML256_DPNontrig() {
fillDP(abufdp, 0, 1e+300);
callFuncSVML1_1(_mm256_log_pd , "log, DP, 256", 0, 1e+300, abufdp, vdouble);
if (enableLogExp) {
callFuncSVML1_1(_mm256_log10_pd, "log10, DP, 256", 0, 1e+300, abufdp, vdouble);
callFuncSVML1_1(_mm256_log1p_pd, "log1p, DP, 256", 0, 1e+300, abufdp, vdouble);
fillDP(abufdp, -700, 700);
callFuncSVML1_1(_mm256_exp_pd , "exp, DP, 256", -700, 700, abufdp, vdouble);
callFuncSVML1_1(_mm256_exp2_pd , "exp2, DP, 256", -700, 700, abufdp, vdouble);
callFuncSVML1_1(_mm256_exp10_pd, "exp10, DP, 256", -700, 700, abufdp, vdouble);
fillDP(abufdp, -30, 30);
fillDP(bbufdp, -30, 30);
callFuncSVML1_2(_mm256_pow_pd, "pow, DP, 256", -30, 30, -30, 30, abufdp, bbufdp, vdouble);
}
fillDP(abufdp, -1.0, 1.0);
callFuncSVML1_1(_mm256_asin_pd, "asin, DP, 256", -1.0, 1.0, abufdp, vdouble);
callFuncSVML1_1(_mm256_acos_pd, "acos, DP, 256", -1.0, 1.0, abufdp, vdouble);
fillDP(abufdp, -10, 10);
fillDP(bbufdp, -10, 10);
callFuncSVML1_1(_mm256_atan_pd, "atan, DP, 256", -10, 10, abufdp, vdouble);
callFuncSVML1_2(_mm256_atan2_pd, "atan2, DP, 256", -10, 10, -10, 10, abufdp, bbufdp, vdouble);
}
void benchSVML256_SPTrig() {
fillSP(abufsp, 0, 6.28);
callFuncSVML1_1(_mm256_sin_ps , "sin, SP, 256", 0, 6.28, abufsp, vfloat);
callFuncSVML1_1(_mm256_cos_ps , "cos, SP, 256", 0, 6.28, abufsp, vfloat);
callFuncSVML1_1(_mm256_tan_ps , "tan, SP, 256", 0, 6.28, abufsp, vfloat);
callFuncSVML2_1(_mm256_sincos_ps, "sincos, SP, 256", 0, 6.28, abufsp, vfloat);
fillSP(abufsp, 0, 1e+20);
callFuncSVML1_1(_mm256_sin_ps , "sin, SP, 256", 0, 1e+20, abufsp, vfloat);
callFuncSVML1_1(_mm256_cos_ps , "cos, SP, 256", 0, 1e+20, abufsp, vfloat);
callFuncSVML1_1(_mm256_tan_ps , "tan, SP, 256", 0, 1e+20, abufsp, vfloat);
callFuncSVML2_1(_mm256_sincos_ps, "sincos, SP, 256", 0, 1e+20, abufsp, vfloat);
}
void benchSVML256_SPNontrig() {
fillSP(abufsp, 0, 1e+38);
callFuncSVML1_1(_mm256_log_ps , "log, SP, 256", 0, 1e+38, abufsp, vfloat);
if (enableLogExp) {
callFuncSVML1_1(_mm256_log10_ps, "log10, SP, 256", 0, 1e+38, abufsp, vfloat);
//callFuncSVML1_1(_mm256_log1p_ps, "log1p, SP, 256", 0, 1e+38, abufsp, vfloat);
fillSP(abufsp, -100, 100);
callFuncSVML1_1(_mm256_exp_ps , "exp, SP, 256", -100, 100, abufsp, vfloat);
callFuncSVML1_1(_mm256_exp2_ps , "exp2, SP, 256", -100, 100, abufsp, vfloat);
callFuncSVML1_1(_mm256_exp10_ps, "exp10, SP, 256", -100, 100, abufsp, vfloat);
fillSP(abufsp, -30, 30);
fillSP(bbufsp, -30, 30);
callFuncSVML1_2(_mm256_pow_ps, "pow, SP, 256", -30, 30, -30, 30, abufsp, bbufsp, vfloat);
}
fillSP(abufsp, -1.0, 1.0);
callFuncSVML1_1(_mm256_asin_ps, "asin, SP, 256", -1.0, 1, abufsp, vfloat);
callFuncSVML1_1(_mm256_acos_ps, "acos, SP, 256", -1.0, 1, abufsp, vfloat);
fillSP(abufsp, -10, 10);
fillSP(bbufsp, -10, 10);
callFuncSVML1_1(_mm256_atan_ps, "atan, SP, 256", -10, 10, abufsp, vfloat);
callFuncSVML1_2(_mm256_atan2_ps, "atan2, SP, 256", -10, 10, -10, 10, abufsp, bbufsp, vfloat);
}
#else // #ifdef ENABLED
void zeroupper256() {}
void benchSVML256_DPTrig() {}
void benchSVML256_DPNontrig() {}
void benchSVML256_SPTrig() {}
void benchSVML256_SPNontrig() {}
#endif // #ifdef ENABLED

View File

@@ -0,0 +1,144 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <math.h>
#include <time.h>
#include <unistd.h>
#include <x86intrin.h>
uint64_t Sleef_currentTimeMicros();
void fillDP(double *buf, double min, double max);
void fillSP(float *buf, double min, double max);
extern char x86BrandString[256], versionString[1024];
extern int veclen;
extern int enableLogExp;
extern double *abufdp, *bbufdp;
extern float *abufsp, *bbufsp;
extern FILE *fp;
#include "bench.h"
#ifdef __AVX512F__
typedef __m512d vdouble;
typedef __m512 vfloat;
#define ENABLED
#endif
#ifdef ENABLED
void benchSVML512_DPTrig() {
fillDP(abufdp, 0, 6.28);
callFuncSVML1_1(_mm512_sin_pd , "sin, DP, 512", 0, 6.28, abufdp, vdouble);
callFuncSVML1_1(_mm512_cos_pd , "cos, DP, 512", 0, 6.28, abufdp, vdouble);
callFuncSVML1_1(_mm512_tan_pd , "tan, DP, 512", 0, 6.28, abufdp, vdouble);
callFuncSVML2_1(_mm512_sincos_pd, "sincos, DP, 512", 0, 6.28, abufdp, vdouble);
fillDP(abufdp, 0, 1e+6);
callFuncSVML1_1(_mm512_sin_pd , "sin, DP, 512", 0, 1e+6, abufdp, vdouble);
callFuncSVML1_1(_mm512_cos_pd , "cos, DP, 512", 0, 1e+6, abufdp, vdouble);
callFuncSVML1_1(_mm512_tan_pd , "tan, DP, 512", 0, 1e+6, abufdp, vdouble);
callFuncSVML2_1(_mm512_sincos_pd, "sincos, DP, 512", 0, 1e+6, abufdp, vdouble);
fillDP(abufdp, 0, 1e+100);
callFuncSVML1_1(_mm512_sin_pd , "sin, DP, 512", 0, 1e+100, abufdp, vdouble);
callFuncSVML1_1(_mm512_cos_pd , "cos, DP, 512", 0, 1e+100, abufdp, vdouble);
callFuncSVML1_1(_mm512_tan_pd , "tan, DP, 512", 0, 1e+100, abufdp, vdouble);
callFuncSVML2_1(_mm512_sincos_pd, "sincos, DP, 512", 0, 1e+100, abufdp, vdouble);
}
void benchSVML512_DPNontrig() {
fillDP(abufdp, 0, 1e+300);
callFuncSVML1_1(_mm512_log_pd , "log, DP, 512", 0, 1e+300, abufdp, vdouble);
if (enableLogExp) {
callFuncSVML1_1(_mm512_log10_pd, "log10, DP, 512", 0, 1e+300, abufdp, vdouble);
callFuncSVML1_1(_mm512_log1p_pd, "log1p, DP, 512", 0, 1e+300, abufdp, vdouble);
fillDP(abufdp, -700, 700);
callFuncSVML1_1(_mm512_exp_pd , "exp, DP, 512", -700, 700, abufdp, vdouble);
callFuncSVML1_1(_mm512_exp2_pd , "exp2, DP, 512", -700, 700, abufdp, vdouble);
callFuncSVML1_1(_mm512_exp10_pd, "exp10, DP, 512", -700, 700, abufdp, vdouble);
fillDP(abufdp, -30, 30);
fillDP(bbufdp, -30, 30);
callFuncSVML1_2(_mm512_pow_pd, "pow, DP, 512", -30, 30, -30, 30, abufdp, bbufdp, vdouble);
}
fillDP(abufdp, -1.0, 1.0);
callFuncSVML1_1(_mm512_asin_pd, "asin, DP, 512", -1.0, 1.0, abufdp, vdouble);
callFuncSVML1_1(_mm512_acos_pd, "acos, DP, 512", -1.0, 1.0, abufdp, vdouble);
fillDP(abufdp, -10, 10);
fillDP(bbufdp, -10, 10);
callFuncSVML1_1(_mm512_atan_pd, "atan, DP, 512", -10, 10, abufdp, vdouble);
callFuncSVML1_2(_mm512_atan2_pd, "atan2, DP, 512", -10, 10, -10, 10, abufdp, bbufdp, vdouble);
}
void benchSVML512_SPTrig() {
fillSP(abufsp, 0, 6.28);
callFuncSVML1_1(_mm512_sin_ps , "sin, SP, 512", 0, 6.28, abufsp, vfloat);
callFuncSVML1_1(_mm512_cos_ps , "cos, SP, 512", 0, 6.28, abufsp, vfloat);
callFuncSVML1_1(_mm512_tan_ps , "tan, SP, 512", 0, 6.28, abufsp, vfloat);
callFuncSVML2_1(_mm512_sincos_ps, "sincos, SP, 512", 0, 6.28, abufsp, vfloat);
fillSP(abufsp, 0, 1e+20);
callFuncSVML1_1(_mm512_sin_ps , "sin, SP, 512", 0, 1e+20, abufsp, vfloat);
callFuncSVML1_1(_mm512_cos_ps , "cos, SP, 512", 0, 1e+20, abufsp, vfloat);
callFuncSVML1_1(_mm512_tan_ps , "tan, SP, 512", 0, 1e+20, abufsp, vfloat);
callFuncSVML2_1(_mm512_sincos_ps, "sincos, SP, 512", 0, 1e+20, abufsp, vfloat);
}
void benchSVML512_SPNontrig() {
fillSP(abufsp, 0, 1e+38);
callFuncSVML1_1(_mm512_log_ps , "log, SP, 512", 0, 1e+38, abufsp, vfloat);
if (enableLogExp) {
callFuncSVML1_1(_mm512_log10_ps, "log10, SP, 512", 0, 1e+38, abufsp, vfloat);
//callFuncSVML1_1(_mm512_log1p_ps, "log1p, SP, 512", 0, 1e+38, abufsp, vfloat);
fillSP(abufsp, -100, 100);
callFuncSVML1_1(_mm512_exp_ps , "exp, SP, 512", -100, 100, abufsp, vfloat);
callFuncSVML1_1(_mm512_exp2_ps , "exp2, SP, 512", -100, 100, abufsp, vfloat);
callFuncSVML1_1(_mm512_exp10_ps, "exp10, SP, 512", -100, 100, abufsp, vfloat);
fillSP(abufsp, -30, 30);
fillSP(bbufsp, -30, 30);
callFuncSVML1_2(_mm512_pow_ps, "pow, SP, 512", -30, 30, -30, 30, abufsp, bbufsp, vfloat);
}
fillSP(abufsp, -1.0, 1.0);
callFuncSVML1_1(_mm512_asin_ps, "asin, SP, 512", -1.0, 1, abufsp, vfloat);
callFuncSVML1_1(_mm512_acos_ps, "acos, SP, 512", -1.0, 1, abufsp, vfloat);
fillSP(abufsp, -10, 10);
fillSP(bbufsp, -10, 10);
callFuncSVML1_1(_mm512_atan_ps, "atan, SP, 512", -10, 10, abufsp, vfloat);
callFuncSVML1_2(_mm512_atan2_ps, "atan2, SP, 512", -10, 10, -10, 10, abufsp, bbufsp, vfloat);
}
#else // #ifdef ENABLED
void benchSVML512_DPTrig() {}
void benchSVML512_DPNontrig() {}
void benchSVML512_SPTrig() {}
void benchSVML512_SPNontrig() {}
#endif // #ifdef ENABLED

View File

@@ -0,0 +1,17 @@
#!/bin/sh
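# Benchmark driver: prompts for a label describing the machine being
# measured, runs each benchmark program given on the command line with
# that label, and keeps a run counter in counter.txt so that successive
# measurement runs can be told apart.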
echo
read -p "Enter label of measurement (e.g. My desktop PC): " label
if [ -f counter.txt ]
then
counter=$(cat counter.txt)
else
counter=0
fi
echo "Measurement in progress. This may take several minutes."
for i in $*; do
$i "$label" $counter
done
counter=$((counter+1))
echo $counter > counter.txt

View File

@@ -0,0 +1,517 @@
# Settings
# TESTER3_DEFINITIONS
set(TESTER3_DEFINITIONS_SSE2 ATR=cinz_ DPTYPE=__m128d SPTYPE=__m128 DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=sse2)
set(TESTER3_DEFINITIONS_SSE4 ATR=cinz_ DPTYPE=__m128d SPTYPE=__m128 DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=sse4)
set(TESTER3_DEFINITIONS_AVX2128 ATR=finz_ DPTYPE=__m128d SPTYPE=__m128 DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=avx2128)
set(TESTER3_DEFINITIONS_AVX ATR=cinz_ DPTYPE=__m256d SPTYPE=__m256 DPTYPESPEC=d4 SPTYPESPEC=f8 EXTSPEC=avx)
set(TESTER3_DEFINITIONS_FMA4 ATR=finz_ DPTYPE=__m256d SPTYPE=__m256 DPTYPESPEC=d4 SPTYPESPEC=f8 EXTSPEC=fma4)
set(TESTER3_DEFINITIONS_AVX2 ATR=finz_ DPTYPE=__m256d SPTYPE=__m256 DPTYPESPEC=d4 SPTYPESPEC=f8 EXTSPEC=avx2)
set(TESTER3_DEFINITIONS_AVX512F ATR=finz_ DPTYPE=__m512d SPTYPE=__m512 DPTYPESPEC=d8 SPTYPESPEC=f16 EXTSPEC=avx512f)
set(TESTER3_DEFINITIONS_AVX512FNOFMA ATR=cinz_ DPTYPE=__m512d SPTYPE=__m512 DPTYPESPEC=d8 SPTYPESPEC=f16 EXTSPEC=avx512fnofma)
set(TESTER3_DEFINITIONS_ADVSIMD ATR=finz_ DPTYPE=float64x2_t SPTYPE=float32x4_t DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=advsimd)
set(TESTER3_DEFINITIONS_ADVSIMDNOFMA ATR=cinz_ DPTYPE=float64x2_t SPTYPE=float32x4_t DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=advsimdnofma)
set(TESTER3_DEFINITIONS_SVE ATR=finz_ DPTYPE=svfloat64_t SPTYPE=svfloat32_t DPTYPESPEC=dx SPTYPESPEC=fx EXTSPEC=sve)
set(TESTER3_DEFINITIONS_SVENOFMA ATR=cinz_ DPTYPE=svfloat64_t SPTYPE=svfloat32_t DPTYPESPEC=dx SPTYPESPEC=fx EXTSPEC=svenofma)
set(TESTER3_DEFINITIONS_VSX ATR=finz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vsx)
set(TESTER3_DEFINITIONS_VSXNOFMA ATR=cinz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vsxnofma)
set(TESTER3_DEFINITIONS_VSX3 ATR=finz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vsx3)
set(TESTER3_DEFINITIONS_VSX3NOFMA ATR=cinz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vsx3nofma)
set(TESTER3_DEFINITIONS_VXE ATR=finz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vxe)
set(TESTER3_DEFINITIONS_VXENOFMA ATR=cinz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vxenofma)
set(TESTER3_DEFINITIONS_VXE2 ATR=finz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vxe2)
set(TESTER3_DEFINITIONS_VXE2NOFMA ATR=cinz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vxe2nofma)
set(TESTER3_DEFINITIONS_RVVM1 ATR=finz_ DPTYPE=vfloat64m1_t SPTYPE=vfloat32m1_t DPTYPESPEC=dx SPTYPESPEC=fx EXTSPEC=rvvm1 ENABLE_RVVM1)
set(TESTER3_DEFINITIONS_RVVM1NOFMA ATR=cinz_ DPTYPE=vfloat64m1_t SPTYPE=vfloat32m1_t DPTYPESPEC=dx SPTYPESPEC=fx EXTSPEC=rvvm1nofma ENABLE_RVVM1)
set(TESTER3_DEFINITIONS_RVVM2 ATR=finz_ DPTYPE=vfloat64m2_t SPTYPE=vfloat32m2_t DPTYPESPEC=dx SPTYPESPEC=fx EXTSPEC=rvvm2 ENABLE_RVVM2)
set(TESTER3_DEFINITIONS_RVVM2NOFMA ATR=cinz_ DPTYPE=vfloat64m2_t SPTYPE=vfloat32m2_t DPTYPESPEC=dx SPTYPESPEC=fx EXTSPEC=rvvm2nofma ENABLE_RVVM2)
set(TESTER3_DEFINITIONS_PUREC_SCALAR ATR=cinz_ DPTYPE=double SPTYPE=float DPTYPESPEC=d1 SPTYPESPEC=f1 EXTSPEC=purec)
set(TESTER3_DEFINITIONS_PURECFMA_SCALAR ATR=finz_ DPTYPE=double SPTYPE=float DPTYPESPEC=d1 SPTYPESPEC=f1 EXTSPEC=purecfma)
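# (ATR appears to select the name prefix of the deterministic function
# variants: finz_ for configurations built with FMA, cinz_ for those
# without, matching the TEST3_FINZ/TEST3_CINZ grouping below.)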
#
if (SLEEF_ARCH_X86)
set(TEST3_CINZ purec_scalar sse2 sse4 avx avx512fnofma)
set(TEST3_FINZ purecfma_scalar avx2128 avx2 avx512f)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
set(TEST3_CINZ purec_scalar advsimdnofma svenofma)
set(TEST3_FINZ purecfma_scalar advsimd sve)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
set(TEST3_CINZ purec_scalar)
set(TEST3_FINZ purecfma_scalar)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64")
set(TEST3_CINZ purec_scalar vsxnofma vsx3nofma)
set(TEST3_FINZ purecfma_scalar vsx vsx3)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x")
set(TEST3_CINZ purec_scalar vxenofma vxe2nofma)
set(TEST3_FINZ purecfma_scalar vxe vxe2)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
set(TEST3_CINZ purec_scalar rvvm1nofma rvvm2nofma)
set(TEST3_FINZ purecfma_scalar rvvm1 rvvm2)
endif()
#
link_directories(${sleef_BINARY_DIR}/lib) # libsleef
link_directories(${sleef_BINARY_DIR}/src/common) # common.a
include_directories(${sleef_BINARY_DIR}/include) # sleef.h
include_directories(${sleef_SOURCE_DIR}/src/libm) # rename.h
include_directories(${sleef_BINARY_DIR}/src/libm/include) # rename headers
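# Without MPFR the reference tester cannot be built from source, so look
# for a pre-installed 'tester' binary on the PATH instead.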
if(NOT LIB_MPFR)
find_program(TESTER_COMMAND tester)
endif(NOT LIB_MPFR)
if (SLEEF_ENFORCE_TESTER AND NOT LIB_MPFR AND NOT TESTER_COMMAND)
message(FATAL_ERROR "SLEEF_ENFORCE_TESTER is specified and tester is not available")
endif(SLEEF_ENFORCE_TESTER AND NOT LIB_MPFR AND NOT TESTER_COMMAND)
find_library(LIBRT rt)
if (NOT LIBRT)
set(LIBRT "")
endif()
set(CMAKE_C_FLAGS "${ORG_CMAKE_C_FLAGS} ${SLEEF_C_FLAGS} ${FLAGS_NOSTRICTALIASING}")
set(COMMON_TARGET_PROPERTIES
C_STANDARD 99 # -std=gnu99
)
if (SLEEF_ENABLE_LTO)
list(APPEND COMMON_TARGET_PROPERTIES INTERPROCEDURAL_OPTIMIZATION TRUE) # -flto
endif()
#
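# add_test_iut registers a ctest entry that runs the given IUT
# (implementation under test) binary under the tester driver, forwarding
# SDE, armie, or emulator options when those are configured; C is the
# relative COST that ctest uses to schedule tests.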
function(add_test_iut IUT C)
if (LIB_MPFR)
set(TESTER ${TARGET_TESTER})
elseif(TESTER_COMMAND)
set(TESTER ${TESTER_COMMAND})
endif()
# When we are cross-compiling using the mkrename* tools from a native
# build, we use the tester executable from the native build.
if (CMAKE_CROSSCOMPILING AND NATIVE_BUILD_DIR)
set(TESTER ${NATIVE_BUILD_DIR}/bin/${TARGET_TESTER})
endif(CMAKE_CROSSCOMPILING AND NATIVE_BUILD_DIR)
if (TESTER)
if (NOT EMULATOR)
if (SDE_COMMAND)
set(FLAGS_SDE "--sde" ${SDE_COMMAND})
else()
set(FLAGS_SDE)
endif()
if (ARMIE_COMMAND)
set(FLAGS_ARMIE ${ARMIE_COMMAND} -msve-vector-bits=${SVE_VECTOR_BITS})
else()
set(FLAGS_ARMIE)
endif()
add_test(NAME ${IUT}
COMMAND ${TESTER} ${FLAGS_SDE} ${FLAGS_ARMIE} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${IUT}
WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
set_tests_properties(${IUT} PROPERTIES COST ${C})
else()
add_test(NAME ${IUT}
COMMAND ${TESTER} "--qemu" ${EMULATOR} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${IUT}
WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
set_tests_properties(${IUT} PROPERTIES COST ${C})
endif()
endif()
endfunction()
# Compile executable 'iut'
add_executable(${TARGET_IUT} iut.c testerutil.c)
target_compile_definitions(${TARGET_IUT} PRIVATE ${COMMON_TARGET_DEFINITIONS})
target_link_libraries(${TARGET_IUT} ${TARGET_LIBSLEEF}
${LIBM} ${LIBRT})
set_target_properties(${TARGET_IUT} PROPERTIES ${COMMON_TARGET_PROPERTIES})
add_test_iut(${TARGET_IUT} 1.0)
set(IUT_LIST ${TARGET_IUT})
# Compile executable 'iutcuda'
if (SLEEF_BUILD_INLINE_HEADERS AND SED_COMMAND AND CMAKE_CUDA_COMPILER)
add_executable(iutcuda iutcuda.cu)
set_target_properties(iutcuda PROPERTIES LINKER_LANGUAGE CUDA)
target_compile_options(iutcuda PRIVATE "--fmad=false;-Xcompiler;-ffp-contract=off")
add_dependencies(iutcuda ${TARGET_INLINE_HEADERS})
add_test_iut(iutcuda 20.0)
list(APPEND IUT_LIST iutcuda)
endif()
set(IUT_SRC iutsimd.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c testerutil.c)
# Add vector extension `iut`s
macro(test_extension SIMD)
if(COMPILER_SUPPORTS_${SIMD})
string(TOLOWER ${SIMD} LCSIMD)
string(CONCAT TARGET_IUT${SIMD} "iut" ${LCSIMD})
add_executable(${TARGET_IUT${SIMD}} ${IUT_SRC})
target_compile_options(${TARGET_IUT${SIMD}}
PRIVATE ${FLAGS_ENABLE_${SIMD}})
target_compile_definitions(${TARGET_IUT${SIMD}}
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS})
target_link_libraries(${TARGET_IUT${SIMD}} ${TARGET_LIBSLEEF}
${LIBM} ${LIBRT})
if (FORCE_AAVPCS)
target_compile_definitions(${TARGET_IUT${SIMD}} PRIVATE ENABLE_AAVPCS=1)
endif(FORCE_AAVPCS)
add_dependencies(${TARGET_IUT${SIMD}} ${TARGET_HEADERS})
add_dependencies(${TARGET_IUT${SIMD}} ${TARGET_LIBSLEEF})
set_target_properties(${TARGET_IUT${SIMD}} PROPERTIES ${COMMON_TARGET_PROPERTIES})
if (DEFINED COSTOVERRIDE_${SIMD})
add_test_iut(${TARGET_IUT${SIMD}} ${COSTOVERRIDE_${SIMD}})
else()
add_test_iut(${TARGET_IUT${SIMD}} 1.0)
endif()
list(APPEND IUT_LIST ${TARGET_IUT${SIMD}})
# The iut programs whose names begin with "iuty" are the iut for the
# deterministic version of functions. By checking the result of
# testing with iutysse2, for example, it can be checked that the
# corresponding deterministic functions passes the accuracy and
# nonnumber tests.
string(CONCAT IUTYNAME "iuty" ${LCSIMD})
add_executable(${IUTYNAME} ${IUT_SRC})
target_compile_options(${IUTYNAME}
PRIVATE ${FLAGS_ENABLE_${SIMD}})
target_compile_definitions(${IUTYNAME}
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS} DETERMINISTIC=1)
target_link_libraries(${IUTYNAME} ${TARGET_LIBSLEEF}
${LIBM} ${LIBRT})
add_dependencies(${IUTYNAME} ${TARGET_HEADERS})
add_dependencies(${IUTYNAME} ${TARGET_LIBSLEEF})
set_target_properties(${IUTYNAME} PROPERTIES ${COMMON_TARGET_PROPERTIES})
if (DEFINED COSTOVERRIDE_${SIMD})
add_test_iut(${IUTYNAME} ${COSTOVERRIDE_${SIMD}})
else()
add_test_iut(${IUTYNAME} 1.0)
endif()
list(APPEND IUT_LIST ${IUTYNAME})
# The iut programs whose names begin with "iuti" are the iut for the
# inline version of functions.
if (SLEEF_BUILD_INLINE_HEADERS AND SED_COMMAND)
string(CONCAT IUTINAME "iuti" ${LCSIMD})
add_executable(${IUTINAME} ${IUT_SRC})
target_compile_options(${IUTINAME} PRIVATE ${FLAGS_ENABLE_${SIMD}})
target_compile_definitions(${IUTINAME}
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS}
USE_INLINE_HEADER="sleefinline_${LCSIMD}.h"
MACRO_ONLY_HEADER="macroonly${SIMD}.h"
SIMD_SUFFIX=_${LCSIMD}_sleef
)
target_include_directories(${IUTINAME} PRIVATE ${PROJECT_BINARY_DIR}/include)
target_link_libraries(${IUTINAME} ${LIBM} ${LIBRT})
add_dependencies(${IUTINAME} ${TARGET_INLINE_HEADERS})
set_target_properties(${IUTINAME} PROPERTIES C_STANDARD 99)
if (DEFINED COSTOVERRIDE_${SIMD})
add_test_iut(${IUTINAME} ${COSTOVERRIDE_${SIMD}})
else()
add_test_iut(${IUTINAME} 1.0)
endif()
list(APPEND IUT_LIST ${IUTINAME})
endif(SLEEF_BUILD_INLINE_HEADERS AND SED_COMMAND)
if(LIB_MPFR AND NOT ${SIMD} STREQUAL NEON32 AND NOT ${SIMD} STREQUAL NEON32VFPV4 AND NOT MINGW)
# Build tester2 SIMD
string(TOLOWER ${SIMD} SCSIMD)
foreach(P dp sp)
set(T "tester2${SCSIMD}${P}")
add_executable(${T} tester2simd${P}.c testerutil.c)
if(FORCE_AAVPCS)
target_compile_definitions(${T} PRIVATE ENABLE_AAVPCS=1)
endif(FORCE_AAVPCS)
target_compile_options(${T} PRIVATE ${FLAGS_ENABLE_${SIMD}})
target_compile_definitions(${T} PRIVATE ENABLE_${SIMD}=1 USEMPFR=1 ${COMMON_TARGET_DEFINITIONS})
set_target_properties(${T} PROPERTIES ${COMMON_TARGET_PROPERTIES})
target_link_libraries(${T} ${TARGET_LIBSLEEF} ${LIB_MPFR} ${LIBM} ${LIBGMP})
add_dependencies(${T} ${TARGET_HEADERS})
add_dependencies(${T} ${TARGET_LIBSLEEF})
if (MPFR_INCLUDE_DIR)
target_include_directories(${T} PRIVATE ${MPFR_INCLUDE_DIR})
endif()
# The tester2 programs whose names begin with "tester2y" are the
# testing programs for the deterministic versions of the functions.
set(T "tester2y${SCSIMD}${P}")
add_executable(${T} tester2simd${P}.c testerutil.c)
target_compile_options(${T} PRIVATE ${FLAGS_ENABLE_${SIMD}})
target_compile_definitions(${T} PRIVATE ENABLE_${SIMD}=1 USEMPFR=1 ${COMMON_TARGET_DEFINITIONS} DETERMINISTIC=1)
set_target_properties(${T} PROPERTIES ${COMMON_TARGET_PROPERTIES})
target_link_libraries(${T} ${TARGET_LIBSLEEF} ${LIB_MPFR} ${LIBM} ${LIBGMP})
add_dependencies(${T} ${TARGET_HEADERS})
add_dependencies(${T} ${TARGET_LIBSLEEF})
if (MPFR_INCLUDE_DIR)
target_include_directories(${T} PRIVATE ${MPFR_INCLUDE_DIR})
endif()
endforeach()
endif()
if(NOT ${SIMD} STREQUAL NEON32 AND NOT ${SIMD} STREQUAL NEON32VFPV4 AND SLEEF_OPENSSL_FOUND)
# Build tester3
string(TOLOWER ${SIMD} SCSIMD)
set(T "tester3${SCSIMD}")
add_executable(${T} tester3.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c testerutil.c)
target_compile_options(${T} PRIVATE ${FLAGS_ENABLE_${SIMD}})
target_compile_definitions(${T} PRIVATE ${COMMON_TARGET_DEFINITIONS} ${TESTER3_DEFINITIONS_${SIMD}})
set_target_properties(${T} PROPERTIES ${COMMON_TARGET_PROPERTIES})
# Enable Vector PCS for Advanced SIMD (if supported)
if(FORCE_AAVPCS)
host_target_AAVPCS_definitions(${T})
endif()
target_link_libraries(${T} ${TARGET_LIBSLEEF} ${LIBM} ${SLEEF_OPENSSL_LIBRARIES})
target_include_directories(${T} PRIVATE ${SLEEF_OPENSSL_INCLUDE_DIR})
add_dependencies(${T} ${TARGET_HEADERS})
add_dependencies(${T} ${TARGET_LIBSLEEF})
# Add test with tester3
list(FIND TEST3_CINZ ${SCSIMD} INDEX_TEST3_CINZ)
if (NOT INDEX_TEST3_CINZ EQUAL -1)
if (SDE_COMMAND)
add_test(NAME tester3${SCSIMD} COMMAND ${SDE_COMMAND} "--" ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/tester3${SCSIMD} ${sleef_SOURCE_DIR}/src/libm-tester/hash_cinz.txt)
elseif(EMULATOR)
add_test(NAME tester3${SCSIMD} COMMAND ${EMULATOR} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/tester3${SCSIMD} ${sleef_SOURCE_DIR}/src/libm-tester/hash_cinz.txt)
else()
add_test(NAME tester3${SCSIMD} COMMAND tester3${SCSIMD} ${sleef_SOURCE_DIR}/src/libm-tester/hash_cinz.txt)
endif()
if (DEFINED COSTOVERRIDE_${SIMD})
set_tests_properties(tester3${SCSIMD} PROPERTIES COST ${COSTOVERRIDE_${SIMD}})
else()
set_tests_properties(tester3${SCSIMD} PROPERTIES COST 0.5)
endif()
endif()
list(FIND TEST3_FINZ ${SCSIMD} INDEX_TEST3_FINZ)
if (NOT INDEX_TEST3_FINZ EQUAL -1)
if (SDE_COMMAND)
add_test(NAME tester3${SCSIMD} COMMAND ${SDE_COMMAND} "--" ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/tester3${SCSIMD} ${sleef_SOURCE_DIR}/src/libm-tester/hash_finz.txt)
elseif(EMULATOR)
add_test(NAME tester3${SCSIMD} COMMAND ${EMULATOR} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/tester3${SCSIMD} ${sleef_SOURCE_DIR}/src/libm-tester/hash_finz.txt)
else()
add_test(NAME tester3${SCSIMD} COMMAND tester3${SCSIMD} ${sleef_SOURCE_DIR}/src/libm-tester/hash_finz.txt)
endif()
if (DEFINED COSTOVERRIDE_${SIMD})
set_tests_properties(tester3${SCSIMD} PROPERTIES COST ${COSTOVERRIDE_${SIMD}})
else()
set_tests_properties(tester3${SCSIMD} PROPERTIES COST 0.5)
endif()
endif()
endif()
endif(COMPILER_SUPPORTS_${SIMD})
endmacro(test_extension)
foreach(SIMD ${SLEEF_SUPPORTED_LIBM_EXTENSIONS})
test_extension(${SIMD})
endforeach()
function(add_gnuabi_compatibility_test SIMD MASKED)
if (MASKED)
set(GNUABI_COMPATIBILITY_TEST gnuabi_compatibility_${SIMD}_masked)
else(MASKED)
set(GNUABI_COMPATIBILITY_TEST gnuabi_compatibility_${SIMD})
endif(MASKED)
add_executable(${GNUABI_COMPATIBILITY_TEST} gnuabi_compatibility.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c)
set_target_properties(${GNUABI_COMPATIBILITY_TEST} PROPERTIES ${COMMON_TARGET_PROPERTIES})
target_compile_options(${GNUABI_COMPATIBILITY_TEST}
PRIVATE ${FLAGS_ENABLE_${SIMD}})
if (MASKED)
target_compile_definitions(${GNUABI_COMPATIBILITY_TEST}
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS} MASKED_GNUABI=1)
else(MASKED)
target_compile_definitions(${GNUABI_COMPATIBILITY_TEST}
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS})
endif(MASKED)
if (FORCE_AAVPCS)
target_compile_definitions(${GNUABI_COMPATIBILITY_TEST} PRIVATE ENABLE_AAVPCS=1)
endif(FORCE_AAVPCS)
target_link_libraries(${GNUABI_COMPATIBILITY_TEST} ${TARGET_LIBSLEEFGNUABI} ${LIBM})
# These are linker tests that don't really need to be executed, but
# seeing them in the ctest report gives an idea of what has been
# built for testing.
if (EMULATOR)
add_test(NAME ${GNUABI_COMPATIBILITY_TEST}
COMMAND ${EMULATOR} $<TARGET_FILE:${GNUABI_COMPATIBILITY_TEST}>
WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
elseif(SDE_COMMAND)
add_test(NAME ${GNUABI_COMPATIBILITY_TEST}
COMMAND ${SDE_COMMAND} "--" $<TARGET_FILE:${GNUABI_COMPATIBILITY_TEST}>)
else()
add_test(NAME ${GNUABI_COMPATIBILITY_TEST}
COMMAND $<TARGET_FILE:${GNUABI_COMPATIBILITY_TEST}>)
endif(EMULATOR)
endfunction(add_gnuabi_compatibility_test)
if(ENABLE_GNUABI)
foreach(SIMD ${SLEEF_SUPPORTED_GNUABI_EXTENSIONS})
if(COMPILER_SUPPORTS_${SIMD})
# GNUABI compatibility for the unmasked symbols.
add_gnuabi_compatibility_test(${SIMD} OFF)
# GNUABI compatibility for the masked symbols.
if (MKMASKED_PARAMS_GNUABI_${SIMD}_sp)
add_gnuabi_compatibility_test(${SIMD} ON)
endif(MKMASKED_PARAMS_GNUABI_${SIMD}_sp)
endif (COMPILER_SUPPORTS_${SIMD})
endforeach(SIMD ${SLEEF_SUPPORTED_GNUABI_EXTENSIONS})
endif(ENABLE_GNUABI)
#
if (SLEEF_ARCH_X86)
# iutdsp128
add_executable(iutdsp128 ${IUT_SRC})
target_compile_definitions(iutdsp128 PRIVATE ENABLE_DSP128=1 ${COMMON_TARGET_DEFINITIONS})
target_compile_options(iutdsp128 PRIVATE ${FLAGS_ENABLE_SSE2})
target_link_libraries(iutdsp128 ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT})
add_dependencies(iutdsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF})
add_test_iut(iutdsp128 1.0)
list(APPEND IUT_LIST iutdsp128)
# iutdsp256
add_executable(iutdsp256 ${IUT_SRC})
target_compile_definitions(iutdsp256 PRIVATE ENABLE_DSP256=1 ${COMMON_TARGET_DEFINITIONS})
target_compile_options(iutdsp256 PRIVATE ${FLAGS_ENABLE_AVX})
target_link_libraries(iutdsp256 ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT})
add_dependencies(iutdsp256 ${TARGET_HEADERS} ${TARGET_LIBSLEEF})
add_test_iut(iutdsp256 1.0)
list(APPEND IUT_LIST iutdsp256)
endif(SLEEF_ARCH_X86)
if (SLEEF_ARCH_PPC64)
add_executable(iutdsp128 ${IUT_SRC})
target_compile_definitions(iutdsp128 PRIVATE ENABLE_DSPPOWER_128=1 ${COMMON_TARGET_DEFINITIONS})
target_compile_options(iutdsp128 PRIVATE ${FLAGS_ENABLE_VSX})
target_link_libraries(iutdsp128 ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT})
add_dependencies(iutdsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF})
add_test_iut(iutdsp128 1.0)
list(APPEND IUT_LIST iutdsp128)
endif(SLEEF_ARCH_PPC64)
if (SLEEF_ARCH_S390X)
add_executable(iutdsp128 ${IUT_SRC})
target_compile_definitions(iutdsp128 PRIVATE ENABLE_DSPS390X_128=1 ${COMMON_TARGET_DEFINITIONS})
target_compile_options(iutdsp128 PRIVATE ${FLAGS_ENABLE_VXE})
target_link_libraries(iutdsp128 ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT})
add_dependencies(iutdsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF})
add_test_iut(iutdsp128 1.0)
list(APPEND IUT_LIST iutdsp128)
endif(SLEEF_ARCH_S390X)
if(SLEEF_BUILD_SCALAR_LIB)
# Compile executable 'iutscalar'
add_executable(iutscalar iut.c testerutil.c)
target_compile_definitions(iutscalar PRIVATE ${COMMON_TARGET_DEFINITIONS})
target_link_libraries(iutscalar sleefscalar ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT})
set_target_properties(iutscalar PROPERTIES ${COMMON_TARGET_PROPERTIES})
add_test_iut(iutscalar 1.0)
list(APPEND IUT_LIST iutscalar)
endif()
if(LIB_MPFR AND NOT MINGW)
# Build tester2 scalar
set(PRECISIONS dp sp)
if(COMPILER_SUPPORTS_LONG_DOUBLE)
list(APPEND PRECISIONS ld)
endif()
if(COMPILER_SUPPORTS_QUADMATH)
list(APPEND PRECISIONS qp)
set(LIBQUADMATH "-lquadmath")
set(ENABLEFLOAT128 PRIVATE ENABLEFLOAT128=1)
endif()
foreach(P ${PRECISIONS})
set(T "tester2${P}")
add_executable(${T} tester2${P}.c testerutil.c)
target_compile_definitions(${T} PRIVATE USEMPFR=1 ${ENABLEFLOAT128} ${COMMON_TARGET_DEFINITIONS})
set_target_properties(${T} PROPERTIES ${COMMON_TARGET_PROPERTIES})
if (FORCE_AAVPCS)
target_compile_definitions(${T} PRIVATE ENABLE_AAVPCS=1)
endif(FORCE_AAVPCS)
if (MPFR_INCLUDE_DIR)
target_include_directories(${T} PRIVATE ${MPFR_INCLUDE_DIR})
endif()
target_link_libraries(${T} ${TARGET_LIBSLEEF} ${LIBQUADMATH} ${LIB_MPFR} ${LIBM} ${LIBGMP})
add_dependencies(${T} ${TARGET_HEADERS})
add_dependencies(${T} ${TARGET_LIBSLEEF})
endforeach()
# Compile executable 'tester'
add_host_executable(${TARGET_TESTER} tester.c testerutil.c)
if (NOT CMAKE_CROSSCOMPILING)
target_link_libraries(${TARGET_TESTER} ${LIB_MPFR} ${TARGET_LIBSLEEF} ${LIBM} ${LIBGMP})
target_compile_definitions(${TARGET_TESTER}
PRIVATE USEMPFR=1 ${COMMON_TARGET_DEFINITIONS})
target_compile_options(${TARGET_TESTER} PRIVATE -Wno-unused-result)
set_target_properties(${TARGET_TESTER} PROPERTIES ${COMMON_TARGET_PROPERTIES})
if (MPFR_INCLUDE_DIR)
target_include_directories(${TARGET_TESTER} PRIVATE ${MPFR_INCLUDE_DIR})
endif()
endif()
endif(LIB_MPFR AND NOT MINGW)
if(ENABLE_GNUABI AND COMPILER_SUPPORTS_OMP_SIMD AND NOT SLEEF_TARGET_PROCESSOR MATCHES "^i.86$")
# Build tester for vectorabi
add_executable(testervecabi testervecabi.c)
target_compile_definitions(testervecabi PRIVATE ${COMMON_TARGET_DEFINITIONS})
target_compile_options(testervecabi PRIVATE ${OpenMP_C_FLAGS})
target_link_libraries(testervecabi ${TARGET_LIBSLEEF} ${OpenMP_C_FLAGS})
set_target_properties(testervecabi PROPERTIES C_STANDARD 99)
add_test(NAME testervecabi COMMAND ${EMULATOR} testervecabi
WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
endif()
# mveclibtest
if (ENABLE_GNUABI AND SLEEF_ARCH_X86 AND CMAKE_C_COMPILER_ID MATCHES "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 7.99)
add_executable(mveclibtest-sse2 mveclibtest.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c)
set_target_properties(mveclibtest-sse2 PROPERTIES C_STANDARD 99)
target_compile_options(mveclibtest-sse2 PRIVATE ${FLAGS_FASTMATH} "-O3")
target_link_libraries(mveclibtest-sse2 ${TARGET_LIBSLEEF} ${TARGET_LIBSLEEFGNUABI})
add_dependencies(mveclibtest-sse2 ${TARGET_HEADERS})
add_test(NAME mveclibtest-sse2 COMMAND mveclibtest-sse2)
add_executable(mveclibtest-avx mveclibtest.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c)
set_target_properties(mveclibtest-avx PROPERTIES C_STANDARD 99)
target_compile_options(mveclibtest-avx PRIVATE ${FLAGS_FASTMATH} ${FLAGS_ENABLE_AVX} "-O3")
target_link_libraries(mveclibtest-avx ${TARGET_LIBSLEEF} ${TARGET_LIBSLEEFGNUABI})
add_dependencies(mveclibtest-avx ${TARGET_HEADERS})
add_test(NAME mveclibtest-avx COMMAND mveclibtest-avx)
add_executable(mveclibtest-avx2 mveclibtest.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c)
set_target_properties(mveclibtest-avx2 PROPERTIES C_STANDARD 99)
target_compile_options(mveclibtest-avx2 PRIVATE ${FLAGS_FASTMATH} ${FLAGS_ENABLE_AVX2} "-O3")
target_link_libraries(mveclibtest-avx2 ${TARGET_LIBSLEEF} ${TARGET_LIBSLEEFGNUABI})
add_dependencies(mveclibtest-avx2 ${TARGET_HEADERS})
add_test(NAME mveclibtest-avx2 COMMAND mveclibtest-avx2)
add_executable(mveclibtest-avx512f mveclibtest.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c)
set_target_properties(mveclibtest-avx512f PROPERTIES C_STANDARD 99)
target_compile_options(mveclibtest-avx512f PRIVATE ${FLAGS_FASTMATH} ${FLAGS_ENABLE_AVX512F} "-O3")
target_link_libraries(mveclibtest-avx512f ${TARGET_LIBSLEEF} ${TARGET_LIBSLEEFGNUABI})
add_dependencies(mveclibtest-avx512f ${TARGET_HEADERS})
add_test(NAME mveclibtest-avx512f COMMAND mveclibtest-avx512f)
endif()
#
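# Auto-vectorization tests: compile autovec.c and testervecabi.c to
# assembly with -fopenmp and -msse2/-mavx2, then run FileCheck over the
# output to verify that the compiler emitted calls to the _ZGV* vector
# variants.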
if (FILECHECK_COMMAND AND COMPILER_SUPPORTS_OPENMP AND SLEEF_ARCH_X86 AND CMAKE_C_COMPILER_ID MATCHES "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 7.99)
add_test(NAME autovec-avx2 COMMAND sh -c "${CMAKE_C_COMPILER} -Wno-attributes -fopenmp -mavx2 -O3 ${CMAKE_CURRENT_SOURCE_DIR}/autovec.c -I${sleef_BINARY_DIR}/include -S -o- | ${FILECHECK_COMMAND} ${CMAKE_CURRENT_SOURCE_DIR}/autovec.c -check-prefix=CHECK-AVX2")
add_test(NAME autovec-sse2 COMMAND sh -c "${CMAKE_C_COMPILER} -Wno-attributes -fopenmp -msse2 -O3 ${CMAKE_CURRENT_SOURCE_DIR}/autovec.c -I${sleef_BINARY_DIR}/include -S -o- | ${FILECHECK_COMMAND} ${CMAKE_CURRENT_SOURCE_DIR}/autovec.c -check-prefix=CHECK-SSE2")
add_test(NAME testervecabi-sse2 COMMAND sh -c "${CMAKE_C_COMPILER} -Wno-attributes -fopenmp -msse2 -O3 ${CMAKE_CURRENT_SOURCE_DIR}/testervecabi.c -I${sleef_BINARY_DIR}/include -S -o- | ${FILECHECK_COMMAND} ${CMAKE_CURRENT_SOURCE_DIR}/testervecabi.c -check-prefix=CHECK-SSE2")
add_test(NAME testervecabi-avx2 COMMAND sh -c "${CMAKE_C_COMPILER} -Wno-attributes -fopenmp -mavx2 -O3 ${CMAKE_CURRENT_SOURCE_DIR}/testervecabi.c -I${sleef_BINARY_DIR}/include -S -o- | ${FILECHECK_COMMAND} ${CMAKE_CURRENT_SOURCE_DIR}/testervecabi.c -check-prefix=CHECK-AVX2")
endif()
# Tests depend on the library
add_dependencies(${TARGET_IUT} ${TARGET_HEADERS})

View File

@@ -0,0 +1,651 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#define SLEEF_ENABLE_OMP_SIMD
#include "sleef.h"
#define N 1024
double a[N], b[N], c[N], d[N];
float e[N], f[N], g[N], h[N];
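// The CHECK-SSE2/CHECK-AVX2 lines below are LLVM FileCheck patterns: the
// build compiles this file to assembly with -fopenmp -O3 and -msse2 or
// -mavx2, then verifies that each loop was vectorized into a call to the
// mangled _ZGV* variant of the corresponding Sleef_* scalar function.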
void testsind1_u10() {
// CHECK-SSE2: testsind1_u10
// CHECK-AVX2: testsind1_u10
for(int i=0;i<N;i++) a[i] = Sleef_sind1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_sind1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_sind1_u10
}
void testsind1_u35() {
// CHECK-SSE2: testsind1_u35
// CHECK-AVX2: testsind1_u35
for(int i=0;i<N;i++) a[i] = Sleef_sind1_u35(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_sind1_u35
// CHECK-AVX2: _ZGVdN4v_Sleef_sind1_u35
}
void testsinf1_u10() {
// CHECK-SSE2: testsinf1_u10
// CHECK-AVX2: testsinf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_sinf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_sinf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_sinf1_u10
}
void testsinf1_u35() {
// CHECK-SSE2: testsinf1_u35
// CHECK-AVX2: testsinf1_u35
for(int i=0;i<N;i++) e[i] = Sleef_sinf1_u35(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_sinf1_u35
// CHECK-AVX2: _ZGVdN8v_Sleef_sinf1_u35
}
void testcosd1_u10() {
// CHECK-SSE2: testcosd1_u10
// CHECK-AVX2: testcosd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_cosd1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_cosd1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_cosd1_u10
}
void testcosd1_u35() {
// CHECK-SSE2: testcosd1_u35
// CHECK-AVX2: testcosd1_u35
for(int i=0;i<N;i++) a[i] = Sleef_cosd1_u35(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_cosd1_u35
// CHECK-AVX2: _ZGVdN4v_Sleef_cosd1_u35
}
void testcosf1_u10() {
// CHECK-SSE2: testcosf1_u10
// CHECK-AVX2: testcosf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_cosf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_cosf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_cosf1_u10
}
void testcosf1_u35() {
// CHECK-SSE2: testcosf1_u35
// CHECK-AVX2: testcosf1_u35
for(int i=0;i<N;i++) e[i] = Sleef_cosf1_u35(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_cosf1_u35
// CHECK-AVX2: _ZGVdN8v_Sleef_cosf1_u35
}
void testtand1_u10() {
// CHECK-SSE2: testtand1_u10
// CHECK-AVX2: testtand1_u10
for(int i=0;i<N;i++) a[i] = Sleef_tand1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_tand1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_tand1_u10
}
void testtand1_u35() {
// CHECK-SSE2: testtand1_u35
// CHECK-AVX2: testtand1_u35
for(int i=0;i<N;i++) a[i] = Sleef_tand1_u35(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_tand1_u35
// CHECK-AVX2: _ZGVdN4v_Sleef_tand1_u35
}
void testtanf1_u10() {
// CHECK-SSE2: testtanf1_u10
// CHECK-AVX2: testtanf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_tanf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_tanf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_tanf1_u10
}
void testtanf1_u35() {
// CHECK-SSE2: testtanf1_u35
// CHECK-AVX2: testtanf1_u35
for(int i=0;i<N;i++) e[i] = Sleef_tanf1_u35(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_tanf1_u35
// CHECK-AVX2: _ZGVdN8v_Sleef_tanf1_u35
}
void testasind1_u10() {
// CHECK-SSE2: testasind1_u10
// CHECK-AVX2: testasind1_u10
for(int i=0;i<N;i++) a[i] = Sleef_asind1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_asind1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_asind1_u10
}
void testasind1_u35() {
// CHECK-SSE2: testasind1_u35
// CHECK-AVX2: testasind1_u35
for(int i=0;i<N;i++) a[i] = Sleef_asind1_u35(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_asind1_u35
// CHECK-AVX2: _ZGVdN4v_Sleef_asind1_u35
}
void testasinf1_u10() {
// CHECK-SSE2: testasinf1_u10
// CHECK-AVX2: testasinf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_asinf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_asinf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_asinf1_u10
}
void testasinf1_u35() {
// CHECK-SSE2: testasinf1_u35
// CHECK-AVX2: testasinf1_u35
for(int i=0;i<N;i++) e[i] = Sleef_asinf1_u35(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_asinf1_u35
// CHECK-AVX2: _ZGVdN8v_Sleef_asinf1_u35
}
void testacosd1_u10() {
// CHECK-SSE2: testacosd1_u10
// CHECK-AVX2: testacosd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_acosd1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_acosd1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_acosd1_u10
}
void testacosd1_u35() {
// CHECK-SSE2: testacosd1_u35
// CHECK-AVX2: testacosd1_u35
for(int i=0;i<N;i++) a[i] = Sleef_acosd1_u35(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_acosd1_u35
// CHECK-AVX2: _ZGVdN4v_Sleef_acosd1_u35
}
void testacosf1_u10() {
// CHECK-SSE2: testacosf1_u10
// CHECK-AVX2: testacosf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_acosf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_acosf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_acosf1_u10
}
void testacosf1_u35() {
// CHECK-SSE2: testacosf1_u35
// CHECK-AVX2: testacosf1_u35
for(int i=0;i<N;i++) e[i] = Sleef_acosf1_u35(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_acosf1_u35
// CHECK-AVX2: _ZGVdN8v_Sleef_acosf1_u35
}
void testatand1_u10() {
// CHECK-SSE2: testatand1_u10
// CHECK-AVX2: testatand1_u10
for(int i=0;i<N;i++) a[i] = Sleef_atand1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_atand1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_atand1_u10
}
void testatand1_u35() {
// CHECK-SSE2: testatand1_u35
// CHECK-AVX2: testatand1_u35
for(int i=0;i<N;i++) a[i] = Sleef_atand1_u35(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_atand1_u35
// CHECK-AVX2: _ZGVdN4v_Sleef_atand1_u35
}
void testatanf1_u10() {
// CHECK-SSE2: testatanf1_u10
// CHECK-AVX2: testatanf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_atanf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_atanf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_atanf1_u10
}
void testatanf1_u35() {
// CHECK-SSE2: testatanf1_u35
// CHECK-AVX2: testatanf1_u35
for(int i=0;i<N;i++) e[i] = Sleef_atanf1_u35(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_atanf1_u35
// CHECK-AVX2: _ZGVdN8v_Sleef_atanf1_u35
}
void testatan2d1_u10() {
// CHECK-SSE2: testatan2d1_u10
// CHECK-AVX2: testatan2d1_u10
for(int i=0;i<N;i++) a[i] = Sleef_atan2d1_u10(b[i], c[i]);
// CHECK-SSE2: _ZGVbN2vv_Sleef_atan2d1_u10
// CHECK-AVX2: _ZGVdN4vv_Sleef_atan2d1_u10
}
void testatan2d1_u35() {
// CHECK-SSE2: testatan2d1_u35
// CHECK-AVX2: testatan2d1_u35
for(int i=0;i<N;i++) a[i] = Sleef_atan2d1_u35(b[i], c[i]);
// CHECK-SSE2: _ZGVbN2vv_Sleef_atan2d1_u35
// CHECK-AVX2: _ZGVdN4vv_Sleef_atan2d1_u35
}
void testatan2f1_u10() {
// CHECK-SSE2: testatan2f1_u10
// CHECK-AVX2: testatan2f1_u10
for(int i=0;i<N;i++) e[i] = Sleef_atan2f1_u10(f[i], g[i]);
// CHECK-SSE2: _ZGVbN4vv_Sleef_atan2f1_u10
// CHECK-AVX2: _ZGVdN8vv_Sleef_atan2f1_u10
}
void testatan2f1_u35() {
// CHECK-SSE2: testatan2f1_u35
// CHECK-AVX2: testatan2f1_u35
for(int i=0;i<N;i++) e[i] = Sleef_atan2f1_u35(f[i], g[i]);
// CHECK-SSE2: _ZGVbN4vv_Sleef_atan2f1_u35
// CHECK-AVX2: _ZGVdN8vv_Sleef_atan2f1_u35
}
void testsinhd1_u10() {
// CHECK-SSE2: testsinhd1_u10
// CHECK-AVX2: testsinhd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_sinhd1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_sinhd1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_sinhd1_u10
}
void testsinhd1_u35() {
// CHECK-SSE2: testsinhd1_u35
// CHECK-AVX2: testsinhd1_u35
for(int i=0;i<N;i++) a[i] = Sleef_sinhd1_u35(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_sinhd1_u35
// CHECK-AVX2: _ZGVdN4v_Sleef_sinhd1_u35
}
void testsinhf1_u10() {
// CHECK-SSE2: testsinhf1_u10
// CHECK-AVX2: testsinhf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_sinhf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_sinhf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_sinhf1_u10
}
void testsinhf1_u35() {
// CHECK-SSE2: testsinhf1_u35
// CHECK-AVX2: testsinhf1_u35
for(int i=0;i<N;i++) e[i] = Sleef_sinhf1_u35(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_sinhf1_u35
// CHECK-AVX2: _ZGVdN8v_Sleef_sinhf1_u35
}
void testcoshd1_u10() {
// CHECK-SSE2: testcoshd1_u10
// CHECK-AVX2: testcoshd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_coshd1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_coshd1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_coshd1_u10
}
void testcoshd1_u35() {
// CHECK-SSE2: testcoshd1_u35
// CHECK-AVX2: testcoshd1_u35
for(int i=0;i<N;i++) a[i] = Sleef_coshd1_u35(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_coshd1_u35
// CHECK-AVX2: _ZGVdN4v_Sleef_coshd1_u35
}
void testcoshf1_u10() {
// CHECK-SSE2: testcoshf1_u10
// CHECK-AVX2: testcoshf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_coshf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_coshf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_coshf1_u10
}
void testcoshf1_u35() {
// CHECK-SSE2: testcoshf1_u35
// CHECK-AVX2: testcoshf1_u35
for(int i=0;i<N;i++) e[i] = Sleef_coshf1_u35(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_coshf1_u35
// CHECK-AVX2: _ZGVdN8v_Sleef_coshf1_u35
}
void testtanhd1_u10() {
// CHECK-SSE2: testtanhd1_u10
// CHECK-AVX2: testtanhd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_tanhd1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_tanhd1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_tanhd1_u10
}
void testtanhd1_u35() {
// CHECK-SSE2: testtanhd1_u35
// CHECK-AVX2: testtanhd1_u35
for(int i=0;i<N;i++) a[i] = Sleef_tanhd1_u35(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_tanhd1_u35
// CHECK-AVX2: _ZGVdN4v_Sleef_tanhd1_u35
}
void testtanhf1_u10() {
// CHECK-SSE2: testtanhf1_u10
// CHECK-AVX2: testtanhf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_tanhf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_tanhf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_tanhf1_u10
}
void testtanhf1_u35() {
// CHECK-SSE2: testtanhf1_u35
// CHECK-AVX2: testtanhf1_u35
for(int i=0;i<N;i++) e[i] = Sleef_tanhf1_u35(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_tanhf1_u35
// CHECK-AVX2: _ZGVdN8v_Sleef_tanhf1_u35
}
void testasinhd1_u10() {
// CHECK-SSE2: testasinhd1_u10
// CHECK-AVX2: testasinhd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_asinhd1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_asinhd1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_asinhd1_u10
}
void testasinhf1_u10() {
// CHECK-SSE2: testasinhf1_u10
// CHECK-AVX2: testasinhf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_asinhf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_asinhf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_asinhf1_u10
}
void testacoshd1_u10() {
// CHECK-SSE2: testacoshd1_u10
// CHECK-AVX2: testacoshd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_acoshd1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_acoshd1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_acoshd1_u10
}
void testacoshf1_u10() {
// CHECK-SSE2: testacoshf1_u10
// CHECK-AVX2: testacoshf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_acoshf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_acoshf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_acoshf1_u10
}
void testatanhd1_u10() {
// CHECK-SSE2: testatanhd1_u10
// CHECK-AVX2: testatanhd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_atanhd1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_atanhd1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_atanhd1_u10
}
void testatanhf1_u10() {
// CHECK-SSE2: testatanhf1_u10
// CHECK-AVX2: testatanhf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_atanhf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_atanhf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_atanhf1_u10
}
void testlogd1_u10() {
// CHECK-SSE2: testlogd1_u10
// CHECK-AVX2: testlogd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_logd1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_logd1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_logd1_u10
}
void testlogd1_u35() {
// CHECK-SSE2: testlogd1_u35
// CHECK-AVX2: testlogd1_u35
for(int i=0;i<N;i++) a[i] = Sleef_logd1_u35(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_logd1_u35
// CHECK-AVX2: _ZGVdN4v_Sleef_logd1_u35
}
void testlogf1_u10() {
// CHECK-SSE2: testlogf1_u10
// CHECK-AVX2: testlogf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_logf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_logf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_logf1_u10
}
void testlogf1_u35() {
// CHECK-SSE2: testlogf1_u35
// CHECK-AVX2: testlogf1_u35
for(int i=0;i<N;i++) e[i] = Sleef_logf1_u35(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_logf1_u35
// CHECK-AVX2: _ZGVdN8v_Sleef_logf1_u35
}
void testlog2d1_u10() {
// CHECK-SSE2: testlog2d1_u10
// CHECK-AVX2: testlog2d1_u10
for(int i=0;i<N;i++) a[i] = Sleef_log2d1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_log2d1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_log2d1_u10
}
void testlog2f1_u10() {
// CHECK-SSE2: testlog2f1_u10
// CHECK-AVX2: testlog2f1_u10
for(int i=0;i<N;i++) e[i] = Sleef_log2f1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_log2f1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_log2f1_u10
}
void testlog10d1_u10() {
// CHECK-SSE2: testlog10d1_u10
// CHECK-AVX2: testlog10d1_u10
for(int i=0;i<N;i++) a[i] = Sleef_log10d1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_log10d1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_log10d1_u10
}
void testlog10f1_u10() {
// CHECK-SSE2: testlog10f1_u10
// CHECK-AVX2: testlog10f1_u10
for(int i=0;i<N;i++) e[i] = Sleef_log10f1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_log10f1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_log10f1_u10
}
void testlog1pd1_u10() {
// CHECK-SSE2: testlog1pd1_u10
// CHECK-AVX2: testlog1pd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_log1pd1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_log1pd1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_log1pd1_u10
}
void testlog1pf1_u10() {
// CHECK-SSE2: testlog1pf1_u10
// CHECK-AVX2: testlog1pf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_log1pf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_log1pf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_log1pf1_u10
}
void testexpd1_u10() {
// CHECK-SSE2: testexpd1_u10
// CHECK-AVX2: testexpd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_expd1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_expd1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_expd1_u10
}
void testexpf1_u10() {
// CHECK-SSE2: testexpf1_u10
// CHECK-AVX2: testexpf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_expf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_expf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_expf1_u10
}
void testexp2d1_u10() {
// CHECK-SSE2: testexp2d1_u10
// CHECK-AVX2: testexp2d1_u10
for(int i=0;i<N;i++) a[i] = Sleef_exp2d1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_exp2d1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_exp2d1_u10
}
void testexp2f1_u10() {
// CHECK-SSE2: testexp2f1_u10
// CHECK-AVX2: testexp2f1_u10
for(int i=0;i<N;i++) e[i] = Sleef_exp2f1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_exp2f1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_exp2f1_u10
}
void testexp10d1_u10() {
// CHECK-SSE2: testexp10d1_u10
// CHECK-AVX2: testexp10d1_u10
for(int i=0;i<N;i++) a[i] = Sleef_exp10d1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_exp10d1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_exp10d1_u10
}
void testexp10f1_u10() {
// CHECK-SSE2: testexp10f1_u10
// CHECK-AVX2: testexp10f1_u10
for(int i=0;i<N;i++) e[i] = Sleef_exp10f1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_exp10f1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_exp10f1_u10
}
void testexpm1d1_u10() {
// CHECK-SSE2: testexpm1d1_u10
// CHECK-AVX2: testexpm1d1_u10
for(int i=0;i<N;i++) a[i] = Sleef_expm1d1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_expm1d1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_expm1d1_u10
}
void testexpm1f1_u10() {
// CHECK-SSE2: testexpm1f1_u10
// CHECK-AVX2: testexpm1f1_u10
for(int i=0;i<N;i++) e[i] = Sleef_expm1f1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_expm1f1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_expm1f1_u10
}
void testpowd1_u10() {
// CHECK-SSE2: testpowd1_u10
// CHECK-AVX2: testpowd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_powd1_u10(b[i], c[i]);
// CHECK-SSE2: _ZGVbN2vv_Sleef_powd1_u10
// CHECK-AVX2: _ZGVdN4vv_Sleef_powd1_u10
}
void testpowf1_u10() {
// CHECK-SSE2: testpowf1_u10
// CHECK-AVX2: testpowf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_powf1_u10(f[i], g[i]);
// CHECK-SSE2: _ZGVbN4vv_Sleef_powf1_u10
// CHECK-AVX2: _ZGVdN8vv_Sleef_powf1_u10
}
void testcbrtd1_u10() {
// CHECK-SSE2: testcbrtd1_u10
// CHECK-AVX2: testcbrtd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_cbrtd1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_cbrtd1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_cbrtd1_u10
}
void testcbrtd1_u35() {
// CHECK-SSE2: testcbrtd1_u35
// CHECK-AVX2: testcbrtd1_u35
for(int i=0;i<N;i++) a[i] = Sleef_cbrtd1_u35(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_cbrtd1_u35
// CHECK-AVX2: _ZGVdN4v_Sleef_cbrtd1_u35
}
void testcbrtf1_u10() {
// CHECK-SSE2: testcbrtf1_u10
// CHECK-AVX2: testcbrtf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_cbrtf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_cbrtf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_cbrtf1_u10
}
void testcbrtf1_u35() {
// CHECK-SSE2: testcbrtf1_u35
// CHECK-AVX2: testcbrtf1_u35
for(int i=0;i<N;i++) e[i] = Sleef_cbrtf1_u35(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_cbrtf1_u35
// CHECK-AVX2: _ZGVdN8v_Sleef_cbrtf1_u35
}
void testhypotd1_u05() {
// CHECK-SSE2: testhypotd1_u05
// CHECK-AVX2: testhypotd1_u05
for(int i=0;i<N;i++) a[i] = Sleef_hypotd1_u05(b[i], c[i]);
// CHECK-SSE2: _ZGVbN2vv_Sleef_hypotd1_u05
// CHECK-AVX2: _ZGVdN4vv_Sleef_hypotd1_u05
}
void testhypotd1_u35() {
// CHECK-SSE2: testhypotd1_u35
// CHECK-AVX2: testhypotd1_u35
for(int i=0;i<N;i++) a[i] = Sleef_hypotd1_u35(b[i], c[i]);
// CHECK-SSE2: _ZGVbN2vv_Sleef_hypotd1_u35
// CHECK-AVX2: _ZGVdN4vv_Sleef_hypotd1_u35
}
void testhypotf1_u05() {
// CHECK-SSE2: testhypotf1_u05
// CHECK-AVX2: testhypotf1_u05
for(int i=0;i<N;i++) e[i] = Sleef_hypotf1_u05(f[i], g[i]);
// CHECK-SSE2: _ZGVbN4vv_Sleef_hypotf1_u05
// CHECK-AVX2: _ZGVdN8vv_Sleef_hypotf1_u05
}
void testhypotf1_u35() {
// CHECK-SSE2: testhypotf1_u35
// CHECK-AVX2: testhypotf1_u35
for(int i=0;i<N;i++) e[i] = Sleef_hypotf1_u35(f[i], g[i]);
// CHECK-SSE2: _ZGVbN4vv_Sleef_hypotf1_u35
// CHECK-AVX2: _ZGVdN8vv_Sleef_hypotf1_u35
}
void testerfd1_u10() {
// CHECK-SSE2: testerfd1_u10
// CHECK-AVX2: testerfd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_erfd1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_erfd1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_erfd1_u10
}
void testerff1_u10() {
// CHECK-SSE2: testerff1_u10
// CHECK-AVX2: testerff1_u10
for(int i=0;i<N;i++) e[i] = Sleef_erff1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_erff1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_erff1_u10
}
void testfmodd1() {
// CHECK-SSE2: testfmodd1
// CHECK-AVX2: testfmodd1
for(int i=0;i<N;i++) a[i] = Sleef_fmodd1(b[i], c[i]);
// CHECK-SSE2: _ZGVbN2vv_Sleef_fmodd1
// CHECK-AVX2: _ZGVdN4vv_Sleef_fmodd1
}
void testfmodf1() {
// CHECK-SSE2: testfmodf1
// CHECK-AVX2: testfmodf1
for(int i=0;i<N;i++) e[i] = Sleef_fmodf1(f[i], g[i]);
// CHECK-SSE2: _ZGVbN4vv_Sleef_fmodf1
// CHECK-AVX2: _ZGVdN8vv_Sleef_fmodf1
}
void testremainderd1() {
// CHECK-SSE2: testremainderd1
// CHECK-AVX2: testremainderd1
for(int i=0;i<N;i++) a[i] = Sleef_remainderd1(b[i], c[i]);
// CHECK-SSE2: _ZGVbN2vv_Sleef_remainderd1
// CHECK-AVX2: _ZGVdN4vv_Sleef_remainderd1
}
void testremainderf1() {
// CHECK-SSE2: testremainderf1
// CHECK-AVX2: testremainderf1
for(int i=0;i<N;i++) e[i] = Sleef_remainderf1(f[i], g[i]);
// CHECK-SSE2: _ZGVbN4vv_Sleef_remainderf1
// CHECK-AVX2: _ZGVdN8vv_Sleef_remainderf1
}

View File

@@ -0,0 +1,714 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
/// This program makes sure that all the symbols that a
/// GNUABI-compatible compiler (clang or gcc) can generate when
/// vectorizing function calls from `#include <math.h>` are present in
/// `libsleefgnuabi.so`.
///
/// The header `math.h` is not the same on all systems, and different
/// macros can activate different sets of functions. The list provided
/// here should cover the union of all possible systems that we want
/// to support. In particular, the test is checking that the "finite"
/// symbols from `#include <bits/math-finite.h>` are present for
/// those systems supporting them.
#include <setjmp.h>
#include <stdio.h>
#include <string.h>
#if defined(ENABLE_SSE4) || defined(ENABLE_SSE2)
#include <x86intrin.h>
#define ISA_TOKEN b
#define VLEN_SP 4
#define VLEN_DP 2
#define VECTOR_CC
typedef __m128i vopmask;
typedef __m128d vdouble;
typedef __m128 vfloat;
typedef __m128i vint;
typedef __m128i vint2;
#endif /* defined(ENABLE_SSE4) || defined(ENABLE_SSE2) */
#ifdef ENABLE_AVX
#include <x86intrin.h>
#define ISA_TOKEN c
#define VLEN_SP 8
#define VLEN_DP 4
#define VECTOR_CC
typedef __m256i vopmask;
typedef __m256d vdouble;
typedef __m256 vfloat;
typedef __m128i vint;
typedef struct { __m128i x, y; } vint2;
#endif /* ENABLE_AVX */
#ifdef ENABLE_AVX2
#include <x86intrin.h>
#define ISA_TOKEN d
#define VLEN_SP 8
#define VLEN_DP 4
#define VECTOR_CC
typedef __m256i vopmask;
typedef __m256d vdouble;
typedef __m256 vfloat;
typedef __m128i vint;
typedef __m256i vint2;
#endif /* ENABLE_AVX2 */
#ifdef ENABLE_AVX512F
#include <x86intrin.h>
#define ISA_TOKEN e
#define VLEN_SP 16
#define VLEN_DP 8
#define VECTOR_CC
typedef __mmask16 vopmask;
typedef __m512d vdouble;
typedef __m512 vfloat;
typedef __m256i vint;
typedef __m512i vint2;
#endif /* ENABLE_AVX512F */
#ifdef ENABLE_ADVSIMD
#include <arm_neon.h>
#define ISA_TOKEN n
#define VLEN_DP 2
#define VLEN_SP 4
#ifdef ENABLE_AAVPCS
#define VECTOR_CC __attribute__((aarch64_vector_pcs))
#else
#define VECTOR_CC
#endif
typedef uint32x4_t vopmask;
typedef float64x2_t vdouble;
typedef float32x4_t vfloat;
typedef int32x2_t vint;
typedef int32x4_t vint2;
#endif /* ENABLE_ADVSIMD */
#ifdef ENABLE_SVE
#include <arm_sve.h>
#define ISA_TOKEN s
#define VLEN_SP (svcntw())
#define VLEN_DP (svcntd())
#define VLA_TOKEN x
#define VECTOR_CC
typedef svbool_t vopmask;
typedef svfloat64_t vdouble;
typedef svfloat32_t vfloat;
typedef svint32_t vint;
typedef svint32_t vint2;
#endif /* ENABLE_SVE */
// GNUABI name mangling macro.
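// For example, __MAKE_FN_NAME(sin, d, 4, v) expands to _ZGVdN4v_sin:
// ISA token 'd' (AVX2), 'N' for the unmasked variant (the masked branch
// below uses 'M'), vector length 4, and one vector parameter 'v'.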
#ifndef MASKED_GNUABI
#define __MAKE_FN_NAME(name, t, vl, p) _ZGV##t##N##vl##p##_##name
#define __DECLARE_vd_vd(name, t, vl, p) \
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble)
#define __CALL_vd_vd(name, t, vl, p) \
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1); } while(0)
#define __DECLARE_vi_vd(name, t, vl, p) \
extern vint VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble)
#define __CALL_vi_vd(name, t, vl, p) \
do { vi0 = __MAKE_FN_NAME(name, t, vl, p)(vd1); } while(0)
#define __DECLARE_vd_vd_vi(name, t, vl, p) \
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vint)
#define __CALL_vd_vd_vi(name, t, vl, p) \
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vi2); } while(0)
#define __DECLARE_vd_vd_vd(name, t, vl, p) \
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble)
#define __CALL_vd_vd_vd(name, t, vl, p) \
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2); } while(0)
#define __DECLARE_vd_vd_vd_vd(name, t, vl, p) \
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble, vdouble)
#define __CALL_vd_vd_vd_vd(name, t, vl, p) \
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2, vd3); } while(0)
#define __DECLARE_vd_vd_pvd(name, t, vl, p) \
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *)
#define __CALL_vd_vd_pvd(name, t, vl, p) \
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, &vd2); } while(0)
#define __DECLARE_v_vd_pvd_pvd(name, t, vl, p) \
extern void VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *, vdouble *)
#define __CALL_v_vd_pvd_pvd(name, t, vl, p) \
do { __MAKE_FN_NAME(name, t, vl, p)(vd0, &vd1, &vd2); } while(0)
#define __DECLARE_vf_vf(name, t, vl, p) \
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat)
#define __CALL_vf_vf(name, t, vl, p) \
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1); } while(0)
#define __DECLARE_vf_vf_vf(name, t, vl, p) \
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat)
#define __CALL_vf_vf_vf(name, t, vl, p) \
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vf2); } while(0)
#define __DECLARE_vf_vf_vf_vf(name, t, vl, p) \
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat, vfloat)
#define __CALL_vf_vf_vf_vf(name, t, vl, p) \
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vf2, vf3); } while(0)
#define __DECLARE_vf_vf_pvf(name, t, vl, p) \
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *)
#define __CALL_vf_vf_pvf(name, t, vl, p) \
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, &vf2); } while(0)
#define __DECLARE_vi_vf(name, t, vl, p) \
extern vint2 VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat)
#define __CALL_vi_vf(name, t, vl, p) \
do { vi20 = __MAKE_FN_NAME(name, t, vl, p)(vf1); } while(0)
#define __DECLARE_vf_vf_vi(name, t, vl, p) \
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vint2)
#define __CALL_vf_vf_vi(name, t, vl, p) \
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vi22); } while(0)
#define __DECLARE_v_vf_pvf_pvf(name, t, vl, p) \
extern void VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *, vfloat *)
#define __CALL_v_vf_pvf_pvf(name, t, vl, p) \
do { __MAKE_FN_NAME(name, t, vl, p)(vf0, &vf1, &vf2); } while(0)
#else /******************** MASKED_GNUABI *****************************/
#define __MAKE_FN_NAME(name, t, vl, p) _ZGV##t##M##vl##p##_##name
#define __DECLARE_vd_vd(name, t, vl, p) \
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vopmask)
#define __CALL_vd_vd(name, t, vl, p) \
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, mask); } while(0)
#define __DECLARE_vi_vd(name, t, vl, p) \
extern vint VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vopmask)
#define __CALL_vi_vd(name, t, vl, p) \
do { vi0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, mask); } while(0)
#define __DECLARE_vd_vd_vi(name, t, vl, p) \
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vint, vopmask)
#define __CALL_vd_vd_vi(name, t, vl, p) \
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vi2, mask); } while(0)
#define __DECLARE_vd_vd_vd(name, t, vl, p) \
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble, vopmask)
#define __CALL_vd_vd_vd(name, t, vl, p) \
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2, mask); } while(0)
#define __DECLARE_vd_vd_vd_vd(name, t, vl, p) \
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble, vdouble, vopmask)
#define __CALL_vd_vd_vd_vd(name, t, vl, p) \
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2, vd3, mask); } while(0)
#define __DECLARE_vd_vd_pvd(name, t, vl, p) \
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *, vopmask)
#define __CALL_vd_vd_pvd(name, t, vl, p) \
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, &vd2, mask); } while(0)
#define __DECLARE_v_vd_pvd_pvd(name, t, vl, p) \
extern void VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *, vdouble *, vopmask)
#define __CALL_v_vd_pvd_pvd(name, t, vl, p) \
do { __MAKE_FN_NAME(name, t, vl, p)(vd0, &vd1, &vd2, mask); } while(0)
#define __DECLARE_vf_vf(name, t, vl, p) \
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vopmask)
#define __CALL_vf_vf(name, t, vl, p) \
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, mask); } while(0)
#define __DECLARE_vf_vf_vf(name, t, vl, p) \
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat, vopmask)
#define __CALL_vf_vf_vf(name, t, vl, p) \
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vf2, mask); } while(0)
#define __DECLARE_vf_vf_vf_vf(name, t, vl, p) \
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat, vfloat, vopmask)
#define __CALL_vf_vf_vf_vf(name, t, vl, p) \
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vf2, vf3, mask); } while(0)
#define __DECLARE_vf_vf_pvf(name, t, vl, p) \
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *, vopmask)
#define __CALL_vf_vf_pvf(name, t, vl, p) \
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, &vf2, mask); } while(0)
#define __DECLARE_vi_vf(name, t, vl, p) \
extern vint2 VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vopmask)
#define __CALL_vi_vf(name, t, vl, p) \
do { vi20 = __MAKE_FN_NAME(name, t, vl, p)(vf1, mask); } while(0)
#define __DECLARE_vf_vf_vi(name, t, vl, p) \
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vint2, vopmask)
#define __CALL_vf_vf_vi(name, t, vl, p) \
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vi22, mask); } while(0)
#define __DECLARE_v_vf_pvf_pvf(name, t, vl, p) \
extern void VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *, vfloat *, vopmask)
#define __CALL_v_vf_pvf_pvf(name, t, vl, p) \
do { __MAKE_FN_NAME(name, t, vl, p)(vf0, &vf1, &vf2, mask); } while(0)
#endif /* MASKED_GNUABI */
// Level-1 expansion macros for declaration and call. The signature of
// each function has three input parameters to avoid segfaults in
// sincos-like functions that are effectively loading data from
// memory.
// Make sure that the architectural macros are defined for each vector
// extension.
#ifndef ISA_TOKEN
#error "Missing ISA token"
#endif
#ifndef VLEN_DP
#error "Missing VLEN_DP"
#endif
#ifndef VLEN_SP
#error "Missing VLEN_SP"
#endif
#if defined(ENABLE_SVE) && !defined(VLA_TOKEN)
#error "Missing VLA_TOKEN"
#endif /* defined(ENABLE_SVE) && !defined(VLA_TOKEN) */
// Declaration and call, first-level expansion to pick up the
// ISA_TOKEN and VLEN_* architectural macros.
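// For instance, with ENABLE_AVX2 defined, DECLARE_DP_vd_vd(sin, v)
// declares extern vdouble _ZGVdN4v_sin(vdouble), and CALL_DP_vd_vd(sin, v)
// invokes it.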
#ifndef ENABLE_SVE
#define DECLARE_DP_vd_vd(name, p) __DECLARE_vd_vd(name, ISA_TOKEN, VLEN_DP, p)
#define CALL_DP_vd_vd(name, p) __CALL_vd_vd(name, ISA_TOKEN, VLEN_DP, p)
#define DECLARE_DP_vd_vd_vd(name, p) __DECLARE_vd_vd_vd(name, ISA_TOKEN, VLEN_DP, p)
#define CALL_DP_vd_vd_vd(name, p) __CALL_vd_vd_vd(name, ISA_TOKEN, VLEN_DP, p)
#define DECLARE_DP_vd_vd_vd_vd(name, p) __DECLARE_vd_vd_vd_vd(name, ISA_TOKEN, VLEN_DP, p)
#define CALL_DP_vd_vd_vd_vd(name, p) __CALL_vd_vd_vd_vd(name, ISA_TOKEN, VLEN_DP, p)
#define DECLARE_DP_vd_vd_pvd(name, p) __DECLARE_vd_vd_pvd(name, ISA_TOKEN, VLEN_DP, p)
#define CALL_DP_vd_vd_pvd(name, p) __CALL_vd_vd_pvd(name, ISA_TOKEN, VLEN_DP, p)
#define DECLARE_DP_vi_vd(name, p) __DECLARE_vi_vd(name, ISA_TOKEN, VLEN_DP, p)
#define CALL_DP_vi_vd(name, p) __CALL_vi_vd(name, ISA_TOKEN, VLEN_DP, p)
#define DECLARE_DP_vd_vd_vi(name, p) __DECLARE_vd_vd_vi(name, ISA_TOKEN, VLEN_DP, p)
#define CALL_DP_vd_vd_vi(name, p) __CALL_vd_vd_vi(name, ISA_TOKEN, VLEN_DP, p)
#define DECLARE_DP_v_vd_pvd_pvd(name, p) __DECLARE_v_vd_pvd_pvd(name, ISA_TOKEN, VLEN_DP, p)
#define CALL_DP_v_vd_pvd_pvd(name, p) __CALL_v_vd_pvd_pvd(name, ISA_TOKEN, VLEN_DP, p)
#define DECLARE_SP_vf_vf(name, p) __DECLARE_vf_vf(name, ISA_TOKEN, VLEN_SP, p)
#define CALL_SP_vf_vf(name, p) __CALL_vf_vf(name, ISA_TOKEN, VLEN_SP, p)
#define DECLARE_SP_vf_vf_vf(name, p) __DECLARE_vf_vf_vf(name, ISA_TOKEN, VLEN_SP, p)
#define CALL_SP_vf_vf_vf(name, p) __CALL_vf_vf_vf(name, ISA_TOKEN, VLEN_SP, p)
#define DECLARE_SP_vf_vf_vf_vf(name, p) __DECLARE_vf_vf_vf_vf(name, ISA_TOKEN, VLEN_SP, p)
#define CALL_SP_vf_vf_vf_vf(name, p) __CALL_vf_vf_vf_vf(name, ISA_TOKEN, VLEN_SP, p)
#define DECLARE_SP_vf_vf_pvf(name, p) __DECLARE_vf_vf_pvf(name, ISA_TOKEN, VLEN_SP, p)
#define CALL_SP_vf_vf_pvf(name, p) __CALL_vf_vf_pvf(name, ISA_TOKEN, VLEN_SP, p)
#define DECLARE_SP_vi_vf(name, p) __DECLARE_vi_vf(name, ISA_TOKEN, VLEN_SP, p)
#define CALL_SP_vi_vf(name, p) __CALL_vi_vf(name, ISA_TOKEN, VLEN_SP, p)
#define DECLARE_SP_vf_vf_vi(name, p) __DECLARE_vf_vf_vi(name, ISA_TOKEN, VLEN_SP, p)
#define CALL_SP_vf_vf_vi(name, p) __CALL_vf_vf_vi(name, ISA_TOKEN, VLEN_SP, p)
#define DECLARE_SP_v_vf_pvf_pvf(name, p) __DECLARE_v_vf_pvf_pvf(name, ISA_TOKEN, VLEN_SP, p)
#define CALL_SP_v_vf_pvf_pvf(name, p) __CALL_v_vf_pvf_pvf(name, ISA_TOKEN, VLEN_SP, p)
#else /* ENABLE_SVE */
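// In the SVE variants, each CALL_* macro is followed by an svst1_* store of
// the result into outbuf. SVE vectors are sizeless types that cannot be kept
// in global variables, so storing through the volatile buffer is
// (presumably) what keeps the compiler from discarding the calls as dead
// code.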
#define DECLARE_DP_vd_vd(name, p) __DECLARE_vd_vd(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_DP_vd_vd(name, p) __CALL_vd_vd(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f64(svptrue_b8(), (double *)outbuf, vd0)
#define DECLARE_DP_vd_vd_vd(name, p) __DECLARE_vd_vd_vd(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_DP_vd_vd_vd(name, p) __CALL_vd_vd_vd(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f64(svptrue_b8(), (double *)outbuf, vd0)
#define DECLARE_DP_vd_vd_vd_vd(name, p) __DECLARE_vd_vd_vd_vd(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_DP_vd_vd_vd_vd(name, p) __CALL_vd_vd_vd_vd(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f64(svptrue_b8(), (double *)outbuf, vd0)
#define DECLARE_DP_vd_vd_pvd(name, p) __DECLARE_vd_vd_pvd(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_DP_vd_vd_pvd(name, p) __CALL_vd_vd_pvd(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f64(svptrue_b8(), (double *)outbuf, vd2)
#define DECLARE_DP_vi_vd(name, p) __DECLARE_vi_vd(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_DP_vi_vd(name, p) __CALL_vi_vd(name, ISA_TOKEN, VLA_TOKEN, p); svst1_s32(svptrue_b8(), (int *)outbuf, vi0)
#define DECLARE_DP_vd_vd_vi(name, p) __DECLARE_vd_vd_vi(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_DP_vd_vd_vi(name, p) __CALL_vd_vd_vi(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f64(svptrue_b8(), (double *)outbuf, vd0)
#define DECLARE_DP_v_vd_pvd_pvd(name, p) __DECLARE_v_vd_pvd_pvd(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_DP_v_vd_pvd_pvd(name, p) __CALL_v_vd_pvd_pvd(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f64(svptrue_b8(), (double *)outbuf, vd2)
#define DECLARE_SP_vf_vf(name, p) __DECLARE_vf_vf(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_SP_vf_vf(name, p) __CALL_vf_vf(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f32(svptrue_b8(), (float *)outbuf, vf0)
#define DECLARE_SP_vf_vf_vf(name, p) __DECLARE_vf_vf_vf(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_SP_vf_vf_vf(name, p) __CALL_vf_vf_vf(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f32(svptrue_b8(), (float *)outbuf, vf0)
#define DECLARE_SP_vf_vf_vf_vf(name, p) __DECLARE_vf_vf_vf_vf(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_SP_vf_vf_vf_vf(name, p) __CALL_vf_vf_vf_vf(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f32(svptrue_b8(), (float *)outbuf, vf0)
#define DECLARE_SP_vf_vf_pvf(name, p) __DECLARE_vf_vf_pvf(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_SP_vf_vf_pvf(name, p) __CALL_vf_vf_pvf(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f32(svptrue_b8(), (float *)outbuf, vf2)
#define DECLARE_SP_vi_vf(name, p) __DECLARE_vi_vf(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_SP_vi_vf(name, p) __CALL_vi_vf(name, ISA_TOKEN, VLA_TOKEN, p); svst1_s32(svptrue_b8(), (int *)outbuf, vi20)
#define DECLARE_SP_vf_vf_vi(name, p) __DECLARE_vf_vf_vi(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_SP_vf_vf_vi(name, p) __CALL_vf_vf_vi(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f32(svptrue_b8(), (float *)outbuf, vf0)
#define DECLARE_SP_v_vf_pvf_pvf(name, p) __DECLARE_v_vf_pvf_pvf(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_SP_v_vf_pvf_pvf(name, p) __CALL_v_vf_pvf_pvf(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f32(svptrue_b8(), (float *)outbuf, vf2)
#endif /* ENABLE_SVE */
//
// Double precision function declarations.
DECLARE_DP_vd_vd(__acos_finite, v);
DECLARE_DP_vd_vd(__acosh_finite, v);
DECLARE_DP_vd_vd(__asin_finite, v);
DECLARE_DP_vd_vd_vd(__atan2_finite, vv);
DECLARE_DP_vd_vd(__atanh_finite, v);
DECLARE_DP_vd_vd(__cosh_finite, v);
DECLARE_DP_vd_vd(__exp10_finite, v);
DECLARE_DP_vd_vd(__exp2_finite, v);
DECLARE_DP_vd_vd(__exp_finite, v);
DECLARE_DP_vd_vd_vd(__fmod_finite, vv);
DECLARE_DP_vd_vd_pvd(__modf_finite, vl8);
DECLARE_DP_vd_vd_vd(__hypot_finite, vv);
DECLARE_DP_vd_vd(__log10_finite, v);
// DECLARE_DP_vd_vd(__log2_finite,v);
DECLARE_DP_vd_vd(__log_finite, v);
DECLARE_DP_vd_vd_vd(__pow_finite, vv);
DECLARE_DP_vd_vd(__sinh_finite, v);
DECLARE_DP_vd_vd(__sqrt_finite, v);
DECLARE_DP_vd_vd(acos, v);
DECLARE_DP_vd_vd(acosh, v);
DECLARE_DP_vd_vd(asin, v);
DECLARE_DP_vd_vd(asinh, v);
DECLARE_DP_vd_vd(atan, v);
DECLARE_DP_vd_vd_vd(atan2, vv);
DECLARE_DP_vd_vd(atanh, v);
DECLARE_DP_vd_vd(cbrt, v);
DECLARE_DP_vd_vd(ceil, v);
DECLARE_DP_vd_vd_vd(copysign, vv);
DECLARE_DP_vd_vd(cos, v);
DECLARE_DP_vd_vd(cosh, v);
DECLARE_DP_vd_vd(cospi, v);
DECLARE_DP_vd_vd(erf, v);
DECLARE_DP_vd_vd(erfc, v);
DECLARE_DP_vd_vd(exp, v);
DECLARE_DP_vd_vd(exp10, v);
DECLARE_DP_vd_vd(exp2, v);
DECLARE_DP_vi_vd(expfrexp, v);
DECLARE_DP_vd_vd(expm1, v);
DECLARE_DP_vd_vd(fabs, v);
DECLARE_DP_vd_vd_vd(fdim, vv);
DECLARE_DP_vd_vd(floor, v);
DECLARE_DP_vd_vd_vd_vd(fma, vvv);
DECLARE_DP_vd_vd_vd(fmax, vv);
DECLARE_DP_vd_vd_vd(fmin, vv);
DECLARE_DP_vd_vd_vd(fmod, vv);
DECLARE_DP_vd_vd(frfrexp, v);
DECLARE_DP_vd_vd_vd(hypot, vv);
DECLARE_DP_vi_vd(ilogb, v);
DECLARE_DP_vd_vd_vi(ldexp, vv);
DECLARE_DP_vd_vd(lgamma, v);
DECLARE_DP_vd_vd(log, v);
DECLARE_DP_vd_vd(log10, v);
DECLARE_DP_vd_vd(log1p, v);
DECLARE_DP_vd_vd(log2, v);
DECLARE_DP_vd_vd_pvd(modf, vl8);
DECLARE_DP_vd_vd_vd(nextafter, vv);
DECLARE_DP_vd_vd_vd(pow, vv);
DECLARE_DP_vd_vd(rint, v);
DECLARE_DP_vd_vd(round, v);
DECLARE_DP_vd_vd(sin, v);
DECLARE_DP_v_vd_pvd_pvd(sincos, vl8l8);
DECLARE_DP_v_vd_pvd_pvd(sincospi, vl8l8);
DECLARE_DP_vd_vd(sinh, v);
DECLARE_DP_vd_vd(sinpi, v);
DECLARE_DP_vd_vd(sqrt, v);
DECLARE_DP_vd_vd(tan, v);
DECLARE_DP_vd_vd(tanh, v);
DECLARE_DP_vd_vd(tgamma, v);
DECLARE_DP_vd_vd(trunc, v);
// Single precision function declarations.
DECLARE_SP_vf_vf(__acosf_finite, v);
DECLARE_SP_vf_vf(__acoshf_finite, v);
DECLARE_SP_vf_vf(__asinf_finite, v);
DECLARE_SP_vf_vf_vf(__atan2f_finite, vv);
DECLARE_SP_vf_vf(__atanhf_finite, v);
DECLARE_SP_vf_vf(__coshf_finite, v);
DECLARE_SP_vf_vf(__exp10f_finite, v);
DECLARE_SP_vf_vf(__exp2f_finite, v);
DECLARE_SP_vf_vf(__expf_finite, v);
DECLARE_SP_vf_vf_vf(__fmodf_finite, vv);
DECLARE_SP_vf_vf_pvf(__modff_finite, vl4);
DECLARE_SP_vf_vf_vf(__hypotf_finite, vv);
DECLARE_SP_vf_vf(__log10f_finite, v);
// DECLARE_SP_vf_vf(__log2f_finite,v);
DECLARE_SP_vf_vf(__logf_finite, v);
DECLARE_SP_vf_vf_vf(__powf_finite, vv);
DECLARE_SP_vf_vf(__sinhf_finite, v);
DECLARE_SP_vf_vf(__sqrtf_finite, v);
DECLARE_SP_vf_vf(acosf, v);
DECLARE_SP_vf_vf(acoshf, v);
DECLARE_SP_vf_vf(asinf, v);
DECLARE_SP_vf_vf(asinhf, v);
DECLARE_SP_vf_vf(atanf, v);
DECLARE_SP_vf_vf_vf(atan2f, vv);
DECLARE_SP_vf_vf(atanhf, v);
DECLARE_SP_vf_vf(cbrtf, v);
DECLARE_SP_vf_vf(ceilf, v);
DECLARE_SP_vf_vf_vf(copysignf, vv);
DECLARE_SP_vf_vf(cosf, v);
DECLARE_SP_vf_vf(coshf, v);
DECLARE_SP_vf_vf(cospif, v);
DECLARE_SP_vf_vf(erff, v);
DECLARE_SP_vf_vf(erfcf, v);
DECLARE_SP_vf_vf(expf, v);
DECLARE_SP_vf_vf(exp10f, v);
DECLARE_SP_vf_vf(exp2f, v);
DECLARE_SP_vf_vf(expm1f, v);
DECLARE_SP_vf_vf(fabsf, v);
DECLARE_SP_vf_vf_vf(fdimf, vv);
DECLARE_SP_vf_vf(floorf, v);
DECLARE_SP_vf_vf_vf_vf(fmaf, vvv);
DECLARE_SP_vf_vf_vf(fmaxf, vv);
DECLARE_SP_vf_vf_vf(fminf, vv);
DECLARE_SP_vf_vf_vf(fmodf, vv);
DECLARE_SP_vf_vf(frfrexpf, v);
DECLARE_SP_vf_vf_vf(hypotf, vv);
#ifndef ENABLE_AVX
// These two functions are not checked in some configurations due to
// the issue in https://github.com/shibatch/sleef/issues/221
DECLARE_SP_vi_vf(expfrexpf, v);
DECLARE_SP_vi_vf(ilogbf, v);
#endif
DECLARE_SP_vf_vf_vi(ldexpf, vv);
DECLARE_SP_vf_vf(lgammaf, v);
DECLARE_SP_vf_vf(logf, v);
DECLARE_SP_vf_vf(log10f, v);
DECLARE_SP_vf_vf(log1pf, v);
DECLARE_SP_vf_vf(log2f, v);
DECLARE_SP_vf_vf_pvf(modff, vl4);
DECLARE_SP_vf_vf_vf(nextafterf, vv);
DECLARE_SP_vf_vf_vf(powf, vv);
DECLARE_SP_vf_vf(rintf, v);
DECLARE_SP_vf_vf(roundf, v);
DECLARE_SP_vf_vf(sinf, v);
DECLARE_SP_v_vf_pvf_pvf(sincosf, vl4l4);
DECLARE_SP_v_vf_pvf_pvf(sincospif, vl4l4);
DECLARE_SP_vf_vf(sinhf, v);
DECLARE_SP_vf_vf(sinpif, v);
DECLARE_SP_vf_vf(sqrtf, v);
DECLARE_SP_vf_vf(tanf, v);
DECLARE_SP_vf_vf(tanhf, v);
DECLARE_SP_vf_vf(tgammaf, v);
DECLARE_SP_vf_vf(truncf, v);
#ifndef ENABLE_SVE
vdouble vd0, vd1, vd2, vd3;
vfloat vf0, vf1, vf2, vf3;
vint vi0, vi1, vi2, vi3;
vint2 vi20, vi21, vi22, vi23;
vopmask mask;
#else
volatile char outbuf[1024];
#endif
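// check_feature() appears to be the configure-time hook: it issues a single
// representative call, so successfully compiling and linking this file
// demonstrates both compiler support for the vector extension and the
// presence of the expected GNUABI symbols.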
int check_feature(double d, float f) {
#ifdef ENABLE_SVE
vdouble vd0 = svdup_n_f64(d), vd1 = svdup_n_f64(d);
#ifdef MASKED_GNUABI
vopmask mask = svcmpne_s32(svptrue_b8(), svdup_n_s32(f), svdup_n_s32(0));
#endif
#endif
CALL_DP_vd_vd(__acos_finite, v);
#ifdef ENABLE_SVE
svst1_f64(svptrue_b8(), (double *)outbuf, vd0);
#endif
return 1;
}
int main2(int argc, char **argv) {
#ifdef ENABLE_SVE
vdouble vd0 = svdup_n_f64(argc), vd1 = svdup_n_f64(argc), vd2 = svdup_n_f64(argc), vd3 = svdup_n_f64(argc);
vfloat vf0 = svdup_n_f32(argc), vf1 = svdup_n_f32(argc), vf2 = svdup_n_f32(argc), vf3 = svdup_n_f32(argc);
vint vi0 = svdup_n_s32(argc), vi2 = svdup_n_s32(argc);
vint2 vi20 = svdup_n_s32(argc), vi22 = svdup_n_s32(argc);
#ifdef MASKED_GNUABI
vopmask mask = svcmpne_s32(svptrue_b8(), svdup_n_s32(argc), svdup_n_s32(0));
#endif
#endif
// Double precision function call.
CALL_DP_vd_vd(__acos_finite, v);
CALL_DP_vd_vd(__acosh_finite, v);
CALL_DP_vd_vd(__asin_finite, v);
CALL_DP_vd_vd_vd(__atan2_finite, vv);
CALL_DP_vd_vd(__atanh_finite, v);
CALL_DP_vd_vd(__cosh_finite, v);
CALL_DP_vd_vd(__exp10_finite, v);
CALL_DP_vd_vd(__exp2_finite, v);
CALL_DP_vd_vd(__exp_finite, v);
CALL_DP_vd_vd_vd(__fmod_finite, vv);
CALL_DP_vd_vd_pvd(__modf_finite, vl8);
CALL_DP_vd_vd_vd(__hypot_finite, vv);
CALL_DP_vd_vd(__log10_finite, v);
// CALL_DP_vd_vd(__log2_finite,v);
CALL_DP_vd_vd(__log_finite, v);
CALL_DP_vd_vd_vd(__pow_finite, vv);
CALL_DP_vd_vd(__sinh_finite, v);
CALL_DP_vd_vd(__sqrt_finite, v);
CALL_DP_vd_vd(acos, v);
CALL_DP_vd_vd(acosh, v);
CALL_DP_vd_vd(asin, v);
CALL_DP_vd_vd(asinh, v);
CALL_DP_vd_vd(atan, v);
CALL_DP_vd_vd_vd(atan2, vv);
CALL_DP_vd_vd(atanh, v);
CALL_DP_vd_vd(cbrt, v);
CALL_DP_vd_vd(ceil, v);
CALL_DP_vd_vd_vd(copysign, vv);
CALL_DP_vd_vd(cos, v);
CALL_DP_vd_vd(cosh, v);
CALL_DP_vd_vd(cospi, v);
CALL_DP_vd_vd(erf, v);
CALL_DP_vd_vd(erfc, v);
CALL_DP_vd_vd(exp, v);
CALL_DP_vd_vd(exp10, v);
CALL_DP_vd_vd(exp2, v);
CALL_DP_vi_vd(expfrexp, v);
CALL_DP_vd_vd(expm1, v);
CALL_DP_vd_vd(fabs, v);
CALL_DP_vd_vd_vd(fdim, vv);
CALL_DP_vd_vd(floor, v);
CALL_DP_vd_vd_vd_vd(fma, vvv);
CALL_DP_vd_vd_vd(fmax, vv);
CALL_DP_vd_vd_vd(fmin, vv);
CALL_DP_vd_vd_vd(fmod, vv);
CALL_DP_vd_vd(frfrexp, v);
CALL_DP_vd_vd_vd(hypot, vv);
CALL_DP_vi_vd(ilogb, v);
CALL_DP_vd_vd_vi(ldexp, vv);
CALL_DP_vd_vd(lgamma, v);
CALL_DP_vd_vd(log, v);
CALL_DP_vd_vd(log10, v);
CALL_DP_vd_vd(log1p, v);
CALL_DP_vd_vd(log2, v);
CALL_DP_vd_vd_pvd(modf, vl8);
CALL_DP_vd_vd_vd(nextafter, vv);
CALL_DP_vd_vd_vd(pow, vv);
CALL_DP_vd_vd(rint, v);
CALL_DP_vd_vd(round, v);
CALL_DP_vd_vd(sin, v);
CALL_DP_v_vd_pvd_pvd(sincos, vl8l8);
CALL_DP_v_vd_pvd_pvd(sincospi, vl8l8);
CALL_DP_vd_vd(sinh, v);
CALL_DP_vd_vd(sinpi, v);
CALL_DP_vd_vd(sqrt, v);
CALL_DP_vd_vd(tan, v);
CALL_DP_vd_vd(tanh, v);
CALL_DP_vd_vd(tgamma, v);
CALL_DP_vd_vd(trunc, v);
// Single precision function call.
CALL_SP_vf_vf(__acosf_finite, v);
CALL_SP_vf_vf(__acoshf_finite, v);
CALL_SP_vf_vf(__asinf_finite, v);
CALL_SP_vf_vf_vf(__atan2f_finite, vv);
CALL_SP_vf_vf(__atanhf_finite, v);
CALL_SP_vf_vf(__coshf_finite, v);
CALL_SP_vf_vf(__exp10f_finite, v);
CALL_SP_vf_vf(__exp2f_finite, v);
CALL_SP_vf_vf(__expf_finite, v);
CALL_SP_vf_vf_vf(__fmodf_finite, vv);
CALL_SP_vf_vf_pvf(__modff_finite, vl4);
CALL_SP_vf_vf_vf(__hypotf_finite, vv);
CALL_SP_vf_vf(__log10f_finite, v);
// CALL_SP_vf_vf(__log2f_finite,v);
CALL_SP_vf_vf(__logf_finite, v);
CALL_SP_vf_vf_vf(__powf_finite, vv);
CALL_SP_vf_vf(__sinhf_finite, v);
CALL_SP_vf_vf(__sqrtf_finite, v);
CALL_SP_vf_vf(acosf, v);
CALL_SP_vf_vf(acoshf, v);
CALL_SP_vf_vf(asinf, v);
CALL_SP_vf_vf(asinhf, v);
CALL_SP_vf_vf(atanf, v);
CALL_SP_vf_vf_vf(atan2f, vv);
CALL_SP_vf_vf(atanhf, v);
CALL_SP_vf_vf(cbrtf, v);
CALL_SP_vf_vf(ceilf, v);
CALL_SP_vf_vf_vf(copysignf, vv);
CALL_SP_vf_vf(cosf, v);
CALL_SP_vf_vf(coshf, v);
CALL_SP_vf_vf(cospif, v);
CALL_SP_vf_vf(erff, v);
CALL_SP_vf_vf(erfcf, v);
CALL_SP_vf_vf(expf, v);
CALL_SP_vf_vf(exp10f, v);
CALL_SP_vf_vf(exp2f, v);
CALL_SP_vf_vf(expm1f, v);
CALL_SP_vf_vf(fabsf, v);
CALL_SP_vf_vf_vf(fdimf, vv);
CALL_SP_vf_vf(floorf, v);
CALL_SP_vf_vf_vf_vf(fmaf, vvv);
CALL_SP_vf_vf_vf(fmaxf, vv);
CALL_SP_vf_vf_vf(fminf, vv);
CALL_SP_vf_vf_vf(fmodf, vv);
CALL_SP_vf_vf(frfrexpf, v);
CALL_SP_vf_vf_vf(hypotf, vv);
#ifndef ENABLE_AVX
// These two functions are not checked in some configurations due to
// the issue in https://github.com/shibatch/sleef/issues/221
CALL_SP_vi_vf(expfrexpf, v);
CALL_SP_vi_vf(ilogbf, v);
#endif
CALL_SP_vf_vf_vi(ldexpf, vv);
CALL_SP_vf_vf(lgammaf, v);
CALL_SP_vf_vf(logf, v);
CALL_SP_vf_vf(log10f, v);
CALL_SP_vf_vf(log1pf, v);
CALL_SP_vf_vf(log2f, v);
CALL_SP_vf_vf_pvf(modff, vl4);
CALL_SP_vf_vf_vf(nextafterf, vv);
CALL_SP_vf_vf_vf(powf, vv);
CALL_SP_vf_vf(rintf, v);
CALL_SP_vf_vf(roundf, v);
CALL_SP_vf_vf(sinf, v);
CALL_SP_v_vf_pvf_pvf(sincosf, vl4l4);
CALL_SP_v_vf_pvf_pvf(sincospif, vl4l4);
CALL_SP_vf_vf(sinhf, v);
CALL_SP_vf_vf(sinpif, v);
CALL_SP_vf_vf(sqrtf, v);
CALL_SP_vf_vf(tanf, v);
CALL_SP_vf_vf(tanhf, v);
CALL_SP_vf_vf(tgammaf, v);
CALL_SP_vf_vf(truncf, v);
return 0;
}

@@ -0,0 +1,129 @@
sin u35 bc50dfbcbd8ef534541d1babe90860c7
sin u10 dbc2cf81f292ef50fa0119e222c6c9f9
cos u35 506e34a809b80ad3603ed46ba2a574b0
cos u10 a0f69df5937152b8f8f0e671f3676289
tan u35 970b5cd7f0e05defa22ebb155ab61a40
tan u10 5fd08e0552e3ab853439bf5fd2bd344d
sincos u10 7c164edcaa45988f6165b653fc76c495
sincos u35 38fe7e261e184ed8dbf432ce6bedc5c4
sincospi u05 0c6fc00c7aaf0b6e67d66542d1be833d
sincospi u35 c428b0fc3e6c5be4d2c03dcd8bb27a7c
log u10 4855b27222d900bea47a27cadba71727
log u35 c95484de57c167da3d8d6d1baadf9ffa
log2 u10 2662df9af919680ca62e1752fb1b7539
log2 u35 1cd6d7f194a5e8364191497adc5c5cec
log10 u10 36645e8031d873d66fd0ec2c5959f273
log1p u10 1383924fb56cf2e7eda27de21320c591
exp u10 13692a48edf2cf7a3e047b16ddfb7b81
exp2 u10 436146f8d6dcaa4a754837108a9aa3e1
exp2 u35 8881d075d9101a1dfa3f6a10b9ee8373
exp10 u10 9d704b310f683872a6446cfc97726a4d
exp10 u35 bc07745ebc22a7ee97679154c24b23cc
expm1 u10 cd3f0b8e86943d52c278394b60e2d22e
pow u10 a0ea63b27d33262346a35c9439741075
cbrt u10 5d8bf28ac74624594fd1be9217817690
cbrt u10 3c896e03746bcf1b3f70182dfec3d93b
cbrt u35 73daa306764e208aab1627ac110b10d7
cbrt u35 c29b7bf200215425b4ba948c8cc94c42
hypot u05 cc2f18e409e19a02cadf7b91fd869120
hypot u35 5194e0a554174a6145511ce3df9c1f46
asin u10 86c061caec3fa2e1bc71bda4dad29f4c
asin u35 31303b88bdc00206265002d6cc5e89e4
acos u10 0a1a403590f2ac8364f132b334920945
acos u35 493f960c1cce57931d95a5a22a0587a3
atan u10 c97624a24ec034cc0c8985acb61d13cd
atan u10 0be0f550406923016cfeb5ef62c25b15
atan u35 9d6d83e066b5a4851d44771418c9948c
atan u35 f32c1aa4caa08c6945afd1125ba8b113
atan2 u10 6b1d9d25fcd96053acc19d1633fab36a
atan2 u35 afb07894347062a96dab705b34eb1763
sinh u10 61d459b1f368087f6f23ebf8e9f0ea01
cosh u10 f77eb95f79e274c12b4e92dc0389259b
tanh u10 2bb9dd54ed0fa22bb5f3b6d557eb58a3
asinh u10 01136e54e2a434839530dda54f33cfdb
acosh u10 2f3c28c9ee2eb2b3d5659c6cb2a58e3e
atanh u10 601a77ba8c1d5175f2808b48a41260c1
lgamma u10 90cdc41063f4198c6ad592c0cdd0f5da
tgamma u10 6f864c3a1f17fbdf914cac7ffcd82cb7
erf u10 f4ae148b59bb7501d8f5746300850376
erfc u15 5e116a4316dafa742769f71e18f6f9fe
fabs bef2f2ac8a4789357e580b4da4f9b9fe
copysign 3219022f267464e3704f90558e8df3bc
fmax 4e4f5220ccfef191864c316df0d18fc0
fmin c0f8effb6c611e2b3b91b820ad943f62
fdim e876d103931f18ceede5bfd7e3df7ab0
fmod 618aa751e13012afdb41ec80dd35e6ba
remainder 8d692dbb44bbc9be5af0c0657d3008b8
modf f03ce73cd4f9ea7f69c017f6e53355d5
nextafter 9eba4e30d12d74dc4e8003fcff0f1582
trunc 1bc7e909eba121dcef7f0e4046937ae5
floor 2cff66b499dc8a30cec9467de659b774
ceil b080e632dcb8f8134d8715752be12917
round 8907e21687ca9c2a539297536e754950
rint e49f837096bc661fe1c742801dd99a30
sinf u35 833d845950b9cbb025629fe4c040f8f6
sinf u10 9c21afa4d7d6af3fc666309c3cd647fe
cosf u35 74d7f871a6553cd0019087895e2052ad
cosf u10 35349e94c323c1614f22093959288010
tanf u35 bbb7c092d017e96d2454a38a20687735
tanf u10 227423bc04f42d76a8f68082ba696126
sincosf u10 83ecc4e3d5295056e9d8c52bc196b666
sincosf u35 533319caa49a961e4909bd6dcab40721
sincospif u05 8b3762b67a661957c1414c351ec49034
sincospif u35 cec15ed76a358091632634166fa77b66
logf u10 c5a90119943acc4199e1cc7030b5def8
logf u35 af2fbe4bfa2caaf59c734e3749dd15be
log2f u10 ba8acae369bbb7b6404cccbc633fe25b
log2f u35 ba32ebaa8c470899ebd433d190c00f03
log10f u10 7e235a82d960e4434575dd39648d8bb7
log1pf u10 350fc4f13502b36bb1107e1b1122acb1
expf u10 ee4adaabefa3fac6c0f1925b2a948eea
exp2f u10 b0d283dbae0f36f1b3c7eed9871f0d0d
exp2f u35 522cc30f722f77fceb07015830b351a3
exp10f u10 b0564be151965600f5744ff2e4992bc9
exp10f u35 d142f1fb40e44f0c9e042718f27ee3e0
expm1f u10 ebfd6498cb40f61b609882de8a7f3c74
powf u10 a7cba3239c87969662e8b41a4dd8b4ab
cbrtf u10 01c5cac23fe21638be1c3eab6e368fd6
cbrtf u10 2a245b03f83e9114644d03b40dac707b
cbrtf u35 3ce62350fd585f0524a12c974fbe6cf5
cbrtf u35 2aca0404626a28f7af7f60105ad6e217
hypotf u05 bc5971cbeebee27b4c0d91fbe3f6bf30
hypotf u35 a6f0f774b346a6bba08889ff9ba3f193
asinf u10 7f77f7453b961512c89e87e49c549cfe
asinf u35 22ed8760aa328e1f714031eec592a4d8
acosf u10 15617dd0429b90e59d2923415934c2a6
acosf u35 af0b132d9e263721f9296187dbf9b9bf
atanf u10 26b77fb423104b45633cf24500237d6e
atanf u10 4313d0bc2708de53f74d804aac6564d4
atanf u35 97a1797897955643c722c7d291987331
atanf u35 7d3f47169415058e8578f11d899bfd10
atan2f u10 098a33f730fe95ce4774a991db4cee14
atan2f u35 56fc6bd8349979f0d0b1dcdb57f68363
sinhf u10 0780a2f57df3a831718195d1ee5c19ef
coshf u10 cfbb6aed408e43a7b7f053474100ff2d
tanhf u10 d19f254d41e8726c748df87b95bc9acd
asinhf u10 260d129221468a86bbfd609c27bfea6a
acoshf u10 24ced7e5631c78b20a5716faeedbaa92
atanhf u10 164fd77b8372b8c131baaacab1c9e650
lgammaf u10 3bf6d824175c4f4d86f3073064e41e84
tgammaf u10 f3a8d25c852068622bdfcae4cb813583
erff u10 f34af3814153de040b93e573ca7d21d8
erfcf u15 915ab9830de89a5a504b3ce7cd2fecda
fabsf a3c72220bc0ade68fe22e0a15eb730d4
copysignf 6b35517b8e1da78d9c9b52915d9a9b19
fmaxf 9833a60a2080e8fd9ae8de32c758966f
fminf 2dcfa19e1f1ab4973a7dec9f2cc09fa0
fdimf c5c0fe7b095eb8ccbb19fbf934a36b24
fmodf 77aa84a9703e202a56e5f4609bd2482b
remainderf 5a453b1217c173e4dc0b0211066750be
modff 5fa4f044f20478216aa085a01b189697
nextafterf 517c1c8f072e9024518d3d9ead98b85b
truncf 6937050850be63c44d4b7dbd666febe6
floorf 9341be69ee345c8554bf3ab4e9316133
ceilf c70874771cbe9741f1f05fedd4b629e9
roundf 0cf52f6b8015099771e9a7dfa6b090bc
rintf bed68e788e2b11543c09c9d52198abf8
fastsinf u3500 8eb51f86fb40414dd21284f020f24b6c
fastcosf u3500 69cbc3703f1d2c68695b00b1b09287b2
fastpowf u3500 e02e6a692cfa22a6b7149168c67ea1d2

@@ -0,0 +1,129 @@
sin u35 c163e4a7e9ccebb2181dcc8653367d8c
sin u10 0d6bf6f2c935db82588222da95659019
cos u35 52f902bd939d751b5b544ac70181fcff
cos u10 afcdba92a75a76d56b8cf2f22d4bec9e
tan u35 906cc42b6755fe514c5e185fcb4d2f55
tan u10 c98f29a62067fa63646d9bcc29a310c6
sincos u10 3fe37f4eb805505152f2b14a22a9f94e
sincos u35 95a7b7f48c71febf10ec6eff796dd391
sincospi u05 0c6fc00c7aaf0b6e67d66542d1be833d
sincospi u35 c428b0fc3e6c5be4d2c03dcd8bb27a7c
log u10 4855b27222d900bea47a27cadba71727
log u35 015f8ae899c9b921d48919dd12ef19a9
log2 u10 2662df9af919680ca62e1752fb1b7539
log2 u35 908b1949db34ea855944f00089b21e23
log10 u10 36645e8031d873d66fd0ec2c5959f273
log1p u10 1383924fb56cf2e7eda27de21320c591
exp u10 084e5be89c2ad03e356078ea4f287bab
exp2 u10 6e36db9ae2cf9eca82e3d9157c622351
exp2 u35 6e36db9ae2cf9eca82e3d9157c622351
exp10 u10 0cc08bc6a3d08d6e61450b5370c6161e
exp10 u35 6904d5509ca794747aa249c13886f90f
expm1 u10 cd3f0b8e86943d52c278394b60e2d22e
pow u10 7e19796027d7c1d1999be948f90e6181
cbrt u10 5d8bf28ac74624594fd1be9217817690
cbrt u10 3c896e03746bcf1b3f70182dfec3d93b
cbrt u35 fc7ee3e3e6c54365d708b752c242a947
cbrt u35 2408714a56d74f8c82389ca6772cdbc1
hypot u05 cc2f18e409e19a02cadf7b91fd869120
hypot u35 be7bbd41dffd746b70261ee773cbd4b2
asin u10 8a21b7c28cdaffc9d3e53f415367932e
asin u35 9c9e8107782898e9faed6924ad1b3cb1
acos u10 28261e4eb8331865660c814676d5c6bc
acos u35 310911130bfc45b10dabe3a072939331
atan u10 f931de72f2f6a7928f307a8a382ae255
atan u10 453f9ef62f58f9829320baf482a1d457
atan u35 6161b6189609f105b017d8768d0a41f1
atan u35 6face71d8d93c69448d49ed6140e361d
atan2 u10 469babaeee9bd30e17af2f473b3ea500
atan2 u35 6a3e764125aab2a0a13e7a0d9ec02f7f
sinh u10 61d459b1f368087f6f23ebf8e9f0ea01
cosh u10 f77eb95f79e274c12b4e92dc0389259b
tanh u10 2bb9dd54ed0fa22bb5f3b6d557eb58a3
asinh u10 01136e54e2a434839530dda54f33cfdb
acosh u10 2f3c28c9ee2eb2b3d5659c6cb2a58e3e
atanh u10 601a77ba8c1d5175f2808b48a41260c1
lgamma u10 90cdc41063f4198c6ad592c0cdd0f5da
tgamma u10 cb9a93844ad1713d2ab92ff5b6398150
erf u10 8a0bc2146a5c67b6bebc58f4b0076568
erfc u15 3e247a54183eeddedc33e99c50118995
fabs bef2f2ac8a4789357e580b4da4f9b9fe
copysign 3219022f267464e3704f90558e8df3bc
fmax 4e4f5220ccfef191864c316df0d18fc0
fmin c0f8effb6c611e2b3b91b820ad943f62
fdim e876d103931f18ceede5bfd7e3df7ab0
fmod 618aa751e13012afdb41ec80dd35e6ba
remainder 8d692dbb44bbc9be5af0c0657d3008b8
modf f03ce73cd4f9ea7f69c017f6e53355d5
nextafter 9eba4e30d12d74dc4e8003fcff0f1582
trunc 1bc7e909eba121dcef7f0e4046937ae5
floor 2cff66b499dc8a30cec9467de659b774
ceil b080e632dcb8f8134d8715752be12917
round 8907e21687ca9c2a539297536e754950
rint e49f837096bc661fe1c742801dd99a30
sinf u35 f8f804eae1d9443103e81fec96293477
sinf u10 3f12a7381f1cbb1830d92b4ec72d21fe
cosf u35 f2f3d1c9f090cde9c02439608dc7066e
cosf u10 dc35f27fae65f63f0aa6ad241f8b387b
tanf u35 68d42ad1fb412e6b8be3853461e61213
tanf u10 97df301d4f59e67d5318b5356b703f06
sincosf u10 a97124d810ec461c135dc4fb0c059b6f
sincosf u35 0cc521e52ae1227d311012c2919c1ff2
sincospif u05 8b3762b67a661957c1414c351ec49034
sincospif u35 8720757f221c00cc8de24b7dc4949144
logf u10 c5a90119943acc4199e1cc7030b5def8
logf u35 b6234302d534d6ccd48155dd6b9a4293
log2f u10 ba8acae369bbb7b6404cccbc633fe25b
log2f u35 74174c90717c86642b71284452a8aef6
log10f u10 7e235a82d960e4434575dd39648d8bb7
log1pf u10 e53dbfa80bcc1a7bcfd21000e6950475
expf u10 9597388315e4b3e89c4c97ce46374dcf
exp2f u10 42d66e5e4cb88feb29c5b36c632159a5
exp2f u35 42d66e5e4cb88feb29c5b36c632159a5
exp10f u10 954f0824b6d949d0da03b49950dc6642
exp10f u35 6fb0e9a829e12a06679d379d05b53ede
expm1f u10 ebfd6498cb40f61b609882de8a7f3c74
powf u10 2ed84af40d03e307a620365f172d010d
cbrtf u10 01c5cac23fe21638be1c3eab6e368fd6
cbrtf u10 2a245b03f83e9114644d03b40dac707b
cbrtf u35 6c22a6dc132c5212250970f22f42256d
cbrtf u35 5ab696ae11f9637413d30e6496d5324b
hypotf u05 bc5971cbeebee27b4c0d91fbe3f6bf30
hypotf u35 2a7cd97768287084b7fffc7e9fb39072
asinf u10 e2e571a01984c4ffb3f6e38e0328d90e
asinf u35 70df2dfc3a3569868cce60c38e7b1962
acosf u10 5180fde4b02a0ca4cd75f0a786a1bfeb
acosf u35 72b0e2f9791f90f1c43570b9e9ba893f
atanf u10 fa672e387a204055f735b7af98dd8a35
atanf u10 d017670c13bc221b68bc9ee5f41c4b5e
atanf u35 f592e46eaa5d29583f86d3e336f20b6b
atanf u35 e7087fe40de46921826b373d10c40954
atan2f u10 275b2fa8ee554c45551bb142db9f8197
atan2f u35 44b187851195d24bab2561eb8f4ff5d0
sinhf u10 45bc228a14c3e39eeb35e9764394a23e
coshf u10 838d441e85d415ef4fb1e5c5ea966a71
tanhf u10 d19f254d41e8726c748df87b95bc9acd
asinhf u10 927eeb621a3e2d5039f1a07fcf150901
acoshf u10 932520013273174fcabe2be4a55f919f
atanhf u10 164fd77b8372b8c131baaacab1c9e650
lgammaf u10 3bf6d824175c4f4d86f3073064e41e84
tgammaf u10 c3059747811d98846f74a63d3747ac3d
erff u10 f34af3814153de040b93e573ca7d21d8
erfcf u15 687a9c577512d349ddbc0643013d2c56
fabsf a3c72220bc0ade68fe22e0a15eb730d4
copysignf 6b35517b8e1da78d9c9b52915d9a9b19
fmaxf 9833a60a2080e8fd9ae8de32c758966f
fminf 2dcfa19e1f1ab4973a7dec9f2cc09fa0
fdimf c5c0fe7b095eb8ccbb19fbf934a36b24
fmodf 77aa84a9703e202a56e5f4609bd2482b
remainderf 5a453b1217c173e4dc0b0211066750be
modff 5fa4f044f20478216aa085a01b189697
nextafterf 517c1c8f072e9024518d3d9ead98b85b
truncf 6937050850be63c44d4b7dbd666febe6
floorf 9341be69ee345c8554bf3ab4e9316133
ceilf c70874771cbe9741f1f05fedd4b629e9
roundf 0cf52f6b8015099771e9a7dfa6b090bc
rintf bed68e788e2b11543c09c9d52198abf8
fastsinf u3500 5c48081c74cd0316379b580b047dbfc2
fastcosf u3500 6f73d116f109283e5632c31f5988f55b
fastpowf u3500 6dbb3110412df4fed5a71f50d40def89

@@ -0,0 +1,777 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <inttypes.h>
#include <assert.h>
#include <math.h>
#if defined(__MINGW32__) || defined(__MINGW64__) || defined(_MSC_VER)
#define STDIN_FILENO 0
#else
#include <unistd.h>
#include <sys/types.h>
#endif
#include "sleef.h"
#include "testerutil.h"
#define DORENAME
#include "rename.h"
#define BUFSIZE 1024
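// This is the scalar "implementation under test" driver: the tester feeds it
// one command per line on stdin ("<function> <hex bit pattern(s)>") and reads
// back the bit pattern of the result, so values round-trip exactly. The "3"
// printed below presumably announces the protocol version to the tester.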
int main(int argc, char **argv) {
char buf[BUFSIZE];
printf("3\n");
fflush(stdout);
for(;;) {
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;
if (startsWith(buf, "sin ")) {
uint64_t u;
sscanf(buf, "sin %" PRIx64, &u);
u = d2u(xsin(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "sin_u1 ")) {
uint64_t u;
sscanf(buf, "sin_u1 %" PRIx64, &u);
u = d2u(xsin_u1(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "cos ")) {
uint64_t u;
sscanf(buf, "cos %" PRIx64, &u);
u = d2u(xcos(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "cos_u1 ")) {
uint64_t u;
sscanf(buf, "cos_u1 %" PRIx64, &u);
u = d2u(xcos_u1(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "sincos ")) {
uint64_t u;
sscanf(buf, "sincos %" PRIx64, &u);
Sleef_double2 x = xsincos(u2d(u));
printf("%" PRIx64 " %" PRIx64 "\n", d2u(x.x), d2u(x.y));
} else if (startsWith(buf, "sincos_u1 ")) {
uint64_t u;
sscanf(buf, "sincos_u1 %" PRIx64, &u);
Sleef_double2 x = xsincos_u1(u2d(u));
printf("%" PRIx64 " %" PRIx64 "\n", d2u(x.x), d2u(x.y));
} else if (startsWith(buf, "sincospi_u05 ")) {
uint64_t u;
sscanf(buf, "sincospi_u05 %" PRIx64, &u);
Sleef_double2 x = xsincospi_u05(u2d(u));
printf("%" PRIx64 " %" PRIx64 "\n", d2u(x.x), d2u(x.y));
} else if (startsWith(buf, "sincospi_u35 ")) {
uint64_t u;
sscanf(buf, "sincospi_u35 %" PRIx64, &u);
Sleef_double2 x = xsincospi_u35(u2d(u));
printf("%" PRIx64 " %" PRIx64 "\n", d2u(x.x), d2u(x.y));
} else if (startsWith(buf, "sinpi_u05 ")) {
uint64_t u;
sscanf(buf, "sinpi_u05 %" PRIx64, &u);
u = d2u(xsinpi_u05(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "cospi_u05 ")) {
uint64_t u;
sscanf(buf, "cospi_u05 %" PRIx64, &u);
u = d2u(xcospi_u05(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "tan ")) {
uint64_t u;
sscanf(buf, "tan %" PRIx64, &u);
u = d2u(xtan(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "tan_u1 ")) {
uint64_t u;
sscanf(buf, "tan_u1 %" PRIx64, &u);
u = d2u(xtan_u1(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "asin ")) {
uint64_t u;
sscanf(buf, "asin %" PRIx64, &u);
u = d2u(xasin(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "acos ")) {
uint64_t u;
sscanf(buf, "acos %" PRIx64, &u);
u = d2u(xacos(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "atan ")) {
uint64_t u;
sscanf(buf, "atan %" PRIx64, &u);
u = d2u(xatan(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "log ")) {
uint64_t u;
sscanf(buf, "log %" PRIx64, &u);
u = d2u(xlog(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "exp ")) {
uint64_t u;
sscanf(buf, "exp %" PRIx64, &u);
u = d2u(xexp(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "atan2 ")) {
uint64_t u, v;
sscanf(buf, "atan2 %" PRIx64 " %" PRIx64, &u, &v);
u = d2u(xatan2(u2d(u), u2d(v)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "asin_u1 ")) {
uint64_t u;
sscanf(buf, "asin_u1 %" PRIx64, &u);
u = d2u(xasin_u1(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "acos_u1 ")) {
uint64_t u;
sscanf(buf, "acos_u1 %" PRIx64, &u);
u = d2u(xacos_u1(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "atan_u1 ")) {
uint64_t u;
sscanf(buf, "atan_u1 %" PRIx64, &u);
u = d2u(xatan_u1(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "atan2_u1 ")) {
uint64_t u, v;
sscanf(buf, "atan2_u1 %" PRIx64 " %" PRIx64, &u, &v);
u = d2u(xatan2_u1(u2d(u), u2d(v)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "log_u1 ")) {
uint64_t u;
sscanf(buf, "log_u1 %" PRIx64, &u);
u = d2u(xlog_u1(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "pow ")) {
uint64_t u, v;
sscanf(buf, "pow %" PRIx64 " %" PRIx64, &u, &v);
u = d2u(xpow(u2d(u), u2d(v)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "sinh ")) {
uint64_t u;
sscanf(buf, "sinh %" PRIx64, &u);
u = d2u(xsinh(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "cosh ")) {
uint64_t u;
sscanf(buf, "cosh %" PRIx64, &u);
u = d2u(xcosh(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "tanh ")) {
uint64_t u;
sscanf(buf, "tanh %" PRIx64, &u);
u = d2u(xtanh(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "sinh_u35 ")) {
uint64_t u;
sscanf(buf, "sinh_u35 %" PRIx64, &u);
u = d2u(xsinh_u35(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "cosh_u35 ")) {
uint64_t u;
sscanf(buf, "cosh_u35 %" PRIx64, &u);
u = d2u(xcosh_u35(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "tanh_u35 ")) {
uint64_t u;
sscanf(buf, "tanh_u35 %" PRIx64, &u);
u = d2u(xtanh_u35(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "asinh ")) {
uint64_t u;
sscanf(buf, "asinh %" PRIx64, &u);
u = d2u(xasinh(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "acosh ")) {
uint64_t u;
sscanf(buf, "acosh %" PRIx64, &u);
u = d2u(xacosh(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "atanh ")) {
uint64_t u;
sscanf(buf, "atanh %" PRIx64, &u);
u = d2u(xatanh(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "fma ")) {
uint64_t u, v, w;
sscanf(buf, "fma %" PRIx64 " %" PRIx64 " %" PRIx64, &u, &v, &w);
u = d2u(xfma(u2d(u), u2d(v), u2d(w)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "sqrt ")) {
uint64_t u;
sscanf(buf, "sqrt %" PRIx64, &u);
u = d2u(xsqrt(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "sqrt_u05 ")) {
uint64_t u;
sscanf(buf, "sqrt_u05 %" PRIx64, &u);
u = d2u(xsqrt_u05(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "sqrt_u35 ")) {
uint64_t u;
sscanf(buf, "sqrt_u35 %" PRIx64, &u);
u = d2u(xsqrt_u35(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "cbrt ")) {
uint64_t u;
sscanf(buf, "cbrt %" PRIx64, &u);
u = d2u(xcbrt(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "cbrt_u1 ")) {
uint64_t u;
sscanf(buf, "cbrt_u1 %" PRIx64, &u);
u = d2u(xcbrt_u1(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "exp2 ")) {
uint64_t u;
sscanf(buf, "exp2 %" PRIx64, &u);
u = d2u(xexp2(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "exp2_u35 ")) {
uint64_t u;
sscanf(buf, "exp2_u35 %" PRIx64, &u);
u = d2u(xexp2_u35(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "exp10 ")) {
uint64_t u;
sscanf(buf, "exp10 %" PRIx64, &u);
u = d2u(xexp10(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "exp10_u35 ")) {
uint64_t u;
sscanf(buf, "exp10_u35 %" PRIx64, &u);
u = d2u(xexp10_u35(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "expm1 ")) {
uint64_t u;
sscanf(buf, "expm1 %" PRIx64, &u);
u = d2u(xexpm1(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "log10 ")) {
uint64_t u;
sscanf(buf, "log10 %" PRIx64, &u);
u = d2u(xlog10(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "log2 ")) {
uint64_t u;
sscanf(buf, "log2 %" PRIx64, &u);
u = d2u(xlog2(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "log2_u35 ")) {
uint64_t u;
sscanf(buf, "log2_u35 %" PRIx64, &u);
u = d2u(xlog2_u35(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "log1p ")) {
uint64_t u;
sscanf(buf, "log1p %" PRIx64, &u);
u = d2u(xlog1p(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "ldexp ")) {
uint64_t u, v;
sscanf(buf, "ldexp %" PRIx64 " %" PRIx64, &u, &v);
u = d2u(xldexp(u2d(u), (int)u2d(v)));
printf("%" PRIx64 "\n", u);
}
else if (startsWith(buf, "hypot_u05 ")) {
uint64_t u, v;
sscanf(buf, "hypot_u05 %" PRIx64 " %" PRIx64, &u, &v);
u = d2u(xhypot_u05(u2d(u), u2d(v)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "hypot_u35 ")) {
uint64_t u, v;
sscanf(buf, "hypot_u35 %" PRIx64 " %" PRIx64, &u, &v);
u = d2u(xhypot_u35(u2d(u), u2d(v)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "copysign ")) {
uint64_t u, v;
sscanf(buf, "copysign %" PRIx64 " %" PRIx64, &u, &v);
u = d2u(xcopysign(u2d(u), u2d(v)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "fmax ")) {
uint64_t u, v;
sscanf(buf, "fmax %" PRIx64 " %" PRIx64, &u, &v);
u = d2u(xfmax(u2d(u), u2d(v)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "fmin ")) {
uint64_t u, v;
sscanf(buf, "fmin %" PRIx64 " %" PRIx64, &u, &v);
u = d2u(xfmin(u2d(u), u2d(v)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "fdim ")) {
uint64_t u, v;
sscanf(buf, "fdim %" PRIx64 " %" PRIx64, &u, &v);
u = d2u(xfdim(u2d(u), u2d(v)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "nextafter ")) {
uint64_t u, v;
sscanf(buf, "nextafter %" PRIx64 " %" PRIx64, &u, &v);
u = d2u(xnextafter(u2d(u), u2d(v)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "fmod ")) {
uint64_t u, v;
sscanf(buf, "fmod %" PRIx64 " %" PRIx64, &u, &v);
u = d2u(xfmod(u2d(u), u2d(v)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "remainder ")) {
uint64_t u, v;
sscanf(buf, "remainder %" PRIx64 " %" PRIx64, &u, &v);
u = d2u(xremainder(u2d(u), u2d(v)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "fabs ")) {
uint64_t u;
sscanf(buf, "fabs %" PRIx64, &u);
u = d2u(xfabs(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "trunc ")) {
uint64_t u;
sscanf(buf, "trunc %" PRIx64, &u);
u = d2u(xtrunc(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "floor ")) {
uint64_t u;
sscanf(buf, "floor %" PRIx64, &u);
u = d2u(xfloor(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "ceil ")) {
uint64_t u;
sscanf(buf, "ceil %" PRIx64, &u);
u = d2u(xceil(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "round ")) {
uint64_t u;
sscanf(buf, "round %" PRIx64, &u);
u = d2u(xround(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "rint ")) {
uint64_t u;
sscanf(buf, "rint %" PRIx64, &u);
u = d2u(xrint(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "frfrexp ")) {
uint64_t u;
sscanf(buf, "frfrexp %" PRIx64, &u);
u = d2u(xfrfrexp(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "modf ")) {
uint64_t u;
sscanf(buf, "modf %" PRIx64, &u);
Sleef_double2 x = xmodf(u2d(u));
printf("%" PRIx64 " %" PRIx64 "\n", d2u(x.x), d2u(x.y));
} else if (startsWith(buf, "tgamma_u1 ")) {
uint64_t u;
sscanf(buf, "tgamma_u1 %" PRIx64, &u);
u = d2u(xtgamma_u1(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "lgamma_u1 ")) {
uint64_t u;
sscanf(buf, "lgamma_u1 %" PRIx64, &u);
u = d2u(xlgamma_u1(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "erf_u1 ")) {
uint64_t u;
sscanf(buf, "erf_u1 %" PRIx64, &u);
u = d2u(xerf_u1(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "erfc_u15 ")) {
uint64_t u;
sscanf(buf, "erfc_u15 %" PRIx64, &u);
u = d2u(xerfc_u15(u2d(u)));
printf("%" PRIx64 "\n", u);
}
else if (startsWith(buf, "sinf ")) {
uint32_t u;
sscanf(buf, "sinf %x", &u);
u = f2u(xsinf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "cosf ")) {
uint32_t u;
sscanf(buf, "cosf %x", &u);
u = f2u(xcosf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "sincosf ")) {
uint32_t u;
sscanf(buf, "sincosf %x", &u);
Sleef_float2 x = xsincosf(u2f(u));
printf("%x %x\n", f2u(x.x), f2u(x.y));
} else if (startsWith(buf, "tanf ")) {
uint32_t u;
sscanf(buf, "tanf %x", &u);
u = f2u(xtanf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "asinf ")) {
uint32_t u;
sscanf(buf, "asinf %x", &u);
u = f2u(xasinf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "acosf ")) {
uint32_t u;
sscanf(buf, "acosf %x", &u);
u = f2u(xacosf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "atanf ")) {
uint32_t u;
sscanf(buf, "atanf %x", &u);
u = f2u(xatanf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "atan2f ")) {
uint32_t u, v;
sscanf(buf, "atan2f %x %x", &u, &v);
u = f2u(xatan2f(u2f(u), u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "logf ")) {
uint32_t u;
sscanf(buf, "logf %x", &u);
u = f2u(xlogf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "expf ")) {
uint32_t u;
sscanf(buf, "expf %x", &u);
u = f2u(xexpf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "cbrtf ")) {
uint32_t u;
sscanf(buf, "cbrtf %x", &u);
u = f2u(xcbrtf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "sqrtf ")) {
uint32_t u;
sscanf(buf, "sqrtf %x", &u);
u = f2u(xsqrtf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "sqrtf_u05 ")) {
uint32_t u;
sscanf(buf, "sqrtf_u05 %x", &u);
u = f2u(xsqrtf_u05(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "sqrtf_u35 ")) {
uint32_t u;
sscanf(buf, "sqrtf_u35 %x", &u);
u = f2u(xsqrtf_u35(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "ldexpf ")) {
uint32_t u, v;
sscanf(buf, "ldexpf %x %x", &u, &v);
u = f2u(xldexpf(u2f(u), (int)u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "powf ")) {
uint32_t u, v;
sscanf(buf, "powf %x %x", &u, &v);
u = f2u(xpowf(u2f(u), u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "fastpowf_u3500 ")) {
uint32_t u, v;
sscanf(buf, "fastpowf_u3500 %x %x", &u, &v);
u = f2u(xfastpowf_u3500(u2f(u), u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "sinhf ")) {
uint32_t u;
sscanf(buf, "sinhf %x", &u);
u = f2u(xsinhf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "coshf ")) {
uint32_t u;
sscanf(buf, "coshf %x", &u);
u = f2u(xcoshf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "tanhf ")) {
uint32_t u;
sscanf(buf, "tanhf %x", &u);
u = f2u(xtanhf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "sinhf_u35 ")) {
uint32_t u;
sscanf(buf, "sinhf_u35 %x", &u);
u = f2u(xsinhf_u35(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "coshf_u35 ")) {
uint32_t u;
sscanf(buf, "coshf_u35 %x", &u);
u = f2u(xcoshf_u35(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "tanhf_u35 ")) {
uint32_t u;
sscanf(buf, "tanhf_u35 %x", &u);
u = f2u(xtanhf_u35(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "asinhf ")) {
uint32_t u;
sscanf(buf, "asinhf %x", &u);
u = f2u(xasinhf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "acoshf ")) {
uint32_t u;
sscanf(buf, "acoshf %x", &u);
u = f2u(xacoshf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "atanhf ")) {
uint32_t u;
sscanf(buf, "atanhf %x", &u);
u = f2u(xatanhf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "exp2f ")) {
uint32_t u;
sscanf(buf, "exp2f %x", &u);
u = f2u(xexp2f(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "exp10f ")) {
uint32_t u;
sscanf(buf, "exp10f %x", &u);
u = f2u(xexp10f(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "exp2f_u35 ")) {
uint32_t u;
sscanf(buf, "exp2f_u35 %x", &u);
u = f2u(xexp2f_u35(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "exp10f_u35 ")) {
uint32_t u;
sscanf(buf, "exp10f_u35 %x", &u);
u = f2u(xexp10f_u35(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "expm1f ")) {
uint32_t u;
sscanf(buf, "expm1f %x", &u);
u = f2u(xexpm1f(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "log10f ")) {
uint32_t u;
sscanf(buf, "log10f %x", &u);
u = f2u(xlog10f(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "log2f ")) {
uint32_t u;
sscanf(buf, "log2f %x", &u);
u = f2u(xlog2f(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "log2f_u35 ")) {
uint32_t u;
sscanf(buf, "log2f_u35 %x", &u);
u = f2u(xlog2f_u35(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "log1pf ")) {
uint32_t u;
sscanf(buf, "log1pf %x", &u);
u = f2u(xlog1pf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "sinf_u1 ")) {
uint32_t u;
sscanf(buf, "sinf_u1 %x", &u);
u = f2u(xsinf_u1(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "cosf_u1 ")) {
uint32_t u;
sscanf(buf, "cosf_u1 %x", &u);
u = f2u(xcosf_u1(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "sincosf_u1 ")) {
uint32_t u;
sscanf(buf, "sincosf_u1 %x", &u);
Sleef_float2 x = xsincosf_u1(u2f(u));
printf("%x %x\n", f2u(x.x), f2u(x.y));
} else if (startsWith(buf, "sincospif_u05 ")) {
uint32_t u;
sscanf(buf, "sincospif_u05 %x", &u);
Sleef_float2 x = xsincospif_u05(u2f(u));
printf("%x %x\n", f2u(x.x), f2u(x.y));
} else if (startsWith(buf, "sincospif_u35 ")) {
uint32_t u;
sscanf(buf, "sincospif_u35 %x", &u);
Sleef_float2 x = xsincospif_u35(u2f(u));
printf("%x %x\n", f2u(x.x), f2u(x.y));
} else if (startsWith(buf, "sinpif_u05 ")) {
uint32_t u;
sscanf(buf, "sinpif_u05 %x", &u);
u = f2u(xsinpif_u05(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "cospif_u05 ")) {
uint32_t u;
sscanf(buf, "cospif_u05 %x", &u);
u = f2u(xcospif_u05(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "fastsinf_u3500 ")) {
uint32_t u;
sscanf(buf, "fastsinf_u3500 %x", &u);
u = f2u(xfastsinf_u3500(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "fastcosf_u3500 ")) {
uint32_t u;
sscanf(buf, "fastcosf_u3500 %x", &u);
u = f2u(xfastcosf_u3500(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "tanf_u1 ")) {
uint32_t u;
sscanf(buf, "tanf_u1 %x", &u);
u = f2u(xtanf_u1(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "asinf_u1 ")) {
uint32_t u;
sscanf(buf, "asinf_u1 %x", &u);
u = f2u(xasinf_u1(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "acosf_u1 ")) {
uint32_t u;
sscanf(buf, "acosf_u1 %x", &u);
u = f2u(xacosf_u1(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "atanf_u1 ")) {
uint32_t u;
sscanf(buf, "atanf_u1 %x", &u);
u = f2u(xatanf_u1(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "atan2f_u1 ")) {
uint32_t u, v;
sscanf(buf, "atan2f_u1 %x %x", &u, &v);
u = f2u(xatan2f_u1(u2f(u), u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "logf_u1 ")) {
uint32_t u;
sscanf(buf, "logf_u1 %x", &u);
u = f2u(xlogf_u1(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "cbrtf_u1 ")) {
uint32_t u;
sscanf(buf, "cbrtf_u1 %x", &u);
u = f2u(xcbrtf_u1(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "ilogb ")) {
uint64_t u;
int i;
sscanf(buf, "ilogb %" PRIx64, &u);
i = xilogb(u2d(u));
printf("%d\n", i);
} else if (startsWith(buf, "ilogbf ")) {
uint32_t u;
int i;
sscanf(buf, "ilogbf %x", &u);
i = xilogbf(u2f(u));
printf("%d\n", i);
}
else if (startsWith(buf, "hypotf_u05 ")) {
uint32_t u, v;
sscanf(buf, "hypotf_u05 %x %x", &u, &v);
u = f2u(xhypotf_u05(u2f(u), u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "hypotf_u35 ")) {
uint32_t u, v;
sscanf(buf, "hypotf_u35 %x %x", &u, &v);
u = f2u(xhypotf_u35(u2f(u), u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "copysignf ")) {
uint32_t u, v;
sscanf(buf, "copysignf %x %x", &u, &v);
u = f2u(xcopysignf(u2f(u), u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "fmaxf ")) {
uint32_t u, v;
sscanf(buf, "fmaxf %x %x", &u, &v);
u = f2u(xfmaxf(u2f(u), u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "fminf ")) {
uint32_t u, v;
sscanf(buf, "fminf %x %x", &u, &v);
u = f2u(xfminf(u2f(u), u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "fdimf ")) {
uint32_t u, v;
sscanf(buf, "fdimf %x %x", &u, &v);
u = f2u(xfdimf(u2f(u), u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "nextafterf ")) {
uint32_t u, v;
sscanf(buf, "nextafterf %x %x", &u, &v);
u = f2u(xnextafterf(u2f(u), u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "fmodf ")) {
uint32_t u, v;
sscanf(buf, "fmodf %x %x", &u, &v);
u = f2u(xfmodf(u2f(u), u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "remainderf ")) {
uint32_t u, v;
sscanf(buf, "remainderf %x %x", &u, &v);
u = f2u(xremainderf(u2f(u), u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "fabsf ")) {
uint32_t u;
sscanf(buf, "fabsf %x", &u);
u = f2u(xfabsf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "truncf ")) {
uint32_t u;
sscanf(buf, "truncf %x", &u);
u = f2u(xtruncf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "floorf ")) {
uint32_t u;
sscanf(buf, "floorf %x", &u);
u = f2u(xfloorf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "ceilf ")) {
uint32_t u;
sscanf(buf, "ceilf %x", &u);
u = f2u(xceilf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "roundf ")) {
uint32_t u;
sscanf(buf, "roundf %x", &u);
u = f2u(xroundf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "rintf ")) {
uint32_t u;
sscanf(buf, "rintf %x", &u);
u = f2u(xrintf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "frfrexpf ")) {
uint32_t u;
sscanf(buf, "frfrexpf %x", &u);
u = f2u(xfrfrexpf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "modff ")) {
uint32_t u;
sscanf(buf, "modff %x", &u);
Sleef_float2 x = xmodff(u2f(u));
printf("%x %x\n", f2u(x.x), f2u(x.y));
} else if (startsWith(buf, "tgammaf_u1 ")) {
uint32_t u;
sscanf(buf, "tgammaf_u1 %x", &u);
u = f2u(xtgammaf_u1(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "lgammaf_u1 ")) {
uint32_t u;
sscanf(buf, "lgammaf_u1 %x", &u);
u = f2u(xlgammaf_u1(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "erff_u1 ")) {
uint32_t u;
sscanf(buf, "erff_u1 %x", &u);
u = f2u(xerff_u1(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "erfcf_u15 ")) {
uint32_t u;
sscanf(buf, "erfcf_u15 %x", &u);
u = f2u(xerfcf_u15(u2f(u)));
printf("%x\n", u);
}
else {
break;
}
fflush(stdout);
}
return 0;
}

@@ -0,0 +1,546 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <math.h>
#include <float.h>
#include <stdint.h>
#include <cuda.h>
#include "sleefinline_purec_scalar.h"
#include "sleefinline_cuda.h"
#define STDIN_FILENO 0
#define SIMD_SUFFIX _cuda_sleef
#define CONCAT_SIMD_SUFFIX_(keyword, suffix) keyword ## suffix
#define CONCAT_SIMD_SUFFIX(keyword, suffix) CONCAT_SIMD_SUFFIX_(keyword, suffix)
#define vdouble2 CONCAT_SIMD_SUFFIX(vdouble2, SIMD_SUFFIX)
#define vfloat2 CONCAT_SIMD_SUFFIX(vfloat2, SIMD_SUFFIX)
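// The inline SLEEF headers mangle their type names with a SIMD suffix, so
// the aliases above let the rest of this file use the generic names (here,
// vdouble2 resolves to vdouble2_cuda_sleef).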
//
static int startsWith(const char *str, const char *prefix) {
while(*prefix != '\0') if (*str++ != *prefix++) return 0;
return *prefix == '\0';
}
static double u2d(uint64_t u) {
union {
double f;
uint64_t i;
} tmp;
tmp.i = u;
return tmp.f;
}
static uint64_t d2u(double d) {
union {
double f;
uint64_t i;
} tmp;
tmp.f = d;
return tmp.i;
}
static float u2f(uint32_t u) {
union {
float f;
uint32_t i;
} tmp;
tmp.i = u;
return tmp.f;
}
static uint32_t f2u(float d) {
union {
float f;
uint32_t i;
} tmp;
tmp.f = d;
return tmp.i;
}
//
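// Each kernel below wraps exactly one scalar CUDA SLEEF function, reading its
// operands from and writing its result to device memory, so the host side of
// the tester can exercise any function by launching the matching kernel
// (presumably on a single thread).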
__global__ void xsin(double *r, double *a0) { *r = Sleef_sind1_u35cuda(*a0); }
__global__ void xcos(double *r, double *a0) { *r = Sleef_cosd1_u35cuda(*a0); }
__global__ void xsincos(vdouble2 *r, double *a0) { *r = Sleef_sincosd1_u35cuda(*a0); }
__global__ void xtan(double *r, double *a0) { *r = Sleef_tand1_u35cuda(*a0); }
__global__ void xasin(double *r, double *a0) { *r = Sleef_asind1_u35cuda(*a0); }
__global__ void xacos(double *r, double *a0) { *r = Sleef_acosd1_u35cuda(*a0); }
__global__ void xatan(double *r, double *a0) { *r = Sleef_atand1_u35cuda(*a0); }
__global__ void xatan2(double *r, double *a0, double *a1) { *r = Sleef_atan2d1_u35cuda(*a0, *a1); }
__global__ void xlog(double *r, double *a0) { *r = Sleef_logd1_u35cuda(*a0); }
__global__ void xcbrt(double *r, double *a0) { *r = Sleef_cbrtd1_u35cuda(*a0); }
__global__ void xsin_u1(double *r, double *a0) { *r = Sleef_sind1_u10cuda(*a0); }
__global__ void xcos_u1(double *r, double *a0) { *r = Sleef_cosd1_u10cuda(*a0); }
__global__ void xsincos_u1(vdouble2 *r, double *a0) { *r = Sleef_sincosd1_u10cuda(*a0); }
__global__ void xtan_u1(double *r, double *a0) { *r = Sleef_tand1_u10cuda(*a0); }
__global__ void xasin_u1(double *r, double *a0) { *r = Sleef_asind1_u10cuda(*a0); }
__global__ void xacos_u1(double *r, double *a0) { *r = Sleef_acosd1_u10cuda(*a0); }
__global__ void xatan_u1(double *r, double *a0) { *r = Sleef_atand1_u10cuda(*a0); }
__global__ void xatan2_u1(double *r, double *a0, double *a1) { *r = Sleef_atan2d1_u10cuda(*a0, *a1); }
__global__ void xlog_u1(double *r, double *a0) { *r = Sleef_logd1_u10cuda(*a0); }
__global__ void xcbrt_u1(double *r, double *a0) { *r = Sleef_cbrtd1_u10cuda(*a0); }
__global__ void xexp(double *r, double *a0) { *r = Sleef_expd1_u10cuda(*a0); }
__global__ void xpow(double *r, double *a0, double *a1) { *r = Sleef_powd1_u10cuda(*a0, *a1); }
__global__ void xsinh(double *r, double *a0) { *r = Sleef_sinhd1_u10cuda(*a0); }
__global__ void xcosh(double *r, double *a0) { *r = Sleef_coshd1_u10cuda(*a0); }
__global__ void xtanh(double *r, double *a0) { *r = Sleef_tanhd1_u10cuda(*a0); }
__global__ void xsinh_u35(double *r, double *a0) { *r = Sleef_sinhd1_u35cuda(*a0); }
__global__ void xcosh_u35(double *r, double *a0) { *r = Sleef_coshd1_u35cuda(*a0); }
__global__ void xtanh_u35(double *r, double *a0) { *r = Sleef_tanhd1_u35cuda(*a0); }
__global__ void xasinh(double *r, double *a0) { *r = Sleef_asinhd1_u10cuda(*a0); }
__global__ void xacosh(double *r, double *a0) { *r = Sleef_acoshd1_u10cuda(*a0); }
__global__ void xatanh(double *r, double *a0) { *r = Sleef_atanhd1_u10cuda(*a0); }
__global__ void xexp2(double *r, double *a0) { *r = Sleef_exp2d1_u10cuda(*a0); }
__global__ void xexp2_u35(double *r, double *a0) { *r = Sleef_exp2d1_u35cuda(*a0); }
__global__ void xexp10(double *r, double *a0) { *r = Sleef_exp10d1_u10cuda(*a0); }
__global__ void xexp10_u35(double *r, double *a0) { *r = Sleef_exp10d1_u35cuda(*a0); }
__global__ void xexpm1(double *r, double *a0) { *r = Sleef_expm1d1_u10cuda(*a0); }
__global__ void xlog10(double *r, double *a0) { *r = Sleef_log10d1_u10cuda(*a0); }
__global__ void xlog2(double *r, double *a0) { *r = Sleef_log2d1_u10cuda(*a0); }
__global__ void xlog2_u35(double *r, double *a0) { *r = Sleef_log2d1_u35cuda(*a0); }
__global__ void xlog1p(double *r, double *a0) { *r = Sleef_log1pd1_u10cuda(*a0); }
__global__ void xsincospi_u05(vdouble2 *r, double *a0) { *r = Sleef_sincospid1_u05cuda(*a0); }
__global__ void xsincospi_u35(vdouble2 *r, double *a0) { *r = Sleef_sincospid1_u35cuda(*a0); }
__global__ void xsinpi_u05(double *r, double *a0) { *r = Sleef_sinpid1_u05cuda(*a0); }
__global__ void xcospi_u05(double *r, double *a0) { *r = Sleef_cospid1_u05cuda(*a0); }
__global__ void xldexp(double *r, double *a0, int *a1) { *r = Sleef_ldexpd1_cuda(*a0, *a1); }
__global__ void xilogb(int *r, double *a0) { *r = Sleef_ilogbd1_cuda(*a0); }
__global__ void xfma(double *r, double *a0, double *a1, double *a2) { *r = Sleef_fmad1_cuda(*a0, *a1, *a2); }
__global__ void xsqrt(double *r, double *a0) { *r = Sleef_sqrtd1_cuda(*a0); }
__global__ void xsqrt_u05(double *r, double *a0) { *r = Sleef_sqrtd1_u05cuda(*a0); }
__global__ void xsqrt_u35(double *r, double *a0) { *r = Sleef_sqrtd1_u35cuda(*a0); }
__global__ void xhypot_u05(double *r, double *a0, double *a1) { *r = Sleef_hypotd1_u05cuda(*a0, *a1); }
__global__ void xhypot_u35(double *r, double *a0, double *a1) { *r = Sleef_hypotd1_u35cuda(*a0, *a1); }
__global__ void xfabs(double *r, double *a0) { *r = Sleef_fabsd1_cuda(*a0); }
__global__ void xcopysign(double *r, double *a0, double *a1) { *r = Sleef_copysignd1_cuda(*a0, *a1); }
__global__ void xfmax(double *r, double *a0, double *a1) { *r = Sleef_fmaxd1_cuda(*a0, *a1); }
__global__ void xfmin(double *r, double *a0, double *a1) { *r = Sleef_fmind1_cuda(*a0, *a1); }
__global__ void xfdim(double *r, double *a0, double *a1) { *r = Sleef_fdimd1_cuda(*a0, *a1); }
__global__ void xtrunc(double *r, double *a0) { *r = Sleef_truncd1_cuda(*a0); }
__global__ void xfloor(double *r, double *a0) { *r = Sleef_floord1_cuda(*a0); }
__global__ void xceil(double *r, double *a0) { *r = Sleef_ceild1_cuda(*a0); }
__global__ void xround(double *r, double *a0) { *r = Sleef_roundd1_cuda(*a0); }
__global__ void xrint(double *r, double *a0) { *r = Sleef_rintd1_cuda(*a0); }
__global__ void xnextafter(double *r, double *a0, double *a1) { *r = Sleef_nextafterd1_cuda(*a0, *a1); }
__global__ void xfrfrexp(double *r, double *a0) { *r = Sleef_frfrexpd1_cuda(*a0); }
__global__ void xexpfrexp(int *r, double *a0) { *r = Sleef_expfrexpd1_cuda(*a0); }
__global__ void xfmod(double *r, double *a0, double *a1) { *r = Sleef_fmodd1_cuda(*a0, *a1); }
__global__ void xremainder(double *r, double *a0, double *a1) { *r = Sleef_remainderd1_cuda(*a0, *a1); }
__global__ void xmodf(vdouble2 *r, double *a0) { *r = Sleef_modfd1_cuda(*a0); }
__global__ void xlgamma_u1(double *r, double *a0) { *r = Sleef_lgammad1_u10cuda(*a0); }
__global__ void xtgamma_u1(double *r, double *a0) { *r = Sleef_tgammad1_u10cuda(*a0); }
__global__ void xerf_u1(double *r, double *a0) { *r = Sleef_erfd1_u10cuda(*a0); }
__global__ void xerfc_u15(double *r, double *a0) { *r = Sleef_erfcd1_u15cuda(*a0); }
__global__ void xsinf(float *r, float *a0) { *r = Sleef_sinf1_u35cuda(*a0); }
__global__ void xcosf(float *r, float *a0) { *r = Sleef_cosf1_u35cuda(*a0); }
__global__ void xsincosf(vfloat2 *r, float *a0) { *r = Sleef_sincosf1_u35cuda(*a0); }
__global__ void xtanf(float *r, float *a0) { *r = Sleef_tanf1_u35cuda(*a0); }
__global__ void xasinf(float *r, float *a0) { *r = Sleef_asinf1_u35cuda(*a0); }
__global__ void xacosf(float *r, float *a0) { *r = Sleef_acosf1_u35cuda(*a0); }
__global__ void xatanf(float *r, float *a0) { *r = Sleef_atanf1_u35cuda(*a0); }
__global__ void xatan2f(float *r, float *a0, float *a1) { *r = Sleef_atan2f1_u35cuda(*a0, *a1); }
__global__ void xlogf(float *r, float *a0) { *r = Sleef_logf1_u35cuda(*a0); }
__global__ void xcbrtf(float *r, float *a0) { *r = Sleef_cbrtf1_u35cuda(*a0); }
__global__ void xsinf_u1(float *r, float *a0) { *r = Sleef_sinf1_u10cuda(*a0); }
__global__ void xcosf_u1(float *r, float *a0) { *r = Sleef_cosf1_u10cuda(*a0); }
__global__ void xsincosf_u1(vfloat2 *r, float *a0) { *r = Sleef_sincosf1_u10cuda(*a0); }
__global__ void xtanf_u1(float *r, float *a0) { *r = Sleef_tanf1_u10cuda(*a0); }
__global__ void xasinf_u1(float *r, float *a0) { *r = Sleef_asinf1_u10cuda(*a0); }
__global__ void xacosf_u1(float *r, float *a0) { *r = Sleef_acosf1_u10cuda(*a0); }
__global__ void xatanf_u1(float *r, float *a0) { *r = Sleef_atanf1_u10cuda(*a0); }
__global__ void xatan2f_u1(float *r, float *a0, float *a1) { *r = Sleef_atan2f1_u10cuda(*a0, *a1); }
__global__ void xlogf_u1(float *r, float *a0) { *r = Sleef_logf1_u10cuda(*a0); }
__global__ void xcbrtf_u1(float *r, float *a0) { *r = Sleef_cbrtf1_u10cuda(*a0); }
__global__ void xexpf(float *r, float *a0) { *r = Sleef_expf1_u10cuda(*a0); }
__global__ void xpowf(float *r, float *a0, float *a1) { *r = Sleef_powf1_u10cuda(*a0, *a1); }
__global__ void xsinhf(float *r, float *a0) { *r = Sleef_sinhf1_u10cuda(*a0); }
__global__ void xcoshf(float *r, float *a0) { *r = Sleef_coshf1_u10cuda(*a0); }
__global__ void xtanhf(float *r, float *a0) { *r = Sleef_tanhf1_u10cuda(*a0); }
__global__ void xsinhf_u35(float *r, float *a0) { *r = Sleef_sinhf1_u35cuda(*a0); }
__global__ void xcoshf_u35(float *r, float *a0) { *r = Sleef_coshf1_u35cuda(*a0); }
__global__ void xtanhf_u35(float *r, float *a0) { *r = Sleef_tanhf1_u35cuda(*a0); }
__global__ void xfastsinf_u3500(float *r, float *a0) { *r = Sleef_fastsinf1_u3500cuda(*a0); }
__global__ void xfastcosf_u3500(float *r, float *a0) { *r = Sleef_fastcosf1_u3500cuda(*a0); }
__global__ void xfastpowf_u3500(float *r, float *a0, float *a1) { *r = Sleef_fastpowf1_u3500cuda(*a0, *a1); }
__global__ void xasinhf(float *r, float *a0) { *r = Sleef_asinhf1_u10cuda(*a0); }
__global__ void xacoshf(float *r, float *a0) { *r = Sleef_acoshf1_u10cuda(*a0); }
__global__ void xatanhf(float *r, float *a0) { *r = Sleef_atanhf1_u10cuda(*a0); }
__global__ void xexp2f(float *r, float *a0) { *r = Sleef_exp2f1_u10cuda(*a0); }
__global__ void xexp2f_u35(float *r, float *a0) { *r = Sleef_exp2f1_u35cuda(*a0); }
__global__ void xexp10f(float *r, float *a0) { *r = Sleef_exp10f1_u10cuda(*a0); }
__global__ void xexp10f_u35(float *r, float *a0) { *r = Sleef_exp10f1_u35cuda(*a0); }
__global__ void xexpm1f(float *r, float *a0) { *r = Sleef_expm1f1_u10cuda(*a0); }
__global__ void xlog10f(float *r, float *a0) { *r = Sleef_log10f1_u10cuda(*a0); }
__global__ void xlog2f(float *r, float *a0) { *r = Sleef_log2f1_u10cuda(*a0); }
__global__ void xlog2f_u35(float *r, float *a0) { *r = Sleef_log2f1_u35cuda(*a0); }
__global__ void xlog1pf(float *r, float *a0) { *r = Sleef_log1pf1_u10cuda(*a0); }
__global__ void xsincospif_u05(vfloat2 *r, float *a0) { *r = Sleef_sincospif1_u05cuda(*a0); }
__global__ void xsincospif_u35(vfloat2 *r, float *a0) { *r = Sleef_sincospif1_u35cuda(*a0); }
__global__ void xsinpif_u05(float *r, float *a0) { *r = Sleef_sinpif1_u05cuda(*a0); }
__global__ void xcospif_u05(float *r, float *a0) { *r = Sleef_cospif1_u05cuda(*a0); }
__global__ void xldexpf(float *r, float *a0, int *a1) { *r = Sleef_ldexpf1_cuda(*a0, *a1); }
__global__ void xilogbf(int *r, float *a0) { *r = Sleef_ilogbf1_cuda(*a0); }
__global__ void xfmaf(float *r, float *a0, float *a1, float *a2) { *r = Sleef_fmaf1_cuda(*a0, *a1, *a2); }
__global__ void xsqrtf(float *r, float *a0) { *r = Sleef_sqrtf1_cuda(*a0); }
__global__ void xsqrtf_u05(float *r, float *a0) { *r = Sleef_sqrtf1_u05cuda(*a0); }
__global__ void xsqrtf_u35(float *r, float *a0) { *r = Sleef_sqrtf1_u35cuda(*a0); }
__global__ void xhypotf_u05(float *r, float *a0, float *a1) { *r = Sleef_hypotf1_u05cuda(*a0, *a1); }
__global__ void xhypotf_u35(float *r, float *a0, float *a1) { *r = Sleef_hypotf1_u35cuda(*a0, *a1); }
__global__ void xfabsf(float *r, float *a0) { *r = Sleef_fabsf1_cuda(*a0); }
__global__ void xcopysignf(float *r, float *a0, float *a1) { *r = Sleef_copysignf1_cuda(*a0, *a1); }
__global__ void xfmaxf(float *r, float *a0, float *a1) { *r = Sleef_fmaxf1_cuda(*a0, *a1); }
__global__ void xfminf(float *r, float *a0, float *a1) { *r = Sleef_fminf1_cuda(*a0, *a1); }
__global__ void xfdimf(float *r, float *a0, float *a1) { *r = Sleef_fdimf1_cuda(*a0, *a1); }
__global__ void xtruncf(float *r, float *a0) { *r = Sleef_truncf1_cuda(*a0); }
__global__ void xfloorf(float *r, float *a0) { *r = Sleef_floorf1_cuda(*a0); }
__global__ void xceilf(float *r, float *a0) { *r = Sleef_ceilf1_cuda(*a0); }
__global__ void xroundf(float *r, float *a0) { *r = Sleef_roundf1_cuda(*a0); }
__global__ void xrintf(float *r, float *a0) { *r = Sleef_rintf1_cuda(*a0); }
__global__ void xnextafterf(float *r, float *a0, float *a1) { *r = Sleef_nextafterf1_cuda(*a0, *a1); }
__global__ void xfrfrexpf(float *r, float *a0) { *r = Sleef_frfrexpf1_cuda(*a0); }
__global__ void xexpfrexpf(float *r, float *a0) { *r = Sleef_expfrexpf1_cuda(*a0); }
__global__ void xfmodf(float *r, float *a0, float *a1) { *r = Sleef_fmodf1_cuda(*a0, *a1); }
__global__ void xremainderf(float *r, float *a0, float *a1) { *r = Sleef_remainderf1_cuda(*a0, *a1); }
__global__ void xmodff(vfloat2 *r, float *a0) { *r = Sleef_modff1_cuda(*a0); }
__global__ void xlgammaf_u1(float *r, float *a0) { *r = Sleef_lgammaf1_u10cuda(*a0); }
__global__ void xtgammaf_u1(float *r, float *a0) { *r = Sleef_tgammaf1_u10cuda(*a0); }
__global__ void xerff_u1(float *r, float *a0) { *r = Sleef_erff1_u10cuda(*a0); }
__global__ void xerfcf_u15(float *r, float *a0) { *r = Sleef_erfcf1_u15cuda(*a0); }
//
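// Request handlers: each func_*_* macro loops while stdin lines start with
// funcStr, parses the hex-encoded operand(s), launches the single-thread
// CUDA kernel funcName, and prints the hex-encoded result.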
#define func_d_d(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint64_t u; \
sscanf(buf, funcStr " %" PRIx64, &u); \
*a0 = u2d(u); \
funcName<<<1, 1>>>(r, a0); \
cudaDeviceSynchronize(); \
printf("%" PRIx64 "\n", d2u(*r)); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
#define func_d2_d(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint64_t u; \
sscanf(buf, funcStr " %" PRIx64, &u); \
*a0 = u2d(u); \
funcName<<<1, 1>>>(r2, a0); \
cudaDeviceSynchronize(); \
printf("%" PRIx64 " %" PRIx64 "\n", d2u(r2->x), d2u(r2->y)); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
#define func_d_d_d(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint64_t u, v; \
sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v); \
*a0 = u2d(u); \
*a1 = u2d(v); \
funcName<<<1, 1>>>(r, a0, a1); \
cudaDeviceSynchronize(); \
printf("%" PRIx64 "\n", d2u(*r)); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
#define func_d_d_i(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint64_t u, v; \
sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v); \
*a0 = u2d(u); \
*i0 = (int)u2d(v); \
funcName<<<1, 1>>>(r, a0, i0); \
cudaDeviceSynchronize(); \
printf("%" PRIx64 "\n", d2u(*r)); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
#define func_i_d(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint64_t u; \
sscanf(buf, funcStr " %" PRIx64, &u); \
*a0 = u2d(u); \
funcName<<<1, 1>>>(i0, a0); \
cudaDeviceSynchronize(); \
printf("%d\n", *i0); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
//
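// Single-precision handlers follow the same protocol with 32-bit hex values.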
#define func_f_f(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint32_t u; \
sscanf(buf, funcStr " %x", &u); \
*b0 = u2f(u); \
funcName<<<1, 1>>>(s, b0); \
cudaDeviceSynchronize(); \
printf("%x\n", f2u(*s)); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
#define func_f2_f(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint32_t u; \
sscanf(buf, funcStr " %x", &u); \
*b0 = u2f(u); \
funcName<<<1, 1>>>(s2, b0); \
cudaDeviceSynchronize(); \
printf("%x %x\n", f2u(s2->x), f2u(s2->y)); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
#define func_f_f_f(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint32_t u, v; \
sscanf(buf, funcStr " %x %x", &u, &v); \
*b0 = u2f(u); \
*b1 = u2f(v); \
funcName<<<1, 1>>>(s, b0, b1); \
cudaDeviceSynchronize(); \
printf("%x\n", f2u(*s)); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
//
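// main prints a capability bitmask (1 = double precision, 2 = single
// precision; always 3 here), then serves one request per input line.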
#define BUFSIZE 1024
int main(int argc, char **argv) {
#if 0
cuInit(0);
int ndevice;
cuDeviceGetCount(&ndevice);
if (ndevice == 0) {
fprintf(stderr, "No cuda device available\n");
exit(0);
}
CUdevice device;
char deviceName[1024];
cuDeviceGet(&device, 0);
cuDeviceGetName(deviceName, 1000, device);
fprintf(stderr, "Device : %s\n", deviceName);
#endif
cudaSetDeviceFlags(cudaDeviceScheduleSpin);
vdouble2 *r2;
vfloat2 *s2;
double *r, *a0, *a1, *a2;
float *s, *b0, *b1, *b2;
int *i0;
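// Managed (unified) memory makes these one-element buffers visible to both
// the host-side request loop and the device kernels without explicit copies.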
cudaMallocManaged(&r , 1*sizeof(double));
cudaMallocManaged(&r2, 1*sizeof(vdouble2));
cudaMallocManaged(&a0, 1*sizeof(double));
cudaMallocManaged(&a1, 1*sizeof(double));
cudaMallocManaged(&a2, 1*sizeof(double));
cudaMallocManaged(&s , 1*sizeof(float));
cudaMallocManaged(&s2, 1*sizeof(vfloat2));
cudaMallocManaged(&b0, 1*sizeof(float));
cudaMallocManaged(&b1, 1*sizeof(float));
cudaMallocManaged(&b2, 1*sizeof(float));
cudaMallocManaged(&i0, 1*sizeof(int));
printf("3\n");
fflush(stdout);
char buf[BUFSIZE];
if (fgets(buf, BUFSIZE-1, stdin)) {}
while(!feof(stdin)) {
func_d_d("sin", xsin);
func_d_d("cos", xcos);
func_d_d("tan", xtan);
func_d_d("asin", xasin);
func_d_d("acos", xacos);
func_d_d("atan", xatan);
func_d_d("log", xlog);
func_d_d("exp", xexp);
func_d_d("sqrt", xsqrt);
func_d_d("sqrt_u05", xsqrt_u05);
func_d_d("sqrt_u35", xsqrt_u35);
func_d_d("cbrt", xcbrt);
func_d_d("cbrt_u1", xcbrt_u1);
func_d_d("sinh", xsinh);
func_d_d("cosh", xcosh);
func_d_d("tanh", xtanh);
func_d_d("sinh_u35", xsinh_u35);
func_d_d("cosh_u35", xcosh_u35);
func_d_d("tanh_u35", xtanh_u35);
func_d_d("asinh", xasinh);
func_d_d("acosh", xacosh);
func_d_d("atanh", xatanh);
func_d_d("sin_u1", xsin_u1);
func_d_d("cos_u1", xcos_u1);
func_d_d("tan_u1", xtan_u1);
func_d_d("sinpi_u05", xsinpi_u05);
func_d_d("cospi_u05", xcospi_u05);
func_d_d("asin_u1", xasin_u1);
func_d_d("acos_u1", xacos_u1);
func_d_d("atan_u1", xatan_u1);
func_d_d("log_u1", xlog_u1);
func_d_d("exp2", xexp2);
func_d_d("exp10", xexp10);
func_d_d("exp2_u35", xexp2_u35);
func_d_d("exp10_u35", xexp10_u35);
func_d_d("expm1", xexpm1);
func_d_d("log10", xlog10);
func_d_d("log2", xlog2);
func_d_d("log2_u35", xlog2_u35);
func_d_d("log1p", xlog1p);
func_d_d("fabs", xfabs);
func_d_d("trunc", xtrunc);
func_d_d("floor", xfloor);
func_d_d("ceil", xceil);
func_d_d("round", xround);
func_d_d("rint", xrint);
func_d_d("frfrexp", xfrfrexp);
func_d_d("tgamma_u1", xtgamma_u1);
func_d_d("lgamma_u1", xlgamma_u1);
func_d_d("erf_u1", xerf_u1);
func_d_d("erfc_u15", xerfc_u15);
func_d2_d("sincos", xsincos);
func_d2_d("sincos_u1", xsincos_u1);
func_d2_d("sincospi_u35", xsincospi_u35);
func_d2_d("sincospi_u05", xsincospi_u05);
func_d2_d("modf", xmodf);
func_d_d_d("pow", xpow);
func_d_d_d("atan2", xatan2);
func_d_d_d("atan2_u1", xatan2_u1);
func_d_d_d("hypot_u05", xhypot_u05);
func_d_d_d("hypot_u35", xhypot_u35);
func_d_d_d("copysign", xcopysign);
func_d_d_d("fmax", xfmax);
func_d_d_d("fmin", xfmin);
func_d_d_d("fdim", xfdim);
func_d_d_d("nextafter", xnextafter);
func_d_d_d("fmod", xfmod);
func_d_d_d("remainder", xremainder);
func_d_d_i("ldexp", xldexp);
func_i_d("ilogb", xilogb);
func_i_d("expfrexp", xexpfrexp);
//
func_f_f("sinf", xsinf);
func_f_f("cosf", xcosf);
func_f_f("tanf", xtanf);
func_f_f("asinf", xasinf);
func_f_f("acosf", xacosf);
func_f_f("atanf", xatanf);
func_f_f("logf", xlogf);
func_f_f("expf", xexpf);
func_f_f("sqrtf", xsqrtf);
func_f_f("sqrtf_u05", xsqrtf_u05);
func_f_f("sqrtf_u35", xsqrtf_u35);
func_f_f("cbrtf", xcbrtf);
func_f_f("cbrtf_u1", xcbrtf_u1);
func_f_f("sinhf", xsinhf);
func_f_f("coshf", xcoshf);
func_f_f("tanhf", xtanhf);
func_f_f("sinhf_u35", xsinhf_u35);
func_f_f("coshf_u35", xcoshf_u35);
func_f_f("tanhf_u35", xtanhf_u35);
func_f_f("asinhf", xasinhf);
func_f_f("acoshf", xacoshf);
func_f_f("atanhf", xatanhf);
func_f_f("sinf_u1", xsinf_u1);
func_f_f("cosf_u1", xcosf_u1);
func_f_f("tanf_u1", xtanf_u1);
func_f_f("sinpif_u05", xsinpif_u05);
func_f_f("cospif_u05", xcospif_u05);
func_f_f("asinf_u1", xasinf_u1);
func_f_f("acosf_u1", xacosf_u1);
func_f_f("atanf_u1", xatanf_u1);
func_f_f("logf_u1", xlogf_u1);
func_f_f("exp2f", xexp2f);
func_f_f("exp10f", xexp10f);
func_f_f("exp2f_u35", xexp2f_u35);
func_f_f("exp10f_u35", xexp10f_u35);
func_f_f("expm1f", xexpm1f);
func_f_f("log10f", xlog10f);
func_f_f("log2f", xlog2f);
func_f_f("log2f_u35", xlog2f_u35);
func_f_f("log1pf", xlog1pf);
func_f2_f("sincosf", xsincosf);
func_f2_f("sincosf_u1", xsincosf_u1);
func_f2_f("sincospif_u35", xsincospif_u35);
func_f2_f("sincospif_u05", xsincospif_u05);
func_f_f_f("powf", xpowf);
func_f_f_f("atan2f", xatan2f);
func_f_f_f("atan2f_u1", xatan2f_u1);
func_f_f("fabsf", xfabsf);
func_f_f("truncf", xtruncf);
func_f_f("floorf", xfloorf);
func_f_f("ceilf", xceilf);
func_f_f("roundf", xroundf);
func_f_f("rintf", xrintf);
func_f_f("frfrexpf", xfrfrexpf);
func_f_f_f("hypotf_u05", xhypotf_u05);
func_f_f_f("hypotf_u35", xhypotf_u35);
func_f_f_f("copysignf", xcopysignf);
func_f_f_f("fmaxf", xfmaxf);
func_f_f_f("fminf", xfminf);
func_f_f_f("fdimf", xfdimf);
func_f_f_f("nextafterf", xnextafterf);
func_f_f_f("fmodf", xfmodf);
func_f_f_f("remainderf", xremainderf);
func_f2_f("modff", xmodff);
func_f_f("tgammaf_u1", xtgammaf_u1);
func_f_f("lgammaf_u1", xlgammaf_u1);
func_f_f("erff_u1", xerff_u1);
func_f_f("erfcf_u15", xerfcf_u15);
func_f_f("fastsinf_u3500", xfastsinf_u3500);
func_f_f("fastcosf_u3500", xfastcosf_u3500);
func_f_f_f("fastpowf_u3500", xfastpowf_u3500);
}
return 0;
}


@@ -0,0 +1,859 @@
// Copyright Naoki Shibata and contributors 2010 - 2023.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <inttypes.h>
#include <assert.h>
#include <math.h>
#if defined(_MSC_VER)
#define STDIN_FILENO 0
#else
#include <unistd.h>
#include <sys/types.h>
#endif
#include "quaddef.h"
#include "misc.h"
#if !defined(USE_INLINE_HEADER)
#include "sleef.h"
#else // #if !defined(USE_INLINE_HEADER)
#include <stddef.h>
#include <stdint.h>
#include <float.h>
#include <limits.h>
#if defined(__AVX2__) || defined(__aarch64__) || defined(__arm__) || defined(__powerpc64__)
#ifndef FP_FAST_FMA
#define FP_FAST_FMA
#endif
#endif
#if defined(_MSC_VER) && !defined(__STDC__)
#define __STDC__ 1
#endif
#if (defined(__GNUC__) || defined(__CLANG__)) && (defined(__i386__) || defined(__x86_64__))
#include <x86intrin.h>
#endif
#if (defined(_MSC_VER))
#include <intrin.h>
#endif
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#endif
#if defined(__riscv) && defined(__riscv_v)
#include <riscv_vector.h>
#endif
#if defined(__VSX__)
#include <altivec.h>
#endif
#if defined(__VX__)
#include <vecintrin.h>
#endif
#define SLEEF_ALWAYS_INLINE inline
#define SLEEF_INLINE
#define SLEEF_CONST
#include USE_INLINE_HEADER
#include MACRO_ONLY_HEADER
#ifndef ENABLE_PUREC_SCALAR
#include "sleefinline_purec_scalar.h"
#endif
#endif // #if !defined(USE_INLINE_HEADER)
#include "testerutil.h"
#define DORENAME
#ifdef ENABLE_SSE2
#include "renamesse2.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 2
#include "helpersse2.h"
typedef Sleef___m128d_2 vdouble2;
typedef Sleef___m128_2 vfloat2;
#endif
#endif
#ifdef ENABLE_SSE4
#include "renamesse4.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 4
#include "helpersse2.h"
typedef Sleef___m128d_2 vdouble2;
typedef Sleef___m128_2 vfloat2;
#endif
#endif
#ifdef ENABLE_AVX
#include "renameavx.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 1
#include "helperavx.h"
typedef Sleef___m256d_2 vdouble2;
typedef Sleef___m256_2 vfloat2;
#endif
#endif
#ifdef ENABLE_FMA4
#include "renamefma4.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 4
#include "helperavx.h"
typedef Sleef___m256d_2 vdouble2;
typedef Sleef___m256_2 vfloat2;
#endif
#endif
#ifdef ENABLE_AVX2
#include "renameavx2.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 1
#include "helperavx2.h"
typedef Sleef___m256d_2 vdouble2;
typedef Sleef___m256_2 vfloat2;
#endif
#endif
#ifdef ENABLE_AVX2128
#include "renameavx2128.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 1
#include "helperavx2_128.h"
typedef Sleef___m128d_2 vdouble2;
typedef Sleef___m128_2 vfloat2;
#endif
#endif
#ifdef ENABLE_AVX512F
#include "renameavx512f.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 1
#include "helperavx512f.h"
typedef Sleef___m512d_2 vdouble2;
typedef Sleef___m512_2 vfloat2;
#endif
#endif
#ifdef ENABLE_AVX512FNOFMA
#include "renameavx512fnofma.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 2
#include "helperavx512f.h"
typedef Sleef___m512d_2 vdouble2;
typedef Sleef___m512_2 vfloat2;
#endif
#endif
#ifdef ENABLE_VECEXT
#define CONFIG 1
#include "helpervecext.h"
#include "norename.h"
#endif
#ifdef ENABLE_PUREC
#define CONFIG 1
#include "helperpurec.h"
#include "norename.h"
#endif
#ifdef ENABLE_NEON32
#include "renameneon32.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 1
#include "helperneon32.h"
typedef Sleef_float32x4_t_2 vfloat2;
#endif
#endif
#ifdef ENABLE_NEON32VFPV4
#include "renameneon32vfpv4.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 4
#include "helperneon32.h"
typedef Sleef_float32x4_t_2 vfloat2;
#endif
#endif
#ifdef ENABLE_ADVSIMD
#include "renameadvsimd.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 1
#include "helperadvsimd.h"
typedef Sleef_float64x2_t_2 vdouble2;
typedef Sleef_float32x4_t_2 vfloat2;
#endif
#endif
#ifdef ENABLE_ADVSIMDNOFMA
#include "renameadvsimdnofma.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 2
#include "helperadvsimd.h"
typedef Sleef_float64x2_t_2 vdouble2;
typedef Sleef_float32x4_t_2 vfloat2;
#endif
#endif
#ifdef ENABLE_DSP128
#define CONFIG 2
#include "helpersse2.h"
#include "renamedsp128.h"
typedef Sleef___m128d_2 vdouble2;
typedef Sleef___m128_2 vfloat2;
#endif
#ifdef ENABLE_SVE
#include "renamesve.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 1
#include "helpersve.h"
#endif
#endif
#ifdef ENABLE_SVENOFMA
#include "renamesvenofma.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 2
#include "helpersve.h"
#endif
#endif
#ifdef ENABLE_DSP256
#define CONFIG 1
#include "helperavx.h"
#include "renamedsp256.h"
typedef Sleef___m256d_2 vdouble2;
typedef Sleef___m256_2 vfloat2;
#endif
#ifdef ENABLE_VSX
#include "renamevsx.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 1
#include "helperpower_128.h"
#include "renamevsx.h"
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
#endif
#endif
#ifdef ENABLE_VSXNOFMA
#include "renamevsxnofma.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 2
#include "helperpower_128.h"
#include "renamevsxnofma.h"
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
#endif
#endif
#ifdef ENABLE_VSX3
#include "renamevsx3.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 3
#include "helperpower_128.h"
#include "renamevsx3.h"
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
#endif
#endif
#ifdef ENABLE_VSX3NOFMA
#include "renamevsx3nofma.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 4
#include "helperpower_128.h"
#include "renamevsx3nofma.h"
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
#endif
#endif
#ifdef ENABLE_VXE
#include "renamevxe.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 140
#include "helpers390x_128.h"
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
#endif
#endif
#ifdef ENABLE_VXENOFMA
#include "renamevxenofma.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 141
#include "helpers390x_128.h"
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
#endif
#endif
#ifdef ENABLE_VXE2
#include "renamevxe2.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 150
#include "helpers390x_128.h"
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
#endif
#endif
#ifdef ENABLE_VXE2NOFMA
#include "renamevxe2nofma.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 151
#include "helpers390x_128.h"
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
#endif
#endif
#ifdef ENABLE_DSPPOWER_128
#define CONFIG 1
#include "helperpower_128.h"
#include "renamedsp128.h"
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
#endif
#ifdef ENABLE_DSPS390X_128
#define CONFIG 140
#include "helpers390x_128.h"
#include "renamedsp128.h"
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
#endif
#ifdef ENABLE_RVVM1
#include "renamervvm1.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 1
#include "helperrvv.h"
#endif
#endif
#ifdef ENABLE_RVVM1NOFMA
#include "renamervvm1nofma.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 2
#include "helperrvv.h"
#endif
#endif
#ifdef ENABLE_RVVM2
#include "renamervvm2.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 1
#include "helperrvv.h"
#endif
#endif
#ifdef ENABLE_RVVM2NOFMA
#include "renamervvm2nofma.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 2
#include "helperrvv.h"
#endif
#endif
#ifdef ENABLE_PUREC_SCALAR
#include "renamepurec_scalar.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 1
#include "helperpurec_scalar.h"
typedef Sleef_double_2 vdouble2;
typedef Sleef_float_2 vfloat2;
#endif
#endif
#ifdef ENABLE_PURECFMA_SCALAR
#include "renamepurecfma_scalar.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 2
#include "helperpurec_scalar.h"
typedef Sleef_double_2 vdouble2;
typedef Sleef_float_2 vfloat2;
#endif
#endif
#ifdef ENABLE_DSP_SCALAR
#include "renamedspscalar.h"
#define CONFIG 1
#include "helperpurec_scalar.h"
typedef Sleef_double_2 vdouble2;
typedef Sleef_float_2 vfloat2;
#endif
#ifdef USE_INLINE_HEADER
#define CONCAT_SIMD_SUFFIX_(keyword, suffix) keyword ## suffix
#define CONCAT_SIMD_SUFFIX(keyword, suffix) CONCAT_SIMD_SUFFIX_(keyword, suffix)
#define vmask CONCAT_SIMD_SUFFIX(vmask, SIMD_SUFFIX)
#define vopmask CONCAT_SIMD_SUFFIX(vopmask, SIMD_SUFFIX)
#define vdouble CONCAT_SIMD_SUFFIX(vdouble, SIMD_SUFFIX)
#define vint CONCAT_SIMD_SUFFIX(vint, SIMD_SUFFIX)
#define vfloat CONCAT_SIMD_SUFFIX(vfloat, SIMD_SUFFIX)
#define vint2 CONCAT_SIMD_SUFFIX(vint2, SIMD_SUFFIX)
#define vdouble2 CONCAT_SIMD_SUFFIX(vdouble2, SIMD_SUFFIX)
#define vfloat2 CONCAT_SIMD_SUFFIX(vfloat2, SIMD_SUFFIX)
#define vd2getx_vd_vd2 CONCAT_SIMD_SUFFIX(vd2getx_vd_vd2, SIMD_SUFFIX)
#define vd2gety_vd_vd2 CONCAT_SIMD_SUFFIX(vd2gety_vd_vd2, SIMD_SUFFIX)
#define vf2getx_vf_vf2 CONCAT_SIMD_SUFFIX(vf2getx_vf_vf2, SIMD_SUFFIX)
#define vf2gety_vf_vf2 CONCAT_SIMD_SUFFIX(vf2gety_vf_vf2, SIMD_SUFFIX)
#define vloadu_vd_p CONCAT_SIMD_SUFFIX(vloadu_vd_p, SIMD_SUFFIX)
#define vstoreu_v_p_vd CONCAT_SIMD_SUFFIX(vstoreu_v_p_vd, SIMD_SUFFIX)
#define vloadu_vf_p CONCAT_SIMD_SUFFIX(vloadu_vf_p, SIMD_SUFFIX)
#define vstoreu_v_p_vf CONCAT_SIMD_SUFFIX(vstoreu_v_p_vf, SIMD_SUFFIX)
#define vloadu_vi_p CONCAT_SIMD_SUFFIX(vloadu_vi_p, SIMD_SUFFIX)
#define vstoreu_v_p_vi CONCAT_SIMD_SUFFIX(vstoreu_v_p_vi, SIMD_SUFFIX)
#endif
//
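// check_feature issues one SLEEF vector call per enabled precision, so an
// unsupported SIMD extension is detected (typically via SIGILL) before the
// actual testing starts.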
int check_feature(double d, float f) {
#ifdef ENABLE_DP
{
double s[VECTLENDP];
int i;
for(i=0;i<VECTLENDP;i++) {
s[i] = d;
}
vdouble a = vloadu_vd_p(s);
a = xpow(a, a);
vstoreu_v_p_vd(s, a);
if (s[0] == s[0]) return 1;
}
#endif
#ifdef ENABLE_SP
{
float s[VECTLENSP];
int i;
for(i=0;i<VECTLENSP;i++) {
s[i] = d;
}
vfloat a = vloadu_vf_p(s);
a = xpowf(a, a);
vstoreu_v_p_vf(s, a);
if (s[0] == s[0]) return 1;
}
#endif
return 0;
}
#if defined(ENABLE_DP) && !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA) || defined(USE_INLINE_HEADER))
static vdouble vd2getx_vd_vd2(vdouble2 v) { return v.x; }
static vdouble vd2gety_vd_vd2(vdouble2 v) { return v.y; }
#endif
#if defined(ENABLE_SP) && !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA) || defined(USE_INLINE_HEADER))
static vfloat vf2getx_vf_vf2(vfloat2 v) { return v.x; }
static vfloat vf2gety_vf_vf2(vfloat2 v) { return v.y; }
#endif
//
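// Vector handlers place the operand in a random lane of an otherwise
// randomized vector, apply the function, and read the same lane back, so the
// neighbouring lanes are exercised as well.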
#define func_d_d(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint64_t u; \
sscanf(buf, funcStr " %" PRIx64, &u); \
double s[VECTLENDP]; \
memrand(s, sizeof(s)); \
int idx = xrand() & (VECTLENDP-1); \
s[idx] = u2d(u); \
vdouble a = vloadu_vd_p(s); \
a = funcName(a); \
vstoreu_v_p_vd(s, a); \
u = d2u(s[idx]); \
printf("%" PRIx64 "\n", u); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
#define func_d2_d(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint64_t u; \
sscanf(buf, funcStr " %" PRIx64, &u); \
double s[VECTLENDP], t[VECTLENDP]; \
memrand(s, sizeof(s)); \
memrand(t, sizeof(t)); \
int idx = xrand() & (VECTLENDP-1); \
s[idx] = u2d(u); \
vdouble2 v; \
vdouble a = vloadu_vd_p(s); \
v = funcName(a); \
vstoreu_v_p_vd(s, vd2getx_vd_vd2(v)); \
vstoreu_v_p_vd(t, vd2gety_vd_vd2(v)); \
Sleef_double2 d2; \
d2.x = s[idx]; \
d2.y = t[idx]; \
printf("%" PRIx64 " %" PRIx64 "\n", d2u(d2.x), d2u(d2.y)); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
#define func_d_d_d(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint64_t u, v; \
sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v); \
double s[VECTLENDP], t[VECTLENDP]; \
memrand(s, sizeof(s)); \
memrand(t, sizeof(t)); \
int idx = xrand() & (VECTLENDP-1); \
s[idx] = u2d(u); \
t[idx] = u2d(v); \
vdouble a, b; \
a = vloadu_vd_p(s); \
b = vloadu_vd_p(t); \
a = funcName(a, b); \
vstoreu_v_p_vd(s, a); \
u = d2u(s[idx]); \
printf("%" PRIx64 "\n", u); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
#define func_d_d_i(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint64_t u, v; \
sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v); \
double s[VECTLENDP]; \
int t[VECTLENDP*2]; \
memrand(s, sizeof(s)); \
memrand(t, sizeof(t)); \
int idx = xrand() & (VECTLENDP-1); \
s[idx] = u2d(u); \
t[idx] = (int)u2d(v); \
vstoreu_v_p_vd(s, funcName(vloadu_vd_p(s), vloadu_vi_p(t))); \
u = d2u(s[idx]); \
printf("%" PRIx64 "\n", u); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
#define func_i_d(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint64_t u; \
int i; \
sscanf(buf, funcStr " %" PRIx64, &u); \
double s[VECTLENDP]; \
int t[VECTLENDP*2]; \
memrand(s, sizeof(s)); \
memrand(t, sizeof(t)); \
int idx = xrand() & (VECTLENDP-1); \
s[idx] = u2d(u); \
vdouble a = vloadu_vd_p(s); \
vint vi = funcName(a); \
vstoreu_v_p_vi(t, vi); \
i = t[idx]; \
printf("%d\n", i); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
//
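// Single-precision vector handlers, same scheme with 32-bit hex values.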
#define func_f_f(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint32_t u; \
sscanf(buf, funcStr " %x", &u); \
float s[VECTLENSP]; \
memrand(s, sizeof(s)); \
int idx = xrand() & (VECTLENSP-1); \
s[idx] = u2f(u); \
vfloat a = vloadu_vf_p(s); \
a = funcName(a); \
vstoreu_v_p_vf(s, a); \
u = f2u(s[idx]); \
printf("%x\n", u); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
#define func_f2_f(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint32_t u; \
sscanf(buf, funcStr " %x", &u); \
float s[VECTLENSP], t[VECTLENSP]; \
memrand(s, sizeof(s)); \
memrand(t, sizeof(t)); \
int idx = xrand() & (VECTLENSP-1); \
s[idx] = u2f(u); \
vfloat2 v; \
vfloat a = vloadu_vf_p(s); \
v = funcName(a); \
vstoreu_v_p_vf(s, vf2getx_vf_vf2(v)); \
vstoreu_v_p_vf(t, vf2gety_vf_vf2(v)); \
Sleef_float2 d2; \
d2.x = s[idx]; \
d2.y = t[idx]; \
printf("%x %x\n", f2u(d2.x), f2u(d2.y)); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
#define func_f_f_f(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint32_t u, v; \
sscanf(buf, funcStr " %x %x", &u, &v); \
float s[VECTLENSP], t[VECTLENSP]; \
memrand(s, sizeof(s)); \
memrand(t, sizeof(t)); \
int idx = xrand() & (VECTLENSP-1); \
s[idx] = u2f(u); \
t[idx] = u2f(v); \
vfloat a, b; \
a = vloadu_vf_p(s); \
b = vloadu_vf_p(t); \
a = funcName(a, b); \
vstoreu_v_p_vf(s, a); \
u = f2u(s[idx]); \
printf("%x\n", u); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
//
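// main2 first reports a capability bitmask (1 = double precision, 2 = single
// precision, 4 = flush-to-zero, 8 = deterministic build) and then enters the
// request loop.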
#define BUFSIZE 1024
int main2(int argc, char **argv) {
xsrand(time(NULL));
{
int k = 0;
#ifdef ENABLE_DP
k += 1;
#endif
#ifdef ENABLE_SP
k += 2;
#endif
#if defined(ENABLE_NEON32) || defined(ENABLE_NEON32VFPV4)
k += 4; // flush to zero
#elif defined(ENABLE_VECEXT)
if (vcast_f_vf(xpowf(vcast_vf_f(0.5f), vcast_vf_f(140))) == 0) k += 4;
#endif
#if defined(DETERMINISTIC)
k += 8;
#endif
printf("%d\n", k);
fflush(stdout);
}
#if !defined(USE_INLINE_HEADER)
fprintf(stderr, "IUT : %s\n", (const char *)xgetPtrf(0));
#endif
fflush(stderr);
char buf[BUFSIZE];
if (fgets(buf, BUFSIZE-1, stdin)) {}
while(!feof(stdin)) {
#ifdef ENABLE_DP
func_d_d("sin", xsin);
func_d_d("cos", xcos);
func_d_d("tan", xtan);
func_d_d("asin", xasin);
func_d_d("acos", xacos);
func_d_d("atan", xatan);
func_d_d("log", xlog);
func_d_d("exp", xexp);
#ifndef DETERMINISTIC
func_d_d("sqrt", xsqrt);
func_d_d("sqrt_u05", xsqrt_u05);
func_d_d("sqrt_u35", xsqrt_u35);
#endif
func_d_d("cbrt", xcbrt);
func_d_d("cbrt_u1", xcbrt_u1);
func_d_d("sinh", xsinh);
func_d_d("cosh", xcosh);
func_d_d("tanh", xtanh);
func_d_d("sinh_u35", xsinh_u35);
func_d_d("cosh_u35", xcosh_u35);
func_d_d("tanh_u35", xtanh_u35);
func_d_d("asinh", xasinh);
func_d_d("acosh", xacosh);
func_d_d("atanh", xatanh);
func_d_d("sin_u1", xsin_u1);
func_d_d("cos_u1", xcos_u1);
func_d_d("tan_u1", xtan_u1);
func_d_d("sinpi_u05", xsinpi_u05);
func_d_d("cospi_u05", xcospi_u05);
func_d_d("asin_u1", xasin_u1);
func_d_d("acos_u1", xacos_u1);
func_d_d("atan_u1", xatan_u1);
func_d_d("log_u1", xlog_u1);
func_d_d("exp2", xexp2);
func_d_d("exp10", xexp10);
func_d_d("exp2_u35", xexp2_u35);
func_d_d("exp10_u35", xexp10_u35);
func_d_d("expm1", xexpm1);
func_d_d("log10", xlog10);
func_d_d("log2", xlog2);
func_d_d("log2_u35", xlog2_u35);
func_d_d("log1p", xlog1p);
func_d2_d("sincos", xsincos);
func_d2_d("sincos_u1", xsincos_u1);
func_d2_d("sincospi_u35", xsincospi_u35);
func_d2_d("sincospi_u05", xsincospi_u05);
func_d_d_d("pow", xpow);
func_d_d_d("atan2", xatan2);
func_d_d_d("atan2_u1", xatan2_u1);
func_d_d_i("ldexp", xldexp);
func_i_d("ilogb", xilogb);
func_d_d("fabs", xfabs);
func_d_d("trunc", xtrunc);
func_d_d("floor", xfloor);
func_d_d("ceil", xceil);
func_d_d("round", xround);
func_d_d("rint", xrint);
func_d_d("frfrexp", xfrfrexp);
func_i_d("expfrexp", xexpfrexp);
func_d_d_d("hypot_u05", xhypot_u05);
func_d_d_d("hypot_u35", xhypot_u35);
func_d_d_d("copysign", xcopysign);
func_d_d_d("fmax", xfmax);
func_d_d_d("fmin", xfmin);
func_d_d_d("fdim", xfdim);
func_d_d_d("nextafter", xnextafter);
func_d_d_d("fmod", xfmod);
func_d_d_d("remainder", xremainder);
func_d2_d("modf", xmodf);
func_d_d("tgamma_u1", xtgamma_u1);
func_d_d("lgamma_u1", xlgamma_u1);
func_d_d("erf_u1", xerf_u1);
func_d_d("erfc_u15", xerfc_u15);
#endif
#ifdef ENABLE_SP
func_f_f("sinf", xsinf);
func_f_f("cosf", xcosf);
func_f_f("tanf", xtanf);
func_f_f("asinf", xasinf);
func_f_f("acosf", xacosf);
func_f_f("atanf", xatanf);
func_f_f("logf", xlogf);
func_f_f("expf", xexpf);
#ifndef DETERMINISTIC
func_f_f("sqrtf", xsqrtf);
func_f_f("sqrtf_u05", xsqrtf_u05);
func_f_f("sqrtf_u35", xsqrtf_u35);
#endif
func_f_f("cbrtf", xcbrtf);
func_f_f("cbrtf_u1", xcbrtf_u1);
func_f_f("sinhf", xsinhf);
func_f_f("coshf", xcoshf);
func_f_f("tanhf", xtanhf);
func_f_f("sinhf_u35", xsinhf_u35);
func_f_f("coshf_u35", xcoshf_u35);
func_f_f("tanhf_u35", xtanhf_u35);
func_f_f("asinhf", xasinhf);
func_f_f("acoshf", xacoshf);
func_f_f("atanhf", xatanhf);
func_f_f("sinf_u1", xsinf_u1);
func_f_f("cosf_u1", xcosf_u1);
func_f_f("tanf_u1", xtanf_u1);
func_f_f("sinpif_u05", xsinpif_u05);
func_f_f("cospif_u05", xcospif_u05);
func_f_f("asinf_u1", xasinf_u1);
func_f_f("acosf_u1", xacosf_u1);
func_f_f("atanf_u1", xatanf_u1);
func_f_f("logf_u1", xlogf_u1);
func_f_f("exp2f", xexp2f);
func_f_f("exp10f", xexp10f);
func_f_f("exp2f_u35", xexp2f_u35);
func_f_f("exp10f_u35", xexp10f_u35);
func_f_f("expm1f", xexpm1f);
func_f_f("log10f", xlog10f);
func_f_f("log2f", xlog2f);
func_f_f("log2f_u35", xlog2f_u35);
func_f_f("log1pf", xlog1pf);
func_f2_f("sincosf", xsincosf);
func_f2_f("sincosf_u1", xsincosf_u1);
func_f2_f("sincospif_u35", xsincospif_u35);
func_f2_f("sincospif_u05", xsincospif_u05);
func_f_f_f("powf", xpowf);
func_f_f_f("atan2f", xatan2f);
func_f_f_f("atan2f_u1", xatan2f_u1);
func_f_f("fabsf", xfabsf);
func_f_f("truncf", xtruncf);
func_f_f("floorf", xfloorf);
func_f_f("ceilf", xceilf);
func_f_f("roundf", xroundf);
func_f_f("rintf", xrintf);
func_f_f("frfrexpf", xfrfrexpf);
func_f_f_f("hypotf_u05", xhypotf_u05);
func_f_f_f("hypotf_u35", xhypotf_u35);
func_f_f_f("copysignf", xcopysignf);
func_f_f_f("fmaxf", xfmaxf);
func_f_f_f("fminf", xfminf);
func_f_f_f("fdimf", xfdimf);
func_f_f_f("nextafterf", xnextafterf);
func_f_f_f("fmodf", xfmodf);
func_f_f_f("remainderf", xremainderf);
func_f2_f("modff", xmodff);
func_f_f("tgammaf_u1", xtgammaf_u1);
func_f_f("lgammaf_u1", xlgammaf_u1);
func_f_f("erff_u1", xerff_u1);
func_f_f("erfcf_u15", xerfcf_u15);
func_f_f("fastsinf_u3500", xfastsinf_u3500);
func_f_f("fastcosf_u3500", xfastcosf_u3500);
func_f_f_f("fastpowf_u3500", xfastpowf_u3500);
#endif
}
return 0;
}


@@ -0,0 +1,92 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <x86intrin.h>
#include <sleef.h>
#define N 64
#define M 256
double r0[N], a0[N], a1[N], a2[N];
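// Each do_* helper evaluates sin through one x86 SIMD extension;
// do_test_once accepts the input if any SLEEF result matches libm exactly.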
void do_libm() { for(int i=0;i<N;i++) r0[i] = sin(a0[i]); }
#if defined(__SSE2__)
void do_sleef_sse2() { _mm_storeu_pd(r0, Sleef_sind2_u10sse2(_mm_loadu_pd(a0))); }
#endif
#if defined(__AVX__)
void do_sleef_avx() { _mm256_storeu_pd(r0, Sleef_sind4_u10avx(_mm256_loadu_pd(a0))); }
#endif
#if defined(__AVX2__)
void do_sleef_avx2() { _mm256_storeu_pd(r0, Sleef_sind4_u10avx2(_mm256_loadu_pd(a0))); }
#endif
#if defined(__AVX512F__)
void do_sleef_avx512f() { _mm512_storeu_pd(r0, Sleef_sind8_u10avx512f(_mm512_loadu_pd(a0))); }
#endif
int do_test_once(double d) {
for(int i=0;i<N;i++) a0[i] = d;
do_libm();
double rm = r0[0];
#if defined(__SSE2__)
for(int i=0;i<N;i++) a0[i] = d;
do_sleef_sse2();
if (rm == r0[0]) return 1;
#endif
#if defined(__AVX__)
for(int i=0;i<N;i++) a0[i] = d;
do_sleef_avx();
if (rm == r0[0]) return 1;
#endif
#if defined(__AVX2__)
for(int i=0;i<N;i++) a0[i] = d;
do_sleef_avx2();
if (rm == r0[0]) return 1;
#endif
#if defined(__AVX512F__)
for(int i=0;i<N;i++) a0[i] = d;
do_sleef_avx512f();
if (rm == r0[0]) return 1;
#endif
return 0;
}
int check_feature(double d, float f) {
#if defined(__SSE2__)
do_sleef_sse2();
#endif
#if defined(__AVX__)
do_sleef_avx();
#endif
#if defined(__AVX2__)
do_sleef_avx2();
#endif
#if defined(__AVX512F__)
do_sleef_avx512f();
#endif
return 1;
}
int main2(int argc, char **argv) {
for(int i=0;i<M;i++) {
if (!do_test_once(10.0 * ((2.0 * rand() / RAND_MAX) - 1))) {
printf("fail\n");
exit(-1);
}
}
printf("pass\n");
exit(0);
}

File diff suppressed because it is too large


@@ -0,0 +1,991 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <mpfr.h>
#include <time.h>
#include <float.h>
#include <limits.h>
#include <math.h>
#ifdef ENABLE_SYS_getrandom
#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/random.h>
#endif
#include "sleef.h"
#include "testerutil.h"
#define DORENAME
#include "rename.h"
#define DENORMAL_DBL_MIN (4.9406564584124654418e-324)
#define POSITIVE_INFINITY INFINITY
#define NEGATIVE_INFINITY (-INFINITY)
typedef union {
double d;
uint64_t u64;
int64_t i64;
} conv_t;
double nexttoward0(double x, int n) {
union {
double f;
uint64_t u;
} cx;
cx.f = x;
cx.u -= n;
return cx.f;
}
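// rnd() returns raw 64-bit patterns reinterpreted as doubles, biased toward
// values just around +-0 and +-infinity to stress edge cases.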
double rnd() {
conv_t c;
switch(random() & 63) {
case 0: return nexttoward0( 0.0, -(random() & ((1 << (random() & 31)) - 1)));
case 1: return nexttoward0(-0.0, -(random() & ((1 << (random() & 31)) - 1)));
case 2: return nexttoward0( INFINITY, (random() & ((1 << (random() & 31)) - 1)));
case 3: return nexttoward0(-INFINITY, (random() & ((1 << (random() & 31)) - 1)));
}
#ifdef ENABLE_SYS_getrandom
syscall(SYS_getrandom, &c.u64, sizeof(c.u64), 0);
#else
c.u64 = random() | ((uint64_t)random() << 31) | ((uint64_t)random() << 62);
#endif
return c.d;
}
double rnd_fr() {
conv_t c;
do {
#ifdef ENABLE_SYS_getrandom
syscall(SYS_getrandom, &c.u64, sizeof(c.u64), 0);
#else
c.u64 = random() | ((uint64_t)random() << 31) | ((uint64_t)random() << 62);
#endif
} while(!isnumber(c.d));
return c.d;
}
double rnd_zo() {
conv_t c;
do {
#ifdef ENABLE_SYS_getrandom
syscall(SYS_getrandom, &c.u64, sizeof(c.u64), 0);
#else
c.u64 = random() | ((uint64_t)random() << 31) | ((uint64_t)random() << 62);
#endif
} while(!isnumber(c.d) || c.d < -1 || 1 < c.d);
return c.d;
}
int main(int argc,char **argv)
{
mpfr_t frw, frx, fry, frz;
mpfr_set_default_prec(1280);
mpfr_inits(frw, frx, fry, frz, NULL);
conv_t cd;
double d, t;
double d2, d3, zo;
int cnt, ecnt = 0;
srandom(time(NULL));
for(cnt = 0;ecnt < 1000;cnt++) {
switch(cnt & 7) {
case 0:
d = rnd();
d2 = rnd();
d3 = rnd();
zo = rnd();
break;
case 1:
cd.d = rint(rnd_zo() * 1e+10) * M_PI_4;
cd.i64 += (random() & 0xff) - 0x7f;
d = cd.d;
d2 = rnd();
d3 = rnd();
zo = rnd();
break;
case 2:
cd.d = rnd_fr() * M_PI_4;
cd.i64 += (random() & 0xf) - 0x7;
d = cd.d;
d2 = rnd();
d3 = rnd();
zo = rnd();
break;
default:
d = rnd_fr();
d2 = rnd_fr();
d3 = rnd_fr();
zo = rnd_zo();
break;
}
Sleef_double2 sc = xsincospi_u05(d);
Sleef_double2 sc2 = xsincospi_u35(d);
{
const double rangemax2 = 1e+9/4;
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_sinpi(frx, frx, GMP_RNDN);
double u0 = countULP2dp(t = sc.x, frx);
if (u0 != 0 && ((fabs(d) <= rangemax2 && u0 > 0.506) || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C sincospi_u05 sin arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
double u1 = countULP2dp(t = sc2.x, frx);
if (u1 != 0 && ((fabs(d) <= rangemax2 && u1 > 1.5) || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C sincospi_u35 sin arg=%.20g ulp=%.20g\n", d, u1);
fflush(stdout); ecnt++;
}
double u2 = countULP2dp(t = xsinpi_u05(d), frx);
if (u2 != 0 && ((fabs(d) <= rangemax2 && u2 > 0.506) || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C sinpi_u05 arg=%.20g ulp=%.20g\n", d, u2);
fflush(stdout); ecnt++;
}
}
{
const double rangemax2 = 1e+9/4;
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_cospi(frx, frx, GMP_RNDN);
double u0 = countULP2dp(t = sc.y, frx);
if (u0 != 0 && ((fabs(d) <= rangemax2 && u0 > 0.506) || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C sincospi_u05 cos arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
double u1 = countULP2dp(t = sc2.y, frx);
if (u1 != 0 && ((fabs(d) <= rangemax2 && u1 > 1.5) || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C sincospi_u35 cos arg=%.20g ulp=%.20g\n", d, u1);
fflush(stdout); ecnt++;
}
double u2 = countULP2dp(t = xcospi_u05(d), frx);
if (u2 != 0 && ((fabs(d) <= rangemax2 && u2 > 0.506) || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C cospi_u05 arg=%.20g ulp=%.20g\n", d, u2);
fflush(stdout); ecnt++;
}
}
sc = xsincos(d);
sc2 = xsincos_u1(d);
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_sin(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xsin(d), frx);
if (u0 != 0 && (u0 > 3.5 || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C sin arg=%.20g ulp=%.20g\n", d, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
double u1 = countULPdp(t = sc.x, frx);
if (u1 != 0 && (u1 > 3.5 || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C sincos sin arg=%.20g ulp=%.20g\n", d, u1);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
double u2 = countULPdp(t = xsin_u1(d), frx);
if (u2 != 0 && (u2 > 1 || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C sin_u1 arg=%.20g ulp=%.20g\n", d, u2);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
double u3 = countULPdp(t = sc2.x, frx);
if (u3 != 0 && (u3 > 1 || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C sincos_u1 sin arg=%.20g ulp=%.20g\n", d, u3);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_cos(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xcos(d), frx);
if (u0 != 0 && (u0 > 3.5 || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C cos arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
double u1 = countULPdp(t = sc.y, frx);
if (u1 != 0 && (u1 > 3.5 || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C sincos cos arg=%.20g ulp=%.20g\n", d, u1);
fflush(stdout); ecnt++;
}
double u2 = countULPdp(t = xcos_u1(d), frx);
if (u2 != 0 && (u2 > 1 || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C cos_u1 arg=%.20g ulp=%.20g\n", d, u2);
fflush(stdout); ecnt++;
}
double u3 = countULPdp(t = sc2.y, frx);
if (u3 != 0 && (u3 > 1 || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C sincos_u1 cos arg=%.20g ulp=%.20g\n", d, u3);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_tan(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xtan(d), frx);
if (u0 != 0 && (u0 > 3.5 || isnan(t))) {
printf("Pure C tan arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
double u1 = countULPdp(t = xtan_u1(d), frx);
if (u1 != 0 && (u1 > 1 || isnan(t))) {
printf("Pure C tan_u1 arg=%.20g ulp=%.20g\n", d, u1);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, fabs(d), GMP_RNDN);
mpfr_log(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xlog(fabs(d)), frx);
if (u0 > 3.5) {
printf("Pure C log arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
double u1 = countULPdp(t = xlog_u1(fabs(d)), frx);
if (u1 > 1) {
printf("Pure C log_u1 arg=%.20g ulp=%.20g\n", d, u1);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, fabs(d), GMP_RNDN);
mpfr_log10(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xlog10(fabs(d)), frx);
if (u0 > 1) {
printf("Pure C log10 arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, fabs(d), GMP_RNDN);
mpfr_log2(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xlog2(fabs(d)), frx);
if (u0 > 1) {
printf("Pure C log2 arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
double u1 = countULPdp(t = xlog2_u35(fabs(d)), frx);
if (u1 > 3.5) {
printf("Pure C log2_u35 arg=%.20g ulp=%.20g\n", d, u1);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_log1p(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xlog1p(d), frx);
if ((-1 <= d && d <= 1e+307 && u0 > 1) ||
(d < -1 && !isnan(t)) ||
(d > 1e+307 && !(u0 <= 1 || isinf(t)))) {
printf("Pure C log1p arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_exp(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xexp(d), frx);
if (u0 > 1) {
printf("Pure C exp arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_exp2(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xexp2(d), frx);
if (u0 > 1) {
printf("Pure C exp2 arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
double u1 = countULPdp(t = xexp2_u35(d), frx);
if (u1 > 3.5) {
printf("Pure C exp2_u35 arg=%.20g ulp=%.20g\n", d, u1);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_exp10(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xexp10(d), frx);
if (u0 > 1.09) {
printf("Pure C exp10 arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
double u1 = countULPdp(t = xexp10_u35(d), frx);
if (u1 > 3.5) {
printf("Pure C exp10_u35 arg=%.20g ulp=%.20g\n", d, u1);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_expm1(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xexpm1(d), frx);
if (u0 > 1) {
printf("Pure C expm1 arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_set_d(fry, d2, GMP_RNDN);
mpfr_pow(frx, fry, frx, GMP_RNDN);
double u0 = countULPdp(t = xpow(d2, d), frx);
if (u0 > 1) {
printf("Pure C pow arg=%.20g, %.20g ulp=%.20g\n", d2, d, u0);
printf("correct = %g, test = %g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_cbrt(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xcbrt(d), frx);
if (u0 > 3.5) {
printf("Pure C cbrt arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
double u1 = countULPdp(t = xcbrt_u1(d), frx);
if (u1 > 1) {
printf("Pure C cbrt_u1 arg=%.20g ulp=%.20g\n", d, u1);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, zo, GMP_RNDN);
mpfr_asin(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xasin(zo), frx);
if (u0 > 3.5) {
printf("Pure C asin arg=%.20g ulp=%.20g\n", zo, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
double u1 = countULPdp(t = xasin_u1(zo), frx);
if (u1 > 1) {
printf("Pure C asin_u1 arg=%.20g ulp=%.20g\n", zo, u1);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, zo, GMP_RNDN);
mpfr_acos(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xacos(zo), frx);
if (u0 > 3.5) {
printf("Pure C acos arg=%.20g ulp=%.20g\n", zo, u0);
fflush(stdout); ecnt++;
}
double u1 = countULPdp(t = xacos_u1(zo), frx);
if (u1 > 1) {
printf("Pure C acos_u1 arg=%.20g ulp=%.20g\n", zo, u1);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_atan(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xatan(d), frx);
if (u0 > 3.5) {
printf("Pure C atan arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
double u1 = countULPdp(t = xatan_u1(d), frx);
if (u1 > 1) {
printf("Pure C atan_u1 arg=%.20g ulp=%.20g\n", d, u1);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_set_d(fry, d2, GMP_RNDN);
mpfr_atan2(frx, fry, frx, GMP_RNDN);
double u0 = countULPdp(t = xatan2(d2, d), frx);
if (u0 > 3.5) {
printf("Pure C atan2 arg=%.20g, %.20g ulp=%.20g\n", d2, d, u0);
fflush(stdout); ecnt++;
}
double u1 = countULP2dp(t = xatan2_u1(d2, d), frx);
if (u1 > 1) {
printf("Pure C atan2_u1 arg=%.20g, %.20g ulp=%.20g\n", d2, d, u1);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_sinh(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xsinh(d), frx);
if ((fabs(d) <= 709 && u0 > 1) ||
(d > 709 && !(u0 <= 1 || (isinf(t) && t > 0))) ||
(d < -709 && !(u0 <= 1 || (isinf(t) && t < 0)))) {
printf("Pure C sinh arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_cosh(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xcosh(d), frx);
if ((fabs(d) <= 709 && u0 > 1) || !(u0 <= 1 || (isinf(t) && t > 0))) {
printf("Pure C cosh arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_tanh(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xtanh(d), frx);
if (u0 > 1) {
printf("Pure C tanh arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_sinh(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xsinh_u35(d), frx);
if ((fabs(d) <= 709 && u0 > 3.5) ||
(d > 709 && !(u0 <= 3.5 || (isinf(t) && t > 0))) ||
(d < -709 && !(u0 <= 3.5 || (isinf(t) && t < 0)))) {
printf("Pure C sinh_u35 arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_cosh(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xcosh_u35(d), frx);
if ((fabs(d) <= 709 && u0 > 3.5) || !(u0 <= 3.5 || (isinf(t) && t > 0))) {
printf("Pure C cosh_u35 arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_tanh(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xtanh_u35(d), frx);
if (u0 > 3.5) {
printf("Pure C tanh_u35 arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_asinh(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xasinh(d), frx);
if ((fabs(d) < sqrt(DBL_MAX) && u0 > 1) ||
(d >= sqrt(DBL_MAX) && !(u0 <= 1 || (isinf(t) && t > 0))) ||
(d <= -sqrt(DBL_MAX) && !(u0 <= 1 || (isinf(t) && t < 0)))) {
printf("Pure C asinh arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_acosh(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xacosh(d), frx);
if ((fabs(d) < sqrt(DBL_MAX) && u0 > 1) ||
(d >= sqrt(DBL_MAX) && !(u0 <= 1 || (isinf(t) && t > 0))) ||
(d <= -sqrt(DBL_MAX) && !isnan(t))) {
printf("Pure C acosh arg=%.20g ulp=%.20g\n", d, u0);
printf("%.20g\n", t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_atanh(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xatanh(d), frx);
if (u0 > 1) {
printf("Pure C atanh arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
}
//
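// Simple and correctly rounded functions follow: most must match the MPFR
// reference exactly or to within 0.5 ULP.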
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_abs(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xfabs(d), frx);
if (u0 != 0) {
printf("Pure C fabs arg=%.20g ulp=%.20g\n", d, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_set_d(fry, d2, GMP_RNDN);
mpfr_copysign(frx, frx, fry, GMP_RNDN);
double u0 = countULPdp(t = xcopysign(d, d2), frx);
if (u0 != 0 && !isnan(d2)) {
printf("Pure C copysign arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
printf("correct = %g, test = %g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_set_d(fry, d2, GMP_RNDN);
mpfr_max(frx, frx, fry, GMP_RNDN);
double u0 = countULPdp(t = xfmax(d, d2), frx);
if (u0 != 0) {
printf("Pure C fmax arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_set_d(fry, d2, GMP_RNDN);
mpfr_min(frx, frx, fry, GMP_RNDN);
double u0 = countULPdp(t = xfmin(d, d2), frx);
if (u0 != 0) {
printf("Pure C fmin arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_set_d(fry, d2, GMP_RNDN);
mpfr_dim(frx, frx, fry, GMP_RNDN);
double u0 = countULPdp(t = xfdim(d, d2), frx);
if (u0 > 0.5) {
printf("Pure C fdim arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_trunc(frx, frx);
double u0 = countULPdp(t = xtrunc(d), frx);
if (u0 != 0) {
printf("Pure C trunc arg=%.20g ulp=%.20g\n", d, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_floor(frx, frx);
double u0 = countULPdp(t = xfloor(d), frx);
if (u0 != 0) {
printf("Pure C floor arg=%.20g ulp=%.20g\n", d, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_ceil(frx, frx);
double u0 = countULPdp(t = xceil(d), frx);
if (u0 != 0) {
printf("Pure C ceil arg=%.20g ulp=%.20g\n", d, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_round(frx, frx);
double u0 = countULPdp(t = xround(d), frx);
if (u0 != 0) {
printf("Pure C round arg=%.24g ulp=%.20g\n", d, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_rint(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xrint(d), frx);
if (u0 != 0) {
printf("Pure C rint arg=%.24g ulp=%.20g\n", d, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_set_d(fry, d2, GMP_RNDN);
mpfr_set_d(frz, d3, GMP_RNDN);
mpfr_fma(frx, frx, fry, frz, GMP_RNDN);
double u0 = countULP2dp(t = xfma(d, d2, d3), frx);
double c = mpfr_get_d(frx, GMP_RNDN);
if ((-1e+303 < c && c < 1e+303 && u0 > 0.5) ||
!(u0 <= 0.5 || isinf(t))) {
printf("Pure C fma arg=%.20g, %.20g, %.20g ulp=%.20g\n", d, d2, d3, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_sqrt(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xsqrt_u05(d), frx);
if (u0 > 0.50001) {
printf("Pure C sqrt_u05 arg=%.20g ulp=%.20g\n", d, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_set_d(fry, d2, GMP_RNDN);
mpfr_hypot(frx, frx, fry, GMP_RNDN);
double u0 = countULP2dp(t = xhypot_u05(d, d2), frx);
if (u0 > 0.5) {
printf("Pure C hypot arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_set_d(fry, d2, GMP_RNDN);
mpfr_hypot(frx, frx, fry, GMP_RNDN);
double u0 = countULP2dp(t = xhypot_u35(d, d2), frx);
double c = mpfr_get_d(frx, GMP_RNDN);
if ((-1e+308 < c && c < 1e+308 && u0 > 3.5) ||
!(u0 <= 3.5 || isinf(t))) {
printf("Pure C hypot arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
t = xnextafter(d, d2);
double c = nextafter(d, d2);
if (!(isnan(t) && isnan(c)) && t != c) {
printf("Pure C nextafter arg=%.20g, %.20g\n", d, d2);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_set_exp(frx, 0);
double u0 = countULPdp(t = xfrfrexp(d), frx);
if (d != 0 && isnumber(d) && u0 != 0) {
printf("Pure C frfrexp arg=%.20g ulp=%.20g\n", d, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
int cexp = mpfr_get_exp(frx);
int texp = xexpfrexp(d);
if (d != 0 && isnumber(d) && cexp != texp) {
printf("Pure C expfrexp arg=%.20g\n", d);
printf("correct = %d, test = %d\n", cexp, texp);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_set_d(fry, d2, GMP_RNDN);
mpfr_fmod(frx, frx, fry, GMP_RNDN);
double u0 = countULPdp(t = xfmod(d, d2), frx);
if (fabsl((long double)d / d2) < 1e+300 && u0 > 0.5) {
printf("Pure C fmod arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_set_d(fry, d2, GMP_RNDN);
mpfr_remainder(frx, frx, fry, GMP_RNDN);
double u0 = countULPdp(t = xremainder(d, d2), frx);
if (fabsl((long double)d / d2) < 1e+300 && u0 > 0.5) {
printf("Pure C remainder arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
int exp = (random() & 8191) - 4096;
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_set_exp(frx, mpfr_get_exp(frx) + exp);
double u0 = countULPdp(t = xldexp(d, exp), frx);
if (u0 > 0.5) {
printf("Pure C ldexp arg=%.20g %d ulp=%.20g\n", d, exp, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_modf(fry, frz, frx, GMP_RNDN);
Sleef_double2 t2 = xmodf(d);
double u0 = countULPdp(t2.x, frz);
double u1 = countULPdp(t2.y, fry);
if (u0 != 0 || u1 != 0) {
printf("Pure C modf arg=%.20g ulp=%.20g %.20g\n", d, u0, u1);
printf("correct = %.20g, %.20g\n", mpfr_get_d(frz, GMP_RNDN), mpfr_get_d(fry, GMP_RNDN));
printf("test = %.20g, %.20g\n", t2.x, t2.y);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
int s;
mpfr_lgamma(frx, &s, frx, GMP_RNDN);
double u0 = countULPdp(t = xlgamma_u1(d), frx);
if (((d < 0 && fabsl(t - mpfr_get_ld(frx, GMP_RNDN)) > 1e-15 && u0 > 1) || (0 <= d && d < 2e+305 && u0 > 1) || (2e+305 <= d && !(u0 <= 1 || isinf(t))))) {
printf("Pure C xlgamma_u1 arg=%.20g ulp=%.20g\n", d, u0);
printf("Correct = %.20Lg, test = %.20g\n", mpfr_get_ld(frx, GMP_RNDN), t);
printf("Diff = %.20Lg\n", fabsl(t - mpfr_get_ld(frx, GMP_RNDN)));
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_gamma(frx, frx, GMP_RNDN);
double u0 = countULP2dp(t = xtgamma_u1(d), frx);
if (u0 > 1.0) {
printf("Pure C xtgamma_u1 arg=%.20g ulp=%.20g\n", d, u0);
printf("Correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
printf("Diff = %.20Lg\n", fabsl(t - mpfr_get_ld(frx, GMP_RNDN)));
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_erfc(frx, frx, GMP_RNDN);
static double ebz = 9.8813129168249308835e-324; // nextafter(nextafter(0, 1), 1);
double u0 = countULP2dp(t = xerfc_u15(d), frx);
if ((d > 26.2 && u0 > 2.5 && !(mpfr_get_d(frx, GMP_RNDN) == 0 && t <= ebz)) || (d <= 26.2 && u0 > 1.5)) {
printf("Pure C xerfc_u15 arg=%.20g ulp=%.20g\n", d, u0);
printf("Correct = %.20Lg, test = %.20g\n", mpfr_get_ld(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_erf(frx, frx, GMP_RNDN);
double u0 = countULP2dp(t = xerf_u1(d), frx);
if (u0 > 0.75) {
printf("Pure C xerf_u1 arg=%.20g ulp=%.20g\n", d, u0);
printf("Correct = %.20Lg, test = %.20g\n", mpfr_get_ld(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
}
mpfr_clears(frw, frx, fry, frz, NULL);
exit(0);
}


@@ -0,0 +1,241 @@
// Copyright Naoki Shibata and contributors 2010 - 2024.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <mpfr.h>
#include <time.h>
#include <float.h>
#include <limits.h>
#include <math.h>
#include "misc.h"
#ifdef ENABLE_SYS_getrandom
#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/random.h>
#endif
#include "sleef.h"
#include "testerutil.h"
#define DORENAME
#include "rename.h"
#define DENORMAL_LDBL_MIN (3.6451995318824746025284059336194e-4951L)
#define XLDBL_MIN (3.3621031431120935062626778173218e-4932L)
#ifndef M_PIl
#define M_PIl 3.141592653589793238462643383279502884L
#endif
#ifndef M_PI_4l
#define M_PI_4l .785398163397448309615660845819875721049292L
#endif
#define POSITIVE_INFINITY INFINITY
#define NEGATIVE_INFINITY (-INFINITY)
int isnumberl(long double x) { return x != SLEEF_INFINITYl && x != -SLEEF_INFINITYl && x == x; }
int isPlusZerol(long double x) { return x == 0 && copysignl(1, x) == 1; }
int isMinusZerol(long double x) { return x == 0 && copysignl(1, x) == -1; }
mpfr_t fra, frb, frd;
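
// Measure the error of test value d against the 256-bit MPFR reference c,
// in units in the last place (ulp) of the 64-bit extended significand.
// Sentinel values (10000, 10001) flag a wrong zero and a NaN mismatch, so
// such failures always exceed any tolerance used by the checks below.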
double countULP(long double d, mpfr_t c) {
  long double c2 = mpfr_get_ld(c, GMP_RNDN);
  if (c2 == 0 && d != 0) return 10000;
  //if (isPlusZerol(c2) && !isPlusZerol(d)) return 10003;
  //if (isMinusZerol(c2) && !isMinusZerol(d)) return 10004;
  if (isnanl(c2) && isnanl(d)) return 0;
  if (isnanl(c2) || isnanl(d)) return 10001;
  if (c2 == POSITIVE_INFINITY && d == POSITIVE_INFINITY) return 0;
  if (c2 == NEGATIVE_INFINITY && d == NEGATIVE_INFINITY) return 0;
  if (!isnumberl(c2) && !isnumberl(d)) return 0;

  int e;
  frexpl(mpfr_get_ld(c, GMP_RNDN), &e);
  // One ulp of the reference value is 2^(e-64) for a 64-bit significand,
  // clamped below at the smallest subnormal.
  mpfr_set_ld(frb, fmaxl(ldexpl(1.0, e-64), DENORMAL_LDBL_MIN), GMP_RNDN);

  // |d - c| / ulp, evaluated in 256-bit MPFR arithmetic.
  mpfr_set_ld(frd, d, GMP_RNDN);
  mpfr_sub(fra, frd, c, GMP_RNDN);
  mpfr_div(fra, fra, frb, GMP_RNDN);
  double u = fabs(mpfr_get_d(fra, GMP_RNDN));

  return u;
}
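
// Same as countULP, but the ulp unit is clamped at LDBL_MIN instead of the
// smallest subnormal, which loosens the tolerance for subnormal references.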
double countULP2(long double d, mpfr_t c) {
  long double c2 = mpfr_get_ld(c, GMP_RNDN);
  if (c2 == 0 && d != 0) return 10000;
  //if (isPlusZerol(c2) && !isPlusZerol(d)) return 10003;
  //if (isMinusZerol(c2) && !isMinusZerol(d)) return 10004;
  if (isnanl(c2) && isnanl(d)) return 0;
  if (isnanl(c2) || isnanl(d)) return 10001;
  if (c2 == POSITIVE_INFINITY && d == POSITIVE_INFINITY) return 0;
  if (c2 == NEGATIVE_INFINITY && d == NEGATIVE_INFINITY) return 0;
  if (!isnumberl(c2) && !isnumberl(d)) return 0;

  int e;
  frexpl(mpfr_get_ld(c, GMP_RNDN), &e);
  mpfr_set_ld(frb, fmaxl(ldexpl(1.0, e-64), LDBL_MIN), GMP_RNDN);

  mpfr_set_ld(frd, d, GMP_RNDN);
  mpfr_sub(fra, frd, c, GMP_RNDN);
  mpfr_div(fra, fra, frb, GMP_RNDN);
  double u = fabs(mpfr_get_d(fra, GMP_RNDN));

  return u;
}
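
// Random test inputs are drawn as raw 128-bit patterns reinterpreted as
// long double, so all exponent ranges, subnormals and special values are hit.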
typedef union {
  long double d;
  __int128 u128;
} conv_t;

long double rnd() {
  conv_t c;
  switch(random() & 15) {
  case 0: return INFINITY;
  case 1: return -INFINITY;
  }
#ifdef ENABLE_SYS_getrandom
  syscall(SYS_getrandom, &c.u128, sizeof(c.u128), 0);
#else
  c.u128 = random() | ((__int128)random() << 31) | ((__int128)random() << (31*2)) | ((__int128)random() << (31*3)) | ((__int128)random() << (31*4));
#endif
  return c.d;
}
long double rnd_fr() {
  conv_t c;
  do {
#ifdef ENABLE_SYS_getrandom
    syscall(SYS_getrandom, &c.u128, sizeof(c.u128), 0);
#else
    c.u128 = random() | ((__int128)random() << 31) | ((__int128)random() << (31*2)) | ((__int128)random() << (31*3)) | ((__int128)random() << (31*4));
#endif
  } while(!isnumberl(c.d));
  return c.d;
}

long double rnd_zo() {
  conv_t c;
  do {
#ifdef ENABLE_SYS_getrandom
    syscall(SYS_getrandom, &c.u128, sizeof(c.u128), 0);
#else
    c.u128 = random() | ((__int128)random() << 31) | ((__int128)random() << (31*2)) | ((__int128)random() << (31*3)) | ((__int128)random() << (31*4));
#endif
  } while(!isnumberl(c.d) || c.d < -1 || 1 < c.d);
  return c.d;
}
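
// Reference sin(pi * d) and cos(pi * d), computed in MPFR's working
// precision so the pi multiplication itself introduces no rounding error.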
void sinpifr(mpfr_t ret, long double d) {
  mpfr_t frpi, frd;
  mpfr_inits(frpi, frd, NULL);

  mpfr_const_pi(frpi, GMP_RNDN);
  mpfr_set_d(frd, 1.0, GMP_RNDN);
  mpfr_mul(frpi, frpi, frd, GMP_RNDN);  // no-op multiplication by 1.0, retained from the original code
  mpfr_set_ld(frd, d, GMP_RNDN);
  mpfr_mul(frd, frpi, frd, GMP_RNDN);
  mpfr_sin(ret, frd, GMP_RNDN);

  mpfr_clears(frpi, frd, NULL);
}

void cospifr(mpfr_t ret, long double d) {
  mpfr_t frpi, frd;
  mpfr_inits(frpi, frd, NULL);

  mpfr_const_pi(frpi, GMP_RNDN);
  mpfr_set_d(frd, 1.0, GMP_RNDN);
  mpfr_mul(frpi, frpi, frd, GMP_RNDN);
  mpfr_set_ld(frd, d, GMP_RNDN);
  mpfr_mul(frd, frpi, frd, GMP_RNDN);
  mpfr_cos(ret, frd, GMP_RNDN);

  mpfr_clears(frpi, frd, NULL);
}
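
// Fuzz driver: runs until 1000 discrepancies have been logged. Inputs mix
// fully random bit patterns with near-multiples of pi/4 whose bit patterns
// are perturbed by up to ~128 ulps, to stress the argument-reduction paths.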
int main(int argc, char **argv)
{
  mpfr_t frx;
  mpfr_set_default_prec(256);
  mpfr_inits(fra, frb, frd, frx, NULL);

  conv_t cd;
  long double d, t;
  int cnt, ecnt = 0;

  srandom(time(NULL));

  for(cnt = 0;ecnt < 1000;cnt++) {
    switch(cnt & 7) {
    case 0:
      d = rnd();
      break;
    case 1:
      cd.d = rint((2 * (double)random() / RAND_MAX - 1) * 1e+10) * M_PI_4;
      cd.u128 += (random() & 0xff) - 0x7f;
      d = cd.d;
      break;
    default:
      d = rnd_fr();
      break;
    }
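
    // Evaluate both accuracy variants once per input: in SLEEF's naming,
    // _u05 targets 0.5-ulp and _u35 targets 3.5-ulp maximum error; the
    // checks below bound them at 0.505 and 1.5 ulp for |d| <= 1e+9.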
    Sleef_longdouble2 sc = xsincospil_u05(d);
    Sleef_longdouble2 sc2 = xsincospil_u35(d);

    {
      const double rangemax2 = 1e+9;

      sinpifr(frx, d);

      double u0 = countULP2(t = sc.x, frx);

      if (u0 != 0 && ((fabsl(d) <= rangemax2 && u0 > 0.505) || fabsl(t) > 1 || !isnumberl(t))) {
        printf("Pure C sincospil_u05 sin arg=%.30Lg ulp=%.20g\n", d, u0);
        fflush(stdout); ecnt++;
      }

      double u1 = countULP2(t = sc2.x, frx);

      if (u1 != 0 && ((fabsl(d) <= rangemax2 && u1 > 1.5) || fabsl(t) > 1 || !isnumberl(t))) {
        printf("Pure C sincospil_u35 sin arg=%.30Lg ulp=%.20g\n", d, u1);
        fflush(stdout); ecnt++;
      }
    }
    {
      const double rangemax2 = 1e+9;

      cospifr(frx, d);

      double u0 = countULP2(t = sc.y, frx);

      if (u0 != 0 && ((fabsl(d) <= rangemax2 && u0 > 0.505) || fabsl(t) > 1 || !isnumberl(t))) {
        printf("Pure C sincospil_u05 cos arg=%.30Lg ulp=%.20g\n", d, u0);
        fflush(stdout); ecnt++;
      }

      double u1 = countULP2(t = sc2.y, frx);

      if (u1 != 0 && ((fabsl(d) <= rangemax2 && u1 > 1.5) || fabsl(t) > 1 || !isnumberl(t))) {
        printf("Pure C sincospil_u35 cos arg=%.30Lg ulp=%.20g\n", d, u1);
        fflush(stdout); ecnt++;
      }
    }
  }
}

Some files were not shown because too many files have changed in this diff.