8329816: Add SLEEF version 3.6.1

Reviewed-by: erikj, mli, luhenry
Author: Magnus Ihse Bursie
Date: 2024-09-17 12:58:36 +00:00
parent 80db6e71b0
commit b39e6a84ef
175 changed files with 120709 additions and 0 deletions


@@ -568,6 +568,10 @@ $(eval $(call SetupTarget, update-build-docs, \
MAKEFILE := UpdateBuildDocs, \
))
$(eval $(call SetupTarget, update-sleef-source, \
MAKEFILE := UpdateSleefSource, \
))
$(eval $(call SetupTarget, update-x11wrappers, \
MAKEFILE := UpdateX11Wrappers, \
DEPS := java.base-copy buildtools-jdk, \

make/UpdateSleefSource.gmk Normal file

@@ -0,0 +1,153 @@
#
# Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This code is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 2 only, as
# published by the Free Software Foundation. Oracle designates this
# particular file as subject to the "Classpath" exception as provided
# by Oracle in the LICENSE file that accompanied this code.
#
# This code is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# version 2 for more details (a copy is included in the LICENSE file that
# accompanied this code).
#
# You should have received a copy of the GNU General Public License version
# 2 along with this work; if not, write to the Free Software Foundation,
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
#
# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
# or visit www.oracle.com if you need additional information or have any
# questions.
#
################################################################################
default: all
include $(SPEC)
include MakeBase.gmk
include CopyFiles.gmk
include Execute.gmk
################################################################################
# This file is responsible for updating the generated sleef source code files
# that are checked in to the JDK repo, and that are actually used when building.
# This target needs to be re-run every time the source code of libsleef is
# updated from upstream.
################################################################################
ifneq ($(COMPILE_TYPE), cross)
$(error Only cross-compilation of libsleef is currently supported)
endif
ifeq ($(CMAKE), )
$(error CMake not found. Please install cmake and rerun configure)
endif
ifneq ($(OPENJDK_BUILD_OS), linux)
$(error This target is only supported on linux)
endif
SLEEF_SUPPORT_DIR := $(MAKESUPPORT_OUTPUTDIR)/sleef
SLEEF_SOURCE_BASE_DIR := $(TOPDIR)/src/jdk.incubator.vector/linux/native/libsleef
SLEEF_SOURCE_DIR := $(SLEEF_SOURCE_BASE_DIR)/upstream
SLEEF_TARGET_DIR := $(SLEEF_SOURCE_BASE_DIR)/generated
SLEEF_NATIVE_BUILD_DIR := $(SLEEF_SUPPORT_DIR)/native
SLEEF_CROSS_BUILD_DIR := $(SLEEF_SUPPORT_DIR)/cross
ifeq ($(OPENJDK_TARGET_CPU), aarch64)
CROSS_COMPILATION_FILENAMES := sleefinline_advsimd.h sleefinline_sve.h
EXTRA_CROSS_OPTIONS := -DSLEEF_ENFORCE_SVE=TRUE
else ifeq ($(OPENJDK_TARGET_CPU), riscv64)
CROSS_COMPILATION_FILENAMES := sleefinline_rvvm1.h
EXTRA_CROSS_OPTIONS := -DSLEEF_ENFORCE_RVVM1=TRUE
else
$(error Unsupported platform)
endif
CROSS_COMPILATION_SRC_FILES := $(addprefix $(SLEEF_CROSS_BUILD_DIR)/include/, \
$(CROSS_COMPILATION_FILENAMES))
ifeq ($(TOOLCHAIN_TYPE), clang)
SLEEF_TOOLCHAIN_TYPE := llvm
else
SLEEF_TOOLCHAIN_TYPE := $(TOOLCHAIN_TYPE)
endif
SLEEF_CMAKE_FILE := toolchains/$(OPENJDK_TARGET_CPU)-$(SLEEF_TOOLCHAIN_TYPE).cmake
# We need to run CMake in two steps: first to configure the build, and then
# to perform the actual build. This is done twice: once for a native build,
# and once for the cross-compilation build.
$(eval $(call SetupExecute, sleef_native_config, \
INFO := Configuring native sleef build, \
OUTPUT_DIR := $(SLEEF_NATIVE_BUILD_DIR), \
COMMAND := cd $(SLEEF_SOURCE_DIR) && $(CMAKE) -S . -B \
$(SLEEF_NATIVE_BUILD_DIR), \
))
TARGETS += $(sleef_native_config)
$(eval $(call SetupExecute, sleef_native_build, \
INFO := Building native sleef, \
DEPS := $(sleef_native_config), \
OUTPUT_DIR := $(SLEEF_NATIVE_BUILD_DIR), \
COMMAND := cd $(SLEEF_SOURCE_DIR) && $(CMAKE) --build \
$(SLEEF_NATIVE_BUILD_DIR) -j, \
))
TARGETS += $(sleef_native_build)
$(eval $(call SetupExecute, sleef_cross_config, \
INFO := Configuring cross-compiling sleef build, \
DEPS := $(sleef_native_build), \
OUTPUT_DIR := $(SLEEF_CROSS_BUILD_DIR), \
COMMAND := cd $(SLEEF_SOURCE_DIR) && $(CMAKE) -S . -B \
$(SLEEF_CROSS_BUILD_DIR) \
-DCMAKE_C_COMPILER=$(CC) \
-DCMAKE_TOOLCHAIN_FILE=$(SLEEF_CMAKE_FILE) \
-DNATIVE_BUILD_DIR=$(SLEEF_NATIVE_BUILD_DIR) \
-DSLEEF_BUILD_INLINE_HEADERS=TRUE \
$(EXTRA_CROSS_OPTIONS), \
))
TARGETS += $(sleef_cross_config)
$(eval $(call SetupExecute, sleef_cross_build, \
INFO := Building cross-compiling sleef, \
DEPS := $(sleef_cross_config), \
OUTPUT_DIR := $(SLEEF_CROSS_BUILD_DIR), \
COMMAND := cd $(SLEEF_SOURCE_DIR) && $(CMAKE) --build \
$(SLEEF_CROSS_BUILD_DIR) -j, \
))
TARGETS += $(sleef_cross_build)
$(CROSS_COMPILATION_SRC_FILES): $(sleef_cross_build)
# Finally, copy the generated files (and one needed static file) into our
# target directory.
$(eval $(call SetupCopyFiles, copy_static_sleef_source, \
FILES := $(SLEEF_SOURCE_DIR)/src/common/misc.h, \
DEST := $(SLEEF_TARGET_DIR), \
))
TARGETS += $(copy_static_sleef_source)
$(eval $(call SetupCopyFiles, copy_generated_sleef_source, \
FILES := $(CROSS_COMPILATION_SRC_FILES), \
DEST := $(SLEEF_TARGET_DIR), \
))
TARGETS += $(copy_generated_sleef_source)
################################################################################
all: $(TARGETS)
.PHONY: all default
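For orientation, the two-phase flow this makefile drives corresponds roughly to
the manual commands below. This is a sketch only: the cross-compiler name and
the build directories are illustrative, while the toolchain file and the `-D`
options match the aarch64/gcc case handled above.

```
# Phase 1: configure and build sleef for the build machine. This produces
# host-side helper tools that the cross build picks up via NATIVE_BUILD_DIR.
cmake -S upstream -B build/native
cmake --build build/native -j

# Phase 2: configure and build for the target, reusing the native tools and
# generating the inline headers.
cmake -S upstream -B build/cross \
    -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc \
    -DCMAKE_TOOLCHAIN_FILE=toolchains/aarch64-gcc.cmake \
    -DNATIVE_BUILD_DIR="$PWD/build/native" \
    -DSLEEF_BUILD_INLINE_HEADERS=TRUE \
    -DSLEEF_ENFORCE_SVE=TRUE
cmake --build build/cross -j

# The generated headers then appear under build/cross/include/.
```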


@@ -99,6 +99,7 @@ AC_DEFUN_ONCE([BASIC_SETUP_TOOLS],
UTIL_REQUIRE_SPECIAL(FGREP, [AC_PROG_FGREP])
# Optional tools, we can do without them
UTIL_LOOKUP_PROGS(CMAKE, cmake)
UTIL_LOOKUP_PROGS(DF, df)
UTIL_LOOKUP_PROGS(GIT, git)
UTIL_LOOKUP_PROGS(NICE, nice)


@@ -719,6 +719,7 @@ CCACHE := @CCACHE@
# CD is going away, but remains to cater for legacy makefiles.
CD := cd
CHMOD := @CHMOD@
CMAKE := @CMAKE@
CODESIGN := @CODESIGN@
CP := @CP@
CUT := @CUT@


@@ -0,0 +1,439 @@
## SLEEF v3.6.1
### Notice
```
Copyright © 2010-2024 SLEEF Project, Naoki Shibata and contributors
-------
src/arch/helpersve.h has the following copyright:
Copyright ARM Ltd. 2010 - 2024.
-------
src/gencoef/{dp.h, gencoef.c, ld.h, qp.h, simplexfr.c, sp.h} have no copyright notice but have the following license text:
// The code is distributed under the Creative Commons Attribution 4.0 International License.
// https://creativecommons.org/licenses/by/4.0/
Attribution 4.0 International
```
### LICENSE Boost v1.0
```
Boost Software License - Version 1.0 - August 17th, 2003
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
```
### LICENSE Creative Commons Attribution 4.0 International License
```
Creative Commons Corporation ("Creative Commons") is not a law firm and
does not provide legal services or legal advice. Distribution of
Creative Commons public licenses does not create a lawyer-client or
other relationship. Creative Commons makes its licenses and related
information available on an "as-is" basis. Creative Commons gives no
warranties regarding its licenses, any material licensed under their
terms and conditions, or any related information. Creative Commons
disclaims all liability for damages resulting from their use to the
fullest extent possible.
Using Creative Commons Public Licenses
Creative Commons public licenses provide a standard set of terms and
conditions that creators and other rights holders may use to share
original works of authorship and other material subject to copyright
and certain other rights specified in the public license below. The
following considerations are for informational purposes only, are not
exhaustive, and do not form part of our licenses.
Considerations for licensors: Our public licenses are
intended for use by those authorized to give the public
permission to use material in ways otherwise restricted by
copyright and certain other rights. Our licenses are
irrevocable. Licensors should read and understand the terms
and conditions of the license they choose before applying it.
Licensors should also secure all rights necessary before
applying our licenses so that the public can reuse the
material as expected. Licensors should clearly mark any
material not subject to the license. This includes other CC-
licensed material, or material used under an exception or
limitation to copyright. More considerations for licensors:
wiki.creativecommons.org/Considerations_for_licensors
Considerations for the public: By using one of our public
licenses, a licensor grants the public permission to use the
licensed material under specified terms and conditions. If
the licensor's permission is not necessary for any reason--for
example, because of any applicable exception or limitation to
copyright--then that use is not regulated by the license. Our
licenses grant only permissions under copyright and certain
other rights that a licensor has authority to grant. Use of
the licensed material may still be restricted for other
reasons, including because others have copyright or other
rights in the material. A licensor may make special requests,
such as asking that all changes be marked or described.
Although not required by our licenses, you are encouraged to
respect those requests where reasonable. More considerations
for the public:
wiki.creativecommons.org/Considerations_for_licensees
=======================================================================
Creative Commons Attribution 4.0 International Public License
By exercising the Licensed Rights (defined below), You accept and agree
to be bound by the terms and conditions of this Creative Commons
Attribution 4.0 International Public License ("Public License"). To the
extent this Public License may be interpreted as a contract, You are
granted the Licensed Rights in consideration of Your acceptance of
these terms and conditions, and the Licensor grants You such rights in
consideration of benefits the Licensor receives from making the
Licensed Material available under these terms and conditions.
Section 1 -- Definitions.
a. Adapted Material means material subject to Copyright and Similar
Rights that is derived from or based upon the Licensed Material
and in which the Licensed Material is translated, altered,
arranged, transformed, or otherwise modified in a manner requiring
permission under the Copyright and Similar Rights held by the
Licensor. For purposes of this Public License, where the Licensed
Material is a musical work, performance, or sound recording,
Adapted Material is always produced where the Licensed Material is
synched in timed relation with a moving image.
b. Adapter's License means the license You apply to Your Copyright
and Similar Rights in Your contributions to Adapted Material in
accordance with the terms and conditions of this Public License.
c. Copyright and Similar Rights means copyright and/or similar rights
closely related to copyright including, without limitation,
performance, broadcast, sound recording, and Sui Generis Database
Rights, without regard to how the rights are labeled or
categorized. For purposes of this Public License, the rights
specified in Section 2(b)(1)-(2) are not Copyright and Similar
Rights.
d. Effective Technological Measures means those measures that, in the
absence of proper authority, may not be circumvented under laws
fulfilling obligations under Article 11 of the WIPO Copyright
Treaty adopted on December 20, 1996, and/or similar international
agreements.
e. Exceptions and Limitations means fair use, fair dealing, and/or
any other exception or limitation to Copyright and Similar Rights
that applies to Your use of the Licensed Material.
f. Licensed Material means the artistic or literary work, database,
or other material to which the Licensor applied this Public
License.
g. Licensed Rights means the rights granted to You subject to the
terms and conditions of this Public License, which are limited to
all Copyright and Similar Rights that apply to Your use of the
Licensed Material and that the Licensor has authority to license.
h. Licensor means the individual(s) or entity(ies) granting rights
under this Public License.
i. Share means to provide material to the public by any means or
process that requires permission under the Licensed Rights, such
as reproduction, public display, public performance, distribution,
dissemination, communication, or importation, and to make material
available to the public including in ways that members of the
public may access the material from a place and at a time
individually chosen by them.
j. Sui Generis Database Rights means rights other than copyright
resulting from Directive 96/9/EC of the European Parliament and of
the Council of 11 March 1996 on the legal protection of databases,
as amended and/or succeeded, as well as other essentially
equivalent rights anywhere in the world.
k. You means the individual or entity exercising the Licensed Rights
under this Public License. Your has a corresponding meaning.
Section 2 -- Scope.
a. License grant.
1. Subject to the terms and conditions of this Public License,
the Licensor hereby grants You a worldwide, royalty-free,
non-sublicensable, non-exclusive, irrevocable license to
exercise the Licensed Rights in the Licensed Material to:
a. reproduce and Share the Licensed Material, in whole or
in part; and
b. produce, reproduce, and Share Adapted Material.
2. Exceptions and Limitations. For the avoidance of doubt, where
Exceptions and Limitations apply to Your use, this Public
License does not apply, and You do not need to comply with
its terms and conditions.
3. Term. The term of this Public License is specified in Section
6(a).
4. Media and formats; technical modifications allowed. The
Licensor authorizes You to exercise the Licensed Rights in
all media and formats whether now known or hereafter created,
and to make technical modifications necessary to do so. The
Licensor waives and/or agrees not to assert any right or
authority to forbid You from making technical modifications
necessary to exercise the Licensed Rights, including
technical modifications necessary to circumvent Effective
Technological Measures. For purposes of this Public License,
simply making modifications authorized by this Section 2(a)
(4) never produces Adapted Material.
5. Downstream recipients.
a. Offer from the Licensor -- Licensed Material. Every
recipient of the Licensed Material automatically
receives an offer from the Licensor to exercise the
Licensed Rights under the terms and conditions of this
Public License.
b. No downstream restrictions. You may not offer or impose
any additional or different terms or conditions on, or
apply any Effective Technological Measures to, the
Licensed Material if doing so restricts exercise of the
Licensed Rights by any recipient of the Licensed
Material.
6. No endorsement. Nothing in this Public License constitutes or
may be construed as permission to assert or imply that You
are, or that Your use of the Licensed Material is, connected
with, or sponsored, endorsed, or granted official status by,
the Licensor or others designated to receive attribution as
provided in Section 3(a)(1)(A)(i).
b. Other rights.
1. Moral rights, such as the right of integrity, are not
licensed under this Public License, nor are publicity,
privacy, and/or other similar personality rights; however, to
the extent possible, the Licensor waives and/or agrees not to
assert any such rights held by the Licensor to the limited
extent necessary to allow You to exercise the Licensed
Rights, but not otherwise.
2. Patent and trademark rights are not licensed under this
Public License.
3. To the extent possible, the Licensor waives any right to
collect royalties from You for the exercise of the Licensed
Rights, whether directly or through a collecting society
under any voluntary or waivable statutory or compulsory
licensing scheme. In all other cases the Licensor expressly
reserves any right to collect such royalties.
Section 3 -- License Conditions.
Your exercise of the Licensed Rights is expressly made subject to the
following conditions.
a. Attribution.
1. If You Share the Licensed Material (including in modified
form), You must:
a. retain the following if it is supplied by the Licensor
with the Licensed Material:
i. identification of the creator(s) of the Licensed
Material and any others designated to receive
attribution, in any reasonable manner requested by
the Licensor (including by pseudonym if
designated);
ii. a copyright notice;
iii. a notice that refers to this Public License;
iv. a notice that refers to the disclaimer of
warranties;
v. a URI or hyperlink to the Licensed Material to the
extent reasonably practicable;
b. indicate if You modified the Licensed Material and
retain an indication of any previous modifications; and
c. indicate the Licensed Material is licensed under this
Public License, and include the text of, or the URI or
hyperlink to, this Public License.
2. You may satisfy the conditions in Section 3(a)(1) in any
reasonable manner based on the medium, means, and context in
which You Share the Licensed Material. For example, it may be
reasonable to satisfy the conditions by providing a URI or
hyperlink to a resource that includes the required
information.
3. If requested by the Licensor, You must remove any of the
information required by Section 3(a)(1)(A) to the extent
reasonably practicable.
4. If You Share Adapted Material You produce, the Adapter's
License You apply must not prevent recipients of the Adapted
Material from complying with this Public License.
Section 4 -- Sui Generis Database Rights.
Where the Licensed Rights include Sui Generis Database Rights that
apply to Your use of the Licensed Material:
a. for the avoidance of doubt, Section 2(a)(1) grants You the right
to extract, reuse, reproduce, and Share all or a substantial
portion of the contents of the database;
b. if You include all or a substantial portion of the database
contents in a database in which You have Sui Generis Database
Rights, then the database in which You have Sui Generis Database
Rights (but not its individual contents) is Adapted Material; and
c. You must comply with the conditions in Section 3(a) if You Share
all or a substantial portion of the contents of the database.
For the avoidance of doubt, this Section 4 supplements and does not
replace Your obligations under this Public License where the Licensed
Rights include other Copyright and Similar Rights.
Section 5 -- Disclaimer of Warranties and Limitation of Liability.
a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
c. The disclaimer of warranties and limitation of liability provided
above shall be interpreted in a manner that, to the extent
possible, most closely approximates an absolute disclaimer and
waiver of all liability.
Section 6 -- Term and Termination.
a. This Public License applies for the term of the Copyright and
Similar Rights licensed here. However, if You fail to comply with
this Public License, then Your rights under this Public License
terminate automatically.
b. Where Your right to use the Licensed Material has terminated under
Section 6(a), it reinstates:
1. automatically as of the date the violation is cured, provided
it is cured within 30 days of Your discovery of the
violation; or
2. upon express reinstatement by the Licensor.
For the avoidance of doubt, this Section 6(b) does not affect any
right the Licensor may have to seek remedies for Your violations
of this Public License.
c. For the avoidance of doubt, the Licensor may also offer the
Licensed Material under separate terms or conditions or stop
distributing the Licensed Material at any time; however, doing so
will not terminate this Public License.
d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
License.
Section 7 -- Other Terms and Conditions.
a. The Licensor shall not be bound by any additional or different
terms or conditions communicated by You unless expressly agreed.
b. Any arrangements, understandings, or agreements regarding the
Licensed Material not stated herein are separate from and
independent of the terms and conditions of this Public License.
Section 8 -- Interpretation.
a. For the avoidance of doubt, this Public License does not, and
shall not be interpreted to, reduce, limit, restrict, or impose
conditions on any use of the Licensed Material that could lawfully
be made without permission under this Public License.
b. To the extent possible, if any provision of this Public License is
deemed unenforceable, it shall be automatically reformed to the
minimum extent necessary to make it enforceable. If the provision
cannot be reformed, it shall be severed from this Public License
without affecting the enforceability of the remaining terms and
conditions.
c. No term or condition of this Public License will be waived and no
failure to comply consented to unless expressly agreed to by the
Licensor.
d. Nothing in this Public License constitutes or may be interpreted
as a limitation upon, or waiver of, any privileges and immunities
that apply to the Licensor or You, including from the legal
processes of any jurisdiction or authority.
=======================================================================
Creative Commons is not a party to its public
licenses. Notwithstanding, Creative Commons may elect to apply one of
its public licenses to material it publishes and in those instances
will be considered the “Licensor.” The text of the Creative Commons
public licenses is dedicated to the public domain under the CC0 Public
Domain Dedication. Except for the limited purpose of indicating that
material is shared under a Creative Commons public license or as
otherwise permitted by the Creative Commons policies published at
creativecommons.org/policies, Creative Commons does not authorize the
use of the trademark "Creative Commons" or any other trademark or logo
of Creative Commons without its prior written consent including,
without limitation, in connection with any unauthorized modifications
to any of its public licenses or any other arrangements,
understandings, or agreements concerning use of licensed material. For
the avoidance of doubt, this paragraph does not form part of the
public licenses.
Creative Commons may be contacted at creativecommons.org.
```


@@ -0,0 +1,54 @@
# About SLEEF
This directory contains the source code for the SLEEF library, the
**SIMD Library for Evaluating Elementary Functions**. For more information on
SLEEF, see https://sleef.org/.
The currently imported libsleef source is version 3.6.1, which has
git tag `3.6.1` and git commit hash `6ee14bcae5fe92c2ff8b000d5a01102dab08d774`.
# About the libsleef integration in the JDK
The original upstream source code is available in
`src/jdk.incubator.vector/linux/native/libsleef/upstream`. This code is not
directly usable in the JDK build system; instead, it serves as the base for
generating additional source code files. This generation is done by the
libsleef CMake files. If it were done at build time, CMake would become a
required dependency for building the JDK.
Instead, we create these generated files only once, when importing a new
version of the libsleef source code, and check the generated files into the
JDK source tree. The generated files reside in
`src/jdk.incubator.vector/linux/native/libsleef/generated`.
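The resulting layout is:

```
src/jdk.incubator.vector/linux/native/libsleef/
├── upstream/     verbatim copy of the SLEEF sources
└── generated/    files generated at import time, checked in
```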
# Import instructions
To update the version of libsleef that is used in the JDK, clone
`https://github.com/shibatch/sleef.git`, and copy all files, except the `docs`,
`.github` and `.git` directories, into
`src/jdk.incubator.vector/linux/native/libsleef/upstream`.
The libsleef source code does not follow the JDK whitespace rules as enforced
by jcheck. You will need to remove trailing whitespace and expand tabs to 8
spaces in the imported source code.
Update the note above with information about the version you are importing.
You will need to repeat the process below for each of the platforms in the JDK
that use libsleef; currently, these are aarch64 and riscv64. The rest of these
instructions assume you are working on linux/x64; at this point, no other setup
is supported. Also, make sure you have CMake installed.
First, run configure for cross-compiling to your selected target platform
(e.g. aarch64).
Run `make update-sleef-source` to process the upstream source code and
store the generated files in the `generated` directory.
Now you can repeat this for the next platform. For instance, you can
create a separate configuration using `configure --with-conf-name=riscv64` and
then generate the updated libsleef source code with
`make update-sleef-source CONF=riscv64`.
Finally, verify with git that the local changes made to the files in
`src/jdk.incubator.vector/linux/native/libsleef/generated` look okay.
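Put together, one import cycle might look like the sketch below. The
repository paths and make targets are the real ones; the release tag, the
whitespace clean-up commands, and the configuration names are illustrative.

```
# Fetch the upstream release and copy it in, minus the docs, .github and
# .git directories.
git clone --depth 1 --branch 3.6.1 https://github.com/shibatch/sleef.git /tmp/sleef
rsync -a --delete --exclude=docs --exclude=.github --exclude=.git \
    /tmp/sleef/ src/jdk.incubator.vector/linux/native/libsleef/upstream/

# Satisfy jcheck: strip trailing whitespace (tabs can additionally be
# expanded with e.g. `expand -t 8` on the affected files).
find src/jdk.incubator.vector/linux/native/libsleef/upstream -type f \
    -exec sed -i 's/[[:space:]]*$//' {} +

# Regenerate the checked-in files for one target platform, then inspect.
bash configure --openjdk-target=aarch64-linux-gnu --with-conf-name=aarch64
make update-sleef-source CONF=aarch64
git status src/jdk.incubator.vector/linux/native/libsleef/generated
```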


@@ -0,0 +1,332 @@
// Copyright Naoki Shibata and contributors 2010 - 2024.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
//
#ifndef __MISC_H__
#define __MISC_H__
#if !defined(SLEEF_GENHEADER)
#include <stdint.h>
#include <string.h>
#endif
#ifndef M_PI
#define M_PI 3.141592653589793238462643383279502884
#endif
#ifndef M_PIl
#define M_PIl 3.141592653589793238462643383279502884L
#endif
#ifndef M_1_PI
#define M_1_PI 0.318309886183790671537767526745028724
#endif
#ifndef M_1_PIl
#define M_1_PIl 0.318309886183790671537767526745028724L
#endif
#ifndef M_2_PI
#define M_2_PI 0.636619772367581343075535053490057448
#endif
#ifndef M_2_PIl
#define M_2_PIl 0.636619772367581343075535053490057448L
#endif
#if !defined(SLEEF_GENHEADER)
#ifndef SLEEF_FP_ILOGB0
#define SLEEF_FP_ILOGB0 ((int)0x80000000)
#endif
#ifndef SLEEF_FP_ILOGBNAN
#define SLEEF_FP_ILOGBNAN ((int)2147483647)
#endif
#endif
#define SLEEF_SNAN (((union { long long int i; double d; }) { .i = INT64_C(0x7ff0000000000001) }).d)
#define SLEEF_SNANf (((union { long int i; float f; }) { .i = 0xff800001 }).f)
#define SLEEF_FLT_MIN 0x1p-126
#define SLEEF_DBL_MIN 0x1p-1022
#define SLEEF_INT_MAX 2147483647
#define SLEEF_DBL_DENORM_MIN 4.9406564584124654e-324
#define SLEEF_FLT_DENORM_MIN 1.40129846e-45F
//
/*
PI_A to PI_D are constants that satisfy the following two conditions.
* For PI_A, PI_B and PI_C, the last 28 bits are zero.
* PI_A + PI_B + PI_C + PI_D is close to PI as much as possible.
The argument of a trig function is multiplied by 1/PI, and the
integral part is divided into two parts, each has at most 28
bits. So, the maximum argument that could be correctly reduced
should be 2^(28*2-1) PI = 1.1e+17. However, due to internal
double precision calculation, the actual maximum argument that can
be correctly reduced is around 2^47.
*/
#define PI_A 3.1415926218032836914
#define PI_B 3.1786509424591713469e-08
#define PI_C 1.2246467864107188502e-16
#define PI_D 1.2736634327021899816e-24
#define TRIGRANGEMAX 1e+14
/*
PI_A2 and PI_B2 are constants that satisfy the following two conditions.
* The last 3 bits of PI_A2 are zero.
* PI_A2 + PI_B2 is close to PI as much as possible.
The argument of a trig function is multiplied by 1/PI, and the
integral part is multiplied by PI_A2. So, the maximum argument that
could be correctly reduced should be 2^(3-1) PI = 12.6. By testing,
we confirmed that it correctly reduces the argument up to around 15.
*/
#define PI_A2 3.141592653589793116
#define PI_B2 1.2246467991473532072e-16
#define TRIGRANGEMAX2 15
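/*
  Editorial sketch, not part of upstream misc.h: how these split constants
  are used. For arguments up to about TRIGRANGEMAX2, a trig argument d is
  reduced Cody-Waite style with the two-term split above (a hypothetical
  helper, assuming rint() from <math.h>):

    double q = rint(d * M_1_PI);  // nearest integer multiple of pi
    d = d - q * PI_A2;            // exact while q needs at most 3 bits
    d = d - q * PI_B2;            // tiny correction term
    // d now holds the remainder in roughly [-pi/2, pi/2]; for sin, the
    // result is negated when q is odd.

  For larger arguments (up to TRIGRANGEMAX), the same subtraction is done
  with the four-way split PI_A..PI_D, with q handled as two 28-bit parts.
*/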
#define M_2_PI_H 0.63661977236758138243
#define M_2_PI_L -3.9357353350364971764e-17
#define SQRT_DBL_MAX 1.3407807929942596355e+154
#define TRIGRANGEMAX3 1e+9
#define M_4_PI 1.273239544735162542821171882678754627704620361328125
#define L2U .69314718055966295651160180568695068359375
#define L2L .28235290563031577122588448175013436025525412068e-12
#define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931
#define L10U 0.30102999566383914498 // log 2 / log 10
#define L10L 1.4205023227266099418e-13
#define LOG10_2 3.3219280948873623478703194294893901758648313930
#define L10Uf 0.3010253906f
#define L10Lf 4.605038981e-06f
//
#define PI_Af 3.140625f
#define PI_Bf 0.0009670257568359375f
#define PI_Cf 6.2771141529083251953e-07f
#define PI_Df 1.2154201256553420762e-10f
#define TRIGRANGEMAXf 39000
#define PI_A2f 3.1414794921875f
#define PI_B2f 0.00011315941810607910156f
#define PI_C2f 1.9841872589410058936e-09f
#define TRIGRANGEMAX2f 125.0f
#define TRIGRANGEMAX4f 8e+6f
#define SQRT_FLT_MAX 18446743523953729536.0
#define L2Uf 0.693145751953125f
#define L2Lf 1.428606765330187045e-06f
#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f
#ifndef M_PIf
# define M_PIf ((float)M_PI)
#endif
//
#ifndef MIN
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#endif
#ifndef MAX
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#endif
#ifndef ABS
#define ABS(x) ((x) < 0 ? -(x) : (x))
#endif
#define stringify(s) stringify_(s)
#define stringify_(s) #s
#if !defined(SLEEF_GENHEADER)
typedef long double longdouble;
#endif
#if !defined(Sleef_double2_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_double2_DEFINED
typedef struct {
double x, y;
} Sleef_double2;
#endif
#if !defined(Sleef_float2_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_float2_DEFINED
typedef struct {
float x, y;
} Sleef_float2;
#endif
#if !defined(Sleef_longdouble2_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_longdouble2_DEFINED
typedef struct {
long double x, y;
} Sleef_longdouble2;
#endif
#if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
#define LIKELY(condition) __builtin_expect(!!(condition), 1)
#define UNLIKELY(condition) __builtin_expect(!!(condition), 0)
#define RESTRICT __restrict__
#ifndef __arm__
#define ALIGNED(x) __attribute__((aligned(x)))
#else
#define ALIGNED(x)
#endif
#if defined(SLEEF_GENHEADER)
#define INLINE SLEEF_ALWAYS_INLINE
#define EXPORT SLEEF_INLINE
#define CONST SLEEF_CONST
#define NOEXPORT
#else // #if defined(SLEEF_GENHEADER)
#define CONST __attribute__((const))
#define INLINE __attribute__((always_inline))
#if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)
#ifndef SLEEF_STATIC_LIBS
#define EXPORT __stdcall __declspec(dllexport)
#define NOEXPORT
#else // #ifndef SLEEF_STATIC_LIBS
#define EXPORT
#define NOEXPORT
#endif // #ifndef SLEEF_STATIC_LIBS
#else // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)
#define EXPORT __attribute__((visibility("default")))
#define NOEXPORT __attribute__ ((visibility ("hidden")))
#endif // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)
#endif // #if defined(SLEEF_GENHEADER)
#define SLEEF_NAN __builtin_nan("")
#define SLEEF_NANf __builtin_nanf("")
#define SLEEF_NANl __builtin_nanl("")
#define SLEEF_INFINITY __builtin_inf()
#define SLEEF_INFINITYf __builtin_inff()
#define SLEEF_INFINITYl __builtin_infl()
#if defined(__INTEL_COMPILER) || defined (__clang__)
#define SLEEF_INFINITYq __builtin_inf()
#define SLEEF_NANq __builtin_nan("")
#else
#define SLEEF_INFINITYq __builtin_infq()
#define SLEEF_NANq (SLEEF_INFINITYq - SLEEF_INFINITYq)
#endif
#elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
#if defined(SLEEF_GENHEADER)
#define INLINE SLEEF_ALWAYS_INLINE
#define CONST SLEEF_CONST
#define EXPORT SLEEF_INLINE
#define NOEXPORT
#else // #if defined(SLEEF_GENHEADER)
#define INLINE __forceinline
#define CONST
#ifndef SLEEF_STATIC_LIBS
#define EXPORT __declspec(dllexport)
#define NOEXPORT
#else
#define EXPORT
#define NOEXPORT
#endif
#endif // #if defined(SLEEF_GENHEADER)
#define RESTRICT
#define ALIGNED(x)
#define LIKELY(condition) (condition)
#define UNLIKELY(condition) (condition)
#if (defined(__GNUC__) || defined(__CLANG__)) && (defined(__i386__) || defined(__x86_64__)) && !defined(SLEEF_GENHEADER)
#include <x86intrin.h>
#endif
#define SLEEF_INFINITY (1e+300 * 1e+300)
#define SLEEF_NAN (SLEEF_INFINITY - SLEEF_INFINITY)
#define SLEEF_INFINITYf ((float)SLEEF_INFINITY)
#define SLEEF_NANf ((float)SLEEF_NAN)
#define SLEEF_INFINITYl ((long double)SLEEF_INFINITY)
#define SLEEF_NANl ((long double)SLEEF_NAN)
#if (defined(_M_AMD64) || defined(_M_X64))
#ifndef __SSE2__
#define __SSE2__
#define __SSE3__
#define __SSE4_1__
#endif
#elif _M_IX86_FP == 2
#ifndef __SSE2__
#define __SSE2__
#define __SSE3__
#define __SSE4_1__
#endif
#elif _M_IX86_FP == 1
#ifndef __SSE__
#define __SSE__
#endif
#endif
#endif // #elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
#if !defined(__linux__)
#define isinff(x) ((x) == SLEEF_INFINITYf || (x) == -SLEEF_INFINITYf)
#define isinfl(x) ((x) == SLEEF_INFINITYl || (x) == -SLEEF_INFINITYl)
#define isnanf(x) ((x) != (x))
#define isnanl(x) ((x) != (x))
#endif
#endif // #ifndef __MISC_H__
#ifdef ENABLE_AAVPCS
#define VECTOR_CC __attribute__((aarch64_vector_pcs))
#else
#define VECTOR_CC
#endif
//
#if defined (__GNUC__) && !defined(__INTEL_COMPILER)
#pragma GCC diagnostic ignored "-Wpragmas"
#pragma GCC diagnostic ignored "-Wunknown-pragmas"
#if !defined (__clang__)
#pragma GCC diagnostic ignored "-Wattribute-alias"
#pragma GCC diagnostic ignored "-Wlto-type-mismatch"
#pragma GCC diagnostic ignored "-Wstringop-overflow"
#endif
#endif
#if defined(_MSC_VER)
#pragma warning(disable:4101) // warning C4101: 'v': unreferenced local variable
#pragma warning(disable:4116) // warning C4116: unnamed type definition in parentheses
#pragma warning(disable:4244) // warning C4244: 'function': conversion from 'vopmask' to '__mmask8', possible loss of data
#pragma warning(disable:4267) // warning C4267: 'initializing': conversion from 'size_t' to 'const int', possible loss of data
#pragma warning(disable:4305) // warning C4305: 'function': truncation from 'double' to 'float'
#endif

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,255 @@
## 3.6.1 - 2024-06-10
This patch release provides important bug fixes, including a fix
for API compatibility with 3.5 (#534).
The support and test for some features is still limited, as
documented in [README](./README.md), however significant progress
was made in order to test on Linux, macOS and Windows.
### Added
- Add support for RISC-V in DFT, QUAD and inline headers (#503,
#522).
- Add GHA workflow to run CI tests on Windows x86 (#540) and macOS
x86/aarch64 (#543). And update test matrix.
- Add GHA workflows to run examples in CI (#550).
### Changed
- Cleanup/Improve support for RISC-V in LIBM (#520, #521).
- Update supported environment in documentation (#529, #549),
including website and test matrix from README.
### Fixed
- Major fix and cleanup of CMakeLists.txt (#531).
- Fix compatibility issue after removal of quad and long double
sincospi (#545). Restores functions that are missing in 3.6.
- Various bug fixes (#528, #533, #536, #537).
## 3.6 - 2024-02-14
This release follows a long period of inactivity. The library is now
being actively maintained. However, the support and test for some
features is currently limited, as documented in [README](./README.md).
### Added
- Add documentation for the quad precision math library
- Enable generation of inline header file for CUDA (PR #337)
- Add support for System/390 z15 support (PR #343)
- Add support for POWER 9 (PR #360)
- Add quad-precision functions (PR #375, #377, #380, #381, #382, #383,
#385, #386, #387)
- Add preliminary support for iOS and Android (PR #388, #389)
- Add OpenMP pragmas to the function declarations in sleef.h to enable
auto-vectorization by GCC (PR #404, #406)
- Add new public CI test infrastructure using GitHub Actions (PR #476)
- Add support for RISC-V in libm (PR #477)
### Removed
- Remove old CI scripts based on Travis/Jenkins/Appveyor (PR #502)
### Changed
- Optimise error functions (PR #370)
- Update CMake package config (PR #412)
- Update documentation and move doc/website to main repository (PR #504,
#513)
- Add SLEEF_ prefix to user-facing CMake options (PR #509)
- Disable SVE on Darwin (PR #512)
### Fixed
- Fix parallel builds with GNU make (PR #491)
- Various bug fixes (PR #492, #499, #508)
## 3.5.1 - 2020-09-15
### Changed
- Fixed a bug in handling compiler options
## 3.5 - 2020-09-01
- IBM System/390 support is added.
- The library can be built with Clang on Windows.
- Static libraries with LTO can be generated.
- Alternative division and sqrt methods can be chosen with AArch64.
- Header files for inlining the whole SLEEF functions can be generated.
- IEEE remainder function is added.
- GCC-10 can now build SLEEF with SVE support.
## 3.4.1 - 2019-10-01
### Changed
- Fixed accuracy problem with tan_u35, atan_u10, log2f_u35 and exp10f_u10.
https://github.com/shibatch/sleef/pull/260
https://github.com/shibatch/sleef/pull/265
https://github.com/shibatch/sleef/pull/267
- SVE intrinsics that are not supported in newer ACLE are replaced.
https://github.com/shibatch/sleef/pull/268
- FMA4 detection problem is fixed.
https://github.com/shibatch/sleef/pull/262
- Compilation problem under Windows with MinGW is fixed.
https://github.com/shibatch/sleef/pull/266
## 3.4 - 2019-04-28
### Added
- Faster and low precision functions are added.
https://github.com/shibatch/sleef/pull/229
- Functions that return consistent results across platforms are
added
https://github.com/shibatch/sleef/pull/216
https://github.com/shibatch/sleef/pull/224
- Quad precision math library (libsleefquad) is added
https://github.com/shibatch/sleef/pull/235
https://github.com/shibatch/sleef/pull/237
https://github.com/shibatch/sleef/pull/240
- AArch64 Vector Procedure Call Standard (AAVPCS) support.
### Changed
- Many functions are now faster
- Testers are now faster
## 3.3.1 - 2018-08-20
### Added
- FreeBSD support is added
### Changed
- i386 build problem is fixed
- Trigonometric functions now evaluate correctly with full FP
domain.
https://github.com/shibatch/sleef/pull/210
## 3.3 - 2018-07-06
### Added
- SVE target support is added to libsleef.
https://github.com/shibatch/sleef/pull/180
- SVE target support is added to DFT. With this patch, DFT operations
can be carried out using 256, 512, 1024 and 2048-bit wide vectors
according to runtime availability of vector registers and operators.
https://github.com/shibatch/sleef/pull/182
- 3.5-ULP versions of sinh, cosh, tanh, sinhf, coshf, tanhf, and the
corresponding testing functionalities are added.
https://github.com/shibatch/sleef/pull/192
- Power VSX target support is added to libsleef.
https://github.com/shibatch/sleef/pull/195
- Payne-Hanek like argument reduction is added to libsleef.
https://github.com/shibatch/sleef/pull/197
## 3.2 - 2018-02-26
### Added
- The whole build system of the project migrated from makefiles to
cmake. In particular this includes `libsleef`, `libsleefgnuabi`,
`libdft` and all the tests.
- Benchmarks that compare `libsleef` vs `SVML` on X86 Linux are
available in the project tree under src/libm-benchmarks directory.
- Extensive upstream testing via Travis CI and Appveyor, on the
following systems:
* OS: Windows / Linux / OSX.
* Compilers: gcc / clang / MSVC.
* Targets: X86 (SSE/AVX/AVX2/AVX512F), AArch64 (Advanced SIMD), ARM
(NEON). Emulators like QEMU or SDE can be used to run the tests.
- Added the following new vector functions (with relative testing):
* `log2`
- New compatibility tests have been added to check that
`libsleefgnuabi` exports the GNUABI symbols correctly.
- The library can be compiled to an LLVM bitcode object.
- Added masked interface to the library to support AVX512F masked
vectorization.
### Changed
- Use native instructions if available for `sqrt`.
- Fixed fmax and fmin behavior on AArch64:
https://github.com/shibatch/sleef/pull/140
- Speed improvements for `asin`, `acos`, `fmod` and `log`. Computation
speed of other functions are also improved by general optimization.
https://github.com/shibatch/sleef/pull/97
- Removed `libm` dependency.
### Removed
- Makefile build system
## 3.1 - 2017-07-19
- Added AArch64 support
- Implemented the remaining C99 math functions : lgamma, tgamma,
erf, erfc, fabs, copysign, fmax, fmin, fdim, trunc, floor, ceil,
round, rint, modf, ldexp, nextafter, frexp, hypot, and fmod.
- Added dispatcher for x86 functions
- Improved reduction of trigonometric functions
- Added support for 32-bit x86, Cygwin, etc.
- Improved tester
## 3.0 - 2017-02-07
- New API is defined
- Functions for DFT are added
- sincospi functions are added
- gencoef now supports single, extended and quad precision in addition to double precision
- Linux, Windows and Mac OS X are supported
- GCC, Clang, Intel Compiler, Microsoft Visual C++ are supported
- The library can be compiled as DLLs
- Files needed for creating a debian package are now included
## 2.120 - 2017-01-30
- Relicensed to Boost Software License Version 1.0
## 2.110 - 2016-12-11
- The valid range of argument is extended for trig functions
- Specification of each functions regarding to the domain and accuracy is added
- A coefficient generation tool is added
- New testing tools are introduced
- Following functions returned incorrect values when the argument is very large or small : exp, pow, asinh, acosh
- SIMD xsin and xcos returned values more than 1 when FMA is enabled
- Pure C cbrt returned incorrect values when the argument is negative
- tan_u1 returned values with more than 1 ulp of error on rare occasions
- Removed support for Java language (because no one seems to be using it)
## 2.100 - 2016-12-04
- Added support for AVX-512F and Clang Extended Vectors.
## 2.90 - 2016-11-27
- Added ilogbf. All the reported bugs(listed below) are fixed.
- Log function returned incorrect values when the argument is very small.
- Signs of returned values were incorrect when the argument is signed zero.
- Tester incorrectly counted ULP in some cases.
- ilogb function returned incorrect values in some cases.
## 2.80 - 2013-05-18
- Added support for ARM NEON. Added higher accuracy single
precision functions : sinf_u1, cosf_u1, sincosf_u1, tanf_u1, asinf_u1,
acosf_u1, atanf_u1, atan2f_u1, logf_u1, and cbrtf_u1.
## 2.70 - 2013-04-30
- Added higher accuracy functions : sin_u1, cos_u1, sincos_u1,
tan_u1, asin_u1, acos_u1, atan_u1, atan2_u1, log_u1, and
cbrt_u1. These functions evaluate the corresponding function with at
most 1 ulp of error.
## 2.60 - 2013-03-26
- Added the remaining single precision functions : powf, sinhf,
coshf, tanhf, exp2f, exp10f, log10f, log1pf. Added support for FMA4
(for AMD Bulldozer). Added more test cases. Fixed minor bugs (which
degraded accuracy in some rare cases).
## 2.50 - 2013-03-12
- Added support for AVX2. SLEEF now compiles with ICC.
## 2.40 - 2013-03-07
- Fixed incorrect denormal/nonnumber handling in ldexp, ldexpf,
sinf and cosf. Removed support for Go language.
## 2.31 - 2012-07-05
- Added sincosf.
## 2.30 - 2012-01-20
- Added single precision functions : sinf, cosf, tanf, asinf,
acosf, atanf, logf, expf, atan2f and cbrtf.
## 2.20 - 2012-01-09
- Added exp2, exp10, expm1, log10, log1p, and cbrt.
## 2.10 - 2012-01-05
- asin() and acos() are back.
- Added ilogb() and ldexp().
- Added hyperbolic functions.
- Eliminated dependency on frexp, ldexp, fabs, isnan and isinf.
## 2.00 - 2011-12-30
- All of the algorithm has been updated.
- Both accuracy and speed are improved since version 1.10.
- Denormal number handling is also improved.
## 1.10 - 2010-06-22
- AVX support is added. Accuracy tester is added.
## 1.00 - 2010-05-15
- Initial release


@@ -0,0 +1,339 @@
cmake_minimum_required(VERSION 3.18)
project(SLEEF VERSION 3.6.1 LANGUAGES C)
set(SLEEF_SOVERSION ${SLEEF_VERSION_MAJOR})
# Options
option(SLEEF_BUILD_STATIC_TEST_BINS "Build statically linked test executables" OFF)
option(SLEEF_ENABLE_LTO "Enable LTO on GCC or ThinLTO on clang" OFF)
option(SLEEF_BUILD_LIBM "libsleef will be built." ON)
option(SLEEF_BUILD_DFT "libsleefdft will be built." OFF)
option(SLEEF_BUILD_QUAD "libsleefquad will be built." OFF)
option(SLEEF_BUILD_GNUABI_LIBS "libsleefgnuabi will be built." ON)
option(SLEEF_BUILD_SCALAR_LIB "libsleefscalar will be built." OFF)
option(SLEEF_BUILD_TESTS "Tests will be built." ON)
option(SLEEF_BUILD_INLINE_HEADERS "Build header for inlining whole SLEEF functions" OFF)
option(SLEEF_TEST_ALL_IUT "Perform tests on implementations with all vector extensions" OFF)
option(SLEEF_SHOW_CONFIG "Show SLEEF configuration status messages." ON)
option(SLEEF_SHOW_ERROR_LOG "Show cmake error log." OFF)
option(SLEEF_ASAN "Enable address sanitizing on all targets." OFF)
option(SLEEF_ENFORCE_TESTER "Build fails if tester is not available" OFF)
option(SLEEF_ENFORCE_TESTER3 "Build fails if tester3 is not built" OFF)
option(SLEEF_ENABLE_ALTDIV "Enable alternative division method (aarch64 only)" OFF)
option(SLEEF_ENABLE_ALTSQRT "Enable alternative sqrt method (aarch64 only)" OFF)
option(SLEEF_DISABLE_FFTW "Disable testing the DFT library with FFTW" OFF)
option(SLEEF_DISABLE_MPFR "Disable testing with the MPFR library" OFF)
option(SLEEF_DISABLE_SSL "Disable testing with the SSL library" OFF)
option(SLEEF_ENABLE_CUDA "Enable CUDA" OFF)
option(SLEEF_ENABLE_CXX "Enable C++" OFF)
#
if (DEFINED SLEEF_BUILD_SHARED_LIBS)
set(BUILD_SHARED_LIBS ${SLEEF_BUILD_SHARED_LIBS})
endif ()
if (SLEEF_SHOW_CONFIG)
# Normalize the value of BUILD_SHARED_LIBS so that it displays nicely
# in the configuration display
if (BUILD_SHARED_LIBS)
set(BUILD_SHARED_LIBS ON)
else ()
set(BUILD_SHARED_LIBS OFF)
endif ()
endif ()
# Function used to generate safe command arguments for add_custom_command
function(command_arguments PROPNAME)
set(quoted_args "")
foreach(arg ${ARGN})
list(APPEND quoted_args "\"${arg}\"" )
endforeach()
set(${PROPNAME} ${quoted_args} PARENT_SCOPE)
endfunction()
# Helper function for concatenating several files
function(sleef_concat_files)
cmake_parse_arguments(concat_required "" "OUTPUT" "SOURCES" ${ARGN})
if("${concat_required_OUTPUT}" STREQUAL "")
message(FATAL_ERROR "Must pass OUTPUT to sleef_concat_files")
endif()
if(NOT concat_required_SOURCES)
message(FATAL_ERROR "sleef_concat_files not passed any SOURCES")
endif()
add_custom_command(
OUTPUT ${concat_required_OUTPUT}
COMMAND ${CMAKE_COMMAND} -E cat ${concat_required_SOURCES} > ${concat_required_OUTPUT}
DEPENDS ${concat_required_SOURCES}
COMMAND_EXPAND_LISTS)
endfunction()
# Settings
set(SLEEF_ALL_SUPPORTED_EXTENSIONS
AVX512FNOFMA AVX512F AVX2 AVX2128 FMA4 AVX SSE4 SSE2 # x86
SVENOFMA SVE ADVSIMDNOFMA ADVSIMD # Aarch64
NEON32 NEON32VFPV4 # Aarch32
VSX VSXNOFMA VSX3 VSX3NOFMA # PPC64
VXE VXENOFMA VXE2 VXE2NOFMA # IBM Z
RVVM1NOFMA RVVM1 RVVM2NOFMA RVVM2 # RISC-V Vectors
PUREC_SCALAR PURECFMA_SCALAR # Generic type
CACHE STRING "List of SIMD architectures supported by libsleef."
)
set(SLEEF_SUPPORTED_LIBM_EXTENSIONS
AVX512FNOFMA AVX512F AVX2 AVX2128 FMA4 AVX SSE4 SSE2 # x86
SVENOFMA SVE ADVSIMDNOFMA ADVSIMD # Aarch64
NEON32 NEON32VFPV4 # Aarch32
VSX VSXNOFMA VSX3 VSX3NOFMA # PPC64
VXE VXENOFMA VXE2 VXE2NOFMA # IBM Z
RVVM1NOFMA RVVM1 RVVM2NOFMA RVVM2 # RISC-V Vectors
PUREC_SCALAR PURECFMA_SCALAR # Generic type
CACHE STRING "List of SIMD architectures supported by libsleef."
)
set(SLEEF_SUPPORTED_GNUABI_EXTENSIONS
SSE2 AVX AVX2 AVX512F ADVSIMD SVE
CACHE STRING "List of SIMD architectures supported by libsleef for GNU ABI."
)
set(SLEEF_SUPPORTED_QUAD_EXTENSIONS
PUREC_SCALAR PURECFMA_SCALAR SSE2 AVX2128 AVX2 AVX512F ADVSIMD SVE VSX VSX3 VXE VXE2 RVVM1 RVVM2)
# MKMASKED_PARAMS
command_arguments(MKMASKED_PARAMS_GNUABI_AVX512F_dp avx512f e 8)
command_arguments(MKMASKED_PARAMS_GNUABI_AVX512F_sp avx512f e -16)
command_arguments(MKMASKED_PARAMS_GNUABI_SVE_dp sve s 2)
command_arguments(MKMASKED_PARAMS_GNUABI_SVE_sp sve s -4)
#
set(COSTOVERRIDE_AVX512F 10)
set(COSTOVERRIDE_AVX512FNOFMA 10)
set(COSTOVERRIDE_AVX2 2)
set(COSTOVERRIDE_AVX 2)
set(COSTOVERRIDE_NEON32 2)
set(COSTOVERRIDE_NEON32VFPV4 2)
set(COSTOVERRIDE_SVE 10)
set(COSTOVERRIDE_SVENOFMA 10)
set(COSTOVERRIDE_RVVM1 10)
set(COSTOVERRIDE_RVVM1NOFMA 10)
set(COSTOVERRIDE_RVVM2 20)
set(COSTOVERRIDE_RVVM2NOFMA 20)
#
enable_testing()
if (SLEEF_ENABLE_CXX)
enable_language(CXX)
endif()
if (SLEEF_ENABLE_CUDA)
enable_language(CUDA)
endif()
# For specifying installation directories
include(GNUInstallDirs)
if(NOT DEFINED sleef_SOURCE_DIR)
set(sleef_SOURCE_DIR ${CMAKE_SOURCE_DIR})
endif()
if(NOT DEFINED sleef_BINARY_DIR)
set(sleef_BINARY_DIR ${CMAKE_BINARY_DIR})
endif()
# Sanity check for in-source builds which we do not want to happen
if(sleef_SOURCE_DIR STREQUAL sleef_BINARY_DIR)
message(FATAL_ERROR "SLEEF does not allow in-source builds.
You can refer to docs/build-with-cmake.md for instructions on how to provide a \
separate build directory. Note: Please remove autogenerated file \
`CMakeCache.txt` and directory `CMakeFiles` in the current directory.")
endif()
if(SLEEF_ENABLE_LTO AND BUILD_SHARED_LIBS)
message(FATAL_ERROR "SLEEF_ENABLE_LTO and BUILD_SHARED_LIBS cannot be specified at the same time")
endif(SLEEF_ENABLE_LTO AND BUILD_SHARED_LIBS)
if(SLEEF_ENABLE_LTO)
include(CheckIPOSupported)
check_ipo_supported(RESULT supported OUTPUT error)
endif()
# Set output directories for the library files
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin)
foreach(CONFIG ${CMAKE_CONFIGURATION_TYPES})
string(TOUPPER ${CONFIG} CONFIG)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_${CONFIG} ${PROJECT_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY_${CONFIG} ${PROJECT_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_${CONFIG} ${PROJECT_BINARY_DIR}/bin)
endforeach(CONFIG CMAKE_CONFIGURATION_TYPES)
# Path for finding cmake modules
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules)
set(SLEEF_SCRIPT_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Scripts CACHE PATH
"Path for finding sleef specific cmake scripts")
if (CMAKE_C_COMPILER_ID MATCHES "Clang" AND "x${CMAKE_C_SIMULATE_ID}" STREQUAL "xMSVC")
message(STATUS "Building with Clang on Windows")
set(SLEEF_CLANG_ON_WINDOWS TRUE)
endif()
# sleef-config.h.in passes cmake settings to the source code
include(Configure.cmake)
configure_file(
${PROJECT_SOURCE_DIR}/sleef-config.h.in
${PROJECT_BINARY_DIR}/include/sleef-config.h @ONLY)
# We like to have a documented index of all targets in the project. The
# variables listed below carry the names of the targets defined throughout
# the project.
# Generates object file (shared library) `libsleef`
# Defined in src/libm/CMakeLists.txt via command add_library
set(TARGET_LIBSLEEF "sleef")
set(TARGET_LIBSLEEFGNUABI "sleefgnuabi")
# Generates the sleef.h headers and all the rename headers
# Defined in src/libm/CMakeLists.txt via custom commands and a custom target
set(TARGET_HEADERS "headers")
set(TARGET_INLINE_HEADERS "inline_headers")
set(TARGET_QINLINE_HEADERS "quad_inline_headers")
set(TARGET_LIBINLINE "sleefinline")
# Generates executable files for running the test suite
# Defined in src/libm-tester/CMakeLists.txt via command add_executable
set(TARGET_TESTER "tester")
set(TARGET_IUT "iut")
# The target to generate LLVM bitcode only, available when SLEEF_ENABLE_LLVM_BITCODE is passed to cmake
set(TARGET_LLVM_BITCODE "llvm-bitcode")
# Generates the helper executable file mkrename needed to write the sleef header
set(TARGET_MKRENAME "mkrename")
set(TARGET_MKRENAME_GNUABI "mkrename_gnuabi")
set(TARGET_MKMASKED_GNUABI "mkmasked_gnuabi")
# Generates the helper executable file mkdisp needed to write the sleef header
set(TARGET_MKDISP "mkdisp")
set(TARGET_MKALIAS "mkalias")
# Generates static library common
# Defined in src/common/CMakeLists.txt via command add_library
set(TARGET_LIBCOMMON_OBJ "common")
set(TARGET_LIBARRAYMAP_OBJ "arraymap")
# Function used to add an executable that is executed on host
function(add_host_executable TARGETNAME)
if (NOT CMAKE_CROSSCOMPILING)
add_executable(${TARGETNAME} ${ARGN})
# Ensure that Darwin host executable is built as universal binary
if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "^(x86_64|arm64)$")
target_compile_options(${TARGETNAME} PRIVATE -arch "${CMAKE_HOST_SYSTEM_PROCESSOR}")
target_link_options(${TARGETNAME} PRIVATE -arch "${CMAKE_HOST_SYSTEM_PROCESSOR}")
endif()
else()
add_executable(${TARGETNAME} IMPORTED GLOBAL)
set_property(TARGET ${TARGETNAME} PROPERTY IMPORTED_LOCATION ${NATIVE_BUILD_DIR}/bin/${TARGETNAME})
endif()
endfunction()
function(host_target_AAVPCS_definitions TARGETNAME)
if (NOT CMAKE_CROSSCOMPILING)
target_compile_definitions(${TARGETNAME} PRIVATE ENABLE_AAVPCS=1)
endif()
endfunction()
# Generates object file (shared library) `libsleefdft`
# Defined in src/dft/CMakeLists.txt via command add_library
set(TARGET_LIBDFT "sleefdft")
# Check subdirectories
add_subdirectory("src")
# Install the CMake package config
include(CMakePackageConfigHelpers)
write_basic_package_version_file(
sleefConfigVersion.cmake
COMPATIBILITY SameMajorVersion
)
set(
SLEEF_INSTALL_CMAKEDIR "${CMAKE_INSTALL_LIBDIR}/cmake/sleef"
CACHE STRING "CMake package config location relative to the install prefix"
)
mark_as_advanced(SLEEF_INSTALL_CMAKEDIR)
install(
FILES
"${PROJECT_SOURCE_DIR}/sleefConfig.cmake"
"${PROJECT_BINARY_DIR}/sleefConfigVersion.cmake"
DESTINATION "${SLEEF_INSTALL_CMAKEDIR}"
COMPONENT sleef_Development
)
install(
EXPORT sleefTargets
NAMESPACE sleef::
DESTINATION "${SLEEF_INSTALL_CMAKEDIR}"
COMPONENT sleef_Development
)
# Extra messages at configuration time. By default is active, it can be
# turned off by invoking cmake with "-DSLEEF_SHOW_CONFIG=OFF".
if(SLEEF_SHOW_CONFIG)
message(STATUS "Configuring build for ${PROJECT_NAME}-v${SLEEF_VERSION}")
message(" Target system: ${CMAKE_SYSTEM}")
if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "^(x86_64|arm64)$")
message(" Target processor: ${CMAKE_OSX_ARCHITECTURES}")
else()
message(" Target processor: ${CMAKE_SYSTEM_PROCESSOR}")
endif()
message(" Host system: ${CMAKE_HOST_SYSTEM}")
message(" Host processor: ${CMAKE_HOST_SYSTEM_PROCESSOR}")
message(" Detected C compiler: ${CMAKE_C_COMPILER_ID} @ ${CMAKE_C_COMPILER}")
message(" CMake: ${CMAKE_VERSION}")
message(" Make program: ${CMAKE_MAKE_PROGRAM}")
if(CMAKE_CROSSCOMPILING)
message(" Crosscompiling SLEEF.")
message(" Native build dir: ${NATIVE_BUILD_DIR}")
endif(CMAKE_CROSSCOMPILING)
message(STATUS "Using option `${SLEEF_C_FLAGS}` to compile libsleef")
message(STATUS "Building shared libs : " ${BUILD_SHARED_LIBS})
message(STATUS "Building static test bins: " ${SLEEF_BUILD_STATIC_TEST_BINS})
message(STATUS "MPFR : " ${LIB_MPFR})
if (MPFR_INCLUDE_DIR)
message(STATUS "MPFR header file in " ${MPFR_INCLUDE_DIR})
endif()
message(STATUS "GMP : " ${LIBGMP})
message(STATUS "RT : " ${LIBRT})
message(STATUS "FFTW3 : " ${LIBFFTW3})
message(STATUS "OPENSSL : " ${OPENSSL_VERSION})
message(STATUS "SDE : " ${SDE_COMMAND})
if (SLEEF_BUILD_INLINE_HEADERS)
message(STATUS "SED : " ${SED_COMMAND})
endif()
message(STATUS "COMPILER_SUPPORTS_OPENMP : " ${COMPILER_SUPPORTS_OPENMP})
if(ENABLE_GNUABI)
message(STATUS "A version of SLEEF compatible with libm and libmvec in GNU libc will be produced (${TARGET_LIBSLEEFGNUABI}.so)")
endif()
if (COMPILER_SUPPORTS_SVE)
message(STATUS "Building SLEEF with VLA SVE support")
if (ARMIE_COMMAND)
message(STATUS "Arm Instruction Emulator found at ${ARMIE_COMMAND}")
message(STATUS "SVE testing is done with ${SVE_VECTOR_BITS}-bits vectors.")
endif()
endif()
if(FORCE_AAVPCS)
message(STATUS "Building SLEEF with AArch64 Vector PCS support")
endif()
endif(SLEEF_SHOW_CONFIG)

View File

@@ -0,0 +1,27 @@
# List of contributors
These lists are not exhaustive and only provide the most relevant contact information.
For an exhaustive list of contributors please refer to the
[GitHub contributors section for SLEEF](https://github.com/shibatch/sleef/graphs/contributors).
## Maintainers
| Name | Affiliation | Github profile |
| -------------------- | ----------------------- | ---------------------------------- |
| Pierre Blanchard | Arm Ltd. | https://github.com/blapie |
| Joana Cruz | Arm Ltd. | https://github.com/joanaxcruz |
| Joe Ramsay | Arm Ltd. | https://github.com/joeramsay |
| Naoki Shibata | Nara Institute of Science and Technology | https://github.com/shibatch |
## Contributors
| Name | Affiliation | Github profile |
| -------------------- | ----------------------- | ---------------------------------- |
| Anonymous | | https://github.com/friendlyanon |
| Diana Bite | Former Arm Ltd. | https://github.com/diaena |
| Ludovic Henry | Rivos Inc. | https://github.com/luhenry |
| Martin Krastev | Chaos Group | https://github.com/blu |
| Jilayne Lovejoy | Former Arm Inc. | https://github.com/jlovejoy |
| Kerry McLaughlin | Arm Ltd. | https://github.com/kmclaughlin-arm |
| Alexandre Mutel | Unity Technologies | https://github.com/xoofx |
| Francesco Petrogalli | Former Arm Ltd. | https://github.com/fpetrogalli-arm |

View File

@@ -0,0 +1,860 @@
include(CheckCCompilerFlag)
include(CheckCSourceCompiles)
include(CheckTypeSize)
include(CheckLanguage)
#
if (SLEEF_BUILD_STATIC_TEST_BINS)
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
set(BUILD_SHARED_LIBS OFF)
set(CMAKE_EXE_LINKER_FLAGS "-static")
endif()
set(OPENSSL_EXTRA_LIBRARIES "" CACHE STRING "Extra libraries for openssl")
if (NOT CMAKE_CROSSCOMPILING AND NOT SLEEF_FORCE_FIND_PACKAGE_SSL)
if (SLEEF_BUILD_STATIC_TEST_BINS)
set(OPENSSL_USE_STATIC_LIBS TRUE)
endif()
find_package(OpenSSL)
if (OPENSSL_FOUND)
set(SLEEF_OPENSSL_FOUND TRUE)
set(SLEEF_OPENSSL_LIBRARIES ${OPENSSL_LIBRARIES})
# Workaround for a tester3 SIGSEGV when linking some versions of openssl (1.1.1) statically.
# This is a known issue https://github.com/openssl/openssl/issues/13872.
if (SLEEF_BUILD_STATIC_TEST_BINS)
string(REGEX REPLACE
"-lpthread" "-Wl,--whole-archive -lpthread -Wl,--no-whole-archive"
SLEEF_OPENSSL_LIBRARIES "${OPENSSL_LIBRARIES}")
endif()
set(SLEEF_OPENSSL_VERSION ${OPENSSL_VERSION})
set(SLEEF_OPENSSL_LIBRARIES ${SLEEF_OPENSSL_LIBRARIES} ${OPENSSL_EXTRA_LIBRARIES})
set(SLEEF_OPENSSL_INCLUDE_DIR ${OPENSSL_INCLUDE_DIR})
endif()
else()
# find_package cannot find OpenSSL when cross-compiling
find_library(LIBSSL ssl)
find_library(LIBCRYPTO crypto)
if (LIBSSL AND LIBCRYPTO)
set(SLEEF_OPENSSL_FOUND TRUE)
set(SLEEF_OPENSSL_LIBRARIES ${LIBSSL} ${LIBCRYPTO} ${OPENSSL_EXTRA_LIBRARIES})
set(SLEEF_OPENSSL_VERSION ${LIBSSL})
endif()
endif()
if (SLEEF_ENFORCE_TESTER3 AND NOT SLEEF_OPENSSL_FOUND)
message(FATAL_ERROR "SLEEF_ENFORCE_TESTER3 is specified and OpenSSL not found")
endif()
# Some toolchains require explicit linking of the following libraries.
find_library(LIB_MPFR mpfr)
find_library(LIBM m)
find_library(LIBGMP gmp)
find_library(LIBRT rt)
find_library(LIBFFTW3 fftw3)
if (LIB_MPFR)
find_path(MPFR_INCLUDE_DIR
NAMES mpfr.h
ONLY_CMAKE_FIND_ROOT_PATH)
endif(LIB_MPFR)
if (LIBFFTW3)
find_path(FFTW3_INCLUDE_DIR
NAMES fftw3.h
ONLY_CMAKE_FIND_ROOT_PATH)
endif(LIBFFTW3)
if (NOT LIBM)
set(LIBM "")
endif()
if (NOT LIBRT)
set(LIBRT "")
endif()
if (SLEEF_DISABLE_MPFR)
set(LIB_MPFR "")
endif()
if (SLEEF_DISABLE_SSL)
set(SLEEF_OPENSSL_FOUND FALSE)
endif()
# Force-set a default build type if none was specified
# Note: some sleef code requires optimisation flags to be turned on
if(NOT CMAKE_BUILD_TYPE)
message(STATUS "Setting build type to 'Release' (required for full support).")
set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
"Debug" "Release" "RelWithDebInfo" "MinSizeRel")
endif()
# Sanitizers
if(SLEEF_ASAN)
# Add address sanitizing to all targets
add_compile_options(-fno-omit-frame-pointer -fsanitize=address)
add_link_options(-fno-omit-frame-pointer -fsanitize=address)
endif()
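# An address-sanitized build can then be configured with, for example:
#   cmake -S . -B build -DSLEEF_ASAN=ON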
# TARGET PROCESSOR DETECTION
set(SLEEF_TARGET_PROCESSOR "${CMAKE_SYSTEM_PROCESSOR}")
if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "^(x86_64|arm64)$")
set(SLEEF_TARGET_PROCESSOR "${CMAKE_OSX_ARCHITECTURES}")
endif()
# PLATFORM DETECTION
if(CMAKE_SIZEOF_VOID_P EQUAL 4)
set(SLEEF_ARCH_32BIT ON CACHE INTERNAL "True for 32-bit architecture.")
endif()
if(SLEEF_TARGET_PROCESSOR MATCHES "(x86|AMD64|amd64|^i.86$)")
set(SLEEF_ARCH_X86 ON CACHE INTERNAL "True for x86 architecture.")
set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-mavx2;-mfma")
elseif(SLEEF_TARGET_PROCESSOR MATCHES "aarch64|arm64")
set(SLEEF_ARCH_AARCH64 ON CACHE INTERNAL "True for Aarch64 architecture.")
# Aarch64 requires support for advsimdfma4
set(COMPILER_SUPPORTS_ADVSIMD 1)
set(COMPILER_SUPPORTS_ADVSIMDNOFMA 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
set(SLEEF_ARCH_AARCH32 ON CACHE INTERNAL "True for Aarch32 architecture.")
set(COMPILER_SUPPORTS_NEON32 1)
set(COMPILER_SUPPORTS_NEON32VFPV4 1)
set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-mfpu=vfpv4")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64")
set(SLEEF_ARCH_PPC64 ON CACHE INTERNAL "True for PPC64 architecture.")
set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-mvsx")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x")
set(SLEEF_ARCH_S390X ON CACHE INTERNAL "True for IBM Z architecture.")
set(CLANG_FLAGS_ENABLE_PUREC_SCALAR "-march=z14;-mzvector")
set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-march=z14;-mzvector")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
set(SLEEF_ARCH_RISCV64 ON CACHE INTERNAL "True for RISCV64 architecture.")
endif()
set(COMPILER_SUPPORTS_PUREC_SCALAR 1)
set(COMPILER_SUPPORTS_PURECFMA_SCALAR 1)
# Compiler feature detection
# Detect CLANG executable path (on both Windows and Linux/OSX)
if(NOT CLANG_EXE_PATH)
# If the current compiler used by CMake is already clang, use it directly
if(CMAKE_C_COMPILER MATCHES "clang")
set(CLANG_EXE_PATH ${CMAKE_C_COMPILER})
else()
# Otherwise, try to find clang on the PATH
find_program(CLANG_EXE_PATH NAMES clang "clang-11" "clang-10" "clang-9" "clang-8" "clang-7" "clang-6.0" "clang-5.0" "clang-4.0" "clang-3.9")
endif()
endif()
# Allow defining the GCC/Clang flags here,
# as we might compile the lib with MSVC but generate bitcode with Clang
# Intel vector extensions.
set(CLANG_FLAGS_ENABLE_SSE2 "-msse2")
set(CLANG_FLAGS_ENABLE_SSE4 "-msse4.1")
set(CLANG_FLAGS_ENABLE_AVX "-mavx")
set(CLANG_FLAGS_ENABLE_FMA4 "-mfma4")
set(CLANG_FLAGS_ENABLE_AVX2 "-mavx2;-mfma")
set(CLANG_FLAGS_ENABLE_AVX2128 "-mavx2;-mfma")
set(CLANG_FLAGS_ENABLE_AVX512F "-mavx512f")
set(CLANG_FLAGS_ENABLE_AVX512FNOFMA "-mavx512f")
set(CLANG_FLAGS_ENABLE_NEON32 "--target=arm-linux-gnueabihf;-mcpu=cortex-a8")
set(CLANG_FLAGS_ENABLE_NEON32VFPV4 "-march=armv7-a;-mfpu=neon-vfpv4")
# Arm AArch64 vector extensions.
set(CLANG_FLAGS_ENABLE_SVE "-march=armv8-a+sve")
set(CLANG_FLAGS_ENABLE_SVENOFMA "-march=armv8-a+sve")
# PPC64
set(CLANG_FLAGS_ENABLE_VSX "-mcpu=power8")
set(CLANG_FLAGS_ENABLE_VSXNOFMA "-mcpu=power8")
set(CLANG_FLAGS_ENABLE_VSX3 "-mcpu=power9")
set(CLANG_FLAGS_ENABLE_VSX3NOFMA "-mcpu=power9")
# IBM z
set(CLANG_FLAGS_ENABLE_VXE "-march=z14;-mzvector")
set(CLANG_FLAGS_ENABLE_VXENOFMA "-march=z14;-mzvector")
set(CLANG_FLAGS_ENABLE_VXE2 "-march=z15;-mzvector")
set(CLANG_FLAGS_ENABLE_VXE2NOFMA "-march=z15;-mzvector")
# RISC-V
set(CLANG_FLAGS_ENABLE_RVVM1 "-march=rv64gcv_zba_zbb_zbs")
set(CLANG_FLAGS_ENABLE_RVVM1NOFMA "-march=rv64gcv_zba_zbb_zbs")
set(CLANG_FLAGS_ENABLE_RVVM2 "-march=rv64gcv_zba_zbb_zbs")
set(CLANG_FLAGS_ENABLE_RVVM2NOFMA "-march=rv64gcv_zba_zbb_zbs")
set(FLAGS_OTHERS "")
# All variables storing compiler flags should be prefixed with FLAGS_
if(CMAKE_C_COMPILER_ID MATCHES "(GNU|Clang)")
# Always compile sleef with -ffp-contract=off.
set(FLAGS_STRICTMATH "-ffp-contract=off")
set(FLAGS_FASTMATH "-ffast-math")
set(FLAGS_NOSTRICTALIASING "-fno-strict-aliasing")
if (SLEEF_ARCH_X86 AND SLEEF_ARCH_32BIT)
string(CONCAT FLAGS_STRICTMATH ${FLAGS_STRICTMATH} " -msse2 -mfpmath=sse")
string(CONCAT FLAGS_FASTMATH ${FLAGS_FASTMATH} " -msse2 -mfpmath=sse")
endif()
# Without the options below, gcc generates calls to libm
string(CONCAT FLAGS_OTHERS "-fno-math-errno -fno-trapping-math")
# Intel vector extensions.
foreach(SIMD ${SLEEF_ALL_SUPPORTED_EXTENSIONS})
set(FLAGS_ENABLE_${SIMD} ${CLANG_FLAGS_ENABLE_${SIMD}})
endforeach()
# Warning flags.
set(FLAGS_WALL "-Wall -Wno-unused-function -Wno-attributes -Wno-unused-result")
if(CMAKE_C_COMPILER_ID MATCHES "GNU")
# The following compiler option is needed to suppress the warning
# "AVX vector return without AVX enabled changes the ABI" at
# src/arch/helpervecext.h:88
string(CONCAT FLAGS_WALL ${FLAGS_WALL} " -Wno-psabi")
set(FLAGS_ENABLE_NEON32 "-mfpu=neon")
endif(CMAKE_C_COMPILER_ID MATCHES "GNU")
if(CMAKE_C_COMPILER_ID MATCHES "Clang" AND SLEEF_ENABLE_LTO)
if (NOT SLEEF_LLVM_AR_COMMAND)
find_program(SLEEF_LLVM_AR_COMMAND "llvm-ar")
endif()
if (SLEEF_LLVM_AR_COMMAND)
SET(CMAKE_AR ${SLEEF_LLVM_AR_COMMAND})
SET(CMAKE_C_ARCHIVE_CREATE "<CMAKE_AR> rcs <TARGET> <LINK_FLAGS> <OBJECTS>")
SET(CMAKE_C_ARCHIVE_FINISH "true")
endif(SLEEF_LLVM_AR_COMMAND)
string(CONCAT FLAGS_OTHERS "-flto=thin")
endif(CMAKE_C_COMPILER_ID MATCHES "Clang" AND SLEEF_ENABLE_LTO)
# Flags for generating inline headers
set(FLAG_PREPROCESS "-E")
set(FLAG_PRESERVE_COMMENTS "-C")
set(FLAG_INCLUDE "-I")
set(FLAG_DEFINE "-D")
if (SLEEF_CLANG_ON_WINDOWS)
# The following line is required to prevent clang from displaying
# many warnings. Clang on Windows references MSVC header files,
# which have deprecation and security attributes for many
# functions.
string(CONCAT FLAGS_WALL ${FLAGS_WALL} " -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_DEPRECATE -Wno-deprecated-declarations")
endif()
elseif(MSVC)
# Intel vector extensions.
if (CMAKE_CL_64)
set(FLAGS_ENABLE_SSE2 /D__SSE2__)
set(FLAGS_ENABLE_SSE4 /D__SSE2__ /D__SSE3__ /D__SSE4_1__)
else()
set(FLAGS_ENABLE_SSE2 /D__SSE2__ /arch:SSE2)
set(FLAGS_ENABLE_SSE4 /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /arch:SSE2)
endif()
set(FLAGS_ENABLE_AVX /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /arch:AVX)
set(FLAGS_ENABLE_FMA4 /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /D__AVX2__ /D__FMA4__ /arch:AVX2)
set(FLAGS_ENABLE_AVX2 /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /D__AVX2__ /arch:AVX2)
set(FLAGS_ENABLE_AVX2128 /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /D__AVX2__ /arch:AVX2)
set(FLAGS_ENABLE_AVX512F /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /D__AVX2__ /D__AVX512F__ /arch:AVX2)
set(FLAGS_ENABLE_AVX512FNOFMA /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /D__AVX2__ /D__AVX512F__ /arch:AVX2)
set(FLAGS_ENABLE_PURECFMA_SCALAR /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /D__AVX2__ /arch:AVX2)
set(FLAGS_WALL "/D_CRT_SECURE_NO_WARNINGS /D_CRT_NONSTDC_NO_DEPRECATE")
set(FLAGS_NO_ERRNO "")
set(FLAG_PREPROCESS "/E")
set(FLAG_PRESERVE_COMMENTS "/C")
set(FLAG_INCLUDE "/I")
set(FLAG_DEFINE "/D")
elseif(CMAKE_C_COMPILER_ID MATCHES "Intel")
set(FLAGS_ENABLE_SSE2 "-msse2")
set(FLAGS_ENABLE_SSE4 "-msse4.1")
set(FLAGS_ENABLE_AVX "-mavx")
set(FLAGS_ENABLE_AVX2 "-march=core-avx2")
set(FLAGS_ENABLE_AVX2128 "-march=core-avx2")
set(FLAGS_ENABLE_AVX512F "-xCOMMON-AVX512")
set(FLAGS_ENABLE_AVX512FNOFMA "-xCOMMON-AVX512")
set(FLAGS_ENABLE_PURECFMA_SCALAR "-march=core-avx2;-fno-strict-aliasing")
set(FLAGS_ENABLE_FMA4 "-msse2") # This is a dummy flag
if(CMAKE_C_COMPILER_ID MATCHES "IntelLLVM")
set(FLAGS_STRICTMATH "-fp-model strict -Qoption,cpp,--extended_float_types")
set(FLAGS_FASTMATH "-fp-model fast -Qoption,cpp,--extended_float_types")
else()
set(FLAGS_STRICTMATH "-fp-model strict -Qoption,cpp,--extended_float_type")
set(FLAGS_FASTMATH "-fp-model fast=2 -Qoption,cpp,--extended_float_type")
endif()
set(FLAGS_NOSTRICTALIASING "-fno-strict-aliasing")
set(FLAGS_WALL "-fmax-errors=3 -Wall -Wno-unused -Wno-attributes")
set(FLAGS_NO_ERRNO "")
set(FLAG_PREPROCESS "-E")
set(FLAG_PRESERVE_COMMENTS "-C")
set(FLAG_INCLUDE "-I")
set(FLAG_DEFINE "-D")
endif()
set(SLEEF_C_FLAGS "${FLAGS_WALL} ${FLAGS_STRICTMATH} ${FLAGS_OTHERS}")
if(CMAKE_C_COMPILER_ID MATCHES "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 6.99)
set(DFT_C_FLAGS "${FLAGS_WALL} ${FLAGS_NOSTRICTALIASING} ${FLAGS_OTHERS}")
else()
set(DFT_C_FLAGS "${FLAGS_WALL} ${FLAGS_NOSTRICTALIASING} ${FLAGS_FASTMATH} ${FLAGS_OTHERS}")
endif()
if(CMAKE_C_COMPILER_ID MATCHES "GNU")
set(FLAGS_ENABLE_SVE "${FLAGS_ENABLE_SVE};-fno-tree-vrp")
endif()
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^i.86$" AND CMAKE_C_COMPILER_ID MATCHES "GNU")
set(SLEEF_C_FLAGS "${SLEEF_C_FLAGS} -msse2 -mfpmath=sse")
set(DFT_C_FLAGS "${DFT_C_FLAGS} -msse2 -mfpmath=sse -m128bit-long-double")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^i.86$" AND CMAKE_C_COMPILER_ID MATCHES "Clang")
set(SLEEF_C_FLAGS "${SLEEF_C_FLAGS} -msse2 -mfpmath=sse")
set(DFT_C_FLAGS "${DFT_C_FLAGS} -msse2 -mfpmath=sse")
endif()
if(CYGWIN OR MINGW)
set(SLEEF_C_FLAGS "${SLEEF_C_FLAGS} -fno-asynchronous-unwind-tables")
set(DFT_C_FLAGS "${DFT_C_FLAGS} -fno-asynchronous-unwind-tables")
endif()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" AND CMAKE_C_COMPILER_ID MATCHES "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 9.3 AND CMAKE_C_COMPILER_VERSION VERSION_LESS 10.2)
set(SLEEF_C_FLAGS "${SLEEF_C_FLAGS} -fno-shrink-wrap -fno-tree-vrp")
set(DFT_C_FLAGS "${DFT_C_FLAGS} -fno-shrink-wrap -fno-tree-vrp")
endif()
# FEATURE DETECTION
# Long double
option(SLEEF_DISABLE_LONG_DOUBLE "Disable long double" OFF)
option(SLEEF_ENFORCE_LONG_DOUBLE "Build fails if long double is not supported by the compiler" OFF)
if(NOT SLEEF_DISABLE_LONG_DOUBLE)
CHECK_TYPE_SIZE("long double" LD_SIZE)
if(LD_SIZE GREATER "9")
# This check is needed since an internal compiler error occurs with gcc 4.x
CHECK_C_SOURCE_COMPILES("
typedef long double vlongdouble __attribute__((vector_size(sizeof(long double)*2)));
vlongdouble vcast_vl_l(long double d) { return (vlongdouble) { d, d }; }
int main() { vlongdouble vld = vcast_vl_l(0);
}" COMPILER_SUPPORTS_LONG_DOUBLE)
endif()
else()
message(STATUS "Support for long double disabled by CMake option")
endif()
if (SLEEF_ENFORCE_LONG_DOUBLE AND NOT COMPILER_SUPPORTS_LONG_DOUBLE)
message(FATAL_ERROR "SLEEF_ENFORCE_LONG_DOUBLE is specified and that feature is disabled or not supported by the compiler")
endif()
# float128
option(SLEEF_DISABLE_FLOAT128 "Disable float128" OFF)
option(SLEEF_ENFORCE_FLOAT128 "Build fails if float128 is not supported by the compiler" OFF)
if(NOT SLEEF_DISABLE_FLOAT128)
CHECK_C_SOURCE_COMPILES("
int main() { __float128 r = 1;
}" COMPILER_SUPPORTS_FLOAT128)
else()
message(STATUS "Support for float128 disabled by CMake option")
endif()
if (SLEEF_ENFORCE_FLOAT128 AND NOT COMPILER_SUPPORTS_FLOAT128)
message(FATAL_ERROR "SLEEF_ENFORCE_FLOAT128 is specified and that feature is disabled or not supported by the compiler")
endif()
if(COMPILER_SUPPORTS_FLOAT128)
CHECK_C_SOURCE_COMPILES("
#include <quadmath.h>
int main() { __float128 r = 1;
}" COMPILER_SUPPORTS_QUADMATH)
endif()
# SSE2
option(SLEEF_DISABLE_SSE2 "Disable SSE2" OFF)
option(SLEEF_ENFORCE_SSE2 "Build fails if SSE2 is not supported by the compiler" OFF)
if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_SSE2)
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_SSE2}")
CHECK_C_SOURCE_COMPILES("
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
int main() {
__m128d r = _mm_mul_pd(_mm_set1_pd(1), _mm_set1_pd(2)); }"
COMPILER_SUPPORTS_SSE2)
endif()
if (SLEEF_ENFORCE_SSE2 AND NOT COMPILER_SUPPORTS_SSE2)
message(FATAL_ERROR "SLEEF_ENFORCE_SSE2 is specified and that feature is disabled or not supported by the compiler")
endif()
# SSE 4.1
option(SLEEF_DISABLE_SSE4 "Disable SSE4" OFF)
option(SLEEF_ENFORCE_SSE4 "Build fails if SSE4 is not supported by the compiler" OFF)
if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_SSE4)
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_SSE4}")
CHECK_C_SOURCE_COMPILES("
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
int main() {
__m128d r = _mm_floor_sd(_mm_set1_pd(1), _mm_set1_pd(2)); }"
COMPILER_SUPPORTS_SSE4)
endif()
if (SLEEF_ENFORCE_SSE4 AND NOT COMPILER_SUPPORTS_SSE4)
message(FATAL_ERROR "SLEEF_ENFORCE_SSE4 is specified and that feature is disabled or not supported by the compiler")
endif()
# AVX
option(SLEEF_ENFORCE_AVX "Disable AVX" OFF)
option(SLEEF_ENFORCE_AVX "Build fails if AVX is not supported by the compiler" OFF)
if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_AVX)
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_AVX}")
CHECK_C_SOURCE_COMPILES("
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
int main() {
__m256d r = _mm256_add_pd(_mm256_set1_pd(1), _mm256_set1_pd(2));
}" COMPILER_SUPPORTS_AVX)
endif()
if (SLEEF_ENFORCE_AVX AND NOT COMPILER_SUPPORTS_AVX)
message(FATAL_ERROR "SLEEF_ENFORCE_AVX is specified and that feature is disabled or not supported by the compiler")
endif()
# FMA4
option(SLEEF_DISABLE_FMA4 "Disable FMA4" OFF)
option(SLEEF_ENFORCE_FMA4 "Build fails if FMA4 is not supported by the compiler" OFF)
if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_FMA4)
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_FMA4}")
CHECK_C_SOURCE_COMPILES("
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
int main() {
__m256d r = _mm256_macc_pd(_mm256_set1_pd(1), _mm256_set1_pd(2), _mm256_set1_pd(3)); }"
COMPILER_SUPPORTS_FMA4)
endif()
if (SLEEF_ENFORCE_FMA4 AND NOT COMPILER_SUPPORTS_FMA4)
message(FATAL_ERROR "SLEEF_ENFORCE_FMA4 is specified and that feature is disabled or not supported by the compiler")
endif()
# AVX2
option(SLEEF_DISABLE_AVX2 "Disable AVX2" OFF)
option(SLEEF_ENFORCE_AVX2 "Build fails if AVX2 is not supported by the compiler" OFF)
if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_AVX2)
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_AVX2}")
CHECK_C_SOURCE_COMPILES("
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
int main() {
__m256i r = _mm256_abs_epi32(_mm256_set1_epi32(1)); }"
COMPILER_SUPPORTS_AVX2)
# AVX2 implies AVX2128
if(COMPILER_SUPPORTS_AVX2)
set(COMPILER_SUPPORTS_AVX2128 1)
endif()
endif()
if (SLEEF_ENFORCE_AVX2 AND NOT COMPILER_SUPPORTS_AVX2)
message(FATAL_ERROR "SLEEF_ENFORCE_AVX2 is specified and that feature is disabled or not supported by the compiler")
endif()
# AVX512F
option(SLEEF_DISABLE_AVX512F "Disable AVX512F" OFF)
option(SLEEF_ENFORCE_AVX512F "Build fails if AVX512F is not supported by the compiler" OFF)
if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_AVX512F)
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_AVX512F}")
CHECK_C_SOURCE_COMPILES("
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
__m512 addConstant(__m512 arg) {
return _mm512_add_ps(arg, _mm512_set1_ps(1.f));
}
int main() {
__m512i a = _mm512_set1_epi32(1);
__m256i ymm = _mm512_extracti64x4_epi64(a, 0);
__mmask16 m = _mm512_cmp_epi32_mask(a, a, _MM_CMPINT_EQ);
__m512i r = _mm512_andnot_si512(a, a); }"
COMPILER_SUPPORTS_AVX512F)
if (COMPILER_SUPPORTS_AVX512F)
set(COMPILER_SUPPORTS_AVX512FNOFMA 1)
endif()
endif()
if (SLEEF_ENFORCE_AVX512F AND NOT COMPILER_SUPPORTS_AVX512F)
message(FATAL_ERROR "SLEEF_ENFORCE_AVX512F is specified and that feature is disabled or not supported by the compiler")
endif()
# SVE
option(SLEEF_DISABLE_SVE "Disable SVE" OFF)
option(SLEEF_ENFORCE_SVE "Build fails if SVE is not supported by the compiler" OFF)
# Darwin does not support SVE yet (see issue #474),
# therefore we disable SVE on Darwin systems.
if(SLEEF_ARCH_AARCH64 AND NOT SLEEF_DISABLE_SVE AND NOT CMAKE_SYSTEM_NAME STREQUAL "Darwin")
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_SVE}")
CHECK_C_SOURCE_COMPILES("
#include <arm_sve.h>
int main() {
svint32_t r = svdup_n_s32(1); }"
COMPILER_SUPPORTS_SVE)
if(COMPILER_SUPPORTS_SVE)
set(COMPILER_SUPPORTS_SVENOFMA 1)
endif()
endif()
if (SLEEF_ENFORCE_SVE AND NOT COMPILER_SUPPORTS_SVE)
message(FATAL_ERROR "SLEEF_ENFORCE_SVE is specified and that feature is disabled or not supported by the compiler")
endif()
# VSX
option(SLEEF_DISABLE_VSX "Disable VSX" OFF)
option(SLEEF_ENFORCE_VSX "Build fails if VSX is not supported by the compiler" OFF)
if(SLEEF_ARCH_PPC64 AND NOT SLEEF_DISABLE_VSX)
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_VSX}")
CHECK_C_SOURCE_COMPILES("
#include <altivec.h>
#ifndef __LITTLE_ENDIAN__
#error \"Only VSX(ISA2.07) little-endian mode is supported \"
#endif
int main() {
vector double d;
vector unsigned char p = {
4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11
};
d = vec_perm(d, d, p);
}"
COMPILER_SUPPORTS_VSX)
if (COMPILER_SUPPORTS_VSX)
set(COMPILER_SUPPORTS_VSXNOFMA 1)
endif()
endif()
if (SLEEF_ENFORCE_VSX AND NOT COMPILER_SUPPORTS_VSX)
message(FATAL_ERROR "SLEEF_ENFORCE_VSX is specified and that feature is disabled or not supported by the compiler")
endif()
# VSX3
option(SLEEF_DISABLE_VSX3 "Disable VSX3" OFF)
option(SLEEF_ENFORCE_VSX3 "Build fails if VSX3 is not supported by the compiler" OFF)
if(SLEEF_ARCH_PPC64 AND NOT SLEEF_DISABLE_VSX3)
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_VSX3}")
CHECK_C_SOURCE_COMPILES("
#include <altivec.h>
#ifndef __LITTLE_ENDIAN__
#error \"Only VSX3 little-endian mode is supported \"
#endif
int main() {
static vector double d;
static vector unsigned long long a, b;
d = vec_insert_exp(a, b);
}"
COMPILER_SUPPORTS_VSX3)
if (COMPILER_SUPPORTS_VSX3)
set(COMPILER_SUPPORTS_VSX3NOFMA 1)
endif()
endif()
if (SLEEF_ENFORCE_VSX3 AND NOT COMPILER_SUPPORTS_VSX3)
message(FATAL_ERROR "SLEEF_ENFORCE_VSX3 is specified and that feature is disabled or not supported by the compiler")
endif()
# IBM Z
option(SLEEF_DISABLE_VXE "Disable VXE" OFF)
option(SLEEF_ENFORCE_VXE "Build fails if VXE is not supported by the compiler" OFF)
if(SLEEF_ARCH_S390X AND NOT SLEEF_DISABLE_VXE)
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_VXE}")
CHECK_C_SOURCE_COMPILES("
#include <vecintrin.h>
int main() {
__vector float d;
d = vec_sqrt(d);
}"
COMPILER_SUPPORTS_VXE)
if(COMPILER_SUPPORTS_VXE)
set(COMPILER_SUPPORTS_VXENOFMA 1)
endif()
endif()
if (SLEEF_ENFORCE_VXE AND NOT COMPILER_SUPPORTS_VXE)
message(FATAL_ERROR "SLEEF_ENFORCE_VXE is specified and that feature is disabled or not supported by the compiler")
endif()
# VXE2
option(SLEEF_DISABLE_VXE2 "Disable VXE2" OFF)
option(SLEEF_ENFORCE_VXE2 "Build fails if VXE2 is not supported by the compiler" OFF)
if(SLEEF_ARCH_S390X AND NOT SLEEF_DISABLE_VXE2)
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_VXE2}")
CHECK_C_SOURCE_COMPILES("
#include <vecintrin.h>
int main() {
__vector float d;
d = vec_sqrt(d);
}"
COMPILER_SUPPORTS_VXE2)
if(COMPILER_SUPPORTS_VXE2)
set(COMPILER_SUPPORTS_VXE2NOFMA 1)
endif()
endif()
if (SLEEF_ENFORCE_VXE2 AND NOT COMPILER_SUPPORTS_VXE2)
message(FATAL_ERROR "SLEEF_ENFORCE_VXE2 is specified and that feature is disabled or not supported by the compiler")
endif()
# RVVM1
option(SLEEF_DISABLE_RVVM1 "Disable RVVM1" OFF)
option(SLEEF_ENFORCE_RVVM1 "Build fails if RVVM1 is not supported by the compiler" OFF)
if(SLEEF_ARCH_RISCV64 AND NOT SLEEF_DISABLE_RVVM1)
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_RVVM1}")
CHECK_C_SOURCE_COMPILES("
#include <riscv_vector.h>
int main() {
vint32m1_t r = __riscv_vmv_v_x_i32m1(1, __riscv_vlenb() * 8 / 32); }"
COMPILER_SUPPORTS_RVVM1)
if(COMPILER_SUPPORTS_RVVM1)
set(COMPILER_SUPPORTS_RVVM1NOFMA 1)
endif()
endif()
if (SLEEF_ENFORCE_RVVM1 AND NOT COMPILER_SUPPORTS_RVVM1)
message(FATAL_ERROR "SLEEF_ENFORCE_RVVM1 is specified and that feature is disabled or not supported by the compiler")
endif()
# RVVM2
option(SLEEF_DISABLE_RVVM2 "Disable RVVM2" OFF)
option(SLEEF_ENFORCE_RVVM2 "Build fails if RVVM2 is not supported by the compiler" OFF)
if(SLEEF_ARCH_RISCV64 AND NOT SLEEF_DISABLE_RVVM2)
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_RVVM2}")
CHECK_C_SOURCE_COMPILES("
#include <riscv_vector.h>
int main() {
vint32m2_t r = __riscv_vmv_v_x_i32m2(1, 2 * __riscv_vlenb() * 8 / 32); }"
COMPILER_SUPPORTS_RVVM2)
if(COMPILER_SUPPORTS_RVVM2)
set(COMPILER_SUPPORTS_RVVM2NOFMA 1)
endif()
endif()
if (SLEEF_ENFORCE_RVVM2 AND NOT COMPILER_SUPPORTS_RVVM2)
message(FATAL_ERROR "SLEEF_ENFORCE_RVVM2 is specified and that feature is disabled or not supported by the compiler")
endif()
# CUDA
option(SLEEF_ENFORCE_CUDA "Build fails if CUDA is not supported" OFF)
if (SLEEF_ENFORCE_CUDA AND NOT CMAKE_CUDA_COMPILER)
message(FATAL_ERROR "SLEEF_ENFORCE_CUDA is specified and that feature is disabled or not supported by the compiler")
endif()
# OpenMP
option(SLEEF_DISABLE_OPENMP "Disable OPENMP" OFF)
option(SLEEF_ENFORCE_OPENMP "Build fails if OPENMP is not supported by the compiler" OFF)
if(NOT SLEEF_DISABLE_OPENMP)
find_package(OpenMP)
# Check if compilation with OpenMP really succeeds
# It might not succeed even though find_package(OpenMP) succeeds.
if(OPENMP_FOUND)
set (CMAKE_REQUIRED_FLAGS "${OpenMP_C_FLAGS}")
CHECK_C_SOURCE_COMPILES("
#include <stdio.h>
int main() {
int i;
#pragma omp parallel for
for(i=0;i < 10;i++) { putchar(0); }
}"
COMPILER_SUPPORTS_OPENMP)
CHECK_C_SOURCE_COMPILES("
#pragma omp declare simd notinbranch
double func(double x) { return x + 1; }
double a[1024];
int main() {
#pragma omp parallel for simd
for (int i = 0; i < 1024; i++) a[i] = func(a[i]);
}
"
COMPILER_SUPPORTS_OMP_SIMD)
endif(OPENMP_FOUND)
else()
message(STATUS "Support for OpenMP disabled by CMake option")
endif()
if (SLEEF_ENFORCE_OPENMP AND NOT COMPILER_SUPPORTS_OPENMP)
message(FATAL_ERROR "SLEEF_ENFORCE_OPENMP is specified and that feature is disabled or not supported by the compiler")
endif()
# Weak aliases
CHECK_C_SOURCE_COMPILES("
#if defined(__CYGWIN__)
#define EXPORT __stdcall __declspec(dllexport)
#else
#define EXPORT
#endif
EXPORT int f(int a) {
return a + 2;
}
EXPORT int g(int a) __attribute__((weak, alias(\"f\")));
int main(void) {
return g(2);
}"
COMPILER_SUPPORTS_WEAK_ALIASES)
if (COMPILER_SUPPORTS_WEAK_ALIASES AND
NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm" AND
NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64" AND
NOT SLEEF_CLANG_ON_WINDOWS AND
NOT MINGW AND SLEEF_BUILD_GNUABI_LIBS)
set(ENABLE_GNUABI ${COMPILER_SUPPORTS_WEAK_ALIASES})
endif()
# Built-in math functions
CHECK_C_SOURCE_COMPILES("
int main(void) {
double a = __builtin_sqrt (2);
float b = __builtin_sqrtf(2);
}"
COMPILER_SUPPORTS_BUILTIN_MATH)
# SYS_getrandom
CHECK_C_SOURCE_COMPILES("
#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/random.h>
int main(void) {
int i;
syscall(SYS_getrandom, &i, sizeof(i), 0);
}"
COMPILER_SUPPORTS_SYS_GETRANDOM)
#
# Reset used flags
set(CMAKE_REQUIRED_FLAGS)
set(CMAKE_REQUIRED_LIBRARIES)
# Save the default C flags
set(ORG_CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
##
# Check if sde64 command is available
find_program(SDE_COMMAND sde64)
if (NOT SDE_COMMAND)
find_program(SDE_COMMAND sde)
endif()
# Check if armie command is available
find_program(ARMIE_COMMAND armie)
if (NOT SVE_VECTOR_BITS)
set(SVE_VECTOR_BITS 128)
endif()
#
find_program(FILECHECK_COMMAND NAMES FileCheck FileCheck-11 FileCheck-10 FileCheck-9)
#
find_program(SED_COMMAND sed)
##
if(SLEEF_SHOW_ERROR_LOG)
if (EXISTS ${PROJECT_BINARY_DIR}/CMakeFiles/CMakeError.log)
file(READ ${PROJECT_BINARY_DIR}/CMakeFiles/CMakeError.log FILE_CONTENT)
message("")
message("")
message("====== Content of CMakeError.log ======")
message("")
message("${FILE_CONTENT}")
message("")
message("======== End of CMakeError.log ========")
message("")
message("")
endif()
endif(SLEEF_SHOW_ERROR_LOG)
if (MSVC OR SLEEF_CLANG_ON_WINDOWS)
set(COMPILER_SUPPORTS_OPENMP FALSE) # At this time, OpenMP is not supported on MSVC
endif()
##
# Set common definitions
if (NOT BUILD_SHARED_LIBS)
set(COMMON_TARGET_DEFINITIONS SLEEF_STATIC_LIBS=1)
set(SLEEF_STATIC_LIBS 1)
endif()
if (COMPILER_SUPPORTS_WEAK_ALIASES)
set(COMMON_TARGET_DEFINITIONS ${COMMON_TARGET_DEFINITIONS} ENABLE_ALIAS=1)
endif()
if (COMPILER_SUPPORTS_SYS_GETRANDOM)
set(COMMON_TARGET_DEFINITIONS ${COMMON_TARGET_DEFINITIONS} ENABLE_SYS_getrandom=1)
endif()

View File

@@ -0,0 +1,23 @@
Boost Software License - Version 1.0 - August 17th, 2003
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

View File

@@ -0,0 +1,221 @@
# SLEEF
![Github Actions](https://github.com/shibatch/sleef/actions/workflows/build_and_test.yml/badge.svg?event=push&branch=master)
[![DOI:10.1109/TPDS.2019.2960333](http://img.shields.io/badge/DOI-10.1109/TPDS.2019.2960333-blue.svg)](https://ieeexplore.ieee.org/document/8936472)
[![License](https://img.shields.io/badge/License-Boost_1.0-lightblue.svg)](https://www.boost.org/LICENSE_1_0.txt)
![CMake](https://img.shields.io/badge/cmake-v3.18+-yellow.svg)
[![Spack](https://img.shields.io/spack/v/sleef)](https://spack.readthedocs.io/en/v0.16.2/package_list.html#sleef)
[![SourceForge Downloads](https://img.shields.io/sourceforge/dt/sleef)](https://sourceforge.net/projects/sleef/)
SLEEF is a library that implements vectorized versions of C standard math functions. This library also includes DFT subroutines.
- **Web Page:** [https://sleef.org/][webpage_url]
- **Sources:** [https://github.com/shibatch/sleef][repo_url]
## Supported environment
### Test matrix
The following table summarises the currently supported vector extensions, compilers and OSes.
:green_circle: : Tested extensively in CI.
:yellow_circle: : Tested partially in CI.
:x: : Currently failing some tests in CI.
:white_circle: : Not tested in CI. Might have passed tests in previous CI framework.
[This issue](https://github.com/shibatch/sleef/issues/481) tracks progress on improving test coverage.
Compilation of SLEEF on previously supported environments might still be safe; we just cannot verify it yet.
<table>
<tr>
<th colspan="2" rowspan="2"></th>
<th colspan="9">OS/Compiler</th>
</tr>
<tr>
<th colspan="3">Linux</th>
<th colspan="2">macOS</th>
<th colspan="4">Windows</th>
</tr>
<tr>
<th>Arch.</th>
<th>Vector Extensions</th>
<th>gcc</th><th>llvm</th><th>icc</th>
<th>gcc</th><th>llvm</th>
<th>gcc</th><th>llvm-gnu</th><th>llvm-msvc</th><th>msvc</th>
</tr>
<tr align="center"><th>x86_64</th><th>SSE2, SSE4,<br>AVX, AVX2, AVX512F</th>
<td>:green_circle:</td><td>:green_circle:</td><td>:white_circle:</td>
<td>:white_circle:</td><td>:green_circle:</td>
<td>:white_circle:</td><td>:yellow_circle:</td><td>:white_circle:</td><td>:white_circle:</td>
</tr>
<tr align="center"><th>x86 32bit<br>(i386)</th><th>SSE</th>
<td>:white_circle:</td><td>:white_circle:</td><td>:white_circle:</td>
<td colspan="2">N/A</td>
<td>:white_circle:</td><td>:white_circle:</td><td>:white_circle:</td><td>:white_circle:</td>
</tr>
<tr align="center"><th>AArch64<br>(arm)</th><th>Neon, SVE</th>
<td>:green_circle:</td><td>:green_circle:</td><td>N/A</td>
<td colspan="1">N/A</td><td>:green_circle:</td>
<td colspan="1">N/A</td><td>:white_circle:</td><td>:white_circle:</td><td>:white_circle:</td>
</tr>
<tr align="center"><th>AArch32<br>(armhf)</th><th>NEON</th>
<td>:green_circle:</td><td>:green_circle:</td><td>N/A</td>
<td colspan="2">N/A</td>
<td colspan="4">N/A</td>
</tr>
<tr align="center"><th>PowerPC<br>(ppc64el)</th><th>VSX, VSX3</th>
<td>:green_circle:</td><td>:green_circle:</td><td>N/A</td>
<td colspan="2">N/A</td>
<td colspan="4">N/A</td>
</tr>
<tr align="center"><th>IBM/Z<br>(s390x)</th><th>VXE, VXE2</th>
<td>:green_circle:</td><td>:green_circle:</td><td>N/A</td>
<td colspan="2">N/A</td>
<td colspan="4">N/A</td>
</tr>
<tr align="center"><th>RISC-V<br>(riscv64)</th><th>RVV1, RVV2</th>
<td>N/A (14+)</td><td>:green_circle:</td><td>N/A</td>
<td colspan="2">N/A</td>
<td colspan="4">N/A</td>
</tr>
</table>
### Component support
The above table is valid for libm in single, double and quadruple precision, as well as for the fast Discrete Fourier Transform (DFT).
Generation of inline headers is also supported for most vector extensions.
LTO is not tested in CI yet, except on Windows.
### Compiler support
Results are displayed for gcc 11 and llvm 17, the compiler versions used in CI tests with GitHub Actions.
Older versions should also be supported, while newer ones are either not tested or have known issues.
Some compiler versions simply do not support certain vector extensions; for instance, SVE is only supported from gcc version 9 onwards.
Similarly, the RISC-V interface in SLEEF is based on version 1.0 of the intrinsics, which is only supported from llvm version 17 and gcc version 14 onwards.
Toolchain files provide some information on supported compiler versions.
### OS support
Only Linux distributions and macOS are fully tested in CI and thus officially supported.
Building SLEEF for Windows on x86 machines has been officially supported (:white_circle:) since 3.5.1;
however, it is only partially tested due to [known limitations of the test suite with MinGW or MSYS2](https://github.com/shibatch/sleef/issues/544).
As a result, tests for Windows on x86 only include the DFT for now (other tests are disabled in the build system),
but all components are built.
Support for iOS and Android on AArch64 is only preliminary.
SVE is not supported on Darwin-based systems and is therefore automatically disabled by SLEEF on Darwin.
### More on supported environment
Refer to our web page for [more on supported environment][supported_env_url].
## Install SLEEF dependencies
The library itself does not have any additional dependencies.
However some tests require:
- libssl and libcrypto, which can be provided by installing openssl.
- libm, libgmp and libmpfr.
- libfftw3.
These tests can be disabled if necessary.
## How to build SLEEF
We recommend relying on CMake as much as possible in the build process to ensure portability.
**CMake 3.18+** is the minimum required.
1. Check out the source code from our GitHub repository
```
git clone https://github.com/shibatch/sleef
```
2. Make a separate directory to create an out-of-source build
```
cd sleef && mkdir build
```
3. Run cmake to configure the project
```
cmake -S . -B build
```
By default this will generate shared libraries. In order to generate static libraries, pass the option `-DBUILD_SHARED_LIBS=OFF`.
For more verbose output add the option `-DSLEEF_SHOW_CONFIG=ON`; a combined invocation is sketched after step 5 below.
4. Run cmake to build the project
```
cmake --build build -j --clean-first
```
5. Run the tests using ctest
```
ctest --test-dir build -j
```
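Putting these pieces together, a static build with verbose configuration output can be configured, built and tested as follows (a sketch that simply combines the commands and options above):
```
cmake -S . -B build -DBUILD_SHARED_LIBS=OFF -DSLEEF_SHOW_CONFIG=ON
cmake --build build -j --clean-first
ctest --test-dir build -j
```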
For more detailed build instructions please refer to the [dedicated section on CMake](./docs/build-with-cmake.md) or to [our web page][build_info_url].
## Install SLEEF
### From source
Assuming the build instructions above were followed:
6. Install to a specified directory `<prefix>`
```
cmake --install build --prefix=<prefix>
```
### Using Spack
SLEEF can also be directly installed using Spack.
```
spack install sleef@master
```
### Uninstall
In order to uninstall the SLEEF library and headers, run
```
sudo xargs rm -v < build/install_manifest.txt
```
## License
The software is distributed under the Boost Software License, Version 1.0.
See accompanying file [LICENSE.txt](./LICENSE.txt) or copy at [http://www.boost.org/LICENSE_1_0.txt][license_url].
Contributions to this project are accepted under the same license.
Copyright &copy; 2010-2024 SLEEF Project, Naoki Shibata and contributors.<br/>
<!-- Repository links -->
[webpage_url]: https://sleef.org/
[build_info_url]: https://sleef.org/compile.xhtml
[supported_env_url]: https://sleef.org/index.xhtml#environment
[repo_url]: https://github.com/shibatch/sleef
[repo_license_url]: https://github.com/shibatch/sleef/blob/main/LICENSE.txt
[license_url]: http://www.boost.org/LICENSE_1_0.txt

View File

@@ -0,0 +1,71 @@
#ifndef __SLEEFDFT_H__
#define __SLEEFDFT_H__
#ifdef __cplusplus
extern "C"
{
#endif
#include <stdlib.h>
#include <stdint.h>
#define SLEEF_MODE_FORWARD (0 << 0)
#define SLEEF_MODE_BACKWARD (1 << 0)
#define SLEEF_MODE_COMPLEX (0 << 1)
#define SLEEF_MODE_REAL (1 << 1)
#define SLEEF_MODE_ALT (1 << 2)
#define SLEEF_MODE_FFTWCOMPAT (1 << 3)
#define SLEEF_MODE_DEBUG (1 << 10)
#define SLEEF_MODE_VERBOSE (1 << 11)
#define SLEEF_MODE_NO_MT (1 << 12)
#define SLEEF_MODE_ESTIMATE (1 << 20)
#define SLEEF_MODE_MEASURE (2 << 20)
#if (defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) || defined(_MSC_VER)) && !defined(SLEEF_STATIC_LIBS)
#ifdef IMPORT_IS_EXPORT
#define IMPORT __declspec(dllexport)
#else // #ifdef IMPORT_IS_EXPORT
#define IMPORT __declspec(dllimport)
#if (defined(_MSC_VER))
#pragma comment(lib,"sleefdft.lib")
#endif // #if (defined(_MSC_VER))
#endif // #ifdef IMPORT_IS_EXPORT
#else // #if (defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) || defined(_MSC_VER)) && !defined(SLEEF_STATIC_LIBS)
#define IMPORT
#endif // #if (defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) || defined(_MSC_VER)) && !defined(SLEEF_STATIC_LIBS)
IMPORT struct SleefDFT *SleefDFT_double_init1d(uint32_t n, const double *in, double *out, uint64_t mode);
IMPORT struct SleefDFT *SleefDFT_double_init2d(uint32_t n, uint32_t m, const double *in, double *out, uint64_t mode);
IMPORT void SleefDFT_double_execute(struct SleefDFT *ptr, const double *in, double *out);
IMPORT struct SleefDFT *SleefDFT_float_init1d(uint32_t n, const float *in, float *out, uint64_t mode);
IMPORT struct SleefDFT *SleefDFT_float_init2d(uint32_t n, uint32_t m, const float *in, float *out, uint64_t mode);
IMPORT void SleefDFT_float_execute(struct SleefDFT *ptr, const float *in, float *out);
IMPORT void SleefDFT_dispose(struct SleefDFT *ptr);
IMPORT void SleefDFT_setPath(struct SleefDFT *ptr, char *pathStr);
//
IMPORT void SleefDFT_setPlanFilePath(const char *path, const char *arch, uint64_t mode);
#define SLEEF_PLAN_AUTOMATIC 0
#define SLEEF_PLAN_READONLY (1 << 0)
#define SLEEF_PLAN_RESET (1 << 1)
#define SLEEF_PLAN_BUILDALLPLAN (1 << 2)
#define SLEEF_PLAN_NOLOCK (1 << 3)
#define SLEEF_PLAN_MEASURE (1 << 29)
#define SLEEF_PLAN_REFERTOENVVAR (1 << 30)
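/*
  Minimal usage sketch (illustrative; error handling is omitted, and the
  interleaved re/im layout of the 2*N-element buffers is an assumption
  rather than something documented in this header):

    #define N 256
    double in[2 * N], out[2 * N];
    struct SleefDFT *p = SleefDFT_double_init1d(N, in, out,
        SLEEF_MODE_FORWARD | SLEEF_MODE_ESTIMATE);
    SleefDFT_double_execute(p, in, out);
    SleefDFT_dispose(p);
*/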
#undef IMPORT
#ifdef __cplusplus
}
#endif
#endif // #ifndef __SLEEFDFT_H__

View File

@@ -0,0 +1,11 @@
// Configuration of @PROJECT_NAME@ /////////////////////////////////////////////
#ifndef SLEEF_CONFIG_H
#define SLEEF_CONFIG_H
#define SLEEF_VERSION_MAJOR @SLEEF_VERSION_MAJOR@
#define SLEEF_VERSION_MINOR @SLEEF_VERSION_MINOR@
#cmakedefine SLEEF_STATIC_LIBS
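// At configure time, the #cmakedefine line above expands to either
// "#define SLEEF_STATIC_LIBS" or "/* #undef SLEEF_STATIC_LIBS */",
// depending on whether the corresponding CMake variable is set.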
#endif // SLEEF_CONFIG_H

View File

@@ -0,0 +1 @@
include("${CMAKE_CURRENT_LIST_DIR}/sleefTargets.cmake")

View File

@@ -0,0 +1,22 @@
include_directories("common")
include_directories("arch")
add_subdirectory("libm")
if (SLEEF_BUILD_TESTS AND NOT MINGW)
add_subdirectory("libm-tester")
endif()
add_subdirectory("common")
if (SLEEF_BUILD_DFT)
add_subdirectory("dft")
if (SLEEF_BUILD_TESTS)
add_subdirectory("dft-tester")
endif()
endif()
if (SLEEF_BUILD_QUAD)
add_subdirectory("quad")
if (SLEEF_BUILD_TESTS AND NOT MINGW)
add_subdirectory("quad-tester")
endif()
endif()

View File

@@ -0,0 +1,837 @@
/*********************************************************************/
/* Copyright ARM Ltd. 2010 - 2024. */
/* Distributed under the Boost Software License, Version 1.0. */
/* (See accompanying file LICENSE.txt or copy at */
/* http://www.boost.org/LICENSE_1_0.txt) */
/*********************************************************************/
#if !defined(__ARM_NEON) && !defined(SLEEF_GENHEADER)
#error Please specify advsimd flags.
#endif
#if !defined(SLEEF_GENHEADER)
#include <arm_neon.h>
#include <stdint.h>
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)
#define ENABLE_DP
//@#define ENABLE_DP
#define LOG2VECTLENDP 1
//@#define LOG2VECTLENDP 1
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)
#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP 2
//@#define LOG2VECTLENSP 2
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)
#if CONFIG == 1
#define ENABLE_FMA_DP
//@#define ENABLE_FMA_DP
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP
#endif
#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT
#define ISANAME "AArch64 AdvSIMD"
// Mask definition
typedef uint32x4_t vmask;
typedef uint32x4_t vopmask;
// Single precision definitions
typedef float32x4_t vfloat;
typedef int32x4_t vint2;
// Double precision definitions
typedef float64x2_t vdouble;
typedef int32x2_t vint;
typedef int64x2_t vint64;
typedef uint64x2_t vuint64;
typedef struct {
vmask x, y;
} vquad;
typedef vquad vargquad;
#define DFTPRIORITY 10
static INLINE int vavailability_i(int name) { return 3; }
static INLINE void vprefetch_v_p(const void *ptr) { }
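// Reduce an opmask to a scalar: AND the low and high 64-bit halves together,
// then take the pairwise minimum of the two remaining 32-bit lanes; the
// result is nonzero only if every lane of the mask was all-ones.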
static INLINE VECTOR_CC int vtestallones_i_vo32(vopmask g) {
uint32x2_t x0 = vand_u32(vget_low_u32(g), vget_high_u32(g));
uint32x2_t x1 = vpmin_u32(x0, x0);
return vget_lane_u32(x1, 0);
}
static INLINE VECTOR_CC int vtestallones_i_vo64(vopmask g) {
uint32x2_t x0 = vand_u32(vget_low_u32(g), vget_high_u32(g));
uint32x2_t x1 = vpmin_u32(x0, x0);
return vget_lane_u32(x1, 0);
}
// Vector load / store
static INLINE VECTOR_CC vdouble vload_vd_p(const double *ptr) { return vld1q_f64(ptr); }
static INLINE VECTOR_CC vdouble vloadu_vd_p(const double *ptr) { return vld1q_f64(ptr); }
static INLINE VECTOR_CC void vstore_v_p_vd(double *ptr, vdouble v) { vst1q_f64(ptr, v); }
static INLINE VECTOR_CC void vstoreu_v_p_vd(double *ptr, vdouble v) { vst1q_f64(ptr, v); }
static INLINE VECTOR_CC vfloat vload_vf_p(const float *ptr) { return vld1q_f32(ptr); }
static INLINE VECTOR_CC vfloat vloadu_vf_p(const float *ptr) { return vld1q_f32(ptr); }
static INLINE VECTOR_CC void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }
static INLINE VECTOR_CC void vstoreu_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }
static INLINE VECTOR_CC vint2 vloadu_vi2_p(int32_t *p) { return vld1q_s32(p); }
static INLINE VECTOR_CC void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
static INLINE VECTOR_CC vint vloadu_vi_p(int32_t *p) { return vld1_s32(p); }
static INLINE VECTOR_CC void vstoreu_v_p_vi(int32_t *p, vint v) { vst1_s32(p, v); }
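// Gathers: AdvSIMD has no gather instruction, so indexed loads are emulated
// with per-lane scalar loads.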
static INLINE VECTOR_CC vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
return ((vdouble) { ptr[vget_lane_s32(vi, 0)], ptr[vget_lane_s32(vi, 1)]} );
}
static INLINE VECTOR_CC vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
return ((vfloat) {
ptr[vgetq_lane_s32(vi2, 0)],
ptr[vgetq_lane_s32(vi2, 1)],
ptr[vgetq_lane_s32(vi2, 2)],
ptr[vgetq_lane_s32(vi2, 3)]
});
}
// Basic logical operations for mask
static INLINE VECTOR_CC vmask vand_vm_vm_vm(vmask x, vmask y) { return vandq_u32(x, y); }
static INLINE VECTOR_CC vmask vandnot_vm_vm_vm(vmask x, vmask y) {
return vbicq_u32(y, x);
}
static INLINE VECTOR_CC vmask vor_vm_vm_vm(vmask x, vmask y) { return vorrq_u32(x, y); }
static INLINE VECTOR_CC vmask vxor_vm_vm_vm(vmask x, vmask y) { return veorq_u32(x, y); }
// Mask <--> single precision reinterpret
static INLINE VECTOR_CC vmask vreinterpret_vm_vf(vfloat vf) {
return vreinterpretq_u32_f32(vf);
}
static INLINE VECTOR_CC vfloat vreinterpret_vf_vm(vmask vm) {
return vreinterpretq_f32_u32(vm);
}
static INLINE VECTOR_CC vint2 vcast_vi2_vm(vmask vm) { return vreinterpretq_s32_u32(vm); }
static INLINE VECTOR_CC vmask vcast_vm_vi2(vint2 vi) { return vreinterpretq_u32_s32(vi); }
// Mask <--> double precision reinterpret
static INLINE VECTOR_CC vmask vreinterpret_vm_vd(vdouble vd) {
return vreinterpretq_u32_f64(vd);
}
static INLINE VECTOR_CC vdouble vreinterpret_vd_vm(vmask vm) {
return vreinterpretq_f64_u32(vm);
}
static INLINE VECTOR_CC vfloat vreinterpret_vf_vi2(vint2 vm) {
return vreinterpretq_f32_s32(vm);
}
static INLINE VECTOR_CC vint2 vreinterpret_vi2_vf(vfloat vf) {
return vreinterpretq_s32_f32(vf);
}
/****************************************/
/* Single precision FP operations */
/****************************************/
// Broadcast
static INLINE VECTOR_CC vfloat vcast_vf_f(float f) { return vdupq_n_f32(f); }
// Add, Sub, Mul
static INLINE VECTOR_CC vfloat vadd_vf_vf_vf(vfloat x, vfloat y) {
return vaddq_f32(x, y);
}
static INLINE VECTOR_CC vfloat vsub_vf_vf_vf(vfloat x, vfloat y) {
return vsubq_f32(x, y);
}
static INLINE VECTOR_CC vfloat vmul_vf_vf_vf(vfloat x, vfloat y) {
return vmulq_f32(x, y);
}
// |x|, -x
static INLINE VECTOR_CC vfloat vabs_vf_vf(vfloat f) { return vabsq_f32(f); }
static INLINE VECTOR_CC vfloat vneg_vf_vf(vfloat f) { return vnegq_f32(f); }
#if CONFIG == 1
// Multiply accumulate: z = z + x * y
static INLINE VECTOR_CC vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {
return vfmaq_f32(z, x, y);
}
// Multiply subtract: z = z - x * y
static INLINE VECTOR_CC vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {
return vfmsq_f32(z, x, y);
}
// Multiply subtract: z = x * y - z
static INLINE VECTOR_CC vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {
return vneg_vf_vf(vfmsq_f32(z, x, y));
}
#else
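// Without FMA (CONFIG != 1), multiply-accumulate is emulated with a separate
// multiply and add, so results may differ in the last bit from the fused path.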
static INLINE VECTOR_CC vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
static INLINE VECTOR_CC vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }
static INLINE VECTOR_CC vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
#endif
static INLINE VECTOR_CC vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z + x * y
return vfmaq_f32(z, x, y);
}
static INLINE VECTOR_CC vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z - x * y
return vfmsq_f32(z, x, y);
}
static INLINE VECTOR_CC vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // x * y - z
return vfma_vf_vf_vf_vf(x, y, vneg_vf_vf(z));
}
// Reciprocal 1/x, Division, Square root
static INLINE VECTOR_CC vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) {
#ifndef SLEEF_ENABLE_ALTDIV
return vdivq_f32(n, d);
#else
// Finite numbers (including denormals) only; gives a mostly correctly rounded result
float32x4_t t, u, x, y;
uint32x4_t i0, i1;
i0 = vandq_u32(vreinterpretq_u32_f32(n), vdupq_n_u32(0x7c000000));
i1 = vandq_u32(vreinterpretq_u32_f32(d), vdupq_n_u32(0x7c000000));
i0 = vsubq_u32(vdupq_n_u32(0x7d000000), vshrq_n_u32(vaddq_u32(i0, i1), 1));
t = vreinterpretq_f32_u32(i0);
y = vmulq_f32(d, t);
x = vmulq_f32(n, t);
t = vrecpeq_f32(y);
t = vmulq_f32(t, vrecpsq_f32(y, t));
t = vmulq_f32(t, vrecpsq_f32(y, t));
u = vmulq_f32(x, t);
u = vfmaq_f32(u, vfmsq_f32(x, y, u), t);
return u;
#endif
}
static INLINE VECTOR_CC vfloat vrec_vf_vf(vfloat d) {
#ifndef SLEEF_ENABLE_ALTDIV
return vdiv_vf_vf_vf(vcast_vf_f(1.0f), d);
#else
return vbslq_f32(vceqq_f32(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)),
vcast_vf_f(0), vdiv_vf_vf_vf(vcast_vf_f(1.0f), d));
#endif
}
static INLINE VECTOR_CC vfloat vsqrt_vf_vf(vfloat d) {
#ifndef SLEEF_ENABLE_ALTSQRT
return vsqrtq_f32(d);
#else
// Gives a correctly rounded result for the whole input range
vfloat w, x, y, z;
y = vrsqrteq_f32(d);
x = vmul_vf_vf_vf(d, y); w = vmul_vf_vf_vf(vcast_vf_f(0.5), y);
y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(0.5));
x = vfma_vf_vf_vf_vf(x, y, x); w = vfma_vf_vf_vf_vf(w, y, w);
y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(1.5)); w = vadd_vf_vf_vf(w, w);
w = vmul_vf_vf_vf(w, y);
x = vmul_vf_vf_vf(w, d);
y = vfmapn_vf_vf_vf_vf(w, d, x); z = vfmanp_vf_vf_vf_vf(w, x, vcast_vf_f(1));
z = vfmanp_vf_vf_vf_vf(w, y, z); w = vmul_vf_vf_vf(vcast_vf_f(0.5), x);
w = vfma_vf_vf_vf_vf(w, z, y);
w = vadd_vf_vf_vf(w, x);
return vbslq_f32(vorrq_u32(vceqq_f32(d, vcast_vf_f(0)),
vceqq_f32(d, vcast_vf_f(SLEEF_INFINITYf))), d, w);
#endif
}
// max, min
static INLINE VECTOR_CC vfloat vmax_vf_vf_vf(vfloat x, vfloat y) {
return vmaxq_f32(x, y);
}
static INLINE VECTOR_CC vfloat vmin_vf_vf_vf(vfloat x, vfloat y) {
return vminq_f32(x, y);
}
// Comparisons
static INLINE VECTOR_CC vmask veq_vm_vf_vf(vfloat x, vfloat y) { return vceqq_f32(x, y); }
static INLINE VECTOR_CC vmask vneq_vm_vf_vf(vfloat x, vfloat y) {
return vmvnq_u32(vceqq_f32(x, y));
}
static INLINE VECTOR_CC vmask vlt_vm_vf_vf(vfloat x, vfloat y) { return vcltq_f32(x, y); }
static INLINE VECTOR_CC vmask vle_vm_vf_vf(vfloat x, vfloat y) { return vcleq_f32(x, y); }
static INLINE VECTOR_CC vmask vgt_vm_vf_vf(vfloat x, vfloat y) { return vcgtq_f32(x, y); }
static INLINE VECTOR_CC vmask vge_vm_vf_vf(vfloat x, vfloat y) { return vcgeq_f32(x, y); }
// Conditional select
static INLINE VECTOR_CC vfloat vsel_vf_vm_vf_vf(vmask mask, vfloat x, vfloat y) {
return vbslq_f32(mask, x, y);
}
// int <--> float conversions
static INLINE VECTOR_CC vint2 vtruncate_vi2_vf(vfloat vf) { return vcvtq_s32_f32(vf); }
static INLINE VECTOR_CC vfloat vcast_vf_vi2(vint2 vi) { return vcvtq_f32_s32(vi); }
static INLINE VECTOR_CC vint2 vcast_vi2_i(int i) { return vdupq_n_s32(i); }
static INLINE VECTOR_CC vint2 vrint_vi2_vf(vfloat d) {
return vcvtq_s32_f32(vrndnq_f32(d));
}
/***************************************/
/* Single precision integer operations */
/***************************************/
// Add, Sub, Neg (-x)
static INLINE VECTOR_CC vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) {
return vaddq_s32(x, y);
}
static INLINE VECTOR_CC vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) {
return vsubq_s32(x, y);
}
static INLINE VECTOR_CC vint2 vneg_vi2_vi2(vint2 e) { return vnegq_s32(e); }
// Logical operations
static INLINE VECTOR_CC vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) {
return vandq_s32(x, y);
}
static INLINE VECTOR_CC vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) {
return vbicq_s32(y, x);
}
static INLINE VECTOR_CC vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) {
return vorrq_s32(x, y);
}
static INLINE VECTOR_CC vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) {
return veorq_s32(x, y);
}
// Shifts
#define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c)
//@#define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c)
#define vsrl_vi2_vi2_i(x, c) \
vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c))
//@#define vsrl_vi2_vi2_i(x, c) vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c))
#define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c)
//@#define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c)
#define vsra_vi_vi_i(x, c) vshr_n_s32(x, c)
//@#define vsra_vi_vi_i(x, c) vshr_n_s32(x, c)
#define vsll_vi_vi_i(x, c) vshl_n_s32(x, c)
//@#define vsll_vi_vi_i(x, c) vshl_n_s32(x, c)
#define vsrl_vi_vi_i(x, c) \
vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(x), c))
//@#define vsrl_vi_vi_i(x, c) vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(x), c))
// Comparison returning masks
static INLINE VECTOR_CC vmask veq_vm_vi2_vi2(vint2 x, vint2 y) { return vceqq_s32(x, y); }
static INLINE VECTOR_CC vmask vgt_vm_vi2_vi2(vint2 x, vint2 y) { return vcgtq_s32(x, y); }
// Comparison returning integers
static INLINE VECTOR_CC vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) {
return vreinterpretq_s32_u32(vcgtq_s32(x, y));
}
static INLINE VECTOR_CC vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
return vreinterpretq_s32_u32(vceqq_s32(x, y));
}
// Conditional select
static INLINE VECTOR_CC vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) {
return vbslq_s32(m, x, y);
}
/* -------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------- */
/****************************************/
/* Double precision FP operations */
/****************************************/
// Broadcast
static INLINE VECTOR_CC vdouble vcast_vd_d(double f) { return vdupq_n_f64(f); }
// Add, Sub, Mul
static INLINE VECTOR_CC vdouble vadd_vd_vd_vd(vdouble x, vdouble y) {
return vaddq_f64(x, y);
}
static INLINE VECTOR_CC vdouble vsub_vd_vd_vd(vdouble x, vdouble y) {
return vsubq_f64(x, y);
}
static INLINE VECTOR_CC vdouble vmul_vd_vd_vd(vdouble x, vdouble y) {
return vmulq_f64(x, y);
}
// |x|, -x
static INLINE VECTOR_CC vdouble vabs_vd_vd(vdouble f) { return vabsq_f64(f); }
static INLINE VECTOR_CC vdouble vneg_vd_vd(vdouble f) { return vnegq_f64(f); }
// max, min
static INLINE VECTOR_CC vdouble vmax_vd_vd_vd(vdouble x, vdouble y) {
return vmaxq_f64(x, y);
}
static INLINE VECTOR_CC vdouble vmin_vd_vd_vd(vdouble x, vdouble y) {
return vminq_f64(x, y);
}
#if CONFIG == 1
// Multiply accumulate: z = z + x * y
static INLINE VECTOR_CC vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
return vfmaq_f64(z, x, y);
}
static INLINE VECTOR_CC vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
return vfmsq_f64(z, x, y);
}
// Multiply subtract: z = x * y - z
static INLINE VECTOR_CC vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
return vneg_vd_vd(vfmsq_f64(z, x, y));
}
#else
static INLINE VECTOR_CC vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE VECTOR_CC vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
#endif
static INLINE VECTOR_CC vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // z + x * y
return vfmaq_f64(z, x, y);
}
static INLINE VECTOR_CC vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // z - x * y
return vfmsq_f64(z, x, y);
}
static INLINE VECTOR_CC vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // x * y - z
return vfma_vd_vd_vd_vd(x, y, vneg_vd_vd(z));
}
// Reciprocal 1/x, Division, Square root
static INLINE VECTOR_CC vdouble vdiv_vd_vd_vd(vdouble n, vdouble d) {
#ifndef SLEEF_ENABLE_ALTDIV
return vdivq_f64(n, d);
#else
// Finite numbers (including denormals) only; gives a mostly correctly rounded result
float64x2_t t, u, x, y;
uint64x2_t i0, i1;
i0 = vandq_u64(vreinterpretq_u64_f64(n), vdupq_n_u64(0x7fc0000000000000L));
i1 = vandq_u64(vreinterpretq_u64_f64(d), vdupq_n_u64(0x7fc0000000000000L));
i0 = vsubq_u64(vdupq_n_u64(0x7fd0000000000000L), vshrq_n_u64(vaddq_u64(i0, i1), 1));
t = vreinterpretq_f64_u64(i0);
y = vmulq_f64(d, t);
x = vmulq_f64(n, t);
t = vrecpeq_f64(y);
t = vmulq_f64(t, vrecpsq_f64(y, t));
t = vmulq_f64(t, vrecpsq_f64(y, t));
t = vmulq_f64(t, vrecpsq_f64(y, t));
u = vmulq_f64(x, t);
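// Final residual correction with fused ops: u = u + (x - y*u) * t.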
u = vfmaq_f64(u, vfmsq_f64(x, y, u), t);
return u;
#endif
}
static INLINE VECTOR_CC vdouble vrec_vd_vd(vdouble d) {
#ifndef SLEEF_ENABLE_ALTDIV
return vdiv_vd_vd_vd(vcast_vd_d(1.0), d);
#else
return vbslq_f64(vceqq_f64(vabs_vd_vd(d), vcast_vd_d(SLEEF_INFINITY)),
vcast_vd_d(0), vdiv_vd_vd_vd(vcast_vd_d(1.0), d));
#endif
}
static INLINE VECTOR_CC vdouble vsqrt_vd_vd(vdouble d) {
#ifndef SLEEF_ENABLE_ALTSQRT
return vsqrtq_f64(d);
#else
// Gives a correctly rounded result over the whole input range
vdouble w, x, y, z;
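// Refine a reciprocal-sqrt estimate by Newton-Raphson (x -> sqrt(d), w -> 0.5/sqrt(d)), then apply a fused residual correction; 0 and infinity are passed through unchanged below.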
y = vrsqrteq_f64(d);
x = vmul_vd_vd_vd(d, y); w = vmul_vd_vd_vd(vcast_vd_d(0.5), y);
y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
x = vfma_vd_vd_vd_vd(x, y, x); w = vfma_vd_vd_vd_vd(w, y, w);
y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
x = vfma_vd_vd_vd_vd(x, y, x); w = vfma_vd_vd_vd_vd(w, y, w);
y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(1.5)); w = vadd_vd_vd_vd(w, w);
w = vmul_vd_vd_vd(w, y);
x = vmul_vd_vd_vd(w, d);
y = vfmapn_vd_vd_vd_vd(w, d, x); z = vfmanp_vd_vd_vd_vd(w, x, vcast_vd_d(1));
z = vfmanp_vd_vd_vd_vd(w, y, z); w = vmul_vd_vd_vd(vcast_vd_d(0.5), x);
w = vfma_vd_vd_vd_vd(w, z, y);
w = vadd_vd_vd_vd(w, x);
return vbslq_f64(vorrq_u64(vceqq_f64(d, vcast_vd_d(0)),
vceqq_f64(d, vcast_vd_d(SLEEF_INFINITY))), d, w);
#endif
}
/* Comparisons */
static INLINE VECTOR_CC vopmask veq_vo_vd_vd(vdouble x, vdouble y) {
return vreinterpretq_u32_u64(vceqq_f64(x, y));
}
static INLINE VECTOR_CC vopmask vneq_vo_vd_vd(vdouble x, vdouble y) {
return vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(x, y)));
}
static INLINE VECTOR_CC vopmask vlt_vo_vd_vd(vdouble x, vdouble y) {
return vreinterpretq_u32_u64(vcltq_f64(x, y));
}
static INLINE VECTOR_CC vopmask vgt_vo_vd_vd(vdouble x, vdouble y) {
return vreinterpretq_u32_u64(vcgtq_f64(x, y));
}
static INLINE VECTOR_CC vopmask vle_vo_vd_vd(vdouble x, vdouble y) {
return vreinterpretq_u32_u64(vcleq_f64(x, y));
}
static INLINE VECTOR_CC vopmask vge_vo_vd_vd(vdouble x, vdouble y) {
return vreinterpretq_u32_u64(vcgeq_f64(x, y));
}
// Conditional select
static INLINE VECTOR_CC vdouble vsel_vd_vo_vd_vd(vopmask mask, vdouble x, vdouble y) {
return vbslq_f64(vreinterpretq_u64_u32(mask), x, y);
}
#if 1
static INLINE CONST VECTOR_CC vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
}
static INLINE VECTOR_CC vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}
static INLINE VECTOR_CC vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}
#else
// This implementation is slower on current CPU models (as of May 2017).
// I (Naoki Shibata) expect that future CPU models with hardware similar to the Super Shuffle Engine will run this implementation faster.
static INLINE CONST VECTOR_CC vdouble vsel_vd_vo_d_d(vopmask o, double d0, double d1) {
uint8x16_t idx = vbslq_u8(vreinterpretq_u8_u32(o), (uint8x16_t) { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 },
(uint8x16_t) { 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15 });
uint8x16_t tab = (uint8x16_t) (float64x2_t) { d0, d1 };
return (vdouble) vqtbl1q_u8(tab, idx);
}
static INLINE VECTOR_CC vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
uint8x16_t idx = vbslq_u8(vreinterpretq_u8_u32(o0), (uint8x16_t) { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 },
vbslq_u8(vreinterpretq_u8_u32(o1), (uint8x16_t) { 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15 },
vbslq_u8(vreinterpretq_u8_u32(o2), (uint8x16_t) { 16, 17, 18, 19, 20, 21, 22, 23, 16, 17, 18, 19, 20, 21, 22, 23 },
(uint8x16_t) { 24, 25, 26, 27, 28, 29, 30, 31, 24, 25, 26, 27, 28, 29, 30, 31 })));
uint8x16x2_t tab = { { (uint8x16_t) (float64x2_t) { d0, d1 }, (uint8x16_t) (float64x2_t) { d2, d3 } } };
return (vdouble) vqtbl2q_u8(tab, idx);
}
static INLINE VECTOR_CC vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
return vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o1, d0, d1, d2, d2);
}
#endif
static INLINE VECTOR_CC vdouble vrint_vd_vd(vdouble d) { return vrndnq_f64(d); }
static INLINE VECTOR_CC vfloat vrint_vf_vf(vfloat d) { return vrndnq_f32(d); }
/****************************************/
/* int <--> FP conversions */
/****************************************/
static INLINE VECTOR_CC vint vtruncate_vi_vd(vdouble vf) {
return vmovn_s64(vcvtq_s64_f64(vf));
}
static INLINE VECTOR_CC vdouble vcast_vd_vi(vint vi) {
return vcvtq_f64_s64(vmovl_s32(vi));
}
static INLINE VECTOR_CC vint vcast_vi_i(int i) { return vdup_n_s32(i); }
static INLINE VECTOR_CC vint vrint_vi_vd(vdouble d) {
return vqmovn_s64(vcvtq_s64_f64(vrndnq_f64(d)));
}
/***************************************/
/* Integer operations */
/***************************************/
// Add, Sub, Neg (-x)
static INLINE VECTOR_CC vint vadd_vi_vi_vi(vint x, vint y) { return vadd_s32(x, y); }
static INLINE VECTOR_CC vint vsub_vi_vi_vi(vint x, vint y) { return vsub_s32(x, y); }
static INLINE VECTOR_CC vint vneg_vi_vi(vint e) { return vneg_s32(e); }
// Logical operations
static INLINE VECTOR_CC vint vand_vi_vi_vi(vint x, vint y) { return vand_s32(x, y); }
static INLINE VECTOR_CC vint vandnot_vi_vi_vi(vint x, vint y) { return vbic_s32(y, x); }
static INLINE VECTOR_CC vint vor_vi_vi_vi(vint x, vint y) { return vorr_s32(x, y); }
static INLINE VECTOR_CC vint vxor_vi_vi_vi(vint x, vint y) { return veor_s32(x, y); }
// Comparison returning masks
static INLINE VECTOR_CC vopmask veq_vo_vi_vi(vint x, vint y) {
return vcombine_u32(vceq_s32(x, y), vdup_n_u32(0));
}
// Conditional select
static INLINE VECTOR_CC vint vsel_vi_vm_vi_vi(vmask m, vint x, vint y) {
return vbsl_s32(vget_low_u32(m), x, y);
}
/***************************************/
/* Predicates */
/***************************************/
static INLINE VECTOR_CC vopmask visinf_vo_vd(vdouble d) {
const float64x2_t inf = vdupq_n_f64(SLEEF_INFINITY);
const float64x2_t neg_inf = vdupq_n_f64(-SLEEF_INFINITY);
uint64x2_t cmp = vorrq_u64(vceqq_f64(d, inf), vceqq_f64(d, neg_inf));
return vreinterpretq_u32_u64(cmp);
}
static INLINE VECTOR_CC vopmask visnan_vo_vd(vdouble d) {
return vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(d, d)));
}
static INLINE VECTOR_CC vopmask vispinf_vo_vd(vdouble d) {
return vreinterpretq_u32_u64(vceqq_f64(d, vdupq_n_f64(SLEEF_INFINITY)));
}
static INLINE VECTOR_CC vopmask visminf_vo_vd(vdouble d) {
return vreinterpretq_u32_u64(vceqq_f64(d, vdupq_n_f64(-SLEEF_INFINITY)));
}
static INLINE VECTOR_CC vfloat vsel_vf_vo_vf_vf(vopmask mask, vfloat x, vfloat y) {
return vbslq_f32(mask, x, y);
}
static INLINE CONST VECTOR_CC vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}
static INLINE VECTOR_CC vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}
static INLINE VECTOR_CC vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}
static INLINE VECTOR_CC vopmask veq_vo_vf_vf(vfloat x, vfloat y) {
return vceqq_f32(x, y);
}
static INLINE VECTOR_CC vopmask vneq_vo_vf_vf(vfloat x, vfloat y) {
return vmvnq_u32(vceqq_f32(x, y));
}
static INLINE VECTOR_CC vopmask vlt_vo_vf_vf(vfloat x, vfloat y) {
return vcltq_f32(x, y);
}
static INLINE VECTOR_CC vopmask vle_vo_vf_vf(vfloat x, vfloat y) {
return vcleq_f32(x, y);
}
static INLINE VECTOR_CC vopmask vgt_vo_vf_vf(vfloat x, vfloat y) {
return vcgtq_f32(x, y);
}
static INLINE VECTOR_CC vopmask vge_vo_vf_vf(vfloat x, vfloat y) {
return vcgeq_f32(x, y);
}
static INLINE VECTOR_CC vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) {
return vceqq_s32(x, y);
}
static INLINE VECTOR_CC vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) {
return vcgtq_s32(x, y);
}
static INLINE VECTOR_CC vopmask vgt_vo_vi_vi(vint x, vint y) {
return vcombine_u32(vcgt_s32(x, y), vdup_n_u32(0));
}
static INLINE VECTOR_CC vopmask visinf_vo_vf(vfloat d) {
return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf));
}
static INLINE VECTOR_CC vopmask vispinf_vo_vf(vfloat d) {
return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf));
}
static INLINE VECTOR_CC vopmask visminf_vo_vf(vfloat d) {
return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf));
}
static INLINE VECTOR_CC vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
static INLINE VECTOR_CC vopmask vcast_vo32_vo64(vopmask m) {
return vuzpq_u32(m, m).val[0];
}
static INLINE VECTOR_CC vopmask vcast_vo64_vo32(vopmask m) {
return vzipq_u32(m, m).val[0];
}
static INLINE VECTOR_CC vopmask vcast_vo_i(int i) {
return vreinterpretq_u32_u64(vdupq_n_u64((uint64_t)(i ? -1 : 0)));
}
static INLINE VECTOR_CC vopmask vand_vo_vo_vo(vopmask x, vopmask y) {
return vandq_u32(x, y);
}
static INLINE VECTOR_CC vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) {
return vbicq_u32(y, x);
}
static INLINE VECTOR_CC vopmask vor_vo_vo_vo(vopmask x, vopmask y) {
return vorrq_u32(x, y);
}
static INLINE VECTOR_CC vopmask vxor_vo_vo_vo(vopmask x, vopmask y) {
return veorq_u32(x, y);
}
static INLINE VECTOR_CC vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
return vbslq_s32(m, x, y);
}
static INLINE VECTOR_CC vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) {
return vandq_s32(vreinterpretq_s32_u32(x), y);
}
static INLINE VECTOR_CC vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) {
return vbicq_s32(y, vreinterpretq_s32_u32(x));
}
static INLINE VECTOR_CC vint vandnot_vi_vo_vi(vopmask x, vint y) {
return vbic_s32(y, vget_low_s32(vreinterpretq_s32_u32(x)));
}
static INLINE VECTOR_CC vmask vand_vm_vo32_vm(vopmask x, vmask y) {
return vandq_u32(x, y);
}
static INLINE VECTOR_CC vmask vand_vm_vo64_vm(vopmask x, vmask y) {
return vandq_u32(x, y);
}
static INLINE VECTOR_CC vmask vandnot_vm_vo32_vm(vopmask x, vmask y) {
return vbicq_u32(y, x);
}
static INLINE VECTOR_CC vmask vandnot_vm_vo64_vm(vopmask x, vmask y) {
return vbicq_u32(y, x);
}
static INLINE VECTOR_CC vmask vor_vm_vo32_vm(vopmask x, vmask y) {
return vorrq_u32(x, y);
}
static INLINE VECTOR_CC vmask vor_vm_vo64_vm(vopmask x, vmask y) {
return vorrq_u32(x, y);
}
static INLINE VECTOR_CC vmask vxor_vm_vo32_vm(vopmask x, vmask y) {
return veorq_u32(x, y);
}
static INLINE VECTOR_CC vfloat vtruncate_vf_vf(vfloat vd) { return vrndq_f32(vd); }
static INLINE VECTOR_CC vmask vcast_vm_i_i(int i0, int i1) {
return vreinterpretq_u32_u64(vdupq_n_u64((0xffffffff & (uint64_t)i1) | (((uint64_t)i0) << 32)));
}
static INLINE vmask vcast_vm_i64(int64_t i) {
return vreinterpretq_u32_u64(vdupq_n_u64((uint64_t)i));
}
static INLINE vmask vcast_vm_u64(uint64_t i) {
return vreinterpretq_u32_u64(vdupq_n_u64(i));
}
static INLINE VECTOR_CC vopmask veq64_vo_vm_vm(vmask x, vmask y) {
return vreinterpretq_u32_u64(vceqq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));
}
static INLINE VECTOR_CC vmask vadd64_vm_vm_vm(vmask x, vmask y) {
return vreinterpretq_u32_s64(vaddq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));
}
static INLINE VECTOR_CC vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
return vbsl_s32(vget_low_u32(m), x, y);
}
// Logical operations
static INLINE VECTOR_CC vint vand_vi_vo_vi(vopmask x, vint y) {
return vand_s32(vreinterpret_s32_u32(vget_low_u32(x)), y);
}
static INLINE VECTOR_CC vmask vcastu_vm_vi(vint vi) {
return vrev64q_u32(vreinterpretq_u32_u64(vmovl_u32(vreinterpret_u32_s32(vi))));
}
static INLINE VECTOR_CC vint vcastu_vi_vm(vmask vi2) {
return vreinterpret_s32_u32(vmovn_u64(vreinterpretq_u64_u32(vrev64q_u32(vi2))));
}
static INLINE VECTOR_CC vdouble vtruncate_vd_vd(vdouble vd) { return vrndq_f64(vd); }
//
#define PNMASK ((vdouble) { +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0 })
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
static INLINE VECTOR_CC vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
static INLINE VECTOR_CC vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
static INLINE VECTOR_CC vfloat vposneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)PNMASKf); }
static INLINE VECTOR_CC vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)NPMASKf); }
static INLINE VECTOR_CC vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
static INLINE VECTOR_CC vfloat vsubadd_vf_vf_vf(vfloat d0, vfloat d1) { return vadd_vf_vf_vf(d0, vnegpos_vf_vf(d1)); }
static INLINE VECTOR_CC vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE VECTOR_CC vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
static INLINE VECTOR_CC vdouble vrev21_vd_vd(vdouble d0) { return (float64x2_t)vcombine_u64(vget_high_u64((uint64x2_t)d0), vget_low_u64((uint64x2_t)d0)); }
static INLINE VECTOR_CC vdouble vreva2_vd_vd(vdouble vd) { return vd; }
static INLINE VECTOR_CC void vstream_v_p_vd(double *ptr, vdouble v) { vstore_v_p_vd(ptr, v); }
static INLINE VECTOR_CC void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
static INLINE VECTOR_CC void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
static INLINE VECTOR_CC vfloat vrev21_vf_vf(vfloat d0) { return vrev64q_f32(d0); }
static INLINE VECTOR_CC vfloat vreva2_vf_vf(vfloat d0) { return vcombine_f32(vget_high_f32(d0), vget_low_f32(d0)); }
static INLINE VECTOR_CC void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); }
static INLINE VECTOR_CC void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));
vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));
}
static INLINE VECTOR_CC void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));
vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));
}
//
static vquad loadu_vq_p(void *p) {
vquad vq;
memcpy(&vq, p, VECTLENDP * 16);
return vq;
}
static INLINE vquad cast_vq_aq(vargquad aq) {
vquad vq;
memcpy(&vq, &aq, VECTLENDP * 16);
return vq;
}
static INLINE vargquad cast_aq_vq(vquad vq) {
vargquad aq;
memcpy(&aq, &vq, VECTLENDP * 16);
return aq;
}
static INLINE int vtestallzeros_i_vo64(vopmask g) {
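// OR-reduce the lanes; for a canonical opmask (each lane 0 or ~0), the bitwise NOT is nonzero exactly when every lane was zero.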
uint32x2_t x0 = vorr_u32(vget_low_u32(g), vget_high_u32(g));
uint32x2_t x1 = vpmax_u32(x0, x0);
return ~vget_lane_u32(x1, 0);
}
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask m, vmask x, vmask y) { return vbslq_u32(m, x, y); }
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) {
return vreinterpretq_u32_s64(vsubq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));
}
static INLINE vmask vneg64_vm_vm(vmask x) {
return vreinterpretq_u32_s64(vnegq_s64(vreinterpretq_s64_u32(x)));
}
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
return vreinterpretq_u32_u64(vcgtq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));
}
#define vsll64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshlq_n_u64(vreinterpretq_u64_u32(x), c))
//@#define vsll64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshlq_n_u64(vreinterpretq_u64_u32(x), c))
#define vsrl64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(x), c))
//@#define vsrl64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(x), c))
static INLINE vmask vcast_vm_vi(vint vi) {
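// Sign-extend each 32-bit lane to 64 bits: zero-extend first, then OR all-ones into the upper half of the negative lanes.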
vmask m = vreinterpretq_u32_u64(vmovl_u32(vreinterpret_u32_s32(vi)));
return vor_vm_vm_vm(vcastu_vm_vi(vreinterpret_s32_u32(vget_low_u32(vgt_vo_vi_vi(vcast_vi_i(0), vi)))), m);
}
static INLINE vint vcast_vi_vm(vmask vm) { return vreinterpret_s32_u32(vmovn_u64(vreinterpretq_u64_u32(vm))); }
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return vreinterpretq_u32_s64(v); }
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return vreinterpretq_s64_u32(m); }
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return vreinterpretq_u32_u64(v); }
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return vreinterpretq_u64_u32(m); }
View File
@@ -0,0 +1,638 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#if CONFIG == 1
#if !defined(__AVX__) && !defined(SLEEF_GENHEADER)
#error Please specify -mavx.
#endif
#elif CONFIG == 4
#if (!defined(__AVX__) || !defined(__FMA4__)) && !defined(SLEEF_GENHEADER)
#error Please specify -mavx and -mfma4.
#endif
#else
#error CONFIG macro invalid or not defined
#endif
#define ENABLE_DP
//@#define ENABLE_DP
#define LOG2VECTLENDP 2
//@#define LOG2VECTLENDP 2
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)
#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)
#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT
#if !defined(SLEEF_GENHEADER)
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#include <stdint.h>
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)
typedef __m256i vmask;
typedef __m256i vopmask;
typedef __m256d vdouble;
typedef __m128i vint;
typedef __m256 vfloat;
typedef struct { __m128i x, y; } vint2;
typedef __m256i vint64;
typedef __m256i vuint64;
typedef struct {
vmask x, y;
} vquad;
typedef vquad vargquad;
//
#if !defined(SLEEF_GENHEADER)
#ifndef __SLEEF_H__
void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx);
#endif
static INLINE int cpuSupportsAVX() {
int32_t reg[4];
Sleef_x86CpuID(reg, 1, 0);
return (reg[2] & (1 << 28)) != 0;
}
static INLINE int cpuSupportsFMA4() {
int32_t reg[4];
Sleef_x86CpuID(reg, 0x80000001, 0);
return (reg[2] & (1 << 16)) != 0;
}
#if CONFIG == 4 && defined(__AVX__) && defined(__FMA4__)
static INLINE int vavailability_i(int name) {
int d = cpuSupportsAVX() && cpuSupportsFMA4();
return d ? 3 : 0;
}
#define ENABLE_FMA_DP
#define ENABLE_FMA_SP
#define ISANAME "AVX + AMD FMA4"
#define DFTPRIORITY 21
#else
static INLINE int vavailability_i(int name) {
int d = cpuSupportsAVX();
return d ? 3 : 0;
}
#define ISANAME "AVX"
#define DFTPRIORITY 20
#endif
#endif // #if !defined(SLEEF_GENHEADER)
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }
static INLINE int vtestallones_i_vo32(vopmask g) {
return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1)));
}
static INLINE int vtestallones_i_vo64(vopmask g) {
return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1)));
}
//
static INLINE vdouble vcast_vd_d(double d) { return _mm256_set1_pd(d); }
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm256_castpd_si256(vd); }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm256_castsi256_pd(vm); }
//
static vint2 vloadu_vi2_p(int32_t *p) {
vint2 r;
r.x = _mm_loadu_si128((__m128i *) p );
r.y = _mm_loadu_si128((__m128i *)(p + 4));
return r;
}
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) {
_mm_storeu_si128((__m128i *) p , v.x);
_mm_storeu_si128((__m128i *)(p + 4), v.y);
}
static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }
static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); }
//
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vcast_vo32_vo64(vopmask o) {
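// Narrow 64-bit mask lanes to 32 bits: keep -1.0 where the mask is set, then convert the doubles to int32 lanes of -1/0 (AVX has no cross-lane integer shuffle).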
return _mm256_castsi128_si256(_mm256_cvtpd_epi32(_mm256_and_pd(vreinterpret_vd_vm(o), _mm256_set1_pd(-1.0))));
}
static INLINE vopmask vcast_vo64_vo32(vopmask o) {
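// Widen 32-bit mask lanes to 64 bits: convert the int32 lanes to doubles and rebuild the mask by comparing against -1.0.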
return vreinterpret_vm_vd(_mm256_cmp_pd(_mm256_cvtepi32_pd(_mm256_castsi256_si128(o)), _mm256_set1_pd(-1.0), _CMP_EQ_OQ));
}
static INLINE vopmask vcast_vo_i(int i) { return _mm256_set1_epi64x(i ? -1 : 0); }
//
static INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); }
static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); }
static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm256_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm256_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); }
static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); }
static INLINE vmask vcastu_vm_vi(vint vi) {
__m256i m = _mm256_castsi128_si256(_mm_and_si128(_mm_shuffle_epi32(vi, 0x40), _mm_set_epi32(-1, 0, -1, 0)));
return _mm256_insertf128_si256(m, _mm_and_si128(_mm_shuffle_epi32(vi, 0xc8), _mm_set_epi32(-1, 0, -1, 0)), 1);
}
static INLINE vint vcastu_vi_vm(vmask vi) {
return _mm_or_si128(_mm_and_si128(_mm_shuffle_epi32(_mm256_castsi256_si128(vi) , 0x0d), _mm_set_epi32( 0, 0, -1, -1)),
_mm_and_si128(_mm_shuffle_epi32(_mm256_extractf128_si256(vi, 1), 0xd0), _mm_set_epi32(-1, -1, 0, 0)));
}
static INLINE vmask vcast_vm_i_i(int i0, int i1) {
return _mm256_set_epi32(i0, i1, i0, i1, i0, i1, i0, i1);
}
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
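// AVX has no 64-bit integer compare: x == y exactly when (x ^ y) ^ bits(1.0) reinterprets to 1.0.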
return vreinterpret_vm_vd(_mm256_cmp_pd(vreinterpret_vd_vm(vxor_vm_vm_vm(vxor_vm_vm_vm(x, y), vreinterpret_vm_vd(_mm256_set1_pd(1.0)))), _mm256_set1_pd(1.0), _CMP_EQ_OQ));
}
static INLINE vmask vcast_vm_i64(int64_t i) { return _mm256_set1_epi64x(i); }
static INLINE vmask vcast_vm_u64(uint64_t i) { return _mm256_set1_epi64x((uint64_t)i); }
//
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm256_div_pd(x, y); }
static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm256_set1_pd(1), x); }
static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); }
static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm256_andnot_pd(_mm256_set1_pd(-0.0), d); }
static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm256_xor_pd(_mm256_set1_pd(-0.0), d); }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm256_max_pd(x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm256_min_pd(x, y); }
#if CONFIG == 1
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(z, vmul_vd_vd_vd(x, y)); }
#else
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_msub_pd(x, y, z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmacc_pd(x, y, z); }
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); }
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); }
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_msub_pd(x, y, z); }
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmacc_pd(x, y, z); }
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmsub_pd(x, y, z); }
#endif
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_EQ_OQ)); }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_NEQ_UQ)); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LT_OQ)); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LE_OQ)); }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GT_OQ)); }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GE_OQ)); }
//
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); }
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); }
static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); }
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); }
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); }
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); }
static INLINE vint vandnot_vi_vo_vi(vopmask m, vint y) { return _mm_andnot_si128(_mm256_castsi256_si128(m), y); }
static INLINE vint vand_vi_vo_vi(vopmask m, vint y) { return _mm_and_si128(_mm256_castsi256_si128(m), y); }
static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); }
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); }
static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); }
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpeq_epi32(x, y)); }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpgt_epi32(x, y)); }
static INLINE vint vsel_vi_vo_vi_vi(vopmask o, vint x, vint y) { return _mm_blendv_epi8(y, x, _mm256_castsi256_si128(o)); }
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return _mm256_blendv_pd(y, x, _mm256_castsi256_pd(o)); }
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
}
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}
static INLINE vopmask visinf_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm256_cmp_pd(vabs_vd_vd(d), _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));
}
static INLINE vopmask vispinf_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));
}
static INLINE vopmask visminf_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(-SLEEF_INFINITY), _CMP_EQ_OQ));
}
static INLINE vopmask visnan_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm256_cmp_pd(d, d, _CMP_NEQ_UQ));
}
static INLINE vdouble vload_vd_p(const double *ptr) { return _mm256_load_pd(ptr); }
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(ptr); }
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
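// AVX provides no gather instruction; emulate it with scalar indexed loads.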
int a[VECTLENDP];
vstoreu_v_p_vi(a, vi);
return _mm256_set_pd(ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);
}
#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
static INLINE double vcast_d_vd(vdouble v) {
double a[VECTLENDP];
vstoreu_v_p_vd(a, v);
return a[0];
}
#endif
//
static INLINE vint2 vcast_vi2_vm(vmask vm) {
vint2 r;
r.x = _mm256_castsi256_si128(vm);
r.y = _mm256_extractf128_si256(vm, 1);
return r;
}
static INLINE vmask vcast_vm_vi2(vint2 vi) {
vmask m = _mm256_castsi128_si256(vi.x);
m = _mm256_insertf128_si256(m, vi.y, 1);
return m;
}
static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvtps_epi32(vf)); }
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvttps_epi32(vf)); }
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm256_cvtepi32_ps(vcast_vm_vi2(vi)); }
static INLINE vfloat vcast_vf_f(float f) { return _mm256_set1_ps(f); }
static INLINE vint2 vcast_vi2_i(int i) { vint2 r; r.x = r.y = _mm_set1_epi32(i); return r; }
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm256_castps_si256(vf); }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm256_castsi256_ps(vm); }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return vreinterpret_vf_vm(vcast_vm_vi2(vi)); }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return vcast_vi2_vm(vreinterpret_vm_vf(vf)); }
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); }
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); }
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); }
#if CONFIG == 1
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
#else
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmacc_ps(x, y, z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_msub_ps(x, y, z); }
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); }
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); }
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_msub_ps(x, y, z); }
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmacc_ps(x, y, z); }
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmsub_ps(x, y, z); }
#endif
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_EQ_OQ)); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_NEQ_UQ)); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LT_OQ)); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LE_OQ)); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GT_OQ)); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GE_OQ)); }
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) {
vint2 vi = { _mm_add_epi32(x.x, y.x), _mm_add_epi32(x.y, y.y) };
return vi;
}
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) {
vint2 vi = { _mm_sub_epi32(x.x, y.x), _mm_sub_epi32(x.y, y.y) };
return vi;
}
static INLINE vint2 vneg_vi2_vi2(vint2 e) {
vint2 vi = { _mm_sub_epi32(_mm_set1_epi32(0), e.x), _mm_sub_epi32(_mm_set1_epi32(0), e.y) };
return vi;
}
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) {
vint2 vi = { _mm_and_si128(x.x, y.x), _mm_and_si128(x.y, y.y) };
return vi;
}
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) {
vint2 vi = { _mm_andnot_si128(x.x, y.x), _mm_andnot_si128(x.y, y.y) };
return vi;
}
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) {
vint2 vi = { _mm_or_si128(x.x, y.x), _mm_or_si128(x.y, y.y) };
return vi;
}
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) {
vint2 vi = { _mm_xor_si128(x.x, y.x), _mm_xor_si128(x.y, y.y) };
return vi;
}
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) {
vint2 vi = { _mm_slli_epi32(x.x, c), _mm_slli_epi32(x.y, c) };
return vi;
}
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) {
vint2 vi = { _mm_srli_epi32(x.x, c), _mm_srli_epi32(x.y, c) };
return vi;
}
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) {
vint2 vi = { _mm_srai_epi32(x.x, c), _mm_srai_epi32(x.y, c) };
return vi;
}
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) {
vint2 r;
r.x = _mm_cmpeq_epi32(x.x, y.x);
r.y = _mm_cmpeq_epi32(x.y, y.y);
return vcast_vm_vi2(r);
}
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) {
vint2 r;
r.x = _mm_cmpgt_epi32(x.x, y.x);
r.y = _mm_cmpgt_epi32(x.y, y.y);
return vcast_vm_vi2(r);
}
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
vint2 r;
r.x = _mm_cmpeq_epi32(x.x, y.x);
r.y = _mm_cmpeq_epi32(x.y, y.y);
return r;
}
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) {
vint2 r;
r.x = _mm_cmpgt_epi32(x.x, y.x);
r.y = _mm_cmpgt_epi32(x.y, y.y);
return r;
}
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
vint2 n = vcast_vi2_vm(m);
vint2 r = { _mm_blendv_epi8(y.x, x.x, n.x), _mm_blendv_epi8(y.y, x.y, n.y) };
return r;
}
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) {
vint2 ix = vcast_vi2_vm(x), iy = vcast_vi2_vm(y), iz;
iz.x = _mm_add_epi64(ix.x, iy.x);
iz.y = _mm_add_epi64(ix.y, iy.y);
return vcast_vm_vi2(iz);
}
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return _mm256_blendv_ps(y, x, _mm256_castsi256_ps(o)); }
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}
static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
//
static INLINE vfloat vload_vf_p(const float *ptr) { return _mm256_load_ps(ptr); }
static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
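// AVX provides no gather instruction; emulate it with scalar indexed loads.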
int a[VECTLENSP];
vstoreu_v_p_vi2(a, vi2);
return _mm256_set_ps(ptr[a[7]], ptr[a[6]], ptr[a[5]], ptr[a[4]],
ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);
}
#ifdef _MSC_VER
// This function is needed when debugging on MSVC.
static INLINE float vcast_f_vf(vfloat v) {
float a[VECTLENSP];
vstoreu_v_p_vf(a, v);
return a[0];
}
#endif
//
#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0 })
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })
static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_addsub_pd(x, y); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_addsub_ps(x, y); }
#if CONFIG == 1
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
#else
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }
#endif
static INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm256_shuffle_pd(d0, d0, (0 << 3) | (1 << 2) | (0 << 1) | (1 << 0)); }
static INLINE vdouble vreva2_vd_vd(vdouble d0) { d0 = _mm256_permute2f128_pd(d0, d0, 1); return _mm256_shuffle_pd(d0, d0, (1 << 3) | (0 << 2) | (1 << 1) | (0 << 0)); }
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm256_stream_pd(ptr, v); }
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
_mm_store_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));
_mm_store_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));
}
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
_mm_stream_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));
_mm_stream_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));
}
//
static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm256_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); }
static INLINE vfloat vreva2_vf_vf(vfloat d0) { d0 = _mm256_permute2f128_ps(d0, d0, 1); return _mm256_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); }
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm256_stream_ps(ptr, v); }
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
_mm_storel_pd((double *)(ptr+(offset + step * 0)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));
_mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));
_mm_storel_pd((double *)(ptr+(offset + step * 2)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));
_mm_storeh_pd((double *)(ptr+(offset + step * 3)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));
}
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
//
static vquad loadu_vq_p(void *p) {
vquad vq;
memcpy(&vq, p, VECTLENDP * 16);
return vq;
}
static INLINE vquad cast_vq_aq(vargquad aq) {
vquad vq;
memcpy(&vq, &aq, VECTLENDP * 16);
return vq;
}
static INLINE vargquad cast_aq_vq(vquad vq) {
vargquad aq;
memcpy(&aq, &vq, VECTLENDP * 16);
return aq;
}
static INLINE int vtestallzeros_i_vo64(vopmask g) {
return _mm_movemask_epi8(_mm_or_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))) == 0;
}
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) {
return vreinterpret_vm_vd(_mm256_blendv_pd(vreinterpret_vd_vm(y), vreinterpret_vd_vm(x), vreinterpret_vd_vm(o)));
}
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) {
__m128i xh = _mm256_extractf128_si256(x, 1), xl = _mm256_extractf128_si256(x, 0);
__m128i yh = _mm256_extractf128_si256(y, 1), yl = _mm256_extractf128_si256(y, 0);
vmask r = _mm256_castsi128_si256(_mm_sub_epi64(xl, yl));
return _mm256_insertf128_si256(r, _mm_sub_epi64(xh, yh), 1);
}
static INLINE vmask vneg64_vm_vm(vmask x) { return vsub64_vm_vm_vm(vcast_vm_i_i(0, 0), x); }
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
__m128i xh = _mm256_extractf128_si256(x, 1), xl = _mm256_extractf128_si256(x, 0);
__m128i yh = _mm256_extractf128_si256(y, 1), yl = _mm256_extractf128_si256(y, 0);
vmask r = _mm256_castsi128_si256(_mm_cmpgt_epi64(xl, yl));
return _mm256_insertf128_si256(r, _mm_cmpgt_epi64(xh, yh), 1);
}
#define vsll64_vm_vm_i(x, c) \
_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi64(_mm256_extractf128_si256(x, 0), c)), \
_mm_slli_epi64(_mm256_extractf128_si256(x, 1), c), 1)
#define vsrl64_vm_vm_i(x, c) \
_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_srli_epi64(_mm256_extractf128_si256(x, 0), c)), \
_mm_srli_epi64(_mm256_extractf128_si256(x, 1), c), 1)
//@#define vsll64_vm_vm_i(x, c) _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi64(_mm256_extractf128_si256(x, 0), c)), _mm_slli_epi64(_mm256_extractf128_si256(x, 1), c), 1)
//@#define vsrl64_vm_vm_i(x, c) _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_srli_epi64(_mm256_extractf128_si256(x, 0), c)), _mm_srli_epi64(_mm256_extractf128_si256(x, 1), c), 1)
static INLINE vmask vcast_vm_vi(vint vi) {
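// Sign-extend each 32-bit lane to 64 bits: spread the lanes into the low halves of 64-bit slots, then OR all-ones into the high halves of the negative lanes.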
vint vi0 = _mm_and_si128(_mm_shuffle_epi32(vi, (1 << 4) | (1 << 6)), _mm_set_epi32(0, -1, 0, -1));
vint vi1 = _mm_and_si128(_mm_shuffle_epi32(vi, (2 << 0) | (2 << 2) | (3 << 4) | (3 << 6)), _mm_set_epi32(0, -1, 0, -1));
vmask m = _mm256_insertf128_si256(_mm256_castsi128_si256(vi0), vi1, 1);
return vor_vm_vm_vm(vcastu_vm_vi(vand_vi_vo_vi(vgt_vo_vi_vi(vcast_vi_i(0), vi), vcast_vi_i(-1))), m);
}
static INLINE vint vcast_vi_vm(vmask vm) {
return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vm)), _mm_set1_ps(0), 0x08)),
_mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vm, 1)), 0x80)));
}
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; }
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return m; }
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return v; }
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return m; }
View File
@@ -0,0 +1,485 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#if CONFIG == 1
#if !defined(__AVX2__) && !defined(SLEEF_GENHEADER)
#error Please specify -mavx2.
#endif
#else
#error CONFIG macro invalid or not defined
#endif
#define ENABLE_DP
//@#define ENABLE_DP
#define LOG2VECTLENDP 2
//@#define LOG2VECTLENDP 2
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)
#define ENABLE_FMA_DP
//@#define ENABLE_FMA_DP
#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP
#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT
#if !defined(SLEEF_GENHEADER)
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#include <stdint.h>
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)
typedef __m256i vmask;
typedef __m256i vopmask;
typedef __m256d vdouble;
typedef __m128i vint;
typedef __m256 vfloat;
typedef __m256i vint2;
typedef __m256i vint64;
typedef __m256i vuint64;
typedef struct {
vmask x, y;
} vquad;
typedef vquad vargquad;
//
#if !defined(SLEEF_GENHEADER)
#ifndef __SLEEF_H__
void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx);
#endif
static INLINE int cpuSupportsAVX2() {
int32_t reg[4];
Sleef_x86CpuID(reg, 7, 0);
return (reg[1] & (1 << 5)) != 0;
}
static INLINE int cpuSupportsFMA() {
int32_t reg[4];
Sleef_x86CpuID(reg, 1, 0);
return (reg[2] & (1 << 12)) != 0;
}
#if CONFIG == 1 && defined(__AVX2__)
static INLINE int vavailability_i(int name) {
int d = cpuSupportsAVX2() && cpuSupportsFMA();
return d ? 3 : 0;
}
#define ISANAME "AVX2"
#define DFTPRIORITY 25
#endif
#endif // #if !defined(SLEEF_GENHEADER)
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }
static INLINE int vtestallones_i_vo32(vopmask g) {
return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1)));
}
static INLINE int vtestallones_i_vo64(vopmask g) {
return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1)));
}
//
static INLINE vdouble vcast_vd_d(double d) { return _mm256_set1_pd(d); }
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm256_castpd_si256(vd); }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm256_castsi256_pd(vm); }
//
static vint2 vloadu_vi2_p(int32_t *p) { return _mm256_loadu_si256((__m256i const *)p); }
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm256_storeu_si256((__m256i *)p, v); }
static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }
static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); }
//
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vcast_vo32_vo64(vopmask o) {
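// Compact the even 32-bit halves of the 64-bit mask lanes into the low 128 bits.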
return _mm256_permutevar8x32_epi32(o, _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0));
}
static INLINE vopmask vcast_vo64_vo32(vopmask o) {
return _mm256_permutevar8x32_epi32(o, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0));
}
static INLINE vopmask vcast_vo_i(int i) { return _mm256_set1_epi64x(i ? -1 : 0); }
//
static INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); }
static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); }
static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm256_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm256_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); }
static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); }
static INLINE vmask vcastu_vm_vi(vint vi) {
return _mm256_slli_epi64(_mm256_cvtepi32_epi64(vi), 32);
}
static INLINE vint vcastu_vi_vm(vmask vi) {
return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vi)), _mm_set1_ps(0), 0x0d)),
_mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vi, 1)), 0xd0)));
}
static INLINE vmask vcast_vm_i_i(int i0, int i1) {
return _mm256_set_epi32(i0, i1, i0, i1, i0, i1, i0, i1);
}
static INLINE vmask vcast_vm_i64(int64_t i) { return _mm256_set1_epi64x(i); }
static INLINE vmask vcast_vm_u64(uint64_t i) { return _mm256_set1_epi64x((uint64_t)i); }
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm256_cmpeq_epi64(x, y); }
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm256_add_epi64(x, y); }
//
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm256_div_pd(x, y); }
static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm256_set1_pd(1), x); }
static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); }
static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm256_andnot_pd(_mm256_set1_pd(-0.0), d); }
static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm256_xor_pd(_mm256_set1_pd(-0.0), d); }
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmsub_pd(x, y, z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmadd_pd(x, y, z); }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm256_max_pd(x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm256_min_pd(x, y); }
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); }
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); }
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmsub_pd(x, y, z); }
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmadd_pd(x, y, z); }
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmsub_pd(x, y, z); }
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_EQ_OQ)); }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_NEQ_UQ)); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LT_OQ)); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LE_OQ)); }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GT_OQ)); }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GE_OQ)); }
//
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); }
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); }
static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); }
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); }
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); }
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); }
static INLINE vint vandnot_vi_vo_vi(vopmask m, vint y) { return _mm_andnot_si128(_mm256_castsi256_si128(m), y); }
static INLINE vint vand_vi_vo_vi(vopmask m, vint y) { return _mm_and_si128(_mm256_castsi256_si128(m), y); }
static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); }
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); }
static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); }
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpeq_epi32(x, y)); }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpgt_epi32(x, y)); }
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return _mm_blendv_epi8(y, x, _mm256_castsi256_si128(m)); }
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return _mm256_blendv_pd(y, x, _mm256_castsi256_pd(o)); }
static INLINE vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) { return _mm256_permutevar_pd(_mm256_set_pd(v1, v0, v1, v0), o); }
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
__m256i v = _mm256_castpd_si256(vsel_vd_vo_vd_vd(o0, _mm256_castsi256_pd(_mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0)),
vsel_vd_vo_vd_vd(o1, _mm256_castsi256_pd(_mm256_set_epi32(3, 2, 3, 2, 3, 2, 3, 2)),
vsel_vd_vo_vd_vd(o2, _mm256_castsi256_pd(_mm256_set_epi32(5, 4, 5, 4, 5, 4, 5, 4)),
_mm256_castsi256_pd(_mm256_set_epi32(7, 6, 7, 6, 7, 6, 7, 6))))));
return _mm256_castsi256_pd(_mm256_permutevar8x32_epi32(_mm256_castpd_si256(_mm256_set_pd(d3, d2, d1, d0)), v));
}
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
return vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o1, d0, d1, d2, d2);
}
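// The two selectors above avoid a chain of blends: the masks are folded into
// a per-lane dword index pair (0-1 selects d0, 2-3 selects d1, 4-5 selects d2,
// 6-7 selects d3), and a single _mm256_permutevar8x32_epi32 then gathers the
// chosen double from the packed constants. Per lane this is roughly
//   r[i] = o0[i] ? d0 : (o1[i] ? d1 : (o2[i] ? d2 : d3));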
static INLINE vopmask visinf_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm256_cmp_pd(vabs_vd_vd(d), _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));
}
static INLINE vopmask vispinf_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));
}
static INLINE vopmask visminf_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(-SLEEF_INFINITY), _CMP_EQ_OQ));
}
static INLINE vopmask visnan_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm256_cmp_pd(d, d, _CMP_NEQ_UQ));
}
#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
static INLINE double vcast_d_vd(vdouble v) {
double s[4];
_mm256_storeu_pd(s, v);
return s[0];
}
#endif
static INLINE vdouble vload_vd_p(const double *ptr) { return _mm256_load_pd(ptr); }
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(ptr); }
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm256_i32gather_pd(ptr, vi, 8); }
//
static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }
static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvtps_epi32(vf)); }
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvttps_epi32(vf)); }
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm256_cvtepi32_ps(vcast_vm_vi2(vi)); }
static INLINE vfloat vcast_vf_f(float f) { return _mm256_set1_ps(f); }
static INLINE vint2 vcast_vi2_i(int i) { return _mm256_set1_epi32(i); }
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm256_castps_si256(vf); }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm256_castsi256_ps(vm); }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return vreinterpret_vf_vm(vcast_vm_vi2(vi)); }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return vcast_vi2_vm(vreinterpret_vm_vf(vf)); }
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); }
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); }
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmsub_ps(x, y, z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmadd_ps(x, y, z); }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); }
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); }
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); }
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmsub_ps(x, y, z); }
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmadd_ps(x, y, z); }
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmsub_ps(x, y, z); }
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_EQ_OQ)); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_NEQ_UQ)); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LT_OQ)); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LE_OQ)); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GT_OQ)); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GE_OQ)); }
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_add_epi32(x, y); }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_sub_epi32(x, y); }
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); }
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_and_si256(x, y); }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_andnot_si256(x, y); }
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_or_si256(x, y); }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_xor_si256(x, y); }
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return _mm256_slli_epi32(x, c); }
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return _mm256_srli_epi32(x, c); }
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return _mm256_srai_epi32(x, c); }
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpeq_epi32(x, y); }
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpgt_epi32(x, y); }
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpeq_epi32(x, y); }
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpgt_epi32(x, y); }
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
return _mm256_blendv_epi8(y, x, m);
}
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return _mm256_blendv_ps(y, x, _mm256_castsi256_ps(o)); }
// At this point, the following three functions are implemented in a generic way,
// but I will try target-specific optimization later on.
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}
static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
#ifdef _MSC_VER
// This function is needed when debugging on MSVC.
static INLINE float vcast_f_vf(vfloat v) {
float s[8];
_mm256_storeu_ps(s, v);
return s[0];
}
#endif
static INLINE vfloat vload_vf_p(const float *ptr) { return _mm256_load_ps(ptr); }
static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm256_i32gather_ps(ptr, vi2, 4); }
//
#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0 })
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })
static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_addsub_pd(x, y); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_addsub_ps(x, y); }
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }
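// vposneg negates the odd-indexed lanes and vnegpos the even-indexed ones;
// vsubadd maps to the addsub instructions (subtract in even lanes, add in odd
// lanes). vmlsubadd fuses a multiply with that pattern; roughly
//   r[2k] = x[2k] * y[2k] - z[2k];  r[2k+1] = x[2k+1] * y[2k+1] + z[2k+1];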
static INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm256_shuffle_pd(d0, d0, (0 << 3) | (1 << 2) | (0 << 1) | (1 << 0)); }
static INLINE vdouble vreva2_vd_vd(vdouble d0) { d0 = _mm256_permute2f128_pd(d0, d0, 1); return _mm256_shuffle_pd(d0, d0, (1 << 3) | (0 << 2) | (1 << 1) | (0 << 0)); }
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm256_stream_pd(ptr, v); }
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
_mm_store_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));
_mm_store_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));
}
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
_mm_stream_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));
_mm_stream_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));
}
//
static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm256_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); }
static INLINE vfloat vreva2_vf_vf(vfloat d0) { d0 = _mm256_permute2f128_ps(d0, d0, 1); return _mm256_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); }
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm256_stream_ps(ptr, v); }
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
_mm_storel_pd((double *)(ptr+(offset + step * 0)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));
_mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));
_mm_storel_pd((double *)(ptr+(offset + step * 2)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));
_mm_storeh_pd((double *)(ptr+(offset + step * 3)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));
}
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
//
static vquad loadu_vq_p(void *p) {
vquad vq;
memcpy(&vq, p, VECTLENDP * 16);
return vq;
}
static INLINE vquad cast_vq_aq(vargquad aq) {
vquad vq;
memcpy(&vq, &aq, VECTLENDP * 16);
return vq;
}
static INLINE vargquad cast_aq_vq(vquad vq) {
vargquad aq;
memcpy(&aq, &vq, VECTLENDP * 16);
return aq;
}
static INLINE int vtestallzeros_i_vo64(vopmask g) {
return _mm_movemask_epi8(_mm_or_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))) == 0;
}
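// The two 128-bit halves of the mask are ORed together first; the byte-wise
// movemask of that fold is zero exactly when every bit of the 256-bit mask
// was zero.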
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) { return _mm256_blendv_epi8(y, x, o); }
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return _mm256_sub_epi64(x, y); }
static INLINE vmask vneg64_vm_vm(vmask x) { return _mm256_sub_epi64(vcast_vm_i_i(0, 0), x); }
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return _mm256_cmpgt_epi64(x, y); } // signed compare
#define vsll64_vm_vm_i(x, c) _mm256_slli_epi64(x, c)
#define vsrl64_vm_vm_i(x, c) _mm256_srli_epi64(x, c)
//@#define vsll64_vm_vm_i(x, c) _mm256_slli_epi64(x, c)
//@#define vsrl64_vm_vm_i(x, c) _mm256_srli_epi64(x, c)
static INLINE vmask vcast_vm_vi(vint vi) { return _mm256_cvtepi32_epi64(vi); } // signed 32-bit => 64-bit
static INLINE vint vcast_vi_vm(vmask vm) { // signed 32-bit <= 64-bit
return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vm)), _mm_set1_ps(0), 0x08)),
_mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vm, 1)), 0x80)));
}
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; }
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return m; }
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return v; }
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return m; }


@@ -0,0 +1,463 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#if CONFIG == 1
#if !defined(__AVX2__) && !defined(SLEEF_GENHEADER)
#error Please specify -mavx2.
#endif
#else
#error CONFIG macro invalid or not defined
#endif
#define ENABLE_DP
//@#define ENABLE_DP
#define LOG2VECTLENDP 1
//@#define LOG2VECTLENDP 1
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)
#define ENABLE_FMA_DP
//@#define ENABLE_FMA_DP
#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP
#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT
#if !defined(SLEEF_GENHEADER)
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#include <stdint.h>
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)
typedef __m128i vmask;
typedef __m128i vopmask;
typedef __m128d vdouble;
typedef __m128i vint;
typedef __m128 vfloat;
typedef __m128i vint2;
typedef __m128i vint64;
typedef __m128i vuint64;
typedef struct {
vmask x, y;
} vquad;
typedef vquad vargquad;
//
#if !defined(SLEEF_GENHEADER)
#ifndef __SLEEF_H__
void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx);
#endif
static INLINE int cpuSupportsAVX2() {
int32_t reg[4];
Sleef_x86CpuID(reg, 7, 0);
return (reg[1] & (1 << 5)) != 0;
}
static INLINE int cpuSupportsFMA() {
int32_t reg[4];
Sleef_x86CpuID(reg, 1, 0);
return (reg[2] & (1 << 12)) != 0;
}
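// CPUID leaf 7 (subleaf 0) reports AVX2 support in EBX bit 5, and leaf 1
// reports FMA support in ECX bit 12; both must be set for this helper to be
// usable at runtime.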
#if CONFIG == 1 && defined(__AVX2__)
static INLINE int vavailability_i(int name) {
int d = cpuSupportsAVX2() && cpuSupportsFMA();
return d ? 3 : 0;
}
#define ISANAME "AVX2"
#define DFTPRIORITY 25
#endif
#endif // #if !defined(SLEEF_GENHEADER)
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }
static INLINE int vtestallones_i_vo32(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; }
static INLINE int vtestallones_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; }
//
static INLINE vdouble vcast_vd_d(double d) { return _mm_set1_pd(d); }
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm_castpd_si128(vd); }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm_castsi128_pd(vm); }
//
static vint2 vloadu_vi2_p(int32_t *p) { return _mm_loadu_si128((__m128i const *)p); }
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm_storeu_si128((__m128i *)p, v); }
static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }
static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); }
//
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return _mm_shuffle_epi32(m, 0x08); }
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return _mm_shuffle_epi32(m, 0x50); }
static INLINE vopmask vcast_vo_i(int i) { return _mm_set1_epi64x(i ? -1 : 0); }
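// Here the opmask is a full-width vector mask. vcast_vo32_vo64 compresses the
// two 64-bit lane masks into the low two 32-bit lanes (shuffle 0x08 picks
// dwords 0 and 2), and vcast_vo64_vo32 widens them back by duplicating each
// 32-bit mask across a full 64-bit lane (shuffle 0x50).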
//
static INLINE vint vrint_vi_vd(vdouble vd) { return _mm_cvtpd_epi32(vd); }
static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm_cvttpd_epi32(vd); }
static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vdouble vcast_vd_vi(vint vi) { return _mm_cvtepi32_pd(vi); }
static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); }
static INLINE vmask vcastu_vm_vi(vint vi) { return _mm_and_si128(_mm_shuffle_epi32(vi, 0x73), _mm_set_epi32(-1, 0, -1, 0)); }
static INLINE vint vcastu_vi_vm(vmask vi) { return _mm_shuffle_epi32(vi, 0x0d); }
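// vcastu_vm_vi parks each 32-bit integer in the upper half of a 64-bit lane
// (lower halves zeroed), and vcastu_vi_vm extracts those upper halves back
// into packed 32-bit lanes. The upper half of a double lane is where the sign
// and exponent bits live, which is what makes this layout useful.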
static INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm_set_epi32(i0, i1, i0, i1); }
static INLINE vmask vcast_vm_i64(int64_t i) { return _mm_set1_epi64x(i); }
static INLINE vmask vcast_vm_u64(uint64_t i) { return _mm_set1_epi64x((uint64_t)i); }
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm_cmpeq_epi64(x, y); }
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm_add_epi64(x, y); }
//
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_add_pd(x, y); }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm_sub_pd(x, y); }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm_mul_pd(x, y); }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm_div_pd(x, y); }
static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm_div_pd(_mm_set1_pd(1), x); }
static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm_sqrt_pd(x); }
static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm_andnot_pd(_mm_set1_pd(-0.0), d); }
static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm_xor_pd(_mm_set1_pd(-0.0), d); }
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fmadd_pd(x, y, z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fmsub_pd(x, y, z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fnmadd_pd(x, y, z); }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm_max_pd(x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm_min_pd(x, y); }
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fmadd_pd(x, y, z); }
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fmadd_pd(x, y, z); }
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fmsub_pd(x, y, z); }
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fnmadd_pd(x, y, z); }
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fnmsub_pd(x, y, z); }
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_EQ_OQ)); }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_NEQ_UQ)); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_LT_OQ)); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_LE_OQ)); }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_GT_OQ)); }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_GE_OQ)); }
//
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); }
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); }
static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); }
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); }
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); }
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); }
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return _mm_and_si128(x, y); }
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return _mm_andnot_si128(x, y); }
static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); }
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); }
static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); }
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return _mm_blendv_epi8(y, x, m); }
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return _mm_blendv_pd(y, x, _mm_castsi128_pd(o)); }
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
}
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}
static INLINE vopmask visinf_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm_cmp_pd(vabs_vd_vd(d), _mm_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));
}
static INLINE vopmask vispinf_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm_cmp_pd(d, _mm_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));
}
static INLINE vopmask visminf_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm_cmp_pd(d, _mm_set1_pd(-SLEEF_INFINITY), _CMP_EQ_OQ));
}
static INLINE vopmask visnan_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm_cmp_pd(d, d, _CMP_NEQ_UQ));
}
static INLINE vdouble vload_vd_p(const double *ptr) { return _mm_load_pd(ptr); }
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm_loadu_pd(ptr); }
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm_storeu_pd(ptr, v); }
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm_i32gather_pd(ptr, vi, 8); }
#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
static INLINE double vcast_d_vd(vdouble v) {
double a[VECTLENDP];
vstoreu_v_p_vd(a, v);
return a[0];
}
#endif
//
static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }
static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm_cvtps_epi32(vf)); }
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm_cvttps_epi32(vf)); }
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm_cvtepi32_ps(vcast_vm_vi2(vi)); }
static INLINE vfloat vcast_vf_f(float f) { return _mm_set1_ps(f); }
static INLINE vint2 vcast_vi2_i(int i) { return _mm_set1_epi32(i); }
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm_castps_si128(vf); }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm_castsi128_ps(vm); }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return vreinterpret_vf_vm(vcast_vm_vi2(vi)); }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return vcast_vi2_vm(vreinterpret_vm_vf(vf)); }
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_add_ps(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm_sub_ps(x, y); }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm_mul_ps(x, y); }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm_div_ps(x, y); }
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm_sqrt_ps(x); }
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fmadd_ps(x, y, z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fmsub_ps(x, y, z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fnmadd_ps(x, y, z); }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm_max_ps(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm_min_ps(x, y); }
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fmadd_ps(x, y, z); }
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fmadd_ps(x, y, z); }
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fmsub_ps(x, y, z); }
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fnmadd_ps(x, y, z); }
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fnmsub_ps(x, y, z); }
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_EQ_OQ)); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_NEQ_UQ)); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_LT_OQ)); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_LE_OQ)); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_GT_OQ)); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_GE_OQ)); }
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_add_epi32(x, y); }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_sub_epi32(x, y); }
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); }
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_and_si128(x, y); }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_andnot_si128(x, y); }
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_or_si128(x, y); }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_xor_si128(x, y); }
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return _mm_slli_epi32(x, c); }
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return _mm_srli_epi32(x, c); }
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return _mm_srai_epi32(x, c); }
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); }
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); }
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
return _mm_blendv_epi8(y, x, m);
}
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return _mm_blendv_ps(y, x, _mm_castsi128_ps(o)); }
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}
static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
static INLINE vfloat vload_vf_p(const float *ptr) { return _mm_load_ps(ptr); }
static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm_loadu_ps(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm_storeu_ps(ptr, v); }
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm_i32gather_ps(ptr, vi2, 4); }
#ifdef _MSC_VER
// This function is needed when debugging on MSVC.
static INLINE float vcast_f_vf(vfloat v) {
float a[VECTLENSP];
vstoreu_v_p_vf(a, v);
return a[0];
}
#endif
//
#define PNMASK ((vdouble) { +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0 })
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_addsub_pd(x, y); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_addsub_ps(x, y); }
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }
static INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm_shuffle_pd(d0, d0, 1); }
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vd; }
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm_stream_pd(ptr, v); }
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { _mm_stream_pd((double *)(&ptr[2*offset]), v); }
//
static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); }
static INLINE vfloat vreva2_vf_vf(vfloat d0) { return _mm_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); }
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm_stream_ps(ptr, v); }
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
_mm_storel_pd((double *)(ptr+(offset + step * 0)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
_mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
}
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
_mm_storel_pd((double *)(ptr+(offset + step * 0)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
_mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
}
//
static vquad loadu_vq_p(void *p) {
vquad vq = {
vloadu_vi2_p((int32_t *)p),
vloadu_vi2_p((int32_t *)((uint8_t *)p + sizeof(vmask)))
};
return vq;
}
static INLINE vquad cast_vq_aq(vargquad aq) {
vquad vq;
memcpy(&vq, &aq, VECTLENDP * 16);
return vq;
}
static INLINE vargquad cast_aq_vq(vquad vq) {
vargquad aq;
memcpy(&aq, &vq, VECTLENDP * 16);
return aq;
}
static void vstoreu_v_p_vq(void *p, vquad vq) {
vstoreu_v_p_vi2((int32_t *)p, vcast_vi2_vm(vq.x));
vstoreu_v_p_vi2((int32_t *)((uint8_t *)p + sizeof(vmask)), vcast_vi2_vm(vq.y));
}
static INLINE int vtestallzeros_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0; }
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) { return _mm_blendv_epi8(y, x, o); }
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return _mm_sub_epi64(x, y); }
static INLINE vmask vneg64_vm_vm(vmask x) { return _mm_sub_epi64(vcast_vm_i_i(0, 0), x); }
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return _mm_cmpgt_epi64(x, y); } // signed compare
#define vsll64_vm_vm_i(x, c) _mm_slli_epi64(x, c)
#define vsrl64_vm_vm_i(x, c) _mm_srli_epi64(x, c)
//@#define vsll64_vm_vm_i(x, c) _mm_slli_epi64(x, c)
//@#define vsrl64_vm_vm_i(x, c) _mm_srli_epi64(x, c)
static INLINE vmask vcast_vm_vi(vint vi) {
vmask m = _mm_and_si128(_mm_shuffle_epi32(vi, (0 << 6) | (1 << 4) | (0 << 2) | (0 << 0)), _mm_set_epi32(0, -1, 0, -1));
return vor_vm_vm_vm(vcastu_vm_vi(vgt_vo_vi_vi(vcast_vi_i(0), vi)), m);
}
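// Manual 32- to 64-bit sign extension: the shuffle places each int in the low
// half of a 64-bit lane, and vcastu of the (0 > vi) comparison fills the
// upper half with copies of the sign bit; ORing the two yields (int64_t)vi[i]
// per lane.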
static INLINE vint vcast_vi_vm(vmask vm) { return _mm_shuffle_epi32(vm, 0x08); }
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; }
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return m; }
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return v; }
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return m; }


@@ -0,0 +1,600 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#if CONFIG == 1 || CONFIG == 2
#if !defined(__AVX512F__) && !defined(SLEEF_GENHEADER)
#error Please specify -mavx512f.
#endif
#else
#error CONFIG macro invalid or not defined
#endif
#define ENABLE_DP
//@#define ENABLE_DP
#define LOG2VECTLENDP 3
//@#define LOG2VECTLENDP 3
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)
#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)
#if CONFIG == 1
#define ENABLE_FMA_DP
//@#define ENABLE_FMA_DP
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP
#endif
#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT
#if !defined(SLEEF_GENHEADER)
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#include <stdint.h>
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)
typedef __m512i vmask;
typedef __mmask16 vopmask;
typedef __m512d vdouble;
typedef __m256i vint;
typedef __m512 vfloat;
typedef __m512i vint2;
typedef __m512i vint64;
typedef __m512i vuint64;
typedef struct {
vmask x, y;
} vquad;
typedef vquad vargquad;
//
#if !defined(SLEEF_GENHEADER)
#ifndef __SLEEF_H__
void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx);
#endif
static INLINE int cpuSupportsAVX512F() {
int32_t reg[4];
Sleef_x86CpuID(reg, 7, 0);
return (reg[1] & (1 << 16)) != 0;
}
#if CONFIG == 1 && defined(__AVX512F__)
static INLINE int vavailability_i(int name) {
int d = cpuSupportsAVX512F();
return d ? 3 : 0;
}
#define ISANAME "AVX512F"
#define DFTPRIORITY 30
#endif
#if CONFIG == 2 && defined(__AVX512F__)
static INLINE int vavailability_i(int name) {
int d = cpuSupportsAVX512F();
return d ? 3 : 0;
}
#define ISANAME "AVX512FNOFMA"
#define DFTPRIORITY 0
#endif
#endif // #if !defined(SLEEF_GENHEADER)
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }
#ifdef __INTEL_COMPILER
static INLINE int vtestallones_i_vo64(vopmask g) { return _mm512_mask2int(g) == 0xff; }
static INLINE int vtestallones_i_vo32(vopmask g) { return _mm512_mask2int(g) == 0xffff; }
#else
static INLINE int vtestallones_i_vo64(vopmask g) { return g == 0xff; }
static INLINE int vtestallones_i_vo32(vopmask g) { return g == 0xffff; }
#endif
//
static vint2 vloadu_vi2_p(int32_t *p) { return _mm512_loadu_si512((__m512i const *)p); }
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm512_storeu_si512((__m512i *)p, v); }
static vint vloadu_vi_p(int32_t *p) { return _mm256_loadu_si256((__m256i const *)p); }
static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm256_storeu_si256((__m256i *)p, v); }
//
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return _mm512_and_si512(x, y); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return _mm512_andnot_si512(x, y); }
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return _mm512_or_si512(x, y); }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return _mm512_xor_si512(x, y); }
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kand(x, y); }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kandn(x, y); }
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kor(x, y); }
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kxor(x, y); }
static INLINE vmask vand_vm_vo64_vm(vopmask o, vmask m) { return _mm512_mask_and_epi64(_mm512_set1_epi32(0), o, m, m); }
static INLINE vmask vandnot_vm_vo64_vm(vopmask o, vmask m) { return _mm512_mask_and_epi64(m, o, _mm512_set1_epi32(0), _mm512_set1_epi32(0)); }
static INLINE vmask vor_vm_vo64_vm(vopmask o, vmask m) { return _mm512_mask_or_epi64(m, o, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1)); }
static INLINE vmask vand_vm_vo32_vm(vopmask o, vmask m) { return _mm512_mask_and_epi32(_mm512_set1_epi32(0), o, m, m); }
static INLINE vmask vandnot_vm_vo32_vm(vopmask o, vmask m) { return _mm512_mask_and_epi32(m, o, _mm512_set1_epi32(0), _mm512_set1_epi32(0)); }
static INLINE vmask vor_vm_vo32_vm(vopmask o, vmask m) { return _mm512_mask_or_epi32(m, o, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1)); }
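// On AVX-512 the opmask is a real predicate register (__mmask16), so the
// mask/vector combinations above become masked integer ops: keep-or-zero for
// "and", write-zero-where-set for "andnot", and write-all-ones-where-set for
// "or".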
static INLINE vopmask vcast_vo32_vo64(vopmask o) { return o; }
static INLINE vopmask vcast_vo64_vo32(vopmask o) { return o; }
static INLINE vopmask vcast_vo_i(int i) { return i ? -1 : 0; }
//
static INLINE vint vrint_vi_vd(vdouble vd) {
return _mm512_cvt_roundpd_epi32(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC);
}
static INLINE vint vtruncate_vi_vd(vdouble vd) {
return _mm512_cvt_roundpd_epi32(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC);
}
static INLINE vdouble vcast_vd_vi(vint vi) { return _mm512_cvtepi32_pd(vi); }
static INLINE vint vcast_vi_i(int i) { return _mm256_set1_epi32(i); }
static INLINE vdouble vtruncate_vd_vd(vdouble vd) {
return _mm512_roundscale_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC);
}
static INLINE vdouble vrint_vd_vd(vdouble vd) {
return _mm512_roundscale_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC);
}
static INLINE vmask vcastu_vm_vi(vint vi) {
return _mm512_maskz_permutexvar_epi32(0xaaaa, _mm512_set_epi32(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), _mm512_castsi256_si512(vi));
}
static INLINE vint vcastu_vi_vm(vmask vi) {
return _mm512_castsi512_si256(_mm512_maskz_permutexvar_epi32(0x00ff, _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 7, 5, 3, 1), vi));
}
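// The permutation index vector duplicates every source dword, and the 0xaaaa
// zeroing mask writes only the odd dword lanes, so vcastu_vm_vi lands each
// 32-bit value in the upper half of a 64-bit lane with the lower half zeroed.
// vcastu_vi_vm inverts this by gathering the odd dwords (indices 1, 3, ..., 15)
// into the low 256 bits.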
static INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm512_set_epi32(i0, i1, i0, i1, i0, i1, i0, i1, i0, i1, i0, i1, i0, i1, i0, i1); }
static INLINE vmask vcast_vm_i64(int64_t i) { return _mm512_set1_epi64(i); }
static INLINE vmask vcast_vm_u64(uint64_t i) { return _mm512_set1_epi64((uint64_t)i); }
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_EQ); }
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm512_add_epi64(x, y); }
//
static INLINE vdouble vcast_vd_d(double d) { return _mm512_set1_pd(d); }
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm512_castpd_si512(vd); }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm512_castsi512_pd(vm); }
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm512_add_pd(x, y); }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm512_sub_pd(x, y); }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm512_mul_pd(x, y); }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm512_div_pd(x, y); }
static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm512_div_pd(_mm512_set1_pd(1), x); }
static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm512_sqrt_pd(x); }
static INLINE vdouble vabs_vd_vd(vdouble d) { return vreinterpret_vd_vm(_mm512_andnot_si512(vreinterpret_vm_vd(_mm512_set1_pd(-0.0)), vreinterpret_vm_vd(d))); }
static INLINE vdouble vneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(_mm512_xor_si512(vreinterpret_vm_vd(_mm512_set1_pd(-0.0)), vreinterpret_vm_vd(d))); }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm512_max_pd(x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm512_min_pd(x, y); }
#if CONFIG == 1
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmadd_pd(x, y, z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmsub_pd(x, y, z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fnmadd_pd(x, y, z); }
#else
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
#endif
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmadd_pd(x, y, z); }
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmadd_pd(x, y, z); }
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmsub_pd(x, y, z); }
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fnmadd_pd(x, y, z); }
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fnmsub_pd(x, y, z); }
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_EQ_OQ); }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_NEQ_UQ); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_LT_OQ); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_LE_OQ); }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_GT_OQ); }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_GE_OQ); }
//
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm256_add_epi32(x, y); }
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm256_sub_epi32(x, y); }
static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm256_and_si256(x, y); }
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm256_andnot_si256(x, y); }
static INLINE vint vandnot_vi_vo_vi(vopmask o, vint y) {
return _mm512_castsi512_si256(_mm512_mask_and_epi32(_mm512_castsi256_si512(y), o, _mm512_set1_epi32(0), _mm512_set1_epi32(0)));
}
static INLINE vint vand_vi_vo_vi(vopmask o, vint y) {
return _mm512_castsi512_si256(_mm512_mask_and_epi32(_mm512_set1_epi32(0), o, _mm512_castsi256_si512(y), _mm512_castsi256_si512(y)));
}
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm256_or_si256(x, y); }
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm256_xor_si256(x, y); }
#define vsll_vi_vi_i(x, c) _mm256_slli_epi32(x, c)
#define vsrl_vi_vi_i(x, c) _mm256_srli_epi32(x, c)
#define vsra_vi_vi_i(x, c) _mm256_srai_epi32(x, c)
//@#define vsll_vi_vi_i(x, c) _mm256_slli_epi32(x, c)
//@#define vsrl_vi_vi_i(x, c) _mm256_srli_epi32(x, c)
//@#define vsra_vi_vi_i(x, c) _mm256_srai_epi32(x, c)
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm256_cmpeq_epi32(x, y); }
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm256_cmpgt_epi32(x, y); }
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) {
return _mm512_cmp_epi32_mask(_mm512_castsi256_si512(x), _mm512_castsi256_si512(y), _MM_CMPINT_EQ);
}
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) {
return _mm512_cmp_epi32_mask(_mm512_castsi256_si512(y), _mm512_castsi256_si512(x), _MM_CMPINT_LT);
}
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask mask, vdouble x, vdouble y) {
return _mm512_mask_blend_pd(mask, y, x);
}
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
}
#if 1
// The permute-based select below is likely faster than the chained blends in the #else branch.
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
__m512i v = _mm512_castpd_si512(vsel_vd_vo_vd_vd(o0, _mm512_castsi512_pd(_mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, 0)),
vsel_vd_vo_vd_vd(o1, _mm512_castsi512_pd(_mm512_set_epi64(1, 1, 1, 1, 1, 1, 1, 1)),
vsel_vd_vo_vd_vd(o2, _mm512_castsi512_pd(_mm512_set_epi64(2, 2, 2, 2, 2, 2, 2, 2)),
_mm512_castsi512_pd(_mm512_set_epi64(3, 3, 3, 3, 3, 3, 3, 3))))));
return _mm512_permutexvar_pd(v, _mm512_castpd256_pd512(_mm256_set_pd(d3, d2, d1, d0)));
}
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
return vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o1, d0, d1, d2, d2);
}
#else
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}
#endif
static INLINE vopmask visinf_vo_vd(vdouble d) {
return _mm512_cmp_pd_mask(vabs_vd_vd(d), _mm512_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ);
}
static INLINE vopmask vispinf_vo_vd(vdouble d) {
return _mm512_cmp_pd_mask(d, _mm512_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ);
}
static INLINE vopmask visminf_vo_vd(vdouble d) {
return _mm512_cmp_pd_mask(d, _mm512_set1_pd(-SLEEF_INFINITY), _CMP_EQ_OQ);
}
static INLINE vopmask visnan_vo_vd(vdouble d) {
return _mm512_cmp_pd_mask(d, d, _CMP_NEQ_UQ);
}
static INLINE vint vilogbk_vi_vd(vdouble d) { return vrint_vi_vd(_mm512_getexp_pd(d)); }
// vilogb2k_vi_vd is similar to vilogbk_vi_vd, but the argument has to
// be a normalized FP value.
static INLINE vint vilogb2k_vi_vd(vdouble d) { return vrint_vi_vd(_mm512_getexp_pd(d)); }
static INLINE vdouble vgetexp_vd_vd(vdouble d) { return _mm512_getexp_pd(d); }
static INLINE vfloat vgetexp_vf_vf(vfloat d) { return _mm512_getexp_ps(d); }
static INLINE vdouble vgetmant_vd_vd(vdouble d) { return _mm512_getmant_pd(d, _MM_MANT_NORM_p75_1p5, _MM_MANT_SIGN_nan); }
static INLINE vfloat vgetmant_vf_vf(vfloat d) { return _mm512_getmant_ps(d, _MM_MANT_NORM_p75_1p5, _MM_MANT_SIGN_nan); }
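// AVX-512 can decompose a float directly: vgetexp returns the exponent of its
// argument as a floating-point value (essentially floor(log2|x|), hence the
// vrint in vilogbk above), and vgetmant with _MM_MANT_NORM_p75_1p5 returns
// the mantissa normalized into [0.75, 1.5). The vfixup macros below classify
// each lane of their second operand and substitute special-case results
// (NaN, infinities, zero) chosen from a lookup table in the third operand.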
#define vfixup_vd_vd_vd_vi2_i(a, b, c, imm) _mm512_fixupimm_pd((a), (b), (c), (imm))
#define vfixup_vf_vf_vf_vi2_i(a, b, c, imm) _mm512_fixupimm_ps((a), (b), (c), (imm))
//@#define vfixup_vd_vd_vd_vi2_i(a, b, c, imm) _mm512_fixupimm_pd((a), (b), (c), (imm))
//@#define vfixup_vf_vf_vf_vi2_i(a, b, c, imm) _mm512_fixupimm_ps((a), (b), (c), (imm))
#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
static INLINE double vcast_d_vd(vdouble v) {
double s[VECTLENDP];
_mm512_storeu_pd(s, v);
return s[0];
}
#endif
static INLINE vdouble vload_vd_p(const double *ptr) { return _mm512_load_pd(ptr); }
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm512_loadu_pd(ptr); }
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm512_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm512_storeu_pd(ptr, v); }
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm512_i32gather_pd(vi, ptr, 8); }
//
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
return _mm512_castsi512_si256(_mm512_mask_blend_epi32(m, _mm512_castsi256_si512(y), _mm512_castsi256_si512(x)));
}
//
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm512_castps_si512(vf); }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm512_castsi512_ps(vm); }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return _mm512_castsi512_ps(vi); }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return _mm512_castps_si512(vf); }
static INLINE vdouble vreinterpret_vd_vf(vfloat vf) { return _mm512_castps_pd(vf); }
static INLINE vfloat vreinterpret_vf_vd(vdouble vd) { return _mm512_castpd_ps(vd); }
static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm512_cvtepi32_ps(vcast_vm_vi2(vi)); }
static INLINE vfloat vcast_vf_f(float f) { return _mm512_set1_ps(f); }
static INLINE vint2 vcast_vi2_i(int i) { return _mm512_set1_epi32(i); }
static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm512_cvtps_epi32(vf)); }
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm512_cvttps_epi32(vf)); }
static INLINE vfloat vtruncate_vf_vf(vfloat vd) {
return _mm512_roundscale_ps(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC);
}
static INLINE vfloat vrint_vf_vf(vfloat vd) {
return _mm512_roundscale_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC);
}
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm512_add_ps(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm512_sub_ps(x, y); }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm512_mul_ps(x, y); }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm512_div_ps(x, y); }
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm512_sqrt_ps(x); }
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm512_max_ps(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm512_min_ps(x, y); }
#if CONFIG == 1
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmadd_ps(x, y, z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmsub_ps(x, y, z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fnmadd_ps(x, y, z); }
#else
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
#endif
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmadd_ps(x, y, z); }
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmadd_ps(x, y, z); }
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmsub_ps(x, y, z); }
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fnmadd_ps(x, y, z); }
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fnmsub_ps(x, y, z); }
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_EQ_OQ); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_NEQ_UQ); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_LT_OQ); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_LE_OQ); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_GT_OQ); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ); }
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_add_epi32(x, y); }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_sub_epi32(x, y); }
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); }
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_and_si512(x, y); }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_andnot_si512(x, y); }
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_or_si512(x, y); }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_xor_si512(x, y); }
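// AVX-512 has no direct opmask-and-vector AND, so emulate it with masked
// operations: vand keeps m where the opmask is set (zero elsewhere), and
// vandnot zeroes the lanes where the opmask is set.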
static INLINE vint2 vand_vi2_vo_vi2(vopmask o, vint2 m) {
return _mm512_mask_and_epi32(_mm512_set1_epi32(0), o, m, m);
}
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask o, vint2 m) {
return _mm512_mask_and_epi32(m, o, _mm512_set1_epi32(0), _mm512_set1_epi32(0));
}
#define vsll_vi2_vi2_i(x, c) _mm512_slli_epi32(x, c)
#define vsrl_vi2_vi2_i(x, c) _mm512_srli_epi32(x, c)
#define vsra_vi2_vi2_i(x, c) _mm512_srai_epi32(x, c)
//@#define vsll_vi2_vi2_i(x, c) _mm512_slli_epi32(x, c)
//@#define vsrl_vi2_vi2_i(x, c) _mm512_srli_epi32(x, c)
//@#define vsra_vi2_vi2_i(x, c) _mm512_srai_epi32(x, c)
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm512_cmpeq_epi32_mask(x, y); }
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm512_cmpgt_epi32_mask(x, y); }
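// Materialize a comparison result as a full-width vector: lanes selected by
// the opmask become all-ones (-1 & -1), the rest stay zero.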
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
__mmask16 m = _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_EQ);
return _mm512_mask_and_epi32(_mm512_set1_epi32(0), m, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1));
}
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) {
__mmask16 m = _mm512_cmp_epi32_mask(y, x, _MM_CMPINT_LT);
return _mm512_mask_and_epi32(_mm512_set1_epi32(0), m, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1));
}
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
return _mm512_mask_blend_epi32(m, y, x);
}
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask m, vfloat x, vfloat y) {
return _mm512_mask_blend_ps(m, y, x);
}
// At this point, the following three functions are implemented in a generic way,
// but I will try target-specific optimization later on.
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}
static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
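// _mm512_getexp_ps extracts the unbiased exponent directly, so both ilogb
// variants reduce to a rounding conversion.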
static INLINE vint2 vilogbk_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm512_getexp_ps(d)); }
static INLINE vint2 vilogb2k_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm512_getexp_ps(d)); }
#ifdef _MSC_VER
// This function is needed when debugging on MSVC.
static INLINE float vcast_f_vf(vfloat v) {
float s[VECTLENSP];
_mm512_storeu_ps(s, v);
return s[0];
}
#endif
static INLINE vfloat vload_vf_p(const float *ptr) { return _mm512_load_ps(ptr); }
static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm512_loadu_ps(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm512_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm512_storeu_ps(ptr, v); }
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm512_i32gather_ps(vi2, ptr, 4); }
//
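// Flip the sign of alternating lanes by XORing -0.0 under a 16-bit write mask
// over 32-bit elements: 0xcccc/0x3333 select the odd/even doubles (each double
// spans two 32-bit elements), 0xaaaa/0x5555 the odd/even floats.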
static INLINE vdouble vposneg_vd_vd(vdouble d) {
return vreinterpret_vd_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vd(d), 0xcccc, vreinterpret_vm_vd(d), vreinterpret_vm_vd(_mm512_set1_pd(-0.0))));
}
static INLINE vdouble vnegpos_vd_vd(vdouble d) {
return vreinterpret_vd_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vd(d), 0x3333, vreinterpret_vm_vd(d), vreinterpret_vm_vd(_mm512_set1_pd(-0.0))));
}
static INLINE vfloat vposneg_vf_vf(vfloat d) {
return vreinterpret_vf_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vf(d), 0xaaaa, vreinterpret_vm_vf(d), vreinterpret_vm_vf(_mm512_set1_ps(-0.0f))));
}
static INLINE vfloat vnegpos_vf_vf(vfloat d) {
return vreinterpret_vf_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vf(d), 0x5555, vreinterpret_vm_vf(d), vreinterpret_vm_vf(_mm512_set1_ps(-0.0f))));
}
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vadd_vf_vf_vf(x, vnegpos_vf_vf(y)); }
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmaddsub_pd(x, y, z); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmaddsub_ps(x, y, z); }
static INLINE vdouble vrev21_vd_vd(vdouble vd) { return _mm512_permute_pd(vd, 0x55); }
static INLINE vdouble vreva2_vd_vd(vdouble vd) {
return vreinterpret_vd_vm(_mm512_permutexvar_epi32(_mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12), vreinterpret_vm_vd(vd)));
}
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm512_stream_pd(ptr, v); }
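// Scatter the vector as 128-bit chunks (pairs of doubles): chunk k is stored
// at ptr[(offset + step * k) * 2].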
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
_mm_store_pd(&ptr[(offset + step * 0)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 0)));
_mm_store_pd(&ptr[(offset + step * 1)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 1)));
_mm_store_pd(&ptr[(offset + step * 2)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 2)));
_mm_store_pd(&ptr[(offset + step * 3)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 3)));
}
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
_mm_stream_pd(&ptr[(offset + step * 0)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 0)));
_mm_stream_pd(&ptr[(offset + step * 1)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 1)));
_mm_stream_pd(&ptr[(offset + step * 2)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 2)));
_mm_stream_pd(&ptr[(offset + step * 3)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 3)));
}
//
static INLINE vfloat vrev21_vf_vf(vfloat vf) { return _mm512_permute_ps(vf, 0xb1); }
static INLINE vfloat vreva2_vf_vf(vfloat vf) {
return vreinterpret_vf_vm(_mm512_permutexvar_epi32(_mm512_set_epi32(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14), vreinterpret_vm_vf(vf)));
}
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm512_stream_ps(ptr, v); }
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
_mm_storel_pd((double *)(ptr+(offset + step * 0)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 0)));
_mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 0)));
_mm_storel_pd((double *)(ptr+(offset + step * 2)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 1)));
_mm_storeh_pd((double *)(ptr+(offset + step * 3)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 1)));
_mm_storel_pd((double *)(ptr+(offset + step * 4)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 2)));
_mm_storeh_pd((double *)(ptr+(offset + step * 5)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 2)));
_mm_storel_pd((double *)(ptr+(offset + step * 6)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 3)));
_mm_storeh_pd((double *)(ptr+(offset + step * 7)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 3)));
}
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
//
static vquad loadu_vq_p(void *p) {
vquad vq;
memcpy(&vq, p, VECTLENDP * 16);
return vq;
}
static INLINE vquad cast_vq_aq(vargquad aq) {
vquad vq;
memcpy(&vq, &aq, VECTLENDP * 16);
return vq;
}
static INLINE vargquad cast_aq_vq(vquad vq) {
vargquad aq;
memcpy(&aq, &vq, VECTLENDP * 16);
return aq;
}
#ifdef __INTEL_COMPILER
static INLINE int vtestallzeros_i_vo64(vopmask g) { return _mm512_mask2int(g) == 0; }
#else
static INLINE int vtestallzeros_i_vo64(vopmask g) { return g == 0; }
#endif
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask m, vmask x, vmask y) { return _mm512_mask_blend_epi64(m, y, x); }
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return _mm512_sub_epi64(x, y); }
static INLINE vmask vneg64_vm_vm(vmask x) { return _mm512_sub_epi64(vcast_vm_i_i(0, 0), x); }
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return _mm512_cmp_epi64_mask(y, x, _MM_CMPINT_LT); } // signed compare
#define vsll64_vm_vm_i(x, c) _mm512_slli_epi64(x, c)
#define vsrl64_vm_vm_i(x, c) _mm512_srli_epi64(x, c)
//@#define vsll64_vm_vm_i(x, c) _mm512_slli_epi64(x, c)
//@#define vsrl64_vm_vm_i(x, c) _mm512_srli_epi64(x, c)
static INLINE vmask vcast_vm_vi(vint vi) {
return _mm512_cvtepi32_epi64(vi);
}
static INLINE vint vcast_vi_vm(vmask vm) {
return _mm512_cvtepi64_epi32(vm);
}
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; }
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return m; }
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return v; }
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return m; }


@@ -0,0 +1,297 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#if !defined(__ARM_NEON) && !defined(SLEEF_GENHEADER)
#error Please specify -mfpu=neon.
#endif
#ifdef __aarch64__
#warning This implementation is for AARCH32.
#endif
#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP 2
//@#define LOG2VECTLENSP 2
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)
#if CONFIG == 4
#define ISANAME "AARCH32 NEON-VFPV4"
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP
#else
#define ISANAME "AARCH32 NEON"
#endif
#define DFTPRIORITY 10
#define ENABLE_RECSQRT_SP
//@#define ENABLE_RECSQRT_SP
#include <arm_neon.h>
#include <stdint.h>
#include "misc.h"
typedef uint32x4_t vmask;
typedef uint32x4_t vopmask;
//typedef int32x4_t vint;
typedef float32x4_t vfloat;
typedef int32x4_t vint2;
//
static INLINE void vprefetch_v_p(const void *ptr) { }
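// All-ones test: AND the two halves together, then take a pairwise minimum;
// the extracted lane is nonzero only if every lane was all-ones.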
static INLINE int vtestallones_i_vo32(vopmask g) {
uint32x2_t x0 = vand_u32(vget_low_u32(g), vget_high_u32(g));
uint32x2_t x1 = vpmin_u32(x0, x0);
return vget_lane_u32(x1, 0);
}
static vfloat vloaduf(float *p) { return vld1q_f32(p); }
static void vstoreuf(float *p, vfloat v) { vst1q_f32(p, v); }
static vint2 vloadu_vi2_p(int32_t *p) { return vld1q_s32(p); }
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
//
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vandq_u32(x, y); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vbicq_u32(y, x); }
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vorrq_u32(x, y); }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return veorq_u32(x, y); }
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vandq_u32(x, y); }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vbicq_u32(y, x); }
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vorrq_u32(x, y); }
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return veorq_u32(x, y); }
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vandq_u32(x, y); }
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vbicq_u32(y, x); }
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vorrq_u32(x, y); }
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return veorq_u32(x, y); }
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vandq_u32(x, y); }
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vbicq_u32(y, x); }
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vorrq_u32(x, y); }
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return veorq_u32(x, y); }
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return vuzpq_u32(m, m).val[0]; }
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return vzipq_u32(m, m).val[0]; }
//
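// Splat a 64-bit pattern built from two 32-bit halves across both doubleword
// lanes.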
static INLINE vmask vcast_vm_i_i(int i0, int i1) { return (vmask)vdupq_n_u64((uint64_t)i0 | (((uint64_t)i1) << 32)); }
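// 64-bit equality from 32-bit compares: AND each comparison result with its
// word-swapped neighbour, so a doubleword is all-ones only if both words match.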
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
uint32x4_t t = vceqq_u32(x, y);
return vandq_u32(t, vrev64q_u32(t));
}
//
static INLINE vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { return (vmask)vi; }
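// Round to nearest (ties away from zero): add 0.5 with the sign copied from
// the input, then let the convert instruction truncate toward zero.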
static INLINE vint2 vrint_vi2_vf(vfloat d) {
return vcvtq_s32_f32(vaddq_f32(d, (float32x4_t)vorrq_u32(vandq_u32((uint32x4_t)d, (uint32x4_t)vdupq_n_f32(-0.0f)), (uint32x4_t)vdupq_n_f32(0.5f))));
}
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcvtq_s32_f32(vf); }
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return vcvtq_f32_s32(vi); }
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
static INLINE vfloat vrint_vf_vf(vfloat vd) { return vcast_vf_vi2(vrint_vi2_vf(vd)); }
static INLINE vfloat vcast_vf_f(float f) { return vdupq_n_f32(f); }
static INLINE vint2 vcast_vi2_i(int i) { return vdupq_n_s32(i); }
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (vfloat)vm; }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vm) { return (vfloat)vm; }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return (vint2)vf; }
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return vaddq_f32(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return vsubq_f32(x, y); }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return vmulq_f32(x, y); }
static INLINE vfloat vabs_vf_vf(vfloat f) { return vabsq_f32(f); }
static INLINE vfloat vneg_vf_vf(vfloat f) { return vnegq_f32(f); }
#if CONFIG == 4
static INLINE vfloat vmla_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { return vfmaq_f32(z, x, y); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vfmsq_f32(z, x, y); }
static INLINE vfloat vfma_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { return vfmaq_f32(z, x, y); }
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vfmsq_f32(z, x, y); }
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vneg_vf_vf(vfmanp_vf_vf_vf_vf(x, y, z)); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vneg_vf_vf(vfmanp_vf_vf_vf_vf(x, y, z)); }
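// Division via reciprocal estimate: vrecpsq supplies the Newton-Raphson factor
// (2 - y*t); after two refinements of t, a fused step corrects the quotient
// with the residual x - y*u.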
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) {
float32x4_t t = vrecpeq_f32(y), u;
t = vmulq_f32(t, vrecpsq_f32(y, t));
t = vfmaq_f32(t, vfmsq_f32(vdupq_n_f32(1.0f), y, t), t);
u = vmulq_f32(x, t);
return vfmaq_f32(u, vfmsq_f32(x, y, u), t);
}
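// Square root via rsqrt estimate: two vrsqrtsq Newton-Raphson refinements,
// a fused correction u += (d - u*u) * (x/2), and a final mask so that
// sqrt(0) returns 0 instead of NaN.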
static INLINE vfloat vsqrt_vf_vf(vfloat d) {
float32x4_t x = vrsqrteq_f32(d);
x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));
x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));
float32x4_t u = vmulq_f32(x, d);
u = vfmaq_f32(u, vfmsq_f32(d, u, u), vmulq_f32(x, vdupq_n_f32(0.5)));
return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(u), vceqq_f32(d, vdupq_n_f32(0.0f))));
}
static INLINE vfloat vrec_vf_vf(vfloat y) {
float32x4_t t = vrecpeq_f32(y);
t = vmulq_f32(t, vrecpsq_f32(y, t));
t = vfmaq_f32(t, vfmsq_f32(vdupq_n_f32(1.0f), y, t), t);
return vfmaq_f32(t, vfmsq_f32(vdupq_n_f32(1.0f), y, t), t);
}
static INLINE vfloat vrecsqrt_vf_vf(vfloat d) {
float32x4_t x = vrsqrteq_f32(d);
x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));
return vfmaq_f32(x, vfmsq_f32(vdupq_n_f32(1), x, vmulq_f32(x, d)), vmulq_f32(x, vdupq_n_f32(0.5)));
}
#else // #if CONFIG == 4
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmlaq_f32(z, x, y); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmlsq_f32(z, x, y); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vneg_vf_vf(vmlsq_f32(z, x, y)); }
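// Non-FMA path: refine the reciprocal estimate once with vrecpsq, then apply
// one Newton-Raphson step directly to the quotient: t*(2 - x*d).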
static INLINE vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) {
float32x4_t x = vrecpeq_f32(d);
x = vmulq_f32(x, vrecpsq_f32(d, x));
float32x4_t t = vmulq_f32(n, x);
return vmlsq_f32(vaddq_f32(t, t), vmulq_f32(t, x), d);
}
static INLINE vfloat vsqrt_vf_vf(vfloat d) {
float32x4_t x = vrsqrteq_f32(d);
x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));
float32x4_t u = vmulq_f32(x, d);
u = vmlaq_f32(u, vmlsq_f32(d, u, u), vmulq_f32(x, vdupq_n_f32(0.5)));
return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(u), vceqq_f32(d, vdupq_n_f32(0.0f))));
}
static INLINE vfloat vrec_vf_vf(vfloat d) {
float32x4_t x = vrecpeq_f32(d);
x = vmulq_f32(x, vrecpsq_f32(d, x));
return vmlsq_f32(vaddq_f32(x, x), vmulq_f32(x, x), d);
}
static INLINE vfloat vrecsqrt_vf_vf(vfloat d) {
float32x4_t x = vrsqrteq_f32(d);
x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));
return vmlaq_f32(x, vmlsq_f32(vdupq_n_f32(1), x, vmulq_f32(x, d)), vmulq_f32(x, vdupq_n_f32(0.5)));
}
#endif // #if CONFIG == 4
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return vmaxq_f32(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return vminq_f32(x, y); }
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vceqq_f32(x, y); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vmvnq_u32(vceqq_f32(x, y)); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vcltq_f32(x, y); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vcleq_f32(x, y); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vcgtq_f32(x, y); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vcgeq_f32(x, y); }
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return vaddq_s32(x, y); }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return vsubq_s32(x, y); }
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vnegq_s32(e); }
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return vandq_s32(x, y); }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return vbicq_s32(y, x); }
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return vorrq_s32(x, y); }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return veorq_s32(x, y); }
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)vandq_u32(x, (vopmask)y); }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)vbicq_u32((vopmask)y, x); }
#define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c)
#define vsrl_vi2_vi2_i(x, c) vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c))
#define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c)
//@#define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c)
//@#define vsrl_vi2_vi2_i(x, c) vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c))
//@#define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c)
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return vceqq_s32(x, y); }
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return vcgtq_s32(x, y); }
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return (vint2)vceqq_s32(x, y); }
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return (vint2)vcgtq_s32(x, y); }
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { return (vint2)vbslq_u32(m, (vmask)x, (vmask)y); }
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask mask, vfloat x, vfloat y) {
return (vfloat)vbslq_u32(mask, (vmask)x, (vmask)y);
}
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}
static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
// This function is needed when debugging on MSVC.
static INLINE float vcast_f_vf(vfloat v) {
float p[4];
vst1q_f32 (p, v);
return p[0];
}
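// Availability probe: executes an actual NEON add so the check exercises the
// instruction set itself.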
static INLINE int vavailability_i(int name) {
if (name != 2) return 0;
return vcast_f_vf(vadd_vf_vf_vf(vcast_vf_f(name), vcast_vf_f(name))) != 0.0;
}
static INLINE vfloat vload_vf_p(const float *ptr) { return vld1q_f32(__builtin_assume_aligned(ptr, 16)); }
static INLINE vfloat vloadu_vf_p(const float *ptr) { return vld1q_f32(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32(__builtin_assume_aligned(ptr, 16), v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
return ((vfloat) {
ptr[vgetq_lane_s32(vi2, 0)],
ptr[vgetq_lane_s32(vi2, 1)],
ptr[vgetq_lane_s32(vi2, 2)],
ptr[vgetq_lane_s32(vi2, 3)]
});
}
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)PNMASKf); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)NPMASKf); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat d0, vfloat d1) { return vadd_vf_vf_vf(d0, vnegpos_vf_vf(d1)); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
static INLINE vfloat vrev21_vf_vf(vfloat d0) { return vrev64q_f32(d0); }
static INLINE vfloat vreva2_vf_vf(vfloat d0) { return vcombine_f32(vget_high_f32(d0), vget_low_f32(d0)); }
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); }
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));
vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));
}
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));
vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));
}


@@ -0,0 +1,873 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#if CONFIG == 1 || CONFIG == 2 || CONFIG == 3 || CONFIG == 4
#ifndef __VSX__
#error Please specify -mcpu=power8 or -mcpu=power9
#endif
#else
#error CONFIG macro invalid or not defined
#endif
#define ENABLE_DP
//@#define ENABLE_DP
#define LOG2VECTLENDP 1
//@#define LOG2VECTLENDP 1
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)
#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)
#if CONFIG == 1 || CONFIG == 3
#define ENABLE_FMA_DP
//@#define ENABLE_FMA_DP
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP
#endif
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT
#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#if !defined(SLEEF_GENHEADER)
#include <altivec.h>
// undef altivec types since CPP and C99 use them as compiler tokens
// use __vector and __bool instead
#undef vector
#undef bool
#include <stdint.h>
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)
#if CONFIG == 1 || CONFIG == 2
#define ISANAME "VSX"
#else
#define ISANAME "VSX-3"
#endif
#define DFTPRIORITY 25
static INLINE int vavailability_i(int name) { return 3; }
static INLINE void vprefetch_v_p(const void *ptr) { }
/**********************************************
** Types
***********************************************/
typedef __vector unsigned int vmask;
// using __bool with typedef may cause ambiguous errors
#define vopmask __vector __bool int
//@#define vopmask __vector __bool int
typedef __vector signed int vint;
typedef __vector signed int vint2;
typedef __vector float vfloat;
typedef __vector double vdouble;
// internal use types
typedef __vector unsigned int v__u32;
typedef __vector unsigned char v__u8;
typedef __vector signed long long v__i64;
typedef __vector unsigned long long v__u64;
#define v__b64 __vector __bool long long
typedef __vector long long vint64;
typedef __vector unsigned long long vuint64;
typedef struct {
vmask x, y;
} vquad;
typedef vquad vargquad;
/**********************************************
** Utilities
***********************************************/
#define vset__vi(v0, v1) ((vint) {v0, v1, v0, v1})
#define vset__vi2(...) ((vint2) {__VA_ARGS__})
#define vset__vm(...) ((vmask) {__VA_ARGS__})
#define vset__vo(...) ((vopmask) {__VA_ARGS__})
#define vset__vf(...) ((vfloat) {__VA_ARGS__})
#define vset__vd(...) ((vdouble) {__VA_ARGS__})
#define vset__u8(...) ((v__u8) {__VA_ARGS__})
#define vset__u32(...) ((v__u32) {__VA_ARGS__})
#define vset__s64(...) ((v__i64) {__VA_ARGS__})
#define vset__u64(...) ((v__u64) {__VA_ARGS__})
#define vsetall__vi(v) vset__vi(v, v)
#define vsetall__vi2(v) vset__vi2(v, v, v, v)
#define vsetall__vm(v) vset__vm(v, v, v, v)
#define vsetall__vo(v) vset__vo(v, v, v, v)
#define vsetall__vf(v) vset__vf(v, v, v, v)
#define vsetall__vd(v) vset__vd(v, v)
#define vsetall__u8(v) vset__u8(v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v)
#define vsetall__u32(v) vset__u32(v, v, v, v)
#define vsetall__s64(v) vset__s64(v, v)
#define vsetall__u64(v) vset__u64(v, v)
#define vzero__vi() vsetall__vi(0)
#define vzero__vi2() vsetall__vi2(0)
#define vzero__vm() vsetall__vm(0)
#define vzero__vo() vsetall__vo(0)
#define vzero__vf() vsetall__vf(0)
#define vzero__vd() vsetall__vd(0)
#define vzero__u8() vsetall__u8(0)
#define vzero__u32() vsetall__u32(0)
#define vzero__s64() vsetall__s64(0)
#define vzero__u64() vsetall__u64(0)
//// Swap doubleword elements
#if defined(__clang__) || __GNUC__ >= 7
static INLINE v__u64 v__swapd_u64(v__u64 v)
{ return vec_xxpermdi(v, v, 2); }
#else
static INLINE v__u64 v__swapd_u64(v__u64 v)
{
__asm__ __volatile__("xxswapd %x0,%x1" : "=wa" (v) : "wa" (v));
return v;
}
#endif
/**********************************************
** Memory
***********************************************/
////////////// Unaligned memory access //////////////
/**
* It's not safe to use vector assignment via (cast & dereference) for unaligned
* memory access with almost all clang versions and with GCC 8 when VSX3 isn't
* enabled; these compilers tend to generate 'lvx/stvx' instructions instead of
* 'lxvd2x/lxvw4x/stxvd2x/stxvw4x'. For more information, check
* https://github.com/seiko2plus/vsx_mem_test
*
* TODO: check GCC(9, 10)
*/
//// load
#if defined(__POWER9_VECTOR__) || (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8)
static vint vloadu_vi_p(const int32_t *ptr)
{ return *((vint*)ptr); }
static INLINE vint2 vloadu_vi2_p(const int32_t *ptr)
{ return *((vint2*)ptr); }
static INLINE vfloat vloadu_vf_p(const float *ptr)
{ return *((vfloat*)ptr); }
static INLINE vdouble vloadu_vd_p(const double *ptr)
{ return *((vdouble*)ptr); }
#else
static vint vloadu_vi_p(const int32_t *ptr)
{ return vec_vsx_ld(0, ptr); }
static INLINE vint2 vloadu_vi2_p(const int32_t *ptr)
{ return vec_vsx_ld(0, ptr); }
static INLINE vfloat vloadu_vf_p(const float *ptr)
{ return vec_vsx_ld(0, ptr); }
static INLINE vdouble vloadu_vd_p(const double *ptr)
{ return vec_vsx_ld(0, ptr); }
#endif
//// store
#if defined(__POWER9_VECTOR__) || (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8)
static void vstoreu_v_p_vi(int32_t *ptr, vint v)
{ *((vint*)ptr) = v; }
static void vstoreu_v_p_vi2(int32_t *ptr, vint2 v)
{ *((vint2*)ptr) = v; }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v)
{ *((vfloat*)ptr) = v; }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v)
{ *((vdouble*)ptr) = v; }
#else
static void vstoreu_v_p_vi(int32_t *ptr, vint v)
{ vec_vsx_st(v, 0, ptr); }
static void vstoreu_v_p_vi2(int32_t *ptr, vint2 v)
{ vec_vsx_st(v, 0, ptr); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v)
{ vec_vsx_st(v, 0, ptr); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v)
{ vec_vsx_st(v, 0, ptr); }
#endif
////////////// aligned memory access //////////////
//// load
static INLINE vfloat vload_vf_p(const float *ptr)
{ return vec_ld(0, ptr); }
static INLINE vdouble vload_vd_p(const double *ptr)
{ return *((vdouble*)ptr); }
//// store
static INLINE void vstore_v_p_vf(float *ptr, vfloat v)
{ vec_st(v, 0, ptr); }
static INLINE void vstore_v_p_vd(double *ptr, vdouble v)
{ *((vdouble*)ptr) = v; }
////////////// non-temporal memory access //////////////
//// store
static INLINE void vstream_v_p_vf(float *ptr, vfloat v)
{ vstore_v_p_vf(ptr, v); }
static INLINE void vstream_v_p_vd(double *ptr, vdouble v)
{ vstore_v_p_vd(ptr, v); }
////////////// LUT //////////////
//// load
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi)
{ return vset__vd(ptr[vec_extract(vi, 0)], ptr[vec_extract(vi, 1)]); }
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2)
{
return vset__vf(
ptr[vec_extract(vi2, 0)], ptr[vec_extract(vi2, 1)],
ptr[vec_extract(vi2, 2)], ptr[vec_extract(vi2, 3)]
);
}
//// store
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v)
{
const v__u64 vll = (v__u64)v;
float *ptr_low = ptr + offset*2;
float *ptr_high = ptr + (offset + step)*2;
*((uint64_t*)ptr_low) = vec_extract(vll, 0);
*((uint64_t*)ptr_high) = vec_extract(vll, 1);
}
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v)
{ vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v)
{ vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v)
{ vscatter2_v_p_i_i_vd(ptr, offset, step, v); }
/**********************************************
** Misc
**********************************************/
// vector with a specific value set to all lanes (Vector Splat)
static INLINE vint vcast_vi_i(int i)
{ return vsetall__vi(i); }
static INLINE vint2 vcast_vi2_i(int i)
{ return vsetall__vi2(i); }
static INLINE vfloat vcast_vf_f(float f)
{ return vsetall__vf(f); }
static INLINE vdouble vcast_vd_d(double d)
{ return vsetall__vd(d); }
// cast
static INLINE vint2 vcast_vi2_vm(vmask vm)
{ return (vint2)vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi)
{ return (vmask)vi; }
// get the first element
static INLINE float vcast_f_vf(vfloat v)
{ return vec_extract(v, 0); }
static INLINE double vcast_d_vd(vdouble v)
{ return vec_extract(v, 0); }
static INLINE vmask vreinterpret_vm_vd(vdouble vd)
{ return (vmask)vd; }
static INLINE vdouble vreinterpret_vd_vm(vmask vm)
{ return (vdouble)vm; }
static INLINE vmask vreinterpret_vm_vf(vfloat vf)
{ return (vmask)vf; }
static INLINE vfloat vreinterpret_vf_vm(vmask vm)
{ return (vfloat)vm; }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi)
{ return (vfloat)vi; }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf)
{ return (vint2)vf; }
// per element select via mask (blend)
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y)
{ return vec_sel(y, x, (v__b64)o); }
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y)
{ return vec_sel(y, x, o); }
static INLINE vint vsel_vi_vo_vi_vi(vopmask o, vint x, vint y)
{ return vec_sel(y, x, o); }
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y)
{ return vec_sel(y, x, o); }
static INLINE vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0)
{
return vsel_vf_vo_vf_vf(o, vsetall__vf(v1), vsetall__vf(v0));
}
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2)
{
return vsel_vf_vo_vf_vf(o0, vsetall__vf(d0), vsel_vf_vo_f_f(o1, d1, d2));
}
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3)
{
return vsel_vf_vo_vf_vf(o0, vsetall__vf(d0), vsel_vf_vo_vf_vf(o1, vsetall__vf(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}
static INLINE vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0)
{
return vsel_vd_vo_vd_vd(o, vsetall__vd(v1), vsetall__vd(v0));
}
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2)
{
return vsel_vd_vo_vd_vd(o0, vsetall__vd(d0), vsel_vd_vo_d_d(o1, d1, d2));
}
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3)
{
return vsel_vd_vo_vd_vd(o0, vsetall__vd(d0), vsel_vd_vo_vd_vd(o1, vsetall__vd(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}
static INLINE int vtestallones_i_vo32(vopmask g)
{ return vec_all_ne((vint2)g, vzero__vi2()); }
static INLINE int vtestallones_i_vo64(vopmask g)
{ return vec_all_ne((v__i64)g, vzero__s64()); }
/**********************************************
** Conversions
**********************************************/
////////////// Numeric //////////////
// pack 64-bit mask to 32-bit
static INLINE vopmask vcast_vo32_vo64(vopmask m)
{ return (vopmask)vec_pack((v__u64)m, (v__u64)m); }
// narrow 64-bit lanes to 32-bit, taking the upper word of each lane
static INLINE vint vcastu_vi_vi2(vint2 vi2)
{ return vec_mergeo(vi2, vec_splat(vi2, 3)); }
static INLINE vint vcastu_vi_vm(vmask vi2)
{ return vec_mergeo((vint2)vi2, vec_splat((vint2)vi2, 3)); }
// expand lower 32-bit mask
static INLINE vopmask vcast_vo64_vo32(vopmask m)
{ return vec_mergeh(m, m); }
// zero-extend 32-bit integers into the upper half of each 64-bit lane
static INLINE vint2 vcastu_vi2_vi(vint vi)
{ return vec_mergeh(vzero__vi(), vi); }
static INLINE vmask vcastu_vm_vi(vint vi)
{ return (vmask)vec_mergeh(vzero__vi(), vi); }
static INLINE vopmask vcast_vo_i(int i) {
i = i ? -1 : 0;
return (vopmask) { i, i, i, i };
}
// signed int to single-precision
static INLINE vfloat vcast_vf_vi2(vint2 vi)
{
vfloat ret;
#if defined(__clang__) || __GNUC__ >= 9
ret = __builtin_convertvector(vi, vfloat);
#else
__asm__ __volatile__("xvcvsxwsp %x0,%x1" : "=wa" (ret) : "wa" (vi));
#endif
return ret;
}
// lower signed int to double-precision
static INLINE vdouble vcast_vd_vi(vint vi)
{
vdouble ret;
vint swap = vec_mergeh(vi, vi);
#if defined(__clang__) || __GNUC__ >= 7
ret = __builtin_vsx_xvcvsxwdp(swap);
#else
__asm__ __volatile__("xvcvsxwdp %x0,%x1" : "=wa" (ret) : "wa" (swap));
#endif
return ret;
}
// zip two scalars
static INLINE vmask vcast_vm_i_i(int l, int h)
{ return (vmask)vec_mergeh(vsetall__vi2(h), vsetall__vi2(l)); }
static INLINE vmask vcast_vm_i64(int64_t i) {
return (vmask)vsetall__s64(i);
}
static INLINE vmask vcast_vm_u64(uint64_t i) {
return (vmask)vsetall__u64(i);
}
////////////// Truncation //////////////
static INLINE vint2 vtruncate_vi2_vf(vfloat vf)
{
vint2 ret;
#if defined(__clang__) || __GNUC__ >= 9
ret = __builtin_convertvector(vf, vint2);
#else
__asm__ __volatile__("xvcvspsxws %x0,%x1" : "=wa" (ret) : "wa" (vf));
#endif
return ret;
}
static INLINE vint vtruncate_vi_vd(vdouble vd)
{
vint ret;
#if defined(__clang__) || __GNUC__ >= 7
ret = __builtin_vsx_xvcvdpsxws(vd);
#else
__asm__ __volatile__("xvcvdpsxws %x0,%x1" : "=wa" (ret) : "wa" (vd));
#endif
return vec_mergeo(ret, vec_splat(ret, 3));
}
static INLINE vdouble vtruncate_vd_vd(vdouble vd)
{ return vec_trunc(vd); }
static INLINE vfloat vtruncate_vf_vf(vfloat vf)
{ return vec_trunc(vf); }
////////////// Rounding //////////////
// towards the nearest even
static INLINE vint vrint_vi_vd(vdouble vd)
{ return vtruncate_vi_vd(vec_rint(vd)); }
static INLINE vint2 vrint_vi2_vf(vfloat vf)
{ return vtruncate_vi2_vf(vec_rint(vf)); }
static INLINE vdouble vrint_vd_vd(vdouble vd)
{ return vec_rint(vd); }
static INLINE vfloat vrint_vf_vf(vfloat vf)
{ return vec_rint(vf); }
/**********************************************
** Logical
**********************************************/
////////////// And //////////////
static INLINE vint vand_vi_vi_vi(vint x, vint y)
{ return vec_and(x, y); }
static INLINE vint vand_vi_vo_vi(vopmask x, vint y)
{ return vec_and((vint)x, y); }
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y)
{ return vec_and(x, y); }
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y)
{ return (vint2)vec_and((vint2)x, y); }
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y)
{ return vec_and(x, y); }
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y)
{ return vec_and((vmask)x, y); }
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y)
{ return vec_and((vmask)x, y); }
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y)
{ return vec_and(x, y); }
////////////// Or //////////////
static INLINE vint vor_vi_vi_vi(vint x, vint y)
{ return vec_or(x, y); }
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y)
{ return vec_or(x, y); }
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y)
{ return vec_or(x, y); }
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y)
{ return vec_or((vmask)x, y); }
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y)
{ return vec_or((vmask)x, y); }
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y)
{ return vec_or(x, y); }
////////////// Xor //////////////
static INLINE vint vxor_vi_vi_vi(vint x, vint y)
{ return vec_xor(x, y); }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y)
{ return vec_xor(x, y); }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y)
{ return vec_xor(x, y); }
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y)
{ return vec_xor((vmask)x, y); }
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y)
{ return vec_xor((vmask)x, y); }
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y)
{ return vec_xor(x, y); }
////////////// Not //////////////
static INLINE vopmask vnot_vo_vo(vopmask o)
{ return vec_nor(o, o); }
////////////// And Not ((~x) & y) //////////////
static INLINE vint vandnot_vi_vi_vi(vint x, vint y)
{ return vec_andc(y, x); }
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y)
{ return vec_andc(y, (vint)x); }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y)
{ return vec_andc(y, x); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y)
{ return vec_andc(y, x); }
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y)
{ return vec_andc(y, x); }
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y)
{ return vec_andc(y, x); }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y)
{ return vec_andc(y, x); }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y)
{ return vec_andc(y, (vint2)x); }
/**********************************************
** Comparison
**********************************************/
////////////// Equal //////////////
static INLINE vint veq_vi_vi_vi(vint x, vint y)
{ return (vint)vec_cmpeq(x, y); }
static INLINE vopmask veq_vo_vi_vi(vint x, vint y)
{ return vec_cmpeq(x, y); }
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y)
{ return vec_cmpeq(x, y); }
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y)
{ return (vint2)vec_cmpeq(x, y); }
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y)
{ return (vopmask)vec_cmpeq((v__u64)x, (v__u64)y); }
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y)
{ return vec_cmpeq(x, y); }
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y)
{ return (vopmask)vec_cmpeq(x, y); }
////////////// Not Equal //////////////
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y)
{ return vnot_vo_vo(vec_cmpeq(x, y)); }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y)
{ return vnot_vo_vo((vopmask)vec_cmpeq(x, y)); }
////////////// Less Than //////////////
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y)
{ return vec_cmplt(x, y); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y)
{ return (vopmask)vec_cmplt(x, y); }
////////////// Greater Than //////////////
static INLINE vint vgt_vi_vi_vi(vint x, vint y)
{ return (vint)vec_cmpgt(x, y); }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y)
{ return vec_cmpgt(x, y);}
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y)
{ return (vint2)vec_cmpgt(x, y); }
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y)
{ return vec_cmpgt(x, y); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y)
{ return vec_cmpgt(x, y); }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y)
{ return (vopmask)vec_cmpgt(x, y); }
////////////// Less Than Or Equal //////////////
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y)
{ return vec_cmple(x, y); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y)
{ return (vopmask)vec_cmple(x, y); }
////////////// Greater Than Or Equal //////////////
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y)
{ return vec_cmpge(x, y); }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y)
{ return (vopmask)vec_cmpge(x, y); }
////////////// Special Cases //////////////
static INLINE vopmask visinf_vo_vf(vfloat d)
{ return vec_cmpeq(vec_abs(d), vsetall__vf(SLEEF_INFINITYf)); }
static INLINE vopmask visinf_vo_vd(vdouble d)
{ return (vopmask)vec_cmpeq(vec_abs(d), vsetall__vd(SLEEF_INFINITY)); }
static INLINE vopmask vispinf_vo_vf(vfloat d)
{ return vec_cmpeq(d, vsetall__vf(SLEEF_INFINITYf)); }
static INLINE vopmask vispinf_vo_vd(vdouble d)
{ return (vopmask)vec_cmpeq(d, vsetall__vd(SLEEF_INFINITY)); }
static INLINE vopmask visminf_vo_vf(vfloat d)
{ return vec_cmpeq(d, vsetall__vf(-SLEEF_INFINITYf)); }
static INLINE vopmask visminf_vo_vd(vdouble d)
{ return (vopmask)vec_cmpeq(d, vsetall__vd(-SLEEF_INFINITY)); }
static INLINE vopmask visnan_vo_vf(vfloat d)
{ return vnot_vo_vo(vec_cmpeq(d, d)); }
static INLINE vopmask visnan_vo_vd(vdouble d)
{ return vnot_vo_vo((vopmask)vec_cmpeq(d, d)); }
/**********************************************
** Shift
**********************************************/
////////////// Left //////////////
static INLINE vint vsll_vi_vi_i(vint x, int c)
{ return vec_sl (x, vsetall__u32(c)); }
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c)
{ return vec_sl(x, vsetall__u32(c)); }
////////////// Right //////////////
static INLINE vint vsrl_vi_vi_i(vint x, int c)
{ return vec_sr(x, vsetall__u32(c)); }
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c)
{ return vec_sr(x, vsetall__u32(c)); }
////////////// Algebraic Right //////////////
static INLINE vint vsra_vi_vi_i(vint x, int c)
{ return vec_sra(x, vsetall__u32(c)); }
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c)
{ return vec_sra(x, vsetall__u32(c)); }
/**********************************************
** Reorder
**********************************************/
////////////// Reverse //////////////
// Reverse elements order inside the lower and higher parts
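// vec_mergeo duplicates the odd elements; vec_mergee then interleaves them
// with the even ones, swapping each adjacent pair of elements.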
static INLINE vint2 vrev21_vi2_vi2(vint2 vi)
{ return vec_mergee(vec_mergeo(vi, vi), vi); }
static INLINE vfloat vrev21_vf_vf(vfloat vf)
{ return (vfloat)vrev21_vi2_vi2((vint2)vf); }
// Swap the lower and higher parts
static INLINE vfloat vreva2_vf_vf(vfloat vf)
{ return (vfloat)v__swapd_u64((v__u64)vf); }
static INLINE vdouble vrev21_vd_vd(vdouble vd)
{ return (vdouble)v__swapd_u64((v__u64)vd); }
static INLINE vdouble vreva2_vd_vd(vdouble vd)
{ return vd; }
/**********************************************
** Arithmetic
**********************************************/
////////////// Negation //////////////
static INLINE vint vneg_vi_vi(vint e) {
#if defined(__clang__) || __GNUC__ >= 9
return vec_neg(e);
#else
return vec_sub(vzero__vi(), e);
#endif
}
static INLINE vint2 vneg_vi2_vi2(vint2 e)
{ return vneg_vi_vi(e); }
static INLINE vfloat vneg_vf_vf(vfloat d)
{
vfloat ret;
#if defined(__clang__) || __GNUC__ >= 9
ret = vec_neg(d);
#else
__asm__ __volatile__("xvnegsp %x0,%x1" : "=wa" (ret) : "wa" (d));
#endif
return ret;
}
static INLINE vdouble vneg_vd_vd(vdouble d)
{
vdouble ret;
#if defined(__clang__) || __GNUC__ >= 9
ret = vec_neg(d);
#else
__asm__ __volatile__("xvnegdp %x0,%x1" : "=wa" (ret) : "wa" (d));
#endif
return ret;
}
static INLINE vfloat vposneg_vf_vf(vfloat d)
{ return vec_xor(d, vset__vf(+0.0f, -0.0f, +0.0f, -0.0f)); }
static INLINE vdouble vposneg_vd_vd(vdouble d)
{ return vec_xor(d, vset__vd(+0.0, -0.0)); }
static INLINE vfloat vnegpos_vf_vf(vfloat d)
{ return vec_xor(d, vset__vf(-0.0f, +0.0f, -0.0f, +0.0f)); }
static INLINE vdouble vnegpos_vd_vd(vdouble d)
{ return vec_xor(d, vset__vd(-0.0, +0.0)); }
////////////// Addition //////////////
static INLINE vint vadd_vi_vi_vi(vint x, vint y)
{ return vec_add(x, y); }
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y)
{ return vec_add(x, y); }
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y)
{ return vec_add(x, y); }
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y)
{ return vec_add(x, y); }
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y)
{ return (vmask)vec_add((v__i64)x, (v__i64)y); }
////////////// Subtraction //////////////
static INLINE vint vsub_vi_vi_vi(vint x, vint y)
{ return vec_sub(x, y); }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y)
{ return vec_sub(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y)
{ return vec_sub(x, y); }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y)
{ return vec_sub(x, y); }
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y)
{ return vec_add(x, vnegpos_vd_vd(y)); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y)
{ return vec_add(x, vnegpos_vf_vf(y)); }
////////////// Multiplication //////////////
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y)
{ return vec_mul(x, y); }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y)
{ return vec_mul(x, y); }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y)
{ return vec_div(x, y); }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y)
{ return vec_div(x, y); }
static INLINE vfloat vrec_vf_vf(vfloat x)
{ return vec_div(vsetall__vf(1.0f), x); }
static INLINE vdouble vrec_vd_vd(vdouble x)
{ return vec_div(vsetall__vd(1.0), x); }
/**********************************************
** Math
**********************************************/
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y)
{ return vec_max(x, y); }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y)
{ return vec_max(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y)
{ return vec_min(x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y)
{ return vec_min(x, y); }
static INLINE vfloat vabs_vf_vf(vfloat f)
{ return vec_abs(f); }
static INLINE vdouble vabs_vd_vd(vdouble d)
{ return vec_abs(d); }
static INLINE vfloat vsqrt_vf_vf(vfloat f)
{ return vec_sqrt(f); }
static INLINE vdouble vsqrt_vd_vd(vdouble d)
{ return vec_sqrt(d); }
/**********************************************
** FMA3
**********************************************/
#if CONFIG == 1 || CONFIG == 3
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
{ return vec_madd(x, y, z); }
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
{ return vec_madd(x, y, z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
{ return vec_msub(x, y, z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
{ return vec_msub(x, y, z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
{ return vec_nmsub(x, y, z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
{ return vec_nmsub(x, y, z); }
#else
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
{ return vec_add(vec_mul(x, y), z); }
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
{ return vec_add(vec_mul(x, y), z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
{ return vec_sub(vec_mul(x, y), z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
{ return vec_sub(vec_mul(x, y), z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
{ return vec_sub(z, vec_mul(x, y)); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
{ return vec_sub(z, vec_mul(x, y)); }
#endif
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
{ return vec_madd(x, y, z); }
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
{ return vec_madd(x, y, z); }
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
{ return vec_madd(x, y, z); }
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
{ return vec_madd(x, y, z); }
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
{ return vec_msub(x, y, z); }
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
{ return vec_msub(x, y, z); }
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
{ return vec_nmsub(x, y, z); }
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
{ return vec_nmsub(x, y, z); }
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
{ return vec_nmadd(x, y, z); }
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
{ return vec_nmadd(x, y, z); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
{ return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
{ return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
//
static vquad loadu_vq_p(void *p) {
vquad vq;
memcpy(&vq, p, VECTLENDP * 16);
return vq;
}
static INLINE vquad cast_vq_aq(vargquad aq) {
vquad vq;
memcpy(&vq, &aq, VECTLENDP * 16);
return vq;
}
static INLINE vargquad cast_aq_vq(vquad vq) {
vargquad aq;
memcpy(&aq, &vq, VECTLENDP * 16);
return aq;
}
static INLINE int vtestallzeros_i_vo64(vopmask g) {
return vec_all_eq((__vector signed long long)g, vzero__s64());
}
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) {
return (vmask)vec_sel((__vector signed long long)y, (__vector signed long long)x, (v__b64)o);
}
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) {
return (vmask)vec_sub((__vector signed long long)x, (__vector signed long long)y);
}
static INLINE vmask vneg64_vm_vm(vmask x) {
return (vmask)vec_sub((__vector signed long long) {0, 0}, (__vector signed long long)x);
}
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
return (vopmask)vec_cmpgt((__vector signed long long)x, (__vector signed long long)y);
}
#define vsll64_vm_vm_i(x, c) ((vmask)vec_sl((__vector signed long long)x, (__vector unsigned long long)vsetall__vm(c)))
#define vsrl64_vm_vm_i(x, c) ((vmask)vec_sr((__vector signed long long)x, (__vector unsigned long long)vsetall__vm(c)))
static INLINE vint vcast_vi_vm(vmask vm) {
return (vint) { vm[0], vm[2] };
}
static INLINE vmask vcast_vm_vi(vint vi) {
return (vmask) (__vector signed long long) { vi[0], vi[1] };
}
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return (vmask)v; }
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return (vint64)m; }
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return (vmask)v; }
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return (vuint64)m; }


@@ -0,0 +1,561 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdint.h>
#include <math.h>
#include "misc.h"
#ifndef CONFIG
#error CONFIG macro not defined
#endif
#define ENABLE_DP
//@#define ENABLE_DP
#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENDP CONFIG
//@#define LOG2VECTLENDP CONFIG
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT
#define DFTPRIORITY LOG2VECTLENDP
#define ISANAME "Pure C Array"
typedef union {
uint32_t u[VECTLENDP*2];
uint64_t x[VECTLENDP];
double d[VECTLENDP];
float f[VECTLENDP*2];
int32_t i[VECTLENDP*2];
} versatileVector;
typedef versatileVector vmask;
typedef versatileVector vopmask;
typedef versatileVector vdouble;
typedef versatileVector vint;
typedef versatileVector vfloat;
typedef versatileVector vint2;
typedef union {
uint8_t u[sizeof(long double)*VECTLENDP];
long double ld[VECTLENDP];
} longdoubleVector;
typedef longdoubleVector vmaskl;
typedef longdoubleVector vlongdouble;
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
typedef union {
uint8_t u[sizeof(Sleef_quad)*VECTLENDP];
Sleef_quad q[VECTLENDP];
} quadVector;
typedef quadVector vmaskq;
typedef quadVector vquad;
#endif
//
static INLINE int vavailability_i(int name) { return -1; }
static INLINE void vprefetch_v_p(const void *ptr) { }
static INLINE int vtestallones_i_vo64(vopmask g) {
int ret = 1; for(int i=0;i<VECTLENDP;i++) ret = ret && g.x[i]; return ret;
}
static INLINE int vtestallones_i_vo32(vopmask g) {
int ret = 1; for(int i=0;i<VECTLENSP;i++) ret = ret && g.u[i]; return ret;
}
//
static vint2 vloadu_vi2_p(int32_t *p) {
vint2 vi;
for(int i=0;i<VECTLENSP;i++) vi.i[i] = p[i];
return vi;
}
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) {
for(int i=0;i<VECTLENSP;i++) p[i] = v.i[i];
}
static vint vloadu_vi_p(int32_t *p) {
vint vi;
for(int i=0;i<VECTLENDP;i++) vi.i[i] = p[i];
return vi;
}
static void vstoreu_v_p_vi(int32_t *p, vint v) {
for(int i=0;i<VECTLENDP;i++) p[i] = v.i[i];
}
//
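// Pack a 64-bit lane mask into 32-bit lanes: keep one word per 64-bit lane
// and zero the upper half of the vector.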
static INLINE vopmask vcast_vo32_vo64(vopmask m) {
vopmask ret;
for(int i=0;i<VECTLENDP;i++) ret.u[i] = m.u[i*2+1];
for(int i=VECTLENDP;i<VECTLENDP*2;i++) ret.u[i] = 0;
return ret;
}
static INLINE vopmask vcast_vo64_vo32(vopmask m) {
vopmask ret;
for(int i=0;i<VECTLENDP;i++) ret.u[i*2] = ret.u[i*2+1] = m.u[i];
return ret;
}
static INLINE vmask vcast_vm_i_i(int h, int l) {
vmask ret;
for(int i=0;i<VECTLENDP;i++) {
ret.u[i*2+0] = l;
ret.u[i*2+1] = h;
}
return ret;
}
static INLINE vint2 vcastu_vi2_vi(vint vi) {
vint2 ret;
for(int i=0;i<VECTLENDP;i++) {
ret.i[i*2+0] = 0;
ret.i[i*2+1] = vi.i[i];
}
return ret;
}
static INLINE vint vcastu_vi_vi2(vint2 vi2) {
vint ret;
for(int i=0;i<VECTLENDP;i++) ret.i[i] = vi2.i[i*2+1];
return ret;
}
static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) {
vint ret;
for(int i=0;i<VECTLENDP;i++) ret.i[i] = vi2.i[i];
return ret;
}
static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) {
vint2 ret;
for(int i=0;i<VECTLENDP;i++) ret.i[i] = vi.i[i];
for(int i=VECTLENDP;i<VECTLENDP*2;i++) ret.i[i] = 0;
return ret;
}
static INLINE vdouble vrev21_vd_vd(vdouble d0) {
vdouble r;
for(int i=0;i<VECTLENDP/2;i++) {
r.d[i*2+0] = d0.d[i*2+1];
r.d[i*2+1] = d0.d[i*2+0];
}
return r;
}
static INLINE vdouble vreva2_vd_vd(vdouble d0) {
vdouble r;
for(int i=0;i<VECTLENDP/2;i++) {
r.d[i*2+0] = d0.d[(VECTLENDP/2-1-i)*2+0];
r.d[i*2+1] = d0.d[(VECTLENDP/2-1-i)*2+1];
}
return r;
}
static INLINE vfloat vrev21_vf_vf(vfloat d0) {
vfloat r;
for(int i=0;i<VECTLENSP/2;i++) {
r.f[i*2+0] = d0.f[i*2+1];
r.f[i*2+1] = d0.f[i*2+0];
}
return r;
}
static INLINE vfloat vreva2_vf_vf(vfloat d0) {
vfloat r;
for(int i=0;i<VECTLENSP/2;i++) {
r.f[i*2+0] = d0.f[(VECTLENSP/2-1-i)*2+0];
r.f[i*2+1] = d0.f[(VECTLENSP/2-1-i)*2+1];
}
return r;
}
static INLINE vdouble vcast_vd_d(double d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = d; return ret; }
//
static INLINE vopmask vand_vo_vo_vo (vopmask x, vopmask y) { vopmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] & y.u[i]; return ret; }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { vopmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = y.u[i] & ~x.u[i]; return ret; }
static INLINE vopmask vor_vo_vo_vo (vopmask x, vopmask y) { vopmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] | y.u[i]; return ret; }
static INLINE vopmask vxor_vo_vo_vo (vopmask x, vopmask y) { vopmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] ^ y.u[i]; return ret; }
static INLINE vmask vand_vm_vm_vm (vmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] & y.u[i]; return ret; }
static INLINE vmask vandnot_vm_vm_vm (vmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = y.u[i] & ~x.u[i]; return ret; }
static INLINE vmask vor_vm_vm_vm (vmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] | y.u[i]; return ret; }
static INLINE vmask vxor_vm_vm_vm (vmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] ^ y.u[i]; return ret; }
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] & y.u[i]; return ret; }
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = y.u[i] & ~x.u[i]; return ret; }
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] | y.u[i]; return ret; }
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] ^ y.u[i]; return ret; }
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] & y.u[i]; return ret; }
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = y.u[i] & ~x.u[i]; return ret; }
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] | y.u[i]; return ret; }
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] ^ y.u[i]; return ret; }
//
static INLINE vdouble vsel_vd_vo_vd_vd (vopmask o, vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = (o.u[i] & x.u[i]) | (y.u[i] & ~o.u[i]); return ret; }
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y) { vint2 ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = (o.u[i] & x.u[i]) | (y.u[i] & ~o.u[i]); return ret; }
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
}
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}
static INLINE vdouble vcast_vd_vi(vint vi) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = vi.i[i]; return ret; }
static INLINE vint vtruncate_vi_vd(vdouble vd) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = (int)vd.d[i]; return ret; }
static INLINE vint vrint_vi_vd(vdouble vd) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = vd.d[i] > 0 ? (int)(vd.d[i] + 0.5) : (int)(vd.d[i] - 0.5); return ret; }
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); }
static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); }
static INLINE vint vcast_vi_i(int j) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = j; return ret; }
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.x[i] == y.x[i] ? -1 : 0; return ret; }
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.x[i] + y.x[i]; return ret; }
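// Editor's note: vrint_vi_vd above rounds half away from zero (add +/-0.5,
// then truncate), so 2.5 -> 3 and -2.5 -> -3; note that rint() and the
// FULL_FP_ROUNDING paths elsewhere round ties to even instead. Sketch
// (illustrative only, not part of upstream SLEEF):
#if 0
static int demo_vrint(void) {
  vint r = vrint_vi_vd(vcast_vd_d(2.5));
  return r.i[0] == 3;  // half away from zero, not the even neighbor 2
}
#endif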
//
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { union { vdouble vd; vmask vm; } cnv; cnv.vd = vd; return cnv.vm; }
static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { union { vdouble vd; vint2 vi2; } cnv; cnv.vd = vd; return cnv.vi2; }
static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { union { vint2 vi2; vdouble vd; } cnv; cnv.vi2 = vi; return cnv.vd; }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { union { vmask vm; vdouble vd; } cnv; cnv.vm = vm; return cnv.vd; }
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] + y.d[i]; return ret; }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] - y.d[i]; return ret; }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] * y.d[i]; return ret; }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] / y.d[i]; return ret; }
static INLINE vdouble vrec_vd_vd(vdouble x) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = 1.0 / x.d[i]; return ret; }
static INLINE vdouble vabs_vd_vd(vdouble d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = d.x[i] & 0x7fffffffffffffffULL; return ret; }
static INLINE vdouble vneg_vd_vd(vdouble d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = -d.d[i]; return ret; }
static INLINE vdouble vmla_vd_vd_vd_vd (vdouble x, vdouble y, vdouble z) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] * y.d[i] + z.d[i]; return ret; }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] * y.d[i] - z.d[i]; return ret; }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] > y.d[i] ? x.d[i] : y.d[i]; return ret; }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] < y.d[i] ? x.d[i] : y.d[i]; return ret; }
static INLINE vdouble vposneg_vd_vd(vdouble d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = (i & 1) == 0 ? d.d[i] : -d.d[i]; return ret; }
static INLINE vdouble vnegpos_vd_vd(vdouble d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = (i & 1) == 0 ? -d.d[i] : d.d[i]; return ret; }
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = (i & 1) == 0 ? x.d[i] - y.d[i] : x.d[i] + y.d[i]; return ret; }
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] == y.d[i] ? -1 : 0; return ret; }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] != y.d[i] ? -1 : 0; return ret; }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] < y.d[i] ? -1 : 0; return ret; }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] <= y.d[i] ? -1 : 0; return ret; }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] > y.d[i] ? -1 : 0; return ret; }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] >= y.d[i] ? -1 : 0; return ret; }
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] + y.i[i]; return ret; }
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] - y.i[i]; return ret; }
static INLINE vint vneg_vi_vi (vint x) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = -x.i[i]; return ret; }
static INLINE vint vand_vi_vi_vi(vint x, vint y) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] & y.i[i]; return ret; }
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = y.i[i] & ~x.i[i]; return ret; }
static INLINE vint vor_vi_vi_vi(vint x, vint y) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] | y.i[i]; return ret; }
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] ^ y.i[i]; return ret; }
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return vand_vi_vi_vi(vreinterpretFirstHalf_vi_vi2(x), y); }
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return vandnot_vi_vi_vi(vreinterpretFirstHalf_vi_vi2(x), y); }
static INLINE vint vsll_vi_vi_i(vint x, int c) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] << c; return ret; }
static INLINE vint vsrl_vi_vi_i(vint x, int c) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = ((uint32_t)x.i[i]) >> c; return ret; }
static INLINE vint vsra_vi_vi_i(vint x, int c) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] >> c; return ret; }
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.u[i] = x.i[i] == y.i[i] ? -1 : 0; return ret; }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.u[i] = x.i[i] > y.i[i] ? -1 : 0; return ret; }
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
union { vopmask vo; vint2 vi2; } cnv;
cnv.vo = m;
return vor_vi_vi_vi(vand_vi_vi_vi(vreinterpretFirstHalf_vi_vi2(cnv.vi2), x),
vandnot_vi_vi_vi(vreinterpretFirstHalf_vi_vi2(cnv.vi2), y));
}
static INLINE vopmask visinf_vo_vd(vdouble d) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = (d.d[i] == SLEEF_INFINITY || d.d[i] == -SLEEF_INFINITY) ? -1 : 0; return ret; }
static INLINE vopmask vispinf_vo_vd(vdouble d) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = d.d[i] == SLEEF_INFINITY ? -1 : 0; return ret; }
static INLINE vopmask visminf_vo_vd(vdouble d) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = d.d[i] == -SLEEF_INFINITY ? -1 : 0; return ret; }
static INLINE vopmask visnan_vo_vd(vdouble d) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = d.d[i] != d.d[i] ? -1 : 0; return ret; }
static INLINE vdouble vsqrt_vd_vd(vdouble d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = sqrt(d.d[i]); return ret; }
#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
static INLINE double vcast_d_vd(vdouble v) { return v.d[0]; }
#endif
static INLINE vdouble vload_vd_p(const double *ptr) { return *(vdouble *)ptr; }
static INLINE vdouble vloadu_vd_p(const double *ptr) { vdouble vd; for(int i=0;i<VECTLENDP;i++) vd.d[i] = ptr[i]; return vd; }
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
vdouble vd;
for(int i=0;i<VECTLENDP;i++) vd.d[i] = ptr[vi.i[i]];
return vd;
}
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { for(int i=0;i<VECTLENDP;i++) ptr[i] = v.d[i]; }
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
for(int i=0;i<VECTLENDP/2;i++) {
*(ptr+(offset + step * i)*2 + 0) = v.d[i*2+0];
*(ptr+(offset + step * i)*2 + 1) = v.d[i*2+1];
}
}
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vscatter2_v_p_i_i_vd(ptr, offset, step, v); }
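// Editor's note: vscatter2_v_p_i_i_vd writes the vector as consecutive
// element pairs with a stride: pair i lands at ptr[(offset + step*i)*2] and
// ptr[(offset + step*i)*2 + 1]. With offset = 0 and step = 1 it degenerates
// to a plain contiguous store of VECTLENDP doubles. Sketch (illustrative
// only, not part of upstream SLEEF):
#if 0
static void demo_vscatter2(double *buf) {  // buf must hold >= VECTLENDP doubles
  vscatter2_v_p_i_i_vd(buf, 0, 1, vcast_vd_d(42.0));  // buf[0..VECTLENDP-1] = 42
}
#endif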
//
static INLINE vint2 vcast_vi2_vm(vmask vm) { union { vint2 vi2; vmask vm; } cnv; cnv.vm = vm; return cnv.vi2; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { union { vint2 vi2; vmask vm; } cnv; cnv.vi2 = vi; return cnv.vm; }
static INLINE vfloat vcast_vf_vi2(vint2 vi) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = vi.i[i]; return ret; }
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = (int)vf.f[i]; return ret; }
static INLINE vint2 vrint_vi2_vf(vfloat vf) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = vf.f[i] > 0 ? (int)(vf.f[i] + 0.5) : (int)(vf.f[i] - 0.5); return ret; }
static INLINE vint2 vcast_vi2_i(int j) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = j; return ret; }
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
static INLINE vfloat vrint_vf_vf(vfloat vd) { return vcast_vf_vi2(vrint_vi2_vf(vd)); }
static INLINE vfloat vcast_vf_f(float f) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = f; return ret; }
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { union { vfloat vf; vmask vm; } cnv; cnv.vf = vf; return cnv.vm; }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { union { vfloat vf; vmask vm; } cnv; cnv.vm = vm; return cnv.vf; }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { union { vfloat vf; vint2 vi2; } cnv; cnv.vi2 = vi; return cnv.vf; }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { union { vfloat vf; vint2 vi2; } cnv; cnv.vf = vf; return cnv.vi2; }
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] + y.f[i]; return ret; }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] - y.f[i]; return ret; }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] * y.f[i]; return ret; }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] / y.f[i]; return ret; }
static INLINE vfloat vrec_vf_vf (vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = 1.0 / x.f[i]; return ret; }
static INLINE vfloat vabs_vf_vf(vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] & 0x7fffffff; return ret; }
static INLINE vfloat vneg_vf_vf(vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = -x.f[i]; return ret; }
static INLINE vfloat vmla_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] * y.f[i] + z.f[i]; return ret; }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] * y.f[i] - z.f[i]; return ret; }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] > y.f[i] ? x.f[i] : y.f[i]; return ret; }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] < y.f[i] ? x.f[i] : y.f[i]; return ret; }
static INLINE vfloat vposneg_vf_vf(vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = (i & 1) == 0 ? x.f[i] : -x.f[i]; return ret; }
static INLINE vfloat vnegpos_vf_vf(vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = (i & 1) == 0 ? -x.f[i] : x.f[i]; return ret; }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = (i & 1) == 0 ? x.f[i] - y.f[i] : x.f[i] + y.f[i]; return ret; }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] == y.f[i]) ? -1 : 0); return ret; }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] != y.f[i]) ? -1 : 0); return ret; }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] < y.f[i]) ? -1 : 0); return ret; }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] <= y.f[i]) ? -1 : 0); return ret; }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] > y.f[i]) ? -1 : 0); return ret; }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] >= y.f[i]) ? -1 : 0); return ret; }
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] + y.i[i]; return ret; }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] - y.i[i]; return ret; }
static INLINE vint2 vneg_vi2_vi2(vint2 x) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = -x.i[i]; return ret; }
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] & y.i[i]; return ret; }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = y.i[i] & ~x.i[i]; return ret; }
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] | y.i[i]; return ret; }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] ^ y.i[i]; return ret; }
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = (o.u[i] & x.u[i]) | (y.u[i] & ~o.u[i]); return ret; }
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) {
union { vopmask vo; vint2 vi2; } cnv;
cnv.vo = x;
return vand_vi2_vi2_vi2(cnv.vi2, y);
}
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) {
  union { vopmask vo; vint2 vi2; } cnv;
  cnv.vo = x;
  return vandnot_vi2_vi2_vi2(cnv.vi2, y);
}
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] << c; return ret; }
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = ((uint32_t)x.i[i]) >> c; return ret; }
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] >> c; return ret; }
static INLINE vopmask visinf_vo_vf (vfloat d) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = (d.f[i] == SLEEF_INFINITYf || d.f[i] == -SLEEF_INFINITYf) ? -1 : 0; return ret; }
static INLINE vopmask vispinf_vo_vf(vfloat d) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = d.f[i] == SLEEF_INFINITYf ? -1 : 0; return ret; }
static INLINE vopmask visminf_vo_vf(vfloat d) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = d.f[i] == -SLEEF_INFINITYf ? -1 : 0; return ret; }
static INLINE vopmask visnan_vo_vf (vfloat d) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = d.f[i] != d.f[i] ? -1 : 0; return ret; }
static INLINE vopmask veq_vo_vi2_vi2 (vint2 x, vint2 y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = x.i[i] == y.i[i] ? -1 : 0; return ret; }
static INLINE vopmask vgt_vo_vi2_vi2 (vint2 x, vint2 y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = x.i[i] > y.i[i] ? -1 : 0; return ret; }
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] == y.i[i] ? -1 : 0; return ret; }
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] > y.i[i] ? -1 : 0; return ret; }
static INLINE vfloat vsqrt_vf_vf(vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = sqrtf(x.f[i]); return ret; }
#ifdef _MSC_VER
// This function is needed when debugging on MSVC.
static INLINE float vcast_f_vf(vfloat v) { return v.f[0]; }
#endif
static INLINE vfloat vload_vf_p(const float *ptr) { return *(vfloat *)ptr; }
static INLINE vfloat vloadu_vf_p(const float *ptr) {
vfloat vf;
for(int i=0;i<VECTLENSP;i++) vf.f[i] = ptr[i];
return vf;
}
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
vfloat vf;
for(int i=0;i<VECTLENSP;i++) vf.f[i] = ptr[vi2.i[i]];
return vf;
}
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) {
for(int i=0;i<VECTLENSP;i++) ptr[i] = v.f[i];
}
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
for(int i=0;i<VECTLENSP/2;i++) {
*(ptr+(offset + step * i)*2 + 0) = v.f[i*2+0];
*(ptr+(offset + step * i)*2 + 1) = v.f[i*2+1];
}
}
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
//
static INLINE vlongdouble vcast_vl_l(long double d) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = d; return ret; }
static INLINE vlongdouble vrev21_vl_vl(vlongdouble d0) {
vlongdouble r;
for(int i=0;i<VECTLENDP/2;i++) {
r.ld[i*2+0] = d0.ld[i*2+1];
r.ld[i*2+1] = d0.ld[i*2+0];
}
return r;
}
static INLINE vlongdouble vreva2_vl_vl(vlongdouble d0) {
vlongdouble r;
for(int i=0;i<VECTLENDP/2;i++) {
r.ld[i*2+0] = d0.ld[(VECTLENDP/2-1-i)*2+0];
r.ld[i*2+1] = d0.ld[(VECTLENDP/2-1-i)*2+1];
}
return r;
}
static INLINE vlongdouble vadd_vl_vl_vl(vlongdouble x, vlongdouble y) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = x.ld[i] + y.ld[i]; return ret; }
static INLINE vlongdouble vsub_vl_vl_vl(vlongdouble x, vlongdouble y) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = x.ld[i] - y.ld[i]; return ret; }
static INLINE vlongdouble vmul_vl_vl_vl(vlongdouble x, vlongdouble y) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = x.ld[i] * y.ld[i]; return ret; }
static INLINE vlongdouble vneg_vl_vl(vlongdouble x) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = -x.ld[i]; return ret; }
static INLINE vlongdouble vsubadd_vl_vl_vl(vlongdouble x, vlongdouble y) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = (i & 1) == 0 ? x.ld[i] - y.ld[i] : x.ld[i] + y.ld[i]; return ret; }
static INLINE vlongdouble vmlsubadd_vl_vl_vl_vl(vlongdouble x, vlongdouble y, vlongdouble z) { return vsubadd_vl_vl_vl(vmul_vl_vl_vl(x, y), z); }
static INLINE vlongdouble vposneg_vl_vl(vlongdouble x) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = (i & 1) == 0 ? x.ld[i] : -x.ld[i]; return ret; }
static INLINE vlongdouble vnegpos_vl_vl(vlongdouble x) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = (i & 1) == 0 ? -x.ld[i] : x.ld[i]; return ret; }
static INLINE vlongdouble vload_vl_p(const long double *ptr) { return *(vlongdouble *)ptr; }
static INLINE vlongdouble vloadu_vl_p(const long double *ptr) {
vlongdouble vd;
for(int i=0;i<VECTLENDP;i++) vd.ld[i] = ptr[i];
return vd;
}
static INLINE void vstore_v_p_vl(long double *ptr, vlongdouble v) { *(vlongdouble *)ptr = v; }
static INLINE void vstoreu_v_p_vl(long double *ptr, vlongdouble v) {
for(int i=0;i<VECTLENDP;i++) ptr[i] = v.ld[i];
}
static INLINE void vstream_v_p_vl(long double *ptr, vlongdouble v) { *(vlongdouble *)ptr = v; }
static INLINE void vscatter2_v_p_i_i_vl(long double *ptr, int offset, int step, vlongdouble v) {
for(int i=0;i<VECTLENDP/2;i++) {
*(ptr+(offset + step * i)*2 + 0) = v.ld[i*2+0];
*(ptr+(offset + step * i)*2 + 1) = v.ld[i*2+1];
}
}
static INLINE void vsscatter2_v_p_i_i_vl(long double *ptr, int offset, int step, vlongdouble v) { vscatter2_v_p_i_i_vl(ptr, offset, step, v); }
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
static INLINE vquad vcast_vq_q(Sleef_quad d) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = d; return ret; }
static INLINE vquad vrev21_vq_vq(vquad d0) {
vquad r;
for(int i=0;i<VECTLENDP/2;i++) {
r.q[i*2+0] = d0.q[i*2+1];
r.q[i*2+1] = d0.q[i*2+0];
}
return r;
}
static INLINE vquad vreva2_vq_vq(vquad d0) {
vquad r;
for(int i=0;i<VECTLENDP/2;i++) {
r.q[i*2+0] = d0.q[(VECTLENDP/2-1-i)*2+0];
r.q[i*2+1] = d0.q[(VECTLENDP/2-1-i)*2+1];
}
return r;
}
static INLINE vquad vadd_vq_vq_vq(vquad x, vquad y) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = x.q[i] + y.q[i]; return ret; }
static INLINE vquad vsub_vq_vq_vq(vquad x, vquad y) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = x.q[i] - y.q[i]; return ret; }
static INLINE vquad vmul_vq_vq_vq(vquad x, vquad y) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = x.q[i] * y.q[i]; return ret; }
static INLINE vquad vneg_vq_vq(vquad x) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = -x.q[i]; return ret; }
static INLINE vquad vsubadd_vq_vq_vq(vquad x, vquad y) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = (i & 1) == 0 ? x.q[i] - y.q[i] : x.q[i] + y.q[i]; return ret; }
static INLINE vquad vmlsubadd_vq_vq_vq_vq(vquad x, vquad y, vquad z) { return vsubadd_vq_vq_vq(vmul_vq_vq_vq(x, y), z); }
static INLINE vquad vposneg_vq_vq(vquad x) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = (i & 1) == 0 ? x.q[i] : -x.q[i]; return ret; }
static INLINE vquad vnegpos_vq_vq(vquad x) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = (i & 1) == 0 ? -x.q[i] : x.q[i]; return ret; }
static INLINE vquad vload_vq_p(const Sleef_quad *ptr) { return *(vquad *)ptr; }
static INLINE vquad vloadu_vq_p(const Sleef_quad *ptr) {
vquad vd;
for(int i=0;i<VECTLENDP;i++) vd.q[i] = ptr[i];
return vd;
}
static INLINE void vstore_v_p_vq(Sleef_quad *ptr, vquad v) { *(vquad *)ptr = v; }
static INLINE void vstoreu_v_p_vq(Sleef_quad *ptr, vquad v) {
for(int i=0;i<VECTLENDP;i++) ptr[i] = v.q[i];
}
static INLINE void vstream_v_p_vq(Sleef_quad *ptr, vquad v) { *(vquad *)ptr = v; }
static INLINE void vscatter2_v_p_i_i_vq(Sleef_quad *ptr, int offset, int step, vquad v) {
for(int i=0;i<VECTLENDP/2;i++) {
*(ptr+(offset + step * i)*2 + 0) = v.q[i*2+0];
*(ptr+(offset + step * i)*2 + 1) = v.q[i*2+1];
}
}
static INLINE void vsscatter2_v_p_i_i_vq(Sleef_quad *ptr, int offset, int step, vquad v) { vscatter2_v_p_i_i_vq(ptr, offset, step, v); }
#endif
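// Editor's note: in this portable array helper, the vstream_* and
// vsscatter2_* variants are plain stores -- standard C has no non-temporal
// ("streaming") store hint, so the streaming entry points only gain distinct
// behavior in the ISA-specific helpers.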

View File

@@ -0,0 +1,487 @@
// Copyright Naoki Shibata and contributors 2010 - 2023.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#if !defined(SLEEF_GENHEADER)
#include <stdint.h>
#endif
#ifndef ENABLE_BUILTIN_MATH
#if !defined(SLEEF_GENHEADER)
#include <math.h>
#endif
#define SQRT sqrt
#define SQRTF sqrtf
#define FMA fma
#define FMAF fmaf
#define RINT rint
#define RINTF rintf
#define TRUNC trunc
#define TRUNCF truncf
#else
#define SQRT __builtin_sqrt
#define SQRTF __builtin_sqrtf
#define FMA __builtin_fma
#define FMAF __builtin_fmaf
#define RINT __builtin_rint
#define RINTF __builtin_rintf
#define TRUNC __builtin_trunc
#define TRUNCF __builtin_truncf
#endif
#if !defined(SLEEF_GENHEADER)
#include "misc.h"
#endif
#ifndef CONFIG
#error CONFIG macro not defined
#endif
#define ENABLE_DP
//@#define ENABLE_DP
#define ENABLE_SP
//@#define ENABLE_SP
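// Editor's note: lines beginning with "//@#" appear to be directives for
// SLEEF's inline-header generator (the SLEEF_GENHEADER mode): the generator
// re-emits them as real preprocessor lines in the generated sleefinline_*.h
// headers, which is why each #define here is duplicated in commented form.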
#if CONFIG == 2 || CONFIG == 3
#define ENABLE_FMA_DP
//@#define ENABLE_FMA_DP
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP
#if defined(__AVX2__) || defined(__aarch64__) || defined(__arm__) || defined(__powerpc64__) || defined(__zarch__) || defined(__riscv) || CONFIG == 3
#ifndef FP_FAST_FMA
//@#ifndef FP_FAST_FMA
#define FP_FAST_FMA
//@#define FP_FAST_FMA
#endif
//@#endif
#ifndef FP_FAST_FMAF
//@#ifndef FP_FAST_FMAF
#define FP_FAST_FMAF
//@#define FP_FAST_FMAF
#endif
//@#endif
#endif
#if (!defined(FP_FAST_FMA) || !defined(FP_FAST_FMAF)) && !defined(SLEEF_GENHEADER)
#error FP_FAST_FMA or FP_FAST_FMAF not defined
#endif
#define ISANAME "Pure C scalar with FMA"
#else // #if CONFIG == 2 || CONFIG == 3
#define ISANAME "Pure C scalar"
#endif // #if CONFIG == 2 || CONFIG == 3
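// Editor's note: CONFIG selects the scalar variant at compile time:
// CONFIG == 2 (and 3) require a usable FMA and define ENABLE_FMA_DP/SP,
// while other values build the non-FMA "Pure C scalar" path. A hypothetical
// compile line (file name and flags illustrative only):
//
//     cc -O2 -DCONFIG=2 -mfma -c sleef_scalar_unit.c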
#define LOG2VECTLENDP 0
//@#define LOG2VECTLENDP 0
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)
#define LOG2VECTLENSP 0
//@#define LOG2VECTLENSP 0
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT
#if defined(__SSE4_1__) || defined(__aarch64__) || CONFIG == 3
#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#endif
#define DFTPRIORITY LOG2VECTLENDP
typedef uint64_t vmask;
typedef uint32_t vopmask;
typedef double vdouble;
typedef int32_t vint;
typedef float vfloat;
typedef int32_t vint2;
typedef int64_t vint64;
typedef uint64_t vuint64;
typedef Sleef_uint64_2t vquad;
#if CONFIG != 3
typedef Sleef_quad vargquad;
#else
typedef Sleef_uint64_2t vargquad;
#endif
//
static INLINE int vavailability_i(int name) { return -1; }
static INLINE void vprefetch_v_p(const void *ptr) {}
static INLINE int vtestallones_i_vo64(vopmask g) { return g; }
static INLINE int vtestallones_i_vo32(vopmask g) { return g; }
//
static vint2 vloadu_vi2_p(int32_t *p) { return *p; }
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { *p = v; }
static vint vloadu_vi_p(int32_t *p) { return *p; }
static void vstoreu_v_p_vi(int32_t *p, vint v) { *p = v; }
//
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return m; }
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return m; }
static INLINE vopmask vcast_vo_i(int i) { return i ? -1 : 0; }
static INLINE vmask vcast_vm_i_i(int h, int l) { return (((uint64_t)h) << 32) | (uint32_t)l; }
static INLINE vmask vcast_vm_i64(int64_t i) { return (int64_t)i; }
static INLINE vmask vcast_vm_u64(uint64_t i) { return i; }
static INLINE vmask vcastu_vm_vi(vint vi) { return ((uint64_t)vi) << 32; }
static INLINE vint vcastu_vi_vm(vmask vm) { return (int32_t)(vm >> 32); }
static INLINE vdouble vcast_vd_d(double d) { return d; }
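// Editor's note: on this scalar path a vmask is a single uint64_t, so
// vcast_vm_i_i(h, l) packs two 32-bit halves, e.g.
// vcast_vm_i_i(0x12345678, 0x9abcdef0) == 0x123456789abcdef0, and
// vcastu_vm_vi / vcastu_vi_vm move a 32-bit int to / from the upper half.
// Sketch (illustrative only, not part of upstream SLEEF):
#if 0
static int demo_vcast_vm(void) {
  vmask m = vcast_vm_i_i(0x12345678, (int)0x9abcdef0);
  return m == UINT64_C(0x123456789abcdef0) && vcastu_vi_vm(m) == 0x12345678;
}
#endif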
//
static INLINE vopmask vand_vo_vo_vo (vopmask x, vopmask y) { return x & y; }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return y & ~x; }
static INLINE vopmask vor_vo_vo_vo (vopmask x, vopmask y) { return x | y; }
static INLINE vopmask vxor_vo_vo_vo (vopmask x, vopmask y) { return x ^ y; }
static INLINE vmask vand_vm_vm_vm (vmask x, vmask y) { return x & y; }
static INLINE vmask vandnot_vm_vm_vm (vmask x, vmask y) { return y & ~x; }
static INLINE vmask vor_vm_vm_vm (vmask x, vmask y) { return x | y; }
static INLINE vmask vxor_vm_vm_vm (vmask x, vmask y) { return x ^ y; }
static INLINE vmask vcast_vm_vo(vopmask o) { return (vmask)o | (((vmask)o) << 32); }
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vcast_vm_vo(x) & y; }
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return y & ~vcast_vm_vo(x); }
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vcast_vm_vo(x) | y; }
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vcast_vm_vo(x) ^ y; }
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vcast_vm_vo(x) & y; }
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return y & ~vcast_vm_vo(x); }
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vcast_vm_vo(x) | y; }
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vcast_vm_vo(x) ^ y; }
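// Editor's note: a vopmask here is a 32-bit all-ones/all-zeros value while
// vmask is 64-bit, so vcast_vm_vo(o) duplicates the opmask into both 32-bit
// halves ((vmask)o | ((vmask)o << 32)); that lets the vo32 and vo64 logic
// ops above share one implementation.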
//
static INLINE vdouble vsel_vd_vo_vd_vd (vopmask o, vdouble x, vdouble y) { return o ? x : y; }
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y) { return o ? x : y; }
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) { return o ? v1 : v0; }
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}
static INLINE vdouble vcast_vd_vi(vint vi) { return vi; }
static INLINE vint vcast_vi_i(int j) { return j; }
#ifdef FULL_FP_ROUNDING
static INLINE vint vrint_vi_vd(vdouble d) { return (int32_t)RINT(d); }
static INLINE vdouble vrint_vd_vd(vdouble vd) { return RINT(vd); }
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return TRUNC(vd); }
static INLINE vint vtruncate_vi_vd(vdouble vd) { return (int32_t)TRUNC(vd); }
#else
static INLINE vint vrint_vi_vd(vdouble a) {
a += a > 0 ? 0.5 : -0.5;
uint64_t vx;
memcpy(&vx, &a, sizeof(vx));
vx -= 1 & (int)a;
memcpy(&a, &vx, sizeof(a));
return a;
}
static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); }
static INLINE vint vtruncate_vi_vd(vdouble vd) { return vd; }
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); }
#endif
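// Editor's note: the fallback vrint_vi_vd above implements round-to-nearest,
// ties to even, without touching the FPU rounding mode: adding +/-0.5 rounds
// ties away from zero, and "vx -= 1 & (int)a" then subtracts one ulp from the
// bit pattern exactly when the intermediate integer is odd, pulling .5 ties
// back to the even neighbor before the final truncating conversion.
// Worked example: 2.5 -> 3.0 (odd) -> minus one ulp -> truncates to 2;
// 3.5 -> 4.0 (even) -> unchanged -> 4. The single-precision vrint_vi2_vf
// below uses the same trick.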
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return x == y ? ~(uint32_t)0 : 0; }
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return x + y; }
//
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { vmask vm; memcpy(&vm, &vd, sizeof(vm)); return vm; }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { vdouble vd; memcpy(&vd, &vm, sizeof(vd)); return vd; }
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return x + y; }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return x - y; }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return x * y; }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return x / y; }
static INLINE vdouble vrec_vd_vd(vdouble x) { return 1 / x; }
static INLINE vdouble vabs_vd_vd(vdouble d) {
uint64_t vx;
memcpy(&vx, &d, sizeof(vx));
vx &= UINT64_C(0x7fffffffffffffff);
memcpy(&d, &vx, sizeof(d));
return d;
}
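// Editor's note: vabs_vd_vd clears bit 63, the IEEE 754 sign bit, directly on
// the bit pattern, so it also maps -0.0 to +0.0 and preserves NaN payloads.
// Sketch (illustrative only, not part of upstream SLEEF):
#if 0
static int demo_vabs(void) {
  return vreinterpret_vm_vd(vabs_vd_vd(-0.0)) == 0;  // +0.0 has an all-zero pattern
}
#endif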
static INLINE vdouble vneg_vd_vd(vdouble d) { return -d; }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return x > y ? x : y; }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return x < y ? x : y; }
#ifndef ENABLE_FMA_DP
static INLINE vdouble vmla_vd_vd_vd_vd (vdouble x, vdouble y, vdouble z) { return x * y + z; }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return x * y - z; }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return -x * y + z; }
#else
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(x, y, z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(x, y, -z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(-x, y, z); }
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(x, y, z); }
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(x, y, z); }
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(x, y, -z); }
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(-x, y, z); }
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(-x, y, -z); }
#endif
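// Editor's note on the naming: in vfmaXY the two letters give the signs
// applied to the product and to the addend, p = plus, n = minus, so
// vfmapn(x, y, z) = fma(x, y, -z) = x*y - z and vfmanp(x, y, z) =
// fma(-x, y, z) = -x*y + z. The vmla* entry points compute the same
// expressions but are only contracted into a single fused operation when
// ENABLE_FMA_DP is defined.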
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return x == y ? ~(uint32_t)0 : 0; }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return x != y ? ~(uint32_t)0 : 0; }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return x < y ? ~(uint32_t)0 : 0; }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return x <= y ? ~(uint32_t)0 : 0; }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return x > y ? ~(uint32_t)0 : 0; }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return x >= y ? ~(uint32_t)0 : 0; }
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return x + y; }
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return x - y; }
static INLINE vint vneg_vi_vi (vint x) { return - x; }
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return x & y; }
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return y & ~x; }
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return x | y; }
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return x ^ y; }
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return x & y; }
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return y & ~x; }
static INLINE vint vsll_vi_vi_i(vint x, int c) { return (uint32_t)x << c; }
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return (uint32_t)x >> c; }
static INLINE vint vsra_vi_vi_i(vint x, int c) { return x >> c; }
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return x == y ? ~(uint32_t)0 : 0; }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return x > y ? ~(uint32_t)0 : 0; }
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return m ? x : y; }
static INLINE vopmask visinf_vo_vd(vdouble d) { return (d == SLEEF_INFINITY || d == -SLEEF_INFINITY) ? ~(uint32_t)0 : 0; }
static INLINE vopmask vispinf_vo_vd(vdouble d) { return d == SLEEF_INFINITY ? ~(uint32_t)0 : 0; }
static INLINE vopmask visminf_vo_vd(vdouble d) { return d == -SLEEF_INFINITY ? ~(uint32_t)0 : 0; }
static INLINE vopmask visnan_vo_vd(vdouble d) { return d != d ? ~(uint32_t)0 : 0; }
static INLINE vdouble vsqrt_vd_vd(vdouble d) { return SQRT(d); }
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return SQRTF(x); }
static INLINE double vcast_d_vd(vdouble v) { return v; }
static INLINE vdouble vload_vd_p(const double *ptr) { return *ptr; }
static INLINE vdouble vloadu_vd_p(const double *ptr) { return *ptr; }
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return ptr[vi]; }
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { *ptr = v; }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { *ptr = v; }
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { *ptr = v; }
//
static INLINE vint2 vcast_vi2_vm(vmask vm) { return (int32_t)vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { return (uint32_t)vi; }
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return (int32_t)vi; }
static INLINE vint2 vcast_vi2_i(int j) { return j; }
#ifdef FULL_FP_ROUNDING
static INLINE vint2 vrint_vi2_vf(vfloat d) { return (int)RINTF(d); }
static INLINE vfloat vrint_vf_vf(vfloat vd) { return RINTF(vd); }
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return TRUNCF(vd); }
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return (int32_t)TRUNCF(vf); }
#else
static INLINE vint2 vrint_vi2_vf(vfloat a) {
a += a > 0 ? 0.5f : -0.5f;
uint32_t vu[1];
memcpy(vu, &a, sizeof(vu));
vu[0] -= 1 & (int)a;
memcpy(&a, vu, sizeof(a));
return (int32_t)a;
}
static INLINE vfloat vrint_vf_vf(vfloat vd) { return vcast_vf_vi2(vrint_vi2_vf(vd)); }
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vf; }
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
#endif
static INLINE vfloat vcast_vf_f(float f) { return f; }
static INLINE vmask vreinterpret_vm_vf(vfloat f) { vfloat vf[2] = { f, 0 }; vmask vm; memcpy(&vm, &vf, sizeof(vm)); return vm; }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { vfloat vf[2]; memcpy(&vf, &vm, sizeof(vf)); return vf[0]; }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { vfloat vf; memcpy(&vf, &vi, sizeof(vf)); return vf; }
static INLINE vint2 vreinterpret_vi2_vf(vfloat f) { vint2 vi2; memcpy(&vi2, &f, sizeof(vi2)); return vi2; }
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return x + y; }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return x - y; }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return x * y; }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return x / y; }
static INLINE vfloat vrec_vf_vf (vfloat x) { return 1 / x; }
static INLINE vfloat vabs_vf_vf(vfloat x) {
int32_t vi[1];
memcpy(vi, &x, sizeof(vi));
vi[0] &= 0x7fffffff;
memcpy(&x, vi, sizeof(x));
return x;
}
static INLINE vfloat vneg_vf_vf(vfloat x) { return -x; }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return x > y ? x : y; }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return x < y ? x : y; }
#ifndef ENABLE_FMA_SP
static INLINE vfloat vmla_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { return x * y + z; }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return - x * y + z; }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return x * y - z; }
#else
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(x, y, z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(x, y, -z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(-x, y, z); }
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(x, y, z); }
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(x, y, z); }
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(x, y, -z); }
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(-x, y, z); }
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(-x, y, -z); }
#endif
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return x == y ? ~(uint32_t)0 : 0; }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return x != y ? ~(uint32_t)0 : 0; }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return x < y ? ~(uint32_t)0 : 0; }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return x <= y ? ~(uint32_t)0 : 0; }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return x > y ? ~(uint32_t)0 : 0; }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return x >= y ? ~(uint32_t)0 : 0; }
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return x + y; }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return x - y; }
static INLINE vint2 vneg_vi2_vi2(vint2 x) { return -x; }
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return x & y; }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return y & ~x; }
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return x | y; }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return x ^ y; }
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return o ? x : y; }
static INLINE vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) { return o ? v1 : v0; }
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vcast_vm_vo(x) & y; }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return y & ~vcast_vm_vo(x); }
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) {
return x << c;
}
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) {
return ((uint32_t)x) >> c;
}
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) {
return x >> c;
}
static INLINE vopmask visinf_vo_vf (vfloat d) { return (d == SLEEF_INFINITYf || d == -SLEEF_INFINITYf) ? ~(uint32_t)0 : 0; }
static INLINE vopmask vispinf_vo_vf(vfloat d) { return d == SLEEF_INFINITYf ? ~(uint32_t)0 : 0; }
static INLINE vopmask visminf_vo_vf(vfloat d) { return d == -SLEEF_INFINITYf ? ~(uint32_t)0 : 0; }
static INLINE vopmask visnan_vo_vf (vfloat d) { return d != d ? ~(uint32_t)0 : 0; }
static INLINE vopmask veq_vo_vi2_vi2 (vint2 x, vint2 y) { return (int32_t)x == (int32_t)y ? ~(uint32_t)0 : 0; }
static INLINE vopmask vgt_vo_vi2_vi2 (vint2 x, vint2 y) { return (int32_t)x > (int32_t)y ? ~(uint32_t)0 : 0; }
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return (int32_t)x == (int32_t)y ? ~(uint32_t)0 : 0; }
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return (int32_t)x > (int32_t)y ? ~(uint32_t)0 : 0; }
static INLINE float vcast_f_vf(vfloat v) { return v; }
static INLINE vfloat vload_vf_p(const float *ptr) { return *ptr; }
static INLINE vfloat vloadu_vf_p(const float *ptr) { return *ptr; }
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi) { return ptr[vi]; }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { *ptr = v; }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { *ptr = v; }
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { *ptr = v; }
//
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
static vquad loadu_vq_p(void *p) {
vquad vq;
memcpy(8 + (char *)&vq, p, 8);
memcpy((char *)&vq, 8 + (char *)p, 8);
return vq;
}
static INLINE vquad cast_vq_aq(vargquad aq) {
vquad vq;
memcpy(8 + (char *)&vq, (char *)&aq, 8);
memcpy((char *)&vq, 8 + (char *)&aq, 8);
return vq;
}
static INLINE vargquad cast_aq_vq(vquad vq) {
vargquad aq;
memcpy(8 + (char *)&aq, (char *)&vq, 8);
memcpy((char *)&aq, 8 + (char *)&vq, 8);
return aq;
}
#else
static vquad loadu_vq_p(void *p) {
vquad vq;
memcpy(&vq, p, sizeof(vq));
return vq;
}
static INLINE vquad cast_vq_aq(vargquad aq) {
vquad vq;
memcpy(&vq, &aq, sizeof(vq));
return vq;
}
static INLINE vargquad cast_aq_vq(vquad vq) {
vargquad aq;
memcpy(&aq, &vq, sizeof(aq));
return aq;
}
#endif
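// Editor's note: on big-endian targets the two 8-byte halves of the 16-byte
// quad are swapped by the memcpy pairs above, so that the in-memory vargquad
// and the internal vquad agree on which half holds which 64-bit word; on
// little-endian targets a single memcpy of the whole 16 bytes suffices.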
//
static INLINE int vtestallzeros_i_vo64(vopmask g) { return !g ? ~(uint32_t)0 : 0; }
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) { return o ? x : y; }
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return (int64_t)x - (int64_t)y; }
static INLINE vmask vneg64_vm_vm(vmask x) { return -(int64_t)x; }
#define vsll64_vm_vm_i(x, c) ((uint64_t)(x) << (c))
#define vsrl64_vm_vm_i(x, c) ((uint64_t)(x) >> (c))
//@#define vsll64_vm_vm_i(x, c) ((uint64_t)(x) << (c))
//@#define vsrl64_vm_vm_i(x, c) ((uint64_t)(x) >> (c))
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return (int64_t)x > (int64_t)y ? ~(uint32_t)0 : 0; }
static INLINE vmask vcast_vm_vi(vint vi) { return vi; }
static INLINE vint vcast_vi_vm(vmask vm) { return vm; }
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; }
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return m; }
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return v; }
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return m; }

File diff suppressed because it is too large

View File

@@ -0,0 +1,462 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#if CONFIG == 140 || CONFIG == 141 || CONFIG == 150 || CONFIG == 151
#if !defined(__VX__) && !defined(SLEEF_GENHEADER)
#error This helper is for IBM s390x.
#endif
#if __ARCH__ < 12 && !defined(SLEEF_GENHEADER)
#error Please specify -march=z14 or higher.
#endif
#else
#error CONFIG macro invalid or not defined
#endif
#define ENABLE_DP
//@#define ENABLE_DP
#define LOG2VECTLENDP 1
//@#define LOG2VECTLENDP 1
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)
#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)
#if CONFIG == 140 || CONFIG == 150
#define ENABLE_FMA_DP
//@#define ENABLE_FMA_DP
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP
#endif
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT
#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#if !defined(SLEEF_GENHEADER)
#ifndef SLEEF_VECINTRIN_H_INCLUDED
#include <vecintrin.h>
#define SLEEF_VECINTRIN_H_INCLUDED
#endif
#include <stdint.h>
#include <math.h>
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)
typedef __vector unsigned long long vmask;
typedef __vector unsigned long long vopmask;
typedef __vector double vdouble;
typedef __vector int vint;
typedef __vector float vfloat;
typedef __vector int vint2;
typedef __vector long long vint64;
typedef __vector unsigned long long vuint64;
typedef struct {
vmask x, y;
} vquad;
typedef vquad vargquad;
//
#if !defined(SLEEF_GENHEADER)
static INLINE int vavailability_i(int n) {
if (n == 1 || n == 2) {
return vec_max((vdouble) {n, n}, (vdouble) {n, n})[0] != 0;
}
return 0;
}
#if CONFIG == 140 || CONFIG == 141
#define ISANAME "VXE"
#else
#define ISANAME "VXE2"
#endif
#define DFTPRIORITY 14
#endif // #if !defined(SLEEF_GENHEADER)
static INLINE void vprefetch_v_p(const void *ptr) { }
static vint2 vloadu_vi2_p(int32_t *p) { return (vint2) { p[0], p[1], p[2], p[3] }; }
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { p[0] = v[0]; p[1] = v[1]; p[2] = v[2]; p[3] = v[3]; }
static vint vloadu_vi_p(int32_t *p) { return (vint) { p[0], p[1] }; }
static void vstoreu_v_p_vi(int32_t *p, vint v) { p[0] = v[0]; p[1] = v[1]; }
static INLINE vdouble vload_vd_p(const double *p) { return (vdouble) { p[0], p[1] }; }
static INLINE void vstore_v_p_vd(double *p, vdouble v) { p[0] = v[0]; p[1] = v[1]; }
static INLINE vdouble vloadu_vd_p(const double *p) { return (vdouble) { p[0], p[1] }; }
static INLINE void vstoreu_v_p_vd(double *p, vdouble v) { p[0] = v[0]; p[1] = v[1]; }
static INLINE vfloat vload_vf_p(const float *p) { return (vfloat) { p[0], p[1], p[2], p[3] }; }
static INLINE void vstore_v_p_vf(float *p, vfloat v) { p[0] = v[0]; p[1] = v[1]; p[2] = v[2]; p[3] = v[3]; }
static INLINE void vscatter2_v_p_i_i_vf(float *p, int offset, int step, vfloat v) {
*(p+(offset + step * 0)*2 + 0) = v[0];
*(p+(offset + step * 0)*2 + 1) = v[1];
*(p+(offset + step * 1)*2 + 0) = v[2];
*(p+(offset + step * 1)*2 + 1) = v[3];
}
static INLINE vfloat vloadu_vf_p(const float *p) { return (vfloat) { p[0], p[1], p[2], p[3] }; }
static INLINE void vstoreu_v_p_vf(float *p, vfloat v) { p[0] = v[0]; p[1] = v[1]; p[2] = v[2]; p[3] = v[3]; }
static INLINE void vscatter2_v_p_i_i_vd(double *p, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&p[2*offset]), v); }
static INLINE vdouble vgather_vd_p_vi(const double *p, vint vi) {
return ((vdouble) { p[vi[0]], p[vi[1]] });
}
static INLINE vfloat vgather_vf_p_vi2(const float *p, vint2 vi2) {
return ((vfloat) { p[vi2[0]], p[vi2[1]], p[vi2[2]], p[vi2[3]] });
}
static INLINE vopmask vcast_vo_i(int i) { return (vopmask) { i ? (long long)-1 : 0, i ? (long long)-1 : 0 }; }
static INLINE vint vcast_vi_i(int i) { return (vint) { i, i }; }
static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i }; }
static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f }; }
static INLINE vdouble vcast_vd_d(double d) { return (vdouble) { d, d }; }
static INLINE vdouble vcast_vd_vi(vint vi) { return (vdouble) { vi[0], vi[1] }; }
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return (vfloat) { vi[0], vi[1], vi[2], vi[3] }; }
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return __builtin_s390_vfidb(vd, 4, 5); }
static INLINE vdouble vrint_vd_vd(vdouble vd) { return __builtin_s390_vfidb(vd, 4, 4); }
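// Editor's note: __builtin_s390_vfidb is the z/Architecture VECTOR FP
// INTEGER operation; the last argument selects the rounding mode -- 5 rounds
// toward zero (truncate) and 4 rounds to nearest with ties to even -- while
// the 4 in the second argument suppresses inexact exceptions.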
static INLINE vint vrint_vi_vd(vdouble vd) {
vd = vrint_vd_vd(vd);
return (vint) { vd[0], vd[1] };
}
static INLINE vint vtruncate_vi_vd(vdouble vd) { return (vint) { vd[0], vd[1] }; }
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return (vint2) { vf[0], vf[1], vf[2], vf[3] }; }
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return (vmask)vd; }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return (vdouble)vm; }
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (vfloat)vm; }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return (vfloat)vi; }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return (vint2)vf; }
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return x + y; }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return x - y; }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return x * y; }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return x / y; }
static INLINE vdouble vrec_vd_vd(vdouble x) { return 1 / x; }
static INLINE vdouble vneg_vd_vd(vdouble d) { return -d; }
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return x + y; }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return x - y; }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return x * y; }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return x / y; }
static INLINE vfloat vrec_vf_vf(vfloat x) { return 1 / x; }
static INLINE vfloat vneg_vf_vf(vfloat d) { return -d; }
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return x & y; }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return y & ~x; }
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return x | y; }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return x ^ y; }
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return x & y; }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return y & ~x; }
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return x | y; }
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return x ^ y; }
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return x & y; }
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return y & ~x; }
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return x | y; }
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return x ^ y; }
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return x & y; }
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return y & ~x; }
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return x | y; }
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return x ^ y; }
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return vec_sel(y, x, o); }
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return vec_sel(y, x, (__vector unsigned int)o); }
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y) { return vec_sel(y, x, (__vector unsigned int)o); }
static INLINE int vtestallones_i_vo32(vopmask g) { return vec_all_ne((vint2)g, (vint2 ) { 0, 0, 0, 0 }); }
static INLINE int vtestallones_i_vo64(vopmask g) { return vec_all_ne((__vector unsigned long long)g, (__vector unsigned long long) { 0, 0 }); }
static INLINE vopmask vcast_vo32_vo64(vopmask g) { return (vopmask)(vint) { g[0] != 0 ? -1 : 0, g[1] != 0 ? -1 : 0, 0, 0 }; }
static INLINE vopmask vcast_vo64_vo32(vopmask g) { return (vopmask) { ((vint)g)[0] != 0 ? 0xffffffffffffffffLL : 0, ((vint)g)[1] != 0 ? 0xffffffffffffffffLL : 0 }; }
static INLINE vmask vcast_vm_i_i(int h, int l) { return (vmask)(vint){ h, l, h, l }; }
static INLINE vmask vcast_vm_i64(int64_t i) { return (vmask)(vint64){ i, i }; }
static INLINE vmask vcast_vm_u64(uint64_t i) { return (vmask)(vuint64){ i, i }; }
static INLINE vmask vcastu_vm_vi(vint vi) { return (vmask)(vint2){ vi[0], 0, vi[1], 0 }; }
static INLINE vint vcastu_vi_vm(vmask vi2) { return (vint){ vi2[0] >> 32, vi2[1] >> 32 }; }
static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) { return (vint){ vi2[0], vi2[1] }; }
static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) { return (vint2){ vi[0], vi[1], 0, 0 }; }
static INLINE vdouble vrev21_vd_vd(vdouble vd) { return (vdouble) { vd[1], vd[0] }; }
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vd; }
static INLINE vfloat vrev21_vf_vf(vfloat vd) { return (vfloat) { vd[1], vd[0], vd[3], vd[2] }; }
static INLINE vfloat vreva2_vf_vf(vfloat vd) { return (vfloat) { vd[2], vd[3], vd[0], vd[1] }; }
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
return (vopmask) { x[0] == y[0] ? 0xffffffffffffffffLL : 0, x[1] == y[1] ? 0xffffffffffffffffLL : 0 };
}
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) {
return (vmask)((__vector long long)x + (__vector long long)y);
}
//
#define PNMASK ((vdouble) { +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0 })
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }
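// Editor's note: vnegpos/vposneg flip the sign of alternate lanes by XORing
// in the sign bits of NPMASK/PNMASK, e.g. vnegpos_vd_vd({a, b}) == {-a, b};
// vsubadd (defined just below) builds on this to compute {x0 - y0, x1 + y1}.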
//
static INLINE vdouble vabs_vd_vd(vdouble d) { return vec_abs(d); }
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
#if CONFIG == 140 || CONFIG == 150
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_madd(x, y, z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_msub(x, y, z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_nmsub(x, y, z); }
#else
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
#endif
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_madd(x, y, z); }
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_madd(x, y, z); }
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_msub(x, y, z); }
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_nmsub(x, y, z); }
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_nmadd(x, y, z); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vadd_vf_vf_vf(x, vnegpos_vf_vf(y)); }
#if CONFIG == 140 || CONFIG == 150
static INLINE vfloat vmla_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { return __builtin_s390_vfmasb(x, y, z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_nmsub(x, y, z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return __builtin_s390_vfmssb(x, y, z); }
static INLINE vfloat vfma_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { return __builtin_s390_vfmasb(x, y, z); }
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return __builtin_s390_vfmasb(x, y, z); }
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return __builtin_s390_vfmssb(x, y, z); }
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_nmsub(x, y, z); }
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_nmadd(x, y, z); }
#else
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
#endif
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }
//
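// Multi-condition selects are built by cascading two-way selects: the first
// mask that is set determines which constant is returned.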
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
}
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}
//
static INLINE vopmask vnot_vo_vo(vopmask o) { return ~o; }
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmpeq(x, y); }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vnot_vo_vo(vec_cmpeq(x, y)); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmplt(x, y); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmple(x, y); }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmpgt(x, y); }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmpge(x, y); }
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return x + y; }
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return x - y; }
static INLINE vint vneg_vi_vi(vint e) { return -e; }
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return x & y; }
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return y & ~x; }
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return x | y; }
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return x ^ y; }
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return vreinterpretFirstHalf_vi_vi2((vint2)x) & y; }
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return vec_andc(y, vreinterpretFirstHalf_vi_vi2((vint2)x)); }
static INLINE vint vsll_vi_vi_i(vint x, int c) { return (vint)(((__vector unsigned int)x) << (__vector unsigned int){c, c, c, c}); }
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return (vint)(((__vector unsigned int)x) >> (__vector unsigned int){c, c, c, c}); }
static INLINE vint vsra_vi_vi_i(vint x, int c) { return x >> (__vector int){c, c, c, c}; }
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return vec_cmpeq(x, y); }
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return vec_cmpgt(x, y); }
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return (vopmask)vreinterpretFirstHalf_vi2_vi(vec_cmpeq(x, y)); }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return (vopmask)vreinterpretFirstHalf_vi2_vi(vec_cmpgt(x, y)); }
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
return vor_vi_vi_vi(vand_vi_vi_vi(vreinterpretFirstHalf_vi_vi2((vint2)m), x),
vandnot_vi_vi_vi(vreinterpretFirstHalf_vi_vi2((vint2)m), y));
}
static INLINE vopmask visinf_vo_vd(vdouble d) { return (vopmask)(vec_cmpeq(vabs_vd_vd(d), vcast_vd_d(SLEEF_INFINITY))); }
static INLINE vopmask vispinf_vo_vd(vdouble d) { return (vopmask)(vec_cmpeq(d, vcast_vd_d(SLEEF_INFINITY))); }
static INLINE vopmask visminf_vo_vd(vdouble d) { return (vopmask)(vec_cmpeq(d, vcast_vd_d(-SLEEF_INFINITY))); }
static INLINE vopmask visnan_vo_vd(vdouble d) { return (vopmask)(vnot_vo_vo(vec_cmpeq(d, d))); }
static INLINE double vcast_d_vd(vdouble v) { return v[0]; }
static INLINE float vcast_f_vf(vfloat v) { return v[0]; }
static INLINE void vstream_v_p_vd(double *p, vdouble v) { vstore_v_p_vd(p, v); }
static INLINE void vsscatter2_v_p_i_i_vd(double *p, int offset, int step, vdouble v) { vscatter2_v_p_i_i_vd(p, offset, step, v); }
//
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}
static INLINE vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { return (vmask)vi; }
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return x + y; }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return x - y; }
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return -e; }
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return x & y; }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return y & ~x; }
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return x | y; }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return x ^ y; }
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)x & y; }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return y & ~(vint2)x; }
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return (vint2)(((__vector unsigned int)x) << (__vector unsigned int){c, c, c, c}); }
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return (vint2)(((__vector unsigned int)x) >> (__vector unsigned int){c, c, c, c}); }
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return x >> (__vector int){c, c, c, c}; }
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return (vopmask)vec_cmpeq(x, y); }
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return (vopmask)vec_cmpgt(x, y); }
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return vec_cmpeq(x, y); }
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return vec_cmpgt(x, y); }
static INLINE void vsscatter2_v_p_i_i_vf(float *p, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(p, offset, step, v); }
static INLINE void vstream_v_p_vf(float *p, vfloat v) { vstore_v_p_vf(p, v); }
//
static INLINE vdouble vsqrt_vd_vd(vdouble d) { return vec_sqrt(d); }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return vec_max(x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return vec_min(x, y); }
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)vec_cmpeq(x, y); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)vnot_vo_vo(vec_cmpeq(x, y)); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)vec_cmplt(x, y); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)vec_cmple(x, y); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)vec_cmpgt(x, y); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)vec_cmpge(x, y); }
static INLINE vfloat vabs_vf_vf(vfloat f) { return vec_abs(f); }
static INLINE vfloat vrint_vf_vf(vfloat vf) { return __builtin_s390_vfisb(vf, 4, 4); }
static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return __builtin_s390_vfisb(vf, 4, 5); }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return vec_max(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return vec_min(x, y); }
static INLINE vfloat vsqrt_vf_vf(vfloat d) { return vec_sqrt(d); }
static INLINE vopmask visinf_vo_vf (vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
static INLINE vopmask visnan_vo_vf (vfloat d) { return vneq_vo_vf_vf(d, d); }
static INLINE vint2 vrint_vi2_vf(vfloat vf) {
vf = vrint_vf_vf(vf);
return (vint) { vf[0], vf[1], vf[2], vf[3] };
}
//
static vquad loadu_vq_p(void *p) {
vquad vq;
memcpy(&vq, p, VECTLENDP * 16);
return vq;
}
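// The argument struct and the internal quad representation hold their two
// 128-bit halves in opposite order, so the casts below swap x and y.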
static INLINE vquad cast_vq_aq(vargquad aq) {
vquad m = { aq.y, aq.x };
return m;
}
static INLINE vargquad cast_aq_vq(vquad vq) {
vargquad a = { vq.y, vq.x };
return a;
}
static INLINE int vtestallzeros_i_vo64(vopmask g) {
return vec_all_eq((__vector signed long long)g, (__vector signed long long){ 0, 0 });
}
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) {
return (vmask)vec_sel((__vector signed long long)y, (__vector signed long long)x, (__vector __bool long long)o);
}
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) {
return (vmask)((__vector signed long long)x - (__vector signed long long)y);
}
static INLINE vmask vneg64_vm_vm(vmask x) {
return (vmask)((__vector signed long long) {0, 0} - (__vector signed long long)x);
}
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
return (vopmask)vec_cmpgt((__vector signed long long)x, (__vector signed long long)y);
}
#define vsll64_vm_vm_i(x, c) ((vmask)((__vector unsigned long long)x << (__vector unsigned long long) { c, c }))
#define vsrl64_vm_vm_i(x, c) ((vmask)((__vector unsigned long long)x >> (__vector unsigned long long) { c, c }))
static INLINE vint vcast_vi_vm(vmask vm) {
return (vint) { vm[0], vm[1] };
}
static INLINE vmask vcast_vm_vi(vint vi) {
return (vmask) (__vector signed long long) { vi[0], vi[1] };
}
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return (vmask)v; }
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return (vint64)m; }
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return (vmask)v; }
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return (vuint64)m; }

View File

@@ -0,0 +1,517 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
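// CONFIG selects the x86 ISA level for this helper: 2 requires SSE2,
// 3 additionally requires SSE3, and 4 additionally requires SSE4.1.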
#if CONFIG == 2
#if !defined(__SSE2__) && !defined(SLEEF_GENHEADER)
#error Please specify -msse2.
#endif
#elif CONFIG == 3
#if (!defined(__SSE2__) || !defined(__SSE3__)) && !defined(SLEEF_GENHEADER)
#error Please specify -msse2 and -msse3
#endif
#elif CONFIG == 4
#if (!defined(__SSE2__) || !defined(__SSE3__) || !defined(__SSE4_1__)) && !defined(SLEEF_GENHEADER)
#error Please specify -msse2, -msse3 and -msse4.1
#endif
#else
#error CONFIG macro invalid or not defined
#endif
#define ENABLE_DP
//@#define ENABLE_DP
#define LOG2VECTLENDP 1
//@#define LOG2VECTLENDP 1
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)
#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT
#if !defined(SLEEF_GENHEADER)
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#include <stdint.h>
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)
typedef __m128i vmask;
typedef __m128i vopmask;
typedef __m128d vdouble;
typedef __m128i vint;
typedef __m128 vfloat;
typedef __m128i vint2;
typedef __m128i vint64;
typedef __m128i vuint64;
typedef struct {
vmask x, y;
} vquad;
typedef vquad vargquad;
//
#if !defined(SLEEF_GENHEADER)
#ifndef __SLEEF_H__
void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx);
#endif
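// Runtime feature detection via CPUID leaf 1: SSE2 is EDX bit 26,
// SSE3 is ECX bit 0, and SSE4.1 is ECX bit 19.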
static INLINE int cpuSupportsSSE2() {
int32_t reg[4];
Sleef_x86CpuID(reg, 1, 0);
return (reg[3] & (1 << 26)) != 0;
}
static INLINE int cpuSupportsSSE3() {
int32_t reg[4];
Sleef_x86CpuID(reg, 1, 0);
return (reg[2] & (1 << 0)) != 0;
}
static INLINE int cpuSupportsSSE4_1() {
int32_t reg[4];
Sleef_x86CpuID(reg, 1, 0);
return (reg[2] & (1 << 19)) != 0;
}
#if defined(__SSE2__) && defined(__SSE3__) && defined(__SSE4_1__)
static INLINE int vavailability_i(int name) {
//int d = __builtin_cpu_supports("sse2") && __builtin_cpu_supports("sse3") && __builtin_cpu_supports("sse4.1");
int d = cpuSupportsSSE2() && cpuSupportsSSE3() && cpuSupportsSSE4_1();
return d ? 3 : 0;
}
#define ISANAME "SSE4.1"
#define DFTPRIORITY 12
#elif defined(__SSE2__) && defined(__SSE3__)
static INLINE int vavailability_i(int name) {
//int d = __builtin_cpu_supports("sse2") && __builtin_cpu_supports("sse3");
int d = cpuSupportsSSE2() && cpuSupportsSSE3();
return d ? 3 : 0;
}
#define ISANAME "SSE3"
#define DFTPRIORITY 11
#else
static INLINE int vavailability_i(int name) {
int d = cpuSupportsSSE2();
return d ? 3 : 0;
}
#define ISANAME "SSE2"
#define DFTPRIORITY 10
#endif
#endif // #if !defined(SLEEF_GENHEADER)
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }
static INLINE int vtestallones_i_vo32(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; }
static INLINE int vtestallones_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; }
//
static vint2 vloadu_vi2_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm_storeu_si128((__m128i *)p, v); }
static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }
static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); }
//
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return _mm_and_si128(x, y); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return _mm_andnot_si128(x, y); }
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return _mm_or_si128(x, y); }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return _mm_xor_si128(x, y); }
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return _mm_and_si128(x, y); }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return _mm_andnot_si128(x, y); }
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return _mm_or_si128(x, y); }
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return _mm_xor_si128(x, y); }
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return _mm_and_si128(x, y); }
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return _mm_or_si128(x, y); }
static INLINE vmask vandnot_vm_vo64_vm(vmask x, vmask y) { return _mm_andnot_si128(x, y); }
static INLINE vmask vxor_vm_vo64_vm(vmask x, vmask y) { return _mm_xor_si128(x, y); }
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return _mm_and_si128(x, y); }
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return _mm_or_si128(x, y); }
static INLINE vmask vandnot_vm_vo32_vm(vmask x, vmask y) { return _mm_andnot_si128(x, y); }
static INLINE vmask vxor_vm_vo32_vm(vmask x, vmask y) { return _mm_xor_si128(x, y); }
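// Mask width conversion via _mm_shuffle_epi32: immediate 0x08 packs the low
// 32 bits of each 64-bit lane into the lower half, and 0x50 duplicates each
// 32-bit element into a full 64-bit lane.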
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return _mm_shuffle_epi32(m, 0x08); }
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return _mm_shuffle_epi32(m, 0x50); }
static INLINE vopmask vcast_vo_i(int i) { return _mm_set1_epi64x(i ? -1 : 0); }
//
static INLINE vint vrint_vi_vd(vdouble vd) { return _mm_cvtpd_epi32(vd); }
static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm_cvttpd_epi32(vd); }
static INLINE vdouble vcast_vd_vi(vint vi) { return _mm_cvtepi32_pd(vi); }
static INLINE vint vcast_vi_i(int i) { return _mm_set_epi32(0, 0, i, i); }
static INLINE vint2 vcastu_vm_vi(vint vi) { return _mm_and_si128(_mm_shuffle_epi32(vi, 0x73), _mm_set_epi32(-1, 0, -1, 0)); }
static INLINE vint vcastu_vi_vm(vint2 vi) { return _mm_shuffle_epi32(vi, 0x0d); }
#if CONFIG == 4
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm_cmpeq_epi64(x, y); }
#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#else
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); }
static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); }
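// SSE2 has no 64-bit compare: compare the 32-bit halves, then AND with the
// pair-swapped result (shuffle 0xb1) so that both halves must match.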
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
vmask t = _mm_cmpeq_epi32(x, y);
return vand_vm_vm_vm(t, _mm_shuffle_epi32(t, 0xb1));
}
#endif
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm_add_epi64(x, y); }
static INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm_set_epi32(i0, i1, i0, i1); }
static INLINE vmask vcast_vm_i64(int64_t i) { return _mm_set1_epi64x(i); }
static INLINE vmask vcast_vm_u64(uint64_t i) { return _mm_set1_epi64x((uint64_t)i); }
//
static INLINE vdouble vcast_vd_d(double d) { return _mm_set1_pd(d); }
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm_castpd_si128(vd); }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm_castsi128_pd(vm); }
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_add_pd(x, y); }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm_sub_pd(x, y); }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm_mul_pd(x, y); }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm_div_pd(x, y); }
static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm_div_pd(_mm_set1_pd(1), x); }
static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm_sqrt_pd(x); }
static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm_andnot_pd(_mm_set1_pd(-0.0), d); }
static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm_xor_pd(_mm_set1_pd(-0.0), d); }
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(z, vmul_vd_vd_vd(x, y)); }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm_max_pd(x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm_min_pd(x, y); }
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpeq_pd(x, y)); }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpneq_pd(x, y)); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmplt_pd(x, y)); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmple_pd(x, y)); }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpgt_pd(x, y)); }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpge_pd(x, y)); }
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); }
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); }
static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); }
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); }
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); }
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); }
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return _mm_and_si128(x, y); }
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return _mm_andnot_si128(x, y); }
static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); }
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); }
static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); }
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }
#if CONFIG == 4
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return _mm_blendv_epi8(y, x, m); }
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask m, vdouble x, vdouble y) { return _mm_blendv_pd(y, x, _mm_castsi128_pd(m)); }
#else
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return vor_vm_vm_vm(vand_vm_vm_vm(m, x), vandnot_vm_vm_vm(m, y)); }
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask opmask, vdouble x, vdouble y) {
return _mm_or_pd(_mm_and_pd(_mm_castsi128_pd(opmask), x), _mm_andnot_pd(_mm_castsi128_pd(opmask), y));
}
#endif
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
}
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}
static INLINE vopmask visinf_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm_cmpeq_pd(vabs_vd_vd(d), _mm_set1_pd(SLEEF_INFINITY)));
}
static INLINE vopmask vispinf_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm_cmpeq_pd(d, _mm_set1_pd(SLEEF_INFINITY)));
}
static INLINE vopmask visminf_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm_cmpeq_pd(d, _mm_set1_pd(-SLEEF_INFINITY)));
}
static INLINE vopmask visnan_vo_vd(vdouble d) {
return vreinterpret_vm_vd(_mm_cmpneq_pd(d, d));
}
//
static INLINE vdouble vload_vd_p(const double *ptr) { return _mm_load_pd(ptr); }
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm_loadu_pd(ptr); }
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm_storeu_pd(ptr, v); }
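// There is no gather instruction at this ISA level, so the indices are
// spilled to memory and the elements are loaded one at a time.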
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
int a[sizeof(vint)/sizeof(int)];
vstoreu_v_p_vi(a, vi);
return _mm_set_pd(ptr[a[1]], ptr[a[0]]);
}
// This function is for debugging
static INLINE double vcast_d_vd(vdouble v) {
double a[VECTLENDP];
vstoreu_v_p_vd(a, v);
return a[0];
}
//
static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }
static INLINE vint2 vrint_vi2_vf(vfloat vf) { return _mm_cvtps_epi32(vf); }
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return _mm_cvttps_epi32(vf); }
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm_cvtepi32_ps(vcast_vm_vi2(vi)); }
static INLINE vfloat vcast_vf_f(float f) { return _mm_set1_ps(f); }
static INLINE vint2 vcast_vi2_i(int i) { return _mm_set1_epi32(i); }
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm_castps_si128(vf); }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm_castsi128_ps(vm); }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vm) { return _mm_castsi128_ps(vm); }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return _mm_castps_si128(vf); }
#if CONFIG != 4
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
static INLINE vfloat vrint_vf_vf(vfloat vf) { return vcast_vf_vi2(vrint_vi2_vf(vf)); }
#endif
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_add_ps(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm_sub_ps(x, y); }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm_mul_ps(x, y); }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm_div_ps(x, y); }
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm_sqrt_ps(x); }
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm_max_ps(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm_min_ps(x, y); }
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpeq_ps(x, y)); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpneq_ps(x, y)); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmplt_ps(x, y)); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmple_ps(x, y)); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpgt_ps(x, y)); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpge_ps(x, y)); }
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return vadd_vi_vi_vi(x, y); }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return vsub_vi_vi_vi(x, y); }
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); }
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return vand_vi_vi_vi(x, y); }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return vandnot_vi_vi_vi(x, y); }
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return vor_vi_vi_vi(x, y); }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return vxor_vi_vi_vi(x, y); }
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi_vo_vi(x, y); }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi_vo_vi(x, y); }
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return vsll_vi_vi_i(x, c); }
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return vsrl_vi_vi_i(x, c); }
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return vsra_vi_vi_i(x, c); }
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); }
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); }
#if CONFIG == 4
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { return _mm_blendv_epi8(y, x, m); }
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask m, vfloat x, vfloat y) { return _mm_blendv_ps(y, x, _mm_castsi128_ps(m)); }
#else
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
return vor_vi2_vi2_vi2(vand_vi2_vi2_vi2(m, x), vandnot_vi2_vi2_vi2(m, y));
}
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask opmask, vfloat x, vfloat y) {
return _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(opmask), x), _mm_andnot_ps(_mm_castsi128_ps(opmask), y));
}
#endif
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}
static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
static INLINE vfloat vload_vf_p(const float *ptr) { return _mm_load_ps(ptr); }
static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm_loadu_ps(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm_storeu_ps(ptr, v); }
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi) {
int a[VECTLENSP];
vstoreu_v_p_vi2(a, vi);
return _mm_set_ps(ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);
}
// This function is for debugging
static INLINE float vcast_f_vf(vfloat v) {
float a[VECTLENSP];
vstoreu_v_p_vf(a, v);
return a[0];
}
//
#define PNMASK ((vdouble) { +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0 })
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }
#if CONFIG >= 3
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_addsub_pd(x, y); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_addsub_ps(x, y); }
#else
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vadd_vf_vf_vf(x, vnegpos_vf_vf(y)); }
#endif
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
static INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm_shuffle_pd(d0, d0, 1); }
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vd; }
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm_stream_pd(ptr, v); }
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { _mm_stream_pd((double *)(&ptr[2*offset]), v); }
//
static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); }
static INLINE vfloat vreva2_vf_vf(vfloat d0) { return _mm_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); }
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm_stream_ps(ptr, v); }
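// Each pair of adjacent floats is scattered with a single 64-bit store by
// viewing the register as doubles (_mm_storel_pd/_mm_storeh_pd).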
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
_mm_storel_pd((double *)(ptr+(offset + step * 0)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
_mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
}
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
_mm_storel_pd((double *)(ptr+(offset + step * 0)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
_mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
}
//
static vquad loadu_vq_p(void *p) {
vquad vq;
memcpy(&vq, p, VECTLENDP * 16);
return vq;
}
static INLINE vquad cast_vq_aq(vargquad aq) {
vquad vq;
memcpy(&vq, &aq, VECTLENDP * 16);
return vq;
}
static INLINE vargquad cast_aq_vq(vquad vq) {
vargquad aq;
memcpy(&aq, &vq, VECTLENDP * 16);
return aq;
}
static INLINE int vtestallzeros_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0; }
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) {
return vor_vm_vm_vm(vand_vm_vm_vm(o, x), vandnot_vm_vm_vm(o, y));
}
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return _mm_sub_epi64(x, y); }
static INLINE vmask vneg64_vm_vm(vmask x) { return _mm_sub_epi64(vcast_vm_i_i(0, 0), x); }
#define vsll64_vm_vm_i(x, c) _mm_slli_epi64(x, c)
#define vsrl64_vm_vm_i(x, c) _mm_srli_epi64(x, c)
//@#define vsll64_vm_vm_i(x, c) _mm_slli_epi64(x, c)
//@#define vsrl64_vm_vm_i(x, c) _mm_srli_epi64(x, c)
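// SSE2 lacks a 64-bit signed compare-greater, so it is emulated with scalar
// comparisons on spilled values.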
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
int64_t ax[2], ay[2];
_mm_storeu_si128((__m128i *)ax, x);
_mm_storeu_si128((__m128i *)ay, y);
return _mm_set_epi64x(ax[1] > ay[1] ? -1 : 0, ax[0] > ay[0] ? -1 : 0);
}
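// Sign-extend each 32-bit element to 64 bits: place the values in the low
// words, then OR in all-ones high words for negative inputs.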
static INLINE vmask vcast_vm_vi(vint vi) {
vmask m = _mm_and_si128(_mm_shuffle_epi32(vi, (0 << 6) | (1 << 4) | (0 << 2) | (0 << 0)), _mm_set_epi32(0, -1, 0, -1));
return vor_vm_vm_vm(vcastu_vm_vi(vgt_vo_vi_vi(vcast_vi_i(0), vi)), m);
}
static INLINE vint vcast_vi_vm(vmask vm) { return _mm_shuffle_epi32(vm, 0x08); }
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; }
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return m; }
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return v; }
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return m; }

File diff suppressed because it is too large

View File

@@ -0,0 +1,871 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdint.h>
#include "misc.h"
#ifndef CONFIG
#error CONFIG macro not defined
#endif
#define ENABLE_DP
#define ENABLE_SP
#define LOG2VECTLENDP CONFIG
#define VECTLENDP (1 << LOG2VECTLENDP)
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
#define DFTPRIORITY LOG2VECTLENDP
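// The vector types are declared with Clang's ext_vector_type extension when
// available, and with GCC's vector_size attribute otherwise; element counts
// are derived from VECTLENDP.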
#if defined(__clang__)
#define ISANAME "Clang Vector Extension"
typedef uint32_t vmask __attribute__((ext_vector_type(VECTLENDP*2)));
typedef uint32_t vopmask __attribute__((ext_vector_type(VECTLENDP*2)));
typedef double vdouble __attribute__((ext_vector_type(VECTLENDP)));
typedef int32_t vint __attribute__((ext_vector_type(VECTLENDP)));
typedef float vfloat __attribute__((ext_vector_type(VECTLENDP*2)));
typedef int32_t vint2 __attribute__((ext_vector_type(VECTLENDP*2)));
#ifdef ENABLE_LONGDOUBLE
typedef uint8_t vmaskl __attribute__((ext_vector_type(sizeof(long double)*VECTLENDP)));
typedef long double vlongdouble __attribute__((ext_vector_type(VECTLENDP)));
#endif
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
typedef uint8_t vmaskq __attribute__((ext_vector_type(sizeof(Sleef_quad)*VECTLENDP)));
#ifdef ENABLE_LONGDOUBLE
typedef Sleef_quad vquad __attribute__((ext_vector_type(VECTLENDP)));
#endif
#endif
#elif defined(__GNUC__)
#define ISANAME "GCC Vector Extension"
typedef uint32_t vmask __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP*2)));
typedef uint32_t vopmask __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP*2)));
typedef double vdouble __attribute__((vector_size(sizeof(double)*VECTLENDP)));
typedef int32_t vint __attribute__((vector_size(sizeof(int32_t)*VECTLENDP)));
typedef float vfloat __attribute__((vector_size(sizeof(float)*VECTLENDP*2)));
typedef int32_t vint2 __attribute__((vector_size(sizeof(int32_t)*VECTLENDP*2)));
#ifdef ENABLE_LONGDOUBLE
typedef uint8_t vmaskl __attribute__((vector_size(sizeof(long double)*VECTLENDP)));
typedef long double vlongdouble __attribute__((vector_size(sizeof(long double)*VECTLENDP)));
#endif
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
typedef uint8_t vmaskq __attribute__((vector_size(sizeof(Sleef_quad)*VECTLENDP)));
typedef Sleef_quad vquad __attribute__((vector_size(sizeof(Sleef_quad)*VECTLENDP)));
#endif
#endif
//
#if VECTLENDP == 2
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return (vopmask){ m[1], m[3], 0, 0 }; }
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return (vopmask){ m[0], m[0], m[1], m[1] }; }
static INLINE vint vcast_vi_i(int i) { return (vint) { i, i }; }
static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i }; }
static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f }; }
static INLINE vdouble vcast_vd_d(double d) { return (vdouble) { d, d }; }
#ifdef ENABLE_LONGDOUBLE
static INLINE vlongdouble vcast_vl_l(long double d) { return (vlongdouble) { d, d }; }
#endif
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
static INLINE vquad vcast_vq_q(Sleef_quad d) { return (vquad) { d, d }; }
#endif
static INLINE vmask vcast_vm_i_i(int h, int l) { return (vmask){ l, h, l, h }; }
static INLINE vint2 vcastu_vi2_vi(vint vi) { return (vint2){ 0, vi[0], 0, vi[1] }; }
static INLINE vint vcastu_vi_vi2(vint2 vi2) { return (vint){ vi2[1], vi2[3] }; }
static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) { return (vint){ vi2[0], vi2[1] }; }
static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) { return (vint2){ vi[0], vi[1], 0, 0 }; }
static INLINE vdouble vrev21_vd_vd(vdouble vd) { return (vdouble) { vd[1], vd[0] }; }
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vd; }
static INLINE vfloat vrev21_vf_vf(vfloat vd) { return (vfloat) { vd[1], vd[0], vd[3], vd[2] }; }
static INLINE vfloat vreva2_vf_vf(vfloat vd) { return (vfloat) { vd[2], vd[3], vd[0], vd[1] }; }
#ifdef ENABLE_LONGDOUBLE
static INLINE vlongdouble vrev21_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[1], vd[0] }; }
static INLINE vlongdouble vreva2_vl_vl(vlongdouble vd) { return vd; }
static INLINE vlongdouble vposneg_vl_vl(vlongdouble vd) { return (vlongdouble) { +vd[0], -vd[1] }; }
static INLINE vlongdouble vnegpos_vl_vl(vlongdouble vd) { return (vlongdouble) { -vd[0], +vd[1] }; }
#endif
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
static INLINE vquad vrev21_vq_vq(vquad vd) { return (vquad) { vd[1], vd[0] }; }
static INLINE vquad vreva2_vq_vq(vquad vd) { return vd; }
static INLINE vquad vposneg_vq_vq(vquad vd) { return (vquad) { +vd[0], -vd[1] }; }
static INLINE vquad vnegpos_vq_vq(vquad vd) { return (vquad) { -vd[0], +vd[1] }; }
#endif
#define PNMASK ((vdouble) { +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0 })
static INLINE vdouble vposneg_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)PNMASK); }
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)NPMASK); }
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)PNMASKf); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)NPMASKf); }
#elif VECTLENDP == 4
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return (vopmask){ m[1], m[3], m[5], m[7], 0, 0, 0, 0 }; }
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return (vopmask){ m[0], m[0], m[1], m[1], m[2], m[2], m[3], m[3] }; }
static INLINE vint vcast_vi_i(int i) { return (vint) { i, i, i, i }; }
static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i, i, i, i, i }; }
static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f, f, f, f, f }; }
static INLINE vdouble vcast_vd_d(double d) { return (vdouble) { d, d, d, d }; }
#ifdef ENABLE_LONGDOUBLE
static INLINE vlongdouble vcast_vl_l(long double d) { return (vlongdouble) { d, d, d, d }; }
#endif
static INLINE vmask vcast_vm_i_i(int h, int l) { return (vmask){ l, h, l, h, l, h, l, h }; }
static INLINE vint2 vcastu_vi2_vi(vint vi) { return (vint2){ 0, vi[0], 0, vi[1], 0, vi[2], 0, vi[3] }; }
static INLINE vint vcastu_vi_vi2(vint2 vi2) { return (vint){ vi2[1], vi2[3], vi2[5], vi2[7] }; }
static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) { return (vint){ vi2[0], vi2[1], vi2[2], vi2[3] }; }
static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) { return (vint2){ vi[0], vi[1], vi[2], vi[3], 0, 0, 0, 0 }; }
#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0 })
static INLINE vdouble vposneg_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)PNMASK); }
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)NPMASK); }
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })
static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)PNMASKf); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)NPMASKf); }
static INLINE vdouble vrev21_vd_vd(vdouble vd) { return (vdouble) { vd[1], vd[0], vd[3], vd[2] }; }
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return (vdouble) { vd[2], vd[3], vd[0], vd[1] }; }
static INLINE vfloat vrev21_vf_vf(vfloat vd) { return (vfloat) { vd[1], vd[0], vd[3], vd[2], vd[5], vd[4], vd[7], vd[6] }; }
static INLINE vfloat vreva2_vf_vf(vfloat vd) { return (vfloat) { vd[6], vd[7], vd[4], vd[5], vd[2], vd[3], vd[0], vd[1] }; }
#ifdef ENABLE_LONGDOUBLE
static INLINE vlongdouble vrev21_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[1], vd[0], vd[3], vd[2] }; }
static INLINE vlongdouble vreva2_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[2], vd[3], vd[0], vd[1] }; }
static INLINE vlongdouble vposneg_vl_vl(vlongdouble vd) { return (vlongdouble) { +vd[0], -vd[1], +vd[2], -vd[3] }; }
static INLINE vlongdouble vnegpos_vl_vl(vlongdouble vd) { return (vlongdouble) { -vd[0], +vd[1], -vd[2], +vd[3] }; }
#endif
#elif VECTLENDP == 8
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return (vopmask){ m[1], m[3], m[5], m[7], m[9], m[11], m[13], m[15], 0, 0, 0, 0, 0, 0, 0, 0 }; }
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return (vopmask){ m[0], m[0], m[1], m[1], m[2], m[2], m[3], m[3], m[4], m[4], m[5], m[5], m[6], m[6], m[7], m[7] }; }
static INLINE vint vcast_vi_i(int i) { return (vint) { i, i, i, i, i, i, i, i }; }
static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i }; }
static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f, f, f, f, f, f, f, f, f, f, f, f, f }; }
static INLINE vdouble vcast_vd_d(double d) { return (vdouble) { d, d, d, d, d, d, d, d }; }
#ifdef ENABLE_LONGDOUBLE
static INLINE vlongdouble vcast_vl_l(long double d) { return (vlongdouble) { d, d, d, d, d, d, d, d }; }
#endif
static INLINE vmask vcast_vm_i_i(int h, int l) { return (vmask){ l, h, l, h, l, h, l, h, l, h, l, h, l, h, l, h }; }
static INLINE vint2 vcastu_vi2_vi(vint vi) { return (vint2){ 0, vi[0], 0, vi[1], 0, vi[2], 0, vi[3], 0, vi[4], 0, vi[5], 0, vi[6], 0, vi[7] }; }
static INLINE vint vcastu_vi_vi2(vint2 vi2) { return (vint){ vi2[1], vi2[3], vi2[5], vi2[7], vi2[9], vi2[11], vi2[13], vi2[15] }; }
static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) { return (vint){ vi2[0], vi2[1], vi2[2], vi2[3], vi2[4], vi2[5], vi2[6], vi2[7] }; }
static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) { return (vint2){ vi[0], vi[1], vi[2], vi[3], vi[4], vi[5], vi[6], vi[7], 0, 0, 0, 0, 0, 0, 0, 0 }; }
#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0, +0.0, -0.0, +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0, -0.0, +0.0, -0.0, +0.0 })
static INLINE vdouble vposneg_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)PNMASK); }
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)NPMASK); }
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })
static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)PNMASKf); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)NPMASKf); }
static INLINE vdouble vrev21_vd_vd(vdouble vd) { return (vdouble) { vd[1], vd[0], vd[3], vd[2], vd[5], vd[4], vd[7], vd[6] }; }
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return (vdouble) { vd[6], vd[7], vd[4], vd[5], vd[2], vd[3], vd[0], vd[1] }; }
static INLINE vfloat vrev21_vf_vf(vfloat vd) {
return (vfloat) {
vd[1], vd[0], vd[3], vd[2], vd[5], vd[4], vd[7], vd[6],
vd[9], vd[8], vd[11], vd[10], vd[13], vd[12], vd[15], vd[14] };
}
static INLINE vfloat vreva2_vf_vf(vfloat vd) {
return (vfloat) {
vd[14], vd[15], vd[12], vd[13], vd[10], vd[11], vd[8], vd[9],
vd[6], vd[7], vd[4], vd[5], vd[2], vd[3], vd[0], vd[1]};
}
#ifdef ENABLE_LONGDOUBLE
static INLINE vlongdouble vrev21_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[1], vd[0], vd[3], vd[2], vd[5], vd[4], vd[7], vd[6] }; }
static INLINE vlongdouble vreva2_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[6], vd[7], vd[4], vd[5], vd[2], vd[3], vd[0], vd[1] }; }
static INLINE vlongdouble vposneg_vl_vl(vlongdouble vd) { return (vlongdouble) { +vd[0], -vd[1], +vd[2], -vd[3], +vd[4], -vd[5], +vd[6], -vd[7] }; }
static INLINE vlongdouble vnegpos_vl_vl(vlongdouble vd) { return (vlongdouble) { -vd[0], +vd[1], -vd[2], +vd[3], -vd[4], +vd[5], -vd[6], +vd[7] }; }
#endif
#else
static INLINE vint vcast_vi_i(int k) {
vint ret;
for(int i=0;i<VECTLENDP;i++) ret[i] = k;
return ret;
}
static INLINE vint2 vcast_vi2_i(int k) {
vint2 ret;
for(int i=0;i<VECTLENSP;i++) ret[i] = k;
return ret;
}
static INLINE vdouble vcast_vd_d(double d) {
vdouble ret;
for(int i=0;i<VECTLENDP;i++) ret[i] = d;
return ret;
}
static INLINE vfloat vcast_vf_f(float f) {
vfloat ret;
for(int i=0;i<VECTLENSP;i++) ret[i] = f;
return ret;
}
#ifdef ENABLE_LONGDOUBLE
static INLINE vlongdouble vcast_vl_l(long double d) {
vlongdouble ret;
for(int i=0;i<VECTLENDP;i++) ret[i] = d;
return ret;
}
#endif
static INLINE vopmask vcast_vo32_vo64(vopmask m) {
vopmask ret;
for(int i=0;i<VECTLENDP;i++) ret[i] = m[i*2+1];
for(int i=VECTLENDP;i<VECTLENDP*2;i++) ret[i] = 0;
return ret;
}
static INLINE vopmask vcast_vo64_vo32(vopmask m) {
vopmask ret;
for(int i=0;i<VECTLENDP;i++) ret[i*2] = ret[i*2+1] = m[i];
return ret;
}
static INLINE vmask vcast_vm_i_i(int h, int l) {
vmask ret;
for(int i=0;i<VECTLENDP;i++) {
ret[i*2+0] = l;
ret[i*2+1] = h;
}
return ret;
}
static INLINE vint2 vcastu_vi2_vi(vint vi) {
vint2 ret;
for(int i=0;i<VECTLENDP;i++) {
ret[i*2+0] = 0;
ret[i*2+1] = vi[i];
}
return ret;
}
static INLINE vint vcastu_vi_vi2(vint2 vi2) {
vint ret;
for(int i=0;i<VECTLENDP;i++) ret[i] = vi2[i*2+1];
return ret;
}
static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) {
vint ret;
for(int i=0;i<VECTLENDP;i++) ret[i] = vi2[i];
return ret;
}
static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) {
vint2 ret;
for(int i=0;i<VECTLENDP;i++) ret[i] = vi[i];
for(int i=VECTLENDP;i<VECTLENDP*2;i++) ret[i] = 0;
return ret;
}
static INLINE vdouble vrev21_vd_vd(vdouble d0) {
vdouble r;
for(int i=0;i<VECTLENDP/2;i++) {
r[i*2+0] = d0[i*2+1];
r[i*2+1] = d0[i*2+0];
}
return r;
}
static INLINE vdouble vreva2_vd_vd(vdouble d0) {
vdouble r;
for(int i=0;i<VECTLENDP/2;i++) {
r[i*2+0] = d0[(VECTLENDP/2-1-i)*2+0];
r[i*2+1] = d0[(VECTLENDP/2-1-i)*2+1];
}
return r;
}
static INLINE vfloat vrev21_vf_vf(vfloat d0) {
vfloat r;
for(int i=0;i<VECTLENSP/2;i++) {
r[i*2+0] = d0[i*2+1];
r[i*2+1] = d0[i*2+0];
}
return r;
}
static INLINE vfloat vreva2_vf_vf(vfloat d0) {
vfloat r;
for(int i=0;i<VECTLENSP/2;i++) {
r[i*2+0] = d0[(VECTLENSP/2-1-i)*2+0];
r[i*2+1] = d0[(VECTLENSP/2-1-i)*2+1];
}
return r;
}
#ifdef ENABLE_LONGDOUBLE
static INLINE vlongdouble vrev21_vl_vl(vlongdouble d0) {
vlongdouble r;
for(int i=0;i<VECTLENDP/2;i++) {
r[i*2+0] = d0[i*2+1];
r[i*2+1] = d0[i*2+0];
}
return r;
}
static INLINE vlongdouble vreva2_vl_vl(vlongdouble d0) {
vlongdouble r;
for(int i=0;i<VECTLENDP/2;i++) {
r[i*2+0] = d0[(VECTLENDP/2-1-i)*2+0];
r[i*2+1] = d0[(VECTLENDP/2-1-i)*2+1];
}
return r;
}
#endif
static INLINE vdouble vposneg_vd_vd(vdouble d0) {
vdouble r;
for(int i=0;i<VECTLENDP/2;i++) {
r[i*2+0] = +d0[i*2+0];
r[i*2+1] = -d0[i*2+1];
}
return r;
}
static INLINE vdouble vnegpos_vd_vd(vdouble d0) {
vdouble r;
for(int i=0;i<VECTLENDP/2;i++) {
r[i*2+0] = -d0[i*2+0];
r[i*2+1] = +d0[i*2+1];
}
return r;
}
static INLINE vfloat vposneg_vf_vf(vfloat d0) {
vfloat r;
for(int i=0;i<VECTLENSP/2;i++) {
r[i*2+0] = +d0[i*2+0];
r[i*2+1] = -d0[i*2+1];
}
return r;
}
static INLINE vfloat vnegpos_vf_vf(vfloat d0) {
vfloat r;
for(int i=0;i<VECTLENSP/2;i++) {
r[i*2+0] = -d0[i*2+0];
r[i*2+1] = +d0[i*2+1];
}
return r;
}
#ifdef ENABLE_LONGDOUBLE
static INLINE vlongdouble vposneg_vl_vl(vlongdouble d0) {
vlongdouble r;
for(int i=0;i<VECTLENDP/2;i++) {
r[i*2+0] = +d0[i*2+0];
r[i*2+1] = -d0[i*2+1];
}
return r;
}
static INLINE vlongdouble vnegpos_vl_vl(vlongdouble d0) {
vlongdouble r;
for(int i=0;i<VECTLENDP/2;i++) {
r[i*2+0] = -d0[i*2+0];
r[i*2+1] = +d0[i*2+1];
}
return r;
}
#endif
#endif
//
static INLINE int vavailability_i(int name) { return -1; }
static INLINE void vprefetch_v_p(const void *ptr) { }
static INLINE int vtestallones_i_vo64(vopmask g) {
int ret = 1; for(int i=0;i<VECTLENDP*2;i++) ret = ret && g[i]; return ret;
}
static INLINE int vtestallones_i_vo32(vopmask g) {
int ret = 1; for(int i=0;i<VECTLENDP*2;i++) ret = ret && g[i]; return ret;
}
//
static vint2 vloadu_vi2_p(int32_t *p) {
vint2 vi;
for(int i=0;i<VECTLENSP;i++) vi[i] = p[i];
return vi;
}
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) {
for(int i=0;i<VECTLENSP;i++) p[i] = v[i];
}
static vint vloadu_vi_p(int32_t *p) {
vint vi;
for(int i=0;i<VECTLENDP;i++) vi[i] = p[i];
return vi;
}
static void vstoreu_v_p_vi(int32_t *p, vint v) {
for(int i=0;i<VECTLENDP;i++) p[i] = v[i];
}
//
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return x & y; }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return y & ~x; }
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return x | y; }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return x ^ y; }
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return x & y; }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return y & ~x; }
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return x | y; }
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return x ^ y; }
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return x & y; }
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return y & ~x; }
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return x | y; }
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return x ^ y; }
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return x & y; }
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return y & ~x; }
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return x | y; }
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return x ^ y; }
//
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return (vdouble)(((vmask)o & (vmask)x) | ((vmask)y & ~(vmask)o)); }
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y) { return (vint2)(((vmask)o & (vmask)x) | ((vmask)y & ~(vmask)o)); }
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
}
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}
static INLINE vdouble vcast_vd_vi(vint vi) {
#if defined(__clang__)
return __builtin_convertvector(vi, vdouble);
#else
vdouble vd;
for(int i=0;i<VECTLENDP;i++) vd[i] = vi[i];
return vd;
#endif
}
static INLINE vint vtruncate_vi_vd(vdouble vd) {
#if defined(__clang__)
return __builtin_convertvector(vd, vint);
#else
vint vi;
for(int i=0;i<VECTLENDP;i++) vi[i] = vd[i];
return vi;
#endif
}
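// Round to nearest, ties away from zero: add or subtract 0.5 depending on
// the sign, then truncate.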
static INLINE vint vrint_vi_vd(vdouble vd) { return vtruncate_vi_vd(vsel_vd_vo_vd_vd((vopmask)(vd < 0.0), vd - 0.5, vd + 0.5)); }
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); }
static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); }
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
#if defined(__clang__)
typedef int64_t vi64 __attribute__((ext_vector_type(VECTLENDP)));
#else
typedef int64_t vi64 __attribute__((vector_size(sizeof(int64_t)*VECTLENDP)));
#endif
return (vopmask)((vi64)x == (vi64)y);
}
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) {
#if defined(__clang__)
typedef int64_t vi64 __attribute__((ext_vector_type(VECTLENDP)));
#else
typedef int64_t vi64 __attribute__((vector_size(sizeof(int64_t)*VECTLENDP)));
#endif
return (vmask)((vi64)x + (vi64)y);
}
//
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return (vmask)vd; }
static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { return (vint2)vd; }
static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { return (vdouble)vi; }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return (vdouble)vm; }
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return x + y; }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return x - y; }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return x * y; }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return x / y; }
static INLINE vdouble vrec_vd_vd(vdouble x) { return 1.0 / x; }
static INLINE vdouble vabs_vd_vd(vdouble d) { return (vdouble)((vmask)d & ~(vmask)vcast_vd_d(-0.0)); }
static INLINE vdouble vneg_vd_vd(vdouble d) { return -d; }
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return x * y + z; }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return x * y - z; }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return vsel_vd_vo_vd_vd((vopmask)(x > y), x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return vsel_vd_vo_vd_vd((vopmask)(x < y), x, y); }
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x == y); }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x != y); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x < y); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x <= y); }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x > y); }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x >= y); }
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return x + y; }
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return x - y; }
static INLINE vint vneg_vi_vi(vint e) { return -e; }
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return x & y; }
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return y & ~x; }
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return x | y; }
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return x ^ y; }
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return vreinterpretFirstHalf_vi_vi2((vint2)x) & y; }
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return y & ~vreinterpretFirstHalf_vi_vi2((vint2)x); }
static INLINE vint vsll_vi_vi_i(vint x, int c) {
#if defined(__clang__)
typedef uint32_t vu __attribute__((ext_vector_type(VECTLENDP)));
#else
typedef uint32_t vu __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP)));
#endif
return (vint)(((vu)x) << c);
}
static INLINE vint vsrl_vi_vi_i(vint x, int c) {
#if defined(__clang__)
typedef uint32_t vu __attribute__((ext_vector_type(VECTLENDP)));
#else
typedef uint32_t vu __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP)));
#endif
return (vint)(((vu)x) >> c);
}
static INLINE vint vsra_vi_vi_i(vint x, int c) { return x >> c; }
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return x == y; }
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return x > y; }
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return (vopmask)vreinterpretFirstHalf_vi2_vi(x == y); }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return (vopmask)vreinterpretFirstHalf_vi2_vi(x > y); }
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
return vor_vi_vi_vi(vand_vi_vi_vi(vreinterpretFirstHalf_vi_vi2((vint2)m), x),
vandnot_vi_vi_vi(vreinterpretFirstHalf_vi_vi2((vint2)m), y));
}
static INLINE vopmask visinf_vo_vd(vdouble d) { return (vopmask)(vabs_vd_vd(d) == SLEEF_INFINITY); }
static INLINE vopmask vispinf_vo_vd(vdouble d) { return (vopmask)(d == SLEEF_INFINITY); }
static INLINE vopmask visminf_vo_vd(vdouble d) { return (vopmask)(d == -SLEEF_INFINITY); }
static INLINE vopmask visnan_vo_vd(vdouble d) { return (vopmask)(d != d); }
static INLINE vdouble vsqrt_vd_vd(vdouble d) {
#if defined(__clang__)
typedef int64_t vi64 __attribute__((ext_vector_type(VECTLENDP)));
#else
typedef int64_t vi64 __attribute__((vector_size(sizeof(int64_t)*VECTLENDP)));
#endif
vdouble q = vcast_vd_d(1);
vopmask o = (vopmask)(d < 8.636168555094445E-78);
d = (vdouble)((o & (vmask)(d * 1.157920892373162E77)) | (~o & (vmask)d));
q = (vdouble)((o & (vmask)vcast_vd_d(2.9387358770557188E-39)) | (~o & (vmask)vcast_vd_d(1)));
q = (vdouble)vor_vm_vm_vm(vlt_vo_vd_vd(d, vcast_vd_d(0)), (vmask)q);
vdouble x = (vdouble)(0x5fe6ec85e7de30daLL - ((vi64)(d + 1e-320) >> 1));
x = x * ( 3 - d * x * x);
x = x * ( 12 - d * x * x);
x = x * (768 - d * x * x);
x *= 1.0 / (1 << 13);
x = (d - (d * x) * (d * x)) * (x * 0.5) + d * x;
return x * q;
}
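// The portable sqrt above: inputs below 2^-256 are prescaled by 2^256
// (with q = 2^-128 compensating the result), and the OR with the d < 0
// mask turns q into a NaN pattern so negative inputs yield NaN. The bit
// trick 0x5fe6ec85e7de30da - (bits >> 1) seeds an initial 1/sqrt(d)
// estimate (the 1e-320 offset appears to keep the seed finite for zero),
// three Newton-Raphson refinements follow with their 1/2 factors folded
// into the single 1/8192 scale, and the last line converts the refined
// 1/sqrt(d) into sqrt(d). A scalar sketch of that final step
// (illustrative only):
//   double s = d * x;                       // s ~ sqrt(d), x ~ 1/sqrt(d)
//   double r = (d - s * s) * (x * 0.5) + s; // one last Newton correction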
static INLINE double vcast_d_vd(vdouble v) { return v[0]; }
static INLINE float vcast_f_vf(vfloat v) { return v[0]; }
static INLINE vdouble vload_vd_p(const double *ptr) { return *(vdouble *)ptr; }
static INLINE vdouble vloadu_vd_p(const double *ptr) {
vdouble vd;
for(int i=0;i<VECTLENDP;i++) vd[i] = ptr[i];
return vd;
}
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
vdouble vd;
for(int i=0;i<VECTLENDP;i++) vd[i] = ptr[vi[i]];
return vd;
}
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) {
for(int i=0;i<VECTLENDP;i++) ptr[i] = v[i];
}
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
for(int i=0;i<VECTLENDP/2;i++) {
*(ptr+(offset + step * i)*2 + 0) = v[i*2+0];
*(ptr+(offset + step * i)*2 + 1) = v[i*2+1];
}
}
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vscatter2_v_p_i_i_vd(ptr, offset, step, v); }
//
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return (vfloat)(((vmask)o & (vmask)x) | (~(vmask)o & (vmask)y)); }
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}
static INLINE vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { return (vmask)vi; }
static INLINE vfloat vcast_vf_vi2(vint2 vi) {
#if defined(__clang__)
return __builtin_convertvector(vi, vfloat);
#else
vfloat vf;
for(int i=0;i<VECTLENDP*2;i++) vf[i] = vi[i];
return vf;
#endif
}
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) {
#if defined(__clang__)
return __builtin_convertvector(vf, vint2);
#else
vint2 vi;
for(int i=0;i<VECTLENDP*2;i++) vi[i] = vf[i];
return vi;
#endif
}
static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vtruncate_vi2_vf(vsel_vf_vo_vf_vf((vopmask)(vf < 0), vf - 0.5f, vf + 0.5f)); }
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
static INLINE vfloat vrint_vf_vf(vfloat vd) { return vcast_vf_vi2(vrint_vi2_vf(vd)); }
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (vfloat)vm; }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return (vfloat)vi; }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return (vint2)vf; }
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return x + y; }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return x - y; }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return x * y; }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return x / y; }
static INLINE vfloat vrec_vf_vf(vfloat x) { return 1.0f / x; }
static INLINE vfloat vabs_vf_vf(vfloat f) { return (vfloat)vandnot_vm_vm_vm((vmask)vcast_vf_f(-0.0f), (vmask)f); }
static INLINE vfloat vneg_vf_vf(vfloat d) { return -d; }
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return x*y+z; }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return z-x*y; }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return vsel_vf_vo_vf_vf((vopmask)(x > y), x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return vsel_vf_vo_vf_vf((vopmask)(x < y), x, y); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vadd_vf_vf_vf(x, vnegpos_vf_vf(y)); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x == y); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x != y); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x < y); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x <= y); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x > y); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x >= y); }
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return x + y; }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return x - y; }
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return -e; }
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return x & y; }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return y & ~x; }
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return x | y; }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return x ^ y; }
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)x & y; }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return y & ~(vint2)x; }
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) {
#if defined(__clang__)
typedef uint32_t vu __attribute__((ext_vector_type(VECTLENDP*2)));
#else
typedef uint32_t vu __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP*2)));
#endif
return (vint2)(((vu)x) << c);
}
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) {
#if defined(__clang__)
typedef uint32_t vu __attribute__((ext_vector_type(VECTLENDP*2)));
#else
typedef uint32_t vu __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP*2)));
#endif
return (vint2)(((vu)x) >> c);
}
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return x >> c; }
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return (vopmask)(x == y); }
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return (vopmask)(x > y); }
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return x == y; }
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return x > y; }
static INLINE vopmask visinf_vo_vf(vfloat d) { return (vopmask)(vabs_vf_vf(d) == SLEEF_INFINITYf); }
static INLINE vopmask vispinf_vo_vf(vfloat d) { return (vopmask)(d == SLEEF_INFINITYf); }
static INLINE vopmask visminf_vo_vf(vfloat d) { return (vopmask)(d == -SLEEF_INFINITYf); }
static INLINE vopmask visnan_vo_vf(vfloat d) { return (vopmask)(d != d); }
static INLINE vfloat vsqrt_vf_vf(vfloat d) {
vfloat q = vcast_vf_f(1);
vopmask o = (vopmask)(d < 5.4210108624275221700372640043497e-20f); // 2^-64
d = (vfloat)((o & (vmask)(d * vcast_vf_f(18446744073709551616.0f))) | (~o & (vmask)d)); // 2^64
q = (vfloat)((o & (vmask)vcast_vf_f(0.00000000023283064365386962890625f)) | (~o & (vmask)vcast_vf_f(1))); // 2^-32
q = (vfloat)vor_vm_vm_vm(vlt_vo_vf_vf(d, vcast_vf_f(0)), (vmask)q);
vfloat x = (vfloat)(0x5f330de2 - (((vint2)d) >> 1));
x = x * ( 3.0f - d * x * x);
x = x * (12.0f - d * x * x);
x *= 0.0625f;
x = (d - (d * x) * (d * x)) * (x * 0.5f) + d * x;
return x * q;
}
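// The same scheme in single precision: inputs below 2^-64 are prescaled
// by 2^64 (q = 2^-32 compensates), 0x5f330de2 is a variant of the
// familiar "fast inverse square root" magic constant, and two Newton
// refinements (scale factors folded into 0.0625 = 1/16) precede the same
// final correction step as in the double-precision version above.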
static INLINE vfloat vload_vf_p(const float *ptr) { return *(vfloat *)ptr; }
static INLINE vfloat vloadu_vf_p(const float *ptr) {
vfloat vf;
for(int i=0;i<VECTLENSP;i++) vf[i] = ptr[i];
return vf;
}
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
vfloat vf;
for(int i=0;i<VECTLENSP;i++) vf[i] = ptr[vi2[i]];
return vf;
}
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) {
for(int i=0;i<VECTLENSP;i++) ptr[i] = v[i];
}
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
for(int i=0;i<VECTLENSP/2;i++) {
*(ptr+(offset + step * i)*2 + 0) = v[i*2+0];
*(ptr+(offset + step * i)*2 + 1) = v[i*2+1];
}
}
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
//
#ifdef ENABLE_LONGDOUBLE
static INLINE vlongdouble vadd_vl_vl_vl(vlongdouble x, vlongdouble y) { return x + y; }
static INLINE vlongdouble vsub_vl_vl_vl(vlongdouble x, vlongdouble y) { return x - y; }
static INLINE vlongdouble vmul_vl_vl_vl(vlongdouble x, vlongdouble y) { return x * y; }
static INLINE vlongdouble vneg_vl_vl(vlongdouble d) { return -d; }
static INLINE vlongdouble vsubadd_vl_vl_vl(vlongdouble x, vlongdouble y) { return vadd_vl_vl_vl(x, vnegpos_vl_vl(y)); }
static INLINE vlongdouble vmlsubadd_vl_vl_vl_vl(vlongdouble x, vlongdouble y, vlongdouble z) { return vsubadd_vl_vl_vl(vmul_vl_vl_vl(x, y), z); }
static INLINE vlongdouble vload_vl_p(const long double *ptr) { return *(vlongdouble *)ptr; }
static INLINE vlongdouble vloadu_vl_p(const long double *ptr) {
vlongdouble vd;
for(int i=0;i<VECTLENDP;i++) vd[i] = ptr[i];
return vd;
}
static INLINE void vstore_v_p_vl(long double *ptr, vlongdouble v) { *(vlongdouble *)ptr = v; }
static INLINE void vstoreu_v_p_vl(long double *ptr, vlongdouble v) {
for(int i=0;i<VECTLENDP;i++) ptr[i] = v[i];
}
static INLINE void vstream_v_p_vl(long double *ptr, vlongdouble v) { *(vlongdouble *)ptr = v; }
static INLINE void vscatter2_v_p_i_i_vl(long double *ptr, int offset, int step, vlongdouble v) {
for(int i=0;i<VECTLENDP/2;i++) {
*(ptr+(offset + step * i)*2 + 0) = v[i*2+0];
*(ptr+(offset + step * i)*2 + 1) = v[i*2+1];
}
}
static INLINE void vsscatter2_v_p_i_i_vl(long double *ptr, int offset, int step, vlongdouble v) { vscatter2_v_p_i_i_vl(ptr, offset, step, v); }
#endif
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
static INLINE vquad vadd_vq_vq_vq(vquad x, vquad y) { return x + y; }
static INLINE vquad vsub_vq_vq_vq(vquad x, vquad y) { return x - y; }
static INLINE vquad vmul_vq_vq_vq(vquad x, vquad y) { return x * y; }
static INLINE vquad vneg_vq_vq(vquad d) { return -d; }
static INLINE vquad vsubadd_vq_vq_vq(vquad x, vquad y) { return vadd_vq_vq_vq(x, vnegpos_vq_vq(y)); }
static INLINE vquad vmlsubadd_vq_vq_vq_vq(vquad x, vquad y, vquad z) { return vsubadd_vq_vq_vq(vmul_vq_vq_vq(x, y), z); }
static INLINE vquad vload_vq_p(const Sleef_quad *ptr) { return *(vquad *)ptr; }
static INLINE vquad vloadu_vq_p(const Sleef_quad *ptr) {
vquad vd;
for(int i=0;i<VECTLENDP;i++) vd[i] = ptr[i];
return vd;
}
static INLINE void vstore_v_p_vq(Sleef_quad *ptr, vquad v) { *(vquad *)ptr = v; }
static INLINE void vstoreu_v_p_vq(Sleef_quad *ptr, vquad v) {
for(int i=0;i<VECTLENDP;i++) ptr[i] = v[i];
}
static INLINE void vstream_v_p_vq(Sleef_quad *ptr, vquad v) { *(vquad *)ptr = v; }
static INLINE void vscatter2_v_p_i_i_vq(Sleef_quad *ptr, int offset, int step, vquad v) {
for(int i=0;i<VECTLENDP/2;i++) {
*(ptr+(offset + step * i)*2 + 0) = v[i*2+0];
*(ptr+(offset + step * i)*2 + 1) = v[i*2+1];
}
}
static INLINE void vsscatter2_v_p_i_i_vq(Sleef_quad *ptr, int offset, int step, vquad v) { vscatter2_v_p_i_i_vq(ptr, offset, step, v); }
#endif


@@ -0,0 +1,25 @@
# Compiler properties
set(COMMON_TARGET_PROPERTIES
C_STANDARD 99 # -std=gnu99
)
if (BUILD_SHARED_LIBS)
list(APPEND COMMON_TARGET_PROPERTIES POSITION_INDEPENDENT_CODE ON) # -fPIC
endif()
# This is a workaround for an AppVeyor bug
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SLEEF_C_FLAGS}")
# Target TARGET_LIBCOMMON_OBJ
add_library(${TARGET_LIBCOMMON_OBJ} OBJECT common.c)
set_target_properties(${TARGET_LIBCOMMON_OBJ} PROPERTIES ${COMMON_TARGET_PROPERTIES})
# Target TARGET_LIBARRAYMAP_OBJ
add_library(${TARGET_LIBARRAYMAP_OBJ} OBJECT arraymap.c)
set_target_properties(${TARGET_LIBARRAYMAP_OBJ} PROPERTIES ${COMMON_TARGET_PROPERTIES})
add_host_executable("addSuffix" addSuffix.c)
set_target_properties("addSuffix" PROPERTIES C_STANDARD 99)


@@ -0,0 +1,234 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <stdbool.h>
#define N 1000
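// fopen() wrapper: on MinGW the path may be a Cygwin-style path that the
// C runtime cannot open directly, so after a failed fopen() the path is
// converted with `cygpath -m` and the open is retried. On other platforms
// this is plain fopen().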
FILE *cygopen(const char *path, const char *mode) {
#if defined(__MINGW64__) || defined(__MINGW32__)
FILE *fp = fopen(path, mode);
if (fp != NULL) return fp;
char *buf = malloc(strlen(path) + N + 1);
snprintf(buf, strlen(path) + N, "cygpath -m '%s'", path);
FILE *pfp = popen(buf, "r");
if (pfp == NULL || fgets(buf, N, pfp) == NULL) {
if (pfp != NULL) pclose(pfp);
free(buf);
return NULL;
}
pclose(pfp);
int len = strlen(buf);
if (0 < len && len < N && buf[len-1] == '\n') buf[len-1] = '\0';
fp = fopen(buf, mode);
free(buf);
return fp;
#else
return fopen(path, mode);
#endif
}
int nkeywords = 0, nalloc = 0;
char **keywords = NULL, *suffix = NULL;
int nIgnore = 0;
char **ignore = NULL;
void insert(char *buf) {
for(int i=0;i<nIgnore;i++) if (strcmp(ignore[i], buf) == 0) return;
for(int i=0;i<nkeywords;i++) {
if (strcmp(keywords[i], buf) == 0) printf("%s", suffix);
}
}
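// doit() echoes the input verbatim while running a small state machine:
// state 0 scans ordinary text and detects comments, preprocessor lines,
// and string/character literals; state 1 accumulates an identifier; and
// states 2-4 pass strings, character constants and /* */ comments through
// untouched. Each time a complete identifier has been echoed, insert()
// appends the suffix if the identifier is in the keywords list and not in
// the ignore list.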
void doit(FILE *fp) {
int state = 0;
bool nl = true;
char buf[N+10], *p = buf;
for(;;) {
int c = getc(fp);
if (c == EOF) break;
switch(state) {
case 0:
if (isalnum(c) || c == '_') {
ungetc(c, fp);
p = buf;
state = 1;
break;
}
if (c == '/') {
int c2 = getc(fp);
if (c2 == '*') {
putc(c, stdout);
putc(c2, stdout);
state = 4;
break;
} else if (c2 == '/') {
putc(c, stdout);
putc(c2, stdout);
do {
c = getc(fp);
if (c == EOF) break; // guard against a // comment ending at EOF
putc(c, stdout);
} while(c != '\n');
break;
}
ungetc(c2, fp);
}
if (nl && c == '#') {
putc(c, stdout);
do {
c = getc(fp);
if (c == EOF) break; // guard against a file ending without a newline
putc(c, stdout);
} while(c != '\n');
break;
}
putc(c, stdout);
if (!isspace(c)) nl = false;
if (c == '\n') nl = true;
if (c == '\"') state = 2;
if (c == '\'') state = 3;
break;
case 1: // Identifier
if (isalnum(c) || c == '_') {
if (p - buf < N) { *p++ = c; *p = '\0'; }
putc(c, stdout);
} else if (c == '\"') {
insert(buf);
putc(c, stdout);
state = 2;
} else if (c == '\'') {
insert(buf);
putc(c, stdout);
state = 3;
} else {
insert(buf);
putc(c, stdout);
state = 0;
}
break;
case 2: // String
if (c == '\\') {
putc(c, stdout);
putc(getc(fp), stdout);
} else if (c == '\"') {
putc(c, stdout);
state = 0;
} else {
putc(c, stdout);
}
break;
case 3: // Character
if (c == '\\') {
putc(c, stdout);
putc(getc(fp), stdout);
} else if (c == '\'') {
putc(c, stdout);
state = 0;
} else {
putc(c, stdout);
}
break;
case 4: // Comment
if (c == '*') {
int c2 = getc(fp);
if (c2 == '/') {
putc(c, stdout);
putc(c2, stdout);
state = 0;
break;
}
ungetc(c2, fp);
}
putc(c, stdout);
break;
}
}
}
int main(int argc, char **argv) {
nalloc = 1;
keywords = malloc(sizeof(char *) * nalloc);
if (argc < 2) {
fprintf(stderr, "%s <input file>\n", argv[0]);
fprintf(stderr, "Print the file on the standard output\n");
fprintf(stderr, "\n");
fprintf(stderr, "%s <input file> <keywords file> <suffix> [<keywords to ignore> ... ]\n", argv[0]);
fprintf(stderr, "Add the suffix to keywords\n");
exit(-1);
}
char buf[N];
if (argc == 2) {
FILE *fp = cygopen(argv[1], "r");
if (fp == NULL) {
fprintf(stderr, "Cannot open %s\n", argv[1]);
exit(-1);
}
while(fgets(buf, N, fp) != NULL) {
fputs(buf, stdout);
}
fclose(fp);
exit(0);
}
FILE *fp = cygopen(argv[2], "r");
if (fp == NULL) {
fprintf(stderr, "Cannot open %s\n", argv[2]);
exit(-1);
}
while(fgets(buf, N, fp) != NULL) {
if (strlen(buf) >= 1 && buf[strlen(buf)-1] == '\n') buf[strlen(buf)-1] = '\0'; // strip only a trailing newline
keywords[nkeywords] = malloc(sizeof(char) * (strlen(buf) + 1));
strcpy(keywords[nkeywords], buf);
nkeywords++;
if (nkeywords >= nalloc) {
nalloc *= 2;
keywords = realloc(keywords, sizeof(char *) * nalloc);
}
}
fclose(fp);
nIgnore = argc - 4;
ignore = argv + 4;
suffix = argv[3];
fp = cygopen(argv[1], "r");
if (fp == NULL) {
fprintf(stderr, "Cannot open %s\n", argv[1]);
exit(-1);
}
doit(fp);
fclose(fp);
exit(0);
}
// cat sleef*inline*.h | egrep -o '[a-zA-Z_][0-9a-zA-Z_]*' | sort | uniq > cand.txt
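// Illustrative invocation (file names hypothetical): append a suffix to
// every listed identifier in a generated header, e.g.
//   addSuffix sleefinline_purec_scalar.h keywords.txt _purec_scalar > out.h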


@@ -0,0 +1,347 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <ctype.h>
#include <inttypes.h>
#include <assert.h>
//
#if !(defined(__MINGW32__) || defined(__MINGW64__) || defined(_MSC_VER))
#include <unistd.h>
#include <sys/types.h>
#include <sys/file.h>
static void FLOCK(FILE *fp) { flock(fileno(fp), LOCK_EX); }
static void FUNLOCK(FILE *fp) { flock(fileno(fp), LOCK_UN); }
static void FTRUNCATE(FILE *fp, off_t z) {
if (ftruncate(fileno(fp), z))
;
}
static FILE *OPENTMPFILE() { return tmpfile(); }
static void CLOSETMPFILE(FILE *fp) { fclose(fp); }
#else
#include <windows.h>
#include <io.h>
static void FLOCK(FILE *fp) { }
static void FUNLOCK(FILE *fp) { }
static void FTRUNCATE(FILE *fp, long z) {
fseek(fp, 0, SEEK_SET);
SetEndOfFile((HANDLE)_get_osfhandle(_fileno(fp)));
}
static FILE *OPENTMPFILE() { return fopen("tmpfile.txt", "w+"); }
static void CLOSETMPFILE(FILE *fp) {
fclose(fp);
remove("tmpfile.txt");
}
#endif
//
#define MAGIC_ARRAYMAPNODE 0xf73130fa
#define MAGIC_ARRAYMAP 0x8693bd21
#define LOGNBUCKETS 8
#define NBUCKETS (1 << LOGNBUCKETS)
static int hash(uint64_t key) {
return (key ^ (key >> LOGNBUCKETS) ^ (key >> (LOGNBUCKETS*2)) ^ (key >> (LOGNBUCKETS*3))) & (NBUCKETS-1);
}
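// Bucket index: XOR-folds the key with copies shifted right by 8, 16 and
// 24 bits and keeps the low LOGNBUCKETS bits, spreading the low 32 bits
// of the key across the 256 buckets.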
static void String_trim(char *str) {
char *dst = str, *src = str, *pterm = src;
while(*src != '\0' && isspace((int)*src)) src++;
for(;*src != '\0';src++) {
*dst++ = *src;
if (!isspace((int)*src)) pterm = dst;
}
*pterm = '\0';
}
typedef struct ArrayMapNode {
uint32_t magic;
uint64_t key;
void *value;
} ArrayMapNode;
typedef struct ArrayMap {
uint32_t magic;
ArrayMapNode *array[NBUCKETS];
int size[NBUCKETS], capacity[NBUCKETS], totalSize;
} ArrayMap;
ArrayMap *initArrayMap() {
ArrayMap *thiz = (ArrayMap *)calloc(1, sizeof(ArrayMap));
thiz->magic = MAGIC_ARRAYMAP;
for(int i=0;i<NBUCKETS;i++) {
thiz->capacity[i] = 8;
thiz->array[i] = (ArrayMapNode *)malloc(thiz->capacity[i] * sizeof(ArrayMapNode));
thiz->size[i] = 0;
}
thiz->totalSize = 0;
return thiz;
}
void ArrayMap_dispose(ArrayMap *thiz) {
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
for(int j=0;j<NBUCKETS;j++) {
for(int i=0;i<thiz->size[j];i++) {
assert(thiz->array[j][i].magic == MAGIC_ARRAYMAPNODE);
thiz->array[j][i].magic = 0;
}
free(thiz->array[j]);
}
thiz->magic = 0;
free(thiz);
}
int ArrayMap_size(ArrayMap *thiz) {
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
return thiz->totalSize;
}
uint64_t *ArrayMap_keyArray(ArrayMap *thiz) {
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
uint64_t *a = (uint64_t *)malloc(sizeof(uint64_t) * thiz->totalSize);
int p = 0;
for(int j=0;j<NBUCKETS;j++) {
for(int i=0;i<thiz->size[j];i++) {
assert(thiz->array[j][i].magic == MAGIC_ARRAYMAPNODE);
a[p++] = thiz->array[j][i].key;
}
}
return a;
}
void **ArrayMap_valueArray(ArrayMap *thiz) {
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
void **a = (void **)malloc(sizeof(void *) * thiz->totalSize);
int p = 0;
for(int j=0;j<NBUCKETS;j++) {
for(int i=0;i<thiz->size[j];i++) {
assert(thiz->array[j][i].magic == MAGIC_ARRAYMAPNODE);
a[p++] = thiz->array[j][i].value;
}
}
return a;
}
void *ArrayMap_remove(ArrayMap *thiz, uint64_t key) {
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
int h = hash(key);
for(int i=0;i<thiz->size[h];i++) {
assert(thiz->array[h][i].magic == MAGIC_ARRAYMAPNODE);
if (thiz->array[h][i].key == key) {
void *old = thiz->array[h][i].value;
thiz->array[h][i].key = thiz->array[h][thiz->size[h]-1].key;
thiz->array[h][i].value = thiz->array[h][thiz->size[h]-1].value;
thiz->array[h][thiz->size[h]-1].magic = 0;
thiz->size[h]--;
thiz->totalSize--;
return old;
}
}
return NULL;
}
void *ArrayMap_put(ArrayMap *thiz, uint64_t key, void *value) {
if (value == NULL) return ArrayMap_remove(thiz, key);
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
int h = hash(key);
for(int i=0;i<thiz->size[h];i++) {
assert(thiz->array[h][i].magic == MAGIC_ARRAYMAPNODE);
if (thiz->array[h][i].key == key) {
void *old = thiz->array[h][i].value;
thiz->array[h][i].value = value;
return old;
}
}
if (thiz->size[h] >= thiz->capacity[h]) {
thiz->capacity[h] *= 2;
thiz->array[h] = (ArrayMapNode *)realloc(thiz->array[h], thiz->capacity[h] * sizeof(ArrayMapNode));
}
ArrayMapNode *n = &(thiz->array[h][thiz->size[h]++]);
n->magic = MAGIC_ARRAYMAPNODE;
n->key = key;
n->value = value;
thiz->totalSize++;
return NULL;
}
void *ArrayMap_get(ArrayMap *thiz, uint64_t key) {
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
int h = hash(key);
for(int i=0;i<thiz->size[h];i++) {
assert(thiz->array[h][i].magic == MAGIC_ARRAYMAPNODE);
if (thiz->array[h][i].key == key) {
return thiz->array[h][i].value;
}
}
return NULL;
}
#define LINELEN (1024*1024)
ArrayMap *ArrayMap_load(const char *fn, const char *prefix, const char *idstr, int doLock) {
const int idstrlen = (int)strlen(idstr);
int prefixLen = (int)strlen(prefix) + 3;
if (prefixLen >= LINELEN-10 || idstrlen >= LINELEN-10) return NULL;
FILE *fp = fopen(fn, "r");
if (fp == NULL) return NULL;
if (doLock) FLOCK(fp);
ArrayMap *thiz = initArrayMap();
char *prefix2 = malloc(prefixLen+10);
strcpy(prefix2, prefix);
String_trim(prefix2);
for(char *p = prefix2;*p != '\0';p++) {
if (*p == ':') *p = ';';
if (*p == ' ') *p = '_';
}
strcat(prefix2, " : ");
prefixLen = (int)strlen(prefix2);
char *line = malloc(sizeof(char) * (LINELEN+10));
line[idstrlen] = '\0';
if (fread(line, sizeof(char), idstrlen, fp) != idstrlen ||
strcmp(idstr, line) != 0) {
if (doLock) FUNLOCK(fp);
fclose(fp);
free(prefix2);
free(line);
return NULL;
}
for(;;) {
line[LINELEN] = '\0';
if (fgets(line, LINELEN, fp) == NULL) break;
if (strncmp(line, prefix2, prefixLen) != 0) continue;
uint64_t key;
char *value = malloc(sizeof(char) * LINELEN);
if (sscanf(line + prefixLen, "%" SCNx64 " : %s\n", &key, value) == 2) {
ArrayMap_put(thiz, (uint64_t)key, (void *)value);
} else {
free(value);
}
}
if (doLock) FUNLOCK(fp);
fclose(fp);
free(prefix2);
free(line);
return thiz;
}
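// On-disk format, as also written by ArrayMap_save() below: the file
// starts with the verbatim idstr, followed by one record per line of
// roughly the form
//   <sanitized prefix> :  <key in hex> : <value>
// where ':' and ' ' inside the prefix have been replaced by ';' and '_'.
// Loading filters on the prefix, so several maps can share one file.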
int ArrayMap_save(ArrayMap *thiz, const char *fn, const char *prefix, const char *idstr) {
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
const int idstrlen = (int)strlen(idstr);
int prefixLen = (int)strlen(prefix) + 3;
if (prefixLen >= LINELEN-10 || idstrlen >= LINELEN-10) return -1;
// Generate prefix2
char *prefix2 = malloc(prefixLen+10);
strcpy(prefix2, prefix);
String_trim(prefix2);
for(char *p = prefix2;*p != '\0';p++) {
if (*p == ':') *p = ';';
if (*p == ' ') *p = '_';
}
strcat(prefix2, " : ");
prefixLen = (int)strlen(prefix2);
//
FILE *fp = fopen(fn, "a+");
if (fp == NULL) return -1;
FLOCK(fp);
fseek(fp, 0, SEEK_SET);
// Copy the file specified by fn to tmpfile
FILE *tmpfp = OPENTMPFILE();
if (tmpfp == NULL) {
FUNLOCK(fp);
fclose(fp);
return -1;
}
char *line = malloc(sizeof(char) * (LINELEN+10));
line[idstrlen] = '\0';
if (fread(line, sizeof(char), idstrlen, fp) == idstrlen && strcmp(idstr, line) == 0) {
for(;;) {
line[LINELEN] = '\0';
if (fgets(line, LINELEN, fp) == NULL) break;
if (strncmp(line, prefix2, prefixLen) != 0) fputs(line, tmpfp);
}
}
// Write the contents in the map into tmpfile
uint64_t *keys = ArrayMap_keyArray(thiz);
int s = ArrayMap_size(thiz);
for(int i=0;i<s;i++) {
char *value = ArrayMap_get(thiz, keys[i]);
if (strlen(value) + prefixLen >= LINELEN-10) continue;
fprintf(tmpfp, "%s %" PRIx64 " : %s\n", prefix2, keys[i], value);
}
free(keys);
fseek(fp, 0, SEEK_SET);
FTRUNCATE(fp, 0);
fwrite(idstr, sizeof(char), strlen(idstr), fp);
fseek(tmpfp, 0, SEEK_SET);
for(;;) {
size_t s = fread(line, 1, LINELEN, tmpfp);
if (s == 0) break;
fwrite(line, 1, s, fp);
}
FUNLOCK(fp);
fclose(fp);
CLOSETMPFILE(tmpfp);
free(prefix2);
free(line);
return 0;
}


@@ -0,0 +1,21 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#ifndef __ARRAYMAP_H__
#define __ARRAYMAP_H__
typedef struct ArrayMap ArrayMap;
ArrayMap *initArrayMap();
void ArrayMap_dispose(ArrayMap *thiz);
int ArrayMap_size(ArrayMap *thiz);
void *ArrayMap_remove(ArrayMap *thiz, uint64_t key);
void *ArrayMap_put(ArrayMap *thiz, uint64_t key, void *value);
void *ArrayMap_get(ArrayMap *thiz, uint64_t key);
uint64_t *ArrayMap_keyArray(ArrayMap *thiz);
void **ArrayMap_valueArray(ArrayMap *thiz);
int ArrayMap_save(ArrayMap *thiz, const char *fn, const char *prefix, const char *idstr);
ArrayMap *ArrayMap_load(const char *fn, const char *prefix, const char *idstr, int doLock);
#endif


@@ -0,0 +1,98 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include "misc.h"
#if defined(__MINGW32__) || defined(__MINGW64__) || defined(_MSC_VER)
#include <sys/timeb.h>
EXPORT void *Sleef_malloc(size_t z) { return _aligned_malloc(z, 256); }
EXPORT void Sleef_free(void *ptr) { _aligned_free(ptr); }
EXPORT uint64_t Sleef_currentTimeMicros() {
struct __timeb64 t;
_ftime64(&t);
return t.time * INT64_C(1000000) + t.millitm*1000;
}
#elif defined(__APPLE__)
#include <sys/time.h>
EXPORT void *Sleef_malloc(size_t z) { void *ptr = NULL; posix_memalign(&ptr, 256, z); return ptr; }
EXPORT void Sleef_free(void *ptr) { free(ptr); }
EXPORT uint64_t Sleef_currentTimeMicros() {
struct timeval time;
gettimeofday(&time, NULL);
return (uint64_t)((time.tv_sec * INT64_C(1000000)) + time.tv_usec);
}
#else // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(_MSC_VER)
#include <time.h>
#include <unistd.h>
#if defined(__FreeBSD__) || defined(__OpenBSD__)
#include <stdlib.h>
#else
#include <malloc.h>
#endif
EXPORT void *Sleef_malloc(size_t z) { void *ptr = NULL; posix_memalign(&ptr, 4096, z); return ptr; }
EXPORT void Sleef_free(void *ptr) { free(ptr); }
EXPORT uint64_t Sleef_currentTimeMicros() {
struct timespec tp;
clock_gettime(CLOCK_MONOTONIC, &tp);
return (uint64_t)tp.tv_sec * INT64_C(1000000) + ((uint64_t)tp.tv_nsec/1000);
}
#endif // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(_MSC_VER)
#ifdef _MSC_VER
#include <intrin.h>
EXPORT void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) {
__cpuidex(out, eax, ecx);
}
#else
#if defined(__x86_64__) || defined(__i386__)
EXPORT void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) {
uint32_t a, b, c, d;
__asm__ __volatile__ ("cpuid" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "a" (eax), "c"(ecx));
out[0] = a; out[1] = b; out[2] = c; out[3] = d;
}
#endif
#endif
#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER)
static char x86BrandString[256];
EXPORT char *Sleef_getCpuIdString() {
union {
int32_t info[4];
uint8_t str[16];
} u;
int i,j;
char *p;
p = x86BrandString;
for(i=0;i<3;i++) {
Sleef_x86CpuID(u.info, i + 0x80000002, 0);
for(j=0;j<16;j++) {
*p++ = u.str[j];
}
}
*p++ = '\n';
return x86BrandString;
}
#else
EXPORT char *Sleef_getCpuIdString() {
return "Unknown architecture";
}
#endif
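// Usage sketch (illustrative only): Sleef_malloc returns 256- or
// 4096-byte aligned memory, depending on the platform branch above, and
// must be released with Sleef_free rather than free():
//   double *buf = (double *)Sleef_malloc(n * sizeof(double));
//   /* ... use buf as a vector-aligned buffer ... */
//   Sleef_free(buf);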


@@ -0,0 +1,9 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#ifndef __COMMON_H__
#define __COMMON_H__
char *Sleef_getCpuIdString();
#endif


@@ -0,0 +1,438 @@
// Copyright Naoki Shibata and contributors 2010 - 2023.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA))
typedef struct {
vdouble x, y, z;
} vdouble3;
static INLINE CONST VECTOR_CC vdouble vd3getx_vd_vd3(vdouble3 v) { return v.x; }
static INLINE CONST VECTOR_CC vdouble vd3gety_vd_vd3(vdouble3 v) { return v.y; }
static INLINE CONST VECTOR_CC vdouble vd3getz_vd_vd3(vdouble3 v) { return v.z; }
static INLINE CONST VECTOR_CC vdouble3 vd3setxyz_vd3_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
vdouble3 v = { x, y, z };
return v;
}
static INLINE CONST VECTOR_CC vdouble3 vd3setx_vd3_vd3_vd(vdouble3 v, vdouble d) { v.x = d; return v; }
static INLINE CONST VECTOR_CC vdouble3 vd3sety_vd3_vd3_vd(vdouble3 v, vdouble d) { v.y = d; return v; }
static INLINE CONST VECTOR_CC vdouble3 vd3setz_vd3_vd3_vd(vdouble3 v, vdouble d) { v.z = d; return v; }
//
typedef struct {
vdouble2 a, b;
} dd2;
static dd2 dd2setab_dd2_vd2_vd2(vdouble2 a, vdouble2 b) {
dd2 r = { a, b };
return r;
}
static vdouble2 dd2geta_vd2_dd2(dd2 d) { return d.a; }
static vdouble2 dd2getb_vd2_dd2(dd2 d) { return d.b; }
//
typedef struct {
vmask e;
vdouble3 d3;
} tdx;
static INLINE CONST VECTOR_CC vmask tdxgete_vm_tdx(tdx t) { return t.e; }
static INLINE CONST VECTOR_CC vdouble3 tdxgetd3_vd3_tdx(tdx t) { return t.d3; }
static INLINE CONST VECTOR_CC vdouble tdxgetd3x_vd_tdx(tdx t) { return t.d3.x; }
static INLINE CONST VECTOR_CC vdouble tdxgetd3y_vd_tdx(tdx t) { return t.d3.y; }
static INLINE CONST VECTOR_CC vdouble tdxgetd3z_vd_tdx(tdx t) { return t.d3.z; }
static INLINE CONST VECTOR_CC tdx tdxsete_tdx_tdx_vm(tdx t, vmask e) { t.e = e; return t; }
static INLINE CONST VECTOR_CC tdx tdxsetd3_tdx_tdx_vd3(tdx t, vdouble3 d3) { t.d3 = d3; return t; }
static INLINE CONST VECTOR_CC tdx tdxsetx_tdx_tdx_vd(tdx t, vdouble x) { t.d3.x = x; return t; }
static INLINE CONST VECTOR_CC tdx tdxsety_tdx_tdx_vd(tdx t, vdouble y) { t.d3.y = y; return t; }
static INLINE CONST VECTOR_CC tdx tdxsetz_tdx_tdx_vd(tdx t, vdouble z) { t.d3.z = z; return t; }
static INLINE CONST VECTOR_CC tdx tdxsetxyz_tdx_tdx_vd_vd_vd(tdx t, vdouble x, vdouble y, vdouble z) {
t.d3 = (vdouble3) { x, y, z };
return t;
}
static INLINE CONST VECTOR_CC tdx tdxseted3_tdx_vm_vd3(vmask e, vdouble3 d3) { return (tdx) { e, d3 }; }
static INLINE CONST VECTOR_CC tdx tdxsetexyz_tdx_vm_vd_vd_vd(vmask e, vdouble x, vdouble y, vdouble z) {
return (tdx) { e, (vdouble3) { x, y, z } };
}
static INLINE CONST VECTOR_CC vmask vqgetx_vm_vq(vquad v) { return v.x; }
static INLINE CONST VECTOR_CC vmask vqgety_vm_vq(vquad v) { return v.y; }
static INLINE CONST VECTOR_CC vquad vqsetxy_vq_vm_vm(vmask x, vmask y) { return (vquad) { x, y }; }
static INLINE CONST VECTOR_CC vquad vqsetx_vq_vq_vm(vquad v, vmask x) { v.x = x; return v; }
static INLINE CONST VECTOR_CC vquad vqsety_vq_vq_vm(vquad v, vmask y) { v.y = y; return v; }
//
typedef struct {
vdouble d;
vint i;
} di_t;
static INLINE CONST VECTOR_CC vdouble digetd_vd_di(di_t d) { return d.d; }
static INLINE CONST VECTOR_CC vint digeti_vi_di(di_t d) { return d.i; }
static INLINE CONST VECTOR_CC di_t disetdi_di_vd_vi(vdouble d, vint i) {
di_t r = { d, i };
return r;
}
//
typedef struct {
vdouble2 dd;
vint i;
} ddi_t;
static INLINE CONST VECTOR_CC vdouble2 ddigetdd_vd2_ddi(ddi_t d) { return d.dd; }
static INLINE CONST VECTOR_CC vint ddigeti_vi_ddi(ddi_t d) { return d.i; }
static INLINE CONST VECTOR_CC ddi_t ddisetddi_ddi_vd2_vi(vdouble2 v, vint i) {
ddi_t r = { v, i };
return r;
}
static INLINE CONST VECTOR_CC ddi_t ddisetdd_ddi_ddi_vd2(ddi_t ddi, vdouble2 v) {
ddi.dd = v;
return ddi;
}
//
typedef struct {
vdouble3 td;
vint i;
} tdi_t;
static INLINE CONST VECTOR_CC vdouble3 tdigettd_vd3_tdi(tdi_t d) { return d.td; }
static INLINE CONST VECTOR_CC vdouble tdigetx_vd_tdi(tdi_t d) { return d.td.x; }
static INLINE CONST VECTOR_CC vint tdigeti_vi_tdi(tdi_t d) { return d.i; }
static INLINE CONST VECTOR_CC tdi_t tdisettdi_tdi_vd3_vi(vdouble3 v, vint i) {
tdi_t r = { v, i };
return r;
}
#endif
#if defined(ENABLE_MAIN)
// Functions for debugging
#include <stdio.h>
#include <wchar.h>
static void printvmask(char *mes, vmask g) {
uint64_t u[VECTLENDP];
vstoreu_v_p_vd((double *)u, vreinterpret_vd_vm(g));
printf("%s ", mes);
for(int i=0;i<VECTLENDP;i++) printf("%016lx : ", (unsigned long)u[i]);
printf("\n");
}
#if !defined(ENABLE_SVE)
static void printvopmask(char *mes, vopmask g) {
union {
vopmask g;
uint8_t u[sizeof(vopmask)];
} cnv = { .g = g };
printf("%s ", mes);
for(int i=0;i<sizeof(vopmask);i++) printf("%02x", cnv.u[i]);
printf("\n");
}
#else
static void printvopmask(char *mes, vopmask g) {
vmask m = vand_vm_vo64_vm(g, vcast_vm_i64(-1));
printvmask(mes, m);
}
#endif
static void printvdouble(char *mes, vdouble vd) {
double u[VECTLENDP];
vstoreu_v_p_vd((double *)u, vd);
printf("%s ", mes);
for(int i=0;i<VECTLENDP;i++) printf("%.20g : ", u[i]);
printf("\n");
}
static void printvint(char *mes, vint vi) {
uint32_t u[VECTLENDP];
vstoreu_v_p_vi((int32_t *)u, vi);
printf("%s ", mes);
for(int i=0;i<VECTLENDP;i++) printf("%08x : ", (unsigned)u[i]);
printf("\n");
}
static void printvint64(char *mes, vint64 vi) {
uint64_t u[VECTLENDP*2];
vstoreu_v_p_vd((double *)u, vreinterpret_vd_vm(vreinterpret_vm_vi64(vi)));
printf("%s ", mes);
for(int i=0;i<VECTLENDP;i++) printf("%016lx : ", (unsigned long)u[i]);
printf("\n");
}
static void printvquad(char *mes, vquad g) {
uint64_t u[VECTLENDP*2];
vstoreu_v_p_vd((double *)u, vreinterpret_vd_vm(vqgetx_vm_vq(g)));
vstoreu_v_p_vd((double *)&u[VECTLENDP], vreinterpret_vd_vm(vqgety_vm_vq(g)));
printf("%s ", mes);
for(int i=0;i<VECTLENDP*2;i++) printf("%016lx : ", (unsigned long)(u[i]));
printf("\n");
}
#endif // #if defined(ENABLE_MAIN)
///////////////////////////////////////////////////////////////////////////////////
// vdouble functions
static INLINE CONST VECTOR_CC vopmask visnegzero_vo_vd(vdouble d) {
return veq64_vo_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0)));
}
static INLINE CONST VECTOR_CC vopmask visnumber_vo_vd(vdouble x) {
return vandnot_vo_vo_vo(visinf_vo_vd(x), veq_vo_vd_vd(x, x));
}
static INLINE CONST vopmask visnonfinite_vo_vd(vdouble x) {
return veq64_vo_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vd(x), vcast_vm_i64(INT64_C(0x7ff0000000000000))), vcast_vm_i64(INT64_C(0x7ff0000000000000)));
}
static INLINE CONST vmask vsignbit_vm_vd(vdouble d) {
return vand_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0)));
}
static INLINE CONST vopmask vsignbit_vo_vd(vdouble d) {
return veq64_vo_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vcast_vd_d(-0.0)));
}
static INLINE CONST vdouble vclearlsb_vd_vd_i(vdouble d, int n) {
return vreinterpret_vd_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vcast_vm_u64((~UINT64_C(0)) << n)));
}
static INLINE CONST VECTOR_CC vdouble vtoward0_vd_vd(vdouble x) { // returns nextafter(x, 0)
vdouble t = vreinterpret_vd_vm(vadd64_vm_vm_vm(vreinterpret_vm_vd(x), vcast_vm_i64(-1)));
return vsel_vd_vo_vd_vd(veq_vo_vd_vd(x, vcast_vd_d(0)), vcast_vd_d(0), t);
}
#if !(defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA))
static INLINE CONST vdouble vmulsign_vd_vd_vd(vdouble x, vdouble y) {
return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(x), vsignbit_vm_vd(y)));
}
#endif
static INLINE CONST VECTOR_CC vdouble vsign_vd_vd(vdouble d) {
return vmulsign_vd_vd_vd(vcast_vd_d(1.0), d);
}
#if !(defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA))
static INLINE CONST VECTOR_CC vdouble vorsign_vd_vd_vd(vdouble x, vdouble y) {
return vreinterpret_vd_vm(vor_vm_vm_vm(vreinterpret_vm_vd(x), vsignbit_vm_vd(y)));
}
static INLINE CONST VECTOR_CC vdouble vcopysign_vd_vd_vd(vdouble x, vdouble y) {
return vreinterpret_vd_vm(vxor_vm_vm_vm(vandnot_vm_vm_vm(vreinterpret_vm_vd(vcast_vd_d(-0.0)), vreinterpret_vm_vd(x)),
vand_vm_vm_vm (vreinterpret_vm_vd(vcast_vd_d(-0.0)), vreinterpret_vm_vd(y))));
}
#endif
static INLINE CONST VECTOR_CC vdouble vtruncate2_vd_vd(vdouble x) {
#ifdef FULL_FP_ROUNDING
return vtruncate_vd_vd(x);
#else
vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52))), x, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x));
#endif
}
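// Without FULL_FP_ROUNDING, truncation is emulated with the 32-bit
// double-to-int conversion: the value is split around 2^31, the
// fractional part fr is removed via vtruncate_vi_vd, values that are
// already integral (|x| >= 2^52) or infinite pass through unchanged, and
// the final copysign preserves the sign of zero. The floor/ceil/round
// variants below use the same splitting trick.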
static INLINE CONST VECTOR_CC vdouble vfloor2_vd_vd(vdouble x) {
vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
fr = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(fr, vcast_vd_d(0)), vadd_vd_vd_vd(fr, vcast_vd_d(1.0)), fr);
return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52))), x, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x));
}
static INLINE CONST VECTOR_CC vdouble vceil2_vd_vd(vdouble x) {
vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
fr = vsel_vd_vo_vd_vd(vle_vo_vd_vd(fr, vcast_vd_d(0)), fr, vsub_vd_vd_vd(fr, vcast_vd_d(1.0)));
return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52))), x, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x));
}
static INLINE CONST VECTOR_CC vdouble vround2_vd_vd(vdouble d) {
vdouble x = vadd_vd_vd_vd(d, vcast_vd_d(0.5));
vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
x = vsel_vd_vo_vd_vd(vand_vo_vo_vo(vle_vo_vd_vd(x, vcast_vd_d(0)), veq_vo_vd_vd(fr, vcast_vd_d(0))), vsub_vd_vd_vd(x, vcast_vd_d(1.0)), x);
fr = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(fr, vcast_vd_d(0)), vadd_vd_vd_vd(fr, vcast_vd_d(1.0)), fr);
x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0.49999999999999994449)), vcast_vd_d(0), x);
return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(d), vge_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(INT64_C(1) << 52))), d, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), d));
}
static INLINE CONST VECTOR_CC vdouble vrint2_vd_vd(vdouble d) {
#ifdef FULL_FP_ROUNDING
return vrint_vd_vd(d);
#else
vdouble c = vmulsign_vd_vd_vd(vcast_vd_d(INT64_C(1) << 52), d);
return vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(INT64_C(1) << 52)),
d, vorsign_vd_vd_vd(vsub_vd_vd_vd(vadd_vd_vd_vd(d, c), c), d));
#endif
}
static INLINE CONST VECTOR_CC vopmask visint_vo_vd(vdouble d) {
return veq_vo_vd_vd(vrint2_vd_vd(d), d);
}
static INLINE CONST VECTOR_CC vopmask visodd_vo_vd(vdouble d) {
vdouble x = vmul_vd_vd_vd(d, vcast_vd_d(0.5));
return vneq_vo_vd_vd(vrint2_vd_vd(x), x);
}
// ilogb
#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
static INLINE CONST VECTOR_CC vint vilogbk_vi_vd(vdouble d) {
vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(4.9090934652977266E-91));
d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(vcast_vd_d(2.037035976334486E90), d), d);
vint q = vcastu_vi_vm(vreinterpret_vm_vd(d));
q = vand_vi_vi_vi(q, vcast_vi_i((int)(((1U << 12) - 1) << 20)));
q = vsrl_vi_vi_i(q, 20);
q = vsub_vi_vi_vi(q, vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vcast_vi_i(300 + 0x3ff), vcast_vi_i(0x3ff)));
return q;
}
static INLINE CONST VECTOR_CC vint vilogb2k_vi_vd(vdouble d) {
vint q = vcastu_vi_vm(vreinterpret_vm_vd(d));
q = vsrl_vi_vi_i(q, 20);
q = vand_vi_vi_vi(q, vcast_vi_i(0x7ff));
q = vsub_vi_vi_vi(q, vcast_vi_i(0x3ff));
return q;
}
#endif
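// The ilogb helpers above read the biased exponent directly from the
// upper bits (shift by 20 within the high word, mask with 0x7ff, unbias
// by 0x3ff); vilogbk additionally rescales inputs below 2^-300 by 2^300
// first, so subnormal values report their true exponent (hence the
// 300 + 0x3ff correction). The vmask variants below follow the same
// scheme on 64-bit lanes.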
static INLINE CONST vmask vilogb2k_vm_vd(vdouble d) {
vmask m = vreinterpret_vm_vd(d);
m = vsrl64_vm_vm_i(m, 20 + 32);
m = vand_vm_vm_vm(m, vcast_vm_i64(0x7ff));
m = vsub64_vm_vm_vm(m, vcast_vm_i64(0x3ff));
return m;
}
static INLINE CONST vmask vilogb3k_vm_vd(vdouble d) {
vmask m = vreinterpret_vm_vd(d);
m = vsrl64_vm_vm_i(m, 20 + 32);
m = vand_vm_vm_vm(m, vcast_vm_i64(0x7ff));
return m;
}
// ldexp
static INLINE CONST VECTOR_CC vdouble vpow2i_vd_vi(vint q) {
q = vadd_vi_vi_vi(vcast_vi_i(0x3ff), q);
vmask r = vcastu_vm_vi(vsll_vi_vi_i(q, 20));
return vreinterpret_vd_vm(r);
}
static INLINE CONST VECTOR_CC vdouble vpow2i_vd_vm(vmask q) {
q = vadd64_vm_vm_vm(vcast_vm_i64(0x3ff), q);
return vreinterpret_vd_vm(vsll64_vm_vm_i(q, 52));
}
static INLINE CONST VECTOR_CC vdouble vldexp_vd_vd_vi(vdouble x, vint q) {
vint m = vsra_vi_vi_i(q, 31);
m = vsll_vi_vi_i(vsub_vi_vi_vi(vsra_vi_vi_i(vadd_vi_vi_vi(m, q), 9), m), 7);
q = vsub_vi_vi_vi(q, vsll_vi_vi_i(m, 2));
m = vadd_vi_vi_vi(vcast_vi_i(0x3ff), m);
m = vandnot_vi_vo_vi(vgt_vo_vi_vi(vcast_vi_i(0), m), m);
m = vsel_vi_vo_vi_vi(vgt_vo_vi_vi(m, vcast_vi_i(0x7ff)), vcast_vi_i(0x7ff), m);
vmask r = vcastu_vm_vi(vsll_vi_vi_i(m, 20));
vdouble y = vreinterpret_vd_vm(r);
return vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(x, y), y), y), y), vpow2i_vd_vi(q));
}
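// vldexp multiplies by 2^q without overflowing the exponent field of any
// intermediate factor: q is split into four equal chunks (applied via the
// factor y) plus a remainder handled by vpow2i, so every partial power of
// two stays representable even for extreme q.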
static INLINE CONST VECTOR_CC vdouble vldexp2_vd_vd_vi(vdouble d, vint e) {
return vmul_vd_vd_vd(vmul_vd_vd_vd(d, vpow2i_vd_vi(vsra_vi_vi_i(e, 1))), vpow2i_vd_vi(vsub_vi_vi_vi(e, vsra_vi_vi_i(e, 1))));
}
static INLINE CONST VECTOR_CC vdouble vldexp3_vd_vd_vi(vdouble d, vint q) {
return vreinterpret_vd_vm(vadd64_vm_vm_vm(vreinterpret_vm_vd(d), vcastu_vm_vi(vsll_vi_vi_i(q, 20))));
}
static INLINE CONST vdouble vldexp1_vd_vd_vm(vdouble d, vmask e) {
vmask m = vsrl64_vm_vm_i(e, 2);
e = vsub64_vm_vm_vm(vsub64_vm_vm_vm(vsub64_vm_vm_vm(e, m), m), m);
d = vmul_vd_vd_vd(d, vpow2i_vd_vm(m));
d = vmul_vd_vd_vd(d, vpow2i_vd_vm(m));
d = vmul_vd_vd_vd(d, vpow2i_vd_vm(m));
d = vmul_vd_vd_vd(d, vpow2i_vd_vm(e));
return d;
}
static INLINE CONST vdouble vldexp2_vd_vd_vm(vdouble d, vmask e) {
return vmul_vd_vd_vd(vmul_vd_vd_vd(d, vpow2i_vd_vm(vsrl64_vm_vm_i(e, 1))), vpow2i_vd_vm(vsub64_vm_vm_vm(e, vsrl64_vm_vm_i(e, 1))));
}
static INLINE CONST vdouble vldexp3_vd_vd_vm(vdouble d, vmask q) {
return vreinterpret_vd_vm(vadd64_vm_vm_vm(vreinterpret_vm_vd(d), vsll64_vm_vm_i(q, 52)));
}
// vmask functions
static INLINE CONST vdouble vcast_vd_vm(vmask m) { return vcast_vd_vi(vcast_vi_vm(m)); } // 32 bit only
static INLINE CONST vmask vtruncate_vm_vd(vdouble d) { return vcast_vm_vi(vtruncate_vi_vd(d)); }
static INLINE CONST vopmask vlt64_vo_vm_vm(vmask x, vmask y) { return vgt64_vo_vm_vm(y, x); }
static INLINE CONST vopmask vnot_vo64_vo64(vopmask x) {
return vxor_vo_vo_vo(x, veq64_vo_vm_vm(vcast_vm_i64(0), vcast_vm_i64(0)));
}
static INLINE CONST vopmask vugt64_vo_vm_vm(vmask x, vmask y) { // unsigned compare
x = vxor_vm_vm_vm(vcast_vm_u64(UINT64_C(0x8000000000000000)), x);
y = vxor_vm_vm_vm(vcast_vm_u64(UINT64_C(0x8000000000000000)), y);
return vgt64_vo_vm_vm(x, y);
}
static INLINE CONST vmask vilogbk_vm_vd(vdouble d) {
vopmask o = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(4.9090934652977266E-91));
d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(vcast_vd_d(2.037035976334486E90), d), d);
vmask q = vreinterpret_vm_vd(d);
q = vsrl64_vm_vm_i(q, 20 + 32);
q = vand_vm_vm_vm(q, vcast_vm_i64(0x7ff));
q = vsub64_vm_vm_vm(q, vsel_vm_vo64_vm_vm(o, vcast_vm_i64(300 + 0x3ff), vcast_vm_i64(0x3ff)));
return q;
}
// vquad functions
static INLINE CONST vquad sel_vq_vo_vq_vq(vopmask o, vquad x, vquad y) {
return vqsetxy_vq_vm_vm(vsel_vm_vo64_vm_vm(o, vqgetx_vm_vq(x), vqgetx_vm_vq(y)), vsel_vm_vo64_vm_vm(o, vqgety_vm_vq(x), vqgety_vm_vq(y)));
}
static INLINE CONST vquad add128_vq_vq_vq(vquad x, vquad y) {
vquad r = vqsetxy_vq_vm_vm(vadd64_vm_vm_vm(vqgetx_vm_vq(x), vqgetx_vm_vq(y)), vadd64_vm_vm_vm(vqgety_vm_vq(x), vqgety_vm_vq(y)));
r = vqsety_vq_vq_vm(r, vadd64_vm_vm_vm(vqgety_vm_vq(r), vand_vm_vo64_vm(vugt64_vo_vm_vm(vqgetx_vm_vq(x), vqgetx_vm_vq(r)), vcast_vm_i64(1))));
return r;
}
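// 128-bit addition from two 64-bit lanes: the carry out of the low word
// is detected with the unsigned wrap-around test (x.low > result.low) and
// added into the high word.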
static INLINE CONST vquad imdvq_vq_vm_vm(vmask x, vmask y) { vquad r = vqsetxy_vq_vm_vm(x, y); return r; }
// imm must be smaller than 64
#define srl128_vq_vq_i(m, imm) \
imdvq_vq_vm_vm(vor_vm_vm_vm(vsrl64_vm_vm_i(vqgetx_vm_vq(m), imm), vsll64_vm_vm_i(vqgety_vm_vq(m), 64-imm)), vsrl64_vm_vm_i(vqgety_vm_vq(m), imm))
// This function is equivalent to:
// di_t ret = { x - rint(4 * x) * 0.25, (int32_t)(rint(4 * x) - rint(x) * 4) };
static INLINE CONST di_t rempisub(vdouble x) {
#ifdef FULL_FP_ROUNDING
vdouble y = vrint_vd_vd(vmul_vd_vd_vd(x, vcast_vd_d(4)));
vint vi = vtruncate_vi_vd(vsub_vd_vd_vd(y, vmul_vd_vd_vd(vrint_vd_vd(x), vcast_vd_d(4))));
return disetdi_di_vd_vi(vsub_vd_vd_vd(x, vmul_vd_vd_vd(y, vcast_vd_d(0.25))), vi);
#else
vdouble c = vmulsign_vd_vd_vd(vcast_vd_d(INT64_C(1) << 52), x);
vdouble rint4x = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(vmul_vd_vd_vd(vcast_vd_d(4), x)), vcast_vd_d(INT64_C(1) << 52)),
vmul_vd_vd_vd(vcast_vd_d(4), x),
vorsign_vd_vd_vd(vsub_vd_vd_vd(vmla_vd_vd_vd_vd(vcast_vd_d(4), x, c), c), x));
vdouble rintx = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52)),
x, vorsign_vd_vd_vd(vsub_vd_vd_vd(vadd_vd_vd_vd(x, c), c), x));
return disetdi_di_vd_vi(vmla_vd_vd_vd_vd(vcast_vd_d(-0.25), rint4x, x),
vtruncate_vi_vd(vmla_vd_vd_vd_vd(vcast_vd_d(-4), rintx, rint4x)));
#endif
}


@@ -0,0 +1,324 @@
// Copyright Naoki Shibata and contributors 2010 - 2024.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA))
#if !defined(SLEEF_ENABLE_CUDA)
typedef struct {
vdouble x, y;
} vdouble2;
#else
typedef double2 vdouble2;
#endif
static INLINE CONST VECTOR_CC vdouble vd2getx_vd_vd2(vdouble2 v) { return v.x; }
static INLINE CONST VECTOR_CC vdouble vd2gety_vd_vd2(vdouble2 v) { return v.y; }
static INLINE CONST VECTOR_CC vdouble2 vd2setxy_vd2_vd_vd(vdouble x, vdouble y) { vdouble2 v; v.x = x; v.y = y; return v; }
static INLINE CONST VECTOR_CC vdouble2 vd2setx_vd2_vd2_vd(vdouble2 v, vdouble d) { v.x = d; return v; }
static INLINE CONST VECTOR_CC vdouble2 vd2sety_vd2_vd2_vd(vdouble2 v, vdouble d) { v.y = d; return v; }
#endif
#if !defined(SLEEF_ENABLE_CUDA)
typedef struct {
double x, y;
} double2;
#endif
static INLINE CONST VECTOR_CC double2 dd(double h, double l) {
double2 ret = { h, l };
return ret;
}
static INLINE CONST VECTOR_CC vdouble vupper_vd_vd(vdouble d) {
return vreinterpret_vd_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vcast_vm_i_i(0xffffffff, 0xf8000000)));
}
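// Dekker-style splitting: clearing the low 27 significand bits leaves a
// "head" with at most 26 significant bits, so the product of two heads is
// exact in double precision. The non-FMA double-double routines later in
// this file rely on this to recover the rounding error of each
// multiplication.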
static INLINE CONST VECTOR_CC vdouble2 vcast_vd2_vd_vd(vdouble h, vdouble l) {
return vd2setxy_vd2_vd_vd(h, l);
}
static INLINE CONST VECTOR_CC vdouble2 vcast_vd2_d_d(double h, double l) {
return vd2setxy_vd2_vd_vd(vcast_vd_d(h), vcast_vd_d(l));
}
static INLINE CONST VECTOR_CC vdouble2 vcast_vd2_d2(double2 dd) {
return vd2setxy_vd2_vd_vd(vcast_vd_d(dd.x), vcast_vd_d(dd.y));
}
static INLINE CONST VECTOR_CC vdouble2 vsel_vd2_vo_vd2_vd2(vopmask m, vdouble2 x, vdouble2 y) {
return vd2setxy_vd2_vd_vd(vsel_vd_vo_vd_vd(m, vd2getx_vd_vd2(x), vd2getx_vd_vd2(y)),
vsel_vd_vo_vd_vd(m, vd2gety_vd_vd2(x), vd2gety_vd_vd2(y)));
}
static INLINE CONST VECTOR_CC vdouble2 vsel_vd2_vo_d_d_d_d(vopmask o, double x1, double y1, double x0, double y0) {
return vd2setxy_vd2_vd_vd(vsel_vd_vo_d_d(o, x1, x0),
vsel_vd_vo_d_d(o, y1, y0));
}
static INLINE CONST VECTOR_CC vdouble vadd_vd_3vd(vdouble v0, vdouble v1, vdouble v2) {
return vadd_vd_vd_vd(vadd_vd_vd_vd(v0, v1), v2);
}
static INLINE CONST VECTOR_CC vdouble vadd_vd_4vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
return vadd_vd_3vd(vadd_vd_vd_vd(v0, v1), v2, v3);
}
static INLINE CONST VECTOR_CC vdouble vadd_vd_5vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
return vadd_vd_4vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4);
}
static INLINE CONST VECTOR_CC vdouble vadd_vd_6vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) {
return vadd_vd_5vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4, v5);
}
static INLINE CONST VECTOR_CC vdouble vadd_vd_7vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5, vdouble v6) {
return vadd_vd_6vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4, v5, v6);
}
static INLINE CONST VECTOR_CC vdouble vsub_vd_3vd(vdouble v0, vdouble v1, vdouble v2) {
return vsub_vd_vd_vd(vsub_vd_vd_vd(v0, v1), v2);
}
static INLINE CONST VECTOR_CC vdouble vsub_vd_4vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
return vsub_vd_3vd(vsub_vd_vd_vd(v0, v1), v2, v3);
}
static INLINE CONST VECTOR_CC vdouble vsub_vd_5vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
return vsub_vd_4vd(vsub_vd_vd_vd(v0, v1), v2, v3, v4);
}
static INLINE CONST VECTOR_CC vdouble vsub_vd_6vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) {
return vsub_vd_5vd(vsub_vd_vd_vd(v0, v1), v2, v3, v4, v5);
}
//
static INLINE CONST VECTOR_CC vdouble2 ddneg_vd2_vd2(vdouble2 x) {
return vcast_vd2_vd_vd(vneg_vd_vd(vd2getx_vd_vd2(x)), vneg_vd_vd(vd2gety_vd_vd2(x)));
}
static INLINE CONST VECTOR_CC vdouble2 ddabs_vd2_vd2(vdouble2 x) {
return vcast_vd2_vd_vd(vabs_vd_vd(vd2getx_vd_vd2(x)),
vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(vd2gety_vd_vd2(x)),
vand_vm_vm_vm(vreinterpret_vm_vd(vd2getx_vd_vd2(x)),
vreinterpret_vm_vd(vcast_vd_d(-0.0))))));
}
static INLINE CONST VECTOR_CC vdouble2 ddnormalize_vd2_vd2(vdouble2 t) {
vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(t), vd2gety_vd_vd2(t));
return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(t), s), vd2gety_vd_vd2(t)));
}
static INLINE CONST VECTOR_CC vdouble2 ddscale_vd2_vd2_vd(vdouble2 d, vdouble s) {
return vd2setxy_vd2_vd_vd(vmul_vd_vd_vd(vd2getx_vd_vd2(d), s), vmul_vd_vd_vd(vd2gety_vd_vd2(d), s));
}
static INLINE CONST VECTOR_CC vdouble2 ddscale_vd2_vd2_d(vdouble2 d, double s) { return ddscale_vd2_vd2_vd(d, vcast_vd_d(s)); }
static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd_vd(vdouble x, vdouble y) {
vdouble s = vadd_vd_vd_vd(x, y);
return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(x, s), y));
}
static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd_vd(vdouble x, vdouble y) {
vdouble s = vadd_vd_vd_vd(x, y);
vdouble v = vsub_vd_vd_vd(s, x);
return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(x, vsub_vd_vd_vd(s, v)), vsub_vd_vd_vd(y, v)));
}
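// ddadd_* are Fast2Sum variants and assume |x| >= |y| (see the comments
// further down); ddadd2_* use the branch-free 2Sum, which is correct for
// any ordering at the cost of a few extra operations. A scalar sketch
// (illustrative only):
//   s = x + y; e = (x - s) + y;          // Fast2Sum, requires |x| >= |y|
//   s = x + y; v = s - x;
//   e = (x - (s - v)) + (y - v);         // 2Sum, any ordering
// The pair (s, e) then represents the sum exactly, e being the rounding
// error of s.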
static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd2_vd(vdouble2 x, vdouble y) {
vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), y);
return vd2setxy_vd2_vd_vd(s, vadd_vd_3vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), s), y, vd2gety_vd_vd2(x)));
}
static INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd2_vd(vdouble2 x, vdouble y) {
vdouble s = vsub_vd_vd_vd(vd2getx_vd_vd2(x), y);
return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), s), y), vd2gety_vd_vd2(x)));
}
static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd2_vd(vdouble2 x, vdouble y) {
vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), y);
vdouble v = vsub_vd_vd_vd(s, vd2getx_vd_vd2(x));
vdouble w = vadd_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), vsub_vd_vd_vd(s, v)), vsub_vd_vd_vd(y, v));
return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(w, vd2gety_vd_vd2(x)));
}
static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd_vd2(vdouble x, vdouble2 y) {
vdouble s = vadd_vd_vd_vd(x, vd2getx_vd_vd2(y));
return vd2setxy_vd2_vd_vd(s, vadd_vd_3vd(vsub_vd_vd_vd(x, s), vd2getx_vd_vd2(y), vd2gety_vd_vd2(y)));
}
static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd_vd2(vdouble x, vdouble2 y) {
vdouble s = vadd_vd_vd_vd(x, vd2getx_vd_vd2(y));
vdouble v = vsub_vd_vd_vd(s, x);
return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vadd_vd_vd_vd(vsub_vd_vd_vd(x, vsub_vd_vd_vd(s, v)),
vsub_vd_vd_vd(vd2getx_vd_vd2(y), v)), vd2gety_vd_vd2(y)));
}
static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
// |x| >= |y|
vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
return vd2setxy_vd2_vd_vd(s, vadd_vd_4vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), s), vd2getx_vd_vd2(y), vd2gety_vd_vd2(x), vd2gety_vd_vd2(y)));
}
static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
vdouble v = vsub_vd_vd_vd(s, vd2getx_vd_vd2(x));
vdouble t = vadd_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), vsub_vd_vd_vd(s, v)), vsub_vd_vd_vd(vd2getx_vd_vd2(y), v));
return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(t, vadd_vd_vd_vd(vd2gety_vd_vd2(x), vd2gety_vd_vd2(y))));
}
static INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd_vd(vdouble x, vdouble y) {
// |x| >= |y|
vdouble s = vsub_vd_vd_vd(x, y);
return vd2setxy_vd2_vd_vd(s, vsub_vd_vd_vd(vsub_vd_vd_vd(x, s), y));
}
static INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
// |x| >= |y|
vdouble s = vsub_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
vdouble t = vsub_vd_vd_vd(vd2getx_vd_vd2(x), s);
t = vsub_vd_vd_vd(t, vd2getx_vd_vd2(y));
t = vadd_vd_vd_vd(t, vd2gety_vd_vd2(x));
return vd2setxy_vd2_vd_vd(s, vsub_vd_vd_vd(t, vd2gety_vd_vd2(y)));
}
#ifdef ENABLE_FMA_DP
static INLINE CONST VECTOR_CC vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) {
vdouble t = vrec_vd_vd(vd2getx_vd_vd2(d));
vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(n), t);
vdouble u = vfmapn_vd_vd_vd_vd(t, vd2getx_vd_vd2(n), s);
vdouble v = vfmanp_vd_vd_vd_vd(vd2gety_vd_vd2(d), t, vfmanp_vd_vd_vd_vd(vd2getx_vd_vd2(d), t, vcast_vd_d(1)));
return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(s, v, vfma_vd_vd_vd_vd(vd2gety_vd_vd2(n), t, u)));
}
static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd_vd(vdouble x, vdouble y) {
vdouble s = vmul_vd_vd_vd(x, y);
return vd2setxy_vd2_vd_vd(s, vfmapn_vd_vd_vd_vd(x, y, s));
}
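// With FMA, the rounding error of a product is exact in one instruction:
// vfmapn_vd_vd_vd_vd(x, y, s) evaluates x*y - s with a single rounding,
// which for s = round(x*y) is precisely the error term. A scalar sketch
// (illustrative only):
//   double s = x * y;
//   double e = fma(x, y, -s); // s + e == x*y exactly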
static INLINE CONST VECTOR_CC vdouble2 ddsqu_vd2_vd2(vdouble2 x) {
vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x));
return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x)), vd2gety_vd_vd2(x), vfmapn_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x), s)));
}
static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(y), vfma_vd_vd_vd_vd(vd2gety_vd_vd2(x), vd2getx_vd_vd2(y), vfmapn_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y), s))));
}
static INLINE CONST VECTOR_CC vdouble ddmul_vd_vd2_vd2(vdouble2 x, vdouble2 y) {
return vfma_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y), vfma_vd_vd_vd_vd(vd2gety_vd_vd2(x), vd2getx_vd_vd2(y), vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(y))));
}
static INLINE CONST VECTOR_CC vdouble ddsqu_vd_vd2(vdouble2 x) {
return vfma_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x), vadd_vd_vd_vd(vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x)), vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x))));
}
static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd(vdouble2 x, vdouble y) {
vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), y);
return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(vd2gety_vd_vd2(x), y, vfmapn_vd_vd_vd_vd(vd2getx_vd_vd2(x), y, s)));
}
static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd(vdouble d) {
vdouble s = vrec_vd_vd(d);
return vd2setxy_vd2_vd_vd(s, vmul_vd_vd_vd(s, vfmanp_vd_vd_vd_vd(d, s, vcast_vd_d(1))));
}
static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd2(vdouble2 d) {
vdouble s = vrec_vd_vd(vd2getx_vd_vd2(d));
return vd2setxy_vd2_vd_vd(s, vmul_vd_vd_vd(s, vfmanp_vd_vd_vd_vd(vd2gety_vd_vd2(d), s, vfmanp_vd_vd_vd_vd(vd2getx_vd_vd2(d), s, vcast_vd_d(1)))));
}
#else // #ifdef ENABLE_FMA_DP
static INLINE CONST VECTOR_CC vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) {
vdouble t = vrec_vd_vd(vd2getx_vd_vd2(d));
vdouble dh = vupper_vd_vd(vd2getx_vd_vd2(d)), dl = vsub_vd_vd_vd(vd2getx_vd_vd2(d), dh);
vdouble th = vupper_vd_vd(t ), tl = vsub_vd_vd_vd(t , th);
vdouble nhh = vupper_vd_vd(vd2getx_vd_vd2(n)), nhl = vsub_vd_vd_vd(vd2getx_vd_vd2(n), nhh);
vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(n), t);
vdouble u = vadd_vd_5vd(vsub_vd_vd_vd(vmul_vd_vd_vd(nhh, th), s), vmul_vd_vd_vd(nhh, tl), vmul_vd_vd_vd(nhl, th), vmul_vd_vd_vd(nhl, tl),
vmul_vd_vd_vd(s, vsub_vd_5vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th), vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th), vmul_vd_vd_vd(dl, tl))));
return vd2setxy_vd2_vd_vd(s, vmla_vd_vd_vd_vd(t, vsub_vd_vd_vd(vd2gety_vd_vd2(n), vmul_vd_vd_vd(s, vd2gety_vd_vd2(d))), u));
}
static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd_vd(vdouble x, vdouble y) {
vdouble xh = vupper_vd_vd(x), xl = vsub_vd_vd_vd(x, xh);
vdouble yh = vupper_vd_vd(y), yl = vsub_vd_vd_vd(y, yh);
vdouble s = vmul_vd_vd_vd(x, y);
return vd2setxy_vd2_vd_vd(s, vadd_vd_5vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(s), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl)));
}
static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd(vdouble2 x, vdouble y) {
vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);
vdouble yh = vupper_vd_vd(y ), yl = vsub_vd_vd_vd(y , yh);
vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), y);
return vd2setxy_vd2_vd_vd(s, vadd_vd_6vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(s), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl), vmul_vd_vd_vd(vd2gety_vd_vd2(x), y)));
}
static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);
vdouble yh = vupper_vd_vd(vd2getx_vd_vd2(y)), yl = vsub_vd_vd_vd(vd2getx_vd_vd2(y), yh);
vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
return vd2setxy_vd2_vd_vd(s, vadd_vd_7vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(s), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl), vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(y)), vmul_vd_vd_vd(vd2gety_vd_vd2(x), vd2getx_vd_vd2(y))));
}
static INLINE CONST VECTOR_CC vdouble ddmul_vd_vd2_vd2(vdouble2 x, vdouble2 y) {
vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);
vdouble yh = vupper_vd_vd(vd2getx_vd_vd2(y)), yl = vsub_vd_vd_vd(vd2getx_vd_vd2(y), yh);
return vadd_vd_6vd(vmul_vd_vd_vd(vd2gety_vd_vd2(x), yh), vmul_vd_vd_vd(xh, vd2gety_vd_vd2(y)), vmul_vd_vd_vd(xl, yl), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yh));
}
static INLINE CONST VECTOR_CC vdouble2 ddsqu_vd2_vd2(vdouble2 x) {
vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);
vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x));
return vd2setxy_vd2_vd_vd(s, vadd_vd_5vd(vmul_vd_vd_vd(xh, xh), vneg_vd_vd(s), vmul_vd_vd_vd(vadd_vd_vd_vd(xh, xh), xl), vmul_vd_vd_vd(xl, xl), vmul_vd_vd_vd(vd2getx_vd_vd2(x), vadd_vd_vd_vd(vd2gety_vd_vd2(x), vd2gety_vd_vd2(x)))));
}
static INLINE CONST VECTOR_CC vdouble ddsqu_vd_vd2(vdouble2 x) {
vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);
return vadd_vd_5vd(vmul_vd_vd_vd(xh, vd2gety_vd_vd2(x)), vmul_vd_vd_vd(xh, vd2gety_vd_vd2(x)), vmul_vd_vd_vd(xl, xl), vadd_vd_vd_vd(vmul_vd_vd_vd(xh, xl), vmul_vd_vd_vd(xh, xl)), vmul_vd_vd_vd(xh, xh));
}
static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd(vdouble d) {
vdouble t = vrec_vd_vd(d);
vdouble dh = vupper_vd_vd(d), dl = vsub_vd_vd_vd(d, dh);
vdouble th = vupper_vd_vd(t), tl = vsub_vd_vd_vd(t, th);
return vd2setxy_vd2_vd_vd(t, vmul_vd_vd_vd(t, vsub_vd_5vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th), vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th), vmul_vd_vd_vd(dl, tl))));
}
static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd2(vdouble2 d) {
vdouble t = vrec_vd_vd(vd2getx_vd_vd2(d));
vdouble dh = vupper_vd_vd(vd2getx_vd_vd2(d)), dl = vsub_vd_vd_vd(vd2getx_vd_vd2(d), dh);
vdouble th = vupper_vd_vd(t ), tl = vsub_vd_vd_vd(t , th);
return vd2setxy_vd2_vd_vd(t, vmul_vd_vd_vd(t, vsub_vd_6vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th), vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th), vmul_vd_vd_vd(dl, tl), vmul_vd_vd_vd(vd2gety_vd_vd2(d), t))));
}
#endif // #ifdef ENABLE_FMA_DP
static INLINE CONST VECTOR_CC vdouble2 ddsqrt_vd2_vd2(vdouble2 d) {
vdouble t = vsqrt_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)));
return ddscale_vd2_vd2_vd(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd2(d, ddmul_vd2_vd_vd(t, t)), ddrec_vd2_vd(t)), vcast_vd_d(0.5));
}
static INLINE CONST VECTOR_CC vdouble2 ddsqrt_vd2_vd(vdouble d) {
vdouble t = vsqrt_vd_vd(d);
return ddscale_vd2_vd2_vd(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd_vd2(d, ddmul_vd2_vd_vd(t, t)), ddrec_vd2_vd(t)), vcast_vd_d(0.5));
}
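// Editor's note (not part of the upstream file): both ddsqrt variants above
// refine t = sqrt(d) with one Heron step carried out in double-double
// arithmetic: sqrt(d) ~= 0.5 * (d + t*t) / t, the average of t and d/t,
// which roughly doubles the number of correct bits in the estimate t.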
static INLINE CONST VECTOR_CC vdouble2 ddmla_vd2_vd2_vd2_vd2(vdouble2 x, vdouble2 y, vdouble2 z) {
return ddadd_vd2_vd2_vd2(z, ddmul_vd2_vd2_vd2(x, y));
}
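// Editor's note (illustrative addition): the ddadd/ddadd2 pairs above are
// vector forms of the classic error-free transforms. A scalar sketch, with
// hypothetical names:
//
//   static void fast2sum(double x, double y, double *hi, double *lo) {
//     double s = x + y;                    // requires |x| >= |y|
//     *hi = s; *lo = (x - s) + y;          // exact rounding error of the sum
//   }
//   static void twosum(double x, double y, double *hi, double *lo) {
//     double s = x + y;                    // no magnitude precondition
//     double v = s - x;
//     *hi = s; *lo = (x - (s - v)) + (y - v);
//   }
//
// ddadd_* (with the |x| >= |y| comment) is Fast2Sum; ddadd2_* is the
// branch-free Knuth TwoSum, safe for operands in any order.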

View File

@@ -0,0 +1,369 @@
// Copyright Naoki Shibata and contributors 2010 - 2024.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA))
#if !defined(SLEEF_ENABLE_CUDA)
typedef struct {
vfloat x, y;
} vfloat2;
#else
typedef float2 vfloat2;
#endif
static INLINE CONST VECTOR_CC vfloat vf2getx_vf_vf2(vfloat2 v) { return v.x; }
static INLINE CONST VECTOR_CC vfloat vf2gety_vf_vf2(vfloat2 v) { return v.y; }
static INLINE CONST VECTOR_CC vfloat2 vf2setxy_vf2_vf_vf(vfloat x, vfloat y) { vfloat2 v; v.x = x; v.y = y; return v; }
static INLINE CONST VECTOR_CC vfloat2 vf2setx_vf2_vf2_vf(vfloat2 v, vfloat d) { v.x = d; return v; }
static INLINE CONST VECTOR_CC vfloat2 vf2sety_vf2_vf2_vf(vfloat2 v, vfloat d) { v.y = d; return v; }
#endif
static INLINE CONST VECTOR_CC vfloat vupper_vf_vf(vfloat d) {
return vreinterpret_vf_vi2(vand_vi2_vi2_vi2(vreinterpret_vi2_vf(d), vcast_vi2_i(0xfffff000)));
}
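// Editor's note (illustrative addition): vupper_vf_vf is a Dekker-style split;
// masking with 0xfffff000 clears the low 12 mantissa bits, so an "upper" half
// carries at most 12 significant bits and the product of two such halves is
// exact in single precision (12 + 12 <= 24). A hypothetical scalar equivalent,
// assuming <stdint.h> and <string.h>:
//
//   static float upperf(float d) {
//     uint32_t u; memcpy(&u, &d, sizeof u);
//     u &= 0xfffff000u;                    // keep sign, exponent, top bits
//     memcpy(&d, &u, sizeof d);
//     return d;
//   }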
static INLINE CONST VECTOR_CC vfloat2 vcast_vf2_vf_vf(vfloat h, vfloat l) {
return vf2setxy_vf2_vf_vf(h, l);
}
static INLINE CONST VECTOR_CC vfloat2 vcast_vf2_f_f(float h, float l) {
return vf2setxy_vf2_vf_vf(vcast_vf_f(h), vcast_vf_f(l));
}
static INLINE CONST VECTOR_CC vfloat2 vcast_vf2_d(double d) {
return vf2setxy_vf2_vf_vf(vcast_vf_f(d), vcast_vf_f(d - (float)d));
}
static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_vf2_vf2(vopmask m, vfloat2 x, vfloat2 y) {
return vf2setxy_vf2_vf_vf(vsel_vf_vo_vf_vf(m, vf2getx_vf_vf2(x), vf2getx_vf_vf2(y)), vsel_vf_vo_vf_vf(m, vf2gety_vf_vf2(x), vf2gety_vf_vf2(y)));
}
static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_f_f_f_f(vopmask o, float x1, float y1, float x0, float y0) {
return vf2setxy_vf2_vf_vf(vsel_vf_vo_f_f(o, x1, x0), vsel_vf_vo_f_f(o, y1, y0));
}
static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
return vsel_vf2_vo_vf2_vf2(o0, vcast_vf2_d(d0), vsel_vf2_vo_vf2_vf2(o1, vcast_vf2_d(d1), vcast_vf2_d(d2)));
}
static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
return vsel_vf2_vo_vf2_vf2(o0, vcast_vf2_d(d0), vsel_vf2_vo_vf2_vf2(o1, vcast_vf2_d(d1), vsel_vf2_vo_vf2_vf2(o2, vcast_vf2_d(d2), vcast_vf2_d(d3))));
}
static INLINE CONST VECTOR_CC vfloat2 vabs_vf2_vf2(vfloat2 x) {
return vcast_vf2_vf_vf(vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0)), vreinterpret_vm_vf(vf2getx_vf_vf2(x))), vreinterpret_vm_vf(vf2getx_vf_vf2(x)))),
vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0)), vreinterpret_vm_vf(vf2getx_vf_vf2(x))), vreinterpret_vm_vf(vf2gety_vf_vf2(x)))));
}
static INLINE CONST VECTOR_CC vfloat vadd_vf_3vf(vfloat v0, vfloat v1, vfloat v2) {
return vadd_vf_vf_vf(vadd_vf_vf_vf(v0, v1), v2);
}
static INLINE CONST VECTOR_CC vfloat vadd_vf_4vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3) {
return vadd_vf_3vf(vadd_vf_vf_vf(v0, v1), v2, v3);
}
static INLINE CONST VECTOR_CC vfloat vadd_vf_5vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4) {
return vadd_vf_4vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4);
}
static INLINE CONST VECTOR_CC vfloat vadd_vf_6vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4, vfloat v5) {
return vadd_vf_5vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4, v5);
}
static INLINE CONST VECTOR_CC vfloat vadd_vf_7vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4, vfloat v5, vfloat v6) {
return vadd_vf_6vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4, v5, v6);
}
static INLINE CONST VECTOR_CC vfloat vsub_vf_3vf(vfloat v0, vfloat v1, vfloat v2) {
return vsub_vf_vf_vf(vsub_vf_vf_vf(v0, v1), v2);
}
static INLINE CONST VECTOR_CC vfloat vsub_vf_4vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3) {
return vsub_vf_3vf(vsub_vf_vf_vf(v0, v1), v2, v3);
}
static INLINE CONST VECTOR_CC vfloat vsub_vf_5vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4) {
return vsub_vf_4vf(vsub_vf_vf_vf(v0, v1), v2, v3, v4);
}
//
static INLINE CONST VECTOR_CC vfloat2 dfneg_vf2_vf2(vfloat2 x) {
return vcast_vf2_vf_vf(vneg_vf_vf(vf2getx_vf_vf2(x)), vneg_vf_vf(vf2gety_vf_vf2(x)));
}
static INLINE CONST VECTOR_CC vfloat2 dfabs_vf2_vf2(vfloat2 x) {
return vcast_vf2_vf_vf(vabs_vf_vf(vf2getx_vf_vf2(x)),
vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vf2gety_vf_vf2(x)), vand_vm_vm_vm(vreinterpret_vm_vf(vf2getx_vf_vf2(x)), vreinterpret_vm_vf(vcast_vf_f(-0.0f))))));
}
static INLINE CONST VECTOR_CC vfloat2 dfnormalize_vf2_vf2(vfloat2 t) {
vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(t), vf2gety_vf_vf2(t));
return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(t), s), vf2gety_vf_vf2(t)));
}
static INLINE CONST VECTOR_CC vfloat2 dfscale_vf2_vf2_vf(vfloat2 d, vfloat s) {
return vf2setxy_vf2_vf_vf(vmul_vf_vf_vf(vf2getx_vf_vf2(d), s), vmul_vf_vf_vf(vf2gety_vf_vf2(d), s));
}
static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf_vf(vfloat x, vfloat y) {
vfloat s = vadd_vf_vf_vf(x, y);
return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(x, s), y));
}
static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf_vf(vfloat x, vfloat y) {
vfloat s = vadd_vf_vf_vf(x, y);
vfloat v = vsub_vf_vf_vf(s, x);
return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(x, vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(y, v)));
}
static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf_vf2(vfloat x, vfloat2 y) {
vfloat s = vadd_vf_vf_vf(x, vf2getx_vf_vf2(y));
vfloat v = vsub_vf_vf_vf(s, x);
return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vadd_vf_vf_vf(vsub_vf_vf_vf(x, vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(vf2getx_vf_vf2(y), v)), vf2gety_vf_vf2(y)));
}
static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf2_vf(vfloat2 x, vfloat y) {
vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), y);
return vf2setxy_vf2_vf_vf(s, vadd_vf_3vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), s), y, vf2gety_vf_vf2(x)));
}
static INLINE CONST VECTOR_CC vfloat2 dfsub_vf2_vf2_vf(vfloat2 x, vfloat y) {
vfloat s = vsub_vf_vf_vf(vf2getx_vf_vf2(x), y);
return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), s), y), vf2gety_vf_vf2(x)));
}
static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf2_vf(vfloat2 x, vfloat y) {
vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), y);
vfloat v = vsub_vf_vf_vf(s, vf2getx_vf_vf2(x));
vfloat t = vadd_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(y, v));
return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(t, vf2gety_vf_vf2(x)));
}
static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf_vf2(vfloat x, vfloat2 y) {
vfloat s = vadd_vf_vf_vf(x, vf2getx_vf_vf2(y));
return vf2setxy_vf2_vf_vf(s, vadd_vf_3vf(vsub_vf_vf_vf(x, s), vf2getx_vf_vf2(y), vf2gety_vf_vf2(y)));
}
static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
// |x| >= |y|
vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));
return vf2setxy_vf2_vf_vf(s, vadd_vf_4vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), s), vf2getx_vf_vf2(y), vf2gety_vf_vf2(x), vf2gety_vf_vf2(y)));
}
static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));
vfloat v = vsub_vf_vf_vf(s, vf2getx_vf_vf2(x));
vfloat t = vadd_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(vf2getx_vf_vf2(y), v));
return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(t, vadd_vf_vf_vf(vf2gety_vf_vf2(x), vf2gety_vf_vf2(y))));
}
static INLINE CONST VECTOR_CC vfloat2 dfsub_vf2_vf_vf(vfloat x, vfloat y) {
// |x| >= |y|
vfloat s = vsub_vf_vf_vf(x, y);
return vf2setxy_vf2_vf_vf(s, vsub_vf_vf_vf(vsub_vf_vf_vf(x, s), y));
}
static INLINE CONST VECTOR_CC vfloat2 dfsub_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
// |x| >= |y|
vfloat s = vsub_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));
vfloat t = vsub_vf_vf_vf(vf2getx_vf_vf2(x), s);
t = vsub_vf_vf_vf(t, vf2getx_vf_vf2(y));
t = vadd_vf_vf_vf(t, vf2gety_vf_vf2(x));
return vf2setxy_vf2_vf_vf(s, vsub_vf_vf_vf(t, vf2gety_vf_vf2(y)));
}
#ifdef ENABLE_FMA_SP
static INLINE CONST VECTOR_CC vfloat2 dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) {
vfloat t = vrec_vf_vf(vf2getx_vf_vf2(d));
vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(n), t);
vfloat u = vfmapn_vf_vf_vf_vf(t, vf2getx_vf_vf2(n), s);
vfloat v = vfmanp_vf_vf_vf_vf(vf2gety_vf_vf2(d), t, vfmanp_vf_vf_vf_vf(vf2getx_vf_vf2(d), t, vcast_vf_f(1)));
return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(s, v, vfma_vf_vf_vf_vf(vf2gety_vf_vf2(n), t, u)));
}
static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf_vf(vfloat x, vfloat y) {
vfloat s = vmul_vf_vf_vf(x, y);
return vf2setxy_vf2_vf_vf(s, vfmapn_vf_vf_vf_vf(x, y, s));
}
static INLINE CONST VECTOR_CC vfloat2 dfsqu_vf2_vf2(vfloat2 x) {
vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x));
return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x)), vf2gety_vf_vf2(x), vfmapn_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x), s)));
}
static INLINE CONST VECTOR_CC vfloat dfsqu_vf_vf2(vfloat2 x) {
return vfma_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x), vadd_vf_vf_vf(vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x)), vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x))));
}
static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));
return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(y), vfma_vf_vf_vf_vf(vf2gety_vf_vf2(x), vf2getx_vf_vf2(y), vfmapn_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y), s))));
}
static INLINE CONST VECTOR_CC vfloat dfmul_vf_vf2_vf2(vfloat2 x, vfloat2 y) {
return vfma_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y), vfma_vf_vf_vf_vf(vf2gety_vf_vf2(x), vf2getx_vf_vf2(y), vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(y))));
}
static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf(vfloat2 x, vfloat y) {
vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), y);
return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(vf2gety_vf_vf2(x), y, vfmapn_vf_vf_vf_vf(vf2getx_vf_vf2(x), y, s)));
}
static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf(vfloat d) {
vfloat s = vrec_vf_vf(d);
return vf2setxy_vf2_vf_vf(s, vmul_vf_vf_vf(s, vfmanp_vf_vf_vf_vf(d, s, vcast_vf_f(1))));
}
static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf2(vfloat2 d) {
vfloat s = vrec_vf_vf(vf2getx_vf_vf2(d));
return vf2setxy_vf2_vf_vf(s, vmul_vf_vf_vf(s, vfmanp_vf_vf_vf_vf(vf2gety_vf_vf2(d), s, vfmanp_vf_vf_vf_vf(vf2getx_vf_vf2(d), s, vcast_vf_f(1)))));
}
#else // #ifdef ENABLE_FMA_SP
static INLINE CONST VECTOR_CC vfloat2 dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) {
vfloat t = vrec_vf_vf(vf2getx_vf_vf2(d));
vfloat dh = vupper_vf_vf(vf2getx_vf_vf2(d)), dl = vsub_vf_vf_vf(vf2getx_vf_vf2(d), dh);
vfloat th = vupper_vf_vf(t ), tl = vsub_vf_vf_vf(t , th);
vfloat nhh = vupper_vf_vf(vf2getx_vf_vf2(n)), nhl = vsub_vf_vf_vf(vf2getx_vf_vf2(n), nhh);
vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(n), t);
vfloat u, w;
w = vcast_vf_f(-1);
w = vmla_vf_vf_vf_vf(dh, th, w);
w = vmla_vf_vf_vf_vf(dh, tl, w);
w = vmla_vf_vf_vf_vf(dl, th, w);
w = vmla_vf_vf_vf_vf(dl, tl, w);
w = vneg_vf_vf(w);
u = vmla_vf_vf_vf_vf(nhh, th, vneg_vf_vf(s));
u = vmla_vf_vf_vf_vf(nhh, tl, u);
u = vmla_vf_vf_vf_vf(nhl, th, u);
u = vmla_vf_vf_vf_vf(nhl, tl, u);
u = vmla_vf_vf_vf_vf(s, w, u);
return vf2setxy_vf2_vf_vf(s, vmla_vf_vf_vf_vf(t, vsub_vf_vf_vf(vf2gety_vf_vf2(n), vmul_vf_vf_vf(s, vf2gety_vf_vf2(d))), u));
}
static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf_vf(vfloat x, vfloat y) {
vfloat xh = vupper_vf_vf(x), xl = vsub_vf_vf_vf(x, xh);
vfloat yh = vupper_vf_vf(y), yl = vsub_vf_vf_vf(y, yh);
vfloat s = vmul_vf_vf_vf(x, y), t;
t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(s));
t = vmla_vf_vf_vf_vf(xl, yh, t);
t = vmla_vf_vf_vf_vf(xh, yl, t);
t = vmla_vf_vf_vf_vf(xl, yl, t);
return vf2setxy_vf2_vf_vf(s, t);
}
static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf(vfloat2 x, vfloat y) {
vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);
vfloat yh = vupper_vf_vf(y ), yl = vsub_vf_vf_vf(y , yh);
vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), y), t;
t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(s));
t = vmla_vf_vf_vf_vf(xl, yh, t);
t = vmla_vf_vf_vf_vf(xh, yl, t);
t = vmla_vf_vf_vf_vf(xl, yl, t);
t = vmla_vf_vf_vf_vf(vf2gety_vf_vf2(x), y, t);
return vf2setxy_vf2_vf_vf(s, t);
}
static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);
vfloat yh = vupper_vf_vf(vf2getx_vf_vf2(y)), yl = vsub_vf_vf_vf(vf2getx_vf_vf2(y), yh);
vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y)), t;
t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(s));
t = vmla_vf_vf_vf_vf(xl, yh, t);
t = vmla_vf_vf_vf_vf(xh, yl, t);
t = vmla_vf_vf_vf_vf(xl, yl, t);
t = vmla_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(y), t);
t = vmla_vf_vf_vf_vf(vf2gety_vf_vf2(x), vf2getx_vf_vf2(y), t);
return vf2setxy_vf2_vf_vf(s, t);
}
static INLINE CONST VECTOR_CC vfloat dfmul_vf_vf2_vf2(vfloat2 x, vfloat2 y) {
vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);
vfloat yh = vupper_vf_vf(vf2getx_vf_vf2(y)), yl = vsub_vf_vf_vf(vf2getx_vf_vf2(y), yh);
return vadd_vf_6vf(vmul_vf_vf_vf(vf2gety_vf_vf2(x), yh), vmul_vf_vf_vf(xh, vf2gety_vf_vf2(y)), vmul_vf_vf_vf(xl, yl), vmul_vf_vf_vf(xh, yl), vmul_vf_vf_vf(xl, yh), vmul_vf_vf_vf(xh, yh));
}
static INLINE CONST VECTOR_CC vfloat2 dfsqu_vf2_vf2(vfloat2 x) {
vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);
vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x)), t;
t = vmla_vf_vf_vf_vf(xh, xh, vneg_vf_vf(s));
t = vmla_vf_vf_vf_vf(vadd_vf_vf_vf(xh, xh), xl, t);
t = vmla_vf_vf_vf_vf(xl, xl, t);
t = vmla_vf_vf_vf_vf(vf2getx_vf_vf2(x), vadd_vf_vf_vf(vf2gety_vf_vf2(x), vf2gety_vf_vf2(x)), t);
return vf2setxy_vf2_vf_vf(s, t);
}
static INLINE CONST VECTOR_CC vfloat dfsqu_vf_vf2(vfloat2 x) {
vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);
return vadd_vf_5vf(vmul_vf_vf_vf(xh, vf2gety_vf_vf2(x)), vmul_vf_vf_vf(xh, vf2gety_vf_vf2(x)), vmul_vf_vf_vf(xl, xl), vadd_vf_vf_vf(vmul_vf_vf_vf(xh, xl), vmul_vf_vf_vf(xh, xl)), vmul_vf_vf_vf(xh, xh));
}
static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf(vfloat d) {
vfloat t = vrec_vf_vf(d);
vfloat dh = vupper_vf_vf(d), dl = vsub_vf_vf_vf(d, dh);
vfloat th = vupper_vf_vf(t), tl = vsub_vf_vf_vf(t, th);
vfloat u = vcast_vf_f(-1);
u = vmla_vf_vf_vf_vf(dh, th, u);
u = vmla_vf_vf_vf_vf(dh, tl, u);
u = vmla_vf_vf_vf_vf(dl, th, u);
u = vmla_vf_vf_vf_vf(dl, tl, u);
return vf2setxy_vf2_vf_vf(t, vmul_vf_vf_vf(vneg_vf_vf(t), u));
}
static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf2(vfloat2 d) {
vfloat t = vrec_vf_vf(vf2getx_vf_vf2(d));
vfloat dh = vupper_vf_vf(vf2getx_vf_vf2(d)), dl = vsub_vf_vf_vf(vf2getx_vf_vf2(d), dh);
vfloat th = vupper_vf_vf(t ), tl = vsub_vf_vf_vf(t , th);
vfloat u = vcast_vf_f(-1);
u = vmla_vf_vf_vf_vf(dh, th, u);
u = vmla_vf_vf_vf_vf(dh, tl, u);
u = vmla_vf_vf_vf_vf(dl, th, u);
u = vmla_vf_vf_vf_vf(dl, tl, u);
u = vmla_vf_vf_vf_vf(vf2gety_vf_vf2(d), t, u);
return vf2setxy_vf2_vf_vf(t, vmul_vf_vf_vf(vneg_vf_vf(t), u));
}
#endif // #ifdef ENABLE_FMA_SP
static INLINE CONST VECTOR_CC vfloat2 dfsqrt_vf2_vf2(vfloat2 d) {
#ifdef ENABLE_RECSQRT_SP
vfloat x = vrecsqrt_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)));
vfloat2 r = dfmul_vf2_vf2_vf(d, x);
return dfscale_vf2_vf2_vf(dfmul_vf2_vf2_vf2(r, dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf(r, x), vcast_vf_f(-3.0))), vcast_vf_f(-0.5));
#else
vfloat t = vsqrt_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)));
return dfscale_vf2_vf2_vf(dfmul_vf2_vf2_vf2(dfadd2_vf2_vf2_vf2(d, dfmul_vf2_vf_vf(t, t)), dfrec_vf2_vf(t)), vcast_vf_f(0.5));
#endif
}
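// Editor's note (not part of the upstream file): the ENABLE_RECSQRT_SP branch
// above is one Newton-Raphson step built on a reciprocal-sqrt estimate
// x ~= 1/sqrt(d): with r = d*x, the update -0.5 * r * (r*x - 3) equals
// r * (3 - d*x*x) / 2, the standard refinement toward sqrt(d).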
static INLINE CONST VECTOR_CC vfloat2 dfsqrt_vf2_vf(vfloat d) {
vfloat t = vsqrt_vf_vf(d);
return dfscale_vf2_vf2_vf(dfmul_vf2_vf2_vf2(dfadd2_vf2_vf_vf2(d, dfmul_vf2_vf_vf(t, t)), dfrec_vf2_vf(t)), vcast_vf_f(0.5f));
}

View File

@@ -0,0 +1,40 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
// These are macros for evaluating polynomials using Estrin's method
#define POLY2(x, c1, c0) MLA(x, C2V(c1), C2V(c0))
#define POLY3(x, x2, c2, c1, c0) MLA(x2, C2V(c2), MLA(x, C2V(c1), C2V(c0)))
#define POLY4(x, x2, c3, c2, c1, c0) MLA(x2, MLA(x, C2V(c3), C2V(c2)), MLA(x, C2V(c1), C2V(c0)))
#define POLY5(x, x2, x4, c4, c3, c2, c1, c0) MLA(x4, C2V(c4), POLY4(x, x2, c3, c2, c1, c0))
#define POLY6(x, x2, x4, c5, c4, c3, c2, c1, c0) MLA(x4, POLY2(x, c5, c4), POLY4(x, x2, c3, c2, c1, c0))
#define POLY7(x, x2, x4, c6, c5, c4, c3, c2, c1, c0) MLA(x4, POLY3(x, x2, c6, c5, c4), POLY4(x, x2, c3, c2, c1, c0))
#define POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0) MLA(x4, POLY4(x, x2, c7, c6, c5, c4), POLY4(x, x2, c3, c2, c1, c0))
#define POLY9(x, x2, x4, x8, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
MLA(x8, C2V(c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY10(x, x2, x4, x8, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
MLA(x8, POLY2(x, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY11(x, x2, x4, x8, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
MLA(x8, POLY3(x, x2, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY12(x, x2, x4, x8, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
MLA(x8, POLY4(x, x2, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY13(x, x2, x4, x8, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
MLA(x8, POLY5(x, x2, x4, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY14(x, x2, x4, x8, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
MLA(x8, POLY6(x, x2, x4, cd, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY15(x, x2, x4, x8, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
MLA(x8, POLY7(x, x2, x4, ce, cd, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
MLA(x8, POLY8(x, x2, x4, cf, ce, cd, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY17(x, x2, x4, x8, x16, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
MLA(x16, C2V(d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY18(x, x2, x4, x8, x16, d1, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
MLA(x16, POLY2(x, d1, d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY19(x, x2, x4, x8, x16, d2, d1, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
MLA(x16, POLY3(x, x2, d2, d1, d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY20(x, x2, x4, x8, x16, d3, d2, d1, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
MLA(x16, POLY4(x, x2, d3, d2, d1, d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY21(x, x2, x4, x8, x16, d4, d3, d2, d1, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
MLA(x16, POLY5(x, x2, x4, d4, d3, d2, d1, d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))
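// Editor's note (illustrative addition): Estrin's scheme pairs coefficients so
// the multiply-adds form a balanced tree and can issue in parallel, at the
// cost of precomputing x2 = x*x, x4 = x2*x2, ... once per call site. For
// example, POLY4 evaluates c3*x^3 + c2*x^2 + c1*x + c0 as
// x2*(c3*x + c2) + (c1*x + c0); a scalar sketch, assuming <math.h>:
//
//   static double poly4(double x, double x2,
//                       double c3, double c2, double c1, double c0) {
//     return fma(x2, fma(x, c3, c2), fma(x, c1, c0));
//   }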

View File

@@ -0,0 +1,92 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <quadmath.h>
#include <inttypes.h>
static __float128 mpfr_get_f128(mpfr_t m, mpfr_rnd_t rnd) {
if (isnan(mpfr_get_d(m, GMP_RNDN))) return __builtin_nan("");
mpfr_t frr, frd;
mpfr_inits(frr, frd, NULL);
mpfr_exp_t e;
mpfr_frexp(&e, frr, m, GMP_RNDN);
double d0 = mpfr_get_d(frr, GMP_RNDN);
mpfr_set_d(frd, d0, GMP_RNDN);
mpfr_sub(frr, frr, frd, GMP_RNDN);
double d1 = mpfr_get_d(frr, GMP_RNDN);
mpfr_set_d(frd, d1, GMP_RNDN);
mpfr_sub(frr, frr, frd, GMP_RNDN);
double d2 = mpfr_get_d(frr, GMP_RNDN);
mpfr_clears(frr, frd, NULL);
return ldexpq((__float128)d2 + (__float128)d1 + (__float128)d0, e);
}
static void mpfr_set_f128(mpfr_t frx, __float128 f, mpfr_rnd_t rnd) {
char s[128];
quadmath_snprintf(s, 120, "%.50Qg", f);
mpfr_set_str(frx, s, 10, rnd);
}
static void printf128(__float128 f) {
char s[128];
quadmath_snprintf(s, 120, "%.50Qg", f);
printf("%s", s);
}
static char frstr[16][1000];
static int frstrcnt = 0;
static char *toBC(double d) {
union {
double d;
uint64_t u64;
int64_t i64;
} cnv;
cnv.d = d;
int64_t l = cnv.i64;
int e = (int)((l >> 52) & ~(-1L << 11));
int s = (int)(l >> 63);
l = d == 0 ? 0 : ((l & ~((-1L) << 52)) | (1L << 52));
char *ptr = frstr[(frstrcnt++) & 15];
sprintf(ptr, "%s%lld*2^%d", s != 0 ? "-" : "", (long long int)l, (e-0x3ff-52));
return ptr;
}
static char *toBCq(__float128 d) {
union {
__float128 d;
__uint128_t u128;
} cnv;
cnv.d = d;
__uint128_t m = cnv.u128;
int e = (int)((m >> 112) & ~(-1L << 15));
int s = (int)(m >> 127);
m = d == 0 ? 0 : ((m & ((((__uint128_t)1) << 112)-1)) | ((__uint128_t)1 << 112));
uint64_t h = m / UINT64_C(10000000000000000000);
uint64_t l = m % UINT64_C(10000000000000000000);
char *ptr = frstr[(frstrcnt++) & 15];
sprintf(ptr, "%s%" PRIu64 "%019" PRIu64 "*2^%d", s != 0 ? "-" : "", h, l, (e-0x3fff-112));
return ptr;
}
static int xisnanq(Sleef_quad x) { return x != x; }
static int xisinfq(Sleef_quad x) { return x == (Sleef_quad)__builtin_inf() || x == -(Sleef_quad)__builtin_inf(); }
static int xisfiniteq(Sleef_quad x) { return !xisnanq(x) && !xisinfq(x); }
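// Editor's note (illustrative addition): toBC/toBCq above print a value
// exactly as "m*2^e" so it can be pasted into an arbitrary-precision
// calculator such as bc for reference comparison. Hypothetical usage:
//
//   printf("%s\n", toBC(0.1));   // prints 7205759403792794*2^-56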

View File

@@ -0,0 +1,683 @@
double2
double3
float2
atan2k
atan2kf
atan2kf_u1
atan2k_u1
cospik
cospifk
dd
dd2
dd2geta_vd2_dd2
dd2getb_vd2_dd2
dd2setab_dd2_vd2_vd2
ddabs_vd2_vd2
ddadd2_vd2_vd2_vd
ddadd2_vd2_vd2_vd2
ddadd2_vd2_vd_vd
ddadd2_vd2_vd_vd2
ddadd_vd2_vd2_vd
ddadd_vd2_vd2_vd2
ddadd_vd2_vd_vd
ddadd_vd2_vd_vd2
dddiv_vd2_vd2_vd2
ddi
ddi_t
ddigetdd_vd2_ddi
ddigeti_vi_ddi
ddisetdd_ddi_ddi_vd2
ddisetddi_ddi_vd2_vi
ddmla_vd2_vd2_vd2_vd2
ddmla_vd2_vd_vd2_vd2
ddmul_vd2_vd2_vd
ddmul_vd2_vd2_vd2
ddmul_vd2_vd_vd
ddmul_vd_vd2_vd2
ddneg_vd2_vd2
ddnormalize_vd2_vd2
ddrec_vd2_vd
ddrec_vd2_vd2
ddscale_vd2_vd2_d
ddscale_vd2_vd2_vd
ddsqrt_vd2_vd
ddsqrt_vd2_vd2
ddsqu_vd2_vd2
ddsqu_vd_vd2
ddsub_vd2_vd2_vd
ddsub_vd2_vd2_vd2
ddsub_vd2_vd_vd
df
df2
df2geta_vf2_df2
df2getb_vf2_df2
df2setab_df2_vf2_vf2
dfabs_vf2_vf2
dfadd2_vf2_vf2_vf
dfadd2_vf2_vf2_vf2
dfadd2_vf2_vf_vf
dfadd2_vf2_vf_vf2
dfadd_vf2_vf2_vf
dfadd_vf2_vf2_vf2
dfadd_vf2_vf_vf
dfadd_vf2_vf_vf2
dfdiv_vf2_vf2_vf2
dfi
dfi_t
dfigetdf_vf2_dfi
dfigeti_vi2_dfi
dfisetdf_dfi_dfi_vf2
dfisetdfi_dfi_vf2_vi2
dfmla_vf2_vf_vf2_vf2
dfmul_vf2_vf2_vf
dfmul_vf2_vf2_vf2
dfmul_vf2_vf_vf
dfmul_vf_vf2_vf2
dfneg_vf2_vf2
dfnormalize_vf2_vf2
dfrec_vf2_vf
dfrec_vf2_vf2
dfscale_vf2_vf2_vf
dfsqrt_vf2_vf
dfsqrt_vf2_vf2
dfsqu_vf2_vf2
dfsqu_vf_vf2
dfsub_vf2_vf2_vf
dfsub_vf2_vf2_vf2
dfsub_vf2_vf_vf
di_t
digetd_vd_di
digeti_vi_di
disetdi_di_vd_vi
expk
expk2
expk2f
expk3f
expkf
expm1fk
expm1k
fi_t
figetd_vf_di
figeti_vi2_di
fisetdi_fi_vf_vi2
gammafk
gammak
imdvq_vq_vm_vm
logk
logk2
logk2f
logk3f
logkf
poly2dd
poly2dd_b
poly2df
poly2df_b
poly4dd
poly4df
pragma
rempi
rempif
rempisub
rempisubf
sinpifk
sinpik
td
tdi_t
tdigeti_vi_tdi
tdigettd_vd3_tdi
tdigetx_vd_tdi
tdisettdi_tdi_vd3_vi
tdx
tdxgetd3_vd3_tdx
tdxgetd3x_vd_tdx
tdxgetd3y_vd_tdx
tdxgetd3z_vd_tdx
tdxgete_vm_tdx
tdxsetd3_tdx_tdx_vd3
tdxsete_tdx_tdx_vm
tdxseted3_tdx_vm_vd3
tdxsetexyz_tdx_vm_vd_vd_vd
tdxsetx_tdx_tdx_vd
tdxsetxyz_tdx_tdx_vd_vd_vd
tdxsety_tdx_tdx_vd
tdxsetz_tdx_tdx_vd
vabs_vd_vd
vabs_vf2_vf2
vabs_vf_vf
add128_vq_vq_vq
vadd64_vm_vm_vm
vadd_vd_3vd
vadd_vd_4vd
vadd_vd_5vd
vadd_vd_6vd
vadd_vd_7vd
vadd_vd_vd_vd
vadd_vf_3vf
vadd_vf_4vf
vadd_vf_5vf
vadd_vf_6vf
vadd_vf_7vf
vadd_vf_vf_vf
vadd_vi2_vi2_vi2
vadd_vi_vi_vi
vand_vi2_vi2_vi2
vand_vi2_vo_vi2
vand_vi_vi_vi
vand_vi_vo_vi
vand_vm_vm_vm
vand_vm_vo32_vm
vand_vm_vo64_vm
vand_vo_vo_vo
vandnot_vi2_vi2_vi2
vandnot_vi2_vo_vi2
vandnot_vi_vi_vi
vandnot_vi_vo_vi
vandnot_vm_vm_vm
vandnot_vm_vo32_vm
vandnot_vm_vo64_vm
vandnot_vo_vo_vo
vargquad
vavailability_i
cast_aq_vq
vcast_d_vd
vcast_f_vf
vcast_vd2_d2
vcast_vd2_d_d
vcast_vd2_vd_vd
vcast_vd_d
vcast_vd_vi
vcast_vd_vm
vcast_vf2_d
vcast_vf2_f_f
vcast_vf2_vf_vf
vcast_vf_f
vcast_vf_vi2
vcast_vi2_i
vcast_vi2_i_i
vcast_vi2_vm
vcast_vi_i
vcast_vi_vm
vcast_vm_i64
vcast_vm_i_i
vcast_vm_u64
vcast_vm_vi
vcast_vm_vi2
vcast_vm_vo
vcast_vo_i
vcast_vo32_vo64
vcast_vo64_vo32
cast_vq_aq
vclearlsb_vd_vd_i
vcopysign_vd_vd_vd
vcopysign_vf_vf_vf
vd
vd2getx_vd_vd2
vd2gety_vd_vd2
vd2setx_vd2_vd2_vd
vd2setxy_vd2_vd_vd
vd2sety_vd2_vd2_vd
vd3getx_vd_vd3
vd3gety_vd_vd3
vd3getz_vd_vd3
vd3setx_vd3_vd3_vd
vd3setxyz_vd3_vd_vd_vd
vd3sety_vd3_vd3_vd
vd3setz_vd3_vd3_vd
vdiv_vd_vd_vd
vdiv_vf_vf_vf
vdouble
vdouble2
vdouble3
veq64_vo_vm_vm
veq_vi2_vi2_vi2
veq_vi_vi_vi
veq_vo_vd_vd
veq_vo_vf_vf
veq_vo_vi2_vi2
veq_vo_vi_vi
versatileVector
vf2getx_vf_vf2
vf2gety_vf_vf2
vf2setx_vf2_vf2_vf
vf2setxy_vf2_vf_vf
vf2sety_vf2_vf2_vf
vfloat
vfloat2
vfma_vd_vd_vd_vd
vfma_vf_vf_vf_vf
vfmann_vd_vd_vd_vd
vfmann_vf_vf_vf_vf
vfmanp_vd_vd_vd_vd
vfmanp_vf_vf_vf_vf
vfmapn_vd_vd_vd_vd
vfmapn_vf_vf_vf_vf
vfmapp_vd_vd_vd_vd
vfmapp_vf_vf_vf_vf
vgather_vd_p_vi
vgather_vf_p_vi2
vge_vo_vd_vd
vge_vo_vf_vf
vgetexp_vd_vd
vgetexp_vf_vf
vgetmant_vd_vd
vgetmant_vf_vf
vgt64_vo_vm_vm
vgt_vi2_vi2_vi2
vgt_vi_vi_vi
vgt_vo_vd_vd
vgt_vo_vf_vf
vgt_vo_vi2_vi2
vgt_vo_vi_vi
vilogb2k_vi2_vf
vilogb2k_vi_vd
vilogb2k_vm_vd
vilogb3k_vm_vd
vilogbk_vi2_vf
vilogbk_vi_vd
vilogbk_vm_vd
vint
vint2
vint64
visinf2_vd_vd_vd
visinf2_vf_vf_vf
visinf_vo_vd
visinf_vo_vf
visint_vo_vd
visint_vo_vf
visminf_vo_vd
visminf_vo_vf
visnan_vo_vd
visnan_vo_vf
visnegzero_vo_vd
visnegzero_vo_vf
visnonfinite_vo_vd
visnumber_vo_vd
visnumber_vo_vf
visodd_vo_vd
vispinf_vo_vd
vispinf_vo_vf
vldexp1_vd_vd_vm
vldexp2_vd_vd_vi
vldexp2_vd_vd_vm
vldexp2_vf_vf_vi2
vldexp3_vd_vd_vi
vldexp3_vd_vd_vm
vldexp3_vf_vf_vi2
vldexp_vd_vd_vi
vldexp_vf_vf_vi2
vle_vo_vd_vd
vle_vo_vf_vf
vload_vd_p
vload_vf_p
vloadu_vd_p
vloadu_vf_p
vloadu_vi2_p
vloadu_vi_p
loadu_vq_p
vlt64_vo_vm_vm
vlt_vo_vd_vd
vlt_vo_vf_vf
vmask
vmax_vd_vd_vd
vmax_vf_vf_vf
vmin_vd_vd_vd
vmin_vf_vf_vf
vmla_vd_vd_vd_vd
vmla_vf_vf_vf_vf
vmlanp_vd_vd_vd_vd
vmlanp_vf_vf_vf_vf
vmlapn_vd_vd_vd_vd
vmlapn_vf_vf_vf_vf
vmlsubadd_vd_vd_vd_vd
vmlsubadd_vf_vf_vf_vf
vmul_vd_vd_vd
vmul_vf_vf_vf
vmulsign_vd_vd_vd
vmulsign_vf_vf_vf
vneg64_vm_vm
vneg_vd_vd
vneg_vf_vf
vneg_vi2_vi2
vneg_vi_vi
vnegpos_vd_vd
vnegpos_vf_vf
vneq_vo_vd_vd
vneq_vo_vf_vf
vnot_vo32_vo32
vnot_vo64_vo64
vopmask
vor_vi2_vi2_vi2
vor_vi_vi_vi
vor_vm_vm_vm
vor_vm_vo32_vm
vor_vm_vo64_vm
vor_vo_vo_vo
vorsign_vd_vd_vd
vorsign_vf_vf_vf
vposneg_vd_vd
vposneg_vf_vf
vpow2i_vd_vi
vpow2i_vd_vm
vpow2i_vf_vi2
vprefetch_v_p
vptrunc_vd_vd
vptrunc_vf_vf
vqgetx_vm_vq
vqgety_vm_vq
vqsetx_vq_vq_vm
vqsetxy_vq_vm_vm
vqsety_vq_vq_vm
vquad
vrec_vd_vd
vrec_vf_vf
vreinterpret_vd_vf
vreinterpret_vd_vm
vreinterpret_vf_vd
vreinterpret_vf_vi2
vreinterpret_vf_vm
vreinterpret_vi2_vf
vreinterpret_vi64_vm
vreinterpret_vm_vd
vreinterpret_vm_vf
vreinterpret_vm_vi64
vreinterpret_vm_vu64
vreinterpret_vu64_vm
vrev21_vd_vd
vrev21_vf_vf
vreva2_vd_vd
vreva2_vf_vf
vrint_vd_vd
vrint2_vd_vd
vrint_vf_vf
vrint_vi2_vf
vrint_vi_vd
vrintfk2_vf_vf
vrintk2_vd_vd
vscatter2_v_p_i_i_vd
vscatter2_v_p_i_i_vf
vsel_vd2_vo_d_d_d_d
vsel_vd2_vo_vd2_vd2
vsel_vd_vo_d_d
vsel_vd_vo_vd_vd
vsel_vd_vo_vo_d_d_d
vsel_vd_vo_vo_vo_d_d_d_d
vsel_vf2_vo_f_f_f_f
vsel_vf2_vo_vf2_vf2
vsel_vf2_vo_vo_d_d_d
vsel_vf2_vo_vo_vo_d_d_d_d
vsel_vf_vo_f_f
vsel_vf_vo_vf_vf
vsel_vf_vo_vo_f_f_f
vsel_vf_vo_vo_vo_f_f_f_f
vsel_vi2_vf_vf_vi2_vi2
vsel_vi2_vf_vi2
vsel_vi2_vo_vi2_vi2
vsel_vi_vd_vd_vi_vi
vsel_vi_vd_vi
vsel_vi_vo_vi_vi
vsel_vm_vo64_vm_vm
sel_vq_vo_vq_vq
vsign_vd_vd
vsign_vf_vf
vsignbit_vm_vd
vsignbit_vm_vf
vsignbit_vo_vd
vsignbit_vo_vf
vsll_vi2_vi2_i
vsll_vi_vi_i
vsqrt_vd_vd
vsqrt_vf_vf
vsra_vi2_vi2_i
vsra_vi_vi_i
vsrl_vi2_vi2_i
vsrl_vi_vi_i
vsscatter2_v_p_i_i_vd
vsscatter2_v_p_i_i_vf
vstore_v_p_vd
vstore_v_p_vf
vstoreu_v_p_vd
vstoreu_v_p_vf
vstoreu_v_p_vi
vstoreu_v_p_vi2
storeu_v_p_vq
vstream_v_p_vd
vstream_v_p_vf
vsub64_vm_vm_vm
vsub_vd_3vd
vsub_vd_4vd
vsub_vd_5vd
vsub_vd_6vd
vsub_vd_vd_vd
vsub_vf_3vf
vsub_vf_4vf
vsub_vf_5vf
vsub_vf_vf_vf
vsub_vi2_vi2_vi2
vsub_vi_vi_vi
vsubadd_vd_vd_vd
vsubadd_vf_vf_vf
vtestallones_i_vo32
vtestallones_i_vo64
vtestallzeros_i_vo64
vtoward0_vd_vd
vtoward0_vf_vf
vtruncate_vd_vd
vtruncate2_vd_vd
vtruncate_vf_vf
vtruncate_vi2_vf
vtruncate_vi_vd
vtruncate_vm_vd
vugt64_vo_vm_vm
vuint64
vupper_vd_vd
vupper_vf_vf
vxor_vi2_vi2_vi2
vxor_vi_vi_vi
vxor_vm_vm_vm
vxor_vm_vo32_vm
vxor_vm_vo64_vm
vxor_vo_vo_vo
#
abs_tdx_tdx
abs_vd3_vd3
acos_tdx_tdx
acosh_tdx_tdx
add2_vd3_vd2_vd3
add2_vd3_vd3_vd3
add2_vd3_vd_vd3
add_tdx_tdx_tdx
add_vd3_vd2_vd3
add_vd3_vd_vd3
asin_tdx_tdx
asinh_tdx_tdx
atan2_tdx_tdx_tdx
atan_tdx_tdx
atanh_tdx_tdx
cast_tdx_d
cast_tdx_d_d_d
cast_tdx_vd
cast_tdx_vd3
cast_tdx_vq
cast_vd3_d3
cast_vd3_d_d_d
cast_vd3_tdx
cast_vd3_vd_vd_vd
cast_vd_tdx
cast_vq_tdx
cmp_vm_tdx_tdx
cmpcnv_vq_vq
cos_tdx_tdx
cosh_tdx_tdx
div2_vd3_vd3_vd3
div_tdx_tdx_tdx
div_vd3_vd3_vd3
eq_vo_tdx_tdx
exp10_tdx_tdx
exp10i
exp10tab
exp2_tdx_tdx
exp_tdx_tdx
expm1_tdx_tdx
fastcast_tdx_vd3
fastcast_tdx_vq
fastcast_vq_tdx
ge_vo_tdx_tdx
gt_vo_tdx_tdx
ilogb_vm_tdx
isinf_vo_vq
isint_vo_tdx
isminf_vo_vq
isnan_vo_tdx
isnan_vo_vq
isnonfinite_vo_vq
isnonfinite_vo_vq_vq
isnonfinite_vo_vq_vq_vq
isodd_vo_tdx
ispinf_vo_vq
iszero_vo_tdx
iszero_vo_vq
le_vo_tdx_tdx
log10_tdx_tdx
log1p_tdx_tdx
log2_tdx_tdx
log_tdx_tdx
logk_tdx_tdx
lt_vo_tdx_tdx
mla_vd3_vd3_vd3_vd3
modf_tdx_tdx_ptdx
mul2_vd3_vd3_vd3
mul_tdx_tdx_tdx
mul_vd3_vd2_vd2
mul_vd3_vd2_vd3
mul_vd3_vd3_vd
mul_vd3_vd3_vd2
mul_vd3_vd3_vd3
mulsign_tdx_tdx_vd
mulsign_vd3_vd3_vd
mulsign_vq_vq_vq
neg_tdx_tdx
neg_vd3_vd3
neq_vo_tdx_tdx
normalize_vd3_vd3
poly10dd
poly10dd_b
poly11dd
poly11dd_b
poly12dd
poly12dd_b
poly13dd
poly13dd_b
poly14dd
poly14dd_b
poly15dd
poly15dd_b
poly16dd
poly16dd_b
poly17dd
poly17dd_b
poly18dd
poly18dd_b
poly19dd
poly19dd_b
poly20dd
poly20dd_b
poly21dd
poly21dd_b
poly22dd
poly22dd_b
poly23dd
poly23dd_b
poly24dd
poly24dd_b
poly25dd
poly25dd_b
poly26dd
poly26dd_b
poly27dd
poly27dd_b
poly2d
poly2td
poly2td_b
poly3d
poly3dd
poly3dd_b
poly3td
poly3td_b
poly4d
poly4dd_b
poly4td
poly4td_b
poly5d
poly5dd
poly5dd_b
poly5td
poly5td_b
poly6d
poly6dd
poly6dd_b
poly6td
poly6td_b
poly7d
poly7dd
poly7dd_b
poly7td
poly7td_b
poly8d
poly8dd
poly8dd_b
poly8td
poly8td_b
poly9dd
poly9dd_b
pow_tdx_tdx_tdx
quickrenormalize_vd3_vd3
quicktwosum_vd2_vd_vd
rec_vd3_vd2
rec_vd3_vd3
rempio2q
scale_vd3_vd3_d
scale_vd3_vd3_vd
scaleadd2_vd3_vd3_vd3_vd
scalesub2_vd3_vd3_vd3_vd
sel_tdx_vo_tdx_tdx
sel_vd3_vo_vd3_vd3
signbit_vo_tdx
sin_tdx_tdx
sinh_tdx_tdx
slowcast_vq_tdx
snprintquad
snprintquadhex
sqrt_tdx_tdx
sqrt_vd3_vd3
squ_vd3_vd3
sub2_vd3_vd3_vd3
sub_tdx_tdx_tdx
tan_tdx_tdx
tanh_tdx_tdx
twoprod_vd2_vd_vd
twosub_vd2_vd_vd
twosubx_vd2_vd_vd_vd
twosum_vd2_vd_vd
twosumx_vd2_vd_vd_vd
vtruncate2_vd_vd
vfloor2_vd_vd
vceil2_vd_vd
vround2_vd_vd
isinf_vo_tdx
trunc_tdx_tdx
rint_tdx_tdx
fmod_tdx_tdx_tdx
remainder_tdx_tdx_tdx
cbrt_tdx_tdx
frexp_tdx_tdx_pvi
fma_tdx_tdx_tdx_tdx
hypot_tdx_tdx_tdx
ilogb_vi_tdx
ldexp_tdx_tdx_vi
Sleef_rempitabsp
Sleef_rempitabdp
Sleef_rempitabqp
vcastu_vm_vi
vcastu_vi_vm
rvv_sp_vopmask
rvv_dp_vopmask

View File

@@ -0,0 +1,50 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <signal.h>
#include <setjmp.h>
#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__)
static jmp_buf sigjmp;
#define SETJMP(x) setjmp(x)
#define LONGJMP longjmp
#else
static sigjmp_buf sigjmp;
#define SETJMP(x) sigsetjmp(x, 1)
#define LONGJMP siglongjmp
#endif
int main2(int argc, char **argv);
int check_feature(double, float);
static void sighandler(int signum) {
LONGJMP(sigjmp, 1);
}
int detectFeature() {
signal(SIGILL, sighandler);
if (SETJMP(sigjmp) == 0) {
int r = check_feature(1.0, 1.0f);
signal(SIGILL, SIG_DFL);
return r;
} else {
signal(SIGILL, SIG_DFL);
return 0;
}
}
int main(int argc, char **argv) {
if (!detectFeature()) {
printf("0\n");
fclose(stdout);
exit(0);
}
return main2(argc, argv);
}
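// Editor's note (illustrative addition): check_feature is expected to execute
// at least one instruction from the tested ISA extension; on a CPU without it,
// the resulting SIGILL is caught by the handler above and detectFeature
// reports 0 ("unsupported"). A hypothetical AVX probe, not part of this file:
//
//   #include <immintrin.h>
//   int check_feature(double d, float f) {
//     __m256d v = _mm256_set1_pd(d);       // executes an AVX instruction
//     return _mm256_cvtsd_f64(v) == d;
//   }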

View File

@@ -0,0 +1,332 @@
// Copyright Naoki Shibata and contributors 2010 - 2024.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
//
#ifndef __MISC_H__
#define __MISC_H__
#if !defined(SLEEF_GENHEADER)
#include <stdint.h>
#include <string.h>
#endif
#ifndef M_PI
#define M_PI 3.141592653589793238462643383279502884
#endif
#ifndef M_PIl
#define M_PIl 3.141592653589793238462643383279502884L
#endif
#ifndef M_1_PI
#define M_1_PI 0.318309886183790671537767526745028724
#endif
#ifndef M_1_PIl
#define M_1_PIl 0.318309886183790671537767526745028724L
#endif
#ifndef M_2_PI
#define M_2_PI 0.636619772367581343075535053490057448
#endif
#ifndef M_2_PIl
#define M_2_PIl 0.636619772367581343075535053490057448L
#endif
#if !defined(SLEEF_GENHEADER)
#ifndef SLEEF_FP_ILOGB0
#define SLEEF_FP_ILOGB0 ((int)0x80000000)
#endif
#ifndef SLEEF_FP_ILOGBNAN
#define SLEEF_FP_ILOGBNAN ((int)2147483647)
#endif
#endif
#define SLEEF_SNAN (((union { long long int i; double d; }) { .i = INT64_C(0x7ff0000000000001) }).d)
#define SLEEF_SNANf (((union { long int i; float f; }) { .i = 0xff800001 }).f)
#define SLEEF_FLT_MIN 0x1p-126
#define SLEEF_DBL_MIN 0x1p-1022
#define SLEEF_INT_MAX 2147483647
#define SLEEF_DBL_DENORM_MIN 4.9406564584124654e-324
#define SLEEF_FLT_DENORM_MIN 1.40129846e-45F
//
/*
PI_A to PI_D are constants that satisfy the following two conditions.
* For PI_A, PI_B and PI_C, the last 28 bits are zero.
* PI_A + PI_B + PI_C + PI_D is as close to PI as possible.
The argument of a trig function is multiplied by 1/PI, and the
integral part is divided into two parts, each of which has at most 28
bits. So the maximum argument that can be correctly reduced should be
2^(28*2-1) PI = 1.1e+17. However, due to the internal double-precision
calculation, the actual maximum argument that can be correctly reduced
is around 2^47.
*/
#define PI_A 3.1415926218032836914
#define PI_B 3.1786509424591713469e-08
#define PI_C 1.2246467864107188502e-16
#define PI_D 1.2736634327021899816e-24
#define TRIGRANGEMAX 1e+14
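// Editor's note (illustrative addition): a scalar sketch of the Cody-Waite
// style reduction these constants support, assuming <math.h>. For quotients q
// below 2^28 each product q*PI_x is exact because PI_A..PI_C end in 28 zero
// bits; larger quotients are split into two 28-bit parts as described above.
//
//   double q = nearbyint(x * M_1_PI);
//   double r = x - q * PI_A;
//   r = r - q * PI_B;
//   r = r - q * PI_C;
//   r = r - q * PI_D;                      // reduced argument in [-pi/2, pi/2]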
/*
PI_A2 and PI_B2 are constants that satisfy the following two conditions.
* The last 3 bits of PI_A2 are zero.
* PI_A2 + PI_B2 is as close to PI as possible.
The argument of a trig function is multiplied by 1/PI, and the
integral part is multiplied by PI_A2. So the maximum argument that
can be correctly reduced should be 2^(3-1) PI = 12.6. By testing, we
confirmed that it correctly reduces arguments up to around 15.
*/
#define PI_A2 3.141592653589793116
#define PI_B2 1.2246467991473532072e-16
#define TRIGRANGEMAX2 15
#define M_2_PI_H 0.63661977236758138243
#define M_2_PI_L -3.9357353350364971764e-17
#define SQRT_DBL_MAX 1.3407807929942596355e+154
#define TRIGRANGEMAX3 1e+9
#define M_4_PI 1.273239544735162542821171882678754627704620361328125
#define L2U .69314718055966295651160180568695068359375
#define L2L .28235290563031577122588448175013436025525412068e-12
#define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931
#define L10U 0.30102999566383914498 // log 2 / log 10
#define L10L 1.4205023227266099418e-13
#define LOG10_2 3.3219280948873623478703194294893901758648313930
#define L10Uf 0.3010253906f
#define L10Lf 4.605038981e-06f
//
#define PI_Af 3.140625f
#define PI_Bf 0.0009670257568359375f
#define PI_Cf 6.2771141529083251953e-07f
#define PI_Df 1.2154201256553420762e-10f
#define TRIGRANGEMAXf 39000
#define PI_A2f 3.1414794921875f
#define PI_B2f 0.00011315941810607910156f
#define PI_C2f 1.9841872589410058936e-09f
#define TRIGRANGEMAX2f 125.0f
#define TRIGRANGEMAX4f 8e+6f
#define SQRT_FLT_MAX 18446743523953729536.0
#define L2Uf 0.693145751953125f
#define L2Lf 1.428606765330187045e-06f
#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f
#ifndef M_PIf
# define M_PIf ((float)M_PI)
#endif
//
#ifndef MIN
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#endif
#ifndef MAX
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#endif
#ifndef ABS
#define ABS(x) ((x) < 0 ? -(x) : (x))
#endif
#define stringify(s) stringify_(s)
#define stringify_(s) #s
#if !defined(SLEEF_GENHEADER)
typedef long double longdouble;
#endif
#if !defined(Sleef_double2_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_double2_DEFINED
typedef struct {
double x, y;
} Sleef_double2;
#endif
#if !defined(Sleef_float2_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_float2_DEFINED
typedef struct {
float x, y;
} Sleef_float2;
#endif
#if !defined(Sleef_longdouble2_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_longdouble2_DEFINED
typedef struct {
long double x, y;
} Sleef_longdouble2;
#endif
#if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
#define LIKELY(condition) __builtin_expect(!!(condition), 1)
#define UNLIKELY(condition) __builtin_expect(!!(condition), 0)
#define RESTRICT __restrict__
#ifndef __arm__
#define ALIGNED(x) __attribute__((aligned(x)))
#else
#define ALIGNED(x)
#endif
#if defined(SLEEF_GENHEADER)
#define INLINE SLEEF_ALWAYS_INLINE
#define EXPORT SLEEF_INLINE
#define CONST SLEEF_CONST
#define NOEXPORT
#else // #if defined(SLEEF_GENHEADER)
#define CONST __attribute__((const))
#define INLINE __attribute__((always_inline))
#if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)
#ifndef SLEEF_STATIC_LIBS
#define EXPORT __stdcall __declspec(dllexport)
#define NOEXPORT
#else // #ifndef SLEEF_STATIC_LIBS
#define EXPORT
#define NOEXPORT
#endif // #ifndef SLEEF_STATIC_LIBS
#else // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)
#define EXPORT __attribute__((visibility("default")))
#define NOEXPORT __attribute__ ((visibility ("hidden")))
#endif // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)
#endif // #if defined(SLEEF_GENHEADER)
#define SLEEF_NAN __builtin_nan("")
#define SLEEF_NANf __builtin_nanf("")
#define SLEEF_NANl __builtin_nanl("")
#define SLEEF_INFINITY __builtin_inf()
#define SLEEF_INFINITYf __builtin_inff()
#define SLEEF_INFINITYl __builtin_infl()
#if defined(__INTEL_COMPILER) || defined (__clang__)
#define SLEEF_INFINITYq __builtin_inf()
#define SLEEF_NANq __builtin_nan("")
#else
#define SLEEF_INFINITYq __builtin_infq()
#define SLEEF_NANq (SLEEF_INFINITYq - SLEEF_INFINITYq)
#endif
#elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
#if defined(SLEEF_GENHEADER)
#define INLINE SLEEF_ALWAYS_INLINE
#define CONST SLEEF_CONST
#define EXPORT SLEEF_INLINE
#define NOEXPORT
#else // #if defined(SLEEF_GENHEADER)
#define INLINE __forceinline
#define CONST
#ifndef SLEEF_STATIC_LIBS
#define EXPORT __declspec(dllexport)
#define NOEXPORT
#else
#define EXPORT
#define NOEXPORT
#endif
#endif // #if defined(SLEEF_GENHEADER)
#define RESTRICT
#define ALIGNED(x)
#define LIKELY(condition) (condition)
#define UNLIKELY(condition) (condition)
#if (defined(__GNUC__) || defined(__CLANG__)) && (defined(__i386__) || defined(__x86_64__)) && !defined(SLEEF_GENHEADER)
#include <x86intrin.h>
#endif
#define SLEEF_INFINITY (1e+300 * 1e+300)
#define SLEEF_NAN (SLEEF_INFINITY - SLEEF_INFINITY)
#define SLEEF_INFINITYf ((float)SLEEF_INFINITY)
#define SLEEF_NANf ((float)SLEEF_NAN)
#define SLEEF_INFINITYl ((long double)SLEEF_INFINITY)
#define SLEEF_NANl ((long double)SLEEF_NAN)
#if (defined(_M_AMD64) || defined(_M_X64))
#ifndef __SSE2__
#define __SSE2__
#define __SSE3__
#define __SSE4_1__
#endif
#elif _M_IX86_FP == 2
#ifndef __SSE2__
#define __SSE2__
#define __SSE3__
#define __SSE4_1__
#endif
#elif _M_IX86_FP == 1
#ifndef __SSE__
#define __SSE__
#endif
#endif
#endif // #elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
#if !defined(__linux__)
#define isinff(x) ((x) == SLEEF_INFINITYf || (x) == -SLEEF_INFINITYf)
#define isinfl(x) ((x) == SLEEF_INFINITYl || (x) == -SLEEF_INFINITYl)
#define isnanf(x) ((x) != (x))
#define isnanl(x) ((x) != (x))
#endif
#endif // #ifndef __MISC_H__
#ifdef ENABLE_AAVPCS
#define VECTOR_CC __attribute__((aarch64_vector_pcs))
#else
#define VECTOR_CC
#endif
//
#if defined (__GNUC__) && !defined(__INTEL_COMPILER)
#pragma GCC diagnostic ignored "-Wpragmas"
#pragma GCC diagnostic ignored "-Wunknown-pragmas"
#if !defined (__clang__)
#pragma GCC diagnostic ignored "-Wattribute-alias"
#pragma GCC diagnostic ignored "-Wlto-type-mismatch"
#pragma GCC diagnostic ignored "-Wstringop-overflow"
#endif
#endif
#if defined(_MSC_VER)
#pragma warning(disable:4101) // warning C4101: 'v': unreferenced local variable
#pragma warning(disable:4116) // warning C4116: unnamed type definition in parentheses
#pragma warning(disable:4244) // warning C4244: 'function': conversion from 'vopmask' to '__mmask8', possible loss of data
#pragma warning(disable:4267) // warning C4267: 'initializing': conversion from 'size_t' to 'const int', possible loss of data
#pragma warning(disable:4305) // warning C4305: 'function': truncation from 'double' to 'float'
#endif

View File

@@ -0,0 +1,99 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#if !defined(SLEEF_GENHEADER)
#if (defined(__SIZEOF_FLOAT128__) && __SIZEOF_FLOAT128__ == 16) || (defined(__linux__) && defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || (defined(__PPC64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 8)
#define SLEEF_FLOAT128_IS_IEEEQP
#endif
#if !defined(SLEEF_FLOAT128_IS_IEEEQP) && defined(__SIZEOF_LONG_DOUBLE__) && __SIZEOF_LONG_DOUBLE__ == 16 && (defined(__aarch64__) || defined(__zarch__))
#define SLEEF_LONGDOUBLE_IS_IEEEQP
#endif
#if !defined(Sleef_quad_DEFINED)
#define Sleef_quad_DEFINED
typedef struct { uint64_t x, y; } Sleef_uint64_2t;
#if defined(SLEEF_FLOAT128_IS_IEEEQP) || defined(ENABLEFLOAT128)
typedef __float128 Sleef_quad;
#define SLEEF_QUAD_C(x) (x ## Q)
#elif defined(SLEEF_LONGDOUBLE_IS_IEEEQP)
typedef long double Sleef_quad;
#define SLEEF_QUAD_C(x) (x ## L)
#else
typedef Sleef_uint64_2t Sleef_quad;
#endif
#endif
#if !defined(Sleef_quad1_DEFINED)
#define Sleef_quad1_DEFINED
typedef union {
struct {
Sleef_quad x;
};
Sleef_quad s[1];
} Sleef_quad1;
#endif
#if !defined(Sleef_quad2_DEFINED)
#define Sleef_quad2_DEFINED
typedef union {
struct {
Sleef_quad x, y;
};
Sleef_quad s[2];
} Sleef_quad2;
#endif
#if !defined(Sleef_quad4_DEFINED)
#define Sleef_quad4_DEFINED
typedef union {
struct {
Sleef_quad x, y, z, w;
};
Sleef_quad s[4];
} Sleef_quad4;
#endif
#if !defined(Sleef_quad8_DEFINED)
#define Sleef_quad8_DEFINED
typedef union {
Sleef_quad s[8];
} Sleef_quad8;
#endif
#if defined(__ARM_FEATURE_SVE) && !defined(Sleef_quadx_DEFINED)
#define Sleef_quadx_DEFINED
typedef union {
Sleef_quad s[32];
} Sleef_quadx;
#endif
#else // #if !defined(SLEEF_GENHEADER)
SLEEFSHARPif !defined(SLEEFXXX__NVCC__) && ((defined(SLEEFXXX__SIZEOF_FLOAT128__) && SLEEFXXX__SIZEOF_FLOAT128__ == 16) || (defined(SLEEFXXX__linux__) && defined(SLEEFXXX__GNUC__) && (defined(SLEEFXXX__i386__) || defined(SLEEFXXX__x86_64__))) || (defined(SLEEFXXX__PPC64__) && defined(SLEEFXXX__GNUC__) && !defined(SLEEFXXX__clang__) && SLEEFXXX__GNUC__ >= 8))
SLEEFSHARPdefine SLEEFXXXSLEEF_FLOAT128_IS_IEEEQP
SLEEFSHARPendif
SLEEFSHARPif !defined(SLEEFXXXSLEEF_FLOAT128_IS_IEEEQP) && !defined(SLEEFXXX__NVCC__) && defined(SLEEFXXX__SIZEOF_LONG_DOUBLE__) && SLEEFXXX__SIZEOF_LONG_DOUBLE__ == 16 && (defined(SLEEFXXX__aarch64__) || defined(SLEEFXXX__zarch__))
SLEEFSHARPdefine SLEEFXXXSLEEF_LONGDOUBLE_IS_IEEEQP
SLEEFSHARPendif
SLEEFSHARPif !defined(SLEEFXXXSleef_quad_DEFINED)
SLEEFSHARPdefine SLEEFXXXSleef_quad_DEFINED
typedef struct { uint64_t x, y; } Sleef_uint64_2t;
SLEEFSHARPif defined(SLEEFXXXSLEEF_FLOAT128_IS_IEEEQP)
typedef __float128 Sleef_quad;
SLEEFSHARPdefine SLEEFXXXSLEEF_QUAD_C(x) (x ## Q)
SLEEFSHARPelif defined(SLEEFXXXSLEEF_LONGDOUBLE_IS_IEEEQP)
typedef long double Sleef_quad;
SLEEFSHARPdefine SLEEFXXXSLEEF_QUAD_C(x) (x ## L)
SLEEFSHARPelse
typedef Sleef_uint64_2t Sleef_quad;
SLEEFSHARPendif
SLEEFSHARPendif
#endif // #if !defined(SLEEF_GENHEADER)
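// Editor's note (illustrative addition): SLEEF_QUAD_C attaches the literal
// suffix matching whichever representation was selected above, e.g. a
// hypothetical constant
//
//   static const Sleef_quad Q_PI = SLEEF_QUAD_C(3.141592653589793238462643383279503);
//
// expands with a Q suffix for __float128 and an L suffix for an IEEE-quad
// long double; the macro is not defined for the two-word fallback type.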

View File

@@ -0,0 +1,201 @@
# Compiler properties
set(CMAKE_C_FLAGS "${ORG_CMAKE_C_FLAGS} ${DFT_C_FLAGS}")
set(COMMON_TARGET_PROPERTIES
C_STANDARD 99 # -std=gnu99
)
#
function(add_test_dft TESTNAME)
if (ARMIE_COMMAND)
add_test(NAME ${TESTNAME} COMMAND ${ARMIE_COMMAND} -msve-vector-bits=${SVE_VECTOR_BITS} ${ARGN})
elseif (NOT EMULATOR AND NOT SDE_COMMAND)
add_test(NAME ${TESTNAME} COMMAND ${ARGN})
elseif(NOT EMULATOR)
add_test(NAME ${TESTNAME} COMMAND ${SDE_COMMAND} "--" ${ARGN})
else()
add_test(NAME ${TESTNAME} COMMAND ${EMULATOR} ${ARGN})
endif()
set_tests_properties(${TESTNAME} PROPERTIES COST 0.1)
endfunction()
# Include directories
include_directories(${PROJECT_SOURCE_DIR}/include) # sleefdft.h
include_directories(${sleef_BINARY_DIR}/include) # sleef.h
if (FFTW3_INCLUDE_DIR)
include_directories(${FFTW3_INCLUDE_DIR}) # fftw3.h
endif()
# Link directories
link_directories(${sleef_BINARY_DIR}/lib) # libsleef, libsleefdft
# Link libraries
set(COMMON_LINK_LIBRARIES ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
if (COMPILER_SUPPORTS_OPENMP)
set(COMMON_LINK_LIBRARIES ${COMMON_LINK_LIBRARIES} ${OpenMP_C_FLAGS})
endif()
if((NOT MSVC) AND NOT SLEEF_CLANG_ON_WINDOWS)
# Target executable naivetestdp
set(TARGET_NAIVETESTDP "naivetestdp")
add_executable(${TARGET_NAIVETESTDP} naivetest.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
add_dependencies(${TARGET_NAIVETESTDP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
target_compile_definitions(${TARGET_NAIVETESTDP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=1)
target_link_libraries(${TARGET_NAIVETESTDP} ${COMMON_LINK_LIBRARIES})
set_target_properties(${TARGET_NAIVETESTDP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
# Target executable naivetestsp
set(TARGET_NAIVETESTSP "naivetestsp")
add_executable(${TARGET_NAIVETESTSP} naivetest.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
add_dependencies(${TARGET_NAIVETESTSP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
target_compile_definitions(${TARGET_NAIVETESTSP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=2)
target_link_libraries(${TARGET_NAIVETESTSP} ${COMMON_LINK_LIBRARIES})
set_target_properties(${TARGET_NAIVETESTSP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
# Test naivetestdp
add_test_dft(${TARGET_NAIVETESTDP}_1 $<TARGET_FILE:${TARGET_NAIVETESTDP}> 1)
add_test_dft(${TARGET_NAIVETESTDP}_2 $<TARGET_FILE:${TARGET_NAIVETESTDP}> 2)
add_test_dft(${TARGET_NAIVETESTDP}_3 $<TARGET_FILE:${TARGET_NAIVETESTDP}> 3)
add_test_dft(${TARGET_NAIVETESTDP}_4 $<TARGET_FILE:${TARGET_NAIVETESTDP}> 4)
add_test_dft(${TARGET_NAIVETESTDP}_5 $<TARGET_FILE:${TARGET_NAIVETESTDP}> 5)
add_test_dft(${TARGET_NAIVETESTDP}_10 $<TARGET_FILE:${TARGET_NAIVETESTDP}> 10)
# Test naivetestsp
add_test_dft(${TARGET_NAIVETESTSP}_1 $<TARGET_FILE:${TARGET_NAIVETESTSP}> 1)
add_test_dft(${TARGET_NAIVETESTSP}_2 $<TARGET_FILE:${TARGET_NAIVETESTSP}> 2)
add_test_dft(${TARGET_NAIVETESTSP}_3 $<TARGET_FILE:${TARGET_NAIVETESTSP}> 3)
add_test_dft(${TARGET_NAIVETESTSP}_4 $<TARGET_FILE:${TARGET_NAIVETESTSP}> 4)
add_test_dft(${TARGET_NAIVETESTSP}_5 $<TARGET_FILE:${TARGET_NAIVETESTSP}> 5)
add_test_dft(${TARGET_NAIVETESTSP}_10 $<TARGET_FILE:${TARGET_NAIVETESTSP}> 10)
endif()
# Target executable roundtriptest1ddp
set(TARGET_ROUNDTRIPTEST1DDP "roundtriptest1ddp")
add_executable(${TARGET_ROUNDTRIPTEST1DDP} roundtriptest1d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
add_dependencies(${TARGET_ROUNDTRIPTEST1DDP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
target_compile_definitions(${TARGET_ROUNDTRIPTEST1DDP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=1)
target_link_libraries(${TARGET_ROUNDTRIPTEST1DDP} ${COMMON_LINK_LIBRARIES})
set_target_properties(${TARGET_ROUNDTRIPTEST1DDP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
# Target executable roundtriptest1dsp
set(TARGET_ROUNDTRIPTEST1DSP "roundtriptest1dsp")
add_executable(${TARGET_ROUNDTRIPTEST1DSP} roundtriptest1d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
add_dependencies(${TARGET_ROUNDTRIPTEST1DSP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
target_compile_definitions(${TARGET_ROUNDTRIPTEST1DSP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=2)
target_link_libraries(${TARGET_ROUNDTRIPTEST1DSP} ${COMMON_LINK_LIBRARIES})
set_target_properties(${TARGET_ROUNDTRIPTEST1DSP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
# Target executable roundtriptest2ddp
set(TARGET_ROUNDTRIPTEST2DDP "roundtriptest2ddp")
add_executable(${TARGET_ROUNDTRIPTEST2DDP} roundtriptest2d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
add_dependencies(${TARGET_ROUNDTRIPTEST2DDP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
target_compile_definitions(${TARGET_ROUNDTRIPTEST2DDP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=1)
target_link_libraries(${TARGET_ROUNDTRIPTEST2DDP} ${COMMON_LINK_LIBRARIES})
set_target_properties(${TARGET_ROUNDTRIPTEST2DDP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
# Target executable roundtriptest2dsp
set(TARGET_ROUNDTRIPTEST2DSP "roundtriptest2dsp")
add_executable(${TARGET_ROUNDTRIPTEST2DSP} roundtriptest2d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
add_dependencies(${TARGET_ROUNDTRIPTEST2DSP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
target_compile_definitions(${TARGET_ROUNDTRIPTEST2DSP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=2)
target_link_libraries(${TARGET_ROUNDTRIPTEST2DSP} ${COMMON_LINK_LIBRARIES})
set_target_properties(${TARGET_ROUNDTRIPTEST2DSP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
if (LIBFFTW3 AND NOT SLEEF_DISABLE_FFTW)
# Target executable fftwtest1ddp
set(TARGET_FFTWTEST1DDP "fftwtest1ddp")
add_executable(${TARGET_FFTWTEST1DDP} fftwtest1d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
add_dependencies(${TARGET_FFTWTEST1DDP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
target_compile_definitions(${TARGET_FFTWTEST1DDP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=1)
target_link_libraries(${TARGET_FFTWTEST1DDP} ${COMMON_LINK_LIBRARIES} ${LIBFFTW3})
set_target_properties(${TARGET_FFTWTEST1DDP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
# Target executable fftwtest1dsp
set(TARGET_FFTWTEST1DSP "fftwtest1dsp")
add_executable(${TARGET_FFTWTEST1DSP} fftwtest1d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
add_dependencies(${TARGET_FFTWTEST1DSP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
target_compile_definitions(${TARGET_FFTWTEST1DSP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=2)
target_link_libraries(${TARGET_FFTWTEST1DSP} ${COMMON_LINK_LIBRARIES} ${LIBFFTW3})
set_target_properties(${TARGET_FFTWTEST1DSP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
# Target executable fftwtest2ddp
set(TARGET_FFTWTEST2DDP "fftwtest2ddp")
add_executable(${TARGET_FFTWTEST2DDP} fftwtest2d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
add_dependencies(${TARGET_FFTWTEST2DDP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
target_compile_definitions(${TARGET_FFTWTEST2DDP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=1)
target_link_libraries(${TARGET_FFTWTEST2DDP} ${COMMON_LINK_LIBRARIES} ${LIBFFTW3})
set_target_properties(${TARGET_FFTWTEST2DDP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
# Target executable fftwtest2dsp
set(TARGET_FFTWTEST2DSP "fftwtest2dsp")
add_executable(${TARGET_FFTWTEST2DSP} fftwtest2d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
add_dependencies(${TARGET_FFTWTEST2DSP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
target_compile_definitions(${TARGET_FFTWTEST2DSP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=2)
target_link_libraries(${TARGET_FFTWTEST2DSP} ${COMMON_LINK_LIBRARIES} ${LIBFFTW3})
set_target_properties(${TARGET_FFTWTEST2DSP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
# Test fftwtest1ddp
add_test_dft(${TARGET_FFTWTEST1DDP}_12 $<TARGET_FILE:${TARGET_FFTWTEST1DDP}> 12)
add_test_dft(${TARGET_FFTWTEST1DDP}_16 $<TARGET_FILE:${TARGET_FFTWTEST1DDP}> 16)
# Test fftwtest1dsp
add_test_dft(${TARGET_FFTWTEST1DSP}_12 $<TARGET_FILE:${TARGET_FFTWTEST1DSP}> 12)
add_test_dft(${TARGET_FFTWTEST1DSP}_16 $<TARGET_FILE:${TARGET_FFTWTEST1DSP}> 16)
# Test fftwtest2ddp
add_test_dft(${TARGET_FFTWTEST2DDP}_2_2 $<TARGET_FILE:${TARGET_FFTWTEST2DDP}> 2 2)
add_test_dft(${TARGET_FFTWTEST2DDP}_4_4 $<TARGET_FILE:${TARGET_FFTWTEST2DDP}> 4 4)
add_test_dft(${TARGET_FFTWTEST2DDP}_8_8 $<TARGET_FILE:${TARGET_FFTWTEST2DDP}> 8 8)
add_test_dft(${TARGET_FFTWTEST2DDP}_10_10 $<TARGET_FILE:${TARGET_FFTWTEST2DDP}> 10 10)
add_test_dft(${TARGET_FFTWTEST2DDP}_5_15 $<TARGET_FILE:${TARGET_FFTWTEST2DDP}> 5 15)
# Test fftwtest2dsp
add_test_dft(${TARGET_FFTWTEST2DSP}_2_2 $<TARGET_FILE:${TARGET_FFTWTEST2DSP}> 2 2)
add_test_dft(${TARGET_FFTWTEST2DSP}_4_4 $<TARGET_FILE:${TARGET_FFTWTEST2DSP}> 4 4)
add_test_dft(${TARGET_FFTWTEST2DSP}_8_8 $<TARGET_FILE:${TARGET_FFTWTEST2DSP}> 8 8)
add_test_dft(${TARGET_FFTWTEST2DSP}_10_10 $<TARGET_FILE:${TARGET_FFTWTEST2DSP}> 10 10)
add_test_dft(${TARGET_FFTWTEST2DSP}_5_15 $<TARGET_FILE:${TARGET_FFTWTEST2DSP}> 5 15)
else(LIBFFTW3 AND NOT SLEEF_DISABLE_FFTW)
if(MSVC OR SLEEF_CLANG_ON_WINDOWS)
# Test roundtriptest1ddp
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_1 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 1 10)
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_2 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 2 10)
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_3 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 3 10)
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_4 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 4 10)
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_5 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 5 10)
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_10 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 10 10)
# Test roundtriptest1dsp
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_1 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 1 10)
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_2 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 2 10)
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_3 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 3 10)
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_4 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 4 10)
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_5 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 5 10)
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_10 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 10 10)
endif()
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_12 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 12 10)
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_16 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 16 10)
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_12 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 12 10)
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_16 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 16 10)
# Test roundtriptest2ddp
add_test_dft(${TARGET_ROUNDTRIPTEST2DDP}_2_2 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DDP}> 2 2 10)
add_test_dft(${TARGET_ROUNDTRIPTEST2DDP}_4_4 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DDP}> 4 4 10)
add_test_dft(${TARGET_ROUNDTRIPTEST2DDP}_8_8 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DDP}> 8 8 10)
add_test_dft(${TARGET_ROUNDTRIPTEST2DDP}_10_10 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DDP}> 10 10 2)
add_test_dft(${TARGET_ROUNDTRIPTEST2DDP}_5_15 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DDP}> 5 15 2)
# Test roundtriptest2dsp
add_test_dft(${TARGET_ROUNDTRIPTEST2DSP}_2_2 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DSP}> 2 2 10)
add_test_dft(${TARGET_ROUNDTRIPTEST2DSP}_4_4 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DSP}> 4 4 10)
add_test_dft(${TARGET_ROUNDTRIPTEST2DSP}_8_8 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DSP}> 8 8 10)
add_test_dft(${TARGET_ROUNDTRIPTEST2DSP}_10_10 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DSP}> 10 10 2)
add_test_dft(${TARGET_ROUNDTRIPTEST2DSP}_5_15 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DSP}> 5 15 2)
endif(LIBFFTW3 AND NOT SLEEF_DISABLE_FFTW)

View File

@@ -0,0 +1,116 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#define _DEFAULT_SOURCE
#define _XOPEN_SOURCE 700
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <math.h>
#include <complex.h>
#include <time.h>
#include <unistd.h>
#include <sys/time.h>
#ifdef USEFFTW
#include <fftw3.h>
#include <omp.h>
#else
#include "sleef.h"
#include "sleefdft.h"
#endif
typedef double real;
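// Monotonic clock in nanoseconds, used for the timing loops below.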
static uint64_t gettime() {
struct timespec tp;
clock_gettime(CLOCK_MONOTONIC, &tp);
return (uint64_t)tp.tv_sec * 1000000000 + ((uint64_t)tp.tv_nsec);
}
#define REPEAT 8
int main(int argc, char **argv) {
if (argc == 1) {
fprintf(stderr, "%s <log2n>\n", argv[0]);
exit(-1);
}
int backward = 0;
int log2n = atoi(argv[1]);
if (log2n < 0) {
backward = 1;
log2n = -log2n;
}
const int n = 1 << log2n;
const int64_t niter = (int64_t)(100000000000.0 / n / log2n);
printf("Number of iterations = %lld\n", (long long int)niter);
#ifdef USEFFTW
fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
#if 0
int fftw_init_threads(void);
fftw_plan_with_nthreads(omp_get_max_threads());
#endif
fftw_plan w = fftw_plan_dft_1d(n, in, out, backward ? FFTW_BACKWARD : FFTW_FORWARD, FFTW_MEASURE);
//fftw_plan w = fftw_plan_dft_1d(n, in, out, backward ? FFTW_BACKWARD : FFTW_FORWARD, FFTW_PATIENT);
for(int i=0;i<n;i++) {
in[i] = (2.0 * (rand() / (double)RAND_MAX) - 1) + (2.0 * (rand() / (double)RAND_MAX) - 1) * _Complex_I;
}
for(int64_t i=0;i<niter/2;i++) fftw_execute(w);
#else
SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET);
real *in = (real *)Sleef_malloc(n*2 * sizeof(real));
real *out = (real *)Sleef_malloc(n*2 * sizeof(real));
int mode = SLEEF_MODE_MEASURE | SLEEF_MODE_VERBOSE; // | SLEEF_MODE_NO_MT;
if (argc >= 3) mode = SLEEF_MODE_VERBOSE | SLEEF_MODE_ESTIMATE;
if (backward) mode |= SLEEF_MODE_BACKWARD;
struct SleefDFT *p = SleefDFT_double_init1d(n, in, out, mode);
if (argc >= 3) SleefDFT_setPath(p, argv[2]);
for(int i=0;i<n*2;i++) {
in[i] = (2.0 * (rand() / (double)RAND_MAX) - 1);
}
for(int64_t i=0;i<niter/2;i++) SleefDFT_double_execute(p, in, out);
#endif
for(int rep=0;rep<REPEAT;rep++) {
uint64_t tm0 = gettime();
for(int64_t i=0;i<niter;i++) {
#ifdef USEFFTW
fftw_execute(w);
#else
SleefDFT_double_execute(p, in, out);
#endif
}
uint64_t tm1 = gettime();
printf("Actual time = %g ns\n", (double)(tm1 - tm0) / niter);
double timeus = (tm1 - tm0) / ((double)niter * 1000);
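// Conventional 5*n*log2(n) flop estimate for a complex FFT of length n;
// timeus is the per-transform time in microseconds.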
double mflops = 5 * n * log2n / timeus;
printf("%g Mflops\n", mflops);
}
//
exit(0);
}

View File

@@ -0,0 +1,230 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <math.h>
#include <complex.h>
#include "sleef.h"
#include "sleefdft.h"
#include <fftw3.h>
#ifndef MODE
#define MODE SLEEF_MODE_DEBUG
#endif
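// BASETYPEID selects the precision under test; THRES bounds the relative
// squared error rmsn/rmsd measured against FFTW below.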
#if BASETYPEID == 1
#define THRES 1e-30
#define SleefDFT_init1d SleefDFT_double_init1d
#define SleefDFT_execute SleefDFT_double_execute
typedef double real;
#elif BASETYPEID == 2
#define THRES 1e-13
#define SleefDFT_init1d SleefDFT_float_init1d
#define SleefDFT_execute SleefDFT_float_execute
typedef float real;
#else
#error BASETYPEID not set
#endif
static double squ(double x) { return x * x; }
// complex forward
double check_cf(int n) {
fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
fftw_plan w = fftw_plan_dft_1d(n, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
real *sx = (real *)Sleef_malloc(n*2*sizeof(real));
real *sy = (real *)Sleef_malloc(n*2*sizeof(real));
struct SleefDFT *p = SleefDFT_init1d(n, sx, sy, MODE);
for(int i=0;i<n;i++) {
real re = (2.0 * random() - 1) / (real)RAND_MAX;
real im = (2.0 * random() - 1) / (real)RAND_MAX;
sx[(i*2+0)] = re;
sx[(i*2+1)] = im;
in[i] = re + im * _Complex_I;
}
SleefDFT_execute(p, NULL, NULL);
fftw_execute(w);
double rmsn = 0, rmsd = 0;
for(int i=0;i<n;i++) {
rmsn += squ(sy[i*2+0] - creal(out[i])) + squ(sy[i*2+1] - cimag(out[i]));
rmsd += squ( creal(out[i])) + squ( cimag(out[i]));
}
fftw_destroy_plan(w);
fftw_free(in);
fftw_free(out);
Sleef_free(sx);
Sleef_free(sy);
SleefDFT_dispose(p);
return rmsn / rmsd;
}
// complex backward
double check_cb(int n) {
fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
fftw_plan w = fftw_plan_dft_1d(n, in, out, FFTW_BACKWARD, FFTW_ESTIMATE);
real *sx = (real *)Sleef_malloc(n*2*sizeof(real));
real *sy = (real *)Sleef_malloc(n*2*sizeof(real));
struct SleefDFT *p = SleefDFT_init1d(n, sx, sy, SLEEF_MODE_BACKWARD | MODE);
for(int i=0;i<n;i++) {
real re = (2.0 * random() - 1) / (real)RAND_MAX;
real im = (2.0 * random() - 1) / (real)RAND_MAX;
sx[(i*2+0)] = re;
sx[(i*2+1)] = im;
in[i] = re + im * _Complex_I;
}
SleefDFT_execute(p, NULL, NULL);
fftw_execute(w);
double rmsn = 0, rmsd = 0;
for(int i=0;i<n;i++) {
rmsn += squ(sy[i*2+0] - creal(out[i])) + squ(sy[i*2+1] - cimag(out[i]));
rmsd += squ( creal(out[i])) + squ( cimag(out[i]));
}
fftw_destroy_plan(w);
fftw_free(in);
fftw_free(out);
Sleef_free(sx);
Sleef_free(sy);
SleefDFT_dispose(p);
return rmsn / rmsd;
}
// real forward
double check_rf(int n) {
double *in = (double *) fftw_malloc(sizeof(double) * n);
fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * (n/2+1));
fftw_plan w = fftw_plan_dft_r2c_1d(n, in, out, FFTW_ESTIMATE);
real *sx = (real *)Sleef_malloc(n*sizeof(real));
real *sy = (real *)Sleef_malloc((n/2+1)*sizeof(real)*2);
struct SleefDFT *p = SleefDFT_init1d(n, sx, sy, SLEEF_MODE_REAL | MODE);
for(int i=0;i<n;i++) {
real re = (2.0 * random() - 1) / (real)RAND_MAX;
sx[i] = re;
in[i] = re;
}
SleefDFT_execute(p, NULL, NULL);
fftw_execute(w);
double rmsn = 0, rmsd = 0;
for(int i=0;i<n/2+1;i++) {
rmsn += squ(sy[i*2+0] - creal(out[i])) + squ(sy[i*2+1] - cimag(out[i]));
rmsd += squ( creal(out[i])) + squ( cimag(out[i]));
}
fftw_destroy_plan(w);
fftw_free(in);
fftw_free(out);
Sleef_free(sx);
Sleef_free(sy);
SleefDFT_dispose(p);
return rmsn / rmsd;
}
// real backward
double check_rb(int n) {
fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * (n/2+1));
double *out = (double *) fftw_malloc(sizeof(double) * n);
fftw_plan w = fftw_plan_dft_c2r_1d(n, in, out, FFTW_ESTIMATE);
real *sx = (real *)Sleef_malloc((n/2+1) * sizeof(real)*2);
real *sy = (real *)Sleef_malloc(sizeof(real)*n);
struct SleefDFT *p = SleefDFT_init1d(n, sx, sy, SLEEF_MODE_REAL | SLEEF_MODE_BACKWARD | MODE);
for(int i=0;i<n/2;i++) {
if (i == 0) {
in[0 ] = (2.0 * (rand() / (real)RAND_MAX) - 1);
in[n/2] = (2.0 * (rand() / (real)RAND_MAX) - 1);
} else {
in[i ] = (2.0 * (rand() / (real)RAND_MAX) - 1) + (2.0 * (rand() / (real)RAND_MAX) - 1) * _Complex_I;
}
}
for(int i=0;i<n/2+1;i++) {
sx[2*i+0] = creal(in[i]);
sx[2*i+1] = cimag(in[i]);
}
SleefDFT_execute(p, NULL, NULL);
fftw_execute(w);
double rmsn = 0, rmsd = 0;
for(int i=0;i<n;i++) {
rmsn += squ(sy[i] - out[i]);
rmsd += squ( out[i]);
}
fftw_destroy_plan(w);
fftw_free(in);
fftw_free(out);
Sleef_free(sx);
Sleef_free(sy);
SleefDFT_dispose(p);
return rmsn / rmsd;
}
int main(int argc, char **argv) {
if (argc != 2) {
fprintf(stderr, "%s <log2n>\n", argv[0]);
exit(-1);
}
const int n = 1 << atoi(argv[1]);
srand((unsigned int)time(NULL));
SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET | SLEEF_PLAN_READONLY);
//
int success = 1;
double e;
e = check_cf(n);
success = success && e < THRES;
printf("complex forward : %s (%g)\n", e < THRES ? "OK" : "NG", e);
e = check_cb(n);
success = success && e < THRES;
printf("complex backward : %s (%g)\n", e < THRES ? "OK" : "NG", e);
e = check_rf(n);
success = success && e < THRES;
printf("real forward : %s (%g)\n", e < THRES ? "OK" : "NG", e);
e = check_rb(n);
success = success && e < THRES;
printf("real backward : %s (%g)\n", e < THRES ? "OK" : "NG", e);
exit(success ? 0 : -1);
}

View File

@@ -0,0 +1,143 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <math.h>
#include <complex.h>
#include "sleef.h"
#include "sleefdft.h"
#include <fftw3.h>
#ifndef MODE
#define MODE SLEEF_MODE_DEBUG
#endif
#if BASETYPEID == 1
#define THRES 1e-30
#define SleefDFT_init2d SleefDFT_double_init2d
#define SleefDFT_execute SleefDFT_double_execute
typedef double real;
#elif BASETYPEID == 2
#define THRES 1e-13
#define SleefDFT_init2d SleefDFT_float_init2d
#define SleefDFT_execute SleefDFT_float_execute
typedef float real;
#else
#error BASETYPEID not set
#endif
static double squ(double x) { return x * x; }
// complex forward
double check_cf(int n, int m) {
fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n * m);
fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n * m);
fftw_plan w = fftw_plan_dft_2d(n, m, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
real *sx = (real *)Sleef_malloc(n*m*2*sizeof(real));
real *sy = (real *)Sleef_malloc(n*m*2*sizeof(real));
struct SleefDFT *p = SleefDFT_init2d(n, m, sx, sy, MODE);
for(int i=0;i<n*m;i++) {
double re = (2.0 * random() - 1) / (double)RAND_MAX;
double im = (2.0 * random() - 1) / (double)RAND_MAX;
sx[(i*2+0)] = re;
sx[(i*2+1)] = im;
in[i] = re + im * _Complex_I;
}
SleefDFT_execute(p, NULL, NULL);
fftw_execute(w);
double rmsn = 0, rmsd = 0;
for(int i=0;i<n*m;i++) {
rmsn += squ(sy[i*2+0] - creal(out[i])) + squ(sy[i*2+1] - cimag(out[i]));
rmsd += squ( creal(out[i])) + squ( cimag(out[i]));
}
fftw_destroy_plan(w);
fftw_free(in);
fftw_free(out);
Sleef_free(sx);
Sleef_free(sy);
SleefDFT_dispose(p);
return rmsn / rmsd;
}
// complex backward
double check_cb(int n, int m) {
fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n * m);
fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n * m);
fftw_plan w = fftw_plan_dft_2d(n, m, in, out, FFTW_BACKWARD, FFTW_ESTIMATE);
real *sx = (real *)Sleef_malloc(n*m*2*sizeof(real));
real *sy = (real *)Sleef_malloc(n*m*2*sizeof(real));
struct SleefDFT *p = SleefDFT_init2d(n, m, sx, sy, SLEEF_MODE_BACKWARD | MODE);
for(int i=0;i<n*m;i++) {
double re = (2.0 * random() - 1) / (double)RAND_MAX;
double im = (2.0 * random() - 1) / (double)RAND_MAX;
sx[(i*2+0)] = re;
sx[(i*2+1)] = im;
in[i] = re + im * _Complex_I;
}
SleefDFT_execute(p, NULL, NULL);
fftw_execute(w);
double rmsn = 0, rmsd = 0;
for(int i=0;i<n*m;i++) {
rmsn += squ(sy[i*2+0] - creal(out[i])) + squ(sy[i*2+1] - cimag(out[i]));
rmsd += squ( creal(out[i])) + squ( cimag(out[i]));
}
fftw_destroy_plan(w);
fftw_free(in);
fftw_free(out);
Sleef_free(sx);
Sleef_free(sy);
SleefDFT_dispose(p);
return rmsn / rmsd;
}
int main(int argc, char **argv) {
if (argc != 3) {
fprintf(stderr, "%s <log2n> <log2m>\n", argv[0]);
exit(-1);
}
const int n = 1 << atoi(argv[1]);
const int m = 1 << atoi(argv[2]);
srand((unsigned int)time(NULL));
SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET | SLEEF_PLAN_READONLY);
//
int success = 1;
double e;
e = check_cf(n, m);
success = success && e < THRES;
printf("complex forward : %s (%g)\n", e < THRES ? "OK" : "NG", e);
e = check_cb(n, m);
success = success && e < THRES;
printf("complex backward : %s (%g)\n", e < THRES ? "OK" : "NG", e);
exit(success ? 0 : -1);
}

View File

@@ -0,0 +1,175 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#define _DEFAULT_SOURCE
#define _XOPEN_SOURCE 700
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <time.h>
#include <unistd.h>
#include <sys/time.h>
#include <math.h>
#include <complex.h>
#include "sleef.h"
#include "sleefdft.h"
static uint64_t gettime() {
struct timespec tp;
clock_gettime(CLOCK_MONOTONIC, &tp);
return (uint64_t)tp.tv_sec * 1000000000 + ((uint64_t)tp.tv_nsec);
}
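// Each transform size is benchmarked twice per precision: single-threaded
// (SLEEF_MODE_NO_MT) and multi-threaded.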
int mode[] = { SLEEF_MODE_MEASURE | SLEEF_MODE_NO_MT, SLEEF_MODE_MEASURE};
#define ENABLE_SP
//#define ROUNDTRIP
#define REPEAT 2
//#define ENABLE_SLEEP
//#define WARMUP
int main(int argc, char **argv) {
int start = 1, end = 18;
if (argc > 1) start = atoi(argv[1]);
if (argc > 2) end = atoi(argv[2]);
if (end > 18) end = 18; // the buffers below are sized for log2n <= 18
double *din = (double *)Sleef_malloc((1 << 18)*2 * sizeof(double));
double *dout = (double *)Sleef_malloc((1 << 18)*2 * sizeof(double));
float *sin = (float *)Sleef_malloc((1 << 18)*2 * sizeof(float));
float *sout = (float *)Sleef_malloc((1 << 18)*2 * sizeof(float));
SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET | SLEEF_PLAN_READONLY);
for(int log2n=start;log2n<=end;log2n++) {
const int n = 1 << log2n;
int64_t niter = (int64_t)(1000000000.0 / REPEAT / n / log2n);
printf("%d ", n);
for(int m=0;m<2;m++) {
#ifdef ENABLE_SLEEP
sleep(1);
#endif
struct SleefDFT *pf = SleefDFT_double_init1d(n, NULL, NULL, mode[m]);
#ifdef ROUNDTRIP
struct SleefDFT *pb = SleefDFT_double_init1d(n, NULL, NULL, mode[m] | SLEEF_MODE_BACKWARD);
#endif
for(int i=0;i<n*2;i++) {
din[i] = 0;
}
#ifdef ENABLE_SLEEP
sleep(1);
#endif
#ifdef WARMUP
for(int64_t i=0;i<niter/2;i++) {
SleefDFT_double_execute(pf, din, dout);
#ifdef ROUNDTRIP
SleefDFT_double_execute(pb, dout, din);
#endif
}
#endif
uint64_t best = 1LL << 62;
//printf("\n");
for(int rep=0;rep<REPEAT;rep++) {
uint64_t tm0 = gettime();
for(int64_t i=0;i<niter;i++) {
SleefDFT_double_execute(pf, din, dout);
#ifdef ROUNDTRIP
SleefDFT_double_execute(pb, dout, din);
#endif
}
uint64_t tm1 = gettime();
if (tm1 - tm0 < best) best = tm1 - tm0;
//printf("%g\n", (double)(tm1 - tm0));
}
SleefDFT_dispose(pf);
#ifdef ROUNDTRIP
SleefDFT_dispose(pb);
#endif
double timeus = best / ((double)niter * 1000);
#ifdef ROUNDTRIP
double mflops = 10 * n * log2n / timeus;
#else
double mflops = 5 * n * log2n / timeus;
#endif
printf("%g ", mflops);
}
#ifdef ENABLE_SP
for(int m=0;m<2;m++) {
#ifdef ENABLE_SLEEP
sleep(1);
#endif
struct SleefDFT *pf = SleefDFT_float_init1d(n, NULL, NULL, mode[m]);
#ifdef ROUNDTRIP
struct SleefDFT *pb = SleefDFT_float_init1d(n, NULL, NULL, mode[m] | SLEEF_MODE_BACKWARD);
#endif
for(int i=0;i<n*2;i++) {
sin[i] = 0;
}
#ifdef ENABLE_SLEEP
sleep(1);
#endif
#ifdef WARMUP
for(int64_t i=0;i<niter/2;i++) {
SleefDFT_float_execute(pf, sin, sout);
#ifdef ROUNDTRIP
SleefDFT_float_execute(pb, sout, sin);
#endif
}
#endif
uint64_t best = 1LL << 62;
for(int rep=0;rep<REPEAT;rep++) {
uint64_t tm0 = gettime();
for(int64_t i=0;i<niter;i++) {
SleefDFT_float_execute(pf, sin, sout);
#ifdef ROUNDTRIP
SleefDFT_float_execute(pb, sout, sin);
#endif
}
uint64_t tm1 = gettime();
if (tm1 - tm0 < best) best = tm1 - tm0;
}
SleefDFT_dispose(pf);
#ifdef ROUNDTRIP
SleefDFT_dispose(pb);
#endif
double timeus = best / ((double)niter * 1000);
#ifdef ROUNDTRIP
double mflops = 10 * n * log2n / timeus;
#else
double mflops = 5 * n * log2n / timeus;
#endif
printf("%g ", mflops);
}
#endif
printf("\n");
}
}

View File

@@ -0,0 +1,484 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <time.h>
#include <math.h>
#include <complex.h>
#include "sleef.h"
#include "sleefdft.h"
#include "misc.h"
#ifndef MODE
#define MODE SLEEF_MODE_DEBUG
#endif
#define THRES 1e-4
#if BASETYPEID == 1
#define SleefDFT_init SleefDFT_double_init1d
#define SleefDFT_execute SleefDFT_double_execute
typedef double real;
typedef double complex cmpl;
cmpl omega(double n, double kn) {
return cexp((-2 * M_PIl * _Complex_I / n) * kn);
}
#elif BASETYPEID == 2
#define SleefDFT_init SleefDFT_float_init1d
#define SleefDFT_execute SleefDFT_float_execute
typedef float real;
typedef double complex cmpl;
cmpl omega(double n, double kn) {
return cexp((-2 * M_PIl * _Complex_I / n) * kn);
}
#elif BASETYPEID == 3
#define SleefDFT_init SleefDFT_longdouble_init1d
#define SleefDFT_execute SleefDFT_longdouble_execute
typedef long double real;
typedef double complex cmpl;
cmpl omega(double n, double kn) {
return cexp((-2 * M_PIl * _Complex_I / n) * kn);
}
#elif BASETYPEID == 4
#include <quadmath.h>
#define SleefDFT_init SleefDFT_quad_init1d
#define SleefDFT_execute SleefDFT_quad_execute
typedef Sleef_quad real;
typedef double complex cmpl;
cmpl omega(double n, double kn) {
return cexp((-2 * M_PIl * _Complex_I / n) * kn);
}
#else
#error No BASETYPEID specified
#endif
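// Naive O(n^2) reference DFT and inverse, used as ground truth for the
// checks below.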
void forward(cmpl *ts, cmpl *fs, int len) {
int k, n;
for(k=0;k<len;k++) {
fs[k] = 0;
for(n=0;n<len;n++) {
fs[k] += ts[n] * omega(len, n*k);
}
}
}
void backward(cmpl *fs, cmpl *ts, int len) {
int k, n;
for(k=0;k<len;k++) {
ts[k] = 0;
for(n=0;n<len;n++) {
ts[k] += fs[n] * omega(-len, n*k);
}
}
}
// complex forward
int check_cf(int n) {
int i;
real *sx = (real *)Sleef_malloc(n*2 * sizeof(real));
real *sy = (real *)Sleef_malloc(n*2 * sizeof(real));
cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);
//
for(i=0;i<n;i++) {
ts[i] = 0.5 * ((2.0 * (rand() / (double)RAND_MAX) - 1) + (2.0 * (rand() / (double)RAND_MAX) - 1) * _Complex_I);
sx[(i*2+0)] = creal(ts[i]);
sx[(i*2+1)] = cimag(ts[i]);
}
//
forward(ts, fs, n);
struct SleefDFT *p = SleefDFT_init(n, NULL, NULL, MODE | SLEEF_MODE_VERBOSE);
if (p == NULL) {
printf("SleefDFT initialization failed\n");
return 0;
}
SleefDFT_execute(p, sx, sy);
//
int success = 1;
double rmsn = 0, rmsd = 0;
for(i=0;i<n;i++) {
if ((fabs(sy[(i*2+0)] - creal(fs[i])) > THRES) ||
(fabs(sy[(i*2+1)] - cimag(fs[i])) > THRES)) {
success = 0;
}
double t;
t = (sy[(i*2+0)] - creal(fs[i]));
rmsn += t*t;
t = (sy[(i*2+1)] - cimag(fs[i]));
rmsn += t*t;
rmsd += creal(fs[i]) * creal(fs[i]) + cimag(fs[i]) * cimag(fs[i]);
}
//
free(fs);
free(ts);
Sleef_free(sx);
Sleef_free(sy);
SleefDFT_dispose(p);
//
return success;
}
// complex backward
int check_cb(int n) {
int i;
real *sx = (real *)Sleef_malloc(sizeof(real)*n*2);
real *sy = (real *)Sleef_malloc(sizeof(real)*n*2);
cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);
//
for(i=0;i<n;i++) {
fs[i] = (2.0 * (rand() / (double)RAND_MAX) - 1) + (2.0 * (rand() / (double)RAND_MAX) - 1) * _Complex_I;
sx[(i*2+0)] = creal(fs[i]);
sx[(i*2+1)] = cimag(fs[i]);
}
backward(fs, ts, n);
struct SleefDFT *p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_BACKWARD | MODE);
if (p == NULL) {
printf("SleefDFT initialization failed\n");
return 0;
}
SleefDFT_execute(p, sx, sy);
//
int success = 1;
for(i=0;i<n;i++) {
if ((fabs(sy[(i*2+0)] - creal(ts[i])) > THRES) ||
(fabs(sy[(i*2+1)] - cimag(ts[i])) > THRES)) {
success = 0;
}
}
//
free(fs);
free(ts);
Sleef_free(sx);
Sleef_free(sy);
SleefDFT_dispose(p);
//
return success;
}
// real forward
int check_rf(int n) {
int i;
real *sx = (real *)Sleef_malloc(n * sizeof(real));
real *sy = (real *)Sleef_malloc((n/2+1)*sizeof(real)*2);
cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);
//
for(i=0;i<n;i++) {
ts[i] = (2.0 * (rand() / (double)RAND_MAX) - 1);
sx[i] = creal(ts[i]);
}
//
forward(ts, fs, n);
struct SleefDFT *p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_NO_MT | SLEEF_MODE_REAL | MODE);
if (p == NULL) {
printf("SleefDFT initialization failed\n");
return 0;
}
SleefDFT_execute(p, sx, sy);
//
int success = 1;
for(i=0;i<n/2+1;i++) {
if (fabs(sy[(2*i+0)] - creal(fs[i])) > THRES) success = 0;
if (fabs(sy[(2*i+1)] - cimag(fs[i])) > THRES) success = 0;
}
//
free(fs);
free(ts);
Sleef_free(sx);
Sleef_free(sy);
SleefDFT_dispose(p);
//
return success;
}
// real backward
int check_rb(int n) {
int i;
cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);
//
for(i=0;i<n/2;i++) {
if (i == 0) {
fs[0 ] = (2.0 * (rand() / (double)RAND_MAX) - 1);
fs[n/2] = (2.0 * (rand() / (double)RAND_MAX) - 1);
} else {
fs[i ] = (2.0 * (rand() / (double)RAND_MAX) - 1) + (2.0 * (rand() / (double)RAND_MAX) - 1) * _Complex_I;
fs[n-i] = conj(fs[i]);
}
}
real *sx = (real *)Sleef_malloc((n/2+1) * sizeof(real)*2);
real *sy = (real *)Sleef_malloc(sizeof(real)*n);
for(i=0;i<n/2+1;i++) {
sx[2*i+0] = creal(fs[i]);
sx[2*i+1] = cimag(fs[i]);
}
//
backward(fs, ts, n);
struct SleefDFT *p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_REAL | SLEEF_MODE_BACKWARD | MODE);
if (p == NULL) {
printf("SleefDFT initialization failed\n");
return 0;
}
SleefDFT_execute(p, sx, sy);
//
int success = 1;
for(i=0;i<n;i++) {
if (fabs(cimag(ts[i])) > THRES) {
success = 0;
}
if ((fabs(sy[i] - creal(ts[i])) > THRES)) {
success = 0;
}
}
//
free(fs);
free(ts);
Sleef_free(sx);
Sleef_free(sy);
SleefDFT_dispose(p);
//
return success;
}
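// The "alt" real-format tests below: with SLEEF_MODE_ALT the DC and Nyquist
// terms are packed into elements 0 and 1, as the index handling shows.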
int check_arf(int n) {
int i;
real *sx = (real *)Sleef_malloc(n * sizeof(real));
real *sy = (real *)Sleef_malloc(n * sizeof(real));
cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);
//
for(i=0;i<n;i++) {
ts[i] = 2 * (rand() / (real)RAND_MAX) - 1;
sx[i] = creal(ts[i]);
}
//
backward(ts, fs, n);
struct SleefDFT *p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_REAL | SLEEF_MODE_ALT | MODE);
if (p == NULL) {
printf("SleefDFT initialization failed\n");
return 0;
}
SleefDFT_execute(p, sx, sy);
//
int success = 1;
for(i=0;i<n/2;i++) {
if (i == 0) {
if (fabs(sy[(2*0+0)] - creal(fs[0 ])) > THRES) success = 0;
if (fabs(sy[(2*0+1)] - creal(fs[n/2])) > THRES) success = 0;
} else {
if (fabs(sy[(2*i+0)] - creal(fs[i])) > THRES) success = 0;
if (fabs(sy[(2*i+1)] - cimag(fs[i])) > THRES) success = 0;
}
}
//
free(fs);
free(ts);
Sleef_free(sx);
Sleef_free(sy);
SleefDFT_dispose(p);
//
return success;
}
int check_arb(int n) {
int i;
real *sx = (real *)Sleef_malloc(n * sizeof(real));
real *sy = (real *)Sleef_malloc(n * sizeof(real));
cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);
//
for(i=0;i<n/2;i++) {
if (i == 0) {
fs[0 ] = (2.0 * (rand() / (double)RAND_MAX) - 1);
fs[n/2] = (2.0 * (rand() / (double)RAND_MAX) - 1);
} else {
fs[i ] = (2.0 * (rand() / (double)RAND_MAX) - 1) + (2.0 * (rand() / (double)RAND_MAX) - 1) * _Complex_I;
fs[n-i] = conj(fs[i]);
}
}
for(i=0;i<n/2;i++) {
if (i == 0) {
sx[2*0+0] = creal(fs[0 ]);
sx[2*0+1] = creal(fs[n/2]);
} else {
sx[2*i+0] = creal(fs[i]);
sx[2*i+1] = cimag(fs[i]);
}
}
//
forward(fs, ts, n);
struct SleefDFT *p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_REAL | SLEEF_MODE_BACKWARD | SLEEF_MODE_ALT | MODE);
if (p == NULL) {
printf("SleefDFT initialization failed\n");
return 0;
}
SleefDFT_execute(p, sx, sy);
//
int success = 1;
for(i=0;i<n;i++) {
if (fabs(cimag(ts[i])) > THRES) {
success = 0;
}
if ((fabs(sy[i]*2 - creal(ts[i])) > THRES)) {
success = 0;
}
}
//
free(fs);
free(ts);
Sleef_free(sx);
Sleef_free(sy);
SleefDFT_dispose(p);
//
return success;
}
int main(int argc, char **argv) {
if (argc != 2) {
fprintf(stderr, "%s <log2n>\n", argv[0]);
exit(-1);
}
const int n = 1 << atoi(argv[1]);
srand((unsigned int)time(NULL));
SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET | SLEEF_PLAN_READONLY);
//
int success = 1;
printf("complex forward : %s\n", (success &= check_cf(n)) ? "OK" : "NG");
printf("complex backward : %s\n", (success &= check_cb(n)) ? "OK" : "NG");
printf("real forward : %s\n", (success &= check_rf(n)) ? "OK" : "NG");
printf("real backward : %s\n", (success &= check_rb(n)) ? "OK" : "NG");
printf("real alt forward : %s\n", (success &= check_arf(n)) ? "OK" : "NG");
printf("real alt backward : %s\n", (success &= check_arb(n)) ? "OK" : "NG");
exit(!success);
}

View File

@@ -0,0 +1,174 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <time.h>
#include <math.h>
#include <complex.h>
#include "sleef.h"
#include "sleefdft.h"
#ifndef MODE
#define MODE (SLEEF_MODE_DEBUG | SLEEF_MODE_VERBOSE)
#endif
#if BASETYPEID == 1
#define THRES 1e-30
#define SleefDFT_init SleefDFT_double_init1d
#define SleefDFT_execute SleefDFT_double_execute
typedef double real;
#elif BASETYPEID == 2
#define THRES 1e-13
#define SleefDFT_init SleefDFT_float_init1d
#define SleefDFT_execute SleefDFT_float_execute
typedef float real;
#else
#error BASETYPEID not set
#endif
static double squ(double x) { return x * x; }
// complex transforms
double check_c(int n) {
struct SleefDFT *p;
real *sx = (real *)Sleef_malloc(n*2 * sizeof(real));
real *sy = (real *)Sleef_malloc(n*2 * sizeof(real));
real *sz = (real *)Sleef_malloc(n*2 * sizeof(real));
for(int i=0;i<n*2;i++) sx[i] = (real)(2.0 * (rand() / (double)RAND_MAX) - 1);
//
p = SleefDFT_init(n, NULL, NULL, MODE);
if (p == NULL) {
printf("SleefDFT initialization failed\n");
exit(-1);
}
SleefDFT_execute(p, sx, sy);
SleefDFT_dispose(p);
//
p = SleefDFT_init(n, NULL, NULL, MODE | SLEEF_MODE_BACKWARD);
if (p == NULL) {
printf("SleefDFT initialization failed\n");
exit(-1);
}
SleefDFT_execute(p, sy, sz);
SleefDFT_dispose(p);
//
double rmsn = 0, rmsd = 0, scale = 1 / (double)n;
for(int i=0;i<n;i++) {
rmsn += squ(scale * sz[i*2+0] - sx[i*2+0]) + squ(scale * sz[i*2+1] - sx[i*2+1]);
rmsd += squ( sx[i*2+0]) + squ( sx[i*2+1]);
}
//
Sleef_free(sx);
Sleef_free(sy);
Sleef_free(sz);
//
return rmsn / rmsd;
}
// real transforms
double check_r(int n) {
struct SleefDFT *p;
real *sx = (real *)Sleef_malloc(n * sizeof(real));
real *sy = (real *)Sleef_malloc((n/2+1)*sizeof(real)*2);
real *sz = (real *)Sleef_malloc(n * sizeof(real));
for(int i=0;i<n;i++) sx[i] = (real)(2.0 * (rand() / (double)RAND_MAX) - 1);
//
p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_REAL | MODE);
if (p == NULL) {
printf("SleefDFT initialization failed\n");
return 0;
}
SleefDFT_execute(p, sx, sy);
SleefDFT_dispose(p);
//
p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_REAL | SLEEF_MODE_BACKWARD | MODE);
if (p == NULL) {
printf("SleefDFT initialization failed\n");
return 0;
}
SleefDFT_execute(p, sy, sz);
SleefDFT_dispose(p);
//
double rmsn = 0, rmsd = 0, scale = 1 / (double)n;
for(int i=0;i<n;i++) {
rmsn += squ(scale * sz[i] - sx[i]);
rmsd += squ( sx[i]);
}
//
Sleef_free(sx);
Sleef_free(sy);
Sleef_free(sz);
//
return rmsn / rmsd;
}
int main(int argc, char **argv) {
if (argc < 2) {
fprintf(stderr, "%s <log2n> [<nloop>]\n", argv[0]);
exit(-1);
}
const int n = 1 << atoi(argv[1]);
const int nloop = argc >= 3 ? atoi(argv[2]) : 1;
srand((unsigned int)time(NULL));
SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET | SLEEF_PLAN_READONLY);
//
int success = 1;
double e;
for(int i=0;(nloop < 0 || i < nloop) && success;i++) {
e = check_c(n);
success = success && e < THRES;
printf("complex : %s (%g)\n", e < THRES ? "OK" : "NG", e);
e = check_r(n);
success = success && e < THRES;
printf("real : %s (%g)\n", e < THRES ? "OK" : "NG", e);
}
exit(!success);
}

View File

@@ -0,0 +1,118 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <time.h>
#include <math.h>
#include <complex.h>
#include "sleef.h"
#include "sleefdft.h"
#ifndef MODE
#define MODE (SLEEF_MODE_DEBUG | SLEEF_MODE_VERBOSE)
#endif
#if BASETYPEID == 1
#define THRES 1e-30
#define SleefDFT_init2d SleefDFT_double_init2d
#define SleefDFT_execute SleefDFT_double_execute
typedef double real;
#elif BASETYPEID == 2
#define THRES 1e-13
#define SleefDFT_init2d SleefDFT_float_init2d
#define SleefDFT_execute SleefDFT_float_execute
typedef float real;
#else
#error BASETYPEID not set
#endif
static double squ(double x) { return x * x; }
// complex transforms
double check_c(int n, int m) {
struct SleefDFT *p;
real *sx = (real *)Sleef_malloc(n*m*2 * sizeof(real));
real *sy = (real *)Sleef_malloc(n*m*2 * sizeof(real));
real *sz = (real *)Sleef_malloc(n*m*2 * sizeof(real));
for(int i=0;i<n*m*2;i++) sx[i] = (real)(2.0 * (rand() / (double)RAND_MAX) - 1);
//
p = SleefDFT_init2d(n, m, NULL, NULL, MODE);
if (p == NULL) {
printf("SleefDFT initialization failed\n");
exit(-1);
}
SleefDFT_execute(p, sx, sy);
SleefDFT_dispose(p);
//
p = SleefDFT_init2d(n, m, NULL, NULL, MODE | SLEEF_MODE_BACKWARD);
if (p == NULL) {
printf("SleefDFT initialization failed\n");
exit(-1);
}
SleefDFT_execute(p, sy, sz);
SleefDFT_dispose(p);
//
double rmsn = 0, rmsd = 0, scale = 1 / (n*(double)m);
for(int i=0;i<n*m;i++) {
rmsn += squ(scale * sz[i*2+0] - sx[i*2+0]) + squ(scale * sz[i*2+1] - sx[i*2+1]);
rmsd += squ( sx[i*2+0]) + squ( sx[i*2+1]);
}
//
Sleef_free(sx);
Sleef_free(sy);
Sleef_free(sz);
//
return rmsn / rmsd;
}
int main(int argc, char **argv) {
if (argc < 3) {
fprintf(stderr, "%s <log2n> <log2m> [<nloop>]\n", argv[0]);
exit(-1);
}
const int n = 1 << atoi(argv[1]);
const int m = 1 << atoi(argv[2]);
const int nloop = argc >= 4 ? atoi(argv[3]) : 1;
srand((unsigned int)time(NULL));
SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET | SLEEF_PLAN_READONLY);
//
int success = 1;
double e;
for(int i=0;(nloop < 0 || i < nloop) && success;i++) {
e = check_c(n, m);
success = success && e < THRES;
printf("complex : %s (%g)\n", e < THRES ? "OK" : "NG", e);
}
exit(!success);
}

View File

@@ -0,0 +1,80 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
// gcc tutorial.c -lsleef -lsleefdft -lm
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <complex.h>
#include "sleef.h"
#include "sleefdft.h"
#define THRES 1e-4
typedef double complex cmpl;
cmpl omega(double n, double kn) {
return cexp((-2 * M_PI * _Complex_I / n) * kn);
}
void forward(cmpl *ts, cmpl *fs, int len) {
for(int k=0;k<len;k++) {
fs[k] = 0;
for(int n=0;n<len;n++) fs[k] += ts[n] * omega(len, n*k);
}
}
int main(int argc, char **argv) {
int n = 256;
if (argc == 2) n = 1 << atoi(argv[1]);
SleefDFT_setPlanFilePath("plan.txt", NULL, SLEEF_PLAN_AUTOMATIC);
double *sx = (double *)Sleef_malloc(n*2 * sizeof(double));
double *sy = (double *)Sleef_malloc(n*2 * sizeof(double));
struct SleefDFT *p = SleefDFT_double_init1d(n, sx, sy, SLEEF_MODE_FORWARD);
if (p == NULL) {
printf("SleefDFT initialization failed\n");
exit(-1);
}
cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);
for(int i=0;i<n;i++) {
ts[i] =
(2.0 * (rand() / (double)RAND_MAX) - 1) * 1.0 +
(2.0 * (rand() / (double)RAND_MAX) - 1) * _Complex_I;
sx[(i*2+0)] = creal(ts[i]);
sx[(i*2+1)] = cimag(ts[i]);
}
forward(ts, fs, n);
SleefDFT_double_execute(p, NULL, NULL);
int success = 1;
for(int i=0;i<n;i++) {
if ((fabs(sy[(i*2+0)] - creal(fs[i])) > THRES) ||
(fabs(sy[(i*2+1)] - cimag(fs[i])) > THRES)) {
success = 0;
}
}
printf("%s\n", success ? "OK" : "NG");
free(fs); free(ts);
Sleef_free(sy); Sleef_free(sx);
SleefDFT_dispose(p);
exit(success ? 0 : -1);
}

View File

@@ -0,0 +1,425 @@
# Options
if (COMPILER_SUPPORTS_SVE)
set(SLEEFDFT_MAXBUTWIDTH 6 CACHE STRING "Log_2 (Maximum butterfly length) of butterflies")
else()
set(SLEEFDFT_MAXBUTWIDTH 4 CACHE STRING "Log_2 (Maximum butterfly length) of butterflies")
endif()
if (SLEEFDFT_MAXBUTWIDTH GREATER 7)
message(FATAL_ERROR "SLEEFDFT_MAXBUTWIDTH has to be smaller than 8." )
endif()
option(SLEEFDFT_ENABLE_STREAM "Streaming instructions are utilized in DFT." OFF)
# Settings
# Constants definition
set(LISTSHORTTYPENAME "dp" "sp")
set(LISTLONGTYPENAME "double" "float")
set(LISTTYPEID "1" "2")
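# For each kernel variant <isa><type>, MACRODEF_<isa><type> holds its compile
# definitions and CFLAGS_<isa><type> its compiler flags; <type> is dp/sp/ld/qp
# for double, single, long double and quad precision respectively.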
set(MACRODEF_vecextdp BASETYPEID=1 ENABLE_VECEXT CONFIG=1)
set(CFLAGS_vecextdp ${FLAGS_ENABLE_VECEXT})
set(MACRODEF_vecextsp BASETYPEID=2 ENABLE_VECEXT CONFIG=1)
set(CFLAGS_vecextsp ${FLAGS_ENABLE_VECEXT})
set(MACRODEF_vecextld BASETYPEID=3 ENABLE_VECEXT CONFIG=1)
set(CFLAGS_vecextld ${FLAGS_ENABLE_VECEXT})
set(MACRODEF_vecextqp BASETYPEID=4 ENABLE_VECEXT CONFIG=1)
set(CFLAGS_vecextqp ${FLAGS_ENABLE_VECEXT})
set(MACRODEF_purecdp BASETYPEID=1 ENABLE_PUREC CONFIG=1)
set(CFLAGS_purecdp ${FLAGS_ENABLE_PUREC})
set(MACRODEF_purecsp BASETYPEID=2 ENABLE_PUREC CONFIG=1)
set(CFLAGS_purecsp ${FLAGS_ENABLE_PUREC})
set(MACRODEF_purecld BASETYPEID=3 ENABLE_PUREC CONFIG=1)
set(CFLAGS_purecld ${FLAGS_ENABLE_PUREC})
set(MACRODEF_purecqp BASETYPEID=4 ENABLE_PUREC CONFIG=1)
set(CFLAGS_purecqp ${FLAGS_ENABLE_PUREC})
set(MACRODEF_sse2dp BASETYPEID=1 ENABLE_SSE2 CONFIG=4)
set(CFLAGS_sse2dp ${FLAGS_ENABLE_SSE4})
set(MACRODEF_sse2sp BASETYPEID=2 ENABLE_SSE2 CONFIG=4)
set(CFLAGS_sse2sp ${FLAGS_ENABLE_SSE4})
set(MACRODEF_avxdp BASETYPEID=1 ENABLE_AVX CONFIG=1)
set(CFLAGS_avxdp ${FLAGS_ENABLE_AVX})
set(MACRODEF_avxsp BASETYPEID=2 ENABLE_AVX CONFIG=1)
set(CFLAGS_avxsp ${FLAGS_ENABLE_AVX})
set(MACRODEF_avx2dp BASETYPEID=1 ENABLE_AVX2 CONFIG=1)
set(CFLAGS_avx2dp ${FLAGS_ENABLE_AVX2})
set(MACRODEF_avx2sp BASETYPEID=2 ENABLE_AVX2 CONFIG=1)
set(CFLAGS_avx2sp ${FLAGS_ENABLE_AVX2})
set(MACRODEF_avx512fdp BASETYPEID=1 ENABLE_AVX512F CONFIG=1)
set(CFLAGS_avx512fdp ${FLAGS_ENABLE_AVX512F})
set(MACRODEF_avx512fsp BASETYPEID=2 ENABLE_AVX512F CONFIG=1)
set(CFLAGS_avx512fsp ${FLAGS_ENABLE_AVX512F})
set(MACRODEF_advsimddp BASETYPEID=1 ENABLE_ADVSIMD CONFIG=1)
set(CFLAGS_advsimddp ${FLAGS_ENABLE_ADVSIMD})
set(MACRODEF_advsimdsp BASETYPEID=2 ENABLE_ADVSIMD CONFIG=1)
set(CFLAGS_advsimdsp ${FLAGS_ENABLE_ADVSIMD})
set(MACRODEF_neon32sp BASETYPEID=2 ENABLE_NEON32 CONFIG=1)
set(CFLAGS_neon32sp ${FLAGS_ENABLE_NEON32})
set(MACRODEF_sve256dp BASETYPEID=1 ENABLE_SVE CONFIG=8)
set(CFLAGS_sve256dp ${FLAGS_ENABLE_SVE})
set(MACRODEF_sve256sp BASETYPEID=2 ENABLE_SVE CONFIG=8)
set(CFLAGS_sve256sp ${FLAGS_ENABLE_SVE})
set(MACRODEF_sve512dp BASETYPEID=1 ENABLE_SVE CONFIG=9)
set(CFLAGS_sve512dp ${FLAGS_ENABLE_SVE})
set(MACRODEF_sve512sp BASETYPEID=2 ENABLE_SVE CONFIG=9)
set(CFLAGS_sve512sp ${FLAGS_ENABLE_SVE})
set(MACRODEF_sve1024dp BASETYPEID=1 ENABLE_SVE CONFIG=10)
set(CFLAGS_sve1024dp ${FLAGS_ENABLE_SVE})
set(MACRODEF_sve1024sp BASETYPEID=2 ENABLE_SVE CONFIG=10)
set(CFLAGS_sve1024sp ${FLAGS_ENABLE_SVE})
set(MACRODEF_sve2048dp BASETYPEID=1 ENABLE_SVE CONFIG=11)
set(CFLAGS_sve2048dp ${FLAGS_ENABLE_SVE})
set(MACRODEF_sve2048sp BASETYPEID=2 ENABLE_SVE CONFIG=11)
set(CFLAGS_sve2048sp ${FLAGS_ENABLE_SVE})
set(MACRODEF_rvvm1128dp BASETYPEID=1 ENABLE_RVVM1 CONFIG=7)
set(CFLAGS_rvvm1128dp ${FLAGS_ENABLE_RVVM1})
set(MACRODEF_rvvm1128sp BASETYPEID=2 ENABLE_RVVM1 CONFIG=7)
set(CFLAGS_rvvm1128sp ${FLAGS_ENABLE_RVVM1})
set(MACRODEF_rvvm1256dp BASETYPEID=1 ENABLE_RVVM1 CONFIG=8)
set(CFLAGS_rvvm1256dp ${FLAGS_ENABLE_RVVM1})
set(MACRODEF_rvvm1256sp BASETYPEID=2 ENABLE_RVVM1 CONFIG=8)
set(CFLAGS_rvvm1256sp ${FLAGS_ENABLE_RVVM1})
set(MACRODEF_rvvm1512dp BASETYPEID=1 ENABLE_RVVM1 CONFIG=9)
set(CFLAGS_rvvm1512dp ${FLAGS_ENABLE_RVVM1})
set(MACRODEF_rvvm1512sp BASETYPEID=2 ENABLE_RVVM1 CONFIG=9)
set(CFLAGS_rvvm1512sp ${FLAGS_ENABLE_RVVM1})
set(MACRODEF_rvvm11024dp BASETYPEID=1 ENABLE_RVVM1 CONFIG=10)
set(CFLAGS_rvvm11024dp ${FLAGS_ENABLE_RVVM1})
set(MACRODEF_rvvm11024sp BASETYPEID=2 ENABLE_RVVM1 CONFIG=10)
set(CFLAGS_rvvm11024sp ${FLAGS_ENABLE_RVVM1})
set(MACRODEF_rvvm12048dp BASETYPEID=1 ENABLE_RVVM1 CONFIG=11)
set(CFLAGS_rvvm12048dp ${FLAGS_ENABLE_RVVM1})
set(MACRODEF_rvvm12048sp BASETYPEID=2 ENABLE_RVVM1 CONFIG=11)
set(CFLAGS_rvvm12048sp ${FLAGS_ENABLE_RVVM1})
set(MACRODEF_rvvm2128dp BASETYPEID=1 ENABLE_RVVM2 CONFIG=7)
set(CFLAGS_rvvm2128dp ${FLAGS_ENABLE_RVVM2})
set(MACRODEF_rvvm2128sp BASETYPEID=2 ENABLE_RVVM2 CONFIG=7)
set(CFLAGS_rvvm2128sp ${FLAGS_ENABLE_RVVM2})
set(MACRODEF_rvvm2256dp BASETYPEID=1 ENABLE_RVVM2 CONFIG=8)
set(CFLAGS_rvvm2256dp ${FLAGS_ENABLE_RVVM2})
set(MACRODEF_rvvm2256sp BASETYPEID=2 ENABLE_RVVM2 CONFIG=8)
set(CFLAGS_rvvm2256sp ${FLAGS_ENABLE_RVVM2})
set(MACRODEF_rvvm2512dp BASETYPEID=1 ENABLE_RVVM2 CONFIG=9)
set(CFLAGS_rvvm2512dp ${FLAGS_ENABLE_RVVM2})
set(MACRODEF_rvvm2512sp BASETYPEID=2 ENABLE_RVVM2 CONFIG=9)
set(CFLAGS_rvvm2512sp ${FLAGS_ENABLE_RVVM2})
set(MACRODEF_rvvm21024dp BASETYPEID=1 ENABLE_RVVM2 CONFIG=10)
set(CFLAGS_rvvm21024dp ${FLAGS_ENABLE_RVVM2})
set(MACRODEF_rvvm21024sp BASETYPEID=2 ENABLE_RVVM2 CONFIG=10)
set(CFLAGS_rvvm21024sp ${FLAGS_ENABLE_RVVM2})
set(MACRODEF_rvvm22048dp BASETYPEID=1 ENABLE_RVVM2 CONFIG=11)
set(CFLAGS_rvvm22048dp ${FLAGS_ENABLE_RVVM2})
set(MACRODEF_rvvm22048sp BASETYPEID=2 ENABLE_RVVM2 CONFIG=11)
set(CFLAGS_rvvm22048sp ${FLAGS_ENABLE_RVVM2})
set(MACRODEF_vsxdp BASETYPEID=1 ENABLE_VSX CONFIG=1)
set(CFLAGS_vsxdp ${FLAGS_ENABLE_VSX})
set(MACRODEF_vsxsp BASETYPEID=2 ENABLE_VSX CONFIG=1)
set(CFLAGS_vsxsp ${FLAGS_ENABLE_VSX})
set(MACRODEF_vsx3dp BASETYPEID=1 ENABLE_VSX3 CONFIG=1)
set(CFLAGS_vsx3dp ${FLAGS_ENABLE_VSX3})
set(MACRODEF_vsx3sp BASETYPEID=2 ENABLE_VSX3 CONFIG=1)
set(CFLAGS_vsx3sp ${FLAGS_ENABLE_VSX3})
set(MACRODEF_vxedp BASETYPEID=1 ENABLE_VXE CONFIG=140)
set(CFLAGS_vxedp ${FLAGS_ENABLE_VXE})
set(MACRODEF_vxesp BASETYPEID=2 ENABLE_VXE CONFIG=140)
set(CFLAGS_vxesp ${FLAGS_ENABLE_VXE})
set(MACRODEF_vxe2dp BASETYPEID=1 ENABLE_VXE2 CONFIG=150)
set(CFLAGS_vxe2dp ${FLAGS_ENABLE_VXE2})
set(MACRODEF_vxe2sp BASETYPEID=2 ENABLE_VXE2 CONFIG=150)
set(CFLAGS_vxe2sp ${FLAGS_ENABLE_VXE2})
# List all available scalar data types
set(ISALIST_SP purecsp)
set(ISALIST_DP purecdp)
set(LIST_SUPPORTED_FPTYPE 0 1)
if(CMAKE_C_COMPILER_ID MATCHES "(GNU|Clang)")
set(ISALIST_SP vecextsp)
set(ISALIST_DP vecextdp)
endif(CMAKE_C_COMPILER_ID MATCHES "(GNU|Clang)")
# List all available vector data types
if (COMPILER_SUPPORTS_SSE4)
set(ISALIST_SP ${ISALIST_SP} sse2sp)
set(ISALIST_DP ${ISALIST_DP} sse2dp)
endif(COMPILER_SUPPORTS_SSE4)
if (COMPILER_SUPPORTS_AVX)
set(ISALIST_SP ${ISALIST_SP} avxsp)
set(ISALIST_DP ${ISALIST_DP} avxdp)
endif(COMPILER_SUPPORTS_AVX)
if (COMPILER_SUPPORTS_AVX2)
set(ISALIST_SP ${ISALIST_SP} avx2sp)
set(ISALIST_DP ${ISALIST_DP} avx2dp)
endif(COMPILER_SUPPORTS_AVX2)
if (COMPILER_SUPPORTS_AVX512F)
set(ISALIST_SP ${ISALIST_SP} avx512fsp)
set(ISALIST_DP ${ISALIST_DP} avx512fdp)
endif(COMPILER_SUPPORTS_AVX512F)
if (COMPILER_SUPPORTS_ADVSIMD)
set(ISALIST_SP ${ISALIST_SP} advsimdsp)
set(ISALIST_DP ${ISALIST_DP} advsimddp)
endif(COMPILER_SUPPORTS_ADVSIMD)
if (COMPILER_SUPPORTS_SVE)
set(ISALIST_SP ${ISALIST_SP} sve256sp sve512sp sve1024sp sve2048sp)
set(ISALIST_DP ${ISALIST_DP} sve256dp sve512dp sve1024dp sve2048dp)
endif(COMPILER_SUPPORTS_SVE)
if (COMPILER_SUPPORTS_NEON32)
set(ISALIST_SP ${ISALIST_SP} neon32sp)
endif(COMPILER_SUPPORTS_NEON32)
if (COMPILER_SUPPORTS_RVVM1)
set(ISALIST_SP ${ISALIST_SP} rvvm1128sp rvvm1256sp rvvm1512sp rvvm11024sp rvvm12048sp)
set(ISALIST_DP ${ISALIST_DP} rvvm1128dp rvvm1256dp rvvm1512dp rvvm11024dp rvvm12048dp)
endif(COMPILER_SUPPORTS_RVVM1)
if (COMPILER_SUPPORTS_RVVM2)
set(ISALIST_SP ${ISALIST_SP} rvvm2128sp rvvm2256sp rvvm2512sp rvvm21024sp rvvm22048sp)
set(ISALIST_DP ${ISALIST_DP} rvvm2128dp rvvm2256dp rvvm2512dp rvvm21024dp rvvm22048dp)
endif(COMPILER_SUPPORTS_RVVM2)
if (COMPILER_SUPPORTS_VSX)
set(ISALIST_SP ${ISALIST_SP} vsxsp)
set(ISALIST_DP ${ISALIST_DP} vsxdp)
endif(COMPILER_SUPPORTS_VSX)
if (COMPILER_SUPPORTS_VSX3)
set(ISALIST_SP ${ISALIST_SP} vsx3sp)
set(ISALIST_DP ${ISALIST_DP} vsx3dp)
endif(COMPILER_SUPPORTS_VSX3)
if (COMPILER_SUPPORTS_VXE)
set(ISALIST_SP ${ISALIST_SP} vxesp)
set(ISALIST_DP ${ISALIST_DP} vxedp)
endif(COMPILER_SUPPORTS_VXE)
if (COMPILER_SUPPORTS_VXE2)
set(ISALIST_SP ${ISALIST_SP} vxe2sp)
set(ISALIST_DP ${ISALIST_DP} vxe2dp)
endif(COMPILER_SUPPORTS_VXE2)
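# NLIST enumerates the butterfly variants to generate; variants 1 and 3 appear
# to correspond to the "stream" configurations (cf. configStr in dftcommon.c)
# and are only generated when SLEEFDFT_ENABLE_STREAM is ON.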
if(SLEEFDFT_ENABLE_STREAM)
set(NLIST 0 1 2 3)
else()
set(NLIST 0 2)
endif()
#
# Compiler properties
set(CMAKE_C_FLAGS "${ORG_CMAKE_C_FLAGS} ${DFT_C_FLAGS}")
set(COMMON_TARGET_PROPERTIES
C_STANDARD 99 # -std=gnu99
)
if (BUILD_SHARED_LIBS)
list(APPEND COMMON_TARGET_PROPERTIES POSITION_INDEPENDENT_CODE ON) # -fPIC
endif()
set(COMMON_TARGET_DEFINITIONS ${COMMON_TARGET_DEFINITIONS} MAXBUTWIDTH=${SLEEFDFT_MAXBUTWIDTH})
if (SLEEFDFT_ENABLE_STREAM)
set(COMMON_TARGET_DEFINITIONS ${COMMON_TARGET_DEFINITIONS} ENABLE_STREAM=1)
else()
set(COMMON_TARGET_DEFINITIONS ${COMMON_TARGET_DEFINITIONS} ENABLE_STREAM=0)
endif()
if(COMPILER_SUPPORTS_OPENMP)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
endif(COMPILER_SUPPORTS_OPENMP)
# Include directories
include_directories(${PROJECT_SOURCE_DIR}/include)
include_directories(${PROJECT_BINARY_DIR}/include)
include_directories(${CMAKE_CURRENT_BINARY_DIR})
# Target mkunroll
set(TARGET_MKUNROLL "mkunroll")
add_host_executable(${TARGET_MKUNROLL} mkunroll.c)
set_target_properties(${TARGET_MKUNROLL} PROPERTIES ${COMMON_TARGET_PROPERTIES})
if (NOT CMAKE_CROSSCOMPILING)
target_compile_definitions(${TARGET_MKUNROLL} PRIVATE ${COMMON_TARGET_DEFINITIONS})
endif()
# Target mkdispatch
set(TARGET_MKDISPATCH "mkdispatch")
add_host_executable(${TARGET_MKDISPATCH} mkdispatch.c)
set_target_properties(${TARGET_MKDISPATCH} PROPERTIES ${COMMON_TARGET_PROPERTIES})
if (NOT CMAKE_CROSSCOMPILING)
target_compile_definitions(${TARGET_MKDISPATCH} PRIVATE ${COMMON_TARGET_DEFINITIONS})
endif()
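# mkdispatch runs on the build host and generates the dispatch headers that
# select among the ISA-specific kernels listed above.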
# Target dispatchparam.h
add_custom_command(OUTPUT dispatchparam.h
COMMENT "Generating dispatchparam.h"
COMMAND $<TARGET_FILE:${TARGET_MKDISPATCH}> paramonly ${SLEEFDFT_MAXBUTWIDTH} ${ISALIST_DP} > ${CMAKE_CURRENT_BINARY_DIR}/dispatchparam.h
DEPENDS ${TARGET_MKDISPATCH}
)
add_custom_target(dispatchparam.h_generated SOURCES ${CMAKE_CURRENT_BINARY_DIR}/dispatchparam.h)
# Target dispatch*.h
foreach(T ${LIST_SUPPORTED_FPTYPE})
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
string(TOUPPER ${ST} CST) # CST is "DP"
list(GET LISTLONGTYPENAME ${T} LT) # LT is "double"
list(GET LISTTYPEID ${T} ID) # ID is 1
string(CONCAT S "dispatch" ${ST} ".h") # S is dispatchdp.h
add_custom_command(OUTPUT ${S}
COMMENT "Generating ${S}"
COMMAND $<TARGET_FILE:${TARGET_MKDISPATCH}> ${LT} ${SLEEFDFT_MAXBUTWIDTH} ${ISALIST_${CST}} > ${S}
DEPENDS ${TARGET_MKDISPATCH}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)
string(CONCAT G ${S} "_generated") # G is dispatchdp.h_generated
add_custom_target(${G} SOURCES ${S})
endforeach()
# Target dftcommon.o
add_library(dftcommon_obj OBJECT dftcommon.c dftcommon.h ${CMAKE_CURRENT_BINARY_DIR}/dispatchparam.h ${sleef_BINARY_DIR}/include/sleef.h)
add_dependencies(dftcommon_obj ${TARGET_HEADERS} dispatchparam.h_generated)
set_source_files_properties(${sleef_BINARY_DIR}/include/sleef.h PROPERTIES GENERATED TRUE)
set_target_properties(dftcommon_obj PROPERTIES ${COMMON_TARGET_PROPERTIES})
target_compile_definitions(dftcommon_obj PRIVATE ${COMMON_TARGET_DEFINITIONS})
# Target dft*.o
foreach(T ${LIST_SUPPORTED_FPTYPE})
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
string(CONCAT G "dft" ${ST} "_obj") # G is "dftdp_obj"
string(CONCAT S "dispatch" ${ST} ".h") # S is "dispatchdp.h"
add_library(${G} OBJECT dft.c dftcommon.h ${S})
string(CONCAT SG ${S} "_generated") # SG is "dispatchdp.h_generated"
add_dependencies(${G} ${SG} ${TARGET_HEADERS})
set_target_properties(${G} PROPERTIES ${COMMON_TARGET_PROPERTIES})
list(GET LISTTYPEID ${T} ID) # ID is 1
target_compile_definitions(${G} PRIVATE BASETYPEID=${ID} ${COMMON_TARGET_DEFINITIONS})
endforeach()
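# mkunroll reads unroll0.org (copied below) and expands it into one
# unroll_<N>_<isa>.c source per ISA/variant combination.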
# Copy unroll0.org to ${CMAKE_CURRENT_BINARY_DIR}
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/unroll0.org
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/unroll0.org ${CMAKE_CURRENT_BINARY_DIR}
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/unroll0.org)
add_custom_target(unroll0.org.copied DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/unroll0.org)
# Target unroll*.c
foreach(T ${LIST_SUPPORTED_FPTYPE})
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
string(TOUPPER ${ST} CST) # CST is "DP"
list(GET LISTLONGTYPENAME ${T} LT) # LT is "double"
foreach(E ${ISALIST_${CST}}) # E is "sse2dp"
foreach(N ${NLIST})
string(CONCAT UC unroll_ ${N} _ ${E} ".c") # UC is "unroll_0_sse2dp.c"
set(UNROLL_TARGET_${CST} ${UNROLL_TARGET_${CST}} ${UC})
endforeach()
endforeach()
message(STATUS "Unroll target for ${CST} : ${UNROLL_TARGET_${CST}}")
if(UNROLL_TARGET_${CST})
add_custom_command(OUTPUT ${UNROLL_TARGET_${CST}}
COMMENT "Generating ${UNROLL_TARGET_${CST}}"
COMMAND $<TARGET_FILE:${TARGET_MKUNROLL}> ${LT} ${ISALIST_${CST}}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
DEPENDS ${TARGET_MKUNROLL} unroll0.org.copied
)
add_custom_target(unroll_target_${ST} DEPENDS ${UNROLL_TARGET_${CST}})
endif()
endforeach()
# Target unroll*.o
foreach(T ${LIST_SUPPORTED_FPTYPE})
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
string(TOUPPER ${ST} CST) # CST is "DP"
list(GET LISTLONGTYPENAME ${T} LT) # LT is "double"
foreach(E ${ISALIST_${CST}}) # E is "sse2dp"
foreach(N ${NLIST})
string(CONCAT U unroll_ ${N} _ ${E}) # U is "unroll_0_sse2dp"
string(CONCAT UG ${U} "_obj") # UG is "unroll_0_sse2dp_obj"
string(CONCAT UC ${U} ".c") # UC is "unroll_0_sse2dp.c"
add_library(${UG} OBJECT ${UC})
set_target_properties(${UG} PROPERTIES ${COMMON_TARGET_PROPERTIES})
target_include_directories(${UG} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
target_compile_definitions(${UG} PRIVATE ${COMMON_TARGET_DEFINITIONS} ${MACRODEF_${E}})
target_compile_options(${UG} PRIVATE ${CFLAGS_${E}})
add_dependencies(${UG} ${TARGET_HEADERS} unroll_target_${ST})
endforeach()
endforeach()
endforeach()
# Target libdft
add_library(${TARGET_LIBDFT} $<TARGET_OBJECTS:dftcommon_obj> $<TARGET_OBJECTS:${TARGET_LIBARRAYMAP_OBJ}>)
target_link_libraries(${TARGET_LIBDFT} ${TARGET_LIBSLEEF} ${LIBM})
foreach(T ${LIST_SUPPORTED_FPTYPE})
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
string(CONCAT G "dft" ${ST} "_obj") # G is "dftdp_obj"
target_sources(${TARGET_LIBDFT} PRIVATE $<TARGET_OBJECTS:${G}>)
endforeach()
foreach(T ${LIST_SUPPORTED_FPTYPE})
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
string(TOUPPER ${ST} CST) # CST is "DP"
foreach(E ${ISALIST_${CST}}) # E is "sse2dp"
foreach(N ${NLIST})
string(CONCAT UG unroll_ ${N} _ ${E} "_obj") # UG is "unroll_0_sse2dp_obj"
target_sources(${TARGET_LIBDFT} PRIVATE $<TARGET_OBJECTS:${UG}>)
endforeach()
endforeach()
endforeach()
set_target_properties(${TARGET_LIBDFT} PROPERTIES
VERSION ${SLEEF_VERSION}
SOVERSION ${SLEEF_SOVERSION}
PUBLIC_HEADER ${PROJECT_SOURCE_DIR}/include/sleefdft.h
${COMMON_TARGET_PROPERTIES}
)
# Install
install(
TARGETS ${TARGET_LIBDFT}
EXPORT sleefTargets
PUBLIC_HEADER #
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
COMPONENT sleef_Development
LIBRARY #
DESTINATION "${CMAKE_INSTALL_LIBDIR}"
COMPONENT sleef_Runtime
NAMELINK_COMPONENT sleef_Development
ARCHIVE #
DESTINATION "${CMAKE_INSTALL_LIBDIR}"
COMPONENT sleef_Development
RUNTIME #
DESTINATION "${CMAKE_INSTALL_BINDIR}"
COMPONENT sleef_Runtime
INCLUDES #
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
)

File diff suppressed because it is too large

View File

@@ -0,0 +1,423 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <ctype.h>
#include <inttypes.h>
#include <assert.h>
#include <math.h>
#ifdef _OPENMP
#include <omp.h>
#endif
#include "misc.h"
#include "sleef.h"
#define IMPORT_IS_EXPORT
#include "sleefdft.h"
#include "dispatchparam.h"
#include "dftcommon.h"
#include "common.h"
#include "arraymap.h"
#define MAGIC_FLOAT 0x31415926
#define MAGIC_DOUBLE 0x27182818
#define MAGIC2D_FLOAT 0x22360679
#define MAGIC2D_DOUBLE 0x17320508
const char *configStr[] = { "ST", "ST stream", "MT", "MT stream" };
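// Parses a human-readable transform path such as "4(MT) 4 2": each number is
// a butterfly width (the widths must sum to log2len), optionally followed by
// one of the configStr entries above in parentheses. The example string here
// is illustrative, not taken from an actual plan file.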
static int parsePathStr(char *p, int *path, int *config, int pathLenMax, int log2len) {
int pathLen = 0, l2l = 0;
for(;;) {
while(*p == ' ') p++;
if (*p == '\0') break;
if (!isdigit((int)*p)) return -1;
pathLen++;
if (pathLen >= pathLenMax) return -2;
int n = 0;
while(isdigit((int)*p)) n = n * 10 + *p++ - '0';
if (n > MAXBUTWIDTH) return -6;
path[pathLen-1] = n;
l2l += n;
config[pathLen-1] = 0;
if (*p != '(') continue;
int c;
for(c=3;c>=0;c--) if (strncmp(p+1, configStr[c], strlen(configStr[c])) == 0) break;
if (c == -1) return -3;
p += strlen(configStr[c]) + 1;
if (*p != ')') return -4;
p++;
config[pathLen-1] = c;
}
if (l2l != log2len) return -5;
return pathLen;
}
EXPORT void SleefDFT_setPath(SleefDFT *p, char *pathStr) {
assert(p != NULL && (p->magic == MAGIC_FLOAT || p->magic == MAGIC_DOUBLE));
int path[32], config[32];
int pathLen = parsePathStr(pathStr, path, config, 31, p->log2len);
if (pathLen < 0) {
if ((p->mode & SLEEF_MODE_VERBOSE) != 0) printf("Error %d in parsing path string : %s\n", pathLen, pathStr);
return;
}
for(uint32_t j = 0;j <= p->log2len;j++) p->bestPath[j] = 0;
for(int level = p->log2len, j=0;level > 0 && j < pathLen;) {
p->bestPath[level] = path[j];
p->bestPathConfig[level] = config[j];
level -= path[j];
j++;
}
p->pathLen = 0;
for(int j = p->log2len;j >= 0;j--) if (p->bestPath[j] != 0) p->pathLen++;
if ((p->mode & SLEEF_MODE_VERBOSE) != 0) {
printf("Set path : ");
for(int j = p->log2len;j >= 0;j--) if (p->bestPath[j] != 0) printf("%d(%s) ", p->bestPath[j], configStr[p->bestPathConfig[j]]);
printf("\n");
}
}
void freeTables(SleefDFT *p) {
for(int N=1;N<=MAXBUTWIDTH;N++) {
for(uint32_t level=N;level<=p->log2len;level++) {
Sleef_free(p->tbl[N][level]);
}
free(p->tbl[N]);
p->tbl[N] = NULL;
}
}
EXPORT void SleefDFT_dispose(SleefDFT *p) {
if (p != NULL && (p->magic == MAGIC2D_FLOAT || p->magic == MAGIC2D_DOUBLE)) {
Sleef_free(p->tBuf);
SleefDFT_dispose(p->instH);
if (p->hlen != p->vlen) SleefDFT_dispose(p->instV);
p->magic = 0;
free(p);
return;
}
assert(p != NULL && (p->magic == MAGIC_FLOAT || p->magic == MAGIC_DOUBLE));
if (p->log2len <= 1) {
p->magic = 0;
free(p);
return;
}
if ((p->mode & SLEEF_MODE_REAL) != 0) {
Sleef_free(p->rtCoef1);
Sleef_free(p->rtCoef0);
p->rtCoef0 = p->rtCoef1 = NULL;
}
for(int level = p->log2len;level >= 1;level--) {
Sleef_free(p->perm[level]);
}
free(p->perm);
p->perm = NULL;
freeTables(p);
p->magic = 0;
free(p);
}
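// Returns floor(log2(q)) for q >= 1, via a 4-bit lookup table.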
uint32_t ilog2(uint32_t q) {
static const uint32_t tab[] = {0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4};
uint32_t r = 0,qq;
if (q & 0xffff0000) r = 16;
q >>= r;
qq = q | (q >> 1);
qq |= (qq >> 2);
qq = ((qq & 0x10) >> 4) | ((qq & 0x100) >> 7) | ((qq & 0x1000) >> 10);
return r + tab[qq] * 4 + tab[q >> (tab[qq] * 4)] - 1;
}
//
char *dftPlanFilePath = NULL;
char *archID = NULL;
uint64_t planMode = SLEEF_PLAN_REFERTOENVVAR;
ArrayMap *planMap = NULL;
int planFilePathSet = 0, planFileLoaded = 0;
#ifdef _OPENMP
omp_lock_t planMapLock;
int planMapLockInitialized = 0;
#endif
static void initPlanMapLock() {
#ifdef _OPENMP
#pragma omp critical
{
if (!planMapLockInitialized) {
planMapLockInitialized = 1;
omp_init_lock(&planMapLock);
}
}
#endif
}
static void planMap_clear() {
if (planMap != NULL) ArrayMap_dispose(planMap);
planMap = NULL;
}
EXPORT void SleefDFT_setPlanFilePath(const char *path, const char *arch, uint64_t mode) {
initPlanMapLock();
if ((mode & SLEEF_PLAN_RESET) != 0) {
planMap_clear();
planFileLoaded = 0;
planFilePathSet = 0;
}
if (dftPlanFilePath != NULL) free(dftPlanFilePath);
if (path != NULL) {
dftPlanFilePath = malloc(strlen(path)+10);
strcpy(dftPlanFilePath, path);
} else {
dftPlanFilePath = NULL;
}
if (archID != NULL) free(archID);
if (arch == NULL) arch = Sleef_getCpuIdString();
archID = malloc(strlen(arch)+10);
strcpy(archID, arch);
planMode = mode;
planFilePathSet = 1;
}
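// The plan cache is loaded lazily from dftPlanFilePath (falling back to the
// ENVVAR environment variable) and written back by savePlanToFile() unless
// SLEEF_PLAN_READONLY is set.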
static void loadPlanFromFile() {
if (planFilePathSet == 0 && (planMode & SLEEF_PLAN_REFERTOENVVAR) != 0) {
char *s = getenv(ENVVAR);
if (s != NULL) SleefDFT_setPlanFilePath(s, NULL, planMode);
}
if (planMap != NULL) ArrayMap_dispose(planMap);
if (dftPlanFilePath != NULL && (planMode & SLEEF_PLAN_RESET) == 0) {
planMap = ArrayMap_load(dftPlanFilePath, archID, PLANFILEID, (planMode & SLEEF_PLAN_NOLOCK) == 0);
}
if (planMap == NULL) planMap = initArrayMap();
planFileLoaded = 1;
}
static void savePlanToFile() {
assert(planFileLoaded);
if ((planMode & SLEEF_PLAN_READONLY) == 0 && dftPlanFilePath != NULL) {
ArrayMap_save(planMap, dftPlanFilePath, archID, PLANFILEID);
}
}
#define CATBIT 8
#define BASETYPEIDBIT 2
#define LOG2LENBIT 8
#define DIRBIT 1
#define BUTSTATBIT 16
static uint64_t keyButStat(int baseTypeID, int log2len, int dir, int butStat) {
dir = (dir & SLEEF_MODE_BACKWARD) == 0;
int cat = 0;
uint64_t k = 0;
k = (k << BUTSTATBIT) | (butStat & ~(~(uint64_t)0 << BUTSTATBIT));
k = (k << LOG2LENBIT) | (log2len & ~(~(uint64_t)0 << LOG2LENBIT));
k = (k << DIRBIT) | (dir & ~(~(uint64_t)0 << DIRBIT));
k = (k << BASETYPEIDBIT) | (baseTypeID & ~(~(uint64_t)0 << BASETYPEIDBIT));
k = (k << CATBIT) | (cat & ~(~(uint64_t)0 << CATBIT));
return k;
}
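/* keyButStat above packs a plan-file key into a 64-bit integer; its layout,
   from high to low bits (upper bits unused), is
   [butStat:16][log2len:8][dir:1][baseTypeID:2][cat:8] with cat = 0.
   keyTrans, keyPath and keyPathConfig below use the same scheme with
   cat = 2, 3 and 4 respectively, so the different kinds of entries can
   never collide in the plan map. */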
#define LEVELBIT LOG2LENBIT
#define BUTCONFIGBIT 8
#define TRANSCONFIGBIT 8
static uint64_t keyTrans(int baseTypeID, int hlen, int vlen, int transConfig) {
int max = MAX(hlen, vlen), min = MIN(hlen, vlen);
int cat = 2;
uint64_t k = 0;
k = (k << TRANSCONFIGBIT) | (transConfig & ~(~(uint64_t)0 << TRANSCONFIGBIT));
k = (k << LOG2LENBIT) | (max & ~(~(uint64_t)0 << LOG2LENBIT));
k = (k << LOG2LENBIT) | (min & ~(~(uint64_t)0 << LOG2LENBIT));
k = (k << BASETYPEIDBIT) | (baseTypeID & ~(~(uint64_t)0 << BASETYPEIDBIT));
k = (k << CATBIT) | (cat & ~(~(uint64_t)0 << CATBIT));
return k;
}
static uint64_t keyPath(int baseTypeID, int log2len, int dir, int level, int config) {
dir = (dir & SLEEF_MODE_BACKWARD) == 0;
int cat = 3;
uint64_t k = 0;
k = (k << BUTCONFIGBIT) | (config & ~(~(uint64_t)0 << BUTCONFIGBIT));
k = (k << LEVELBIT) | (level & ~(~(uint64_t)0 << LEVELBIT));
k = (k << LOG2LENBIT) | (log2len & ~(~(uint64_t)0 << LOG2LENBIT));
k = (k << DIRBIT) | (dir & ~(~(uint64_t)0 << DIRBIT));
k = (k << BASETYPEIDBIT) | (baseTypeID & ~(~(uint64_t)0 << BASETYPEIDBIT));
k = (k << CATBIT) | (cat & ~(~(uint64_t)0 << CATBIT));
return k;
}
static uint64_t keyPathConfig(int baseTypeID, int log2len, int dir, int level, int config) {
dir = (dir & SLEEF_MODE_BACKWARD) == 0;
int cat = 4;
uint64_t k = 0;
k = (k << BUTCONFIGBIT) | (config & ~(~(uint64_t)0 << BUTCONFIGBIT));
k = (k << LEVELBIT) | (level & ~(~(uint64_t)0 << LEVELBIT));
k = (k << LOG2LENBIT) | (log2len & ~(~(uint64_t)0 << LOG2LENBIT));
k = (k << DIRBIT) | (dir & ~(~(uint64_t)0 << DIRBIT));
k = (k << BASETYPEIDBIT) | (baseTypeID & ~(~(uint64_t)0 << BASETYPEIDBIT));
k = (k << CATBIT) | (cat & ~(~(uint64_t)0 << CATBIT));
return k;
}
static uint64_t planMap_getU64(uint64_t key) {
char *s = ArrayMap_get(planMap, key);
if (s == NULL) return 0;
uint64_t ret;
if (sscanf(s, "%" SCNx64, &ret) != 1) return 0;
return ret;
}
static void planMap_putU64(uint64_t key, uint64_t value) {
char *s = malloc(100);
sprintf(s, "%" PRIx64, value);
s = ArrayMap_put(planMap, key, s);
if (s != NULL) free(s);
}
int PlanManager_loadMeasurementResultsP(SleefDFT *p, int pathCat) {
assert(p != NULL && (p->magic == MAGIC_FLOAT || p->magic == MAGIC_DOUBLE));
initPlanMapLock();
#ifdef _OPENMP
omp_set_lock(&planMapLock);
#endif
if (!planFileLoaded) loadPlanFromFile();
int stat = planMap_getU64(keyButStat(p->baseTypeID, p->log2len, p->mode, pathCat+10));
if (stat == 0) {
#ifdef _OPENMP
omp_unset_lock(&planMapLock);
#endif
return 0;
}
int ret = 1;
for(int j = p->log2len;j >= 0;j--) {
p->bestPath[j] = planMap_getU64(keyPath(p->baseTypeID, p->log2len, p->mode, j, pathCat));
p->bestPathConfig[j] = planMap_getU64(keyPathConfig(p->baseTypeID, p->log2len, p->mode, j, pathCat));
if (p->bestPath[j] > MAXBUTWIDTH) ret = 0;
}
p->pathLen = 0;
for(int j = p->log2len;j >= 0;j--) if (p->bestPath[j] != 0) p->pathLen++;
#ifdef _OPENMP
omp_unset_lock(&planMapLock);
#endif
return ret;
}
void PlanManager_saveMeasurementResultsP(SleefDFT *p, int pathCat) {
assert(p != NULL && (p->magic == MAGIC_FLOAT || p->magic == MAGIC_DOUBLE));
initPlanMapLock();
#ifdef _OPENMP
omp_set_lock(&planMapLock);
#endif
if (!planFileLoaded) loadPlanFromFile();
if (planMap_getU64(keyButStat(p->baseTypeID, p->log2len, p->mode, pathCat+10)) != 0) {
#ifdef _OPENMP
omp_unset_lock(&planMapLock);
#endif
return;
}
for(int j = p->log2len;j >= 0;j--) {
planMap_putU64(keyPath(p->baseTypeID, p->log2len, p->mode, j, pathCat), p->bestPath[j]);
planMap_putU64(keyPathConfig(p->baseTypeID, p->log2len, p->mode, j, pathCat), p->bestPathConfig[j]);
}
planMap_putU64(keyButStat(p->baseTypeID, p->log2len, p->mode, pathCat+10), 1);
if ((planMode & SLEEF_PLAN_READONLY) == 0) savePlanToFile();
#ifdef _OPENMP
omp_unset_lock(&planMapLock);
#endif
}
int PlanManager_loadMeasurementResultsT(SleefDFT *p) {
assert(p != NULL && (p->magic == MAGIC2D_FLOAT || p->magic == MAGIC2D_DOUBLE));
initPlanMapLock();
#ifdef _OPENMP
omp_set_lock(&planMapLock);
#endif
if (!planFileLoaded) loadPlanFromFile();
p->tmNoMT = planMap_getU64(keyTrans(p->baseTypeID, p->log2hlen, p->log2vlen, 0));
p->tmMT = planMap_getU64(keyTrans(p->baseTypeID, p->log2hlen, p->log2vlen, 1));
#ifdef _OPENMP
omp_unset_lock(&planMapLock);
#endif
return p->tmNoMT != 0;
}
void PlanManager_saveMeasurementResultsT(SleefDFT *p) {
assert(p != NULL && (p->magic == MAGIC2D_FLOAT || p->magic == MAGIC2D_DOUBLE));
initPlanMapLock();
#ifdef _OPENMP
omp_set_lock(&planMapLock);
#endif
if (!planFileLoaded) loadPlanFromFile();
planMap_putU64(keyTrans(p->baseTypeID, p->log2hlen, p->log2vlen, 0), p->tmNoMT);
planMap_putU64(keyTrans(p->baseTypeID, p->log2hlen, p->log2vlen, 1), p->tmMT );
if ((planMode & SLEEF_PLAN_READONLY) == 0) savePlanToFile();
#ifdef _OPENMP
omp_unset_lock(&planMapLock);
#endif
}
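/* A minimal usage sketch (the file name is hypothetical; the constants are
   the ones used above): an application that wants DFT plans persisted
   across runs could call, before building any SleefDFT object,
     SleefDFT_setPlanFilePath("./myapp.sleefplan", NULL, SLEEF_PLAN_REFERTOENVVAR);
   Passing NULL for arch selects Sleef_getCpuIdString(), and when no path
   has been set at all, loadPlanFromFile falls back to the SLEEFDFTPLAN
   environment variable. */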

View File

@@ -0,0 +1,69 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#define CONFIGMAX 4
#define CONFIG_STREAM 1
#define CONFIG_MT 2
#define MAXLOG2LEN 32
typedef struct SleefDFT {
uint32_t magic;
uint64_t mode, mode2, mode3;
int baseTypeID;
const void *in;
void *out;
union {
struct {
uint32_t log2len;
void **tbl[MAXBUTWIDTH+1];
void *rtCoef0, *rtCoef1;
uint32_t **perm;
void **x0, **x1;
int isa;
int planMode;
int vecwidth, log2vecwidth;
int nThread;
uint64_t tm[CONFIGMAX][(MAXBUTWIDTH+1)*32];
uint64_t bestTime;
int16_t bestPath[32], bestPathConfig[32], pathLen;
};
struct {
int32_t hlen, vlen;
int32_t log2hlen, log2vlen;
uint64_t tmNoMT, tmMT;
struct SleefDFT *instH, *instV;
void *tBuf;
};
};
} SleefDFT;
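/* The anonymous union overlays the state of a 1-D transform (first member
   struct) with that of a 2-D transform (second member struct); the magic
   field tells which interpretation is active (MAGIC_FLOAT/MAGIC_DOUBLE for
   1-D, MAGIC2D_FLOAT/MAGIC2D_DOUBLE for 2-D). */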
#define SLEEF_MODE2_MT1D (1 << 0)
#define SLEEF_MODE3_MT2D (1 << 0)
#define PLANFILEID "SLEEFDFT0\n"
#define ENVVAR "SLEEFDFTPLAN"
#define SLEEF_MODE_MEASUREBITS (3 << 20)
void freeTables(SleefDFT *p);
uint32_t ilog2(uint32_t q);
//int PlanManager_loadMeasurementResultsB(SleefDFT *p);
//void PlanManager_saveMeasurementResultsB(SleefDFT *p, int butStat);
int PlanManager_loadMeasurementResultsT(SleefDFT *p);
void PlanManager_saveMeasurementResultsT(SleefDFT *p);
int PlanManager_loadMeasurementResultsP(SleefDFT *p, int pathCat);
void PlanManager_saveMeasurementResultsP(SleefDFT *p, int pathCat);
#define GETINT_VECWIDTH 100
#define GETINT_DFTPRIORITY 101

View File

@@ -0,0 +1,193 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#ifndef ENABLE_STREAM
#error ENABLE_STREAM not defined
#endif
int main(int argc, char **argv) {
if (argc < 3) {
fprintf(stderr, "Usage : %s <basetype> <unrollmax> <unrollmax2> <maxbutwidth> <isa> ...\n", argv[0]);
exit(-1);
}
const char *basetype = argv[1];
const int maxbutwidth = atoi(argv[2]);
const int isastart = 3;
const int isamax = argc - isastart;
#if ENABLE_STREAM == 1
const int enable_stream = 1;
#else
const int enable_stream = 0;
#endif
printf("#define MAXBUTWIDTH %d\n", maxbutwidth);
printf("\n");
if (strcmp(basetype, "paramonly") == 0) exit(0);
printf("#define ISAMAX %d\n", isamax);
printf("#define CONFIGMAX 4\n");
for(int k=isastart;k<argc;k++) {
for(int config=0;config<4;config++) {
#if ENABLE_STREAM == 0
if ((config & 1) != 0) continue;
#endif
for(int j=1;j<=maxbutwidth;j++) {
printf("void dft%df_%d_%s(real *, const real *, const int);\n", 1 << j, config, argv[k]);
printf("void dft%db_%d_%s(real *, const real *, const int);\n", 1 << j, config, argv[k]);
printf("void tbut%df_%d_%s(real *, uint32_t *, const real *, const int, const real *, const int);\n", 1 << j, config, argv[k]);
printf("void tbut%db_%d_%s(real *, uint32_t *, const real *, const int, const real *, const int);\n", 1 << j, config, argv[k]);
printf("void but%df_%d_%s(real *, uint32_t *, const int, const real *, const int, const real *, const int);\n", 1 << j, config, argv[k]);
printf("void but%db_%d_%s(real *, uint32_t *, const int, const real *, const int, const real *, const int);\n", 1 << j, config, argv[k]);
}
}
printf("void realSub0_%s(real *, const real *, const int, const real *, const real *);\n", argv[k]);
printf("void realSub1_%s(real *, const real *, const int, const real *, const real *, const int);\n", argv[k]);
printf("int getInt_%s(int);\n", argv[k]);
printf("const void *getPtr_%s(int);\n", argv[k]);
}
printf("\n");
printf("void (*dftf_%s[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int) = {\n", basetype);
for(int config=0;config<4;config++) {
printf(" {\n");
for(int k=isastart;k<argc;k++) {
printf(" {NULL, ");
for(int i=1;i<=maxbutwidth;i++) {
if (enable_stream || (config & 1) == 0) {
printf("dft%df_%d_%s, ", 1 << i, config, argv[k]);
} else {
printf("NULL, ");
}
}
printf("},\n");
}
printf("},\n");
}
printf("};\n\n");
printf("void (*dftb_%s[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int) = {\n", basetype);
for(int config=0;config<4;config++) {
printf(" {\n");
for(int k=isastart;k<argc;k++) {
printf(" {NULL, ");
for(int i=1;i<=maxbutwidth;i++) {
if (enable_stream || (config & 1) == 0) {
if (i == 1) {
// a 2-point butterfly is its own inverse, so the forward kernel doubles as the backward one
printf("dft%df_%d_%s, ", 1 << i, config, argv[k]);
} else {
printf("dft%db_%d_%s, ", 1 << i, config, argv[k]);
}
} else {
printf("NULL, ");
}
}
printf("},\n");
}
printf("},\n");
}
printf("};\n\n");
printf("void (*tbutf_%s[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int) = {\n", basetype);
for(int config=0;config<4;config++) {
printf(" {\n");
for(int k=isastart;k<argc;k++) {
printf(" {NULL, ");
for(int i=1;i<=maxbutwidth;i++) {
if (enable_stream || (config & 1) == 0) {
printf("tbut%df_%d_%s, ", 1 << i, config, argv[k]);
} else {
printf("NULL, ");
}
}
printf("},\n");
}
printf("},\n");
}
printf("};\n\n");
printf("void (*tbutb_%s[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int) = {\n", basetype);
for(int config=0;config<4;config++) {
printf(" {\n");
for(int k=isastart;k<argc;k++) {
printf(" {NULL, ");
for(int i=1;i<=maxbutwidth;i++) {
if (enable_stream || (config & 1) == 0) {
printf("tbut%db_%d_%s, ", 1 << i, config, argv[k]);
} else {
printf("NULL, ");
}
}
printf("},\n");
}
printf("},\n");
}
printf("};\n\n");
printf("void (*butf_%s[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const int, const real *, const int, const real *, const int) = {\n", basetype);
for(int config=0;config<4;config++) {
printf(" {\n");
for(int k=isastart;k<argc;k++) {
printf(" {NULL, ");
for(int i=1;i<=maxbutwidth;i++) {
if (enable_stream || (config & 1) == 0) {
printf("but%df_%d_%s, ", 1 << i, config, argv[k]);
} else {
printf("NULL, ");
}
}
printf("},\n");
}
printf("},\n");
}
printf("};\n\n");
printf("void (*butb_%s[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const int, const real *, const int, const real *, const int) = {\n", basetype);
for(int config=0;config<4;config++) {
printf(" {\n");
for(int k=isastart;k<argc;k++) {
printf(" {NULL, ");
for(int i=1;i<=maxbutwidth;i++) {
if (enable_stream || (config & 1) == 0) {
printf("but%db_%d_%s, ", 1 << i, config, argv[k]);
} else {
printf("NULL, ");
}
}
printf("},\n");
}
printf("},\n");
}
printf("};\n\n");
//
printf("void (*realSub0_%s[ISAMAX])(real *, const real *, const int, const real *, const real *) = {\n ", basetype);
for(int k=isastart;k<argc;k++) printf("realSub0_%s, ", argv[k]);
printf("\n};\n\n");
printf("void (*realSub1_%s[ISAMAX])(real *, const real *, const int, const real *, const real *, const int) = {\n ", basetype);
for(int k=isastart;k<argc;k++) printf("realSub1_%s, ", argv[k]);
printf("\n};\n\n");
printf("int (*getInt_%s[16])(int) = {\n ", basetype);
for(int k=isastart;k<argc;k++) printf("getInt_%s, ", argv[k]);
for(int k=0;k<16-(argc-isastart);k++) printf("NULL, ");
printf("\n};\n\n");
printf("const void *(*getPtr_%s[16])(int) = {\n ", basetype);
for(int k=isastart;k<argc;k++) printf("getPtr_%s, ", argv[k]);
for(int k=0;k<16-(argc-isastart);k++) printf("NULL, ");
printf("\n};\n\n");
}
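/* For illustration (hypothetical invocation, not part of the build): running
   "mkdisp double 5 sse2 avx2" emits prototypes such as
     void dft2f_0_sse2(real *, const real *, const int);
   followed by the dispatch tables dftf_double, dftb_double, tbutf_double,
   tbutb_double, butf_double and butb_double, each indexed by
   [config][isa][butterfly width], with NULL entries where a configuration
   (e.g. streaming stores with ENABLE_STREAM == 0) is unavailable. */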

View File

@@ -0,0 +1,104 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#define CONFIGMAX 4
char *replaceAll(const char *in, const char *pat, const char *replace) {
const int replaceLen = (int)strlen(replace);
const int patLen = (int)strlen(pat);
char *str = malloc(strlen(in)+1);
strcpy(str, in);
for(;;) {
char *p = strstr(str, pat);
if (p == NULL) return str;
int replace_pos = (int)(p - str);
int tail_len = (int)strlen(p + patLen);
char *newstr = malloc(strlen(str) + (replaceLen - patLen) + 1);
memcpy(newstr, str, replace_pos);
memcpy(newstr + replace_pos, replace, replaceLen);
memcpy(newstr + replace_pos + replaceLen, str + replace_pos + patLen, tail_len+1);
free(str);
str = newstr;
}
return str;
}
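/* Example: replaceAll("store(a); store(b);", "store(", "stream(") returns a
   freshly malloc'ed "stream(a); stream(b);"; the caller owns the result and
   must free it. */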
#define LEN 1024
char line[LEN+10];
int main(int argc, char **argv) {
if (argc < 2) {
fprintf(stderr, "Usage : %s <Base type> <ISA> ...\n", argv[0]);
exit(-1);
}
const char *baseType = argv[1];
const int isastart = 2;
for(int config=0;config<CONFIGMAX;config++) {
#if ENABLE_STREAM == 0
if ((config & 1) != 0) continue;
#endif
for(int isa=isastart;isa<argc;isa++) {
char *isaString = argv[isa];
char configString[100];
sprintf(configString, "%d", config);
FILE *fpin = fopen("unroll0.org", "r");
if (fpin == NULL) { fprintf(stderr, "Cannot open unroll0.org\n"); exit(-1); }
sprintf(line, "unroll_%d_%s.c", config, isaString);
FILE *fpout = fopen(line, "w");
if (fpout == NULL) { fprintf(stderr, "Cannot open %s\n", line); exit(-1); }
fputs("#include \"vectortype.h\"\n\n", fpout);
fprintf(fpout, "extern %s ctbl_%s[];\n", baseType, baseType);
fprintf(fpout, "#define ctbl ctbl_%s\n\n", baseType);
for(;;) {
if (fgets(line, LEN, fpin) == NULL) break;
char *s;
if ((config & 1) == 0) {
char *s0 = replaceAll(line, "%ISA%", isaString);
s = replaceAll(s0, "%CONFIG%", configString);
free(s0);
} else {
char *s0 = replaceAll(line, "%ISA%", isaString);
char *s1 = replaceAll(s0, "%CONFIG%", configString);
char *s2 = replaceAll(s1, "store(", "stream(");
s = replaceAll(s2, "scatter(", "scstream(");
free(s0); free(s1); free(s2);
}
if ((config & 2) == 0) {
char *s0 = replaceAll(s, "#pragma", "//");
free(s);
s = s0;
}
if (config == 0) {
char *s0 = replaceAll(s, "#undef EMITREALSUB", "#define EMITREALSUB");
free(s);
s = s0;
}
fputs(s, fpout);
free(s);
}
fclose(fpin);
fclose(fpout);
}
}
}
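/* In short: for every (config, ISA) pair the tool reads the template
   unroll0.org, substitutes %ISA% and %CONFIG%, and writes
   unroll_<config>_<ISA>.c. For odd configs (CONFIG_STREAM set) plain
   store()/scatter() calls become the streaming stream()/scstream() variants,
   configs without the CONFIG_MT bit get their OpenMP #pragma lines commented
   out, and the realSub kernels are emitted only for config 0. */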

File diff suppressed because it is too large

View File

@@ -0,0 +1,145 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#ifndef __VECTORTYPE_H__
#define __VECTORTYPE_H__
#include <math.h>
#include "sleef.h"
#ifdef ENABLE_SSE2
#include "helpersse2.h"
#endif
#ifdef ENABLE_AVX
#include "helperavx.h"
#endif
#ifdef ENABLE_AVX2
#include "helperavx2.h"
#endif
#ifdef ENABLE_AVX512F
#include "helperavx512f.h"
#endif
#ifdef ENABLE_NEON32
#include "helperneon32.h"
#endif
#ifdef ENABLE_ADVSIMD
#include "helperadvsimd.h"
#endif
#ifdef ENABLE_SVE
#include "helpersve.h"
#endif
#if defined(ENABLE_RVVM1) || defined(ENABLE_RVVM2)
#include "helperrvv.h"
#endif
#ifdef ENABLE_VSX
#include "helperpower_128.h"
#endif
#ifdef ENABLE_VSX3
#include "helperpower_128.h"
#endif
#ifdef ENABLE_VXE
#include "helpers390x_128.h"
#endif
#ifdef ENABLE_VXE2
#include "helpers390x_128.h"
#endif
#ifdef ENABLE_VECEXT
#include "helpervecext.h"
#endif
#ifdef ENABLE_PUREC
#include "helperpurec.h"
#endif
#define IMPORT_IS_EXPORT
#include "sleefdft.h"
#if BASETYPEID == 1
#define LOG2VECWIDTH (LOG2VECTLENDP-1)
#define VECWIDTH (1 << LOG2VECWIDTH)
typedef double real;
typedef vdouble real2;
static int available(int name) { return vavailability_i(name); }
static INLINE real2 uminus(real2 d0) { return vneg_vd_vd(d0); }
static INLINE real2 uplusminus(real2 d0) { return vposneg_vd_vd(d0); }
static INLINE real2 uminusplus(real2 d0) { return vnegpos_vd_vd(d0); }
static INLINE real2 plus(real2 d0, real2 d1) { return vadd_vd_vd_vd(d0, d1); }
static INLINE real2 minus(real2 d0, real2 d1) { return vsub_vd_vd_vd(d0, d1); }
static INLINE real2 minusplus(real2 d0, real2 d1) { return vsubadd_vd_vd_vd(d0, d1); }
static INLINE real2 times(real2 d0, real2 d1) { return vmul_vd_vd_vd(d0, d1); }
static INLINE real2 timesminusplus(real2 d0, real2 d2, real2 d1) { return vmlsubadd_vd_vd_vd_vd(d0, d2, d1); }
static INLINE real2 ctimes(real2 d0, real d) { return vmul_vd_vd_vd(d0, vcast_vd_d(d)); }
static INLINE real2 ctimesminusplus(real2 d0, real c, real2 d1) { return vmlsubadd_vd_vd_vd_vd(d0, vcast_vd_d(c), d1); }
static INLINE real2 reverse(real2 d0) { return vrev21_vd_vd(d0); }
static INLINE real2 reverse2(real2 d0) { return vreva2_vd_vd(d0); }
static INLINE real2 loadc(real c) { return vcast_vd_d(c); }
static INLINE real2 load(const real *ptr, int offset) { return vload_vd_p(&ptr[2*offset]); }
static INLINE real2 loadu(const real *ptr, int offset) { return vloadu_vd_p(&ptr[2*offset]); }
static INLINE void store(real *ptr, int offset, real2 v) { vstore_v_p_vd(&ptr[2*offset], v); }
static INLINE void storeu(real *ptr, int offset, real2 v) { vstoreu_v_p_vd(&ptr[2*offset], v); }
static INLINE void stream(real *ptr, int offset, real2 v) { vstream_v_p_vd(&ptr[2*offset], v); }
static INLINE void scatter(real *ptr, int offset, int step, real2 v) { vscatter2_v_p_i_i_vd(ptr, offset, step, v); }
static INLINE void scstream(real *ptr, int offset, int step, real2 v) { vsscatter2_v_p_i_i_vd(ptr, offset, step, v); }
static INLINE void prefetch(real *ptr, int offset) { vprefetch_v_p(&ptr[2*offset]); }
#elif BASETYPEID == 2
#define LOG2VECWIDTH (LOG2VECTLENSP-1)
#define VECWIDTH (1 << LOG2VECWIDTH)
typedef float real;
typedef vfloat real2;
static int available(int name) { return vavailability_i(name); }
static INLINE real2 uminus(real2 d0) { return vneg_vf_vf(d0); }
static INLINE real2 uplusminus(real2 d0) { return vposneg_vf_vf(d0); }
static INLINE real2 uminusplus(real2 d0) { return vnegpos_vf_vf(d0); }
static INLINE real2 plus(real2 d0, real2 d1) { return vadd_vf_vf_vf(d0, d1); }
static INLINE real2 minus(real2 d0, real2 d1) { return vsub_vf_vf_vf(d0, d1); }
static INLINE real2 minusplus(real2 d0, real2 d1) { return vsubadd_vf_vf_vf(d0, d1); }
static INLINE real2 times(real2 d0, real2 d1) { return vmul_vf_vf_vf(d0, d1); }
static INLINE real2 ctimes(real2 d0, real d) { return vmul_vf_vf_vf(d0, vcast_vf_f(d)); }
static INLINE real2 timesminusplus(real2 d0, real2 d2, real2 d1) { return vmlsubadd_vf_vf_vf_vf(d0, d2, d1); }
static INLINE real2 ctimesminusplus(real2 d0, real c, real2 d1) { return vmlsubadd_vf_vf_vf_vf(d0, vcast_vf_f(c), d1); }
static INLINE real2 reverse(real2 d0) { return vrev21_vf_vf(d0); }
static INLINE real2 reverse2(real2 d0) { return vreva2_vf_vf(d0); }
static INLINE real2 loadc(real c) { return vcast_vf_f(c); }
static INLINE real2 load(const real *ptr, int offset) { return vload_vf_p(&ptr[2*offset]); }
static INLINE real2 loadu(const real *ptr, int offset) { return vloadu_vf_p(&ptr[2*offset]); }
static INLINE void store(real *ptr, int offset, real2 v) { vstore_v_p_vf(&ptr[2*offset], v); }
static INLINE void storeu(real *ptr, int offset, real2 v) { vstoreu_v_p_vf(&ptr[2*offset], v); }
static INLINE void stream(real *ptr, int offset, real2 v) { vstream_v_p_vf(&ptr[2*offset], v); }
static INLINE void scatter(real *ptr, int offset, int step, real2 v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
static INLINE void scstream(real *ptr, int offset, int step, real2 v) { vsscatter2_v_p_i_i_vf(ptr, offset, step, v); }
static INLINE void prefetch(real *ptr, int offset) { vprefetch_v_p(&ptr[2*offset]); }
#else
#error No BASETYPEID specified
#endif
#endif
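/* Illustrative use of the wrapper layer above (a sketch, not code from the
   library): with BASETYPEID == 1,
     real2 a = load(x, 0);
     store(y, 0, plus(a, a));
   loads one vector of packed complex doubles from x and stores its doubled
   value to y, compiled for whichever ISA the ENABLE_* macro selected. */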

View File

@@ -0,0 +1,16 @@
.PHONY: all
all : gencoef mkrempitab mkrempitabqp
gencoef : gencoef.c simplexfr.c sp.h dp.h ld.h qp.h
gcc -O gencoef.c simplexfr.c -o gencoef -lmpfr -lm
mkrempitab : mkrempitab.c
gcc -O mkrempitab.c -o mkrempitab -lmpfr
mkrempitabqp : mkrempitabqp.c
gcc -O mkrempitabqp.c -o mkrempitabqp -lmpfr
.PHONY: clean
clean :
rm -f gencoef gencoefdp gencoefld mkrempitab mkrempitabqp a.out *~
rm -f *.obj *.lib *.dll *.exp *.exe

View File

@@ -0,0 +1,196 @@
// This is part of SLEEF, written by Naoki
// Shibata. http://shibatch.sourceforge.net
// The code in this file is distributed under the Creative Commons
// Attribution 4.0 International License.
#define PREC_TARGET 53
#if 0
#define N 8 // Degree of equation
#define S 40 // Number of samples for phase 1
#define L 4 // Number of high precision coefficients
#define MIN 0.0 // Min argument
#define MAX (M_PI/4) // Max argument
#define PMUL 2 // The form of polynomial is y = x^(PADD+PMUL*0) + x^(PADD+PMUL*1) + ...
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_sin(ret, a, GMP_RNDN); } // The function to approximate
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0 // Fix coef 0 to 1.0
#endif
#if 0
#define N 10
#define S 40
#define L 2
#define MIN 0.0
#define MAX (M_PI/4)
void TARGET(mpfr_t ret, mpfr_t a) { // cos(x) - 1
mpfr_t x;
mpfr_init(x);
mpfr_cos(ret, a, GMP_RNDN);
mpfr_set_ld(x, 1, GMP_RNDN);
mpfr_sub(ret, ret, x, GMP_RNDN);
mpfr_clear(x);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define PMUL 2
#define PADD 2
#define FIXCOEF0 (-0.5)
#endif
#if 0 // for xsincospi4_u05
#define S 40
#define N 8
#define L 2
#define MIN 0.0
#define MAX 1.0
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) {
mpfr_t x, y;
mpfr_inits(x, y, NULL);
mpfr_const_pi(x, GMP_RNDN);
mpfr_set_d(y, 1.0/4, GMP_RNDN);
mpfr_mul(x, x, y, GMP_RNDN);
mpfr_mul(x, x, a, GMP_RNDN);
mpfr_sin(ret, x, GMP_RNDN);
mpfr_clears(x, y, NULL);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#endif
#if 0 // for xsincospi4_u05
#define N 8
#define S 40
#define L 2
#define MIN 0.0
#define MAX 1.0
void TARGET(mpfr_t ret, mpfr_t a) {
mpfr_t x, y;
mpfr_inits(x, y, NULL);
mpfr_const_pi(x, GMP_RNDN);
mpfr_set_d(y, 1.0/4, GMP_RNDN);
mpfr_mul(x, x, y, GMP_RNDN);
mpfr_mul(x, x, a, GMP_RNDN);
mpfr_cos(ret, x, GMP_RNDN);
mpfr_set_ld(x, 1, GMP_RNDN);
mpfr_sub(ret, ret, x, GMP_RNDN);
mpfr_clears(x, y, NULL);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define PMUL 2
#define PADD 2
#endif
#if 0 // for xsincospi4
#define N 7
#define S 40
#define L 0
#define MIN 0.0
#define MAX 1.0
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) {
mpfr_t x, y;
mpfr_inits(x, y, NULL);
mpfr_const_pi(x, GMP_RNDN);
mpfr_set_d(y, 1.0/4, GMP_RNDN);
mpfr_mul(x, x, y, GMP_RNDN);
mpfr_mul(x, x, a, GMP_RNDN);
mpfr_sin(ret, x, GMP_RNDN);
mpfr_clears(x, y, NULL);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#endif
#if 0
#define N 17
#define S 60
#define L 0
#define MIN 0.0
#define MAX (M_PI/4)
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_tan(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0
#endif
#if 0
#define N 11
#define S 35
#define L 2
#define MIN 1 //0.75
#define MAX 1.5
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_log(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t frd, mpfr_t fra) {
mpfr_t tmp, one;
mpfr_inits(tmp, one, NULL);
mpfr_set_d(one, 1, GMP_RNDN);
mpfr_add(tmp, fra, one, GMP_RNDN);
mpfr_sub(frd, fra, one, GMP_RNDN);
mpfr_div(frd, frd, tmp, GMP_RNDN);
mpfr_clears(tmp, one, NULL);
}
#define FIXCOEF0 2.0
#endif
#if 1
#define N 12
#define S 50
#define L 2
#define MIN -0.347
#define MAX 0.347 // 0.5 log 2
#define PMUL 1
#define PADD 0
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_exp(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0
#define FIXCOEF1 1.0
//#define FIXCOEF2 0.5
#endif
#if 0
#define N 21
#define S 100
#define L 1
#define P 1.1
#define MIN 0.0
#define MAX 1.0
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_atan(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0
#endif
#if 0
#define N 20
#define S 100
#define L 0
#define P 1.54
#define MIN 0.0
#define MAX 0.708
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_asin(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0
#endif

View File

@@ -0,0 +1,375 @@
// This is part of SLEEF, written by Naoki Shibata. http://shibatch.sourceforge.net
// Since the original code for simplex algorithm is developed by Haruhiko Okumura and
// the code is distributed under the Creative Commons Attribution 4.0 International License,
// the contents under this directory are also distributed under the same license.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
#include <float.h>
#include <time.h>
#include <mpfr.h>
//#include "sp.h"
#include "dp.h"
//#include "ld.h"
//#include "qp.h"
#undef VERBOSE
#define PREC 4096
#define EPS 1e-50
#define PREC2 (PREC_TARGET*4)
#ifndef P
#define P 1
#endif
#ifndef Q
#define Q 10000
#endif
void mpfr_zinit(mpfr_t m);
void regressMinRelError_fr(int n, int m, mpfr_t **x, mpfr_t *result);
char *mpfrToStr(mpfr_t m) {
mpfr_t fra;
mpfr_init2(fra, mpfr_get_prec(m));
mpfr_abs(fra, m, GMP_RNDN);
mpfr_exp_t e;
char *s = mpfr_get_str(NULL, &e, 10, 0, fra, GMP_RNDN);
char *ret = malloc(strlen(s) + 20);
if (mpfr_sgn(m) == -1) ret[0] = '-'; else ret[0] = '+';
ret[1] = '0';
ret[2] = '.';
strcpy(&ret[3], s);
mpfr_free_str(s);
char estr[10];
sprintf(estr, "e%+d", (int)e);
strcat(ret, estr);
mpfr_clears(fra, NULL);
return ret;
}
double countULP(mpfr_t d, mpfr_t c) {
mpfr_t fry, frw;
mpfr_inits(fry, frw, NULL);
double c2 = mpfr_get_d(c, GMP_RNDN);
if (c2 == 0 && mpfr_cmp_d(d, 0) != 0) { mpfr_clears(fry, frw, NULL); return 10000; }
long e;
mpfr_get_d_2exp(&e, c, GMP_RNDN);
mpfr_set_ui_2exp(frw, 1, e-PREC_TARGET, GMP_RNDN);
mpfr_sub(fry, d, c, GMP_RNDN);
mpfr_div(fry, fry, frw, GMP_RNDN);
double u = fabs(mpfr_get_d(fry, GMP_RNDN));
mpfr_clears(fry, frw, NULL);
return u;
}
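/* countULP above measures |d - c| in units of the last place of c at
   PREC_TARGET bits: frw is set to 2^(e - PREC_TARGET), one ULP of c, and the
   scaled difference is returned. With PREC_TARGET == 53 (dp.h), two adjacent
   doubles near 1.0 are reported as exactly 1 ULP apart. */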
void func(mpfr_t s, mpfr_t x, mpfr_t *coef, int n) {
mpfr_set_prec(s, PREC_TARGET);
mpfr_set(s, coef[n-1], GMP_RNDN);
for(int i=n-1;i>0;i--) {
if (i == L-1) {
mpfr_t t;
mpfr_init2(t, PREC2);
mpfr_set(t, s, GMP_RNDN);
mpfr_set_prec(s, PREC2);
mpfr_set(s, t, GMP_RNDN);
mpfr_clear(t);
}
mpfr_mul(s, s, x, GMP_RNDN);
mpfr_add(s, s, coef[i-1], GMP_RNDN);
}
}
int main(int argc, char **argv)
{
int i, j;
int n, m;
double p;
mpfr_set_default_prec(PREC);
#if 0
{
mpfr_t a, b;
mpfr_inits(a, b, NULL);
float x = M_PI;
mpfr_set_d(a, x, GMP_RNDN);
x = nexttowardf(x, 100);
x = nexttowardf(x, 100);
x = nexttowardf(x, 100);
mpfr_set_d(b, x, GMP_RNDN);
printf("%g\n", countULP(b, a));
mpfr_clears(a, b, NULL);
exit(0);
}
#endif
#if 0
{
mpfr_t a, b;
mpfr_inits(a, b, NULL);
double x = M_PI;
mpfr_set_d(a, x, GMP_RNDN);
x = nexttoward(x, 100);
x = nexttoward(x, 100);
x = nexttoward(x, 100);
mpfr_set_d(b, x, GMP_RNDN);
printf("%g\n", countULP(b, a));
mpfr_clears(a, b, NULL);
exit(0);
}
#endif
#if 0
{
mpfr_t a, b;
mpfr_inits(a, b, NULL);
long double x = M_PI;
mpfr_set_ld(a, x, GMP_RNDN);
x = nexttowardl(x, 100);
x = nexttowardl(x, 100);
x = nexttowardl(x, 100);
mpfr_set_ld(b, x, GMP_RNDN);
printf("%g\n", countULP(b, a));
mpfr_clears(a, b, NULL);
exit(0);
}
#endif
#if 0
{
mpfr_t a, b;
mpfr_inits(a, b, NULL);
__float128 x = M_PI;
mpfr_set_f128(a, x, GMP_RNDN);
x = nextafterq(x, 100);
x = nextafterq(x, 100);
x = nextafterq(x, 100);
mpfr_set_f128(b, x, GMP_RNDN);
printf("%g\n", countULP(b, a));
mpfr_clears(a, b, NULL);
exit(0);
}
#endif
m = N+1;
n = argc >= 2 ? atoi(argv[1]) : S;
p = argc >= 3 ? atof(argv[2]) : P;
mpfr_t **x, *result; // x[m][n], result[m]
x = calloc(sizeof(mpfr_t *), m);
result = calloc(sizeof(mpfr_t), m);
for(i=0;i<m;i++) {
x[i] = calloc(sizeof(mpfr_t), n);
for(j=0;j<n;j++) mpfr_zinit(x[i][j]);
mpfr_zinit(result[i]);
}
mpfr_t fra, frb, frc, frd, fre;
mpfr_zinit(fra);
mpfr_zinit(frb);
mpfr_zinit(frc);
mpfr_zinit(frd);
mpfr_zinit(fre);
for(i=0;i<n;i++) {
double b = 1.0 - pow((double)i / (n-1), p);
double a = ((double)MAX - MIN) * b + MIN;
mpfr_set_d(fra, a, GMP_RNDN);
CFUNC(frd, fra);
for(j=0;j<m-1;j++) {
mpfr_set_d(frb, (double)j*PMUL+PADD, GMP_RNDN);
mpfr_pow(x[j][i], frd, frb, GMP_RNDN);
//printf("%g ", mpfr_get_d(x[j][i], GMP_RNDN));
}
TARGET(x[m-1][i], fra);
//printf(" : %g\n", mpfr_get_d(x[m-1][i], GMP_RNDN));
}
for(i=0;i<m-1;i++) mpfr_set_d(result[i], 0, GMP_RNDN);
regressMinRelError_fr(n, m-1, x, result);
for(i=m-2;i>=0;i--) {
mpfr_set_prec(fra, PREC_TARGET+4);
mpfr_set(fra, result[i], GMP_RNDN);
char *s;
printf("%s, \n", s = mpfrToStr(fra));
free(s);
}
printf("\n");
mpfr_set_prec(fra, PREC);
double emax = 0;
for(i=0;i<=n*10;i++) {
double a = i * (double)(MAX - MIN) / (n*10.0) + MIN;
mpfr_set_d(fra, a, GMP_RNDN);
CFUNC(frd, fra);
mpfr_set_d(frb, 0, GMP_RNDN);
for(j=m-1;j>=0;j--) {
mpfr_set_d(frc, (double)j*PMUL+PADD, GMP_RNDN);
mpfr_pow(frc, frd, frc, GMP_RNDN);
mpfr_mul(frc, frc, result[j], GMP_RNDN);
mpfr_add(frb, frb, frc, GMP_RNDN);
}
TARGET(frc, fra);
double u = countULP(frb, frc);
if (u > emax) emax = u;
}
printf("Phase 1 : Max error = %g ULP\n\n", emax);
fflush(stdout);
//
mpfr_t bestcoef[N], curcoef[N];
for(i=0;i<N;i++) {
mpfr_init2(bestcoef[i], i >= L ? PREC_TARGET : PREC2);
mpfr_set(bestcoef[i], result[i], GMP_RNDN);
mpfr_init2(curcoef[i], i >= L ? PREC_TARGET : PREC2);
mpfr_set(curcoef[i], result[i], GMP_RNDN);
}
srandom(time(NULL));
mpfr_set_default_prec(PREC2);
static mpfr_t a[Q], v[Q], am[Q], aa[Q];
for(i=0;i<Q;i++) {
mpfr_inits(a[i], v[i], am[i], aa[i], NULL);
mpfr_set_d(fra, ((double)MAX - (double)MIN) * i / (double)(Q-1) + (double)MIN, GMP_RNDN);
TARGET(v[i], fra);
CFUNC(a[i], fra);
mpfr_set_d(frb, PMUL, GMP_RNDN);
mpfr_pow(am[i], a[i], frb, GMP_RNDN);
mpfr_set_d(frb, PADD, GMP_RNDN);
mpfr_pow(aa[i], a[i], frb, GMP_RNDN);
}
double best = 1e+100, bestsum = 1e+100, bestworstx;
for(int k=0;k<10000;k++) {
double emax = 0, esum = 0, worstx = 0;
#ifdef FIXCOEF0
mpfr_set_d(curcoef[0], FIXCOEF0, GMP_RNDN);
#endif
#ifdef FIXCOEF1
mpfr_set_d(curcoef[1], FIXCOEF1, GMP_RNDN);
#endif
#ifdef FIXCOEF2
mpfr_set_d(curcoef[2], FIXCOEF2, GMP_RNDN);
#endif
for(i=0;i<Q;i++) {
if (mpfr_cmp_d(v[i], 0) == 0) continue;
mpfr_set_d(frb, 0, GMP_RNDN);
for(j=N-1;j>=0;j--) {
mpfr_set_d(frc, (double)j*PMUL+PADD, GMP_RNDN);
mpfr_pow(frc, a[i], frc, GMP_RNDN);
mpfr_mul(frc, frc, curcoef[j], GMP_RNDN);
mpfr_add(frb, frb, frc, GMP_RNDN);
}
double e = countULP(frb, v[i]);
//printf("c = %.20g, t = %.20g, ulp = %g\n", mpfr_get_d(v[i], GMP_RNDN), mpfr_get_d(frb, GMP_RNDN), e);
if (!isfinite(e)) continue;
if (e > emax) { emax = e; worstx = mpfr_get_d(a[i], GMP_RNDN); }
esum += e;
}
mpfr_set_prec(frb, PREC);
//printf("emax = %g\n", emax);
if (emax < best || (emax == best && esum < bestsum)) {
for(i=0;i<N;i++) {
mpfr_set(bestcoef[i], curcoef[i], GMP_RNDN);
}
if (best == 1e+100 || k > 10) printf("Max error = %g ULP, Sum error = %g (Max error at %g)\n", emax, esum, worstx);
if ((best - emax) / best > 0.0001) k = 0;
best = emax;
bestsum = esum;
bestworstx = worstx;
}
for(i=0;i<N;i++) {
mpfr_set(curcoef[i], bestcoef[i], GMP_RNDN);
}
for(i=0;i<N;i++) {
static int tab[] = {0, 0, 0, 0, 0, 0, 1, -1};
//static int tab[] = {0, 0, 0, 0, 2, -2, 1, -1};
int r = tab[random() & 7];
if (r > 0) {
for(int j=0;j<r;j++) mpfr_nextabove(curcoef[i]);
} else if (r < 0) {
for(int j=0;j>r;j--) mpfr_nextbelow(curcoef[i]);
}
}
}
printf("\n");
for(i=N-1;i>=0;i--) {
mpfr_set_prec(fra, i >= L ? PREC_TARGET+4 : PREC2);
mpfr_set(fra, bestcoef[i], GMP_RNDN);
char *s;
printf("%s, \n", s = mpfrToStr(fra));
free(s);
}
printf("\nPhase 2 : max error = %g ULP at %g\n", best, bestworstx);
exit(0);
}

View File

@@ -0,0 +1,43 @@
With this small tool, the coefficients for polynomial approximation
used in kernels can be generated.
Usage
Edit the parameter header for the precision you need: dp.h for double
precision, sp.h for single precision, ld.h for long double and qp.h
for quad precision (gencoef.c includes the chosen header). At the
beginning of each header, specifications of the parameters for
generating coefficients are listed. Enable one of them by changing
#if. Then run make to compile the source code. Run gencoef, and it
will show the generated coefficients in a few minutes.
How it works
The program runs in two phases.
The first phase is a regression that minimizes the maximum relative
error. This problem can be reduced to a linear programming problem,
which this implementation solves with the Simplex method. This
requires multi-precision arithmetic, for which the implementation
uses the MPFR library. In this phase, only a small number of sample
values of the function to approximate (specified by the S macro,
usually 40 or so) are taken within the argument range. The function
to approximate is given by the TARGET function. Specifying a higher
value for S does not always give better results.
The second phase optimizes the coefficients so that the polynomial
gives good accuracy in double precision arithmetic. In this phase,
the program checks points within the specified argument range, whose
number is given by the Q macro (10000 by default), to see whether
the polynomial gives a good error bound. In some cases, the last few
terms have to be calculated in higher precision in order to achieve
1 ULP overall accuracy, and this implementation can take care of
that. The L parameter specifies the number of high precision
coefficients.
In some cases, it is desirable to fix the last few coefficients to
values like 1. This can be done by defining the FIXCOEF0 macro. This
sometimes does not work, however; in that case, you need to adjust
the definition of the function to approximate, as shown in the
definition for cos.
Finding a set of good parameters is not a straightforward process. You
usually need many iterations of trial and error.
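As a sketch of the phase-1 formulation (notation ours, not taken from
the source): with sample points x_i and target function f, the linear
program seeks coefficients minimizing the worst relative error over
the samples,

  \min_{c_0,\dots,c_{N-1}} \max_i \left| \frac{P(x_i) - f(x_i)}{f(x_i)} \right|,
  \qquad P(x) = \sum_{j=0}^{N-1} c_j \, x^{\mathrm{PADD} + \mathrm{PMUL} \cdot j},

which becomes linear once each absolute value is replaced by a pair
of nonnegative slack variables, one pair per sample.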

View File

@@ -0,0 +1,178 @@
// This is part of SLEEF, written by Naoki
// Shibata. http://shibatch.sourceforge.net
// The code in this file is distributed under the Creative Commons
// Attribution 4.0 International License.
#define PREC_TARGET 64
#if 0
#define N 8 // Degree of equation
#define S 40 // Number of samples for phase 1
#define L 4 // Number of high precision coefficients
#define MIN 0.0 // Min argument
#define MAX (M_PI/4) // Max argument
#define PMUL 2 // The form of polynomial is y = x^(PADD+PMUL*0) + x^(PADD+PMUL*1) + ...
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_sin(ret, a, GMP_RNDN); } // The function to approximate
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0 // Fix coef 0 to 1.0
#endif
#if 0
#define N 10
#define S 40
#define L 2
#define MIN 0.0
#define MAX (M_PI/4)
void TARGET(mpfr_t ret, mpfr_t a) { // cos(x) - 1
mpfr_t x;
mpfr_init(x);
mpfr_cos(ret, a, GMP_RNDN);
mpfr_set_ld(x, 1, GMP_RNDN);
mpfr_sub(ret, ret, x, GMP_RNDN);
mpfr_clear(x);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define PMUL 2
#define PADD 2
#define FIXCOEF0 (-0.5)
#endif
#if 0 // for xsincospi4_u05
#define N 9
#define S 40
#define L 2
#define MIN 0.0
#define MAX 1.0
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) {
mpfr_t x, y;
mpfr_inits(x, y, NULL);
mpfr_const_pi(x, GMP_RNDN);
mpfr_set_d(y, 1.0/4, GMP_RNDN);
mpfr_mul(x, x, y, GMP_RNDN);
mpfr_mul(x, x, a, GMP_RNDN);
mpfr_sin(ret, x, GMP_RNDN);
mpfr_clears(x, y, NULL);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#endif
#if 0 // for xsincospi4_u05
#define N 9
#define S 40
#define L 2
#define MIN 0.0
#define MAX 1.0
void TARGET(mpfr_t ret, mpfr_t a) { // cos(x) - 1
mpfr_t x, y;
mpfr_inits(x, y, NULL);
mpfr_const_pi(x, GMP_RNDN);
mpfr_set_d(y, 1.0/4, GMP_RNDN);
mpfr_mul(x, x, y, GMP_RNDN);
mpfr_mul(x, x, a, GMP_RNDN);
mpfr_cos(ret, x, GMP_RNDN);
mpfr_set_ld(x, 1, GMP_RNDN);
mpfr_sub(ret, ret, x, GMP_RNDN);
mpfr_clears(x, y, NULL);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define PMUL 2
#define PADD 2
#endif
#if 0 // for xsincospi4
#define N 7
#define S 40
#define L 0
#define MIN 0.0
#define MAX 1.0
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) {
mpfr_t x, y;
mpfr_inits(x, y, NULL);
mpfr_const_pi(x, GMP_RNDN);
mpfr_set_d(y, 1.0/4, GMP_RNDN);
mpfr_mul(x, x, y, GMP_RNDN);
mpfr_mul(x, x, a, GMP_RNDN);
mpfr_sin(ret, x, GMP_RNDN);
mpfr_clears(x, y, NULL);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#endif
#if 0
#define N 17
#define S 40
#define L 0
#define MIN 0.0
#define MAX (M_PI/4)
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_tan(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0
#endif
#if 0
#define N 9
#define S 40
#define L 2
#define MIN 1 //0.75
#define MAX 1.5
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_log(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t frd, mpfr_t fra) {
mpfr_t tmp, one;
mpfr_inits(tmp, one, NULL);
mpfr_set_d(one, 1, GMP_RNDN);
mpfr_add(tmp, fra, one, GMP_RNDN);
mpfr_sub(frd, fra, one, GMP_RNDN);
mpfr_div(frd, frd, tmp, GMP_RNDN);
mpfr_clears(tmp, one, NULL);
}
#define FIXCOEF0 2.0
#endif
#if 0
#define N 12
#define S 50
#define L 0
#define MIN -0.347
#define MAX 0.347 // 0.5 log 2
#define PMUL 1
#define PADD 0
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_exp(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0
#define FIXCOEF1 1.0
#define FIXCOEF2 0.5
#endif
#if 0
#define N 22
#define S 100
#define L 2
#define MIN 0.0
#define MAX 1.0
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_atan(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0
#endif

View File

@@ -0,0 +1,121 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <mpfr.h>
static int64_t doubleToRawLongBits(double d) {
union {
double f;
int64_t i;
} tmp;
tmp.f = d;
return tmp.i;
}
static double longBitsToDouble(int64_t i) {
union {
double f;
int64_t i;
} tmp;
tmp.i = i;
return tmp.f;
}
static double removelsb(double d) {
return longBitsToDouble(doubleToRawLongBits(d) & 0xfffffffffffffffeLL);
}
static int32_t floatToRawIntBits(float d) {
union {
float f;
int32_t i;
} tmp;
tmp.f = d;
return tmp.i;
}
static float intBitsToFloat(int32_t i) {
union {
float f;
int32_t i;
} tmp;
tmp.i = i;
return tmp.f;
}
static float removelsbf(float x) {
return intBitsToFloat(0xfffffffc & floatToRawIntBits(x));
}
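/* A reading of the helpers above (our interpretation, not stated in the
   source): removelsb/removelsbf zero the low mantissa bit(s) of the leading
   table chunks, so rpi0 + rpi1 + rpi2 + rpi3 reconstructs the stored value
   exactly and multiply-and-subtract steps against each chunk remain exact,
   as needed for Payne-Hanek style argument reduction. */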
int main(int argc, char **argv) {
mpfr_set_default_prec(2048);
mpfr_t pi, rpi, xrpi, x, y, z, r;
mpfr_inits(pi, rpi, xrpi, x, y, z, r, NULL);
mpfr_const_pi(pi, GMP_RNDN);
mpfr_set_d(x, 0.5, GMP_RNDN);
mpfr_div(rpi, x, pi, GMP_RNDN);
printf("NOEXPORT ALIGNED(64) const double rempitabdp[] = {\n");
for(int i=55;i<1024;i++) {
int M = i > 700 ? -64 : 0;
int ex = i - 53;
if (ex < -52) ex = -52;
mpfr_set_d(x, ldexp(1, ex), GMP_RNDN);
mpfr_mul(y, x, rpi, GMP_RNDN);
mpfr_frac(xrpi, y, GMP_RNDN);
mpfr_div(xrpi, xrpi, x, GMP_RNDN);
mpfr_set_exp(xrpi, mpfr_get_exp(xrpi) - M);
mpfr_set(x, xrpi, GMP_RNDN);
double rpi0 = removelsb(mpfr_get_d(x, GMP_RNDN));
mpfr_set_d(y, rpi0, GMP_RNDN);
mpfr_sub(x, x, y, GMP_RNDN);
double rpi1 = removelsb(mpfr_get_d(x, GMP_RNDN));
mpfr_set_d(y, rpi1, GMP_RNDN);
mpfr_sub(x, x, y, GMP_RNDN);
double rpi2 = removelsb(mpfr_get_d(x, GMP_RNDN));
mpfr_set_d(y, rpi2, GMP_RNDN);
mpfr_sub(x, x, y, GMP_RNDN);
double rpi3 = mpfr_get_d(x, GMP_RNDN);
printf(" %.20g, %.20g, %.20g, %.20g,\n", rpi0, rpi1, rpi2, rpi3);
}
printf("};\n\n");
printf("NOEXPORT ALIGNED(64) const float rempitabsp[] = {\n");
for(int i=25;i<128;i++) {
int M = i > 90 ? -64 : 0;
int ex = i - 23;
mpfr_set_d(x, ldexp(1, ex), GMP_RNDN);
mpfr_mul(y, x, rpi, GMP_RNDN);
mpfr_frac(xrpi, y, GMP_RNDN);
mpfr_div(xrpi, xrpi, x, GMP_RNDN);
mpfr_set_exp(xrpi, mpfr_get_exp(xrpi) - M);
mpfr_set(x, xrpi, GMP_RNDN);
float rpi20 = removelsbf(mpfr_get_d(x, GMP_RNDN));
mpfr_set_d(y, rpi20, GMP_RNDN);
mpfr_sub(x, x, y, GMP_RNDN);
float rpi21 = removelsbf(mpfr_get_d(x, GMP_RNDN));
mpfr_set_d(y, rpi21, GMP_RNDN);
mpfr_sub(x, x, y, GMP_RNDN);
float rpi22 = removelsbf(mpfr_get_d(x, GMP_RNDN));
mpfr_set_d(y, rpi22, GMP_RNDN);
mpfr_sub(x, x, y, GMP_RNDN);
float rpi23 = mpfr_get_d(x, GMP_RNDN);
printf(" %.10g, %.10g, %.10g, %.10g,\n", rpi20, rpi21, rpi22, rpi23);
}
printf("};\n");
}
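/* A hedged summary of the generator above: for every possible input exponent
   ex, the loop stores frac(2^ex / (2*pi)) / 2^ex, i.e. 1/(2*pi) with its
   leading bits removed (the discarded integer part contributes only whole
   periods), split into four double chunks per table row. */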

View File

@@ -0,0 +1,63 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stdint.h>
#include <math.h>
#include <mpfr.h>
#include <quadmath.h>
#define N 8
#define B 8
#define NCOL (53-B)
#define NROW ((16385+(53-B)*N-106)/NCOL+1)
static double *rempitabqp = NULL;
void generateRempitabqp() {
rempitabqp = calloc(16385-106+(53-B)*(N+1), sizeof(double));
int orgprec = mpfr_get_default_prec();
mpfr_set_default_prec(18000);
mpfr_t pi, m, n, o;
mpfr_inits(pi, m, n, o, NULL);
mpfr_const_pi(pi, GMP_RNDN);
mpfr_d_div(n, 0.5, pi, GMP_RNDN);
for(int e=106;e<16385+(53-B)*N;e++) {
mpfr_set(m, n, GMP_RNDN);
mpfr_set_ui_2exp(o, 1, -(113 - e), GMP_RNDN);
mpfr_mul(m, m, o, GMP_RNDN);
mpfr_frac(m, m, GMP_RNDN);
mpfr_set_ui_2exp(o, 1, (53-B), GMP_RNDN);
mpfr_mul(m, m, o, GMP_RNDN);
mpfr_trunc(m, m);
mpfr_set_ui_2exp(o, 1, 7-(53-B), GMP_RNDN);
mpfr_mul(m, m, o, GMP_RNDN);
int col = (e - 106) % NCOL;
int row = (e - 106) / NCOL;
rempitabqp[col * NROW + row] = mpfr_get_d(m, GMP_RNDN);
}
mpfr_clears(pi, m, n, o, NULL);
mpfr_set_default_prec(orgprec);
}
int main(int argc, char **argv) {
generateRempitabqp();
printf("NOEXPORT const double Sleef_rempitabqp[] = {\n ");
for(int i=0;i<16385-106+(53-B)*(N+1);i++) {
printf("%.20g, ", rempitabqp[i]);
if ((i & 3) == 3) printf("\n ");
}
printf("\n};\n");
}

View File

@@ -0,0 +1,161 @@
// This is part of SLEEF, written by Naoki
// Shibata. http://shibatch.sourceforge.net
// The code in this file is distributed under the Creative Commons
// Attribution 4.0 International License.
#define PREC_TARGET 113
//
#if 0
#define N 15 // Degree of equation
#define S 150 // Number of samples for phase 1
#define L 0 // Number of high precision coefficients
#define P 0.37
#define MIN 0.0 // Min argument
#define MAX (M_PI/2) // Max argument
#define PMUL 2 // The form of polynomial is y = x^(PADD+PMUL*0) + x^(PADD+PMUL*1) + ...
#define PADD 3
void TARGET(mpfr_t ret, mpfr_t a) { // The function to approximate
mpfr_sin(ret, a, GMP_RNDN);
mpfr_sub(ret, ret, a, GMP_RNDN); // ret = sin(a) - a
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#endif
#if 0
#define N 15
#define S 150
#define L 0
#define MIN 0.0
#define MAX (M_PI/2)
void TARGET(mpfr_t ret, mpfr_t a) { // cos(x) - 1
mpfr_t x;
mpfr_init(x);
mpfr_cos(ret, a, GMP_RNDN);
mpfr_set_ld(x, 1, GMP_RNDN);
mpfr_sub(ret, ret, x, GMP_RNDN);
mpfr_clear(x);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define PMUL 2
#define PADD 2
//#define FIXCOEF0 (-0.5)
#endif
#if 0 // for xsincospi4_u05
#define N 13
#define S 150
#define L 2
#define P 0.9
#define MIN 0.0
#define MAX 1.0
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) {
mpfr_t x, y;
mpfr_inits(x, y, NULL);
mpfr_const_pi(x, GMP_RNDN);
mpfr_set_d(y, 1.0/4, GMP_RNDN);
mpfr_mul(x, x, y, GMP_RNDN);
mpfr_mul(x, x, a, GMP_RNDN);
mpfr_sin(ret, x, GMP_RNDN);
mpfr_clears(x, y, NULL);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#endif
#if 0 // for xsincospi4_u05
#define N 13
#define S 150
#define L 2
#define MIN 0.0
#define MAX 1.0
void TARGET(mpfr_t ret, mpfr_t a) { // cos(x) - 1
mpfr_t x, y;
mpfr_inits(x, y, NULL);
mpfr_const_pi(x, GMP_RNDN);
mpfr_set_d(y, 1.0/4, GMP_RNDN);
mpfr_mul(x, x, y, GMP_RNDN);
mpfr_mul(x, x, a, GMP_RNDN);
mpfr_cos(ret, x, GMP_RNDN);
mpfr_set_ld(x, 1, GMP_RNDN);
mpfr_sub(ret, ret, x, GMP_RNDN);
mpfr_clears(x, y, NULL);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define PMUL 2
#define PADD 2
#endif
#if 0 // running
#define N 31
#define S 100
#define P 1.7
#define L 0
#define MIN 0.0
#define MAX (M_PI/4)
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_tan(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0
#endif
#if 0 // running
#define N 20
#define S 110
#define L 2
#define MIN 1 //0.75
#define MAX 1.5
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_log(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t frd, mpfr_t fra) {
mpfr_t tmp, one;
mpfr_inits(tmp, one, NULL);
mpfr_set_d(one, 1, GMP_RNDN);
mpfr_add(tmp, fra, one, GMP_RNDN);
mpfr_sub(frd, fra, one, GMP_RNDN);
mpfr_div(frd, frd, tmp, GMP_RNDN);
mpfr_clears(tmp, one, NULL);
}
#define FIXCOEF0 2.0
#endif
#if 1
#define N 22
#define S 140
#define L 2
#define MIN -0.347
#define MAX 0.347 // 0.5 log 2
#define PMUL 1
#define PADD 0
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_exp(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0
#define FIXCOEF1 1.0
//#define FIXCOEF2 0.5
#endif
#if 0 // running
#define N 45
#define S 100
#define P 1.55
#define L 2
#define MIN 0.0
#define MAX 1.0
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_atan(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0
#endif

View File

@@ -0,0 +1,459 @@
// The original code for simplex algorithm is taken from Haruhiko Okumura's book.
// https://oku.edu.mie-u.ac.jp/~okumura/algo/
// The code is distributed under the Creative Commons Attribution 4.0 International License.
// https://creativecommons.org/licenses/by/4.0/
// The code is modified by Naoki Shibata to process arbitrary precision numbers.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <float.h>
#include <time.h>
#include <mpfr.h>
#define PREC 4096
#define EPS 1e-50
#define OK 0
#define MAXIMIZABLE_TO_INFINITY 1
#define NOT_FEASIBLE 2
#define ERROR (-1)
#define NOP (-1)
#define EQU (0)
#define LEQ 1
#define GEQ 2
static int m, n, n1, n2, n3, jmax;
static int *col, *row, *nonzero_row, *inequality;
static mpfr_t **a, *c, **q, *pivotcolumn;
static mpfr_t zero, one, eps, minuseps, large;
void mpfr_zinit(mpfr_t m) {
mpfr_init(m);
mpfr_set_d(m, 0, GMP_RNDN);
}
static void init(int n0, int m0) {
int i, j;
m = m0; n = n0;
mpfr_init(zero); mpfr_set_d(zero, 0, GMP_RNDN);
mpfr_init(one); mpfr_set_d(one, 1, GMP_RNDN);
mpfr_init(eps);
mpfr_set_d(eps, EPS, GMP_RNDN);
mpfr_init(minuseps);
mpfr_set_d(minuseps, -EPS, GMP_RNDN);
mpfr_init(large);
mpfr_set_d(large, 1.0 / EPS, GMP_RNDN);
a = malloc(sizeof(mpfr_t *) * (m + 1));
for(i=0;i < m+1;i++) {
a[i] = malloc(sizeof(mpfr_t) * (n + 1));
for(j=0;j < (n+1);j++) {
mpfr_zinit(a[i][j]);
}
}
q = malloc(sizeof(mpfr_t *) * (m + 1));
for(i=0;i < m+1;i++) {
q[i] = malloc(sizeof(mpfr_t) * (m + 1));
for(j=0;j < m+1;j++) {
mpfr_zinit(q[i][j]);
}
}
c = malloc(sizeof(mpfr_t) * (n + 1));
for(j=0;j < (n+1);j++) {
mpfr_zinit(c[j]);
}
pivotcolumn = malloc(sizeof(mpfr_t) * (m + 1));
for(j=0;j < (m+1);j++) {
mpfr_zinit(pivotcolumn[j]);
}
col = calloc(m+1, sizeof(int));
row = calloc(n+2*m+1, sizeof(int));
nonzero_row = calloc(n+2*m+1, sizeof(int));
inequality = calloc(m+1, sizeof(int));
}
static void dispose() {
mpfr_clears(zero, one, eps, minuseps, large, (mpfr_ptr)0);
int i, j;
for(i=0;i < m+1;i++) {
for(j=0;j < m+1;j++) {
mpfr_clear(q[i][j]);
}
free(q[i]);
}
free(q);
for(i=0;i < m+1;i++) {
for(j=0;j < n+1;j++) {
mpfr_clear(a[i][j]);
}
free(a[i]);
}
free(a);
for(j=0;j < n+1;j++) {
mpfr_clear(c[j]);
}
free(c);
for(j=0;j < m+1;j++) {
mpfr_clear(pivotcolumn[j]);
}
free(pivotcolumn);
free(col);
free(row);
free(nonzero_row);
free(inequality);
}
static void prepare() {
int i;
n1 = n;
for (i = 1; i <= m; i++)
if (inequality[i] == GEQ) {
n1++; nonzero_row[n1] = i;
}
n2 = n1;
for (i = 1; i <= m; i++)
if (inequality[i] == LEQ) {
n2++; col[i] = n2;
nonzero_row[n2] = row[n2] = i;
}
n3 = n2;
for (i = 1; i <= m; i++)
if (inequality[i] != LEQ) {
n3++; col[i] = n3;
nonzero_row[n3] = row[n3] = i;
}
for (i = 0; i <= m; i++) {
mpfr_set_d(q[i][i], 1, GMP_RNDN);
}
}
static void tableau(mpfr_t ret, int i, int j) {
int k;
if (col[i] < 0) { mpfr_set_d(ret, 0, GMP_RNDN); return; }
if (j <= n) {
mpfr_t s;
mpfr_zinit(s);
mpfr_set_d(s, 0, GMP_RNDN);
mpfr_t *tab = malloc(sizeof(mpfr_t) * (m + 1));
mpfr_ptr *ptab = malloc(sizeof(mpfr_ptr) * (m + 1));
for (k = 0; k <= m; k++) {
mpfr_zinit(tab[k]);
ptab[k] = (mpfr_ptr)&tab[k];
mpfr_mul(tab[k], q[i][k], a[k][j], GMP_RNDN);
}
mpfr_sum(s, ptab, m+1, GMP_RNDN);
for (k = 0; k <= m; k++) {
mpfr_clear(tab[k]);
}
free(ptab);
free(tab);
mpfr_set(ret, s, GMP_RNDN);
mpfr_clear(s);
return;
}
mpfr_set(ret, q[i][nonzero_row[j]], GMP_RNDN);
if (j <= n1) { mpfr_neg(ret, ret, GMP_RNDN); return; }
if (j <= n2 || i != 0) return;
mpfr_add(ret, ret, one, GMP_RNDN);
return;
}
static void pivot(int ipivot, int jpivot) {
int i, j;
mpfr_t u;
mpfr_zinit(u);
mpfr_set(u, pivotcolumn[ipivot], GMP_RNDN);
for (j = 1; j <= m; j++) {
mpfr_div(q[ipivot][j], q[ipivot][j], u, GMP_RNDN);
}
for (i = 0; i <= m; i++)
if (i != ipivot) {
mpfr_set(u, pivotcolumn[i], GMP_RNDN);
for (j = 1; j <= m; j++) {
mpfr_fms(q[i][j], q[ipivot][j], u, q[i][j], GMP_RNDN);
mpfr_neg(q[i][j], q[i][j], GMP_RNDN);
}
}
row[col[ipivot]] = 0;
col[ipivot] = jpivot; row[jpivot] = ipivot;
mpfr_clear(u);
}
static int minimize() {
int i, ipivot, jpivot;
mpfr_t t, u;
mpfr_inits(t, u, (mpfr_ptr)0);
for (;;) {
for (jpivot = 1; jpivot <= jmax; jpivot++) {
if (row[jpivot] == 0) {
tableau(pivotcolumn[0], 0, jpivot);
if (mpfr_cmp(pivotcolumn[0], minuseps) < 0) break;
}
}
if (jpivot > jmax) {
mpfr_clears(t, u, (mpfr_ptr)0);
return 1;
}
mpfr_set(u, large, GMP_RNDN);
ipivot = 0;
for (i = 1; i <= m; i++) {
tableau(pivotcolumn[i], i, jpivot);
if (mpfr_cmp(pivotcolumn[i], eps) > 0) {
tableau(t, i, 0);
mpfr_div(t, t, pivotcolumn[i], GMP_RNDN);
if (mpfr_cmp(t, u) < 0) { ipivot = i; mpfr_set(u, t, GMP_RNDN); }
}
}
if (ipivot == 0) {
mpfr_clears(t, u, (mpfr_ptr)0);
return 0; // the objective function can be minimized to -infinity
}
pivot(ipivot, jpivot);
}
}
static int phase1() {
int i, j;
mpfr_t u;
mpfr_zinit(u);
jmax = n3;
for (i = 0; i <= m; i++) {
if (col[i] > n2) mpfr_set_d(q[0][i], -1, GMP_RNDN);
}
minimize();
tableau(u, 0, 0);
if (mpfr_cmp(u, minuseps) < 0) {
mpfr_clear(u);
return 0;
}
for (i = 1; i <= m; i++) {
if (col[i] > n2) {
col[i] = -1;
}
}
mpfr_set_d(q[0][0], 1, GMP_RNDN);
for (j = 1; j <= m; j++) mpfr_set_d(q[0][j], 0, GMP_RNDN);
for (i = 1; i <= m; i++) {
if ((j = col[i]) > 0 && j <= n && mpfr_cmp_d(c[j], 0) != 0) {
mpfr_set(u, c[j], GMP_RNDN);
for (j = 1; j <= m; j++) {
mpfr_fms(q[0][j], q[i][j], u, q[0][j], GMP_RNDN);
mpfr_neg(q[0][j], q[0][j], GMP_RNDN);
}
}
}
mpfr_clear(u);
return 1;
}
static int phase2() {
int j;
jmax = n2;
for (j = 0; j <= n; j++) {
mpfr_set(a[0][j], c[j], GMP_RNDN);
}
return minimize();
}
int solve_fr(mpfr_t *result, int n0, int m0, mpfr_t **a0, int *ineq0, mpfr_t *c0) {
int i,j;
m = m0; // number of inequations
n = n0+1; // number of variables
init(n, m);
mpfr_t csum;
mpfr_zinit(csum);
for(j=0;j<n0+1;j++) {
mpfr_set(c[j], c0[j], GMP_RNDN);
}
for(j=1;j<n0+1;j++) {
mpfr_add(csum, csum, c0[j], GMP_RNDN);
}
mpfr_set(c[n], csum, GMP_RNDN);
mpfr_neg(c[n], c[n], GMP_RNDN);
for(i=0;i<m;i++) {
mpfr_set_d(csum, 0, GMP_RNDN);
for(j=0;j<n0+1;j++) mpfr_set(a[i+1][j], a0[i][j], GMP_RNDN);
mpfr_neg(a[i+1][0], a[i+1][0], GMP_RNDN);
for(j=1;j<n0+1;j++) {
mpfr_add(csum, csum, a0[i][j], GMP_RNDN);
}
mpfr_set(a[i+1][n], csum, GMP_RNDN);
mpfr_neg(a[i+1][n], a[i+1][n], GMP_RNDN);
inequality[i+1] = ineq0[i];
if (mpfr_cmp_d(a[i+1][0], 0) < 0) {
if (inequality[i+1] == GEQ) inequality[i+1] = LEQ;
else if (inequality[i+1] == LEQ) inequality[i+1] = GEQ;
for (j = 0; j <= n; j++) mpfr_neg(a[i+1][j], a[i+1][j], GMP_RNDN);
} else if (mpfr_cmp_d(a[i+1][0], 0) == 0 && inequality[i+1] == GEQ) {
inequality[i+1] = LEQ;
for (j = 1; j <= n; j++) mpfr_neg(a[i+1][j], a[i+1][j], GMP_RNDN);
}
}
int p1r = 1;
prepare();
if (n3 != n2) p1r = phase1();
if (!p1r) {
dispose();
return NOT_FEASIBLE;
}
int b = phase2();
mpfr_t *s = calloc(sizeof(mpfr_t), n);
for(j=0;j<n;j++) {
mpfr_zinit(s[j]);
}
for (j = 1; j < n; j++) {
if ((i = row[j]) != 0) {
tableau(s[j], i, 0);
}
}
mpfr_t cs;
mpfr_zinit(cs);
if (row[n] != 0) tableau(cs, row[n], 0);
for (j = 1; j < n; j++) {
mpfr_sub(s[j], s[j], cs, GMP_RNDN);
}
for(j=0;j<n;j++) {
mpfr_set(result[j], s[j], GMP_RNDN);
}
mpfr_clear(cs);
for(j=0;j<n;j++) mpfr_clear(s[j]);
free(s);
dispose();
return b ? OK : MAXIMIZABLE_TO_INFINITY;
}
void regressMinRelError_fr(int n, int m, mpfr_t **x, mpfr_t *result) {
int m0 = n * 3, n0 = m + 2 * n, i, j;
mpfr_t **a0, *c0, *result0;
int in0[m0];
a0 = malloc(sizeof(mpfr_t *) * m0);
for(i=0;i<m0;i++) {
a0[i] = calloc(n0+1, sizeof(mpfr_t));
for(j=0;j<n0+1;j++) mpfr_zinit(a0[i][j]);
}
c0 = calloc(n0+1, sizeof(mpfr_t));
result0 = calloc(n0+1, sizeof(mpfr_t));
for(j=0;j<n0+1;j++) {
mpfr_zinit(c0[j]);
mpfr_zinit(result0[j]);
}
for(i=0;i<n;i++) {
long double ld = mpfr_get_ld(x[m][i], GMP_RNDN);
if (fabsl(ld) < DBL_MIN) ld = 1;
#if 1
mpfr_set_ld(c0[m+i +1], 1.0/fabsl(ld), GMP_RNDN);
mpfr_set_ld(c0[m+n+i+1], 1.0/fabsl(ld), GMP_RNDN);
#else
int e;
frexpl(ld, &e);
ld = 1.0 / ldexpl(1.0, e);
mpfr_set_ld(c0[m+i +1], ld, GMP_RNDN);
mpfr_set_ld(c0[m+n+i+1], ld, GMP_RNDN);
#endif
mpfr_set_d(a0[i*3+0][m+i+1], 1, GMP_RNDN);
in0[i*3+0] = GEQ;
mpfr_set_d(a0[i*3+1][m+n+i+1], 1, GMP_RNDN);
in0[i*3+1] = GEQ;
for(j=0;j<m;j++) {
mpfr_set(a0[i*3+2][j+1], x[j][i], GMP_RNDN);
}
mpfr_set_d(a0[i*3+2][m+i+1], 1, GMP_RNDN);
mpfr_set_d(a0[i*3+2][m+n+i+1], -1, GMP_RNDN);
in0[i*3+2] = EQU;
mpfr_set(a0[i*3+2][0], x[m][i], GMP_RNDN);
mpfr_neg(a0[i*3+2][0], a0[i*3+2][0], GMP_RNDN);
}
int status = solve_fr(result0, n0, m0, a0, in0, c0);
if (status == NOT_FEASIBLE) {
printf("not feasible\n");
} else {
if (status == MAXIMIZABLE_TO_INFINITY) printf("maximizable to inf\n");
}
for(i=0;i<m;i++) {
mpfr_set(result[i], result0[i+1], GMP_RNDN);
}
free(result0);
free(c0);
}
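/* Sketch of the LP encoding used above: each of the n sample points gets two
   slack variables (columns m+i+1 and m+n+i+1) with nonnegativity rows, plus
   one equality row  sum_j coef_j * x[j][i] + ep_i - em_i = f(x_i);  the
   objective weights both slacks by 1/|f(x_i)|, so minimizing it minimizes a
   relative-error measure of the fit aggregated over the samples. */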

View File

@@ -0,0 +1,159 @@
// This is part of SLEEF, written by Naoki
// Shibata. http://shibatch.sourceforge.net
// The code in this file is distributed under the Creative Commons
// Attribution 4.0 International License.
#define PREC_TARGET 24
#if 1
#define N 5 // Degree of equation
#define S 81 // Number of samples for phase 1
#define L 0 // Number of high precision coefficients
#define P 0.37
#define MIN 0.0 // Min argument
#define MAX (M_PI/2) // Max argument
#define PMUL 2 // The form of polynomial is y = x^(PADD+PMUL*0) + x^(PADD+PMUL*1) + ...
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_sin(ret, a, GMP_RNDN); } // The function to approximate
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0 // Fix coef 0 to 1.0
#endif
#if 0
#define N 5
#define S 40
#define L 0
#define MIN 0.0
#define MAX (M_PI/2)
void TARGET(mpfr_t ret, mpfr_t a) { // cos(x) - 1
mpfr_t x;
mpfr_init(x);
mpfr_cos(ret, a, GMP_RNDN);
mpfr_set_ld(x, 1, GMP_RNDN);
mpfr_sub(ret, ret, x, GMP_RNDN);
mpfr_clear(x);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define PMUL 2
#define PADD 2
#define FIXCOEF0 (-0.5)
#endif
#if 0
// xsincospi4
#define N 5
#define S 30
#define P 0.69
#define L 2
#define MIN 0.0
#define MAX 1.0
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) {
mpfr_t x, y;
mpfr_inits(x, y, NULL);
mpfr_const_pi(x, GMP_RNDN);
mpfr_set_d(y, 1.0/4, GMP_RNDN);
mpfr_mul(x, x, y, GMP_RNDN);
mpfr_mul(x, x, a, GMP_RNDN);
mpfr_sin(ret, x, GMP_RNDN);
mpfr_clears(x, y, NULL);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#endif
#if 0
// xsincospi4
#define N 5
#define S 60
#define P 0.7
#define L 1
#define MIN 0.0
#define MAX 1.0
void TARGET(mpfr_t ret, mpfr_t a) {
mpfr_t x, y;
mpfr_inits(x, y, NULL);
mpfr_const_pi(x, GMP_RNDN);
mpfr_set_d(y, 1.0/4, GMP_RNDN);
mpfr_mul(x, x, y, GMP_RNDN);
mpfr_mul(x, x, a, GMP_RNDN);
mpfr_cos(ret, x, GMP_RNDN);
mpfr_set_ld(x, 1, GMP_RNDN);
mpfr_sub(ret, ret, x, GMP_RNDN);
mpfr_clears(x, y, NULL);
}
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define PMUL 2
#define PADD 2
#endif
#if 0
#define N 7
#define S 40
#define L 2
#define MIN 0.0
#define MAX (M_PI/4)
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_tan(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0
#endif
#if 0
#define N 5
#define S 40
#define L 2
#define MIN 1 //0.75
#define MAX 1.5
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_log(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t frd, mpfr_t fra) {
mpfr_t tmp, one;
mpfr_inits(tmp, one, NULL);
mpfr_set_d(one, 1, GMP_RNDN);
mpfr_add(tmp, fra, one, GMP_RNDN);
mpfr_sub(frd, fra, one, GMP_RNDN);
mpfr_div(frd, frd, tmp, GMP_RNDN);
mpfr_clears(tmp, one, NULL);
}
#define FIXCOEF0 2.0
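// Note: CFUNC maps x to s = (x-1)/(x+1). Since log(x) = 2*atanh(s)
// = 2s + (2/3)s^3 + (2/5)s^5 + ..., the odd-power fit (PADD=1, PMUL=2)
// with coefficient 0 fixed to 2.0 pins the leading term of that series.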
#endif
#if 0
#define N 7
#define S 50
#define L 0
#define MIN -0.347
#define MAX 0.347 // 0.5 log 2
#define PMUL 1
#define PADD 0
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_exp(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0
#define FIXCOEF1 1.0
//#define FIXCOEF2 0.5
#endif
#if 0
#define N 10
#define S 100
#define L 2
#define MIN 0.0
#define MAX 1.0
#define PMUL 2
#define PADD 1
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_atan(ret, a, GMP_RNDN); }
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0
#endif

View File

@@ -0,0 +1,153 @@
ICCAVAILABLE := $(shell command -v icc 2> /dev/null)
ARCH := $(shell uname -p)
all :
ifndef BUILDDIR
@echo
@echo Please set the build directory in the BUILDDIR environment variable and run make again.
@echo e.g. export BUILDDIR='`pwd`'/../../build
@echo
else
@echo
@echo You can start measurement by "'"make measure"'".
ifdef ICCAVAILABLE
@echo You can start measurement with SVML by "'"make measureSVML"'".
endif
@echo Then, you can plot the results of measurement by "'"make plot"'".
@echo
@echo You have to install java and gnuplot to do plotting.
@echo Stop all tasks on the computer before starting measurement.
@echo
endif
benchsvml128_10.o : benchsvml128.c bench.h
-command -v icc >/dev/null 2>&1 && icc benchsvml128.c -Wall -I.. -DSVMLULP=1 -fimf-max-error=1.0 -fimf-domain-exclusion=0 -march=core-avx2 -O0 -lm -c -o benchsvml128_10.o
benchsvml128_40.o : benchsvml128.c bench.h
-command -v icc >/dev/null 2>&1 && icc benchsvml128.c -Wall -I.. -DSVMLULP=4 -fimf-max-error=4.0 -fimf-domain-exclusion=0 -march=core-avx2 -O0 -lm -c -o benchsvml128_40.o
benchsvml256_10.o : benchsvml256.c bench.h
-command -v icc >/dev/null 2>&1 && icc benchsvml256.c -Wall -I.. -DSVMLULP=1 -fimf-max-error=1.0 -fimf-domain-exclusion=0 -march=core-avx2 -O0 -lm -c -o benchsvml256_10.o
benchsvml256_40.o : benchsvml256.c bench.h
-command -v icc >/dev/null 2>&1 && icc benchsvml256.c -Wall -I.. -DSVMLULP=4 -fimf-max-error=4.0 -fimf-domain-exclusion=0 -march=core-avx2 -O0 -lm -c -o benchsvml256_40.o
benchsvml512_10.o : benchsvml512.c bench.h
-command -v icc >/dev/null 2>&1 && icc benchsvml512.c -Wall -I.. -DSVMLULP=1 -fimf-max-error=1.0 -fimf-domain-exclusion=0 -xCOMMON-AVX512 -O0 -lm -c -o benchsvml512_10.o
benchsvml512_40.o : benchsvml512.c bench.h
-command -v icc >/dev/null 2>&1 && icc benchsvml512.c -Wall -I.. -DSVMLULP=4 -fimf-max-error=4.0 -fimf-domain-exclusion=0 -xCOMMON-AVX512 -O0 -lm -c -o benchsvml512_40.o
benchsvml_10 : benchsvml.c benchsvml128_10.o benchsvml256_10.o benchsvml512_10.o bench.h
-command -v icc >/dev/null 2>&1 && icc benchsvml.c benchsvml128_10.o benchsvml256_10.o benchsvml512_10.o -Wall -I.. -DSVMLULP=1 -fimf-max-error=1.0 -fimf-domain-exclusion=0 -O0 -march=native -lm -o benchsvml_10
benchsvml_40 : benchsvml.c benchsvml128_40.o benchsvml256_40.o benchsvml512_40.o bench.h
-command -v icc >/dev/null 2>&1 && icc benchsvml.c benchsvml128_40.o benchsvml256_40.o benchsvml512_40.o -Wall -I.. -DSVMLULP=4 -fimf-max-error=4.0 -fimf-domain-exclusion=0 -O0 -march=native -lm -o benchsvml_40
#
ifeq ($(ARCH),aarch64)
benchsleef : benchsleef.c benchsleef128.o bench.h
$(CC) benchsleef.c benchsleef128.o -Wall -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -lsleef -lm -o benchsleef
benchsleef128.o : benchsleef128.c bench.h
$(CC) benchsleef128.c -Wall -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
else ifeq ($(ARCH),s390x)
benchsleef : benchsleef.c benchsleef128.o bench.h
$(CC) benchsleef.c benchsleef128.o -Wall -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -lsleef -lm -o benchsleef
benchsleef128.o : benchsleef128.c bench.h
$(CC) benchsleef128.c -Wall -mzvector -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
else ifeq ($(ARCH),ppc64le)
benchsleef : benchsleef.c benchsleef128.o bench.h
$(CC) benchsleef.c benchsleef128.o -Wall -mcpu=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -lsleef -lm -o benchsleef
benchsleef128.o : benchsleef128.c bench.h
$(CC) benchsleef128.c -Wall -mcpu=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
else
benchsleef : benchsleef.c benchsleef128.o benchsleef256.o benchsleef512.o bench.h
$(CC) benchsleef.c benchsleef128.o benchsleef256.o benchsleef512.o -Wall -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -lsleef -lm -o benchsleef
benchsleef128.o : benchsleef128.c bench.h
$(CC) benchsleef128.c -Wall -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
benchsleef256.o : benchsleef256.c bench.h
$(CC) benchsleef256.c -Wall -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
benchsleef512.o : benchsleef512.c bench.h
$(CC) benchsleef512.c -Wall -mavx512f -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
endif
#
ProcessData.class : ProcessData.java
javac ProcessData.java
#
ifndef BUILDDIR
measure :
@echo
@echo Please set the build directory in the BUILDDIR environment variable and run make again.
@echo e.g. export BUILDDIR='`pwd`'/../../build
@echo
else
measure : benchsleef
chmod +x ./measure.sh
LD_LIBRARY_PATH=$(BUILDDIR)/lib ./measure.sh ./benchsleef
@echo
@echo Now, you can plot the results of measurement by "'"make plot"'".
@echo You can do another measurement by "'"make measure"'".
ifdef ICCAVAILABLE
@echo You can start another measurement with SVML by "'"make measureSVML"'".
endif
@echo You can start over by "'"make restart"'".
@echo
endif
measureSVML : all benchsvml_10 benchsvml_40
chmod +x ./measure.sh
./measure.sh ./benchsvml_10 ./benchsvml_40
@echo
@echo Now, you can plot the results of measurement by "'"make plot"'".
@echo You can do another measurement by "'"make measure"'".
ifdef ICCAVAILABLE
@echo You can start another measurement with SVML by "'"make measureSVML"'".
endif
@echo You can start over by "'"make restart"'".
@echo
plot : ProcessData.class counter.txt
java ProcessData *dptrig*.out
gnuplot script.out
mv output.png trigdp.png
java ProcessData *dpnontrig*.out
gnuplot script.out
mv output.png nontrigdp.png
java ProcessData *sptrig*.out
gnuplot script.out
mv output.png trigsp.png
java ProcessData *spnontrig*.out
gnuplot script.out
mv output.png nontrigsp.png
@echo
@echo Plotted results are in trigdp.png, nontrigdp.png, trigsp.png and nontrigsp.png.
@echo
clean :
rm -f *~ a.out *.so *.so.* *.a *.s *.o
rm -rf *.dSYM *.dylib
rm -f *.obj *.lib *.dll *.exp *.exe *.stackdump
rm -f *.class *.png benchsleef benchsvml_10 benchsvml_40 *.out counter.txt
restart :
rm -f *.out counter.txt
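#
# Typical session (an illustrative sketch; the path is an example):
#   export BUILDDIR=`pwd`/../../build
#   make measure
#   make plot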

View File

@@ -0,0 +1,193 @@
import java.util.*;
import java.io.*;
public class ProcessData {
static final int DP = 64, SP = 32;
static LinkedHashMap<String, Integer> funcNameOrder = new LinkedHashMap<String, Integer>();
static class Key {
final String funcName;
final int prec, bits;
final ArrayList<Double> range = new ArrayList<Double>();
final double ulps;
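// Parses one line of measurement output, e.g. (a hypothetical sample
// matching the fprintf format in bench.h):
//   sin, DP, 128, 0, 6.28, 1ulps, 0.00512
// fields: function name, precision, vector bits, range bounds, ULP bound, time.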
Key(String s) {
String[] a = s.split(",");
funcName = a[0].trim();
if (funcNameOrder.get(funcName) == null) {
funcNameOrder.put(funcName, funcNameOrder.size());
}
prec =
a[1].trim().equals("DP") ? DP :
a[1].trim().equals("SP") ? SP :
0;
bits = Integer.parseInt(a[2].trim());
int c;
for(c = 3;;c++) {
if (a[c].trim().endsWith("ulps")) break;
range.add(Double.parseDouble(a[c]));
}
ulps = Double.parseDouble(a[c].trim().replace("ulps", ""));
}
public int hashCode() {
int h = funcName.hashCode();
h ^= prec ^ bits;
return h;
}
public boolean equals(Object o) {
if (this == o) return true;
if (!(o instanceof Key)) return false;
Key k = (Key) o;
if (funcName.compareTo(k.funcName) != 0) return false;
if (prec != k.prec) return false;
if (bits != k.bits) return false;
if (range.size() != k.range.size()) return false;
for(int i=0;i<range.size();i++) {
if ((double)range.get(i) != (double)k.range.get(i)) return false;
}
if (ulps != k.ulps) return false;
return true;
}
public String toString() {
String s = funcName + " ";
s += prec == DP ? "DP " : "SP ";
s += bits + "bit ";
s += String.format(" %.0fulp ", ulps);
for(int i=0;i<range.size();i+=2) {
s += "[" + String.format("%.3g", range.get(i)) + ", " + String.format("%.3g", range.get(i+1)) + "]";
if (i + 2 < range.size()) s += " ";
}
return s;
}
}
static class KeyComparator implements Comparator<Key> {
public int compare(Key d0, Key d1) {
if (d0 == d1) return 0;
if (d0.prec < d1.prec) return 1;
if (d0.prec > d1.prec) return -1;
if (d0.ulps > d1.ulps) return 1;
if (d0.ulps < d1.ulps) return -1;
int fc = (int)funcNameOrder.get(d0.funcName) - (int)funcNameOrder.get(d1.funcName);
if (fc != 0) return fc;
if (d0.bits > d1.bits) return 1;
if (d0.bits < d1.bits) return -1;
if (d0.range.size() > d1.range.size()) return 1;
if (d0.range.size() < d1.range.size()) return -1;
for(int i=0;i<d0.range.size();i++) {
if (d0.range.get(i) > d1.range.get(i)) return 1;
if (d0.range.get(i) < d1.range.get(i)) return -1;
}
return 0;
}
}
public static void main(String[] args) throws Exception {
LinkedHashMap<Key, LinkedHashMap<String, Double>> allData = new LinkedHashMap<Key, LinkedHashMap<String, Double>>();
TreeSet<Key> allKeys = new TreeSet<Key>(new KeyComparator());
LinkedHashSet<String> allColumnTitles = new LinkedHashSet<String>();
double maximum = 0;
for(int i=0;i<args.length;i++) {
LineNumberReader lnr = new LineNumberReader(new FileReader(args[i]));
String columnTitle = lnr.readLine();
allColumnTitles.add(columnTitle);
for(;;) {
String s = lnr.readLine();
if (s == null) break;
Key key = new Key(s);
allKeys.add(key);
LinkedHashMap<String, Double> v = allData.get(key);
if (v == null) {
v = new LinkedHashMap<String, Double>();
allData.put(key, v);
}
String[] a = s.split(",");
double time = Double.parseDouble(a[a.length-1]);
v.put(columnTitle, time);
maximum = Math.max(maximum, time);
}
lnr.close();
}
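// Write the merged table: one row per key, one column per input file.
// A missing measurement is emitted as 0 so the histogram clusters stay aligned.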
PrintStream ps = new PrintStream("data.out");
for(Key k : allKeys) {
ps.print("\"" + k + "\" ");
LinkedHashMap<String, Double> v = allData.get(k);
for(String s : allColumnTitles) {
Double d = v.get(s);
ps.print(d != null ? d.toString() : "0");
ps.print("\t");
}
ps.println();
}
ps.close();
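// Emit a gnuplot script drawing one clustered-histogram bar per input file
// (column) for every function/precision/range key in data.out.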
ps = new PrintStream("script.out");
ps.println("set terminal pngcairo size 1280, 800 font \",10\"");
ps.println("set output \"output.png\"");
ps.println("color00 = \"#FF5050\";"); // red
ps.println("color01 = \"#0066FF\";"); // blue
ps.println("color02 = \"#00FF00\";"); // green
ps.println("color03 = \"#FF9900\";"); // orange
ps.println("color04 = \"#CC00CC\";"); // purple
ps.println("color05 = \"#880000\";"); // brown
ps.println("color06 = \"#003300\";"); // dark green
ps.println("color07 = \"#000066\";"); // dark blue
ps.println("set style data histogram");
ps.println("set style histogram cluster gap 1");
ps.println("set style fill solid 1.00");
ps.println("set boxwidth 0.9");
ps.println("set xtics format \"\"");
ps.println("set xtics rotate by -90");
ps.println("set grid ytics");
ps.println("set ylabel \"Execution time in micro sec.\"");
ps.println("set yrange [0:*]");
ps.println("set bmargin 24");
ps.println("set title \"Single execution time in micro sec.\"");
ps.print("plot");
int i = 0;
for(String s : allColumnTitles) {
ps.print("\"data.out\" using " + (i+2) + ":xtic(1) title \"" + s +
"\" linecolor rgb color" + String.format("%02d", i));
if (i != allColumnTitles.size()-1) ps.print(", ");
i++;
}
ps.println();
ps.close();
}
}

View File

@@ -0,0 +1,58 @@
#define NITER1 100000
#define NITER2 10000
#define NITER (NITER1 * NITER2)
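// NITER1 * NITER2 = 1e9 calls per benchmarked function; each macro below
// reports the mean time of one call in microseconds.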
#define callFuncSLEEF1_1(funcName, name, xmin, xmax, ulp, arg, type) ({ \
printf("%s\n", #funcName); \
uint64_t t = Sleef_currentTimeMicros(); \
for(int j=0;j<NITER2;j++) { \
type *p = (type *)(arg); \
for(int i=0;i<NITER1;i++) funcName(*p++); \
} \
fprintf(fp, name ", %.3g, %.3g, %gulps, %g\n", \
(double)xmin, (double)xmax, ulp, (double)(Sleef_currentTimeMicros() - t) / NITER); \
})
#define callFuncSLEEF1_2(funcName, name, xmin, xmax, ymin, ymax, ulp, arg1, arg2, type) ({ \
printf("%s\n", #funcName); \
uint64_t t = Sleef_currentTimeMicros(); \
for(int j=0;j<NITER2;j++) { \
type *p1 = (type *)(arg1), *p2 = (type *)(arg2); \
for(int i=0;i<NITER1;i++) funcName(*p1++, *p2++); \
} \
fprintf(fp, name ", %.3g, %.3g, %.3g, %.3g, %gulps, %g\n", \
(double)xmin, (double)xmax, (double)ymin, (double)ymax, ulp, (double)(Sleef_currentTimeMicros() - t) / NITER); \
})
#define callFuncSVML1_1(funcName, name, xmin, xmax, arg, type) ({ \
printf("%s\n", #funcName); \
uint64_t t = Sleef_currentTimeMicros(); \
for(int j=0;j<NITER2;j++) { \
type *p = (type *)(arg); \
for(int i=0;i<NITER1;i++) funcName(*p++); \
} \
fprintf(fp, name ", %.3g, %.3g, %gulps, %g\n", \
(double)xmin, (double)xmax, (double)SVMLULP, (double)(Sleef_currentTimeMicros() - t) / NITER); \
})
#define callFuncSVML2_1(funcName, name, xmin, xmax, arg, type) ({ \
printf("%s\n", #funcName); \
uint64_t t = Sleef_currentTimeMicros(); \
for(int j=0;j<NITER2;j++) { \
type *p = (type *)(arg), c; \
for(int i=0;i<NITER1;i++) funcName(&c, *p++); \
} \
fprintf(fp, name ", %.3g, %.3g, %gulps, %g\n", \
(double)xmin, (double)xmax, (double)SVMLULP, (double)(Sleef_currentTimeMicros() - t) / NITER); \
})
#define callFuncSVML1_2(funcName, name, xmin, xmax, ymin, ymax, arg1, arg2, type) ({ \
printf("%s\n", #funcName); \
uint64_t t = Sleef_currentTimeMicros(); \
for(int j=0;j<NITER2;j++) { \
type *p1 = (type *)(arg1), *p2 = (type *)(arg2); \
for(int i=0;i<NITER1;i++) funcName(*p1++, *p2++); \
} \
fprintf(fp, name ", %.3g, %.3g, %.3g, %.3g, %gulps, %g\n", \
(double)xmin, (double)xmax, (double)ymin, (double)ymax, (double)SVMLULP, (double)(Sleef_currentTimeMicros() - t) / NITER); \
})

View File

@@ -0,0 +1,144 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <math.h>
#include <time.h>
#include <sleef.h>
#include "bench.h"
int veclen = 16;
double *abufdp, *bbufdp;
float *abufsp, *bbufsp;
FILE *fp;
#if defined(__i386__) || defined(__x86_64__)
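// CPUID leaf 1 reports AVX in ECX bit 28; leaf 7 (subleaf 0) reports
// AVX-512 Foundation in EBX bit 16. The helpers below test exactly those bits.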
void x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) {
uint32_t a, b, c, d;
__asm__ __volatile__ ("cpuid" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "a" (eax), "c"(ecx));
out[0] = a; out[1] = b; out[2] = c; out[3] = d;
}
int cpuSupportsAVX() {
int32_t reg[4];
x86CpuID(reg, 1, 0);
return (reg[2] & (1 << 28)) != 0;
}
int cpuSupportsAVX512F() {
int32_t reg[4];
x86CpuID(reg, 7, 0);
return (reg[1] & (1 << 16)) != 0;
}
#endif
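// Fill buf with uniform random values spanning [min, max]; two random()
// draws are combined so the mantissa receives more than the 31 bits a
// single draw provides.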
void fillDP(double *buf, double min, double max) {
for(int i=0;i<NITER1*veclen;i++) {
double r = ((double)random() + RAND_MAX * (double)random()) / (RAND_MAX * (double)RAND_MAX);
buf[i] = r * (max - min) + min;
}
}
void fillSP(float *buf, double min, double max) {
for(int i=0;i<NITER1*veclen;i++) {
double r = ((double)random() + RAND_MAX * (double)random()) / (RAND_MAX * (double)RAND_MAX);
buf[i] = r * (max - min) + min;
}
}
void benchSleef128_DPTrig();
void benchSleef256_DPTrig();
void benchSleef512_DPTrig();
void benchSleef128_DPNontrig();
void benchSleef256_DPNontrig();
void benchSleef512_DPNontrig();
void benchSleef128_SPTrig();
void benchSleef256_SPTrig();
void benchSleef512_SPTrig();
void benchSleef128_SPNontrig();
void benchSleef256_SPNontrig();
void benchSleef512_SPNontrig();
//
int main(int argc, char **argv) {
char *columnTitle = "SLEEF", *fnBase = "sleef";
char fn[1024];
if (argc != 1) columnTitle = argv[1];
if (argc >= 3) fnBase = argv[2];
srandom(time(NULL));
#if defined(__i386__) || defined(__x86_64__)
int do128bit = 1;
int do256bit = cpuSupportsAVX();
int do512bit = cpuSupportsAVX512F();
#elif defined(__ARM_NEON) || defined(__VSX__) || defined(__VX__)
int do128bit = 1;
#else
#error Unsupported architecture
#endif
posix_memalign((void **)&abufdp, veclen*sizeof(double), NITER1*veclen*sizeof(double));
posix_memalign((void **)&bbufdp, veclen*sizeof(double), NITER1*veclen*sizeof(double));
abufsp = (float *)abufdp;
bbufsp = (float *)bbufdp;
sprintf(fn, "%sdptrig.out", fnBase);
fp = fopen(fn, "w");
fprintf(fp, "%s\n", columnTitle);
if (do128bit) benchSleef128_DPTrig();
#if defined(__i386__) || defined(__x86_64__)
if (do256bit) benchSleef256_DPTrig();
if (do512bit) benchSleef512_DPTrig();
#endif
fclose(fp);
sprintf(fn, "%sdpnontrig.out", fnBase);
fp = fopen(fn, "w");
fprintf(fp, "%s\n", columnTitle);
if (do128bit) benchSleef128_DPNontrig();
#if defined(__i386__) || defined(__x86_64__)
if (do256bit) benchSleef256_DPNontrig();
if (do512bit) benchSleef512_DPNontrig();
#endif
fclose(fp);
sprintf(fn, "%ssptrig.out", fnBase);
fp = fopen(fn, "w");
fprintf(fp, "%s\n", columnTitle);
if (do128bit) benchSleef128_SPTrig();
#if defined(__i386__) || defined(__x86_64__)
if (do256bit) benchSleef256_SPTrig();
if (do512bit) benchSleef512_SPTrig();
#endif
fclose(fp);
sprintf(fn, "%sspnontrig.out", fnBase);
fp = fopen(fn, "w");
fprintf(fp, "%s\n", columnTitle);
if (do128bit) benchSleef128_SPNontrig();
#if defined(__i386__) || defined(__x86_64__)
if (do256bit) benchSleef256_SPNontrig();
if (do512bit) benchSleef512_SPNontrig();
#endif
fclose(fp);
exit(0);
}
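// Usage sketch (both arguments are optional and positional):
//   ./benchsleef                  -> column title "SLEEF", output files sleef*.out
//   ./benchsleef "SLEEF 3.6" s36  -> custom column title and s36*.out file prefix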

View File

@@ -0,0 +1,195 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <math.h>
#include <time.h>
#include <sleef.h>
void fillDP(double *buf, double min, double max);
void fillSP(float *buf, double min, double max);
extern char x86BrandString[256], versionString[1024];
extern int veclen;
extern double *abufdp, *bbufdp;
extern float *abufsp, *bbufsp;
extern FILE *fp;
#include "bench.h"
#ifdef __SSE2__
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
typedef __m128d vdouble;
typedef __m128 vfloat;
#define ENABLED
#elif defined(__ARM_NEON)
#include <arm_neon.h>
typedef float64x2_t vdouble;
typedef float32x4_t vfloat;
#define ENABLED
#elif defined(__VSX__)
#include <altivec.h>
typedef __vector double vdouble;
typedef __vector float vfloat;
#define ENABLED
#elif defined(__VX__)
#include <vecintrin.h>
typedef __vector double vdouble;
typedef __vector float vfloat;
#define ENABLED
#endif
#ifdef ENABLED
void benchSleef128_DPTrig() {
fillDP(abufdp, 0, 6.28);
callFuncSLEEF1_1(Sleef_sind2_u10 , "sin, DP, 128", 0, 6.28, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd2_u10 , "cos, DP, 128", 0, 6.28, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand2_u10 , "tan, DP, 128", 0, 6.28, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd2_u10, "sincos, DP, 128", 0, 6.28, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sind2_u35 , "sin, DP, 128", 0, 6.28, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd2_u35 , "cos, DP, 128", 0, 6.28, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand2_u35 , "tan, DP, 128", 0, 6.28, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd2_u35, "sincos, DP, 128", 0, 6.28, 4.0, abufdp, vdouble);
fillDP(abufdp, 0, 1e+6);
callFuncSLEEF1_1(Sleef_sind2_u10 , "sin, DP, 128", 0, 1e+6, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd2_u10 , "cos, DP, 128", 0, 1e+6, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand2_u10 , "tan, DP, 128", 0, 1e+6, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd2_u10, "sincos, DP, 128", 0, 1e+6, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sind2_u35 , "sin, DP, 128", 0, 1e+6, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd2_u35 , "cos, DP, 128", 0, 1e+6, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand2_u35 , "tan, DP, 128", 0, 1e+6, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd2_u35, "sincos, DP, 128", 0, 1e+6, 4.0, abufdp, vdouble);
fillDP(abufdp, 0, 1e+100);
callFuncSLEEF1_1(Sleef_sind2_u10 , "sin, DP, 128", 0, 1e+100, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd2_u10 , "cos, DP, 128", 0, 1e+100, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand2_u10 , "tan, DP, 128", 0, 1e+100, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd2_u10, "sincos, DP, 128", 0, 1e+100, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sind2_u35 , "sin, DP, 128", 0, 1e+100, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd2_u35 , "cos, DP, 128", 0, 1e+100, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand2_u35 , "tan, DP, 128", 0, 1e+100, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd2_u35, "sincos, DP, 128", 0, 1e+100, 4.0, abufdp, vdouble);
}
void benchSleef128_DPNontrig() {
fillDP(abufdp, 0, 1e+300);
callFuncSLEEF1_1(Sleef_logd2_u10 , "log, DP, 128", 0, 1e+300, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_log10d2_u10, "log10, DP, 128", 0, 1e+300, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_log1pd2_u10, "log1p, DP, 128", 0, 1e+300, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_logd2_u35 , "log, DP, 128", 0, 1e+300, 4.0, abufdp, vdouble);
fillDP(abufdp, -700, 700);
callFuncSLEEF1_1(Sleef_expd2_u10 , "exp, DP, 128", -700, 700, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_exp2d2_u10 , "exp2, DP, 128", -700, 700, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_exp10d2_u10, "exp10, DP, 128", -700, 700, 1.0, abufdp, vdouble);
fillDP(abufdp, -30, 30);
fillDP(bbufdp, -30, 30);
callFuncSLEEF1_2(Sleef_powd2_u10, "pow, DP, 128", -30, 30, -30, 30, 1.0, abufdp, bbufdp, vdouble);
fillDP(abufdp, -1.0, 1.0);
callFuncSLEEF1_1(Sleef_asind2_u10, "asin, DP, 128", -1.0, 1.0, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_acosd2_u10, "acos, DP, 128", -1.0, 1.0, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_asind2_u35, "asin, DP, 128", -1.0, 1.0, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_acosd2_u35, "acos, DP, 128", -1.0, 1.0, 4.0, abufdp, vdouble);
fillDP(abufdp, -10, 10);
fillDP(bbufdp, -10, 10);
callFuncSLEEF1_1(Sleef_atand2_u10, "atan, DP, 128", -10, 10, 1.0, abufdp, vdouble);
callFuncSLEEF1_2(Sleef_atan2d2_u10, "atan2, DP, 128", -10, 10, -10, 10, 1.0, abufdp, bbufdp, vdouble);
callFuncSLEEF1_1(Sleef_atand2_u35, "atan, DP, 128", -10, 10, 4.0, abufdp, vdouble);
callFuncSLEEF1_2(Sleef_atan2d2_u35, "atan2, DP, 128", -10, 10, -10, 10, 4.0, abufdp, bbufdp, vdouble);
}
void benchSleef128_SPTrig() {
fillSP(abufsp, 0, 6.28);
callFuncSLEEF1_1(Sleef_sinf4_u10 , "sin, SP, 128", 0, 6.28, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_cosf4_u10 , "cos, SP, 128", 0, 6.28, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_tanf4_u10 , "tan, SP, 128", 0, 6.28, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sincosf4_u10, "sincos, SP, 128", 0, 6.28, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sinf4_u35 , "sin, SP, 128", 0, 6.28, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_cosf4_u35 , "cos, SP, 128", 0, 6.28, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_tanf4_u35 , "tan, SP, 128", 0, 6.28, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sincosf4_u35, "sincos, SP, 128", 0, 6.28, 4.0, abufsp, vfloat);
fillSP(abufsp, 0, 1e+20);
callFuncSLEEF1_1(Sleef_sinf4_u10 , "sin, SP, 128", 0, 1e+20, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_cosf4_u10 , "cos, SP, 128", 0, 1e+20, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_tanf4_u10 , "tan, SP, 128", 0, 1e+20, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sincosf4_u10, "sincos, SP, 128", 0, 1e+20, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sinf4_u35 , "sin, SP, 128", 0, 1e+20, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_cosf4_u35 , "cos, SP, 128", 0, 1e+20, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_tanf4_u35 , "tan, SP, 128", 0, 1e+20, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sincosf4_u35, "sincos, SP, 128", 0, 1e+20, 4.0, abufsp, vfloat);
}
void benchSleef128_SPNontrig() {
fillSP(abufsp, 0, 1e+38);
callFuncSLEEF1_1(Sleef_logf4_u10 , "log, SP, 128", 0, 1e+38, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_log10f4_u10, "log10, SP, 128", 0, 1e+38, 1.0, abufsp, vfloat);
//callFuncSLEEF1_1(Sleef_log1pf4_u10, "log1p, SP, 128", 0, 1e+38, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_logf4_u35 , "log, SP, 128", 0, 1e+38, 4.0, abufsp, vfloat);
//callFuncSLEEF1_1(Sleef_log10f4_u35, "log10, SP, 128", 0, 1e+38, 4.0, abufsp, vfloat);
//callFuncSLEEF1_1(Sleef_log1pf4_u35, "log1p, SP, 128", 0, 1e+38, 4.0, abufsp, vfloat);
fillSP(abufsp, -100, 100);
callFuncSLEEF1_1(Sleef_expf4_u10 , "exp, SP, 128", -100, 100, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_exp2f4_u10 , "exp2, SP, 128", -100, 100, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_exp10f4_u10, "exp10, SP, 128", -100, 100, 1.0, abufsp, vfloat);
fillSP(abufsp, -30, 30);
fillSP(bbufsp, -30, 30);
callFuncSLEEF1_2(Sleef_powf4_u10, "pow, SP, 128", -30, 30, -30, 30, 1.0, abufsp, bbufsp, vfloat);
fillSP(abufsp, -1.0, 1.0);
callFuncSLEEF1_1(Sleef_asinf4_u10, "asin, SP, 128", -1.0, 1, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_acosf4_u10, "acos, SP, 128", -1.0, 1, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_asinf4_u35, "asin, SP, 128", -1.0, 1.0, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_acosf4_u35, "acos, SP, 128", -1.0, 1.0, 4.0, abufsp, vfloat);
fillSP(abufsp, -10, 10);
fillSP(bbufsp, -10, 10);
callFuncSLEEF1_1(Sleef_atanf4_u10, "atan, SP, 128", -10, 10, 1.0, abufsp, vfloat);
callFuncSLEEF1_2(Sleef_atan2f4_u10, "atan2, SP, 128", -10, 10, -10, 10, 1.0, abufsp, bbufsp, vfloat);
callFuncSLEEF1_1(Sleef_atanf4_u35, "atan, SP, 128", -10, 10, 4.0, abufsp, vfloat);
callFuncSLEEF1_2(Sleef_atan2f4_u35, "atan2, SP, 128", -10, 10, -10, 10, 4.0, abufsp, bbufsp, vfloat);
}
#else // #ifdef ENABLED
void benchSleef128_DPTrig() {}
void benchSleef128_DPNontrig() {}
void benchSleef128_SPTrig() {}
void benchSleef128_SPNontrig() {}
#endif // #ifdef ENABLED

View File

@@ -0,0 +1,181 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <math.h>
#include <time.h>
#include <sleef.h>
void fillDP(double *buf, double min, double max);
void fillSP(float *buf, double min, double max);
extern char x86BrandString[256], versionString[1024];
extern int veclen;
extern double *abufdp, *bbufdp;
extern float *abufsp, *bbufsp;
extern FILE *fp;
#include "bench.h"
#ifdef __AVX__
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
typedef __m256d vdouble;
typedef __m256 vfloat;
#define ENABLED
#endif
#ifdef ENABLED
void benchSleef256_DPTrig() {
fillDP(abufdp, 0, 6.28);
callFuncSLEEF1_1(Sleef_sind4_u10 , "sin, DP, 256", 0, 6.28, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd4_u10 , "cos, DP, 256", 0, 6.28, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand4_u10 , "tan, DP, 256", 0, 6.28, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd4_u10, "sincos, DP, 256", 0, 6.28, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sind4_u35 , "sin, DP, 256", 0, 6.28, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd4_u35 , "cos, DP, 256", 0, 6.28, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand4_u35 , "tan, DP, 256", 0, 6.28, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd4_u35, "sincos, DP, 256", 0, 6.28, 4.0, abufdp, vdouble);
fillDP(abufdp, 0, 1e+6);
callFuncSLEEF1_1(Sleef_sind4_u10 , "sin, DP, 256", 0, 1e+6, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd4_u10 , "cos, DP, 256", 0, 1e+6, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand4_u10 , "tan, DP, 256", 0, 1e+6, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd4_u10, "sincos, DP, 256", 0, 1e+6, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sind4_u35 , "sin, DP, 256", 0, 1e+6, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd4_u35 , "cos, DP, 256", 0, 1e+6, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand4_u35 , "tan, DP, 256", 0, 1e+6, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd4_u35, "sincos, DP, 256", 0, 1e+6, 4.0, abufdp, vdouble);
fillDP(abufdp, 0, 1e+100);
callFuncSLEEF1_1(Sleef_sind4_u10 , "sin, DP, 256", 0, 1e+100, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd4_u10 , "cos, DP, 256", 0, 1e+100, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand4_u10 , "tan, DP, 256", 0, 1e+100, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd4_u10, "sincos, DP, 256", 0, 1e+100, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sind4_u35 , "sin, DP, 256", 0, 1e+100, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd4_u35 , "cos, DP, 256", 0, 1e+100, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand4_u35 , "tan, DP, 256", 0, 1e+100, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd4_u35, "sincos, DP, 256", 0, 1e+100, 4.0, abufdp, vdouble);
}
void benchSleef256_DPNontrig() {
fillDP(abufdp, 0, 1e+300);
callFuncSLEEF1_1(Sleef_logd4_u10 , "log, DP, 256", 0, 1e+300, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_log10d4_u10, "log10, DP, 256", 0, 1e+300, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_log1pd4_u10, "log1p, DP, 256", 0, 1e+300, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_logd4_u35 , "log, DP, 256", 0, 1e+300, 4.0, abufdp, vdouble);
fillDP(abufdp, -700, 700);
callFuncSLEEF1_1(Sleef_expd4_u10 , "exp, DP, 256", -700, 700, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_exp2d4_u10 , "exp2, DP, 256", -700, 700, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_exp10d4_u10, "exp10, DP, 256", -700, 700, 1.0, abufdp, vdouble);
fillDP(abufdp, -30, 30);
fillDP(bbufdp, -30, 30);
callFuncSLEEF1_2(Sleef_powd4_u10, "pow, DP, 256", -30, 30, -30, 30, 1.0, abufdp, bbufdp, vdouble);
fillDP(abufdp, -1.0, 1.0);
callFuncSLEEF1_1(Sleef_asind4_u10, "asin, DP, 256", -1.0, 1.0, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_acosd4_u10, "acos, DP, 256", -1.0, 1.0, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_asind4_u35, "asin, DP, 256", -1.0, 1.0, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_acosd4_u35, "acos, DP, 256", -1.0, 1.0, 4.0, abufdp, vdouble);
fillDP(abufdp, -10, 10);
fillDP(bbufdp, -10, 10);
callFuncSLEEF1_1(Sleef_atand4_u10, "atan, DP, 256", -10, 10, 1.0, abufdp, vdouble);
callFuncSLEEF1_2(Sleef_atan2d4_u10, "atan2, DP, 256", -10, 10, -10, 10, 1.0, abufdp, bbufdp, vdouble);
callFuncSLEEF1_1(Sleef_atand4_u35, "atan, DP, 256", -10, 10, 4.0, abufdp, vdouble);
callFuncSLEEF1_2(Sleef_atan2d4_u35, "atan2, DP, 256", -10, 10, -10, 10, 4.0, abufdp, bbufdp, vdouble);
}
void benchSleef256_SPTrig() {
fillSP(abufsp, 0, 6.28);
callFuncSLEEF1_1(Sleef_sinf8_u10 , "sin, SP, 256", 0, 6.28, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_cosf8_u10 , "cos, SP, 256", 0, 6.28, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_tanf8_u10 , "tan, SP, 256", 0, 6.28, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sincosf8_u10, "sincos, SP, 256", 0, 6.28, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sinf8_u35 , "sin, SP, 256", 0, 6.28, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_cosf8_u35 , "cos, SP, 256", 0, 6.28, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_tanf8_u35 , "tan, SP, 256", 0, 6.28, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sincosf8_u35, "sincos, SP, 256", 0, 6.28, 4.0, abufsp, vfloat);
fillSP(abufsp, 0, 1e+20);
callFuncSLEEF1_1(Sleef_sinf8_u10 , "sin, SP, 256", 0, 1e+20, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_cosf8_u10 , "cos, SP, 256", 0, 1e+20, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_tanf8_u10 , "tan, SP, 256", 0, 1e+20, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sincosf8_u10, "sincos, SP, 256", 0, 1e+20, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sinf8_u35 , "sin, SP, 256", 0, 1e+20, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_cosf8_u35 , "cos, SP, 256", 0, 1e+20, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_tanf8_u35 , "tan, SP, 256", 0, 1e+20, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sincosf8_u35, "sincos, SP, 256", 0, 1e+20, 4.0, abufsp, vfloat);
}
void benchSleef256_SPNontrig() {
fillSP(abufsp, 0, 1e+38);
callFuncSLEEF1_1(Sleef_logf8_u10 , "log, SP, 256", 0, 1e+38, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_log10f8_u10, "log10, SP, 256", 0, 1e+38, 1.0, abufsp, vfloat);
//callFuncSLEEF1_1(Sleef_log1pf8_u10, "log1p, SP, 256", 0, 1e+38, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_logf8_u35 , "log, SP, 256", 0, 1e+38, 4.0, abufsp, vfloat);
//callFuncSLEEF1_1(Sleef_log10f8_u35, "log10, SP, 256", 0, 1e+38, 4.0, abufsp, vfloat);
//callFuncSLEEF1_1(Sleef_log1pf8_u35, "log1p, SP, 256", 0, 1e+38, 4.0, abufsp, vfloat);
fillSP(abufsp, -100, 100);
callFuncSLEEF1_1(Sleef_expf8_u10 , "exp, SP, 256", -100, 100, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_exp2f8_u10 , "exp2, SP, 256", -100, 100, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_exp10f8_u10, "exp10, SP, 256", -100, 100, 1.0, abufsp, vfloat);
fillSP(abufsp, -30, 30);
fillSP(bbufsp, -30, 30);
callFuncSLEEF1_2(Sleef_powf8_u10, "pow, SP, 256", -30, 30, -30, 30, 1.0, abufsp, bbufsp, vfloat);
fillSP(abufsp, -1.0, 1.0);
callFuncSLEEF1_1(Sleef_asinf8_u10, "asin, SP, 256", -1.0, 1, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_acosf8_u10, "acos, SP, 256", -1.0, 1, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_asinf8_u35, "asin, SP, 256", -1.0, 1.0, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_acosf8_u35, "acos, SP, 256", -1.0, 1.0, 4.0, abufsp, vfloat);
fillSP(abufsp, -10, 10);
fillSP(bbufsp, -10, 10);
callFuncSLEEF1_1(Sleef_atanf8_u10, "atan, SP, 256", -10, 10, 1.0, abufsp, vfloat);
callFuncSLEEF1_2(Sleef_atan2f8_u10, "atan2, SP, 256", -10, 10, -10, 10, 1.0, abufsp, bbufsp, vfloat);
callFuncSLEEF1_1(Sleef_atanf8_u35, "atan, SP, 256", -10, 10, 4.0, abufsp, vfloat);
callFuncSLEEF1_2(Sleef_atan2f8_u35, "atan2, SP, 256", -10, 10, -10, 10, 4.0, abufsp, bbufsp, vfloat);
}
#else // #ifdef ENABLED
void zeroupper256() {}
void benchSleef256_DPTrig() {}
void benchSleef256_DPNontrig() {}
void benchSleef256_SPTrig() {}
void benchSleef256_SPNontrig() {}
#endif // #ifdef ENABLED

View File

@@ -0,0 +1,180 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <math.h>
#include <time.h>
#include <sleef.h>
void fillDP(double *buf, double min, double max);
void fillSP(float *buf, double min, double max);
extern char x86BrandString[256], versionString[1024];
extern int veclen;
extern double *abufdp, *bbufdp;
extern float *abufsp, *bbufsp;
extern FILE *fp;
#include "bench.h"
#ifdef __AVX512F__
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
typedef __m512d vdouble;
typedef __m512 vfloat;
#define ENABLED
#endif
#ifdef ENABLED
void benchSleef512_DPTrig() {
fillDP(abufdp, 0, 6.28);
callFuncSLEEF1_1(Sleef_sind8_u10 , "sin, DP, 512", 0, 6.28, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd8_u10 , "cos, DP, 512", 0, 6.28, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand8_u10 , "tan, DP, 512", 0, 6.28, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd8_u10, "sincos, DP, 512", 0, 6.28, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sind8_u35 , "sin, DP, 512", 0, 6.28, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd8_u35 , "cos, DP, 512", 0, 6.28, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand8_u35 , "tan, DP, 512", 0, 6.28, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd8_u35, "sincos, DP, 512", 0, 6.28, 4.0, abufdp, vdouble);
fillDP(abufdp, 0, 1e+6);
callFuncSLEEF1_1(Sleef_sind8_u10 , "sin, DP, 512", 0, 1e+6, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd8_u10 , "cos, DP, 512", 0, 1e+6, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand8_u10 , "tan, DP, 512", 0, 1e+6, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd8_u10, "sincos, DP, 512", 0, 1e+6, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sind8_u35 , "sin, DP, 512", 0, 1e+6, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd8_u35 , "cos, DP, 512", 0, 1e+6, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand8_u35 , "tan, DP, 512", 0, 1e+6, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd8_u35, "sincos, DP, 512", 0, 1e+6, 4.0, abufdp, vdouble);
fillDP(abufdp, 0, 1e+100);
callFuncSLEEF1_1(Sleef_sind8_u10 , "sin, DP, 512", 0, 1e+100, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd8_u10 , "cos, DP, 512", 0, 1e+100, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand8_u10 , "tan, DP, 512", 0, 1e+100, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd8_u10, "sincos, DP, 512", 0, 1e+100, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sind8_u35 , "sin, DP, 512", 0, 1e+100, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_cosd8_u35 , "cos, DP, 512", 0, 1e+100, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_tand8_u35 , "tan, DP, 512", 0, 1e+100, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_sincosd8_u35, "sincos, DP, 512", 0, 1e+100, 4.0, abufdp, vdouble);
}
void benchSleef512_DPNontrig() {
fillDP(abufdp, 0, 1e+300);
callFuncSLEEF1_1(Sleef_logd8_u10 , "log, DP, 512", 0, 1e+300, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_log10d8_u10, "log10, DP, 512", 0, 1e+300, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_log1pd8_u10, "log1p, DP, 512", 0, 1e+300, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_logd8_u35 , "log, DP, 512", 0, 1e+300, 4.0, abufdp, vdouble);
fillDP(abufdp, -700, 700);
callFuncSLEEF1_1(Sleef_expd8_u10 , "exp, DP, 512", -700, 700, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_exp2d8_u10 , "exp2, DP, 512", -700, 700, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_exp10d8_u10, "exp10, DP, 512", -700, 700, 1.0, abufdp, vdouble);
fillDP(abufdp, -30, 30);
fillDP(bbufdp, -30, 30);
callFuncSLEEF1_2(Sleef_powd8_u10, "pow, DP, 512", -30, 30, -30, 30, 1.0, abufdp, bbufdp, vdouble);
fillDP(abufdp, -1.0, 1.0);
callFuncSLEEF1_1(Sleef_asind8_u10, "asin, DP, 512", -1.0, 1.0, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_acosd8_u10, "acos, DP, 512", -1.0, 1.0, 1.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_asind8_u35, "asin, DP, 512", -1.0, 1.0, 4.0, abufdp, vdouble);
callFuncSLEEF1_1(Sleef_acosd8_u35, "acos, DP, 512", -1.0, 1.0, 4.0, abufdp, vdouble);
fillDP(abufdp, -10, 10);
fillDP(bbufdp, -10, 10);
callFuncSLEEF1_1(Sleef_atand8_u10, "atan, DP, 512", -10, 10, 1.0, abufdp, vdouble);
callFuncSLEEF1_2(Sleef_atan2d8_u10, "atan2, DP, 512", -10, 10, -10, 10, 1.0, abufdp, bbufdp, vdouble);
callFuncSLEEF1_1(Sleef_atand8_u35, "atan, DP, 512", -10, 10, 4.0, abufdp, vdouble);
callFuncSLEEF1_2(Sleef_atan2d8_u35, "atan2, DP, 512", -10, 10, -10, 10, 4.0, abufdp, bbufdp, vdouble);
}
void benchSleef512_SPTrig() {
fillSP(abufsp, 0, 6.28);
callFuncSLEEF1_1(Sleef_sinf16_u10 , "sin, SP, 512", 0, 6.28, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_cosf16_u10 , "cos, SP, 512", 0, 6.28, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_tanf16_u10 , "tan, SP, 512", 0, 6.28, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sincosf16_u10, "sincos, SP, 512", 0, 6.28, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sinf16_u35 , "sin, SP, 512", 0, 6.28, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_cosf16_u35 , "cos, SP, 512", 0, 6.28, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_tanf16_u35 , "tan, SP, 512", 0, 6.28, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sincosf16_u35, "sincos, SP, 512", 0, 6.28, 4.0, abufsp, vfloat);
fillSP(abufsp, 0, 1e+20);
callFuncSLEEF1_1(Sleef_sinf16_u10 , "sin, SP, 512", 0, 1e+20, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_cosf16_u10 , "cos, SP, 512", 0, 1e+20, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_tanf16_u10 , "tan, SP, 512", 0, 1e+20, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sincosf16_u10, "sincos, SP, 512", 0, 1e+20, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sinf16_u35 , "sin, SP, 512", 0, 1e+20, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_cosf16_u35 , "cos, SP, 512", 0, 1e+20, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_tanf16_u35 , "tan, SP, 512", 0, 1e+20, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_sincosf16_u35, "sincos, SP, 512", 0, 1e+20, 4.0, abufsp, vfloat);
}
void benchSleef512_SPNontrig() {
fillSP(abufsp, 0, 1e+38);
callFuncSLEEF1_1(Sleef_logf16_u10 , "log, SP, 512", 0, 1e+38, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_log10f16_u10, "log10, SP, 512", 0, 1e+38, 1.0, abufsp, vfloat);
//callFuncSLEEF1_1(Sleef_log1pf16_u10, "log1p, SP, 512", 0, 1e+38, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_logf16_u35 , "log, SP, 512", 0, 1e+38, 4.0, abufsp, vfloat);
//callFuncSLEEF1_1(Sleef_log10f16_u35, "log10, SP, 512", 0, 1e+38, 4.0, abufsp, vfloat);
//callFuncSLEEF1_1(Sleef_log1pf16_u35, "log1p, SP, 512", 0, 1e+38, 4.0, abufsp, vfloat);
fillSP(abufsp, -100, 100);
callFuncSLEEF1_1(Sleef_expf16_u10 , "exp, SP, 512", -100, 100, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_exp2f16_u10 , "exp2, SP, 512", -100, 100, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_exp10f16_u10, "exp10, SP, 512", -100, 100, 1.0, abufsp, vfloat);
fillSP(abufsp, -30, 30);
fillSP(bbufsp, -30, 30);
callFuncSLEEF1_2(Sleef_powf16_u10, "pow, SP, 512", -30, 30, -30, 30, 1.0, abufsp, bbufsp, vfloat);
fillSP(abufsp, -1.0, 1.0);
callFuncSLEEF1_1(Sleef_asinf16_u10, "asin, SP, 512", -1.0, 1, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_acosf16_u10, "acos, SP, 512", -1.0, 1, 1.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_asinf16_u35, "asin, SP, 512", -1.0, 1.0, 4.0, abufsp, vfloat);
callFuncSLEEF1_1(Sleef_acosf16_u35, "acos, SP, 512", -1.0, 1.0, 4.0, abufsp, vfloat);
fillSP(abufsp, -10, 10);
fillSP(bbufsp, -10, 10);
callFuncSLEEF1_1(Sleef_atanf16_u10, "atan, SP, 512", -10, 10, 1.0, abufsp, vfloat);
callFuncSLEEF1_2(Sleef_atan2f16_u10, "atan2, SP, 512", -10, 10, -10, 10, 1.0, abufsp, bbufsp, vfloat);
callFuncSLEEF1_1(Sleef_atanf16_u35, "atan, SP, 512", -10, 10, 4.0, abufsp, vfloat);
callFuncSLEEF1_2(Sleef_atan2f16_u35, "atan2, SP, 512", -10, 10, -10, 10, 4.0, abufsp, bbufsp, vfloat);
}
#else // #ifdef ENABLED
void benchSleef512_DPTrig() {}
void benchSleef512_DPNontrig() {}
void benchSleef512_SPTrig() {}
void benchSleef512_SPNontrig() {}
#endif // #ifdef ENABLED

View File

@@ -0,0 +1,153 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <math.h>
#include <time.h>
#include <unistd.h>
#include <x86intrin.h>
#include "bench.h"
int veclen = 16;
int enableLogExp;
double *abufdp, *bbufdp;
float *abufsp, *bbufsp;
FILE *fp;
#if defined(__i386__) || defined(__x86_64__)
void x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) {
uint32_t a, b, c, d;
__asm__ __volatile__ ("cpuid" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "a" (eax), "c"(ecx));
out[0] = a; out[1] = b; out[2] = c; out[3] = d;
}
int cpuSupportsAVX() {
int32_t reg[4];
x86CpuID(reg, 1, 0);
return (reg[2] & (1 << 28)) != 0;
}
int cpuSupportsAVX512F() {
int32_t reg[4];
x86CpuID(reg, 7, 0);
return (reg[1] & (1 << 16)) != 0;
}
#endif
uint64_t Sleef_currentTimeMicros() {
struct timespec tp;
clock_gettime(CLOCK_MONOTONIC, &tp);
return (uint64_t)tp.tv_sec * 1000000LL + ((uint64_t)tp.tv_nsec/1000);
}
void fillDP(double *buf, double min, double max) {
for(int i=0;i<NITER1*veclen;i++) {
double r = ((double)random() + RAND_MAX * (double)random()) / (RAND_MAX * (double)RAND_MAX);
buf[i] = r * (max - min) + min;
}
}
void fillSP(float *buf, double min, double max) {
for(int i=0;i<NITER1*veclen;i++) {
double r = ((double)random() + RAND_MAX * (double)random()) / (RAND_MAX * (double)RAND_MAX);
buf[i] = r * (max - min) + min;
}
}
void zeroupper256();
void benchSVML128_DPTrig();
void benchSVML256_DPTrig();
void benchSVML512_DPTrig();
void benchSVML128_DPNontrig();
void benchSVML256_DPNontrig();
void benchSVML512_DPNontrig();
void benchSVML128_SPTrig();
void benchSVML256_SPTrig();
void benchSVML512_SPTrig();
void benchSVML128_SPNontrig();
void benchSVML256_SPNontrig();
void benchSVML512_SPNontrig();
//
int main(int argc, char **argv) {
char *columnTitle = "SVML", *fnBase = "svml";
char fn[1024];
if (argc != 1) columnTitle = argv[1];
if (argc >= 3) fnBase = argv[2];
srandom(time(NULL));
#if defined(__i386__) || defined(__x86_64__)
int do128bit = 1;
int do256bit = cpuSupportsAVX();
int do512bit = cpuSupportsAVX512F();
#elif defined(__ARM_NEON)
int do128bit = 1;
int do256bit = 0;
int do512bit = 0;
#else
#error Unsupported architecture
#endif
posix_memalign((void **)&abufdp, veclen*sizeof(double), NITER1*veclen*sizeof(double));
posix_memalign((void **)&bbufdp, veclen*sizeof(double), NITER1*veclen*sizeof(double));
abufsp = (float *)abufdp;
bbufsp = (float *)bbufdp;
enableLogExp = SVMLULP < 2;
sprintf(fn, "%sdptrig%gulp.out", fnBase, (double)SVMLULP);
fp = fopen(fn, "w");
fprintf(fp, "%s\n", columnTitle);
if (do256bit) zeroupper256();
if (do128bit) benchSVML128_DPTrig();
if (do256bit) benchSVML256_DPTrig();
if (do512bit) benchSVML512_DPTrig();
fclose(fp);
sprintf(fn, "%sdpnontrig%gulp.out", fnBase, (double)SVMLULP);
fp = fopen(fn, "w");
fprintf(fp, "%s\n", columnTitle);
if (do256bit) zeroupper256();
if (do128bit) benchSVML128_DPNontrig();
if (do256bit) benchSVML256_DPNontrig();
if (do512bit) benchSVML512_DPNontrig();
fclose(fp);
sprintf(fn, "%ssptrig%gulp.out", fnBase, (double)SVMLULP);
fp = fopen(fn, "w");
fprintf(fp, "%s\n", columnTitle);
if (do256bit) zeroupper256();
if (do128bit) benchSVML128_SPTrig();
if (do256bit) benchSVML256_SPTrig();
if (do512bit) benchSVML512_SPTrig();
fclose(fp);
sprintf(fn, "%sspnontrig%gulp.out", fnBase, (double)SVMLULP);
fp = fopen(fn, "w");
fprintf(fp, "%s\n", columnTitle);
if (do256bit) zeroupper256();
if (do128bit) benchSVML128_SPNontrig();
if (do256bit) benchSVML256_SPNontrig();
if (do512bit) benchSVML512_SPNontrig();
fclose(fp);
exit(0);
}

View File

@@ -0,0 +1,144 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <math.h>
#include <time.h>
#include <unistd.h>
#include <x86intrin.h>
uint64_t Sleef_currentTimeMicros();
void fillDP(double *buf, double min, double max);
void fillSP(float *buf, double min, double max);
extern char x86BrandString[256], versionString[1024];
extern int veclen;
extern int enableLogExp;
extern double *abufdp, *bbufdp;
extern float *abufsp, *bbufsp;
extern FILE *fp;
#include "bench.h"
#ifdef __SSE2__
typedef __m128d vdouble;
typedef __m128 vfloat;
#define ENABLED
#endif
#ifdef ENABLED
void benchSVML128_DPTrig() {
fillDP(abufdp, 0, 6.28);
callFuncSVML1_1(_mm_sin_pd , "sin, DP, 128", 0, 6.28, abufdp, vdouble);
callFuncSVML1_1(_mm_cos_pd , "cos, DP, 128", 0, 6.28, abufdp, vdouble);
callFuncSVML1_1(_mm_tan_pd , "tan, DP, 128", 0, 6.28, abufdp, vdouble);
callFuncSVML2_1(_mm_sincos_pd, "sincos, DP, 128", 0, 6.28, abufdp, vdouble);
fillDP(abufdp, 0, 1e+6);
callFuncSVML1_1(_mm_sin_pd , "sin, DP, 128", 0, 1e+6, abufdp, vdouble);
callFuncSVML1_1(_mm_cos_pd , "cos, DP, 128", 0, 1e+6, abufdp, vdouble);
callFuncSVML1_1(_mm_tan_pd , "tan, DP, 128", 0, 1e+6, abufdp, vdouble);
callFuncSVML2_1(_mm_sincos_pd, "sincos, DP, 128", 0, 1e+6, abufdp, vdouble);
fillDP(abufdp, 0, 1e+100);
callFuncSVML1_1(_mm_sin_pd , "sin, DP, 128", 0, 1e+100, abufdp, vdouble);
callFuncSVML1_1(_mm_cos_pd , "cos, DP, 128", 0, 1e+100, abufdp, vdouble);
callFuncSVML1_1(_mm_tan_pd , "tan, DP, 128", 0, 1e+100, abufdp, vdouble);
callFuncSVML2_1(_mm_sincos_pd, "sincos, DP, 128", 0, 1e+100, abufdp, vdouble);
}
void benchSVML128_DPNontrig() {
fillDP(abufdp, 0, 1e+300);
callFuncSVML1_1(_mm_log_pd , "log, DP, 128", 0, 1e+300, abufdp, vdouble);
if (enableLogExp) {
callFuncSVML1_1(_mm_log10_pd, "log10, DP, 128", 0, 1e+300, abufdp, vdouble);
callFuncSVML1_1(_mm_log1p_pd, "log1p, DP, 128", 0, 1e+300, abufdp, vdouble);
fillDP(abufdp, -700, 700);
callFuncSVML1_1(_mm_exp_pd , "exp, DP, 128", -700, 700, abufdp, vdouble);
callFuncSVML1_1(_mm_exp2_pd , "exp2, DP, 128", -700, 700, abufdp, vdouble);
callFuncSVML1_1(_mm_exp10_pd, "exp10, DP, 128", -700, 700, abufdp, vdouble);
fillDP(abufdp, -30, 30);
fillDP(bbufdp, -30, 30);
callFuncSVML1_2(_mm_pow_pd, "pow, DP, 128", -30, 30, -30, 30, abufdp, bbufdp, vdouble);
}
fillDP(abufdp, -1.0, 1.0);
callFuncSVML1_1(_mm_asin_pd, "asin, DP, 128", -1.0, 1.0, abufdp, vdouble);
callFuncSVML1_1(_mm_acos_pd, "acos, DP, 128", -1.0, 1.0, abufdp, vdouble);
fillDP(abufdp, -10, 10);
fillDP(bbufdp, -10, 10);
callFuncSVML1_1(_mm_atan_pd, "atan, DP, 128", -10, 10, abufdp, vdouble);
callFuncSVML1_2(_mm_atan2_pd, "atan2, DP, 128", -10, 10, -10, 10, abufdp, bbufdp, vdouble);
}
void benchSVML128_SPTrig() {
fillSP(abufsp, 0, 6.28);
callFuncSVML1_1(_mm_sin_ps , "sin, SP, 128", 0, 6.28, abufsp, vfloat);
callFuncSVML1_1(_mm_cos_ps , "cos, SP, 128", 0, 6.28, abufsp, vfloat);
callFuncSVML1_1(_mm_tan_ps , "tan, SP, 128", 0, 6.28, abufsp, vfloat);
callFuncSVML2_1(_mm_sincos_ps, "sincos, SP, 128", 0, 6.28, abufsp, vfloat);
fillSP(abufsp, 0, 1e+20);
callFuncSVML1_1(_mm_sin_ps , "sin, SP, 128", 0, 1e+20, abufsp, vfloat);
callFuncSVML1_1(_mm_cos_ps , "cos, SP, 128", 0, 1e+20, abufsp, vfloat);
callFuncSVML1_1(_mm_tan_ps , "tan, SP, 128", 0, 1e+20, abufsp, vfloat);
callFuncSVML2_1(_mm_sincos_ps, "sincos, SP, 128", 0, 1e+20, abufsp, vfloat);
}
void benchSVML128_SPNontrig() {
fillSP(abufsp, 0, 1e+38);
callFuncSVML1_1(_mm_log_ps , "log, SP, 128", 0, 1e+38, abufsp, vfloat);
if (enableLogExp) {
callFuncSVML1_1(_mm_log10_ps, "log10, SP, 128", 0, 1e+38, abufsp, vfloat);
//callFuncSVML1_1(_mm_log1p_ps, "log1p, SP, 128", 0, 1e+38, abufsp, vfloat);
fillSP(abufsp, -100, 100);
callFuncSVML1_1(_mm_exp_ps , "exp, SP, 128", -100, 100, abufsp, vfloat);
callFuncSVML1_1(_mm_exp2_ps , "exp2, SP, 128", -100, 100, abufsp, vfloat);
callFuncSVML1_1(_mm_exp10_ps, "exp10, SP, 128", -100, 100, abufsp, vfloat);
fillSP(abufsp, -30, 30);
fillSP(bbufsp, -30, 30);
callFuncSVML1_2(_mm_pow_ps, "pow, SP, 128", -30, 30, -30, 30, abufsp, bbufsp, vfloat);
}
fillSP(abufsp, -1.0, 1.0);
callFuncSVML1_1(_mm_asin_ps, "asin, SP, 128", -1.0, 1, abufsp, vfloat);
callFuncSVML1_1(_mm_acos_ps, "acos, SP, 128", -1.0, 1, abufsp, vfloat);
fillSP(abufsp, -10, 10);
fillSP(bbufsp, -10, 10);
callFuncSVML1_1(_mm_atan_ps, "atan, SP, 128", -10, 10, abufsp, vfloat);
callFuncSVML1_2(_mm_atan2_ps, "atan2, SP, 128", -10, 10, -10, 10, abufsp, bbufsp, vfloat);
}
#else // #ifdef ENABLED
void benchSVML128_DPTrig() {}
void benchSVML128_DPNontrig() {}
void benchSVML128_SPTrig() {}
void benchSVML128_SPNontrig() {}
#endif // #ifdef ENABLED

View File

@@ -0,0 +1,147 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <math.h>
#include <time.h>
#include <unistd.h>
#include <x86intrin.h>
uint64_t Sleef_currentTimeMicros();
void fillDP(double *buf, double min, double max);
void fillSP(float *buf, double min, double max);
extern char x86BrandString[256], versionString[1024];
extern int veclen;
extern int enableLogExp;
extern double *abufdp, *bbufdp;
extern float *abufsp, *bbufsp;
extern FILE *fp;
#include "bench.h"
#ifdef __AVX__
typedef __m256d vdouble;
typedef __m256 vfloat;
#define ENABLED
#endif
#ifdef ENABLED
void zeroupper256() { _mm256_zeroupper(); }
void benchSVML256_DPTrig() {
fillDP(abufdp, 0, 6.28);
callFuncSVML1_1(_mm256_sin_pd , "sin, DP, 256", 0, 6.28, abufdp, vdouble);
callFuncSVML1_1(_mm256_cos_pd , "cos, DP, 256", 0, 6.28, abufdp, vdouble);
callFuncSVML1_1(_mm256_tan_pd , "tan, DP, 256", 0, 6.28, abufdp, vdouble);
callFuncSVML2_1(_mm256_sincos_pd, "sincos, DP, 256", 0, 6.28, abufdp, vdouble);
fillDP(abufdp, 0, 1e+6);
callFuncSVML1_1(_mm256_sin_pd , "sin, DP, 256", 0, 1e+6, abufdp, vdouble);
callFuncSVML1_1(_mm256_cos_pd , "cos, DP, 256", 0, 1e+6, abufdp, vdouble);
callFuncSVML1_1(_mm256_tan_pd , "tan, DP, 256", 0, 1e+6, abufdp, vdouble);
callFuncSVML2_1(_mm256_sincos_pd, "sincos, DP, 256", 0, 1e+6, abufdp, vdouble);
fillDP(abufdp, 0, 1e+100);
callFuncSVML1_1(_mm256_sin_pd , "sin, DP, 256", 0, 1e+100, abufdp, vdouble);
callFuncSVML1_1(_mm256_cos_pd , "cos, DP, 256", 0, 1e+100, abufdp, vdouble);
callFuncSVML1_1(_mm256_tan_pd , "tan, DP, 256", 0, 1e+100, abufdp, vdouble);
callFuncSVML2_1(_mm256_sincos_pd, "sincos, DP, 256", 0, 1e+100, abufdp, vdouble);
}
void benchSVML256_DPNontrig() {
fillDP(abufdp, 0, 1e+300);
callFuncSVML1_1(_mm256_log_pd , "log, DP, 256", 0, 1e+300, abufdp, vdouble);
if (enableLogExp) {
callFuncSVML1_1(_mm256_log10_pd, "log10, DP, 256", 0, 1e+300, abufdp, vdouble);
callFuncSVML1_1(_mm256_log1p_pd, "log1p, DP, 256", 0, 1e+300, abufdp, vdouble);
fillDP(abufdp, -700, 700);
callFuncSVML1_1(_mm256_exp_pd , "exp, DP, 256", -700, 700, abufdp, vdouble);
callFuncSVML1_1(_mm256_exp2_pd , "exp2, DP, 256", -700, 700, abufdp, vdouble);
callFuncSVML1_1(_mm256_exp10_pd, "exp10, DP, 256", -700, 700, abufdp, vdouble);
fillDP(abufdp, -30, 30);
fillDP(bbufdp, -30, 30);
callFuncSVML1_2(_mm256_pow_pd, "pow, DP, 256", -30, 30, -30, 30, abufdp, bbufdp, vdouble);
}
fillDP(abufdp, -1.0, 1.0);
callFuncSVML1_1(_mm256_asin_pd, "asin, DP, 256", -1.0, 1.0, abufdp, vdouble);
callFuncSVML1_1(_mm256_acos_pd, "acos, DP, 256", -1.0, 1.0, abufdp, vdouble);
fillDP(abufdp, -10, 10);
fillDP(bbufdp, -10, 10);
callFuncSVML1_1(_mm256_atan_pd, "atan, DP, 256", -10, 10, abufdp, vdouble);
callFuncSVML1_2(_mm256_atan2_pd, "atan2, DP, 256", -10, 10, -10, 10, abufdp, bbufdp, vdouble);
}
void benchSVML256_SPTrig() {
fillSP(abufsp, 0, 6.28);
callFuncSVML1_1(_mm256_sin_ps , "sin, SP, 256", 0, 6.28, abufsp, vfloat);
callFuncSVML1_1(_mm256_cos_ps , "cos, SP, 256", 0, 6.28, abufsp, vfloat);
callFuncSVML1_1(_mm256_tan_ps , "tan, SP, 256", 0, 6.28, abufsp, vfloat);
callFuncSVML2_1(_mm256_sincos_ps, "sincos, SP, 256", 0, 6.28, abufsp, vfloat);
fillSP(abufsp, 0, 1e+20);
callFuncSVML1_1(_mm256_sin_ps , "sin, SP, 256", 0, 1e+20, abufsp, vfloat);
callFuncSVML1_1(_mm256_cos_ps , "cos, SP, 256", 0, 1e+20, abufsp, vfloat);
callFuncSVML1_1(_mm256_tan_ps , "tan, SP, 256", 0, 1e+20, abufsp, vfloat);
callFuncSVML2_1(_mm256_sincos_ps, "sincos, SP, 256", 0, 1e+20, abufsp, vfloat);
}
void benchSVML256_SPNontrig() {
fillSP(abufsp, 0, 1e+38);
callFuncSVML1_1(_mm256_log_ps , "log, SP, 256", 0, 1e+38, abufsp, vfloat);
if (enableLogExp) {
callFuncSVML1_1(_mm256_log10_ps, "log10, SP, 256", 0, 1e+38, abufsp, vfloat);
//callFuncSVML1_1(_mm256_log1p_ps, "log1p, SP, 256", 0, 1e+38, abufsp, vfloat);
fillSP(abufsp, -100, 100);
callFuncSVML1_1(_mm256_exp_ps , "exp, SP, 256", -100, 100, abufsp, vfloat);
callFuncSVML1_1(_mm256_exp2_ps , "exp2, SP, 256", -100, 100, abufsp, vfloat);
callFuncSVML1_1(_mm256_exp10_ps, "exp10, SP, 256", -100, 100, abufsp, vfloat);
fillSP(abufsp, -30, 30);
fillSP(bbufsp, -30, 30);
callFuncSVML1_2(_mm256_pow_ps, "pow, SP, 256", -30, 30, -30, 30, abufsp, bbufsp, vfloat);
}
fillSP(abufsp, -1.0, 1.0);
callFuncSVML1_1(_mm256_asin_ps, "asin, SP, 256", -1.0, 1, abufsp, vfloat);
callFuncSVML1_1(_mm256_acos_ps, "acos, SP, 256", -1.0, 1, abufsp, vfloat);
fillSP(abufsp, -10, 10);
fillSP(bbufsp, -10, 10);
callFuncSVML1_1(_mm256_atan_ps, "atan, SP, 256", -10, 10, abufsp, vfloat);
callFuncSVML1_2(_mm256_atan2_ps, "atan2, SP, 256", -10, 10, -10, 10, abufsp, bbufsp, vfloat);
}
#else // #ifdef ENABLED
void zeroupper256() {}
void benchSVML256_DPTrig() {}
void benchSVML256_DPNontrig() {}
void benchSVML256_SPTrig() {}
void benchSVML256_SPNontrig() {}
#endif // #ifdef ENABLED

View File

@@ -0,0 +1,144 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <math.h>
#include <time.h>
#include <unistd.h>
#include <x86intrin.h>
uint64_t Sleef_currentTimeMicros();
void fillDP(double *buf, double min, double max);
void fillSP(float *buf, double min, double max);
extern char x86BrandString[256], versionString[1024];
extern int veclen;
extern int enableLogExp;
extern double *abufdp, *bbufdp;
extern float *abufsp, *bbufsp;
extern FILE *fp;
#include "bench.h"
#ifdef __AVX512F__
typedef __m512d vdouble;
typedef __m512 vfloat;
#define ENABLED
#endif
#ifdef ENABLED
void benchSVML512_DPTrig() {
fillDP(abufdp, 0, 6.28);
callFuncSVML1_1(_mm512_sin_pd , "sin, DP, 512", 0, 6.28, abufdp, vdouble);
callFuncSVML1_1(_mm512_cos_pd , "cos, DP, 512", 0, 6.28, abufdp, vdouble);
callFuncSVML1_1(_mm512_tan_pd , "tan, DP, 512", 0, 6.28, abufdp, vdouble);
callFuncSVML2_1(_mm512_sincos_pd, "sincos, DP, 512", 0, 6.28, abufdp, vdouble);
fillDP(abufdp, 0, 1e+6);
callFuncSVML1_1(_mm512_sin_pd , "sin, DP, 512", 0, 1e+6, abufdp, vdouble);
callFuncSVML1_1(_mm512_cos_pd , "cos, DP, 512", 0, 1e+6, abufdp, vdouble);
callFuncSVML1_1(_mm512_tan_pd , "tan, DP, 512", 0, 1e+6, abufdp, vdouble);
callFuncSVML2_1(_mm512_sincos_pd, "sincos, DP, 512", 0, 1e+6, abufdp, vdouble);
fillDP(abufdp, 0, 1e+100);
callFuncSVML1_1(_mm512_sin_pd , "sin, DP, 512", 0, 1e+100, abufdp, vdouble);
callFuncSVML1_1(_mm512_cos_pd , "cos, DP, 512", 0, 1e+100, abufdp, vdouble);
callFuncSVML1_1(_mm512_tan_pd , "tan, DP, 512", 0, 1e+100, abufdp, vdouble);
callFuncSVML2_1(_mm512_sincos_pd, "sincos, DP, 512", 0, 1e+100, abufdp, vdouble);
}
void benchSVML512_DPNontrig() {
fillDP(abufdp, 0, 1e+300);
callFuncSVML1_1(_mm512_log_pd , "log, DP, 512", 0, 1e+300, abufdp, vdouble);
if (enableLogExp) {
callFuncSVML1_1(_mm512_log10_pd, "log10, DP, 512", 0, 1e+300, abufdp, vdouble);
callFuncSVML1_1(_mm512_log1p_pd, "log1p, DP, 512", 0, 1e+300, abufdp, vdouble);
fillDP(abufdp, -700, 700);
callFuncSVML1_1(_mm512_exp_pd , "exp, DP, 512", -700, 700, abufdp, vdouble);
callFuncSVML1_1(_mm512_exp2_pd , "exp2, DP, 512", -700, 700, abufdp, vdouble);
callFuncSVML1_1(_mm512_exp10_pd, "exp10, DP, 512", -700, 700, abufdp, vdouble);
fillDP(abufdp, -30, 30);
fillDP(bbufdp, -30, 30);
callFuncSVML1_2(_mm512_pow_pd, "pow, DP, 512", -30, 30, -30, 30, abufdp, bbufdp, vdouble);
}
fillDP(abufdp, -1.0, 1.0);
callFuncSVML1_1(_mm512_asin_pd, "asin, DP, 512", -1.0, 1.0, abufdp, vdouble);
callFuncSVML1_1(_mm512_acos_pd, "acos, DP, 512", -1.0, 1.0, abufdp, vdouble);
fillDP(abufdp, -10, 10);
fillDP(bbufdp, -10, 10);
callFuncSVML1_1(_mm512_atan_pd, "atan, DP, 512", -10, 10, abufdp, vdouble);
callFuncSVML1_2(_mm512_atan2_pd, "atan2, DP, 512", -10, 10, -10, 10, abufdp, bbufdp, vdouble);
}
void benchSVML512_SPTrig() {
fillSP(abufsp, 0, 6.28);
callFuncSVML1_1(_mm512_sin_ps , "sin, SP, 512", 0, 6.28, abufsp, vfloat);
callFuncSVML1_1(_mm512_cos_ps , "cos, SP, 512", 0, 6.28, abufsp, vfloat);
callFuncSVML1_1(_mm512_tan_ps , "tan, SP, 512", 0, 6.28, abufsp, vfloat);
callFuncSVML2_1(_mm512_sincos_ps, "sincos, SP, 512", 0, 6.28, abufsp, vfloat);
fillSP(abufsp, 0, 1e+20);
callFuncSVML1_1(_mm512_sin_ps , "sin, SP, 512", 0, 1e+20, abufsp, vfloat);
callFuncSVML1_1(_mm512_cos_ps , "cos, SP, 512", 0, 1e+20, abufsp, vfloat);
callFuncSVML1_1(_mm512_tan_ps , "tan, SP, 512", 0, 1e+20, abufsp, vfloat);
callFuncSVML2_1(_mm512_sincos_ps, "sincos, SP, 512", 0, 1e+20, abufsp, vfloat);
}
void benchSVML512_SPNontrig() {
fillSP(abufsp, 0, 1e+38);
callFuncSVML1_1(_mm512_log_ps , "log, SP, 512", 0, 1e+38, abufsp, vfloat);
if (enableLogExp) {
callFuncSVML1_1(_mm512_log10_ps, "log10, SP, 512", 0, 1e+38, abufsp, vfloat);
//callFuncSVML1_1(_mm512_log1p_ps, "log1p, SP, 512", 0, 1e+38, abufsp, vfloat);
fillSP(abufsp, -100, 100);
callFuncSVML1_1(_mm512_exp_ps , "exp, SP, 512", -100, 100, abufsp, vfloat);
callFuncSVML1_1(_mm512_exp2_ps , "exp2, SP, 512", -100, 100, abufsp, vfloat);
callFuncSVML1_1(_mm512_exp10_ps, "exp10, SP, 512", -100, 100, abufsp, vfloat);
fillSP(abufsp, -30, 30);
fillSP(bbufsp, -30, 30);
callFuncSVML1_2(_mm512_pow_ps, "pow, SP, 512", -30, 30, -30, 30, abufsp, bbufsp, vfloat);
}
fillSP(abufsp, -1.0, 1.0);
callFuncSVML1_1(_mm512_asin_ps, "asin, SP, 512", -1.0, 1, abufsp, vfloat);
callFuncSVML1_1(_mm512_acos_ps, "acos, SP, 512", -1.0, 1, abufsp, vfloat);
fillSP(abufsp, -10, 10);
fillSP(bbufsp, -10, 10);
callFuncSVML1_1(_mm512_atan_ps, "atan, SP, 512", -10, 10, abufsp, vfloat);
callFuncSVML1_2(_mm512_atan2_ps, "atan2, SP, 512", -10, 10, -10, 10, abufsp, bbufsp, vfloat);
}
#else // #ifdef ENABLED
void benchSVML512_DPTrig() {}
void benchSVML512_DPNontrig() {}
void benchSVML512_SPTrig() {}
void benchSVML512_SPNontrig() {}
#endif // #ifdef ENABLED

View File

@@ -0,0 +1,17 @@
#!/bin/sh
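# Benchmark driver: prompts for a label describing the machine being
# measured, runs each benchmark program given on the command line with
# that label, and keeps a run counter in counter.txt so that successive
# measurement runs can be told apart.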
echo
read -p "Enter label of measurement (e.g. My desktop PC): " label
if [ -f counter.txt ]
then
counter=$(cat counter.txt)
else
counter=0
fi
echo "Measurement in progress. This may take several minutes."
for i in $*; do
$i "$label" $counter
done
counter=$((counter+1))
echo $counter > counter.txt

View File

@@ -0,0 +1,517 @@
# Settings
# TESTER3_DEFINITIONS
set(TESTER3_DEFINITIONS_SSE2 ATR=cinz_ DPTYPE=__m128d SPTYPE=__m128 DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=sse2)
set(TESTER3_DEFINITIONS_SSE4 ATR=cinz_ DPTYPE=__m128d SPTYPE=__m128 DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=sse4)
set(TESTER3_DEFINITIONS_AVX2128 ATR=finz_ DPTYPE=__m128d SPTYPE=__m128 DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=avx2128)
set(TESTER3_DEFINITIONS_AVX ATR=cinz_ DPTYPE=__m256d SPTYPE=__m256 DPTYPESPEC=d4 SPTYPESPEC=f8 EXTSPEC=avx)
set(TESTER3_DEFINITIONS_FMA4 ATR=finz_ DPTYPE=__m256d SPTYPE=__m256 DPTYPESPEC=d4 SPTYPESPEC=f8 EXTSPEC=fma4)
set(TESTER3_DEFINITIONS_AVX2 ATR=finz_ DPTYPE=__m256d SPTYPE=__m256 DPTYPESPEC=d4 SPTYPESPEC=f8 EXTSPEC=avx2)
set(TESTER3_DEFINITIONS_AVX512F ATR=finz_ DPTYPE=__m512d SPTYPE=__m512 DPTYPESPEC=d8 SPTYPESPEC=f16 EXTSPEC=avx512f)
set(TESTER3_DEFINITIONS_AVX512FNOFMA ATR=cinz_ DPTYPE=__m512d SPTYPE=__m512 DPTYPESPEC=d8 SPTYPESPEC=f16 EXTSPEC=avx512fnofma)
set(TESTER3_DEFINITIONS_ADVSIMD ATR=finz_ DPTYPE=float64x2_t SPTYPE=float32x4_t DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=advsimd)
set(TESTER3_DEFINITIONS_ADVSIMDNOFMA ATR=cinz_ DPTYPE=float64x2_t SPTYPE=float32x4_t DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=advsimdnofma)
set(TESTER3_DEFINITIONS_SVE ATR=finz_ DPTYPE=svfloat64_t SPTYPE=svfloat32_t DPTYPESPEC=dx SPTYPESPEC=fx EXTSPEC=sve)
set(TESTER3_DEFINITIONS_SVENOFMA ATR=cinz_ DPTYPE=svfloat64_t SPTYPE=svfloat32_t DPTYPESPEC=dx SPTYPESPEC=fx EXTSPEC=svenofma)
set(TESTER3_DEFINITIONS_VSX ATR=finz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vsx)
set(TESTER3_DEFINITIONS_VSXNOFMA ATR=cinz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vsxnofma)
set(TESTER3_DEFINITIONS_VSX3 ATR=finz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vsx3)
set(TESTER3_DEFINITIONS_VSX3NOFMA ATR=cinz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vsx3nofma)
set(TESTER3_DEFINITIONS_VXE ATR=finz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vxe)
set(TESTER3_DEFINITIONS_VXENOFMA ATR=cinz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vxenofma)
set(TESTER3_DEFINITIONS_VXE2 ATR=finz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vxe2)
set(TESTER3_DEFINITIONS_VXE2NOFMA ATR=cinz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vxe2nofma)
set(TESTER3_DEFINITIONS_RVVM1 ATR=finz_ DPTYPE=vfloat64m1_t SPTYPE=vfloat32m1_t DPTYPESPEC=dx SPTYPESPEC=fx EXTSPEC=rvvm1 ENABLE_RVVM1)
set(TESTER3_DEFINITIONS_RVVM1NOFMA ATR=cinz_ DPTYPE=vfloat64m1_t SPTYPE=vfloat32m1_t DPTYPESPEC=dx SPTYPESPEC=fx EXTSPEC=rvvm1nofma ENABLE_RVVM1)
set(TESTER3_DEFINITIONS_RVVM2 ATR=finz_ DPTYPE=vfloat64m2_t SPTYPE=vfloat32m2_t DPTYPESPEC=dx SPTYPESPEC=fx EXTSPEC=rvvm2 ENABLE_RVVM2)
set(TESTER3_DEFINITIONS_RVVM2NOFMA ATR=cinz_ DPTYPE=vfloat64m2_t SPTYPE=vfloat32m2_t DPTYPESPEC=dx SPTYPESPEC=fx EXTSPEC=rvvm2nofma ENABLE_RVVM2)
set(TESTER3_DEFINITIONS_PUREC_SCALAR ATR=cinz_ DPTYPE=double SPTYPE=float DPTYPESPEC=d1 SPTYPESPEC=f1 EXTSPEC=purec)
set(TESTER3_DEFINITIONS_PURECFMA_SCALAR ATR=finz_ DPTYPE=double SPTYPE=float DPTYPESPEC=d1 SPTYPESPEC=f1 EXTSPEC=purecfma)
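# (ATR appears to select the name prefix of the deterministic function
# variants: finz_ for configurations built with FMA, cinz_ for those
# without, matching the TEST3_FINZ/TEST3_CINZ grouping below.)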
#
if (SLEEF_ARCH_X86)
set(TEST3_CINZ purec_scalar sse2 sse4 avx avx512fnofma)
set(TEST3_FINZ purecfma_scalar avx2128 avx2 avx512f)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
set(TEST3_CINZ purec_scalar advsimdnofma svenofma)
set(TEST3_FINZ purecfma_scalar advsimd sve)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
set(TEST3_CINZ purec_scalar)
set(TEST3_FINZ purecfma_scalar)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64")
set(TEST3_CINZ purec_scalar vsxnofma vsx3nofma)
set(TEST3_FINZ purecfma_scalar vsx vsx3)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x")
set(TEST3_CINZ purec_scalar vxenofma vxe2nofma)
set(TEST3_FINZ purecfma_scalar vxe vxe2)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
set(TEST3_CINZ purec_scalar rvvm1nofma rvvm2nofma)
set(TEST3_FINZ purecfma_scalar rvvm1 rvvm2)
endif()
#
link_directories(${sleef_BINARY_DIR}/lib) # libsleef
link_directories(${sleef_BINARY_DIR}/src/common) # common.a
include_directories(${sleef_BINARY_DIR}/include) # sleef.h
include_directories(${sleef_SOURCE_DIR}/src/libm) # rename.h
include_directories(${sleef_BINARY_DIR}/src/libm/include) # rename headers
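# Without MPFR the reference tester cannot be built from source, so look
# for a pre-installed 'tester' binary on the PATH instead.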
if(NOT LIB_MPFR)
find_program(TESTER_COMMAND tester)
endif(NOT LIB_MPFR)
if (SLEEF_ENFORCE_TESTER AND NOT LIB_MPFR AND NOT TESTER_COMMAND)
message(FATAL_ERROR "SLEEF_ENFORCE_TESTER is specified and tester is not available")
endif(SLEEF_ENFORCE_TESTER AND NOT LIB_MPFR AND NOT TESTER_COMMAND)
find_library(LIBRT rt)
if (NOT LIBRT)
set(LIBRT "")
endif()
set(CMAKE_C_FLAGS "${ORG_CMAKE_C_FLAGS} ${SLEEF_C_FLAGS} ${FLAGS_NOSTRICTALIASING}")
set(COMMON_TARGET_PROPERTIES
C_STANDARD 99 # -std=gnu99
)
if (SLEEF_ENABLE_LTO)
list(APPEND COMMON_TARGET_PROPERTIES INTERPROCEDURAL_OPTIMIZATION TRUE) # -flto
endif()
#
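# add_test_iut registers a ctest entry that runs the given IUT
# (implementation under test) binary under the tester driver, forwarding
# SDE, armie, or emulator options when those are configured; C is the
# relative COST that ctest uses to schedule tests.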
function(add_test_iut IUT C)
if (LIB_MPFR)
set(TESTER ${TARGET_TESTER})
elseif(TESTER_COMMAND)
set(TESTER ${TESTER_COMMAND})
endif()
# When we are cross-compiling using the mkrename* tools from a native
# build, we use the tester executable from the native build.
if (CMAKE_CROSSCOMPILING AND NATIVE_BUILD_DIR)
set(TESTER ${NATIVE_BUILD_DIR}/bin/${TARGET_TESTER})
endif(CMAKE_CROSSCOMPILING AND NATIVE_BUILD_DIR)
if (TESTER)
if (NOT EMULATOR)
if (SDE_COMMAND)
set(FLAGS_SDE "--sde" ${SDE_COMMAND})
else()
set(FLAGS_SDE)
endif()
if (ARMIE_COMMAND)
set(FLAGS_ARMIE ${ARMIE_COMMAND} -msve-vector-bits=${SVE_VECTOR_BITS})
else()
set(FLAGS_ARMIE)
endif()
add_test(NAME ${IUT}
COMMAND ${TESTER} ${FLAGS_SDE} ${FLAGS_ARMIE} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${IUT}
WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
set_tests_properties(${IUT} PROPERTIES COST ${C})
else()
add_test(NAME ${IUT}
COMMAND ${TESTER} "--qemu" ${EMULATOR} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${IUT}
WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
set_tests_properties(${IUT} PROPERTIES COST ${C})
endif()
endif()
endfunction()
# Compile executable 'iut'
add_executable(${TARGET_IUT} iut.c testerutil.c)
target_compile_definitions(${TARGET_IUT} PRIVATE ${COMMON_TARGET_DEFINITIONS})
target_link_libraries(${TARGET_IUT} ${TARGET_LIBSLEEF}
${LIBM} ${LIBRT})
set_target_properties(${TARGET_IUT} PROPERTIES ${COMMON_TARGET_PROPERTIES})
add_test_iut(${TARGET_IUT} 1.0)
set(IUT_LIST ${TARGET_IUT})
# Compile executable 'iutcuda'
if (SLEEF_BUILD_INLINE_HEADERS AND SED_COMMAND AND CMAKE_CUDA_COMPILER)
add_executable(iutcuda iutcuda.cu)
set_target_properties(iutcuda PROPERTIES LINKER_LANGUAGE CUDA)
target_compile_options(iutcuda PRIVATE "--fmad=false;-Xcompiler;-ffp-contract=off")
add_dependencies(iutcuda ${TARGET_INLINE_HEADERS})
add_test_iut(iutcuda 20.0)
list(APPEND IUT_LIST iutcuda)
endif()
set(IUT_SRC iutsimd.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c testerutil.c)
# Add vector extension `iut`s
macro(test_extension SIMD)
if(COMPILER_SUPPORTS_${SIMD})
string(TOLOWER ${SIMD} LCSIMD)
string(CONCAT TARGET_IUT${SIMD} "iut" ${LCSIMD})
add_executable(${TARGET_IUT${SIMD}} ${IUT_SRC})
target_compile_options(${TARGET_IUT${SIMD}}
PRIVATE ${FLAGS_ENABLE_${SIMD}})
target_compile_definitions(${TARGET_IUT${SIMD}}
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS})
target_link_libraries(${TARGET_IUT${SIMD}} ${TARGET_LIBSLEEF}
${LIBM} ${LIBRT})
if (FORCE_AAVPCS)
target_compile_definitions(${TARGET_IUT${SIMD}} PRIVATE ENABLE_AAVPCS=1)
endif(FORCE_AAVPCS)
add_dependencies(${TARGET_IUT${SIMD}} ${TARGET_HEADERS})
add_dependencies(${TARGET_IUT${SIMD}} ${TARGET_LIBSLEEF})
set_target_properties(${TARGET_IUT${SIMD}} PROPERTIES ${COMMON_TARGET_PROPERTIES})
if (DEFINED COSTOVERRIDE_${SIMD})
add_test_iut(${TARGET_IUT${SIMD}} ${COSTOVERRIDE_${SIMD}})
else()
add_test_iut(${TARGET_IUT${SIMD}} 1.0)
endif()
list(APPEND IUT_LIST ${TARGET_IUT${SIMD}})
# The iut programs whose names begin with "iuty" are the iut for the
# deterministic version of functions. By checking the result of
# testing with iutysse2, for example, it can be checked that the
# corresponding deterministic functions passes the accuracy and
# nonnumber tests.
string(CONCAT IUTYNAME "iuty" ${LCSIMD})
add_executable(${IUTYNAME} ${IUT_SRC})
target_compile_options(${IUTYNAME}
PRIVATE ${FLAGS_ENABLE_${SIMD}})
target_compile_definitions(${IUTYNAME}
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS} DETERMINISTIC=1)
target_link_libraries(${IUTYNAME} ${TARGET_LIBSLEEF}
${LIBM} ${LIBRT})
add_dependencies(${IUTYNAME} ${TARGET_HEADERS})
add_dependencies(${IUTYNAME} ${TARGET_LIBSLEEF})
set_target_properties(${IUTYNAME} PROPERTIES ${COMMON_TARGET_PROPERTIES})
if (DEFINED COSTOVERRIDE_${SIMD})
add_test_iut(${IUTYNAME} ${COSTOVERRIDE_${SIMD}})
else()
add_test_iut(${IUTYNAME} 1.0)
endif()
list(APPEND IUT_LIST ${IUTYNAME})
# The iut programs whose names begin with "iuti" are the iut for the
# inline version of functions.
if (SLEEF_BUILD_INLINE_HEADERS AND SED_COMMAND)
string(CONCAT IUTINAME "iuti" ${LCSIMD})
add_executable(${IUTINAME} ${IUT_SRC})
target_compile_options(${IUTINAME} PRIVATE ${FLAGS_ENABLE_${SIMD}})
target_compile_definitions(${IUTINAME}
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS}
USE_INLINE_HEADER="sleefinline_${LCSIMD}.h"
MACRO_ONLY_HEADER="macroonly${SIMD}.h"
SIMD_SUFFIX=_${LCSIMD}_sleef
)
target_include_directories(${IUTINAME} PRIVATE ${PROJECT_BINARY_DIR}/include)
target_link_libraries(${IUTINAME} ${LIBM} ${LIBRT})
add_dependencies(${IUTINAME} ${TARGET_INLINE_HEADERS})
set_target_properties(${IUTINAME} PROPERTIES C_STANDARD 99)
if (DEFINED COSTOVERRIDE_${SIMD})
add_test_iut(${IUTINAME} ${COSTOVERRIDE_${SIMD}})
else()
add_test_iut(${IUTINAME} 1.0)
endif()
list(APPEND IUT_LIST ${IUTINAME})
endif(SLEEF_BUILD_INLINE_HEADERS AND SED_COMMAND)
if(LIB_MPFR AND NOT ${SIMD} STREQUAL NEON32 AND NOT ${SIMD} STREQUAL NEON32VFPV4 AND NOT MINGW)
# Build tester2 SIMD
string(TOLOWER ${SIMD} SCSIMD)
foreach(P dp sp)
set(T "tester2${SCSIMD}${P}")
add_executable(${T} tester2simd${P}.c testerutil.c)
if(FORCE_AAVPCS)
target_compile_definitions(${T} PRIVATE ENABLE_AAVPCS=1)
endif(FORCE_AAVPCS)
target_compile_options(${T} PRIVATE ${FLAGS_ENABLE_${SIMD}})
target_compile_definitions(${T} PRIVATE ENABLE_${SIMD}=1 USEMPFR=1 ${COMMON_TARGET_DEFINITIONS})
set_target_properties(${T} PROPERTIES ${COMMON_TARGET_PROPERTIES})
target_link_libraries(${T} ${TARGET_LIBSLEEF} ${LIB_MPFR} ${LIBM} ${LIBGMP})
add_dependencies(${T} ${TARGET_HEADERS})
add_dependencies(${T} ${TARGET_LIBSLEEF})
if (MPFR_INCLUDE_DIR)
target_include_directories(${T} PRIVATE ${MPFR_INCLUDE_DIR})
endif()
# The tester2 programs whose names begin with "tester2y" are the
# testing programs for the deterministic versions of the functions.
set(T "tester2y${SCSIMD}${P}")
add_executable(${T} tester2simd${P}.c testerutil.c)
target_compile_options(${T} PRIVATE ${FLAGS_ENABLE_${SIMD}})
target_compile_definitions(${T} PRIVATE ENABLE_${SIMD}=1 USEMPFR=1 ${COMMON_TARGET_DEFINITIONS} DETERMINISTIC=1)
set_target_properties(${T} PROPERTIES ${COMMON_TARGET_PROPERTIES})
target_link_libraries(${T} ${TARGET_LIBSLEEF} ${LIB_MPFR} ${LIBM} ${LIBGMP})
add_dependencies(${T} ${TARGET_HEADERS})
add_dependencies(${T} ${TARGET_LIBSLEEF})
if (MPFR_INCLUDE_DIR)
target_include_directories(${T} PRIVATE ${MPFR_INCLUDE_DIR})
endif()
endforeach()
endif()
if(NOT ${SIMD} STREQUAL NEON32 AND NOT ${SIMD} STREQUAL NEON32VFPV4 AND SLEEF_OPENSSL_FOUND)
# Build tester3
string(TOLOWER ${SIMD} SCSIMD)
set(T "tester3${SCSIMD}")
add_executable(${T} tester3.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c testerutil.c)
target_compile_options(${T} PRIVATE ${FLAGS_ENABLE_${SIMD}})
target_compile_definitions(${T} PRIVATE ${COMMON_TARGET_DEFINITIONS} ${TESTER3_DEFINITIONS_${SIMD}})
set_target_properties(${T} PROPERTIES ${COMMON_TARGET_PROPERTIES})
# Enable Vector PCS for Advanced SIMD (if supported)
if(FORCE_AAVPCS)
host_target_AAVPCS_definitions(${T})
endif()
target_link_libraries(${T} ${TARGET_LIBSLEEF} ${LIBM} ${SLEEF_OPENSSL_LIBRARIES})
target_include_directories(${T} PRIVATE ${SLEEF_OPENSSL_INCLUDE_DIR})
add_dependencies(${T} ${TARGET_HEADERS})
add_dependencies(${T} ${TARGET_LIBSLEEF})
# Add test with tester3
list(FIND TEST3_CINZ ${SCSIMD} INDEX_TEST3_CINZ)
if (NOT INDEX_TEST3_CINZ EQUAL -1)
if (SDE_COMMAND)
add_test(NAME tester3${SCSIMD} COMMAND ${SDE_COMMAND} "--" ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/tester3${SCSIMD} ${sleef_SOURCE_DIR}/src/libm-tester/hash_cinz.txt)
elseif(EMULATOR)
add_test(NAME tester3${SCSIMD} COMMAND ${EMULATOR} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/tester3${SCSIMD} ${sleef_SOURCE_DIR}/src/libm-tester/hash_cinz.txt)
else()
add_test(NAME tester3${SCSIMD} COMMAND tester3${SCSIMD} ${sleef_SOURCE_DIR}/src/libm-tester/hash_cinz.txt)
endif()
if (DEFINED COSTOVERRIDE_${SIMD})
set_tests_properties(tester3${SCSIMD} PROPERTIES COST ${COSTOVERRIDE_${SIMD}})
else()
set_tests_properties(tester3${SCSIMD} PROPERTIES COST 0.5)
endif()
endif()
list(FIND TEST3_FINZ ${SCSIMD} INDEX_TEST3_FINZ)
if (NOT INDEX_TEST3_FINZ EQUAL -1)
if (SDE_COMMAND)
add_test(NAME tester3${SCSIMD} COMMAND ${SDE_COMMAND} "--" ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/tester3${SCSIMD} ${sleef_SOURCE_DIR}/src/libm-tester/hash_finz.txt)
elseif(EMULATOR)
add_test(NAME tester3${SCSIMD} COMMAND ${EMULATOR} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/tester3${SCSIMD} ${sleef_SOURCE_DIR}/src/libm-tester/hash_finz.txt)
else()
add_test(NAME tester3${SCSIMD} COMMAND tester3${SCSIMD} ${sleef_SOURCE_DIR}/src/libm-tester/hash_finz.txt)
endif()
if (DEFINED COSTOVERRIDE_${SIMD})
set_tests_properties(tester3${SCSIMD} PROPERTIES COST ${COSTOVERRIDE_${SIMD}})
else()
set_tests_properties(tester3${SCSIMD} PROPERTIES COST 0.5)
endif()
endif()
endif()
endif(COMPILER_SUPPORTS_${SIMD})
endmacro(test_extension)
foreach(SIMD ${SLEEF_SUPPORTED_LIBM_EXTENSIONS})
test_extension(${SIMD})
endforeach()
function(add_gnuabi_compatibility_test SIMD MASKED)
if (MASKED)
set(GNUABI_COMPATIBILITY_TEST gnuabi_compatibility_${SIMD}_masked)
else(MASKED)
set(GNUABI_COMPATIBILITY_TEST gnuabi_compatibility_${SIMD})
endif(MASKED)
add_executable(${GNUABI_COMPATIBILITY_TEST} gnuabi_compatibility.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c)
set_target_properties(${GNUABI_COMPATIBILITY_TEST} PROPERTIES ${COMMON_TARGET_PROPERTIES})
target_compile_options(${GNUABI_COMPATIBILITY_TEST}
PRIVATE ${FLAGS_ENABLE_${SIMD}})
if (MASKED)
target_compile_definitions(${GNUABI_COMPATIBILITY_TEST}
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS} MASKED_GNUABI=1)
else(MASKED)
target_compile_definitions(${GNUABI_COMPATIBILITY_TEST}
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS})
endif(MASKED)
if (FORCE_AAVPCS)
target_compile_definitions(${GNUABI_COMPATIBILITY_TEST} PRIVATE ENABLE_AAVPCS=1)
endif(FORCE_AAVPCS)
target_link_libraries(${GNUABI_COMPATIBILITY_TEST} ${TARGET_LIBSLEEFGNUABI} ${LIBM})
# These are linker tests that don't really need to be executed, but
# seeing them in the ctest report gives an idea of what has been
# built for testing.
if (EMULATOR)
add_test(NAME ${GNUABI_COMPATIBILITY_TEST}
COMMAND ${EMULATOR} $<TARGET_FILE:${GNUABI_COMPATIBILITY_TEST}>
WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
elseif(SDE_COMMAND)
add_test(NAME ${GNUABI_COMPATIBILITY_TEST}
COMMAND ${SDE_COMMAND} "--" $<TARGET_FILE:${GNUABI_COMPATIBILITY_TEST}>)
else()
add_test(NAME ${GNUABI_COMPATIBILITY_TEST}
COMMAND $<TARGET_FILE:${GNUABI_COMPATIBILITY_TEST}>)
endif(EMULATOR)
endfunction(add_gnuabi_compatibility_test)
if(ENABLE_GNUABI)
foreach(SIMD ${SLEEF_SUPPORTED_GNUABI_EXTENSIONS})
if(COMPILER_SUPPORTS_${SIMD})
# GNUABI compatibility for the unmasked symbols.
add_gnuabi_compatibility_test(${SIMD} OFF)
# GNUABI compatibility for the masked symbols.
if (MKMASKED_PARAMS_GNUABI_${SIMD}_sp)
add_gnuabi_compatibility_test(${SIMD} ON)
endif(MKMASKED_PARAMS_GNUABI_${SIMD}_sp)
endif (COMPILER_SUPPORTS_${SIMD})
endforeach(SIMD ${SLEEF_SUPPORTED_GNUABI_EXTENSIONS})
endif(ENABLE_GNUABI)
#
if (SLEEF_ARCH_X86)
# iutdsp128
add_executable(iutdsp128 ${IUT_SRC})
target_compile_definitions(iutdsp128 PRIVATE ENABLE_DSP128=1 ${COMMON_TARGET_DEFINITIONS})
target_compile_options(iutdsp128 PRIVATE ${FLAGS_ENABLE_SSE2})
target_link_libraries(iutdsp128 ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT})
add_dependencies(iutdsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF})
add_test_iut(iutdsp128 1.0)
list(APPEND IUT_LIST iutdsp128)
# iutdsp256
add_executable(iutdsp256 ${IUT_SRC})
target_compile_definitions(iutdsp256 PRIVATE ENABLE_DSP256=1 ${COMMON_TARGET_DEFINITIONS})
target_compile_options(iutdsp256 PRIVATE ${FLAGS_ENABLE_AVX})
target_link_libraries(iutdsp256 ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT})
add_dependencies(iutdsp256 ${TARGET_HEADERS} ${TARGET_LIBSLEEF})
add_test_iut(iutdsp256 1.0)
list(APPEND IUT_LIST iutdsp256)
endif(SLEEF_ARCH_X86)
if (SLEEF_ARCH_PPC64)
add_executable(iutdsp128 ${IUT_SRC})
target_compile_definitions(iutdsp128 PRIVATE ENABLE_DSPPOWER_128=1 ${COMMON_TARGET_DEFINITIONS})
target_compile_options(iutdsp128 PRIVATE ${FLAGS_ENABLE_VSX})
target_link_libraries(iutdsp128 ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT})
add_dependencies(iutdsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF})
add_test_iut(iutdsp128 1.0)
list(APPEND IUT_LIST iutdsp128)
endif(SLEEF_ARCH_PPC64)
if (SLEEF_ARCH_S390X)
add_executable(iutdsp128 ${IUT_SRC})
target_compile_definitions(iutdsp128 PRIVATE ENABLE_DSPS390X_128=1 ${COMMON_TARGET_DEFINITIONS})
target_compile_options(iutdsp128 PRIVATE ${FLAGS_ENABLE_VXE})
target_link_libraries(iutdsp128 ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT})
add_dependencies(iutdsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF})
add_test_iut(iutdsp128 1.0)
list(APPEND IUT_LIST iutdsp128)
endif(SLEEF_ARCH_S390X)
if(SLEEF_BUILD_SCALAR_LIB)
# Compile executable 'iutscalar'
add_executable(iutscalar iut.c testerutil.c)
target_compile_definitions(iutscalar PRIVATE ${COMMON_TARGET_DEFINITIONS})
target_link_libraries(iutscalar sleefscalar ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT})
set_target_properties(iutscalar PROPERTIES ${COMMON_TARGET_PROPERTIES})
add_test_iut(iutscalar 1.0)
list(APPEND IUT_LIST iutscalar)
endif()
if(LIB_MPFR AND NOT MINGW)
# Build tester2 scalar
set(PRECISIONS dp sp)
if(COMPILER_SUPPORTS_LONG_DOUBLE)
list(APPEND PRECISIONS ld)
endif()
if(COMPILER_SUPPORTS_QUADMATH)
list(APPEND PRECISIONS qp)
set(LIBQUADMATH "-lquadmath")
set(ENABLEFLOAT128 PRIVATE ENABLEFLOAT128=1)
endif()
foreach(P ${PRECISIONS})
set(T "tester2${P}")
add_executable(${T} tester2${P}.c testerutil.c)
target_compile_definitions(${T} PRIVATE USEMPFR=1 ${ENABLEFLOAT128} ${COMMON_TARGET_DEFINITIONS})
set_target_properties(${T} PROPERTIES ${COMMON_TARGET_PROPERTIES})
if (FORCE_AAVPCS)
target_compile_definitions(${T} PRIVATE ENABLE_AAVPCS=1)
endif(FORCE_AAVPCS)
if (MPFR_INCLUDE_DIR)
target_include_directories(${T} PRIVATE ${MPFR_INCLUDE_DIR})
endif()
target_link_libraries(${T} ${TARGET_LIBSLEEF} ${LIBQUADMATH} ${LIB_MPFR} ${LIBM} ${LIBGMP})
add_dependencies(${T} ${TARGET_HEADERS})
add_dependencies(${T} ${TARGET_LIBSLEEF})
endforeach()
# Compile executable 'tester'
add_host_executable(${TARGET_TESTER} tester.c testerutil.c)
if (NOT CMAKE_CROSSCOMPILING)
target_link_libraries(${TARGET_TESTER} ${LIB_MPFR} ${TARGET_LIBSLEEF} ${LIBM} ${LIBGMP})
target_compile_definitions(${TARGET_TESTER}
PRIVATE USEMPFR=1 ${COMMON_TARGET_DEFINITIONS})
target_compile_options(${TARGET_TESTER} PRIVATE -Wno-unused-result)
set_target_properties(${TARGET_TESTER} PROPERTIES ${COMMON_TARGET_PROPERTIES})
if (MPFR_INCLUDE_DIR)
target_include_directories(${TARGET_TESTER} PRIVATE ${MPFR_INCLUDE_DIR})
endif()
endif()
endif(LIB_MPFR AND NOT MINGW)
if(ENABLE_GNUABI AND COMPILER_SUPPORTS_OMP_SIMD AND NOT SLEEF_TARGET_PROCESSOR MATCHES "^i.86$")
# Build tester for vectorabi
add_executable(testervecabi testervecabi.c)
target_compile_definitions(testervecabi PRIVATE ${COMMON_TARGET_DEFINITIONS})
target_compile_options(testervecabi PRIVATE ${OpenMP_C_FLAGS})
target_link_libraries(testervecabi ${TARGET_LIBSLEEF} ${OpenMP_C_FLAGS})
set_target_properties(testervecabi PROPERTIES C_STANDARD 99)
add_test(NAME testervecabi COMMAND ${EMULATOR} testervecabi
WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
endif()
# mveclibtest
if (ENABLE_GNUABI AND SLEEF_ARCH_X86 AND CMAKE_C_COMPILER_ID MATCHES "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 7.99)
add_executable(mveclibtest-sse2 mveclibtest.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c)
set_target_properties(mveclibtest-sse2 PROPERTIES C_STANDARD 99)
target_compile_options(mveclibtest-sse2 PRIVATE ${FLAGS_FASTMATH} "-O3")
target_link_libraries(mveclibtest-sse2 ${TARGET_LIBSLEEF} ${TARGET_LIBSLEEFGNUABI})
add_dependencies(mveclibtest-sse2 ${TARGET_HEADERS})
add_test(NAME mveclibtest-sse2 COMMAND mveclibtest-sse2)
add_executable(mveclibtest-avx mveclibtest.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c)
set_target_properties(mveclibtest-avx PROPERTIES C_STANDARD 99)
target_compile_options(mveclibtest-avx PRIVATE ${FLAGS_FASTMATH} ${FLAGS_ENABLE_AVX} "-O3")
target_link_libraries(mveclibtest-avx ${TARGET_LIBSLEEF} ${TARGET_LIBSLEEFGNUABI})
add_dependencies(mveclibtest-avx ${TARGET_HEADERS})
add_test(NAME mveclibtest-avx COMMAND mveclibtest-avx)
add_executable(mveclibtest-avx2 mveclibtest.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c)
set_target_properties(mveclibtest-avx2 PROPERTIES C_STANDARD 99)
target_compile_options(mveclibtest-avx2 PRIVATE ${FLAGS_FASTMATH} ${FLAGS_ENABLE_AVX2} "-O3")
target_link_libraries(mveclibtest-avx2 ${TARGET_LIBSLEEF} ${TARGET_LIBSLEEFGNUABI})
add_dependencies(mveclibtest-avx2 ${TARGET_HEADERS})
add_test(NAME mveclibtest-avx2 COMMAND mveclibtest-avx2)
add_executable(mveclibtest-avx512f mveclibtest.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c)
set_target_properties(mveclibtest-avx512f PROPERTIES C_STANDARD 99)
target_compile_options(mveclibtest-avx512f PRIVATE ${FLAGS_FASTMATH} ${FLAGS_ENABLE_AVX512F} "-O3")
target_link_libraries(mveclibtest-avx512f ${TARGET_LIBSLEEF} ${TARGET_LIBSLEEFGNUABI})
add_dependencies(mveclibtest-avx512f ${TARGET_HEADERS})
add_test(NAME mveclibtest-avx512f COMMAND mveclibtest-avx512f)
endif()
#
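# Auto-vectorization tests: compile autovec.c and testervecabi.c to
# assembly with -fopenmp and -msse2/-mavx2, then run FileCheck over the
# output to verify that the compiler emitted calls to the _ZGV* vector
# variants.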
if (FILECHECK_COMMAND AND COMPILER_SUPPORTS_OPENMP AND SLEEF_ARCH_X86 AND CMAKE_C_COMPILER_ID MATCHES "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 7.99)
add_test(NAME autovec-avx2 COMMAND sh -c "${CMAKE_C_COMPILER} -Wno-attributes -fopenmp -mavx2 -O3 ${CMAKE_CURRENT_SOURCE_DIR}/autovec.c -I${sleef_BINARY_DIR}/include -S -o- | ${FILECHECK_COMMAND} ${CMAKE_CURRENT_SOURCE_DIR}/autovec.c -check-prefix=CHECK-AVX2")
add_test(NAME autovec-sse2 COMMAND sh -c "${CMAKE_C_COMPILER} -Wno-attributes -fopenmp -msse2 -O3 ${CMAKE_CURRENT_SOURCE_DIR}/autovec.c -I${sleef_BINARY_DIR}/include -S -o- | ${FILECHECK_COMMAND} ${CMAKE_CURRENT_SOURCE_DIR}/autovec.c -check-prefix=CHECK-SSE2")
add_test(NAME testervecabi-sse2 COMMAND sh -c "${CMAKE_C_COMPILER} -Wno-attributes -fopenmp -msse2 -O3 ${CMAKE_CURRENT_SOURCE_DIR}/testervecabi.c -I${sleef_BINARY_DIR}/include -S -o- | ${FILECHECK_COMMAND} ${CMAKE_CURRENT_SOURCE_DIR}/testervecabi.c -check-prefix=CHECK-SSE2")
add_test(NAME testervecabi-avx2 COMMAND sh -c "${CMAKE_C_COMPILER} -Wno-attributes -fopenmp -mavx2 -O3 ${CMAKE_CURRENT_SOURCE_DIR}/testervecabi.c -I${sleef_BINARY_DIR}/include -S -o- | ${FILECHECK_COMMAND} ${CMAKE_CURRENT_SOURCE_DIR}/testervecabi.c -check-prefix=CHECK-AVX2")
endif()
# Tests depend on the library
add_dependencies(${TARGET_IUT} ${TARGET_HEADERS})

View File

@@ -0,0 +1,651 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#define SLEEF_ENABLE_OMP_SIMD
#include "sleef.h"
#define N 1024
double a[N], b[N], c[N], d[N];
float e[N], f[N], g[N], h[N];
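// The CHECK-SSE2/CHECK-AVX2 lines below are LLVM FileCheck patterns: the
// build compiles this file to assembly with -fopenmp -O3 and -msse2 or
// -mavx2, then verifies that each loop was vectorized into a call to the
// mangled _ZGV* variant of the corresponding Sleef_* scalar function.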
void testsind1_u10() {
// CHECK-SSE2: testsind1_u10
// CHECK-AVX2: testsind1_u10
for(int i=0;i<N;i++) a[i] = Sleef_sind1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_sind1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_sind1_u10
}
void testsind1_u35() {
// CHECK-SSE2: testsind1_u35
// CHECK-AVX2: testsind1_u35
for(int i=0;i<N;i++) a[i] = Sleef_sind1_u35(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_sind1_u35
// CHECK-AVX2: _ZGVdN4v_Sleef_sind1_u35
}
void testsinf1_u10() {
// CHECK-SSE2: testsinf1_u10
// CHECK-AVX2: testsinf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_sinf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_sinf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_sinf1_u10
}
void testsinf1_u35() {
// CHECK-SSE2: testsinf1_u35
// CHECK-AVX2: testsinf1_u35
for(int i=0;i<N;i++) e[i] = Sleef_sinf1_u35(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_sinf1_u35
// CHECK-AVX2: _ZGVdN8v_Sleef_sinf1_u35
}
void testcosd1_u10() {
// CHECK-SSE2: testcosd1_u10
// CHECK-AVX2: testcosd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_cosd1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_cosd1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_cosd1_u10
}
void testcosd1_u35() {
// CHECK-SSE2: testcosd1_u35
// CHECK-AVX2: testcosd1_u35
for(int i=0;i<N;i++) a[i] = Sleef_cosd1_u35(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_cosd1_u35
// CHECK-AVX2: _ZGVdN4v_Sleef_cosd1_u35
}
void testcosf1_u10() {
// CHECK-SSE2: testcosf1_u10
// CHECK-AVX2: testcosf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_cosf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_cosf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_cosf1_u10
}
void testcosf1_u35() {
// CHECK-SSE2: testcosf1_u35
// CHECK-AVX2: testcosf1_u35
for(int i=0;i<N;i++) e[i] = Sleef_cosf1_u35(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_cosf1_u35
// CHECK-AVX2: _ZGVdN8v_Sleef_cosf1_u35
}
void testtand1_u10() {
// CHECK-SSE2: testtand1_u10
// CHECK-AVX2: testtand1_u10
for(int i=0;i<N;i++) a[i] = Sleef_tand1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_tand1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_tand1_u10
}
void testtand1_u35() {
// CHECK-SSE2: testtand1_u35
// CHECK-AVX2: testtand1_u35
for(int i=0;i<N;i++) a[i] = Sleef_tand1_u35(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_tand1_u35
// CHECK-AVX2: _ZGVdN4v_Sleef_tand1_u35
}
void testtanf1_u10() {
// CHECK-SSE2: testtanf1_u10
// CHECK-AVX2: testtanf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_tanf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_tanf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_tanf1_u10
}
void testtanf1_u35() {
// CHECK-SSE2: testtanf1_u35
// CHECK-AVX2: testtanf1_u35
for(int i=0;i<N;i++) e[i] = Sleef_tanf1_u35(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_tanf1_u35
// CHECK-AVX2: _ZGVdN8v_Sleef_tanf1_u35
}
void testasind1_u10() {
// CHECK-SSE2: testasind1_u10
// CHECK-AVX2: testasind1_u10
for(int i=0;i<N;i++) a[i] = Sleef_asind1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_asind1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_asind1_u10
}
void testasind1_u35() {
// CHECK-SSE2: testasind1_u35
// CHECK-AVX2: testasind1_u35
for(int i=0;i<N;i++) a[i] = Sleef_asind1_u35(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_asind1_u35
// CHECK-AVX2: _ZGVdN4v_Sleef_asind1_u35
}
void testasinf1_u10() {
// CHECK-SSE2: testasinf1_u10
// CHECK-AVX2: testasinf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_asinf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_asinf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_asinf1_u10
}
void testasinf1_u35() {
// CHECK-SSE2: testasinf1_u35
// CHECK-AVX2: testasinf1_u35
for(int i=0;i<N;i++) e[i] = Sleef_asinf1_u35(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_asinf1_u35
// CHECK-AVX2: _ZGVdN8v_Sleef_asinf1_u35
}
void testacosd1_u10() {
// CHECK-SSE2: testacosd1_u10
// CHECK-AVX2: testacosd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_acosd1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_acosd1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_acosd1_u10
}
void testacosd1_u35() {
// CHECK-SSE2: testacosd1_u35
// CHECK-AVX2: testacosd1_u35
for(int i=0;i<N;i++) a[i] = Sleef_acosd1_u35(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_acosd1_u35
// CHECK-AVX2: _ZGVdN4v_Sleef_acosd1_u35
}
void testacosf1_u10() {
// CHECK-SSE2: testacosf1_u10
// CHECK-AVX2: testacosf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_acosf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_acosf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_acosf1_u10
}
void testacosf1_u35() {
// CHECK-SSE2: testacosf1_u35
// CHECK-AVX2: testacosf1_u35
for(int i=0;i<N;i++) e[i] = Sleef_acosf1_u35(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_acosf1_u35
// CHECK-AVX2: _ZGVdN8v_Sleef_acosf1_u35
}
void testatand1_u10() {
// CHECK-SSE2: testatand1_u10
// CHECK-AVX2: testatand1_u10
for(int i=0;i<N;i++) a[i] = Sleef_atand1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_atand1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_atand1_u10
}
void testatand1_u35() {
// CHECK-SSE2: testatand1_u35
// CHECK-AVX2: testatand1_u35
for(int i=0;i<N;i++) a[i] = Sleef_atand1_u35(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_atand1_u35
// CHECK-AVX2: _ZGVdN4v_Sleef_atand1_u35
}
void testatanf1_u10() {
// CHECK-SSE2: testatanf1_u10
// CHECK-AVX2: testatanf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_atanf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_atanf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_atanf1_u10
}
void testatanf1_u35() {
// CHECK-SSE2: testatanf1_u35
// CHECK-AVX2: testatanf1_u35
for(int i=0;i<N;i++) e[i] = Sleef_atanf1_u35(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_atanf1_u35
// CHECK-AVX2: _ZGVdN8v_Sleef_atanf1_u35
}
void testatan2d1_u10() {
// CHECK-SSE2: testatan2d1_u10
// CHECK-AVX2: testatan2d1_u10
for(int i=0;i<N;i++) a[i] = Sleef_atan2d1_u10(b[i], c[i]);
// CHECK-SSE2: _ZGVbN2vv_Sleef_atan2d1_u10
// CHECK-AVX2: _ZGVdN4vv_Sleef_atan2d1_u10
}
void testatan2d1_u35() {
// CHECK-SSE2: testatan2d1_u35
// CHECK-AVX2: testatan2d1_u35
for(int i=0;i<N;i++) a[i] = Sleef_atan2d1_u35(b[i], c[i]);
// CHECK-SSE2: _ZGVbN2vv_Sleef_atan2d1_u35
// CHECK-AVX2: _ZGVdN4vv_Sleef_atan2d1_u35
}
void testatan2f1_u10() {
// CHECK-SSE2: testatan2f1_u10
// CHECK-AVX2: testatan2f1_u10
for(int i=0;i<N;i++) e[i] = Sleef_atan2f1_u10(f[i], g[i]);
// CHECK-SSE2: _ZGVbN4vv_Sleef_atan2f1_u10
// CHECK-AVX2: _ZGVdN8vv_Sleef_atan2f1_u10
}
void testatan2f1_u35() {
// CHECK-SSE2: testatan2f1_u35
// CHECK-AVX2: testatan2f1_u35
for(int i=0;i<N;i++) e[i] = Sleef_atan2f1_u35(f[i], g[i]);
// CHECK-SSE2: _ZGVbN4vv_Sleef_atan2f1_u35
// CHECK-AVX2: _ZGVdN8vv_Sleef_atan2f1_u35
}
void testsinhd1_u10() {
// CHECK-SSE2: testsinhd1_u10
// CHECK-AVX2: testsinhd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_sinhd1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_sinhd1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_sinhd1_u10
}
void testsinhd1_u35() {
// CHECK-SSE2: testsinhd1_u35
// CHECK-AVX2: testsinhd1_u35
for(int i=0;i<N;i++) a[i] = Sleef_sinhd1_u35(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_sinhd1_u35
// CHECK-AVX2: _ZGVdN4v_Sleef_sinhd1_u35
}
void testsinhf1_u10() {
// CHECK-SSE2: testsinhf1_u10
// CHECK-AVX2: testsinhf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_sinhf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_sinhf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_sinhf1_u10
}
void testsinhf1_u35() {
// CHECK-SSE2: testsinhf1_u35
// CHECK-AVX2: testsinhf1_u35
for(int i=0;i<N;i++) e[i] = Sleef_sinhf1_u35(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_sinhf1_u35
// CHECK-AVX2: _ZGVdN8v_Sleef_sinhf1_u35
}
void testcoshd1_u10() {
// CHECK-SSE2: testcoshd1_u10
// CHECK-AVX2: testcoshd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_coshd1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_coshd1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_coshd1_u10
}
void testcoshd1_u35() {
// CHECK-SSE2: testcoshd1_u35
// CHECK-AVX2: testcoshd1_u35
for(int i=0;i<N;i++) a[i] = Sleef_coshd1_u35(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_coshd1_u35
// CHECK-AVX2: _ZGVdN4v_Sleef_coshd1_u35
}
void testcoshf1_u10() {
// CHECK-SSE2: testcoshf1_u10
// CHECK-AVX2: testcoshf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_coshf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_coshf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_coshf1_u10
}
void testcoshf1_u35() {
// CHECK-SSE2: testcoshf1_u35
// CHECK-AVX2: testcoshf1_u35
for(int i=0;i<N;i++) e[i] = Sleef_coshf1_u35(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_coshf1_u35
// CHECK-AVX2: _ZGVdN8v_Sleef_coshf1_u35
}
void testtanhd1_u10() {
// CHECK-SSE2: testtanhd1_u10
// CHECK-AVX2: testtanhd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_tanhd1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_tanhd1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_tanhd1_u10
}
void testtanhd1_u35() {
// CHECK-SSE2: testtanhd1_u35
// CHECK-AVX2: testtanhd1_u35
for(int i=0;i<N;i++) a[i] = Sleef_tanhd1_u35(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_tanhd1_u35
// CHECK-AVX2: _ZGVdN4v_Sleef_tanhd1_u35
}
void testtanhf1_u10() {
// CHECK-SSE2: testtanhf1_u10
// CHECK-AVX2: testtanhf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_tanhf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_tanhf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_tanhf1_u10
}
void testtanhf1_u35() {
// CHECK-SSE2: testtanhf1_u35
// CHECK-AVX2: testtanhf1_u35
for(int i=0;i<N;i++) e[i] = Sleef_tanhf1_u35(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_tanhf1_u35
// CHECK-AVX2: _ZGVdN8v_Sleef_tanhf1_u35
}
void testasinhd1_u10() {
// CHECK-SSE2: testasinhd1_u10
// CHECK-AVX2: testasinhd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_asinhd1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_asinhd1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_asinhd1_u10
}
void testasinhf1_u10() {
// CHECK-SSE2: testasinhf1_u10
// CHECK-AVX2: testasinhf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_asinhf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_asinhf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_asinhf1_u10
}
void testacoshd1_u10() {
// CHECK-SSE2: testacoshd1_u10
// CHECK-AVX2: testacoshd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_acoshd1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_acoshd1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_acoshd1_u10
}
void testacoshf1_u10() {
// CHECK-SSE2: testacoshf1_u10
// CHECK-AVX2: testacoshf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_acoshf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_acoshf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_acoshf1_u10
}
void testatanhd1_u10() {
// CHECK-SSE2: testatanhd1_u10
// CHECK-AVX2: testatanhd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_atanhd1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_atanhd1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_atanhd1_u10
}
void testatanhf1_u10() {
// CHECK-SSE2: testatanhf1_u10
// CHECK-AVX2: testatanhf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_atanhf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_atanhf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_atanhf1_u10
}
void testlogd1_u10() {
// CHECK-SSE2: testlogd1_u10
// CHECK-AVX2: testlogd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_logd1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_logd1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_logd1_u10
}
void testlogd1_u35() {
// CHECK-SSE2: testlogd1_u35
// CHECK-AVX2: testlogd1_u35
for(int i=0;i<N;i++) a[i] = Sleef_logd1_u35(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_logd1_u35
// CHECK-AVX2: _ZGVdN4v_Sleef_logd1_u35
}
void testlogf1_u10() {
// CHECK-SSE2: testlogf1_u10
// CHECK-AVX2: testlogf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_logf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_logf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_logf1_u10
}
void testlogf1_u35() {
// CHECK-SSE2: testlogf1_u35
// CHECK-AVX2: testlogf1_u35
for(int i=0;i<N;i++) e[i] = Sleef_logf1_u35(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_logf1_u35
// CHECK-AVX2: _ZGVdN8v_Sleef_logf1_u35
}
void testlog2d1_u10() {
// CHECK-SSE2: testlog2d1_u10
// CHECK-AVX2: testlog2d1_u10
for(int i=0;i<N;i++) a[i] = Sleef_log2d1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_log2d1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_log2d1_u10
}
void testlog2f1_u10() {
// CHECK-SSE2: testlog2f1_u10
// CHECK-AVX2: testlog2f1_u10
for(int i=0;i<N;i++) e[i] = Sleef_log2f1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_log2f1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_log2f1_u10
}
void testlog10d1_u10() {
// CHECK-SSE2: testlog10d1_u10
// CHECK-AVX2: testlog10d1_u10
for(int i=0;i<N;i++) a[i] = Sleef_log10d1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_log10d1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_log10d1_u10
}
void testlog10f1_u10() {
// CHECK-SSE2: testlog10f1_u10
// CHECK-AVX2: testlog10f1_u10
for(int i=0;i<N;i++) e[i] = Sleef_log10f1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_log10f1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_log10f1_u10
}
void testlog1pd1_u10() {
// CHECK-SSE2: testlog1pd1_u10
// CHECK-AVX2: testlog1pd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_log1pd1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_log1pd1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_log1pd1_u10
}
void testlog1pf1_u10() {
// CHECK-SSE2: testlog1pf1_u10
// CHECK-AVX2: testlog1pf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_log1pf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_log1pf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_log1pf1_u10
}
void testexpd1_u10() {
// CHECK-SSE2: testexpd1_u10
// CHECK-AVX2: testexpd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_expd1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_expd1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_expd1_u10
}
void testexpf1_u10() {
// CHECK-SSE2: testexpf1_u10
// CHECK-AVX2: testexpf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_expf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_expf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_expf1_u10
}
void testexp2d1_u10() {
// CHECK-SSE2: testexp2d1_u10
// CHECK-AVX2: testexp2d1_u10
for(int i=0;i<N;i++) a[i] = Sleef_exp2d1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_exp2d1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_exp2d1_u10
}
void testexp2f1_u10() {
// CHECK-SSE2: testexp2f1_u10
// CHECK-AVX2: testexp2f1_u10
for(int i=0;i<N;i++) e[i] = Sleef_exp2f1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_exp2f1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_exp2f1_u10
}
void testexp10d1_u10() {
// CHECK-SSE2: testexp10d1_u10
// CHECK-AVX2: testexp10d1_u10
for(int i=0;i<N;i++) a[i] = Sleef_exp10d1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_exp10d1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_exp10d1_u10
}
void testexp10f1_u10() {
// CHECK-SSE2: testexp10f1_u10
// CHECK-AVX2: testexp10f1_u10
for(int i=0;i<N;i++) e[i] = Sleef_exp10f1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_exp10f1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_exp10f1_u10
}
void testexpm1d1_u10() {
// CHECK-SSE2: testexpm1d1_u10
// CHECK-AVX2: testexpm1d1_u10
for(int i=0;i<N;i++) a[i] = Sleef_expm1d1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_expm1d1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_expm1d1_u10
}
void testexpm1f1_u10() {
// CHECK-SSE2: testexpm1f1_u10
// CHECK-AVX2: testexpm1f1_u10
for(int i=0;i<N;i++) e[i] = Sleef_expm1f1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_expm1f1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_expm1f1_u10
}
void testpowd1_u10() {
// CHECK-SSE2: testpowd1_u10
// CHECK-AVX2: testpowd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_powd1_u10(b[i], c[i]);
// CHECK-SSE2: _ZGVbN2vv_Sleef_powd1_u10
// CHECK-AVX2: _ZGVdN4vv_Sleef_powd1_u10
}
void testpowf1_u10() {
// CHECK-SSE2: testpowf1_u10
// CHECK-AVX2: testpowf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_powf1_u10(f[i], g[i]);
// CHECK-SSE2: _ZGVbN4vv_Sleef_powf1_u10
// CHECK-AVX2: _ZGVdN8vv_Sleef_powf1_u10
}
void testcbrtd1_u10() {
// CHECK-SSE2: testcbrtd1_u10
// CHECK-AVX2: testcbrtd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_cbrtd1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_cbrtd1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_cbrtd1_u10
}
void testcbrtd1_u35() {
// CHECK-SSE2: testcbrtd1_u35
// CHECK-AVX2: testcbrtd1_u35
for(int i=0;i<N;i++) a[i] = Sleef_cbrtd1_u35(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_cbrtd1_u35
// CHECK-AVX2: _ZGVdN4v_Sleef_cbrtd1_u35
}
void testcbrtf1_u10() {
// CHECK-SSE2: testcbrtf1_u10
// CHECK-AVX2: testcbrtf1_u10
for(int i=0;i<N;i++) e[i] = Sleef_cbrtf1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_cbrtf1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_cbrtf1_u10
}
void testcbrtf1_u35() {
// CHECK-SSE2: testcbrtf1_u35
// CHECK-AVX2: testcbrtf1_u35
for(int i=0;i<N;i++) e[i] = Sleef_cbrtf1_u35(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_cbrtf1_u35
// CHECK-AVX2: _ZGVdN8v_Sleef_cbrtf1_u35
}
void testhypotd1_u05() {
// CHECK-SSE2: testhypotd1_u05
// CHECK-AVX2: testhypotd1_u05
for(int i=0;i<N;i++) a[i] = Sleef_hypotd1_u05(b[i], c[i]);
// CHECK-SSE2: _ZGVbN2vv_Sleef_hypotd1_u05
// CHECK-AVX2: _ZGVdN4vv_Sleef_hypotd1_u05
}
void testhypotd1_u35() {
// CHECK-SSE2: testhypotd1_u35
// CHECK-AVX2: testhypotd1_u35
for(int i=0;i<N;i++) a[i] = Sleef_hypotd1_u35(b[i], c[i]);
// CHECK-SSE2: _ZGVbN2vv_Sleef_hypotd1_u35
// CHECK-AVX2: _ZGVdN4vv_Sleef_hypotd1_u35
}
void testhypotf1_u05() {
// CHECK-SSE2: testhypotf1_u05
// CHECK-AVX2: testhypotf1_u05
for(int i=0;i<N;i++) e[i] = Sleef_hypotf1_u05(f[i], g[i]);
// CHECK-SSE2: _ZGVbN4vv_Sleef_hypotf1_u05
// CHECK-AVX2: _ZGVdN8vv_Sleef_hypotf1_u05
}
void testhypotf1_u35() {
// CHECK-SSE2: testhypotf1_u35
// CHECK-AVX2: testhypotf1_u35
for(int i=0;i<N;i++) e[i] = Sleef_hypotf1_u35(f[i], g[i]);
// CHECK-SSE2: _ZGVbN4vv_Sleef_hypotf1_u35
// CHECK-AVX2: _ZGVdN8vv_Sleef_hypotf1_u35
}
void testerfd1_u10() {
// CHECK-SSE2: testerfd1_u10
// CHECK-AVX2: testerfd1_u10
for(int i=0;i<N;i++) a[i] = Sleef_erfd1_u10(b[i]);
// CHECK-SSE2: _ZGVbN2v_Sleef_erfd1_u10
// CHECK-AVX2: _ZGVdN4v_Sleef_erfd1_u10
}
void testerff1_u10() {
// CHECK-SSE2: testerff1_u10
// CHECK-AVX2: testerff1_u10
for(int i=0;i<N;i++) e[i] = Sleef_erff1_u10(f[i]);
// CHECK-SSE2: _ZGVbN4v_Sleef_erff1_u10
// CHECK-AVX2: _ZGVdN8v_Sleef_erff1_u10
}
void testfmodd1() {
// CHECK-SSE2: testfmodd1
// CHECK-AVX2: testfmodd1
for(int i=0;i<N;i++) a[i] = Sleef_fmodd1(b[i], c[i]);
// CHECK-SSE2: _ZGVbN2vv_Sleef_fmodd1
// CHECK-AVX2: _ZGVdN4vv_Sleef_fmodd1
}
void testfmodf1() {
// CHECK-SSE2: testfmodf1
// CHECK-AVX2: testfmodf1
for(int i=0;i<N;i++) e[i] = Sleef_fmodf1(f[i], g[i]);
// CHECK-SSE2: _ZGVbN4vv_Sleef_fmodf1
// CHECK-AVX2: _ZGVdN8vv_Sleef_fmodf1
}
void testremainderd1() {
// CHECK-SSE2: testremainderd1
// CHECK-AVX2: testremainderd1
for(int i=0;i<N;i++) a[i] = Sleef_remainderd1(b[i], c[i]);
// CHECK-SSE2: _ZGVbN2vv_Sleef_remainderd1
// CHECK-AVX2: _ZGVdN4vv_Sleef_remainderd1
}
void testremainderf1() {
// CHECK-SSE2: testremainderf1
// CHECK-AVX2: testremainderf1
for(int i=0;i<N;i++) e[i] = Sleef_remainderf1(f[i], g[i]);
// CHECK-SSE2: _ZGVbN4vv_Sleef_remainderf1
// CHECK-AVX2: _ZGVdN8vv_Sleef_remainderf1
}

View File

@@ -0,0 +1,714 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
/// This program makes sure that all the symbols that a
/// GNUABI-compatible compiler (clang or gcc) can generate when
/// vectorizing function calls from `#include <math.h>` are present in
/// `libsleefgnuabi.so`.
///
/// The header `math.h` is not the same on all systems, and different
/// macros can activate different sets of functions. The list provided
/// here should cover the union of all possible systems that we want
/// to support. In particular, the test is checking that the "finite"
/// symbols from `#include <bits/math-finite.h>` are present for
/// those systems supporting them.
#include <setjmp.h>
#include <stdio.h>
#include <string.h>
#if defined(ENABLE_SSE4) || defined(ENABLE_SSE2)
#include <x86intrin.h>
#define ISA_TOKEN b
#define VLEN_SP 4
#define VLEN_DP 2
#define VECTOR_CC
typedef __m128i vopmask;
typedef __m128d vdouble;
typedef __m128 vfloat;
typedef __m128i vint;
typedef __m128i vint2;
#endif /* defined(ENABLE_SSE4) || defined(ENABLE_SSE2) */
#ifdef ENABLE_AVX
#include <x86intrin.h>
#define ISA_TOKEN c
#define VLEN_SP 8
#define VLEN_DP 4
#define VECTOR_CC
typedef __m256i vopmask;
typedef __m256d vdouble;
typedef __m256 vfloat;
typedef __m128i vint;
typedef struct { __m128i x, y; } vint2;
#endif /* ENABLE_AVX */
#ifdef ENABLE_AVX2
#include <x86intrin.h>
#define ISA_TOKEN d
#define VLEN_SP 8
#define VLEN_DP 4
#define VECTOR_CC
typedef __m256i vopmask;
typedef __m256d vdouble;
typedef __m256 vfloat;
typedef __m128i vint;
typedef __m256i vint2;
#endif /* ENABLE_AVX2 */
#ifdef ENABLE_AVX512F
#include <x86intrin.h>
#define ISA_TOKEN e
#define VLEN_SP 16
#define VLEN_DP 8
#define VECTOR_CC
typedef __mmask16 vopmask;
typedef __m512d vdouble;
typedef __m512 vfloat;
typedef __m256i vint;
typedef __m512i vint2;
#endif /* ENABLE_AVX512F */
#ifdef ENABLE_ADVSIMD
#include <arm_neon.h>
#define ISA_TOKEN n
#define VLEN_DP 2
#define VLEN_SP 4
#ifdef ENABLE_AAVPCS
#define VECTOR_CC __attribute__((aarch64_vector_pcs))
#else
#define VECTOR_CC
#endif
typedef uint32x4_t vopmask;
typedef float64x2_t vdouble;
typedef float32x4_t vfloat;
typedef int32x2_t vint;
typedef int32x4_t vint2;
#endif /* ENABLE_ADVSIMD */
#ifdef ENABLE_SVE
#include <arm_sve.h>
#define ISA_TOKEN s
#define VLEN_SP (svcntw())
#define VLEN_DP (svcntd())
#define VLA_TOKEN x
#define VECTOR_CC
typedef svbool_t vopmask;
typedef svfloat64_t vdouble;
typedef svfloat32_t vfloat;
typedef svint32_t vint;
typedef svint32_t vint2;
#endif /* ENABLE_SVE */
// GNUABI name mangling macro.
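// For example, __MAKE_FN_NAME(sin, d, 4, v) expands to _ZGVdN4v_sin:
// ISA token 'd' (AVX2), 'N' for the unmasked variant (the masked branch
// below uses 'M'), vector length 4, and one vector parameter 'v'.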
#ifndef MASKED_GNUABI
#define __MAKE_FN_NAME(name, t, vl, p) _ZGV##t##N##vl##p##_##name
#define __DECLARE_vd_vd(name, t, vl, p) \
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble)
#define __CALL_vd_vd(name, t, vl, p) \
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1); } while(0)
#define __DECLARE_vi_vd(name, t, vl, p) \
extern vint VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble)
#define __CALL_vi_vd(name, t, vl, p) \
do { vi0 = __MAKE_FN_NAME(name, t, vl, p)(vd1); } while(0)
#define __DECLARE_vd_vd_vi(name, t, vl, p) \
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vint)
#define __CALL_vd_vd_vi(name, t, vl, p) \
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vi2); } while(0)
#define __DECLARE_vd_vd_vd(name, t, vl, p) \
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble)
#define __CALL_vd_vd_vd(name, t, vl, p) \
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2); } while(0)
#define __DECLARE_vd_vd_vd_vd(name, t, vl, p) \
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble, vdouble)
#define __CALL_vd_vd_vd_vd(name, t, vl, p) \
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2, vd3); } while(0)
#define __DECLARE_vd_vd_pvd(name, t, vl, p) \
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *)
#define __CALL_vd_vd_pvd(name, t, vl, p) \
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, &vd2); } while(0)
#define __DECLARE_v_vd_pvd_pvd(name, t, vl, p) \
extern void VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *, vdouble *)
#define __CALL_v_vd_pvd_pvd(name, t, vl, p) \
do { __MAKE_FN_NAME(name, t, vl, p)(vd0, &vd1, &vd2); } while(0)
#define __DECLARE_vf_vf(name, t, vl, p) \
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat)
#define __CALL_vf_vf(name, t, vl, p) \
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1); } while(0)
#define __DECLARE_vf_vf_vf(name, t, vl, p) \
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat)
#define __CALL_vf_vf_vf(name, t, vl, p) \
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vf2); } while(0)
#define __DECLARE_vf_vf_vf_vf(name, t, vl, p) \
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat, vfloat)
#define __CALL_vf_vf_vf_vf(name, t, vl, p) \
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vf2, vf3); } while(0)
#define __DECLARE_vf_vf_pvf(name, t, vl, p) \
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *)
#define __CALL_vf_vf_pvf(name, t, vl, p) \
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, &vf2); } while(0)
#define __DECLARE_vi_vf(name, t, vl, p) \
extern vint2 VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat)
#define __CALL_vi_vf(name, t, vl, p) \
do { vi20 = __MAKE_FN_NAME(name, t, vl, p)(vf1); } while(0)
#define __DECLARE_vf_vf_vi(name, t, vl, p) \
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vint2)
#define __CALL_vf_vf_vi(name, t, vl, p) \
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vi22); } while(0)
#define __DECLARE_v_vf_pvf_pvf(name, t, vl, p) \
extern void VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *, vfloat *)
#define __CALL_v_vf_pvf_pvf(name, t, vl, p) \
do { __MAKE_FN_NAME(name, t, vl, p)(vf0, &vf1, &vf2); } while(0)
#else /******************** MASKED_GNUABI *****************************/
#define __MAKE_FN_NAME(name, t, vl, p) _ZGV##t##M##vl##p##_##name
#define __DECLARE_vd_vd(name, t, vl, p) \
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vopmask)
#define __CALL_vd_vd(name, t, vl, p) \
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, mask); } while(0)
#define __DECLARE_vi_vd(name, t, vl, p) \
extern vint VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vopmask)
#define __CALL_vi_vd(name, t, vl, p) \
do { vi0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, mask); } while(0)
#define __DECLARE_vd_vd_vi(name, t, vl, p) \
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vint, vopmask)
#define __CALL_vd_vd_vi(name, t, vl, p) \
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vi2, mask); } while(0)
#define __DECLARE_vd_vd_vd(name, t, vl, p) \
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble, vopmask)
#define __CALL_vd_vd_vd(name, t, vl, p) \
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2, mask); } while(0)
#define __DECLARE_vd_vd_vd_vd(name, t, vl, p) \
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble, vdouble, vopmask)
#define __CALL_vd_vd_vd_vd(name, t, vl, p) \
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2, vd3, mask); } while(0)
#define __DECLARE_vd_vd_pvd(name, t, vl, p) \
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *, vopmask)
#define __CALL_vd_vd_pvd(name, t, vl, p) \
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, &vd2, mask); } while(0)
#define __DECLARE_v_vd_pvd_pvd(name, t, vl, p) \
extern void VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *, vdouble *, vopmask)
#define __CALL_v_vd_pvd_pvd(name, t, vl, p) \
do { __MAKE_FN_NAME(name, t, vl, p)(vd0, &vd1, &vd2, mask); } while(0)
#define __DECLARE_vf_vf(name, t, vl, p) \
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vopmask)
#define __CALL_vf_vf(name, t, vl, p) \
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, mask); } while(0)
#define __DECLARE_vf_vf_vf(name, t, vl, p) \
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat, vopmask)
#define __CALL_vf_vf_vf(name, t, vl, p) \
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vf2, mask); } while(0)
#define __DECLARE_vf_vf_vf_vf(name, t, vl, p) \
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat, vfloat, vopmask)
#define __CALL_vf_vf_vf_vf(name, t, vl, p) \
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vf2, vf3, mask); } while(0)
#define __DECLARE_vf_vf_pvf(name, t, vl, p) \
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *, vopmask)
#define __CALL_vf_vf_pvf(name, t, vl, p) \
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, &vf2, mask); } while(0)
#define __DECLARE_vi_vf(name, t, vl, p) \
extern vint2 VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vopmask)
#define __CALL_vi_vf(name, t, vl, p) \
do { vi20 = __MAKE_FN_NAME(name, t, vl, p)(vf1, mask); } while(0)
#define __DECLARE_vf_vf_vi(name, t, vl, p) \
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vint2, vopmask)
#define __CALL_vf_vf_vi(name, t, vl, p) \
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vi22, mask); } while(0)
#define __DECLARE_v_vf_pvf_pvf(name, t, vl, p) \
extern void VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *, vfloat *, vopmask)
#define __CALL_v_vf_pvf_pvf(name, t, vl, p) \
do { __MAKE_FN_NAME(name, t, vl, p)(vf0, &vf1, &vf2, mask); } while(0)
#endif /* MASKED_GNUABI */
// Level-1 expansion macros for declaration and call. The signature of
// each function has three input parameters to avoid segfaults in
// sincos-like functions that are effectively loading data from
// memory.
// Make sure that the architectural macros are defined for each vector
// extension.
#ifndef ISA_TOKEN
#error "Missing ISA token"
#endif
#ifndef VLEN_DP
#error "Missing VLEN_DP"
#endif
#ifndef VLEN_SP
#error "Missing VLEN_SP"
#endif
#if defined(ENABLE_SVE) && !defined(VLA_TOKEN)
#error "Missing VLA_TOKEN"
#endif /* defined(ENABLE_SVE) && !defined(VLA_TOKEN) */
// Declaration and call, first-level expansion to pick up the
// ISA_TOKEN and VLEN_* architectural macros.
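// For instance, with ENABLE_AVX2 defined, DECLARE_DP_vd_vd(sin, v)
// declares extern vdouble _ZGVdN4v_sin(vdouble), and CALL_DP_vd_vd(sin, v)
// invokes it.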
#ifndef ENABLE_SVE
#define DECLARE_DP_vd_vd(name, p) __DECLARE_vd_vd(name, ISA_TOKEN, VLEN_DP, p)
#define CALL_DP_vd_vd(name, p) __CALL_vd_vd(name, ISA_TOKEN, VLEN_DP, p)
#define DECLARE_DP_vd_vd_vd(name, p) __DECLARE_vd_vd_vd(name, ISA_TOKEN, VLEN_DP, p)
#define CALL_DP_vd_vd_vd(name, p) __CALL_vd_vd_vd(name, ISA_TOKEN, VLEN_DP, p)
#define DECLARE_DP_vd_vd_vd_vd(name, p) __DECLARE_vd_vd_vd_vd(name, ISA_TOKEN, VLEN_DP, p)
#define CALL_DP_vd_vd_vd_vd(name, p) __CALL_vd_vd_vd_vd(name, ISA_TOKEN, VLEN_DP, p)
#define DECLARE_DP_vd_vd_pvd(name, p) __DECLARE_vd_vd_pvd(name, ISA_TOKEN, VLEN_DP, p)
#define CALL_DP_vd_vd_pvd(name, p) __CALL_vd_vd_pvd(name, ISA_TOKEN, VLEN_DP, p)
#define DECLARE_DP_vi_vd(name, p) __DECLARE_vi_vd(name, ISA_TOKEN, VLEN_DP, p)
#define CALL_DP_vi_vd(name, p) __CALL_vi_vd(name, ISA_TOKEN, VLEN_DP, p)
#define DECLARE_DP_vd_vd_vi(name, p) __DECLARE_vd_vd_vi(name, ISA_TOKEN, VLEN_DP, p)
#define CALL_DP_vd_vd_vi(name, p) __CALL_vd_vd_vi(name, ISA_TOKEN, VLEN_DP, p)
#define DECLARE_DP_v_vd_pvd_pvd(name, p) __DECLARE_v_vd_pvd_pvd(name, ISA_TOKEN, VLEN_DP, p)
#define CALL_DP_v_vd_pvd_pvd(name, p) __CALL_v_vd_pvd_pvd(name, ISA_TOKEN, VLEN_DP, p)
#define DECLARE_SP_vf_vf(name, p) __DECLARE_vf_vf(name, ISA_TOKEN, VLEN_SP, p)
#define CALL_SP_vf_vf(name, p) __CALL_vf_vf(name, ISA_TOKEN, VLEN_SP, p)
#define DECLARE_SP_vf_vf_vf(name, p) __DECLARE_vf_vf_vf(name, ISA_TOKEN, VLEN_SP, p)
#define CALL_SP_vf_vf_vf(name, p) __CALL_vf_vf_vf(name, ISA_TOKEN, VLEN_SP, p)
#define DECLARE_SP_vf_vf_vf_vf(name, p) __DECLARE_vf_vf_vf_vf(name, ISA_TOKEN, VLEN_SP, p)
#define CALL_SP_vf_vf_vf_vf(name, p) __CALL_vf_vf_vf_vf(name, ISA_TOKEN, VLEN_SP, p)
#define DECLARE_SP_vf_vf_pvf(name, p) __DECLARE_vf_vf_pvf(name, ISA_TOKEN, VLEN_SP, p)
#define CALL_SP_vf_vf_pvf(name, p) __CALL_vf_vf_pvf(name, ISA_TOKEN, VLEN_SP, p)
#define DECLARE_SP_vi_vf(name, p) __DECLARE_vi_vf(name, ISA_TOKEN, VLEN_SP, p)
#define CALL_SP_vi_vf(name, p) __CALL_vi_vf(name, ISA_TOKEN, VLEN_SP, p)
#define DECLARE_SP_vf_vf_vi(name, p) __DECLARE_vf_vf_vi(name, ISA_TOKEN, VLEN_SP, p)
#define CALL_SP_vf_vf_vi(name, p) __CALL_vf_vf_vi(name, ISA_TOKEN, VLEN_SP, p)
#define DECLARE_SP_v_vf_pvf_pvf(name, p) __DECLARE_v_vf_pvf_pvf(name, ISA_TOKEN, VLEN_SP, p)
#define CALL_SP_v_vf_pvf_pvf(name, p) __CALL_v_vf_pvf_pvf(name, ISA_TOKEN, VLEN_SP, p)
#else /* ENABLE_SVE */
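// In the SVE variants, each CALL_* macro is followed by an svst1_* store of
// the result into outbuf. SVE vectors are sizeless types that cannot be kept
// in global variables, so storing through the volatile buffer is
// (presumably) what keeps the compiler from discarding the calls as dead
// code.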
#define DECLARE_DP_vd_vd(name, p) __DECLARE_vd_vd(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_DP_vd_vd(name, p) __CALL_vd_vd(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f64(svptrue_b8(), (double *)outbuf, vd0)
#define DECLARE_DP_vd_vd_vd(name, p) __DECLARE_vd_vd_vd(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_DP_vd_vd_vd(name, p) __CALL_vd_vd_vd(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f64(svptrue_b8(), (double *)outbuf, vd0)
#define DECLARE_DP_vd_vd_vd_vd(name, p) __DECLARE_vd_vd_vd_vd(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_DP_vd_vd_vd_vd(name, p) __CALL_vd_vd_vd_vd(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f64(svptrue_b8(), (double *)outbuf, vd0)
#define DECLARE_DP_vd_vd_pvd(name, p) __DECLARE_vd_vd_pvd(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_DP_vd_vd_pvd(name, p) __CALL_vd_vd_pvd(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f64(svptrue_b8(), (double *)outbuf, vd2)
#define DECLARE_DP_vi_vd(name, p) __DECLARE_vi_vd(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_DP_vi_vd(name, p) __CALL_vi_vd(name, ISA_TOKEN, VLA_TOKEN, p); svst1_s32(svptrue_b8(), (int *)outbuf, vi0)
#define DECLARE_DP_vd_vd_vi(name, p) __DECLARE_vd_vd_vi(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_DP_vd_vd_vi(name, p) __CALL_vd_vd_vi(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f64(svptrue_b8(), (double *)outbuf, vd0)
#define DECLARE_DP_v_vd_pvd_pvd(name, p) __DECLARE_v_vd_pvd_pvd(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_DP_v_vd_pvd_pvd(name, p) __CALL_v_vd_pvd_pvd(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f64(svptrue_b8(), (double *)outbuf, vd2)
#define DECLARE_SP_vf_vf(name, p) __DECLARE_vf_vf(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_SP_vf_vf(name, p) __CALL_vf_vf(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f32(svptrue_b8(), (float *)outbuf, vf0)
#define DECLARE_SP_vf_vf_vf(name, p) __DECLARE_vf_vf_vf(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_SP_vf_vf_vf(name, p) __CALL_vf_vf_vf(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f32(svptrue_b8(), (float *)outbuf, vf0)
#define DECLARE_SP_vf_vf_vf_vf(name, p) __DECLARE_vf_vf_vf_vf(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_SP_vf_vf_vf_vf(name, p) __CALL_vf_vf_vf_vf(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f32(svptrue_b8(), (float *)outbuf, vf0)
#define DECLARE_SP_vf_vf_pvf(name, p) __DECLARE_vf_vf_pvf(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_SP_vf_vf_pvf(name, p) __CALL_vf_vf_pvf(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f32(svptrue_b8(), (float *)outbuf, vf2)
#define DECLARE_SP_vi_vf(name, p) __DECLARE_vi_vf(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_SP_vi_vf(name, p) __CALL_vi_vf(name, ISA_TOKEN, VLA_TOKEN, p); svst1_s32(svptrue_b8(), (int *)outbuf, vi20)
#define DECLARE_SP_vf_vf_vi(name, p) __DECLARE_vf_vf_vi(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_SP_vf_vf_vi(name, p) __CALL_vf_vf_vi(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f32(svptrue_b8(), (float *)outbuf, vf0)
#define DECLARE_SP_v_vf_pvf_pvf(name, p) __DECLARE_v_vf_pvf_pvf(name, ISA_TOKEN, VLA_TOKEN, p)
#define CALL_SP_v_vf_pvf_pvf(name, p) __CALL_v_vf_pvf_pvf(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f32(svptrue_b8(), (float *)outbuf, vf2)
#endif /* ENABLE_SVE */
//
// Double precision function declarations.
DECLARE_DP_vd_vd(__acos_finite, v);
DECLARE_DP_vd_vd(__acosh_finite, v);
DECLARE_DP_vd_vd(__asin_finite, v);
DECLARE_DP_vd_vd_vd(__atan2_finite, vv);
DECLARE_DP_vd_vd(__atanh_finite, v);
DECLARE_DP_vd_vd(__cosh_finite, v);
DECLARE_DP_vd_vd(__exp10_finite, v);
DECLARE_DP_vd_vd(__exp2_finite, v);
DECLARE_DP_vd_vd(__exp_finite, v);
DECLARE_DP_vd_vd_vd(__fmod_finite, vv);
DECLARE_DP_vd_vd_pvd(__modf_finite, vl8);
DECLARE_DP_vd_vd_vd(__hypot_finite, vv);
DECLARE_DP_vd_vd(__log10_finite, v);
// DECLARE_DP_vd_vd(__log2_finite,v);
DECLARE_DP_vd_vd(__log_finite, v);
DECLARE_DP_vd_vd_vd(__pow_finite, vv);
DECLARE_DP_vd_vd(__sinh_finite, v);
DECLARE_DP_vd_vd(__sqrt_finite, v);
DECLARE_DP_vd_vd(acos, v);
DECLARE_DP_vd_vd(acosh, v);
DECLARE_DP_vd_vd(asin, v);
DECLARE_DP_vd_vd(asinh, v);
DECLARE_DP_vd_vd(atan, v);
DECLARE_DP_vd_vd_vd(atan2, vv);
DECLARE_DP_vd_vd(atanh, v);
DECLARE_DP_vd_vd(cbrt, v);
DECLARE_DP_vd_vd(ceil, v);
DECLARE_DP_vd_vd_vd(copysign, vv);
DECLARE_DP_vd_vd(cos, v);
DECLARE_DP_vd_vd(cosh, v);
DECLARE_DP_vd_vd(cospi, v);
DECLARE_DP_vd_vd(erf, v);
DECLARE_DP_vd_vd(erfc, v);
DECLARE_DP_vd_vd(exp, v);
DECLARE_DP_vd_vd(exp10, v);
DECLARE_DP_vd_vd(exp2, v);
DECLARE_DP_vi_vd(expfrexp, v);
DECLARE_DP_vd_vd(expm1, v);
DECLARE_DP_vd_vd(fabs, v);
DECLARE_DP_vd_vd_vd(fdim, vv);
DECLARE_DP_vd_vd(floor, v);
DECLARE_DP_vd_vd_vd_vd(fma, vvv);
DECLARE_DP_vd_vd_vd(fmax, vv);
DECLARE_DP_vd_vd_vd(fmin, vv);
DECLARE_DP_vd_vd_vd(fmod, vv);
DECLARE_DP_vd_vd(frfrexp, v);
DECLARE_DP_vd_vd_vd(hypot, vv);
DECLARE_DP_vi_vd(ilogb, v);
DECLARE_DP_vd_vd_vi(ldexp, vv);
DECLARE_DP_vd_vd(lgamma, v);
DECLARE_DP_vd_vd(log, v);
DECLARE_DP_vd_vd(log10, v);
DECLARE_DP_vd_vd(log1p, v);
DECLARE_DP_vd_vd(log2, v);
DECLARE_DP_vd_vd_pvd(modf, vl8);
DECLARE_DP_vd_vd_vd(nextafter, vv);
DECLARE_DP_vd_vd_vd(pow, vv);
DECLARE_DP_vd_vd(rint, v);
DECLARE_DP_vd_vd(round, v);
DECLARE_DP_vd_vd(sin, v);
DECLARE_DP_v_vd_pvd_pvd(sincos, vl8l8);
DECLARE_DP_v_vd_pvd_pvd(sincospi, vl8l8);
DECLARE_DP_vd_vd(sinh, v);
DECLARE_DP_vd_vd(sinpi, v);
DECLARE_DP_vd_vd(sqrt, v);
DECLARE_DP_vd_vd(tan, v);
DECLARE_DP_vd_vd(tanh, v);
DECLARE_DP_vd_vd(tgamma, v);
DECLARE_DP_vd_vd(trunc, v);
// Single precision function declarations.
DECLARE_SP_vf_vf(__acosf_finite, v);
DECLARE_SP_vf_vf(__acoshf_finite, v);
DECLARE_SP_vf_vf(__asinf_finite, v);
DECLARE_SP_vf_vf_vf(__atan2f_finite, vv);
DECLARE_SP_vf_vf(__atanhf_finite, v);
DECLARE_SP_vf_vf(__coshf_finite, v);
DECLARE_SP_vf_vf(__exp10f_finite, v);
DECLARE_SP_vf_vf(__exp2f_finite, v);
DECLARE_SP_vf_vf(__expf_finite, v);
DECLARE_SP_vf_vf_vf(__fmodf_finite, vv);
DECLARE_SP_vf_vf_pvf(__modff_finite, vl4);
DECLARE_SP_vf_vf_vf(__hypotf_finite, vv);
DECLARE_SP_vf_vf(__log10f_finite, v);
// DECLARE_SP_vf_vf(__log2f_finite,v);
DECLARE_SP_vf_vf(__logf_finite, v);
DECLARE_SP_vf_vf_vf(__powf_finite, vv);
DECLARE_SP_vf_vf(__sinhf_finite, v);
DECLARE_SP_vf_vf(__sqrtf_finite, v);
DECLARE_SP_vf_vf(acosf, v);
DECLARE_SP_vf_vf(acoshf, v);
DECLARE_SP_vf_vf(asinf, v);
DECLARE_SP_vf_vf(asinhf, v);
DECLARE_SP_vf_vf(atanf, v);
DECLARE_SP_vf_vf_vf(atan2f, vv);
DECLARE_SP_vf_vf(atanhf, v);
DECLARE_SP_vf_vf(cbrtf, v);
DECLARE_SP_vf_vf(ceilf, v);
DECLARE_SP_vf_vf_vf(copysignf, vv);
DECLARE_SP_vf_vf(cosf, v);
DECLARE_SP_vf_vf(coshf, v);
DECLARE_SP_vf_vf(cospif, v);
DECLARE_SP_vf_vf(erff, v);
DECLARE_SP_vf_vf(erfcf, v);
DECLARE_SP_vf_vf(expf, v);
DECLARE_SP_vf_vf(exp10f, v);
DECLARE_SP_vf_vf(exp2f, v);
DECLARE_SP_vf_vf(expm1f, v);
DECLARE_SP_vf_vf(fabsf, v);
DECLARE_SP_vf_vf_vf(fdimf, vv);
DECLARE_SP_vf_vf(floorf, v);
DECLARE_SP_vf_vf_vf_vf(fmaf, vvv);
DECLARE_SP_vf_vf_vf(fmaxf, vv);
DECLARE_SP_vf_vf_vf(fminf, vv);
DECLARE_SP_vf_vf_vf(fmodf, vv);
DECLARE_SP_vf_vf(frfrexpf, v);
DECLARE_SP_vf_vf_vf(hypotf, vv);
#ifndef ENABLE_AVX
// These two functions are not checked in some configurations due to
// the issue in https://github.com/shibatch/sleef/issues/221
DECLARE_SP_vi_vf(expfrexpf, v);
DECLARE_SP_vi_vf(ilogbf, v);
#endif
DECLARE_SP_vf_vf_vi(ldexpf, vv);
DECLARE_SP_vf_vf(lgammaf, v);
DECLARE_SP_vf_vf(logf, v);
DECLARE_SP_vf_vf(log10f, v);
DECLARE_SP_vf_vf(log1pf, v);
DECLARE_SP_vf_vf(log2f, v);
DECLARE_SP_vf_vf_pvf(modff, vl4);
DECLARE_SP_vf_vf_vf(nextafterf, vv);
DECLARE_SP_vf_vf_vf(powf, vv);
DECLARE_SP_vf_vf(rintf, v);
DECLARE_SP_vf_vf(roundf, v);
DECLARE_SP_vf_vf(sinf, v);
DECLARE_SP_v_vf_pvf_pvf(sincosf, vl4l4);
DECLARE_SP_v_vf_pvf_pvf(sincospif, vl4l4);
DECLARE_SP_vf_vf(sinhf, v);
DECLARE_SP_vf_vf(sinpif, v);
DECLARE_SP_vf_vf(sqrtf, v);
DECLARE_SP_vf_vf(tanf, v);
DECLARE_SP_vf_vf(tanhf, v);
DECLARE_SP_vf_vf(tgammaf, v);
DECLARE_SP_vf_vf(truncf, v);
#ifndef ENABLE_SVE
vdouble vd0, vd1, vd2, vd3;
vfloat vf0, vf1, vf2, vf3;
vint vi0, vi1, vi2, vi3;
vint2 vi20, vi21, vi22, vi23;
vopmask mask;
#else
volatile char outbuf[1024];
#endif
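// check_feature() appears to be the configure-time hook: it issues a single
// representative call, so successfully compiling and linking this file
// demonstrates both compiler support for the vector extension and the
// presence of the expected GNUABI symbols.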
int check_feature(double d, float f) {
#ifdef ENABLE_SVE
vdouble vd0 = svdup_n_f64(d), vd1 = svdup_n_f64(d);
#ifdef MASKED_GNUABI
vopmask mask = svcmpne_s32(svptrue_b8(), svdup_n_s32(f), svdup_n_s32(0));
#endif
#endif
CALL_DP_vd_vd(__acos_finite, v);
#ifdef ENABLE_SVE
svst1_f64(svptrue_b8(), (double *)outbuf, vd0);
#endif
return 1;
}
int main2(int argc, char **argv) {
#ifdef ENABLE_SVE
vdouble vd0 = svdup_n_f64(argc), vd1 = svdup_n_f64(argc), vd2 = svdup_n_f64(argc), vd3 = svdup_n_f64(argc);
vfloat vf0 = svdup_n_f32(argc), vf1 = svdup_n_f32(argc), vf2 = svdup_n_f32(argc), vf3 = svdup_n_f32(argc);
vint vi0 = svdup_n_s32(argc), vi2 = svdup_n_s32(argc);
vint2 vi20 = svdup_n_s32(argc), vi22 = svdup_n_s32(argc);
#ifdef MASKED_GNUABI
vopmask mask = svcmpne_s32(svptrue_b8(), svdup_n_s32(argc), svdup_n_s32(0));
#endif
#endif
// Double precision function call.
CALL_DP_vd_vd(__acos_finite, v);
CALL_DP_vd_vd(__acosh_finite, v);
CALL_DP_vd_vd(__asin_finite, v);
CALL_DP_vd_vd_vd(__atan2_finite, vv);
CALL_DP_vd_vd(__atanh_finite, v);
CALL_DP_vd_vd(__cosh_finite, v);
CALL_DP_vd_vd(__exp10_finite, v);
CALL_DP_vd_vd(__exp2_finite, v);
CALL_DP_vd_vd(__exp_finite, v);
CALL_DP_vd_vd_vd(__fmod_finite, vv);
CALL_DP_vd_vd_pvd(__modf_finite, vl8);
CALL_DP_vd_vd_vd(__hypot_finite, vv);
CALL_DP_vd_vd(__log10_finite, v);
// CALL_DP_vd_vd(__log2_finite,v);
CALL_DP_vd_vd(__log_finite, v);
CALL_DP_vd_vd_vd(__pow_finite, vv);
CALL_DP_vd_vd(__sinh_finite, v);
CALL_DP_vd_vd(__sqrt_finite, v);
CALL_DP_vd_vd(acos, v);
CALL_DP_vd_vd(acosh, v);
CALL_DP_vd_vd(asin, v);
CALL_DP_vd_vd(asinh, v);
CALL_DP_vd_vd(atan, v);
CALL_DP_vd_vd_vd(atan2, vv);
CALL_DP_vd_vd(atanh, v);
CALL_DP_vd_vd(cbrt, v);
CALL_DP_vd_vd(ceil, v);
CALL_DP_vd_vd_vd(copysign, vv);
CALL_DP_vd_vd(cos, v);
CALL_DP_vd_vd(cosh, v);
CALL_DP_vd_vd(cospi, v);
CALL_DP_vd_vd(erf, v);
CALL_DP_vd_vd(erfc, v);
CALL_DP_vd_vd(exp, v);
CALL_DP_vd_vd(exp10, v);
CALL_DP_vd_vd(exp2, v);
CALL_DP_vi_vd(expfrexp, v);
CALL_DP_vd_vd(expm1, v);
CALL_DP_vd_vd(fabs, v);
CALL_DP_vd_vd_vd(fdim, vv);
CALL_DP_vd_vd(floor, v);
CALL_DP_vd_vd_vd_vd(fma, vvv);
CALL_DP_vd_vd_vd(fmax, vv);
CALL_DP_vd_vd_vd(fmin, vv);
CALL_DP_vd_vd_vd(fmod, vv);
CALL_DP_vd_vd(frfrexp, v);
CALL_DP_vd_vd_vd(hypot, vv);
CALL_DP_vi_vd(ilogb, v);
CALL_DP_vd_vd_vi(ldexp, vv);
CALL_DP_vd_vd(lgamma, v);
CALL_DP_vd_vd(log, v);
CALL_DP_vd_vd(log10, v);
CALL_DP_vd_vd(log1p, v);
CALL_DP_vd_vd(log2, v);
CALL_DP_vd_vd_pvd(modf, vl8);
CALL_DP_vd_vd_vd(nextafter, vv);
CALL_DP_vd_vd_vd(pow, vv);
CALL_DP_vd_vd(rint, v);
CALL_DP_vd_vd(round, v);
CALL_DP_vd_vd(sin, v);
CALL_DP_v_vd_pvd_pvd(sincos, vl8l8);
CALL_DP_v_vd_pvd_pvd(sincospi, vl8l8);
CALL_DP_vd_vd(sinh, v);
CALL_DP_vd_vd(sinpi, v);
CALL_DP_vd_vd(sqrt, v);
CALL_DP_vd_vd(tan, v);
CALL_DP_vd_vd(tanh, v);
CALL_DP_vd_vd(tgamma, v);
CALL_DP_vd_vd(trunc, v);
// Single precision function call.
CALL_SP_vf_vf(__acosf_finite, v);
CALL_SP_vf_vf(__acoshf_finite, v);
CALL_SP_vf_vf(__asinf_finite, v);
CALL_SP_vf_vf_vf(__atan2f_finite, vv);
CALL_SP_vf_vf(__atanhf_finite, v);
CALL_SP_vf_vf(__coshf_finite, v);
CALL_SP_vf_vf(__exp10f_finite, v);
CALL_SP_vf_vf(__exp2f_finite, v);
CALL_SP_vf_vf(__expf_finite, v);
CALL_SP_vf_vf_vf(__fmodf_finite, vv);
CALL_SP_vf_vf_pvf(__modff_finite, vl4);
CALL_SP_vf_vf_vf(__hypotf_finite, vv);
CALL_SP_vf_vf(__log10f_finite, v);
// CALL_SP_vf_vf(__log2f_finite,v);
CALL_SP_vf_vf(__logf_finite, v);
CALL_SP_vf_vf_vf(__powf_finite, vv);
CALL_SP_vf_vf(__sinhf_finite, v);
CALL_SP_vf_vf(__sqrtf_finite, v);
CALL_SP_vf_vf(acosf, v);
CALL_SP_vf_vf(acoshf, v);
CALL_SP_vf_vf(asinf, v);
CALL_SP_vf_vf(asinhf, v);
CALL_SP_vf_vf(atanf, v);
CALL_SP_vf_vf_vf(atan2f, vv);
CALL_SP_vf_vf(atanhf, v);
CALL_SP_vf_vf(cbrtf, v);
CALL_SP_vf_vf(ceilf, v);
CALL_SP_vf_vf_vf(copysignf, vv);
CALL_SP_vf_vf(cosf, v);
CALL_SP_vf_vf(coshf, v);
CALL_SP_vf_vf(cospif, v);
CALL_SP_vf_vf(erff, v);
CALL_SP_vf_vf(erfcf, v);
CALL_SP_vf_vf(expf, v);
CALL_SP_vf_vf(exp10f, v);
CALL_SP_vf_vf(exp2f, v);
CALL_SP_vf_vf(expm1f, v);
CALL_SP_vf_vf(fabsf, v);
CALL_SP_vf_vf_vf(fdimf, vv);
CALL_SP_vf_vf(floorf, v);
CALL_SP_vf_vf_vf_vf(fmaf, vvv);
CALL_SP_vf_vf_vf(fmaxf, vv);
CALL_SP_vf_vf_vf(fminf, vv);
CALL_SP_vf_vf_vf(fmodf, vv);
CALL_SP_vf_vf(frfrexpf, v);
CALL_SP_vf_vf_vf(hypotf, vv);
#ifndef ENABLE_AVX
// These two functions are not checked in some configurations due to
// the issue in https://github.com/shibatch/sleef/issues/221
CALL_SP_vi_vf(expfrexpf, v);
CALL_SP_vi_vf(ilogbf, v);
#endif
CALL_SP_vf_vf_vi(ldexpf, vv);
CALL_SP_vf_vf(lgammaf, v);
CALL_SP_vf_vf(logf, v);
CALL_SP_vf_vf(log10f, v);
CALL_SP_vf_vf(log1pf, v);
CALL_SP_vf_vf(log2f, v);
CALL_SP_vf_vf_pvf(modff, vl4);
CALL_SP_vf_vf_vf(nextafterf, vv);
CALL_SP_vf_vf_vf(powf, vv);
CALL_SP_vf_vf(rintf, v);
CALL_SP_vf_vf(roundf, v);
CALL_SP_vf_vf(sinf, v);
CALL_SP_v_vf_pvf_pvf(sincosf, vl4l4);
CALL_SP_v_vf_pvf_pvf(sincospif, vl4l4);
CALL_SP_vf_vf(sinhf, v);
CALL_SP_vf_vf(sinpif, v);
CALL_SP_vf_vf(sqrtf, v);
CALL_SP_vf_vf(tanf, v);
CALL_SP_vf_vf(tanhf, v);
CALL_SP_vf_vf(tgammaf, v);
CALL_SP_vf_vf(truncf, v);
return 0;
}

@@ -0,0 +1,129 @@
sin u35 bc50dfbcbd8ef534541d1babe90860c7
sin u10 dbc2cf81f292ef50fa0119e222c6c9f9
cos u35 506e34a809b80ad3603ed46ba2a574b0
cos u10 a0f69df5937152b8f8f0e671f3676289
tan u35 970b5cd7f0e05defa22ebb155ab61a40
tan u10 5fd08e0552e3ab853439bf5fd2bd344d
sincos u10 7c164edcaa45988f6165b653fc76c495
sincos u35 38fe7e261e184ed8dbf432ce6bedc5c4
sincospi u05 0c6fc00c7aaf0b6e67d66542d1be833d
sincospi u35 c428b0fc3e6c5be4d2c03dcd8bb27a7c
log u10 4855b27222d900bea47a27cadba71727
log u35 c95484de57c167da3d8d6d1baadf9ffa
log2 u10 2662df9af919680ca62e1752fb1b7539
log2 u35 1cd6d7f194a5e8364191497adc5c5cec
log10 u10 36645e8031d873d66fd0ec2c5959f273
log1p u10 1383924fb56cf2e7eda27de21320c591
exp u10 13692a48edf2cf7a3e047b16ddfb7b81
exp2 u10 436146f8d6dcaa4a754837108a9aa3e1
exp2 u35 8881d075d9101a1dfa3f6a10b9ee8373
exp10 u10 9d704b310f683872a6446cfc97726a4d
exp10 u35 bc07745ebc22a7ee97679154c24b23cc
expm1 u10 cd3f0b8e86943d52c278394b60e2d22e
pow u10 a0ea63b27d33262346a35c9439741075
cbrt u10 5d8bf28ac74624594fd1be9217817690
cbrt u10 3c896e03746bcf1b3f70182dfec3d93b
cbrt u35 73daa306764e208aab1627ac110b10d7
cbrt u35 c29b7bf200215425b4ba948c8cc94c42
hypot u05 cc2f18e409e19a02cadf7b91fd869120
hypot u35 5194e0a554174a6145511ce3df9c1f46
asin u10 86c061caec3fa2e1bc71bda4dad29f4c
asin u35 31303b88bdc00206265002d6cc5e89e4
acos u10 0a1a403590f2ac8364f132b334920945
acos u35 493f960c1cce57931d95a5a22a0587a3
atan u10 c97624a24ec034cc0c8985acb61d13cd
atan u10 0be0f550406923016cfeb5ef62c25b15
atan u35 9d6d83e066b5a4851d44771418c9948c
atan u35 f32c1aa4caa08c6945afd1125ba8b113
atan2 u10 6b1d9d25fcd96053acc19d1633fab36a
atan2 u35 afb07894347062a96dab705b34eb1763
sinh u10 61d459b1f368087f6f23ebf8e9f0ea01
cosh u10 f77eb95f79e274c12b4e92dc0389259b
tanh u10 2bb9dd54ed0fa22bb5f3b6d557eb58a3
asinh u10 01136e54e2a434839530dda54f33cfdb
acosh u10 2f3c28c9ee2eb2b3d5659c6cb2a58e3e
atanh u10 601a77ba8c1d5175f2808b48a41260c1
lgamma u10 90cdc41063f4198c6ad592c0cdd0f5da
tgamma u10 6f864c3a1f17fbdf914cac7ffcd82cb7
erf u10 f4ae148b59bb7501d8f5746300850376
erfc u15 5e116a4316dafa742769f71e18f6f9fe
fabs bef2f2ac8a4789357e580b4da4f9b9fe
copysign 3219022f267464e3704f90558e8df3bc
fmax 4e4f5220ccfef191864c316df0d18fc0
fmin c0f8effb6c611e2b3b91b820ad943f62
fdim e876d103931f18ceede5bfd7e3df7ab0
fmod 618aa751e13012afdb41ec80dd35e6ba
remainder 8d692dbb44bbc9be5af0c0657d3008b8
modf f03ce73cd4f9ea7f69c017f6e53355d5
nextafter 9eba4e30d12d74dc4e8003fcff0f1582
trunc 1bc7e909eba121dcef7f0e4046937ae5
floor 2cff66b499dc8a30cec9467de659b774
ceil b080e632dcb8f8134d8715752be12917
round 8907e21687ca9c2a539297536e754950
rint e49f837096bc661fe1c742801dd99a30
sinf u35 833d845950b9cbb025629fe4c040f8f6
sinf u10 9c21afa4d7d6af3fc666309c3cd647fe
cosf u35 74d7f871a6553cd0019087895e2052ad
cosf u10 35349e94c323c1614f22093959288010
tanf u35 bbb7c092d017e96d2454a38a20687735
tanf u10 227423bc04f42d76a8f68082ba696126
sincosf u10 83ecc4e3d5295056e9d8c52bc196b666
sincosf u35 533319caa49a961e4909bd6dcab40721
sincospif u05 8b3762b67a661957c1414c351ec49034
sincospif u35 cec15ed76a358091632634166fa77b66
logf u10 c5a90119943acc4199e1cc7030b5def8
logf u35 af2fbe4bfa2caaf59c734e3749dd15be
log2f u10 ba8acae369bbb7b6404cccbc633fe25b
log2f u35 ba32ebaa8c470899ebd433d190c00f03
log10f u10 7e235a82d960e4434575dd39648d8bb7
log1pf u10 350fc4f13502b36bb1107e1b1122acb1
expf u10 ee4adaabefa3fac6c0f1925b2a948eea
exp2f u10 b0d283dbae0f36f1b3c7eed9871f0d0d
exp2f u35 522cc30f722f77fceb07015830b351a3
exp10f u10 b0564be151965600f5744ff2e4992bc9
exp10f u35 d142f1fb40e44f0c9e042718f27ee3e0
expm1f u10 ebfd6498cb40f61b609882de8a7f3c74
powf u10 a7cba3239c87969662e8b41a4dd8b4ab
cbrtf u10 01c5cac23fe21638be1c3eab6e368fd6
cbrtf u10 2a245b03f83e9114644d03b40dac707b
cbrtf u35 3ce62350fd585f0524a12c974fbe6cf5
cbrtf u35 2aca0404626a28f7af7f60105ad6e217
hypotf u05 bc5971cbeebee27b4c0d91fbe3f6bf30
hypotf u35 a6f0f774b346a6bba08889ff9ba3f193
asinf u10 7f77f7453b961512c89e87e49c549cfe
asinf u35 22ed8760aa328e1f714031eec592a4d8
acosf u10 15617dd0429b90e59d2923415934c2a6
acosf u35 af0b132d9e263721f9296187dbf9b9bf
atanf u10 26b77fb423104b45633cf24500237d6e
atanf u10 4313d0bc2708de53f74d804aac6564d4
atanf u35 97a1797897955643c722c7d291987331
atanf u35 7d3f47169415058e8578f11d899bfd10
atan2f u10 098a33f730fe95ce4774a991db4cee14
atan2f u35 56fc6bd8349979f0d0b1dcdb57f68363
sinhf u10 0780a2f57df3a831718195d1ee5c19ef
coshf u10 cfbb6aed408e43a7b7f053474100ff2d
tanhf u10 d19f254d41e8726c748df87b95bc9acd
asinhf u10 260d129221468a86bbfd609c27bfea6a
acoshf u10 24ced7e5631c78b20a5716faeedbaa92
atanhf u10 164fd77b8372b8c131baaacab1c9e650
lgammaf u10 3bf6d824175c4f4d86f3073064e41e84
tgammaf u10 f3a8d25c852068622bdfcae4cb813583
erff u10 f34af3814153de040b93e573ca7d21d8
erfcf u15 915ab9830de89a5a504b3ce7cd2fecda
fabsf a3c72220bc0ade68fe22e0a15eb730d4
copysignf 6b35517b8e1da78d9c9b52915d9a9b19
fmaxf 9833a60a2080e8fd9ae8de32c758966f
fminf 2dcfa19e1f1ab4973a7dec9f2cc09fa0
fdimf c5c0fe7b095eb8ccbb19fbf934a36b24
fmodf 77aa84a9703e202a56e5f4609bd2482b
remainderf 5a453b1217c173e4dc0b0211066750be
modff 5fa4f044f20478216aa085a01b189697
nextafterf 517c1c8f072e9024518d3d9ead98b85b
truncf 6937050850be63c44d4b7dbd666febe6
floorf 9341be69ee345c8554bf3ab4e9316133
ceilf c70874771cbe9741f1f05fedd4b629e9
roundf 0cf52f6b8015099771e9a7dfa6b090bc
rintf bed68e788e2b11543c09c9d52198abf8
fastsinf u3500 8eb51f86fb40414dd21284f020f24b6c
fastcosf u3500 69cbc3703f1d2c68695b00b1b09287b2
fastpowf u3500 e02e6a692cfa22a6b7149168c67ea1d2

@@ -0,0 +1,129 @@
sin u35 c163e4a7e9ccebb2181dcc8653367d8c
sin u10 0d6bf6f2c935db82588222da95659019
cos u35 52f902bd939d751b5b544ac70181fcff
cos u10 afcdba92a75a76d56b8cf2f22d4bec9e
tan u35 906cc42b6755fe514c5e185fcb4d2f55
tan u10 c98f29a62067fa63646d9bcc29a310c6
sincos u10 3fe37f4eb805505152f2b14a22a9f94e
sincos u35 95a7b7f48c71febf10ec6eff796dd391
sincospi u05 0c6fc00c7aaf0b6e67d66542d1be833d
sincospi u35 c428b0fc3e6c5be4d2c03dcd8bb27a7c
log u10 4855b27222d900bea47a27cadba71727
log u35 015f8ae899c9b921d48919dd12ef19a9
log2 u10 2662df9af919680ca62e1752fb1b7539
log2 u35 908b1949db34ea855944f00089b21e23
log10 u10 36645e8031d873d66fd0ec2c5959f273
log1p u10 1383924fb56cf2e7eda27de21320c591
exp u10 084e5be89c2ad03e356078ea4f287bab
exp2 u10 6e36db9ae2cf9eca82e3d9157c622351
exp2 u35 6e36db9ae2cf9eca82e3d9157c622351
exp10 u10 0cc08bc6a3d08d6e61450b5370c6161e
exp10 u35 6904d5509ca794747aa249c13886f90f
expm1 u10 cd3f0b8e86943d52c278394b60e2d22e
pow u10 7e19796027d7c1d1999be948f90e6181
cbrt u10 5d8bf28ac74624594fd1be9217817690
cbrt u10 3c896e03746bcf1b3f70182dfec3d93b
cbrt u35 fc7ee3e3e6c54365d708b752c242a947
cbrt u35 2408714a56d74f8c82389ca6772cdbc1
hypot u05 cc2f18e409e19a02cadf7b91fd869120
hypot u35 be7bbd41dffd746b70261ee773cbd4b2
asin u10 8a21b7c28cdaffc9d3e53f415367932e
asin u35 9c9e8107782898e9faed6924ad1b3cb1
acos u10 28261e4eb8331865660c814676d5c6bc
acos u35 310911130bfc45b10dabe3a072939331
atan u10 f931de72f2f6a7928f307a8a382ae255
atan u10 453f9ef62f58f9829320baf482a1d457
atan u35 6161b6189609f105b017d8768d0a41f1
atan u35 6face71d8d93c69448d49ed6140e361d
atan2 u10 469babaeee9bd30e17af2f473b3ea500
atan2 u35 6a3e764125aab2a0a13e7a0d9ec02f7f
sinh u10 61d459b1f368087f6f23ebf8e9f0ea01
cosh u10 f77eb95f79e274c12b4e92dc0389259b
tanh u10 2bb9dd54ed0fa22bb5f3b6d557eb58a3
asinh u10 01136e54e2a434839530dda54f33cfdb
acosh u10 2f3c28c9ee2eb2b3d5659c6cb2a58e3e
atanh u10 601a77ba8c1d5175f2808b48a41260c1
lgamma u10 90cdc41063f4198c6ad592c0cdd0f5da
tgamma u10 cb9a93844ad1713d2ab92ff5b6398150
erf u10 8a0bc2146a5c67b6bebc58f4b0076568
erfc u15 3e247a54183eeddedc33e99c50118995
fabs bef2f2ac8a4789357e580b4da4f9b9fe
copysign 3219022f267464e3704f90558e8df3bc
fmax 4e4f5220ccfef191864c316df0d18fc0
fmin c0f8effb6c611e2b3b91b820ad943f62
fdim e876d103931f18ceede5bfd7e3df7ab0
fmod 618aa751e13012afdb41ec80dd35e6ba
remainder 8d692dbb44bbc9be5af0c0657d3008b8
modf f03ce73cd4f9ea7f69c017f6e53355d5
nextafter 9eba4e30d12d74dc4e8003fcff0f1582
trunc 1bc7e909eba121dcef7f0e4046937ae5
floor 2cff66b499dc8a30cec9467de659b774
ceil b080e632dcb8f8134d8715752be12917
round 8907e21687ca9c2a539297536e754950
rint e49f837096bc661fe1c742801dd99a30
sinf u35 f8f804eae1d9443103e81fec96293477
sinf u10 3f12a7381f1cbb1830d92b4ec72d21fe
cosf u35 f2f3d1c9f090cde9c02439608dc7066e
cosf u10 dc35f27fae65f63f0aa6ad241f8b387b
tanf u35 68d42ad1fb412e6b8be3853461e61213
tanf u10 97df301d4f59e67d5318b5356b703f06
sincosf u10 a97124d810ec461c135dc4fb0c059b6f
sincosf u35 0cc521e52ae1227d311012c2919c1ff2
sincospif u05 8b3762b67a661957c1414c351ec49034
sincospif u35 8720757f221c00cc8de24b7dc4949144
logf u10 c5a90119943acc4199e1cc7030b5def8
logf u35 b6234302d534d6ccd48155dd6b9a4293
log2f u10 ba8acae369bbb7b6404cccbc633fe25b
log2f u35 74174c90717c86642b71284452a8aef6
log10f u10 7e235a82d960e4434575dd39648d8bb7
log1pf u10 e53dbfa80bcc1a7bcfd21000e6950475
expf u10 9597388315e4b3e89c4c97ce46374dcf
exp2f u10 42d66e5e4cb88feb29c5b36c632159a5
exp2f u35 42d66e5e4cb88feb29c5b36c632159a5
exp10f u10 954f0824b6d949d0da03b49950dc6642
exp10f u35 6fb0e9a829e12a06679d379d05b53ede
expm1f u10 ebfd6498cb40f61b609882de8a7f3c74
powf u10 2ed84af40d03e307a620365f172d010d
cbrtf u10 01c5cac23fe21638be1c3eab6e368fd6
cbrtf u10 2a245b03f83e9114644d03b40dac707b
cbrtf u35 6c22a6dc132c5212250970f22f42256d
cbrtf u35 5ab696ae11f9637413d30e6496d5324b
hypotf u05 bc5971cbeebee27b4c0d91fbe3f6bf30
hypotf u35 2a7cd97768287084b7fffc7e9fb39072
asinf u10 e2e571a01984c4ffb3f6e38e0328d90e
asinf u35 70df2dfc3a3569868cce60c38e7b1962
acosf u10 5180fde4b02a0ca4cd75f0a786a1bfeb
acosf u35 72b0e2f9791f90f1c43570b9e9ba893f
atanf u10 fa672e387a204055f735b7af98dd8a35
atanf u10 d017670c13bc221b68bc9ee5f41c4b5e
atanf u35 f592e46eaa5d29583f86d3e336f20b6b
atanf u35 e7087fe40de46921826b373d10c40954
atan2f u10 275b2fa8ee554c45551bb142db9f8197
atan2f u35 44b187851195d24bab2561eb8f4ff5d0
sinhf u10 45bc228a14c3e39eeb35e9764394a23e
coshf u10 838d441e85d415ef4fb1e5c5ea966a71
tanhf u10 d19f254d41e8726c748df87b95bc9acd
asinhf u10 927eeb621a3e2d5039f1a07fcf150901
acoshf u10 932520013273174fcabe2be4a55f919f
atanhf u10 164fd77b8372b8c131baaacab1c9e650
lgammaf u10 3bf6d824175c4f4d86f3073064e41e84
tgammaf u10 c3059747811d98846f74a63d3747ac3d
erff u10 f34af3814153de040b93e573ca7d21d8
erfcf u15 687a9c577512d349ddbc0643013d2c56
fabsf a3c72220bc0ade68fe22e0a15eb730d4
copysignf 6b35517b8e1da78d9c9b52915d9a9b19
fmaxf 9833a60a2080e8fd9ae8de32c758966f
fminf 2dcfa19e1f1ab4973a7dec9f2cc09fa0
fdimf c5c0fe7b095eb8ccbb19fbf934a36b24
fmodf 77aa84a9703e202a56e5f4609bd2482b
remainderf 5a453b1217c173e4dc0b0211066750be
modff 5fa4f044f20478216aa085a01b189697
nextafterf 517c1c8f072e9024518d3d9ead98b85b
truncf 6937050850be63c44d4b7dbd666febe6
floorf 9341be69ee345c8554bf3ab4e9316133
ceilf c70874771cbe9741f1f05fedd4b629e9
roundf 0cf52f6b8015099771e9a7dfa6b090bc
rintf bed68e788e2b11543c09c9d52198abf8
fastsinf u3500 5c48081c74cd0316379b580b047dbfc2
fastcosf u3500 6f73d116f109283e5632c31f5988f55b
fastpowf u3500 6dbb3110412df4fed5a71f50d40def89

@@ -0,0 +1,777 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <inttypes.h>
#include <assert.h>
#include <math.h>
#if defined(__MINGW32__) || defined(__MINGW64__) || defined(_MSC_VER)
#define STDIN_FILENO 0
#else
#include <unistd.h>
#include <sys/types.h>
#endif
#include "sleef.h"
#include "testerutil.h"
#define DORENAME
#include "rename.h"
#define BUFSIZE 1024
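// This is the scalar "implementation under test" driver: the tester feeds it
// one command per line on stdin ("<function> <hex bit pattern(s)>") and reads
// back the bit pattern of the result, so values round-trip exactly. The "3"
// printed below presumably announces the protocol version to the tester.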
int main(int argc, char **argv) {
char buf[BUFSIZE];
printf("3\n");
fflush(stdout);
for(;;) {
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;
if (startsWith(buf, "sin ")) {
uint64_t u;
sscanf(buf, "sin %" PRIx64, &u);
u = d2u(xsin(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "sin_u1 ")) {
uint64_t u;
sscanf(buf, "sin_u1 %" PRIx64, &u);
u = d2u(xsin_u1(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "cos ")) {
uint64_t u;
sscanf(buf, "cos %" PRIx64, &u);
u = d2u(xcos(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "cos_u1 ")) {
uint64_t u;
sscanf(buf, "cos_u1 %" PRIx64, &u);
u = d2u(xcos_u1(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "sincos ")) {
uint64_t u;
sscanf(buf, "sincos %" PRIx64, &u);
Sleef_double2 x = xsincos(u2d(u));
printf("%" PRIx64 " %" PRIx64 "\n", d2u(x.x), d2u(x.y));
} else if (startsWith(buf, "sincos_u1 ")) {
uint64_t u;
sscanf(buf, "sincos_u1 %" PRIx64, &u);
Sleef_double2 x = xsincos_u1(u2d(u));
printf("%" PRIx64 " %" PRIx64 "\n", d2u(x.x), d2u(x.y));
} else if (startsWith(buf, "sincospi_u05 ")) {
uint64_t u;
sscanf(buf, "sincospi_u05 %" PRIx64, &u);
Sleef_double2 x = xsincospi_u05(u2d(u));
printf("%" PRIx64 " %" PRIx64 "\n", d2u(x.x), d2u(x.y));
} else if (startsWith(buf, "sincospi_u35 ")) {
uint64_t u;
sscanf(buf, "sincospi_u35 %" PRIx64, &u);
Sleef_double2 x = xsincospi_u35(u2d(u));
printf("%" PRIx64 " %" PRIx64 "\n", d2u(x.x), d2u(x.y));
} else if (startsWith(buf, "sinpi_u05 ")) {
uint64_t u;
sscanf(buf, "sinpi_u05 %" PRIx64, &u);
u = d2u(xsinpi_u05(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "cospi_u05 ")) {
uint64_t u;
sscanf(buf, "cospi_u05 %" PRIx64, &u);
u = d2u(xcospi_u05(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "tan ")) {
uint64_t u;
sscanf(buf, "tan %" PRIx64, &u);
u = d2u(xtan(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "tan_u1 ")) {
uint64_t u;
sscanf(buf, "tan_u1 %" PRIx64, &u);
u = d2u(xtan_u1(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "asin ")) {
uint64_t u;
sscanf(buf, "asin %" PRIx64, &u);
u = d2u(xasin(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "acos ")) {
uint64_t u;
sscanf(buf, "acos %" PRIx64, &u);
u = d2u(xacos(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "atan ")) {
uint64_t u;
sscanf(buf, "atan %" PRIx64, &u);
u = d2u(xatan(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "log ")) {
uint64_t u;
sscanf(buf, "log %" PRIx64, &u);
u = d2u(xlog(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "exp ")) {
uint64_t u;
sscanf(buf, "exp %" PRIx64, &u);
u = d2u(xexp(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "atan2 ")) {
uint64_t u, v;
sscanf(buf, "atan2 %" PRIx64 " %" PRIx64, &u, &v);
u = d2u(xatan2(u2d(u), u2d(v)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "asin_u1 ")) {
uint64_t u;
sscanf(buf, "asin_u1 %" PRIx64, &u);
u = d2u(xasin_u1(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "acos_u1 ")) {
uint64_t u;
sscanf(buf, "acos_u1 %" PRIx64, &u);
u = d2u(xacos_u1(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "atan_u1 ")) {
uint64_t u;
sscanf(buf, "atan_u1 %" PRIx64, &u);
u = d2u(xatan_u1(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "atan2_u1 ")) {
uint64_t u, v;
sscanf(buf, "atan2_u1 %" PRIx64 " %" PRIx64, &u, &v);
u = d2u(xatan2_u1(u2d(u), u2d(v)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "log_u1 ")) {
uint64_t u;
sscanf(buf, "log_u1 %" PRIx64, &u);
u = d2u(xlog_u1(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "pow ")) {
uint64_t u, v;
sscanf(buf, "pow %" PRIx64 " %" PRIx64, &u, &v);
u = d2u(xpow(u2d(u), u2d(v)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "sinh ")) {
uint64_t u;
sscanf(buf, "sinh %" PRIx64, &u);
u = d2u(xsinh(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "cosh ")) {
uint64_t u;
sscanf(buf, "cosh %" PRIx64, &u);
u = d2u(xcosh(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "tanh ")) {
uint64_t u;
sscanf(buf, "tanh %" PRIx64, &u);
u = d2u(xtanh(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "sinh_u35 ")) {
uint64_t u;
sscanf(buf, "sinh_u35 %" PRIx64, &u);
u = d2u(xsinh_u35(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "cosh_u35 ")) {
uint64_t u;
sscanf(buf, "cosh_u35 %" PRIx64, &u);
u = d2u(xcosh_u35(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "tanh_u35 ")) {
uint64_t u;
sscanf(buf, "tanh_u35 %" PRIx64, &u);
u = d2u(xtanh_u35(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "asinh ")) {
uint64_t u;
sscanf(buf, "asinh %" PRIx64, &u);
u = d2u(xasinh(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "acosh ")) {
uint64_t u;
sscanf(buf, "acosh %" PRIx64, &u);
u = d2u(xacosh(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "atanh ")) {
uint64_t u;
sscanf(buf, "atanh %" PRIx64, &u);
u = d2u(xatanh(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "fma ")) {
uint64_t u, v, w;
sscanf(buf, "fma %" PRIx64 " %" PRIx64 " %" PRIx64, &u, &v, &w);
u = d2u(xfma(u2d(u), u2d(v), u2d(w)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "sqrt ")) {
uint64_t u;
sscanf(buf, "sqrt %" PRIx64, &u);
u = d2u(xsqrt(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "sqrt_u05 ")) {
uint64_t u;
sscanf(buf, "sqrt_u05 %" PRIx64, &u);
u = d2u(xsqrt_u05(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "sqrt_u35 ")) {
uint64_t u;
sscanf(buf, "sqrt_u35 %" PRIx64, &u);
u = d2u(xsqrt_u35(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "cbrt ")) {
uint64_t u;
sscanf(buf, "cbrt %" PRIx64, &u);
u = d2u(xcbrt(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "cbrt_u1 ")) {
uint64_t u;
sscanf(buf, "cbrt_u1 %" PRIx64, &u);
u = d2u(xcbrt_u1(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "exp2 ")) {
uint64_t u;
sscanf(buf, "exp2 %" PRIx64, &u);
u = d2u(xexp2(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "exp2_u35 ")) {
uint64_t u;
sscanf(buf, "exp2_u35 %" PRIx64, &u);
u = d2u(xexp2_u35(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "exp10 ")) {
uint64_t u;
sscanf(buf, "exp10 %" PRIx64, &u);
u = d2u(xexp10(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "exp10_u35 ")) {
uint64_t u;
sscanf(buf, "exp10_u35 %" PRIx64, &u);
u = d2u(xexp10_u35(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "expm1 ")) {
uint64_t u;
sscanf(buf, "expm1 %" PRIx64, &u);
u = d2u(xexpm1(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "log10 ")) {
uint64_t u;
sscanf(buf, "log10 %" PRIx64, &u);
u = d2u(xlog10(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "log2 ")) {
uint64_t u;
sscanf(buf, "log2 %" PRIx64, &u);
u = d2u(xlog2(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "log2_u35 ")) {
uint64_t u;
sscanf(buf, "log2_u35 %" PRIx64, &u);
u = d2u(xlog2_u35(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "log1p ")) {
uint64_t u;
sscanf(buf, "log1p %" PRIx64, &u);
u = d2u(xlog1p(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "ldexp ")) {
uint64_t u, v;
sscanf(buf, "ldexp %" PRIx64 " %" PRIx64, &u, &v);
u = d2u(xldexp(u2d(u), (int)u2d(v)));
printf("%" PRIx64 "\n", u);
}
else if (startsWith(buf, "hypot_u05 ")) {
uint64_t u, v;
sscanf(buf, "hypot_u05 %" PRIx64 " %" PRIx64, &u, &v);
u = d2u(xhypot_u05(u2d(u), u2d(v)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "hypot_u35 ")) {
uint64_t u, v;
sscanf(buf, "hypot_u35 %" PRIx64 " %" PRIx64, &u, &v);
u = d2u(xhypot_u35(u2d(u), u2d(v)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "copysign ")) {
uint64_t u, v;
sscanf(buf, "copysign %" PRIx64 " %" PRIx64, &u, &v);
u = d2u(xcopysign(u2d(u), u2d(v)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "fmax ")) {
uint64_t u, v;
sscanf(buf, "fmax %" PRIx64 " %" PRIx64, &u, &v);
u = d2u(xfmax(u2d(u), u2d(v)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "fmin ")) {
uint64_t u, v;
sscanf(buf, "fmin %" PRIx64 " %" PRIx64, &u, &v);
u = d2u(xfmin(u2d(u), u2d(v)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "fdim ")) {
uint64_t u, v;
sscanf(buf, "fdim %" PRIx64 " %" PRIx64, &u, &v);
u = d2u(xfdim(u2d(u), u2d(v)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "nextafter ")) {
uint64_t u, v;
sscanf(buf, "nextafter %" PRIx64 " %" PRIx64, &u, &v);
u = d2u(xnextafter(u2d(u), u2d(v)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "fmod ")) {
uint64_t u, v;
sscanf(buf, "fmod %" PRIx64 " %" PRIx64, &u, &v);
u = d2u(xfmod(u2d(u), u2d(v)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "remainder ")) {
uint64_t u, v;
sscanf(buf, "remainder %" PRIx64 " %" PRIx64, &u, &v);
u = d2u(xremainder(u2d(u), u2d(v)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "fabs ")) {
uint64_t u;
sscanf(buf, "fabs %" PRIx64, &u);
u = d2u(xfabs(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "trunc ")) {
uint64_t u;
sscanf(buf, "trunc %" PRIx64, &u);
u = d2u(xtrunc(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "floor ")) {
uint64_t u;
sscanf(buf, "floor %" PRIx64, &u);
u = d2u(xfloor(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "ceil ")) {
uint64_t u;
sscanf(buf, "ceil %" PRIx64, &u);
u = d2u(xceil(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "round ")) {
uint64_t u;
sscanf(buf, "round %" PRIx64, &u);
u = d2u(xround(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "rint ")) {
uint64_t u;
sscanf(buf, "rint %" PRIx64, &u);
u = d2u(xrint(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "frfrexp ")) {
uint64_t u;
sscanf(buf, "frfrexp %" PRIx64, &u);
u = d2u(xfrfrexp(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "modf ")) {
uint64_t u;
sscanf(buf, "modf %" PRIx64, &u);
Sleef_double2 x = xmodf(u2d(u));
printf("%" PRIx64 " %" PRIx64 "\n", d2u(x.x), d2u(x.y));
} else if (startsWith(buf, "tgamma_u1 ")) {
uint64_t u;
sscanf(buf, "tgamma_u1 %" PRIx64, &u);
u = d2u(xtgamma_u1(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "lgamma_u1 ")) {
uint64_t u;
sscanf(buf, "lgamma_u1 %" PRIx64, &u);
u = d2u(xlgamma_u1(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "erf_u1 ")) {
uint64_t u;
sscanf(buf, "erf_u1 %" PRIx64, &u);
u = d2u(xerf_u1(u2d(u)));
printf("%" PRIx64 "\n", u);
} else if (startsWith(buf, "erfc_u15 ")) {
uint64_t u;
sscanf(buf, "erfc_u15 %" PRIx64, &u);
u = d2u(xerfc_u15(u2d(u)));
printf("%" PRIx64 "\n", u);
}
else if (startsWith(buf, "sinf ")) {
uint32_t u;
sscanf(buf, "sinf %x", &u);
u = f2u(xsinf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "cosf ")) {
uint32_t u;
sscanf(buf, "cosf %x", &u);
u = f2u(xcosf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "sincosf ")) {
uint32_t u;
sscanf(buf, "sincosf %x", &u);
Sleef_float2 x = xsincosf(u2f(u));
printf("%x %x\n", f2u(x.x), f2u(x.y));
} else if (startsWith(buf, "tanf ")) {
uint32_t u;
sscanf(buf, "tanf %x", &u);
u = f2u(xtanf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "asinf ")) {
uint32_t u;
sscanf(buf, "asinf %x", &u);
u = f2u(xasinf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "acosf ")) {
uint32_t u;
sscanf(buf, "acosf %x", &u);
u = f2u(xacosf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "atanf ")) {
uint32_t u;
sscanf(buf, "atanf %x", &u);
u = f2u(xatanf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "atan2f ")) {
uint32_t u, v;
sscanf(buf, "atan2f %x %x", &u, &v);
u = f2u(xatan2f(u2f(u), u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "logf ")) {
uint32_t u;
sscanf(buf, "logf %x", &u);
u = f2u(xlogf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "expf ")) {
uint32_t u;
sscanf(buf, "expf %x", &u);
u = f2u(xexpf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "cbrtf ")) {
uint32_t u;
sscanf(buf, "cbrtf %x", &u);
u = f2u(xcbrtf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "sqrtf ")) {
uint32_t u;
sscanf(buf, "sqrtf %x", &u);
u = f2u(xsqrtf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "sqrtf_u05 ")) {
uint32_t u;
sscanf(buf, "sqrtf_u05 %x", &u);
u = f2u(xsqrtf_u05(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "sqrtf_u35 ")) {
uint32_t u;
sscanf(buf, "sqrtf_u35 %x", &u);
u = f2u(xsqrtf_u35(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "ldexpf ")) {
uint32_t u, v;
sscanf(buf, "ldexpf %x %x", &u, &v);
u = f2u(xldexpf(u2f(u), (int)u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "powf ")) {
uint32_t u, v;
sscanf(buf, "powf %x %x", &u, &v);
u = f2u(xpowf(u2f(u), u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "fastpowf_u3500 ")) {
uint32_t u, v;
sscanf(buf, "fastpowf_u3500 %x %x", &u, &v);
u = f2u(xfastpowf_u3500(u2f(u), u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "sinhf ")) {
uint32_t u;
sscanf(buf, "sinhf %x", &u);
u = f2u(xsinhf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "coshf ")) {
uint32_t u;
sscanf(buf, "coshf %x", &u);
u = f2u(xcoshf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "tanhf ")) {
uint32_t u;
sscanf(buf, "tanhf %x", &u);
u = f2u(xtanhf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "sinhf_u35 ")) {
uint32_t u;
sscanf(buf, "sinhf_u35 %x", &u);
u = f2u(xsinhf_u35(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "coshf_u35 ")) {
uint32_t u;
sscanf(buf, "coshf_u35 %x", &u);
u = f2u(xcoshf_u35(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "tanhf_u35 ")) {
uint32_t u;
sscanf(buf, "tanhf_u35 %x", &u);
u = f2u(xtanhf_u35(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "asinhf ")) {
uint32_t u;
sscanf(buf, "asinhf %x", &u);
u = f2u(xasinhf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "acoshf ")) {
uint32_t u;
sscanf(buf, "acoshf %x", &u);
u = f2u(xacoshf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "atanhf ")) {
uint32_t u;
sscanf(buf, "atanhf %x", &u);
u = f2u(xatanhf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "exp2f ")) {
uint32_t u;
sscanf(buf, "exp2f %x", &u);
u = f2u(xexp2f(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "exp10f ")) {
uint32_t u;
sscanf(buf, "exp10f %x", &u);
u = f2u(xexp10f(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "exp2f_u35 ")) {
uint32_t u;
sscanf(buf, "exp2f_u35 %x", &u);
u = f2u(xexp2f_u35(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "exp10f_u35 ")) {
uint32_t u;
sscanf(buf, "exp10f_u35 %x", &u);
u = f2u(xexp10f_u35(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "expm1f ")) {
uint32_t u;
sscanf(buf, "expm1f %x", &u);
u = f2u(xexpm1f(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "log10f ")) {
uint32_t u;
sscanf(buf, "log10f %x", &u);
u = f2u(xlog10f(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "log2f ")) {
uint32_t u;
sscanf(buf, "log2f %x", &u);
u = f2u(xlog2f(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "log2f_u35 ")) {
uint32_t u;
sscanf(buf, "log2f_u35 %x", &u);
u = f2u(xlog2f_u35(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "log1pf ")) {
uint32_t u;
sscanf(buf, "log1pf %x", &u);
u = f2u(xlog1pf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "sinf_u1 ")) {
uint32_t u;
sscanf(buf, "sinf_u1 %x", &u);
u = f2u(xsinf_u1(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "cosf_u1 ")) {
uint32_t u;
sscanf(buf, "cosf_u1 %x", &u);
u = f2u(xcosf_u1(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "sincosf_u1 ")) {
uint32_t u;
sscanf(buf, "sincosf_u1 %x", &u);
Sleef_float2 x = xsincosf_u1(u2f(u));
printf("%x %x\n", f2u(x.x), f2u(x.y));
} else if (startsWith(buf, "sincospif_u05 ")) {
uint32_t u;
sscanf(buf, "sincospif_u05 %x", &u);
Sleef_float2 x = xsincospif_u05(u2f(u));
printf("%x %x\n", f2u(x.x), f2u(x.y));
} else if (startsWith(buf, "sincospif_u35 ")) {
uint32_t u;
sscanf(buf, "sincospif_u35 %x", &u);
Sleef_float2 x = xsincospif_u35(u2f(u));
printf("%x %x\n", f2u(x.x), f2u(x.y));
} else if (startsWith(buf, "sinpif_u05 ")) {
uint32_t u;
sscanf(buf, "sinpif_u05 %x", &u);
u = f2u(xsinpif_u05(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "cospif_u05 ")) {
uint32_t u;
sscanf(buf, "cospif_u05 %x", &u);
u = f2u(xcospif_u05(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "fastsinf_u3500 ")) {
uint32_t u;
sscanf(buf, "fastsinf_u3500 %x", &u);
u = f2u(xfastsinf_u3500(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "fastcosf_u3500 ")) {
uint32_t u;
sscanf(buf, "fastcosf_u3500 %x", &u);
u = f2u(xfastcosf_u3500(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "tanf_u1 ")) {
uint32_t u;
sscanf(buf, "tanf_u1 %x", &u);
u = f2u(xtanf_u1(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "asinf_u1 ")) {
uint32_t u;
sscanf(buf, "asinf_u1 %x", &u);
u = f2u(xasinf_u1(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "acosf_u1 ")) {
uint32_t u;
sscanf(buf, "acosf_u1 %x", &u);
u = f2u(xacosf_u1(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "atanf_u1 ")) {
uint32_t u;
sscanf(buf, "atanf_u1 %x", &u);
u = f2u(xatanf_u1(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "atan2f_u1 ")) {
uint32_t u, v;
sscanf(buf, "atan2f_u1 %x %x", &u, &v);
u = f2u(xatan2f_u1(u2f(u), u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "logf_u1 ")) {
uint32_t u;
sscanf(buf, "logf_u1 %x", &u);
u = f2u(xlogf_u1(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "cbrtf_u1 ")) {
uint32_t u;
sscanf(buf, "cbrtf_u1 %x", &u);
u = f2u(xcbrtf_u1(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "ilogb ")) {
uint64_t u;
int i;
sscanf(buf, "ilogb %" PRIx64, &u);
i = xilogb(u2d(u));
printf("%d\n", i);
} else if (startsWith(buf, "ilogbf ")) {
uint32_t u;
int i;
sscanf(buf, "ilogbf %x", &u);
i = xilogbf(u2f(u));
printf("%d\n", i);
}
else if (startsWith(buf, "hypotf_u05 ")) {
uint32_t u, v;
sscanf(buf, "hypotf_u05 %x %x", &u, &v);
u = f2u(xhypotf_u05(u2f(u), u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "hypotf_u35 ")) {
uint32_t u, v;
sscanf(buf, "hypotf_u35 %x %x", &u, &v);
u = f2u(xhypotf_u35(u2f(u), u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "copysignf ")) {
uint32_t u, v;
sscanf(buf, "copysignf %x %x", &u, &v);
u = f2u(xcopysignf(u2f(u), u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "fmaxf ")) {
uint32_t u, v;
sscanf(buf, "fmaxf %x %x", &u, &v);
u = f2u(xfmaxf(u2f(u), u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "fminf ")) {
uint32_t u, v;
sscanf(buf, "fminf %x %x", &u, &v);
u = f2u(xfminf(u2f(u), u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "fdimf ")) {
uint32_t u, v;
sscanf(buf, "fdimf %x %x", &u, &v);
u = f2u(xfdimf(u2f(u), u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "nextafterf ")) {
uint32_t u, v;
sscanf(buf, "nextafterf %x %x", &u, &v);
u = f2u(xnextafterf(u2f(u), u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "fmodf ")) {
uint32_t u, v;
sscanf(buf, "fmodf %x %x", &u, &v);
u = f2u(xfmodf(u2f(u), u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "remainderf ")) {
uint32_t u, v;
sscanf(buf, "remainderf %x %x", &u, &v);
u = f2u(xremainderf(u2f(u), u2f(v)));
printf("%x\n", u);
} else if (startsWith(buf, "fabsf ")) {
uint32_t u;
sscanf(buf, "fabsf %x", &u);
u = f2u(xfabsf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "truncf ")) {
uint32_t u;
sscanf(buf, "truncf %x", &u);
u = f2u(xtruncf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "floorf ")) {
uint32_t u;
sscanf(buf, "floorf %x", &u);
u = f2u(xfloorf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "ceilf ")) {
uint32_t u;
sscanf(buf, "ceilf %x", &u);
u = f2u(xceilf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "roundf ")) {
uint32_t u;
sscanf(buf, "roundf %x", &u);
u = f2u(xroundf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "rintf ")) {
uint32_t u;
sscanf(buf, "rintf %x", &u);
u = f2u(xrintf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "frfrexpf ")) {
uint32_t u;
sscanf(buf, "frfrexpf %x", &u);
u = f2u(xfrfrexpf(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "modff ")) {
uint32_t u;
sscanf(buf, "modff %x", &u);
Sleef_float2 x = xmodff(u2f(u));
printf("%x %x\n", f2u(x.x), f2u(x.y));
} else if (startsWith(buf, "tgammaf_u1 ")) {
uint32_t u;
sscanf(buf, "tgammaf_u1 %x", &u);
u = f2u(xtgammaf_u1(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "lgammaf_u1 ")) {
uint32_t u;
sscanf(buf, "lgammaf_u1 %x", &u);
u = f2u(xlgammaf_u1(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "erff_u1 ")) {
uint32_t u;
sscanf(buf, "erff_u1 %x", &u);
u = f2u(xerff_u1(u2f(u)));
printf("%x\n", u);
} else if (startsWith(buf, "erfcf_u15 ")) {
uint32_t u;
sscanf(buf, "erfcf_u15 %x", &u);
u = f2u(xerfcf_u15(u2f(u)));
printf("%x\n", u);
}
else {
break;
}
fflush(stdout);
}
return 0;
}

@@ -0,0 +1,546 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <math.h>
#include <float.h>
#include <stdint.h>
#include <cuda.h>
#include "sleefinline_purec_scalar.h"
#include "sleefinline_cuda.h"
#define STDIN_FILENO 0
#define SIMD_SUFFIX _cuda_sleef
#define CONCAT_SIMD_SUFFIX_(keyword, suffix) keyword ## suffix
#define CONCAT_SIMD_SUFFIX(keyword, suffix) CONCAT_SIMD_SUFFIX_(keyword, suffix)
#define vdouble2 CONCAT_SIMD_SUFFIX(vdouble2, SIMD_SUFFIX)
#define vfloat2 CONCAT_SIMD_SUFFIX(vfloat2, SIMD_SUFFIX)
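// The inline SLEEF headers mangle their type names with a SIMD suffix, so
// the aliases above let the rest of this file use the generic names (here,
// vdouble2 resolves to vdouble2_cuda_sleef).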
//
static int startsWith(const char *str, const char *prefix) {
while(*prefix != '\0') if (*str++ != *prefix++) return 0;
return *prefix == '\0';
}
static double u2d(uint64_t u) {
union {
double f;
uint64_t i;
} tmp;
tmp.i = u;
return tmp.f;
}
static uint64_t d2u(double d) {
union {
double f;
uint64_t i;
} tmp;
tmp.f = d;
return tmp.i;
}
static float u2f(uint32_t u) {
union {
float f;
uint32_t i;
} tmp;
tmp.i = u;
return tmp.f;
}
static uint32_t f2u(float d) {
union {
float f;
uint32_t i;
} tmp;
tmp.f = d;
return tmp.i;
}
//
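// Each kernel below wraps exactly one scalar CUDA SLEEF function, reading its
// operands from and writing its result to device memory, so the host side of
// the tester can exercise any function by launching the matching kernel
// (presumably on a single thread).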
__global__ void xsin(double *r, double *a0) { *r = Sleef_sind1_u35cuda(*a0); }
__global__ void xcos(double *r, double *a0) { *r = Sleef_cosd1_u35cuda(*a0); }
__global__ void xsincos(vdouble2 *r, double *a0) { *r = Sleef_sincosd1_u35cuda(*a0); }
__global__ void xtan(double *r, double *a0) { *r = Sleef_tand1_u35cuda(*a0); }
__global__ void xasin(double *r, double *a0) { *r = Sleef_asind1_u35cuda(*a0); }
__global__ void xacos(double *r, double *a0) { *r = Sleef_acosd1_u35cuda(*a0); }
__global__ void xatan(double *r, double *a0) { *r = Sleef_atand1_u35cuda(*a0); }
__global__ void xatan2(double *r, double *a0, double *a1) { *r = Sleef_atan2d1_u35cuda(*a0, *a1); }
__global__ void xlog(double *r, double *a0) { *r = Sleef_logd1_u35cuda(*a0); }
__global__ void xcbrt(double *r, double *a0) { *r = Sleef_cbrtd1_u35cuda(*a0); }
__global__ void xsin_u1(double *r, double *a0) { *r = Sleef_sind1_u10cuda(*a0); }
__global__ void xcos_u1(double *r, double *a0) { *r = Sleef_cosd1_u10cuda(*a0); }
__global__ void xsincos_u1(vdouble2 *r, double *a0) { *r = Sleef_sincosd1_u10cuda(*a0); }
__global__ void xtan_u1(double *r, double *a0) { *r = Sleef_tand1_u10cuda(*a0); }
__global__ void xasin_u1(double *r, double *a0) { *r = Sleef_asind1_u10cuda(*a0); }
__global__ void xacos_u1(double *r, double *a0) { *r = Sleef_acosd1_u10cuda(*a0); }
__global__ void xatan_u1(double *r, double *a0) { *r = Sleef_atand1_u10cuda(*a0); }
__global__ void xatan2_u1(double *r, double *a0, double *a1) { *r = Sleef_atan2d1_u10cuda(*a0, *a1); }
__global__ void xlog_u1(double *r, double *a0) { *r = Sleef_logd1_u10cuda(*a0); }
__global__ void xcbrt_u1(double *r, double *a0) { *r = Sleef_cbrtd1_u10cuda(*a0); }
__global__ void xexp(double *r, double *a0) { *r = Sleef_expd1_u10cuda(*a0); }
__global__ void xpow(double *r, double *a0, double *a1) { *r = Sleef_powd1_u10cuda(*a0, *a1); }
__global__ void xsinh(double *r, double *a0) { *r = Sleef_sinhd1_u10cuda(*a0); }
__global__ void xcosh(double *r, double *a0) { *r = Sleef_coshd1_u10cuda(*a0); }
__global__ void xtanh(double *r, double *a0) { *r = Sleef_tanhd1_u10cuda(*a0); }
__global__ void xsinh_u35(double *r, double *a0) { *r = Sleef_sinhd1_u35cuda(*a0); }
__global__ void xcosh_u35(double *r, double *a0) { *r = Sleef_coshd1_u35cuda(*a0); }
__global__ void xtanh_u35(double *r, double *a0) { *r = Sleef_tanhd1_u35cuda(*a0); }
__global__ void xasinh(double *r, double *a0) { *r = Sleef_asinhd1_u10cuda(*a0); }
__global__ void xacosh(double *r, double *a0) { *r = Sleef_acoshd1_u10cuda(*a0); }
__global__ void xatanh(double *r, double *a0) { *r = Sleef_atanhd1_u10cuda(*a0); }
__global__ void xexp2(double *r, double *a0) { *r = Sleef_exp2d1_u10cuda(*a0); }
__global__ void xexp2_u35(double *r, double *a0) { *r = Sleef_exp2d1_u35cuda(*a0); }
__global__ void xexp10(double *r, double *a0) { *r = Sleef_exp10d1_u10cuda(*a0); }
__global__ void xexp10_u35(double *r, double *a0) { *r = Sleef_exp10d1_u35cuda(*a0); }
__global__ void xexpm1(double *r, double *a0) { *r = Sleef_expm1d1_u10cuda(*a0); }
__global__ void xlog10(double *r, double *a0) { *r = Sleef_log10d1_u10cuda(*a0); }
__global__ void xlog2(double *r, double *a0) { *r = Sleef_log2d1_u10cuda(*a0); }
__global__ void xlog2_u35(double *r, double *a0) { *r = Sleef_log2d1_u35cuda(*a0); }
__global__ void xlog1p(double *r, double *a0) { *r = Sleef_log1pd1_u10cuda(*a0); }
__global__ void xsincospi_u05(vdouble2 *r, double *a0) { *r = Sleef_sincospid1_u05cuda(*a0); }
__global__ void xsincospi_u35(vdouble2 *r, double *a0) { *r = Sleef_sincospid1_u35cuda(*a0); }
__global__ void xsinpi_u05(double *r, double *a0) { *r = Sleef_sinpid1_u05cuda(*a0); }
__global__ void xcospi_u05(double *r, double *a0) { *r = Sleef_cospid1_u05cuda(*a0); }
__global__ void xldexp(double *r, double *a0, int *a1) { *r = Sleef_ldexpd1_cuda(*a0, *a1); }
__global__ void xilogb(int *r, double *a0) { *r = Sleef_ilogbd1_cuda(*a0); }
__global__ void xfma(double *r, double *a0, double *a1, double *a2) { *r = Sleef_fmad1_cuda(*a0, *a1, *a2); }
__global__ void xsqrt(double *r, double *a0) { *r = Sleef_sqrtd1_cuda(*a0); }
__global__ void xsqrt_u05(double *r, double *a0) { *r = Sleef_sqrtd1_u05cuda(*a0); }
__global__ void xsqrt_u35(double *r, double *a0) { *r = Sleef_sqrtd1_u35cuda(*a0); }
__global__ void xhypot_u05(double *r, double *a0, double *a1) { *r = Sleef_hypotd1_u05cuda(*a0, *a1); }
__global__ void xhypot_u35(double *r, double *a0, double *a1) { *r = Sleef_hypotd1_u35cuda(*a0, *a1); }
__global__ void xfabs(double *r, double *a0) { *r = Sleef_fabsd1_cuda(*a0); }
__global__ void xcopysign(double *r, double *a0, double *a1) { *r = Sleef_copysignd1_cuda(*a0, *a1); }
__global__ void xfmax(double *r, double *a0, double *a1) { *r = Sleef_fmaxd1_cuda(*a0, *a1); }
__global__ void xfmin(double *r, double *a0, double *a1) { *r = Sleef_fmind1_cuda(*a0, *a1); }
__global__ void xfdim(double *r, double *a0, double *a1) { *r = Sleef_fdimd1_cuda(*a0, *a1); }
__global__ void xtrunc(double *r, double *a0) { *r = Sleef_truncd1_cuda(*a0); }
__global__ void xfloor(double *r, double *a0) { *r = Sleef_floord1_cuda(*a0); }
__global__ void xceil(double *r, double *a0) { *r = Sleef_ceild1_cuda(*a0); }
__global__ void xround(double *r, double *a0) { *r = Sleef_roundd1_cuda(*a0); }
__global__ void xrint(double *r, double *a0) { *r = Sleef_rintd1_cuda(*a0); }
__global__ void xnextafter(double *r, double *a0, double *a1) { *r = Sleef_nextafterd1_cuda(*a0, *a1); }
__global__ void xfrfrexp(double *r, double *a0) { *r = Sleef_frfrexpd1_cuda(*a0); }
__global__ void xexpfrexp(int *r, double *a0) { *r = Sleef_expfrexpd1_cuda(*a0); }
__global__ void xfmod(double *r, double *a0, double *a1) { *r = Sleef_fmodd1_cuda(*a0, *a1); }
__global__ void xremainder(double *r, double *a0, double *a1) { *r = Sleef_remainderd1_cuda(*a0, *a1); }
__global__ void xmodf(vdouble2 *r, double *a0) { *r = Sleef_modfd1_cuda(*a0); }
__global__ void xlgamma_u1(double *r, double *a0) { *r = Sleef_lgammad1_u10cuda(*a0); }
__global__ void xtgamma_u1(double *r, double *a0) { *r = Sleef_tgammad1_u10cuda(*a0); }
__global__ void xerf_u1(double *r, double *a0) { *r = Sleef_erfd1_u10cuda(*a0); }
__global__ void xerfc_u15(double *r, double *a0) { *r = Sleef_erfcd1_u15cuda(*a0); }
__global__ void xsinf(float *r, float *a0) { *r = Sleef_sinf1_u35cuda(*a0); }
__global__ void xcosf(float *r, float *a0) { *r = Sleef_cosf1_u35cuda(*a0); }
__global__ void xsincosf(vfloat2 *r, float *a0) { *r = Sleef_sincosf1_u35cuda(*a0); }
__global__ void xtanf(float *r, float *a0) { *r = Sleef_tanf1_u35cuda(*a0); }
__global__ void xasinf(float *r, float *a0) { *r = Sleef_asinf1_u35cuda(*a0); }
__global__ void xacosf(float *r, float *a0) { *r = Sleef_acosf1_u35cuda(*a0); }
__global__ void xatanf(float *r, float *a0) { *r = Sleef_atanf1_u35cuda(*a0); }
__global__ void xatan2f(float *r, float *a0, float *a1) { *r = Sleef_atan2f1_u35cuda(*a0, *a1); }
__global__ void xlogf(float *r, float *a0) { *r = Sleef_logf1_u35cuda(*a0); }
__global__ void xcbrtf(float *r, float *a0) { *r = Sleef_cbrtf1_u35cuda(*a0); }
__global__ void xsinf_u1(float *r, float *a0) { *r = Sleef_sinf1_u10cuda(*a0); }
__global__ void xcosf_u1(float *r, float *a0) { *r = Sleef_cosf1_u10cuda(*a0); }
__global__ void xsincosf_u1(vfloat2 *r, float *a0) { *r = Sleef_sincosf1_u10cuda(*a0); }
__global__ void xtanf_u1(float *r, float *a0) { *r = Sleef_tanf1_u10cuda(*a0); }
__global__ void xasinf_u1(float *r, float *a0) { *r = Sleef_asinf1_u10cuda(*a0); }
__global__ void xacosf_u1(float *r, float *a0) { *r = Sleef_acosf1_u10cuda(*a0); }
__global__ void xatanf_u1(float *r, float *a0) { *r = Sleef_atanf1_u10cuda(*a0); }
__global__ void xatan2f_u1(float *r, float *a0, float *a1) { *r = Sleef_atan2f1_u10cuda(*a0, *a1); }
__global__ void xlogf_u1(float *r, float *a0) { *r = Sleef_logf1_u10cuda(*a0); }
__global__ void xcbrtf_u1(float *r, float *a0) { *r = Sleef_cbrtf1_u10cuda(*a0); }
__global__ void xexpf(float *r, float *a0) { *r = Sleef_expf1_u10cuda(*a0); }
__global__ void xpowf(float *r, float *a0, float *a1) { *r = Sleef_powf1_u10cuda(*a0, *a1); }
__global__ void xsinhf(float *r, float *a0) { *r = Sleef_sinhf1_u10cuda(*a0); }
__global__ void xcoshf(float *r, float *a0) { *r = Sleef_coshf1_u10cuda(*a0); }
__global__ void xtanhf(float *r, float *a0) { *r = Sleef_tanhf1_u10cuda(*a0); }
__global__ void xsinhf_u35(float *r, float *a0) { *r = Sleef_sinhf1_u35cuda(*a0); }
__global__ void xcoshf_u35(float *r, float *a0) { *r = Sleef_coshf1_u35cuda(*a0); }
__global__ void xtanhf_u35(float *r, float *a0) { *r = Sleef_tanhf1_u35cuda(*a0); }
__global__ void xfastsinf_u3500(float *r, float *a0) { *r = Sleef_fastsinf1_u3500cuda(*a0); }
__global__ void xfastcosf_u3500(float *r, float *a0) { *r = Sleef_fastcosf1_u3500cuda(*a0); }
__global__ void xfastpowf_u3500(float *r, float *a0, float *a1) { *r = Sleef_fastpowf1_u3500cuda(*a0, *a1); }
__global__ void xasinhf(float *r, float *a0) { *r = Sleef_asinhf1_u10cuda(*a0); }
__global__ void xacoshf(float *r, float *a0) { *r = Sleef_acoshf1_u10cuda(*a0); }
__global__ void xatanhf(float *r, float *a0) { *r = Sleef_atanhf1_u10cuda(*a0); }
__global__ void xexp2f(float *r, float *a0) { *r = Sleef_exp2f1_u10cuda(*a0); }
__global__ void xexp2f_u35(float *r, float *a0) { *r = Sleef_exp2f1_u35cuda(*a0); }
__global__ void xexp10f(float *r, float *a0) { *r = Sleef_exp10f1_u10cuda(*a0); }
__global__ void xexp10f_u35(float *r, float *a0) { *r = Sleef_exp10f1_u35cuda(*a0); }
__global__ void xexpm1f(float *r, float *a0) { *r = Sleef_expm1f1_u10cuda(*a0); }
__global__ void xlog10f(float *r, float *a0) { *r = Sleef_log10f1_u10cuda(*a0); }
__global__ void xlog2f(float *r, float *a0) { *r = Sleef_log2f1_u10cuda(*a0); }
__global__ void xlog2f_u35(float *r, float *a0) { *r = Sleef_log2f1_u35cuda(*a0); }
__global__ void xlog1pf(float *r, float *a0) { *r = Sleef_log1pf1_u10cuda(*a0); }
__global__ void xsincospif_u05(vfloat2 *r, float *a0) { *r = Sleef_sincospif1_u05cuda(*a0); }
__global__ void xsincospif_u35(vfloat2 *r, float *a0) { *r = Sleef_sincospif1_u35cuda(*a0); }
__global__ void xsinpif_u05(float *r, float *a0) { *r = Sleef_sinpif1_u05cuda(*a0); }
__global__ void xcospif_u05(float *r, float *a0) { *r = Sleef_cospif1_u05cuda(*a0); }
__global__ void xldexpf(float *r, float *a0, int *a1) { *r = Sleef_ldexpf1_cuda(*a0, *a1); }
__global__ void xilogbf(int *r, float *a0) { *r = Sleef_ilogbf1_cuda(*a0); }
__global__ void xfmaf(float *r, float *a0, float *a1, float *a2) { *r = Sleef_fmaf1_cuda(*a0, *a1, *a2); }
__global__ void xsqrtf(float *r, float *a0) { *r = Sleef_sqrtf1_cuda(*a0); }
__global__ void xsqrtf_u05(float *r, float *a0) { *r = Sleef_sqrtf1_u05cuda(*a0); }
__global__ void xsqrtf_u35(float *r, float *a0) { *r = Sleef_sqrtf1_u35cuda(*a0); }
__global__ void xhypotf_u05(float *r, float *a0, float *a1) { *r = Sleef_hypotf1_u05cuda(*a0, *a1); }
__global__ void xhypotf_u35(float *r, float *a0, float *a1) { *r = Sleef_hypotf1_u35cuda(*a0, *a1); }
__global__ void xfabsf(float *r, float *a0) { *r = Sleef_fabsf1_cuda(*a0); }
__global__ void xcopysignf(float *r, float *a0, float *a1) { *r = Sleef_copysignf1_cuda(*a0, *a1); }
__global__ void xfmaxf(float *r, float *a0, float *a1) { *r = Sleef_fmaxf1_cuda(*a0, *a1); }
__global__ void xfminf(float *r, float *a0, float *a1) { *r = Sleef_fminf1_cuda(*a0, *a1); }
__global__ void xfdimf(float *r, float *a0, float *a1) { *r = Sleef_fdimf1_cuda(*a0, *a1); }
__global__ void xtruncf(float *r, float *a0) { *r = Sleef_truncf1_cuda(*a0); }
__global__ void xfloorf(float *r, float *a0) { *r = Sleef_floorf1_cuda(*a0); }
__global__ void xceilf(float *r, float *a0) { *r = Sleef_ceilf1_cuda(*a0); }
__global__ void xroundf(float *r, float *a0) { *r = Sleef_roundf1_cuda(*a0); }
__global__ void xrintf(float *r, float *a0) { *r = Sleef_rintf1_cuda(*a0); }
__global__ void xnextafterf(float *r, float *a0, float *a1) { *r = Sleef_nextafterf1_cuda(*a0, *a1); }
__global__ void xfrfrexpf(float *r, float *a0) { *r = Sleef_frfrexpf1_cuda(*a0); }
__global__ void xexpfrexpf(float *r, float *a0) { *r = Sleef_expfrexpf1_cuda(*a0); }
__global__ void xfmodf(float *r, float *a0, float *a1) { *r = Sleef_fmodf1_cuda(*a0, *a1); }
__global__ void xremainderf(float *r, float *a0, float *a1) { *r = Sleef_remainderf1_cuda(*a0, *a1); }
__global__ void xmodff(vfloat2 *r, float *a0) { *r = Sleef_modff1_cuda(*a0); }
__global__ void xlgammaf_u1(float *r, float *a0) { *r = Sleef_lgammaf1_u10cuda(*a0); }
__global__ void xtgammaf_u1(float *r, float *a0) { *r = Sleef_tgammaf1_u10cuda(*a0); }
__global__ void xerff_u1(float *r, float *a0) { *r = Sleef_erff1_u10cuda(*a0); }
__global__ void xerfcf_u15(float *r, float *a0) { *r = Sleef_erfcf1_u15cuda(*a0); }
//
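// Request handlers: each func_*_* macro loops while stdin lines start with
// funcStr, parses the hex-encoded operand(s), launches the single-thread
// CUDA kernel funcName, and prints the hex-encoded result.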
#define func_d_d(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint64_t u; \
sscanf(buf, funcStr " %" PRIx64, &u); \
*a0 = u2d(u); \
funcName<<<1, 1>>>(r, a0); \
cudaDeviceSynchronize(); \
printf("%" PRIx64 "\n", d2u(*r)); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
#define func_d2_d(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint64_t u; \
sscanf(buf, funcStr " %" PRIx64, &u); \
*a0 = u2d(u); \
funcName<<<1, 1>>>(r2, a0); \
cudaDeviceSynchronize(); \
printf("%" PRIx64 " %" PRIx64 "\n", d2u(r2->x), d2u(r2->y)); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
#define func_d_d_d(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint64_t u, v; \
sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v); \
*a0 = u2d(u); \
*a1 = u2d(v); \
funcName<<<1, 1>>>(r, a0, a1); \
cudaDeviceSynchronize(); \
printf("%" PRIx64 "\n", d2u(*r)); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
#define func_d_d_i(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint64_t u, v; \
sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v); \
*a0 = u2d(u); \
*i0 = (int)u2d(v); \
funcName<<<1, 1>>>(r, a0, i0); \
cudaDeviceSynchronize(); \
printf("%" PRIx64 "\n", d2u(*r)); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
#define func_i_d(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint64_t u; \
sscanf(buf, funcStr " %" PRIx64, &u); \
*a0 = u2d(u); \
funcName<<<1, 1>>>(i0, a0); \
cudaDeviceSynchronize(); \
printf("%d\n", *i0); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
//
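// Single-precision handlers follow the same protocol with 32-bit hex values.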
#define func_f_f(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint32_t u; \
sscanf(buf, funcStr " %x", &u); \
*b0 = u2f(u); \
funcName<<<1, 1>>>(s, b0); \
cudaDeviceSynchronize(); \
printf("%x\n", f2u(*s)); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
#define func_f2_f(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint32_t u; \
sscanf(buf, funcStr " %x", &u); \
*b0 = u2f(u); \
funcName<<<1, 1>>>(s2, b0); \
cudaDeviceSynchronize(); \
printf("%x %x\n", f2u(s2->x), f2u(s2->y)); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
#define func_f_f_f(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint32_t u, v; \
sscanf(buf, funcStr " %x %x", &u, &v); \
*b0 = u2f(u); \
*b1 = u2f(v); \
funcName<<<1, 1>>>(s, b0, b1); \
cudaDeviceSynchronize(); \
printf("%x\n", f2u(*s)); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
//
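// main prints a capability bitmask (1 = double precision, 2 = single
// precision; always 3 here), then serves one request per input line.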
#define BUFSIZE 1024
int main(int argc, char **argv) {
#if 0
cuInit(0);
int ndevice;
cuDeviceGetCount(&ndevice);
if (ndevice == 0) {
fprintf(stderr, "No cuda device available\n");
exit(0);
}
CUdevice device;
char deviceName[1024];
cuDeviceGet(&device, 0);
cuDeviceGetName(deviceName, 1000, device);
fprintf(stderr, "Device : %s\n", deviceName);
#endif
cudaSetDeviceFlags(cudaDeviceScheduleSpin);
vdouble2 *r2;
vfloat2 *s2;
double *r, *a0, *a1, *a2;
float *s, *b0, *b1, *b2;
int *i0;
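// Managed (unified) memory makes these one-element buffers visible to both
// the host-side request loop and the device kernels without explicit copies.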
cudaMallocManaged(&r , 1*sizeof(double));
cudaMallocManaged(&r2, 1*sizeof(vdouble2));
cudaMallocManaged(&a0, 1*sizeof(double));
cudaMallocManaged(&a1, 1*sizeof(double));
cudaMallocManaged(&a2, 1*sizeof(double));
cudaMallocManaged(&s , 1*sizeof(float));
cudaMallocManaged(&s2, 1*sizeof(vfloat2));
cudaMallocManaged(&b0, 1*sizeof(float));
cudaMallocManaged(&b1, 1*sizeof(float));
cudaMallocManaged(&b2, 1*sizeof(float));
cudaMallocManaged(&i0, 1*sizeof(int));
printf("3\n");
fflush(stdout);
char buf[BUFSIZE];
if (fgets(buf, BUFSIZE-1, stdin)) {}
while(!feof(stdin)) {
func_d_d("sin", xsin);
func_d_d("cos", xcos);
func_d_d("tan", xtan);
func_d_d("asin", xasin);
func_d_d("acos", xacos);
func_d_d("atan", xatan);
func_d_d("log", xlog);
func_d_d("exp", xexp);
func_d_d("sqrt", xsqrt);
func_d_d("sqrt_u05", xsqrt_u05);
func_d_d("sqrt_u35", xsqrt_u35);
func_d_d("cbrt", xcbrt);
func_d_d("cbrt_u1", xcbrt_u1);
func_d_d("sinh", xsinh);
func_d_d("cosh", xcosh);
func_d_d("tanh", xtanh);
func_d_d("sinh_u35", xsinh_u35);
func_d_d("cosh_u35", xcosh_u35);
func_d_d("tanh_u35", xtanh_u35);
func_d_d("asinh", xasinh);
func_d_d("acosh", xacosh);
func_d_d("atanh", xatanh);
func_d_d("sin_u1", xsin_u1);
func_d_d("cos_u1", xcos_u1);
func_d_d("tan_u1", xtan_u1);
func_d_d("sinpi_u05", xsinpi_u05);
func_d_d("cospi_u05", xcospi_u05);
func_d_d("asin_u1", xasin_u1);
func_d_d("acos_u1", xacos_u1);
func_d_d("atan_u1", xatan_u1);
func_d_d("log_u1", xlog_u1);
func_d_d("exp2", xexp2);
func_d_d("exp10", xexp10);
func_d_d("exp2_u35", xexp2_u35);
func_d_d("exp10_u35", xexp10_u35);
func_d_d("expm1", xexpm1);
func_d_d("log10", xlog10);
func_d_d("log2", xlog2);
func_d_d("log2_u35", xlog2_u35);
func_d_d("log1p", xlog1p);
func_d_d("fabs", xfabs);
func_d_d("trunc", xtrunc);
func_d_d("floor", xfloor);
func_d_d("ceil", xceil);
func_d_d("round", xround);
func_d_d("rint", xrint);
func_d_d("frfrexp", xfrfrexp);
func_d_d("tgamma_u1", xtgamma_u1);
func_d_d("lgamma_u1", xlgamma_u1);
func_d_d("erf_u1", xerf_u1);
func_d_d("erfc_u15", xerfc_u15);
func_d2_d("sincos", xsincos);
func_d2_d("sincos_u1", xsincos_u1);
func_d2_d("sincospi_u35", xsincospi_u35);
func_d2_d("sincospi_u05", xsincospi_u05);
func_d2_d("modf", xmodf);
func_d_d_d("pow", xpow);
func_d_d_d("atan2", xatan2);
func_d_d_d("atan2_u1", xatan2_u1);
func_d_d_d("hypot_u05", xhypot_u05);
func_d_d_d("hypot_u35", xhypot_u35);
func_d_d_d("copysign", xcopysign);
func_d_d_d("fmax", xfmax);
func_d_d_d("fmin", xfmin);
func_d_d_d("fdim", xfdim);
func_d_d_d("nextafter", xnextafter);
func_d_d_d("fmod", xfmod);
func_d_d_d("remainder", xremainder);
func_d_d_i("ldexp", xldexp);
func_i_d("ilogb", xilogb);
func_i_d("expfrexp", xexpfrexp);
//
func_f_f("sinf", xsinf);
func_f_f("cosf", xcosf);
func_f_f("tanf", xtanf);
func_f_f("asinf", xasinf);
func_f_f("acosf", xacosf);
func_f_f("atanf", xatanf);
func_f_f("logf", xlogf);
func_f_f("expf", xexpf);
func_f_f("sqrtf", xsqrtf);
func_f_f("sqrtf_u05", xsqrtf_u05);
func_f_f("sqrtf_u35", xsqrtf_u35);
func_f_f("cbrtf", xcbrtf);
func_f_f("cbrtf_u1", xcbrtf_u1);
func_f_f("sinhf", xsinhf);
func_f_f("coshf", xcoshf);
func_f_f("tanhf", xtanhf);
func_f_f("sinhf_u35", xsinhf_u35);
func_f_f("coshf_u35", xcoshf_u35);
func_f_f("tanhf_u35", xtanhf_u35);
func_f_f("asinhf", xasinhf);
func_f_f("acoshf", xacoshf);
func_f_f("atanhf", xatanhf);
func_f_f("sinf_u1", xsinf_u1);
func_f_f("cosf_u1", xcosf_u1);
func_f_f("tanf_u1", xtanf_u1);
func_f_f("sinpif_u05", xsinpif_u05);
func_f_f("cospif_u05", xcospif_u05);
func_f_f("asinf_u1", xasinf_u1);
func_f_f("acosf_u1", xacosf_u1);
func_f_f("atanf_u1", xatanf_u1);
func_f_f("logf_u1", xlogf_u1);
func_f_f("exp2f", xexp2f);
func_f_f("exp10f", xexp10f);
func_f_f("exp2f_u35", xexp2f_u35);
func_f_f("exp10f_u35", xexp10f_u35);
func_f_f("expm1f", xexpm1f);
func_f_f("log10f", xlog10f);
func_f_f("log2f", xlog2f);
func_f_f("log2f_u35", xlog2f_u35);
func_f_f("log1pf", xlog1pf);
func_f2_f("sincosf", xsincosf);
func_f2_f("sincosf_u1", xsincosf_u1);
func_f2_f("sincospif_u35", xsincospif_u35);
func_f2_f("sincospif_u05", xsincospif_u05);
func_f_f_f("powf", xpowf);
func_f_f_f("atan2f", xatan2f);
func_f_f_f("atan2f_u1", xatan2f_u1);
func_f_f("fabsf", xfabsf);
func_f_f("truncf", xtruncf);
func_f_f("floorf", xfloorf);
func_f_f("ceilf", xceilf);
func_f_f("roundf", xroundf);
func_f_f("rintf", xrintf);
func_f_f("frfrexpf", xfrfrexpf);
func_f_f_f("hypotf_u05", xhypotf_u05);
func_f_f_f("hypotf_u35", xhypotf_u35);
func_f_f_f("copysignf", xcopysignf);
func_f_f_f("fmaxf", xfmaxf);
func_f_f_f("fminf", xfminf);
func_f_f_f("fdimf", xfdimf);
func_f_f_f("nextafterf", xnextafterf);
func_f_f_f("fmodf", xfmodf);
func_f_f_f("remainderf", xremainderf);
func_f2_f("modff", xmodff);
func_f_f("tgammaf_u1", xtgammaf_u1);
func_f_f("lgammaf_u1", xlgammaf_u1);
func_f_f("erff_u1", xerff_u1);
func_f_f("erfcf_u15", xerfcf_u15);
func_f_f("fastsinf_u3500", xfastsinf_u3500);
func_f_f("fastcosf_u3500", xfastcosf_u3500);
func_f_f_f("fastpowf_u3500", xfastpowf_u3500);
}
return 0;
}


@@ -0,0 +1,859 @@
// Copyright Naoki Shibata and contributors 2010 - 2023.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <inttypes.h>
#include <assert.h>
#include <math.h>
#if defined(_MSC_VER)
#define STDIN_FILENO 0
#else
#include <unistd.h>
#include <sys/types.h>
#endif
#include "quaddef.h"
#include "misc.h"
#if !defined(USE_INLINE_HEADER)
#include "sleef.h"
#else // #if !defined(USE_INLINE_HEADER)
#include <stddef.h>
#include <stdint.h>
#include <float.h>
#include <limits.h>
#if defined(__AVX2__) || defined(__aarch64__) || defined(__arm__) || defined(__powerpc64__)
#ifndef FP_FAST_FMA
#define FP_FAST_FMA
#endif
#endif
#if defined(_MSC_VER) && !defined(__STDC__)
#define __STDC__ 1
#endif
#if (defined(__GNUC__) || defined(__CLANG__)) && (defined(__i386__) || defined(__x86_64__))
#include <x86intrin.h>
#endif
#if (defined(_MSC_VER))
#include <intrin.h>
#endif
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#endif
#if defined(__riscv) && defined(__riscv_v)
#include <riscv_vector.h>
#endif
#if defined(__VSX__)
#include <altivec.h>
#endif
#if defined(__VX__)
#include <vecintrin.h>
#endif
#define SLEEF_ALWAYS_INLINE inline
#define SLEEF_INLINE
#define SLEEF_CONST
#include USE_INLINE_HEADER
#include MACRO_ONLY_HEADER
#ifndef ENABLE_PUREC_SCALAR
#include "sleefinline_purec_scalar.h"
#endif
#endif // #if !defined(USE_INLINE_HEADER)
#include "testerutil.h"
#define DORENAME
#ifdef ENABLE_SSE2
#include "renamesse2.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 2
#include "helpersse2.h"
typedef Sleef___m128d_2 vdouble2;
typedef Sleef___m128_2 vfloat2;
#endif
#endif
#ifdef ENABLE_SSE4
#include "renamesse4.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 4
#include "helpersse2.h"
typedef Sleef___m128d_2 vdouble2;
typedef Sleef___m128_2 vfloat2;
#endif
#endif
#ifdef ENABLE_AVX
#include "renameavx.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 1
#include "helperavx.h"
typedef Sleef___m256d_2 vdouble2;
typedef Sleef___m256_2 vfloat2;
#endif
#endif
#ifdef ENABLE_FMA4
#include "renamefma4.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 4
#include "helperavx.h"
typedef Sleef___m256d_2 vdouble2;
typedef Sleef___m256_2 vfloat2;
#endif
#endif
#ifdef ENABLE_AVX2
#include "renameavx2.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 1
#include "helperavx2.h"
typedef Sleef___m256d_2 vdouble2;
typedef Sleef___m256_2 vfloat2;
#endif
#endif
#ifdef ENABLE_AVX2128
#include "renameavx2128.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 1
#include "helperavx2_128.h"
typedef Sleef___m128d_2 vdouble2;
typedef Sleef___m128_2 vfloat2;
#endif
#endif
#ifdef ENABLE_AVX512F
#include "renameavx512f.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 1
#include "helperavx512f.h"
typedef Sleef___m512d_2 vdouble2;
typedef Sleef___m512_2 vfloat2;
#endif
#endif
#ifdef ENABLE_AVX512FNOFMA
#include "renameavx512fnofma.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 2
#include "helperavx512f.h"
typedef Sleef___m512d_2 vdouble2;
typedef Sleef___m512_2 vfloat2;
#endif
#endif
#ifdef ENABLE_VECEXT
#define CONFIG 1
#include "helpervecext.h"
#include "norename.h"
#endif
#ifdef ENABLE_PUREC
#define CONFIG 1
#include "helperpurec.h"
#include "norename.h"
#endif
#ifdef ENABLE_NEON32
#include "renameneon32.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 1
#include "helperneon32.h"
typedef Sleef_float32x4_t_2 vfloat2;
#endif
#endif
#ifdef ENABLE_NEON32VFPV4
#include "renameneon32vfpv4.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 4
#include "helperneon32.h"
typedef Sleef_float32x4_t_2 vfloat2;
#endif
#endif
#ifdef ENABLE_ADVSIMD
#include "renameadvsimd.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 1
#include "helperadvsimd.h"
typedef Sleef_float64x2_t_2 vdouble2;
typedef Sleef_float32x4_t_2 vfloat2;
#endif
#endif
#ifdef ENABLE_ADVSIMDNOFMA
#include "renameadvsimdnofma.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 2
#include "helperadvsimd.h"
typedef Sleef_float64x2_t_2 vdouble2;
typedef Sleef_float32x4_t_2 vfloat2;
#endif
#endif
#ifdef ENABLE_DSP128
#define CONFIG 2
#include "helpersse2.h"
#include "renamedsp128.h"
typedef Sleef___m128d_2 vdouble2;
typedef Sleef___m128_2 vfloat2;
#endif
#ifdef ENABLE_SVE
#include "renamesve.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 1
#include "helpersve.h"
#endif
#endif
#ifdef ENABLE_SVENOFMA
#include "renamesvenofma.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 2
#include "helpersve.h"
#endif
#endif
#ifdef ENABLE_DSP256
#define CONFIG 1
#include "helperavx.h"
#include "renamedsp256.h"
typedef Sleef___m256d_2 vdouble2;
typedef Sleef___m256_2 vfloat2;
#endif
#ifdef ENABLE_VSX
#include "renamevsx.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 1
#include "helperpower_128.h"
#include "renamevsx.h"
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
#endif
#endif
#ifdef ENABLE_VSXNOFMA
#include "renamevsxnofma.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 2
#include "helperpower_128.h"
#include "renamevsxnofma.h"
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
#endif
#endif
#ifdef ENABLE_VSX3
#include "renamevsx3.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 3
#include "helperpower_128.h"
#include "renamevsx3.h"
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
#endif
#endif
#ifdef ENABLE_VSX3NOFMA
#include "renamevsx3nofma.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 4
#include "helperpower_128.h"
#include "renamevsx3nofma.h"
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
#endif
#endif
#ifdef ENABLE_VXE
#include "renamevxe.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 140
#include "helpers390x_128.h"
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
#endif
#endif
#ifdef ENABLE_VXENOFMA
#include "renamevxenofma.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 141
#include "helpers390x_128.h"
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
#endif
#endif
#ifdef ENABLE_VXE2
#include "renamevxe2.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 150
#include "helpers390x_128.h"
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
#endif
#endif
#ifdef ENABLE_VXE2NOFMA
#include "renamevxe2nofma.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 151
#include "helpers390x_128.h"
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
#endif
#endif
#ifdef ENABLE_DSPPOWER_128
#define CONFIG 1
#include "helperpower_128.h"
#include "renamedsp128.h"
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
#endif
#ifdef ENABLE_DSPS390X_128
#define CONFIG 140
#include "helpers390x_128.h"
#include "renamedsp128.h"
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
#endif
#ifdef ENABLE_RVVM1
#include "renamervvm1.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 1
#include "helperrvv.h"
#endif
#endif
#ifdef ENABLE_RVVM1NOFMA
#include "renamervvm1nofma.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 2
#include "helperrvv.h"
#endif
#endif
#ifdef ENABLE_RVVM2
#include "renamervvm2.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 1
#include "helperrvv.h"
#endif
#endif
#ifdef ENABLE_RVVM2NOFMA
#include "renamervvm2nofma.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 2
#include "helperrvv.h"
#endif
#endif
#ifdef ENABLE_PUREC_SCALAR
#include "renamepurec_scalar.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 1
#include "helperpurec_scalar.h"
typedef Sleef_double_2 vdouble2;
typedef Sleef_float_2 vfloat2;
#endif
#endif
#ifdef ENABLE_PURECFMA_SCALAR
#include "renamepurecfma_scalar.h"
#if !defined(USE_INLINE_HEADER)
#define CONFIG 2
#include "helperpurec_scalar.h"
typedef Sleef_double_2 vdouble2;
typedef Sleef_float_2 vfloat2;
#endif
#endif
#ifdef ENABLE_DSP_SCALAR
#include "renamedspscalar.h"
#define CONFIG 1
#include "helperpurec_scalar.h"
typedef Sleef_double_2 vdouble2;
typedef Sleef_float_2 vfloat2;
#endif
#ifdef USE_INLINE_HEADER
#define CONCAT_SIMD_SUFFIX_(keyword, suffix) keyword ## suffix
#define CONCAT_SIMD_SUFFIX(keyword, suffix) CONCAT_SIMD_SUFFIX_(keyword, suffix)
#define vmask CONCAT_SIMD_SUFFIX(vmask, SIMD_SUFFIX)
#define vopmask CONCAT_SIMD_SUFFIX(vopmask, SIMD_SUFFIX)
#define vdouble CONCAT_SIMD_SUFFIX(vdouble, SIMD_SUFFIX)
#define vint CONCAT_SIMD_SUFFIX(vint, SIMD_SUFFIX)
#define vfloat CONCAT_SIMD_SUFFIX(vfloat, SIMD_SUFFIX)
#define vint2 CONCAT_SIMD_SUFFIX(vint2, SIMD_SUFFIX)
#define vdouble2 CONCAT_SIMD_SUFFIX(vdouble2, SIMD_SUFFIX)
#define vfloat2 CONCAT_SIMD_SUFFIX(vfloat2, SIMD_SUFFIX)
#define vd2getx_vd_vd2 CONCAT_SIMD_SUFFIX(vd2getx_vd_vd2, SIMD_SUFFIX)
#define vd2gety_vd_vd2 CONCAT_SIMD_SUFFIX(vd2gety_vd_vd2, SIMD_SUFFIX)
#define vf2getx_vf_vf2 CONCAT_SIMD_SUFFIX(vf2getx_vf_vf2, SIMD_SUFFIX)
#define vf2gety_vf_vf2 CONCAT_SIMD_SUFFIX(vf2gety_vf_vf2, SIMD_SUFFIX)
#define vloadu_vd_p CONCAT_SIMD_SUFFIX(vloadu_vd_p, SIMD_SUFFIX)
#define vstoreu_v_p_vd CONCAT_SIMD_SUFFIX(vstoreu_v_p_vd, SIMD_SUFFIX)
#define vloadu_vf_p CONCAT_SIMD_SUFFIX(vloadu_vf_p, SIMD_SUFFIX)
#define vstoreu_v_p_vf CONCAT_SIMD_SUFFIX(vstoreu_v_p_vf, SIMD_SUFFIX)
#define vloadu_vi_p CONCAT_SIMD_SUFFIX(vloadu_vi_p, SIMD_SUFFIX)
#define vstoreu_v_p_vi CONCAT_SIMD_SUFFIX(vstoreu_v_p_vi, SIMD_SUFFIX)
#endif
//
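// check_feature issues one SLEEF vector call per enabled precision, so an
// unsupported SIMD extension is detected (typically via SIGILL) before the
// actual testing starts.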
int check_feature(double d, float f) {
#ifdef ENABLE_DP
{
double s[VECTLENDP];
int i;
for(i=0;i<VECTLENDP;i++) {
s[i] = d;
}
vdouble a = vloadu_vd_p(s);
a = xpow(a, a);
vstoreu_v_p_vd(s, a);
if (s[0] == s[0]) return 1;
}
#endif
#ifdef ENABLE_SP
{
float s[VECTLENSP];
int i;
for(i=0;i<VECTLENSP;i++) {
s[i] = d;
}
vfloat a = vloadu_vf_p(s);
a = xpowf(a, a);
vstoreu_v_p_vf(s, a);
if (s[0] == s[0]) return 1;
}
#endif
return 0;
}
#if defined(ENABLE_DP) && !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA) || defined(USE_INLINE_HEADER))
static vdouble vd2getx_vd_vd2(vdouble2 v) { return v.x; }
static vdouble vd2gety_vd_vd2(vdouble2 v) { return v.y; }
#endif
#if defined(ENABLE_SP) && !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA) || defined(USE_INLINE_HEADER))
static vfloat vf2getx_vf_vf2(vfloat2 v) { return v.x; }
static vfloat vf2gety_vf_vf2(vfloat2 v) { return v.y; }
#endif
//
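// Vector handlers place the operand in a random lane of an otherwise
// randomized vector, apply the function, and read the same lane back, so the
// neighbouring lanes are exercised as well.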
#define func_d_d(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint64_t u; \
sscanf(buf, funcStr " %" PRIx64, &u); \
double s[VECTLENDP]; \
memrand(s, sizeof(s)); \
int idx = xrand() & (VECTLENDP-1); \
s[idx] = u2d(u); \
vdouble a = vloadu_vd_p(s); \
a = funcName(a); \
vstoreu_v_p_vd(s, a); \
u = d2u(s[idx]); \
printf("%" PRIx64 "\n", u); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
#define func_d2_d(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint64_t u; \
sscanf(buf, funcStr " %" PRIx64, &u); \
double s[VECTLENDP], t[VECTLENDP]; \
memrand(s, sizeof(s)); \
memrand(t, sizeof(t)); \
int idx = xrand() & (VECTLENDP-1); \
s[idx] = u2d(u); \
vdouble2 v; \
vdouble a = vloadu_vd_p(s); \
v = funcName(a); \
vstoreu_v_p_vd(s, vd2getx_vd_vd2(v)); \
vstoreu_v_p_vd(t, vd2gety_vd_vd2(v)); \
Sleef_double2 d2; \
d2.x = s[idx]; \
d2.y = t[idx]; \
printf("%" PRIx64 " %" PRIx64 "\n", d2u(d2.x), d2u(d2.y)); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
#define func_d_d_d(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint64_t u, v; \
sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v); \
double s[VECTLENDP], t[VECTLENDP]; \
memrand(s, sizeof(s)); \
memrand(t, sizeof(t)); \
int idx = xrand() & (VECTLENDP-1); \
s[idx] = u2d(u); \
t[idx] = u2d(v); \
vdouble a, b; \
a = vloadu_vd_p(s); \
b = vloadu_vd_p(t); \
a = funcName(a, b); \
vstoreu_v_p_vd(s, a); \
u = d2u(s[idx]); \
printf("%" PRIx64 "\n", u); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
#define func_d_d_i(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint64_t u, v; \
sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v); \
double s[VECTLENDP]; \
int t[VECTLENDP*2]; \
memrand(s, sizeof(s)); \
memrand(t, sizeof(t)); \
int idx = xrand() & (VECTLENDP-1); \
s[idx] = u2d(u); \
t[idx] = (int)u2d(v); \
vstoreu_v_p_vd(s, funcName(vloadu_vd_p(s), vloadu_vi_p(t))); \
u = d2u(s[idx]); \
printf("%" PRIx64 "\n", u); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
#define func_i_d(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint64_t u; \
int i; \
sscanf(buf, funcStr " %" PRIx64, &u); \
double s[VECTLENDP]; \
int t[VECTLENDP*2]; \
memrand(s, sizeof(s)); \
memrand(t, sizeof(t)); \
int idx = xrand() & (VECTLENDP-1); \
s[idx] = u2d(u); \
vdouble a = vloadu_vd_p(s); \
vint vi = funcName(a); \
vstoreu_v_p_vi(t, vi); \
i = t[idx]; \
printf("%d\n", i); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
//
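// Single-precision vector handlers, same scheme with 32-bit hex values.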
#define func_f_f(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint32_t u; \
sscanf(buf, funcStr " %x", &u); \
float s[VECTLENSP]; \
memrand(s, sizeof(s)); \
int idx = xrand() & (VECTLENSP-1); \
s[idx] = u2f(u); \
vfloat a = vloadu_vf_p(s); \
a = funcName(a); \
vstoreu_v_p_vf(s, a); \
u = f2u(s[idx]); \
printf("%x\n", u); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
#define func_f2_f(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint32_t u; \
sscanf(buf, funcStr " %x", &u); \
float s[VECTLENSP], t[VECTLENSP]; \
memrand(s, sizeof(s)); \
memrand(t, sizeof(t)); \
int idx = xrand() & (VECTLENSP-1); \
s[idx] = u2f(u); \
vfloat2 v; \
vfloat a = vloadu_vf_p(s); \
v = funcName(a); \
vstoreu_v_p_vf(s, vf2getx_vf_vf2(v)); \
vstoreu_v_p_vf(t, vf2gety_vf_vf2(v)); \
Sleef_float2 d2; \
d2.x = s[idx]; \
d2.y = t[idx]; \
printf("%x %x\n", f2u(d2.x), f2u(d2.y)); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
#define func_f_f_f(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
uint32_t u, v; \
sscanf(buf, funcStr " %x %x", &u, &v); \
float s[VECTLENSP], t[VECTLENSP]; \
memrand(s, sizeof(s)); \
memrand(t, sizeof(t)); \
int idx = xrand() & (VECTLENSP-1); \
s[idx] = u2f(u); \
t[idx] = u2f(v); \
vfloat a, b; \
a = vloadu_vf_p(s); \
b = vloadu_vf_p(t); \
a = funcName(a, b); \
vstoreu_v_p_vf(s, a); \
u = f2u(s[idx]); \
printf("%x\n", u); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}
//
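// main2 first reports a capability bitmask (1 = double precision, 2 = single
// precision, 4 = flush-to-zero, 8 = deterministic build) and then enters the
// request loop.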
#define BUFSIZE 1024
int main2(int argc, char **argv) {
xsrand(time(NULL));
{
int k = 0;
#ifdef ENABLE_DP
k += 1;
#endif
#ifdef ENABLE_SP
k += 2;
#endif
#if defined(ENABLE_NEON32) || defined(ENABLE_NEON32VFPV4)
k += 4; // flush to zero
#elif defined(ENABLE_VECEXT)
if (vcast_f_vf(xpowf(vcast_vf_f(0.5f), vcast_vf_f(140))) == 0) k += 4;
#endif
#if defined(DETERMINISTIC)
k += 8;
#endif
printf("%d\n", k);
fflush(stdout);
}
#if !defined(USE_INLINE_HEADER)
fprintf(stderr, "IUT : %s\n", (const char *)xgetPtrf(0));
#endif
fflush(stderr);
char buf[BUFSIZE];
if (fgets(buf, BUFSIZE-1, stdin)) {}
while(!feof(stdin)) {
#ifdef ENABLE_DP
func_d_d("sin", xsin);
func_d_d("cos", xcos);
func_d_d("tan", xtan);
func_d_d("asin", xasin);
func_d_d("acos", xacos);
func_d_d("atan", xatan);
func_d_d("log", xlog);
func_d_d("exp", xexp);
#ifndef DETERMINISTIC
func_d_d("sqrt", xsqrt);
func_d_d("sqrt_u05", xsqrt_u05);
func_d_d("sqrt_u35", xsqrt_u35);
#endif
func_d_d("cbrt", xcbrt);
func_d_d("cbrt_u1", xcbrt_u1);
func_d_d("sinh", xsinh);
func_d_d("cosh", xcosh);
func_d_d("tanh", xtanh);
func_d_d("sinh_u35", xsinh_u35);
func_d_d("cosh_u35", xcosh_u35);
func_d_d("tanh_u35", xtanh_u35);
func_d_d("asinh", xasinh);
func_d_d("acosh", xacosh);
func_d_d("atanh", xatanh);
func_d_d("sin_u1", xsin_u1);
func_d_d("cos_u1", xcos_u1);
func_d_d("tan_u1", xtan_u1);
func_d_d("sinpi_u05", xsinpi_u05);
func_d_d("cospi_u05", xcospi_u05);
func_d_d("asin_u1", xasin_u1);
func_d_d("acos_u1", xacos_u1);
func_d_d("atan_u1", xatan_u1);
func_d_d("log_u1", xlog_u1);
func_d_d("exp2", xexp2);
func_d_d("exp10", xexp10);
func_d_d("exp2_u35", xexp2_u35);
func_d_d("exp10_u35", xexp10_u35);
func_d_d("expm1", xexpm1);
func_d_d("log10", xlog10);
func_d_d("log2", xlog2);
func_d_d("log2_u35", xlog2_u35);
func_d_d("log1p", xlog1p);
func_d2_d("sincos", xsincos);
func_d2_d("sincos_u1", xsincos_u1);
func_d2_d("sincospi_u35", xsincospi_u35);
func_d2_d("sincospi_u05", xsincospi_u05);
func_d_d_d("pow", xpow);
func_d_d_d("atan2", xatan2);
func_d_d_d("atan2_u1", xatan2_u1);
func_d_d_i("ldexp", xldexp);
func_i_d("ilogb", xilogb);
func_d_d("fabs", xfabs);
func_d_d("trunc", xtrunc);
func_d_d("floor", xfloor);
func_d_d("ceil", xceil);
func_d_d("round", xround);
func_d_d("rint", xrint);
func_d_d("frfrexp", xfrfrexp);
func_i_d("expfrexp", xexpfrexp);
func_d_d_d("hypot_u05", xhypot_u05);
func_d_d_d("hypot_u35", xhypot_u35);
func_d_d_d("copysign", xcopysign);
func_d_d_d("fmax", xfmax);
func_d_d_d("fmin", xfmin);
func_d_d_d("fdim", xfdim);
func_d_d_d("nextafter", xnextafter);
func_d_d_d("fmod", xfmod);
func_d_d_d("remainder", xremainder);
func_d2_d("modf", xmodf);
func_d_d("tgamma_u1", xtgamma_u1);
func_d_d("lgamma_u1", xlgamma_u1);
func_d_d("erf_u1", xerf_u1);
func_d_d("erfc_u15", xerfc_u15);
#endif
#ifdef ENABLE_SP
func_f_f("sinf", xsinf);
func_f_f("cosf", xcosf);
func_f_f("tanf", xtanf);
func_f_f("asinf", xasinf);
func_f_f("acosf", xacosf);
func_f_f("atanf", xatanf);
func_f_f("logf", xlogf);
func_f_f("expf", xexpf);
#ifndef DETERMINISTIC
func_f_f("sqrtf", xsqrtf);
func_f_f("sqrtf_u05", xsqrtf_u05);
func_f_f("sqrtf_u35", xsqrtf_u35);
#endif
func_f_f("cbrtf", xcbrtf);
func_f_f("cbrtf_u1", xcbrtf_u1);
func_f_f("sinhf", xsinhf);
func_f_f("coshf", xcoshf);
func_f_f("tanhf", xtanhf);
func_f_f("sinhf_u35", xsinhf_u35);
func_f_f("coshf_u35", xcoshf_u35);
func_f_f("tanhf_u35", xtanhf_u35);
func_f_f("asinhf", xasinhf);
func_f_f("acoshf", xacoshf);
func_f_f("atanhf", xatanhf);
func_f_f("sinf_u1", xsinf_u1);
func_f_f("cosf_u1", xcosf_u1);
func_f_f("tanf_u1", xtanf_u1);
func_f_f("sinpif_u05", xsinpif_u05);
func_f_f("cospif_u05", xcospif_u05);
func_f_f("asinf_u1", xasinf_u1);
func_f_f("acosf_u1", xacosf_u1);
func_f_f("atanf_u1", xatanf_u1);
func_f_f("logf_u1", xlogf_u1);
func_f_f("exp2f", xexp2f);
func_f_f("exp10f", xexp10f);
func_f_f("exp2f_u35", xexp2f_u35);
func_f_f("exp10f_u35", xexp10f_u35);
func_f_f("expm1f", xexpm1f);
func_f_f("log10f", xlog10f);
func_f_f("log2f", xlog2f);
func_f_f("log2f_u35", xlog2f_u35);
func_f_f("log1pf", xlog1pf);
func_f2_f("sincosf", xsincosf);
func_f2_f("sincosf_u1", xsincosf_u1);
func_f2_f("sincospif_u35", xsincospif_u35);
func_f2_f("sincospif_u05", xsincospif_u05);
func_f_f_f("powf", xpowf);
func_f_f_f("atan2f", xatan2f);
func_f_f_f("atan2f_u1", xatan2f_u1);
func_f_f("fabsf", xfabsf);
func_f_f("truncf", xtruncf);
func_f_f("floorf", xfloorf);
func_f_f("ceilf", xceilf);
func_f_f("roundf", xroundf);
func_f_f("rintf", xrintf);
func_f_f("frfrexpf", xfrfrexpf);
func_f_f_f("hypotf_u05", xhypotf_u05);
func_f_f_f("hypotf_u35", xhypotf_u35);
func_f_f_f("copysignf", xcopysignf);
func_f_f_f("fmaxf", xfmaxf);
func_f_f_f("fminf", xfminf);
func_f_f_f("fdimf", xfdimf);
func_f_f_f("nextafterf", xnextafterf);
func_f_f_f("fmodf", xfmodf);
func_f_f_f("remainderf", xremainderf);
func_f2_f("modff", xmodff);
func_f_f("tgammaf_u1", xtgammaf_u1);
func_f_f("lgammaf_u1", xlgammaf_u1);
func_f_f("erff_u1", xerff_u1);
func_f_f("erfcf_u15", xerfcf_u15);
func_f_f("fastsinf_u3500", xfastsinf_u3500);
func_f_f("fastcosf_u3500", xfastcosf_u3500);
func_f_f_f("fastpowf_u3500", xfastpowf_u3500);
#endif
}
return 0;
}


@@ -0,0 +1,92 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <x86intrin.h>
#include <sleef.h>
#define N 64
#define M 256
double r0[N], a0[N], a1[N], a2[N];
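// Each do_* helper evaluates sin through one x86 SIMD extension;
// do_test_once accepts the input if any SLEEF result matches libm exactly.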
void do_libm() { for(int i=0;i<N;i++) r0[i] = sin(a0[i]); }
#if defined(__SSE2__)
void do_sleef_sse2() { _mm_storeu_pd(r0, Sleef_sind2_u10sse2(_mm_loadu_pd(a0))); }
#endif
#if defined(__AVX__)
void do_sleef_avx() { _mm256_storeu_pd(r0, Sleef_sind4_u10avx(_mm256_loadu_pd(a0))); }
#endif
#if defined(__AVX2__)
void do_sleef_avx2() { _mm256_storeu_pd(r0, Sleef_sind4_u10avx2(_mm256_loadu_pd(a0))); }
#endif
#if defined(__AVX512F__)
void do_sleef_avx512f() { _mm512_storeu_pd(r0, Sleef_sind8_u10avx512f(_mm512_loadu_pd(a0))); }
#endif
int do_test_once(double d) {
for(int i=0;i<N;i++) a0[i] = d;
do_libm();
double rm = r0[0];
#if defined(__SSE2__)
for(int i=0;i<N;i++) a0[i] = d;
do_sleef_sse2();
if (rm == r0[0]) return 1;
#endif
#if defined(__AVX__)
for(int i=0;i<N;i++) a0[i] = d;
do_sleef_avx();
if (rm == r0[0]) return 1;
#endif
#if defined(__AVX2__)
for(int i=0;i<N;i++) a0[i] = d;
do_sleef_avx2();
if (rm == r0[0]) return 1;
#endif
#if defined(__AVX512F__)
for(int i=0;i<N;i++) a0[i] = d;
do_sleef_avx512f();
if (rm == r0[0]) return 1;
#endif
return 0;
}
int check_feature(double d, float f) {
#if defined(__SSE2__)
do_sleef_sse2();
#endif
#if defined(__AVX__)
do_sleef_avx();
#endif
#if defined(__AVX2__)
do_sleef_avx2();
#endif
#if defined(__AVX512F__)
do_sleef_avx512f();
#endif
return 1;
}
int main2(int argc, char **argv) {
for(int i=0;i<M;i++) {
if (!do_test_once(10.0 * ((2.0 * rand() / RAND_MAX) - 1))) {
printf("fail\n");
exit(-1);
}
}
printf("pass\n");
exit(0);
}

File diff suppressed because it is too large


@@ -0,0 +1,991 @@
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <mpfr.h>
#include <time.h>
#include <float.h>
#include <limits.h>
#include <math.h>
#ifdef ENABLE_SYS_getrandom
#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/random.h>
#endif
#include "sleef.h"
#include "testerutil.h"
#define DORENAME
#include "rename.h"
#define DENORMAL_DBL_MIN (4.9406564584124654418e-324)
#define POSITIVE_INFINITY INFINITY
#define NEGATIVE_INFINITY (-INFINITY)
typedef union {
double d;
uint64_t u64;
int64_t i64;
} conv_t;
double nexttoward0(double x, int n) {
union {
double f;
uint64_t u;
} cx;
cx.f = x;
cx.u -= n;
return cx.f;
}
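// rnd() returns raw 64-bit patterns reinterpreted as doubles, biased toward
// values just around +-0 and +-infinity to stress edge cases.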
double rnd() {
conv_t c;
switch(random() & 63) {
case 0: return nexttoward0( 0.0, -(random() & ((1 << (random() & 31)) - 1)));
case 1: return nexttoward0(-0.0, -(random() & ((1 << (random() & 31)) - 1)));
case 2: return nexttoward0( INFINITY, (random() & ((1 << (random() & 31)) - 1)));
case 3: return nexttoward0(-INFINITY, (random() & ((1 << (random() & 31)) - 1)));
}
#ifdef ENABLE_SYS_getrandom
syscall(SYS_getrandom, &c.u64, sizeof(c.u64), 0);
#else
c.u64 = random() | ((uint64_t)random() << 31) | ((uint64_t)random() << 62);
#endif
return c.d;
}
double rnd_fr() {
conv_t c;
do {
#ifdef ENABLE_SYS_getrandom
syscall(SYS_getrandom, &c.u64, sizeof(c.u64), 0);
#else
c.u64 = random() | ((uint64_t)random() << 31) | ((uint64_t)random() << 62);
#endif
} while(!isnumber(c.d));
return c.d;
}
double rnd_zo() {
conv_t c;
do {
#ifdef ENABLE_SYS_getrandom
syscall(SYS_getrandom, &c.u64, sizeof(c.u64), 0);
#else
c.u64 = random() | ((uint64_t)random() << 31) | ((uint64_t)random() << 62);
#endif
} while(!isnumber(c.d) || c.d < -1 || 1 < c.d);
return c.d;
}
int main(int argc,char **argv)
{
mpfr_t frw, frx, fry, frz;
mpfr_set_default_prec(1280);
mpfr_inits(frw, frx, fry, frz, NULL);
conv_t cd;
double d, t;
double d2, d3, zo;
int cnt, ecnt = 0;
srandom(time(NULL));
for(cnt = 0;ecnt < 1000;cnt++) {
switch(cnt & 7) {
case 0:
d = rnd();
d2 = rnd();
d3 = rnd();
zo = rnd();
break;
case 1:
cd.d = rint(rnd_zo() * 1e+10) * M_PI_4;
cd.i64 += (random() & 0xff) - 0x7f;
d = cd.d;
d2 = rnd();
d3 = rnd();
zo = rnd();
break;
case 2:
cd.d = rnd_fr() * M_PI_4;
cd.i64 += (random() & 0xf) - 0x7;
d = cd.d;
d2 = rnd();
d3 = rnd();
zo = rnd();
break;
default:
d = rnd_fr();
d2 = rnd_fr();
d3 = rnd_fr();
zo = rnd_zo();
break;
}
Sleef_double2 sc = xsincospi_u05(d);
Sleef_double2 sc2 = xsincospi_u35(d);
{
const double rangemax2 = 1e+9/4;
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_sinpi(frx, frx, GMP_RNDN);
double u0 = countULP2dp(t = sc.x, frx);
if (u0 != 0 && ((fabs(d) <= rangemax2 && u0 > 0.506) || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C sincospi_u05 sin arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
double u1 = countULP2dp(t = sc2.x, frx);
if (u1 != 0 && ((fabs(d) <= rangemax2 && u1 > 1.5) || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C sincospi_u35 sin arg=%.20g ulp=%.20g\n", d, u1);
fflush(stdout); ecnt++;
}
double u2 = countULP2dp(t = xsinpi_u05(d), frx);
if (u2 != 0 && ((fabs(d) <= rangemax2 && u2 > 0.506) || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C sinpi_u05 arg=%.20g ulp=%.20g\n", d, u2);
fflush(stdout); ecnt++;
}
}
{
const double rangemax2 = 1e+9/4;
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_cospi(frx, frx, GMP_RNDN);
double u0 = countULP2dp(t = sc.y, frx);
if (u0 != 0 && ((fabs(d) <= rangemax2 && u0 > 0.506) || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C sincospi_u05 cos arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
double u1 = countULP2dp(t = sc2.y, frx);
if (u1 != 0 && ((fabs(d) <= rangemax2 && u1 > 1.5) || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C sincospi_u35 cos arg=%.20g ulp=%.20g\n", d, u1);
fflush(stdout); ecnt++;
}
double u2 = countULP2dp(t = xcospi_u05(d), frx);
if (u2 != 0 && ((fabs(d) <= rangemax2 && u2 > 0.506) || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C cospi_u05 arg=%.20g ulp=%.20g\n", d, u2);
fflush(stdout); ecnt++;
}
}
sc = xsincos(d);
sc2 = xsincos_u1(d);
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_sin(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xsin(d), frx);
if (u0 != 0 && (u0 > 3.5 || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C sin arg=%.20g ulp=%.20g\n", d, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
double u1 = countULPdp(t = sc.x, frx);
if (u1 != 0 && (u1 > 3.5 || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C sincos sin arg=%.20g ulp=%.20g\n", d, u1);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
double u2 = countULPdp(t = xsin_u1(d), frx);
if (u2 != 0 && (u2 > 1 || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C sin_u1 arg=%.20g ulp=%.20g\n", d, u2);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
double u3 = countULPdp(t = sc2.x, frx);
if (u3 != 0 && (u3 > 1 || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C sincos_u1 sin arg=%.20g ulp=%.20g\n", d, u3);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_cos(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xcos(d), frx);
if (u0 != 0 && (u0 > 3.5 || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C cos arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
double u1 = countULPdp(t = sc.y, frx);
if (u1 != 0 && (u1 > 3.5 || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C sincos cos arg=%.20g ulp=%.20g\n", d, u1);
fflush(stdout); ecnt++;
}
double u2 = countULPdp(t = xcos_u1(d), frx);
if (u2 != 0 && (u2 > 1 || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C cos_u1 arg=%.20g ulp=%.20g\n", d, u2);
fflush(stdout); ecnt++;
}
double u3 = countULPdp(t = sc2.y, frx);
if (u3 != 0 && (u3 > 1 || fabs(t) > 1 || !isnumber(t))) {
printf("Pure C sincos_u1 cos arg=%.20g ulp=%.20g\n", d, u3);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_tan(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xtan(d), frx);
if (u0 != 0 && (u0 > 3.5 || isnan(t))) {
printf("Pure C tan arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
double u1 = countULPdp(t = xtan_u1(d), frx);
if (u1 != 0 && (u1 > 1 || isnan(t))) {
printf("Pure C tan_u1 arg=%.20g ulp=%.20g\n", d, u1);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, fabs(d), GMP_RNDN);
mpfr_log(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xlog(fabs(d)), frx);
if (u0 > 3.5) {
printf("Pure C log arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
double u1 = countULPdp(t = xlog_u1(fabs(d)), frx);
if (u1 > 1) {
printf("Pure C log_u1 arg=%.20g ulp=%.20g\n", d, u1);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, fabs(d), GMP_RNDN);
mpfr_log10(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xlog10(fabs(d)), frx);
if (u0 > 1) {
printf("Pure C log10 arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, fabs(d), GMP_RNDN);
mpfr_log2(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xlog2(fabs(d)), frx);
if (u0 > 1) {
printf("Pure C log2 arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
double u1 = countULPdp(t = xlog2_u35(fabs(d)), frx);
if (u1 > 3.5) {
printf("Pure C log2_u35 arg=%.20g ulp=%.20g\n", d, u1);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_log1p(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xlog1p(d), frx);
if ((-1 <= d && d <= 1e+307 && u0 > 1) ||
(d < -1 && !isnan(t)) ||
(d > 1e+307 && !(u0 <= 1 || isinf(t)))) {
printf("Pure C log1p arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_exp(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xexp(d), frx);
if (u0 > 1) {
printf("Pure C exp arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_exp2(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xexp2(d), frx);
if (u0 > 1) {
printf("Pure C exp2 arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
double u1 = countULPdp(t = xexp2_u35(d), frx);
if (u1 > 3.5) {
printf("Pure C exp2_u35 arg=%.20g ulp=%.20g\n", d, u1);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_exp10(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xexp10(d), frx);
if (u0 > 1.09) {
printf("Pure C exp10 arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
double u1 = countULPdp(t = xexp10_u35(d), frx);
if (u1 > 3.5) {
printf("Pure C exp10_u35 arg=%.20g ulp=%.20g\n", d, u1);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_expm1(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xexpm1(d), frx);
if (u0 > 1) {
printf("Pure C expm1 arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_set_d(fry, d2, GMP_RNDN);
mpfr_pow(frx, fry, frx, GMP_RNDN);
double u0 = countULPdp(t = xpow(d2, d), frx);
if (u0 > 1) {
printf("Pure C pow arg=%.20g, %.20g ulp=%.20g\n", d2, d, u0);
printf("correct = %g, test = %g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_cbrt(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xcbrt(d), frx);
if (u0 > 3.5) {
printf("Pure C cbrt arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
double u1 = countULPdp(t = xcbrt_u1(d), frx);
if (u1 > 1) {
printf("Pure C cbrt_u1 arg=%.20g ulp=%.20g\n", d, u1);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, zo, GMP_RNDN);
mpfr_asin(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xasin(zo), frx);
if (u0 > 3.5) {
printf("Pure C asin arg=%.20g ulp=%.20g\n", zo, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
double u1 = countULPdp(t = xasin_u1(zo), frx);
if (u1 > 1) {
printf("Pure C asin_u1 arg=%.20g ulp=%.20g\n", zo, u1);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, zo, GMP_RNDN);
mpfr_acos(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xacos(zo), frx);
if (u0 > 3.5) {
printf("Pure C acos arg=%.20g ulp=%.20g\n", zo, u0);
fflush(stdout); ecnt++;
}
double u1 = countULPdp(t = xacos_u1(zo), frx);
if (u1 > 1) {
printf("Pure C acos_u1 arg=%.20g ulp=%.20g\n", zo, u1);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_atan(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xatan(d), frx);
if (u0 > 3.5) {
printf("Pure C atan arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
double u1 = countULPdp(t = xatan_u1(d), frx);
if (u1 > 1) {
printf("Pure C atan_u1 arg=%.20g ulp=%.20g\n", d, u1);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_set_d(fry, d2, GMP_RNDN);
mpfr_atan2(frx, fry, frx, GMP_RNDN);
double u0 = countULPdp(t = xatan2(d2, d), frx);
if (u0 > 3.5) {
printf("Pure C atan2 arg=%.20g, %.20g ulp=%.20g\n", d2, d, u0);
fflush(stdout); ecnt++;
}
double u1 = countULP2dp(t = xatan2_u1(d2, d), frx);
if (u1 > 1) {
printf("Pure C atan2_u1 arg=%.20g, %.20g ulp=%.20g\n", d2, d, u1);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_sinh(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xsinh(d), frx);
if ((fabs(d) <= 709 && u0 > 1) ||
(d > 709 && !(u0 <= 1 || (isinf(t) && t > 0))) ||
(d < -709 && !(u0 <= 1 || (isinf(t) && t < 0)))) {
printf("Pure C sinh arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_cosh(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xcosh(d), frx);
if ((fabs(d) <= 709 && u0 > 1) || !(u0 <= 1 || (isinf(t) && t > 0))) {
printf("Pure C cosh arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_tanh(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xtanh(d), frx);
if (u0 > 1) {
printf("Pure C tanh arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_sinh(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xsinh_u35(d), frx);
if ((fabs(d) <= 709 && u0 > 3.5) ||
(d > 709 && !(u0 <= 3.5 || (isinf(t) && t > 0))) ||
(d < -709 && !(u0 <= 3.5 || (isinf(t) && t < 0)))) {
printf("Pure C sinh_u35 arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_cosh(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xcosh_u35(d), frx);
if ((fabs(d) <= 709 && u0 > 3.5) || !(u0 <= 3.5 || (isinf(t) && t > 0))) {
printf("Pure C cosh_u35 arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_tanh(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xtanh_u35(d), frx);
if (u0 > 3.5) {
printf("Pure C tanh_u35 arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_asinh(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xasinh(d), frx);
if ((fabs(d) < sqrt(DBL_MAX) && u0 > 1) ||
(d >= sqrt(DBL_MAX) && !(u0 <= 1 || (isinf(t) && t > 0))) ||
(d <= -sqrt(DBL_MAX) && !(u0 <= 1 || (isinf(t) && t < 0)))) {
printf("Pure C asinh arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_acosh(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xacosh(d), frx);
if ((fabs(d) < sqrt(DBL_MAX) && u0 > 1) ||
(d >= sqrt(DBL_MAX) && !(u0 <= 1 || (isinf(t) && t > 0))) ||
(d <= -sqrt(DBL_MAX) && !isnan(t))) {
printf("Pure C acosh arg=%.20g ulp=%.20g\n", d, u0);
printf("%.20g\n", t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_atanh(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xatanh(d), frx);
if (u0 > 1) {
printf("Pure C atanh arg=%.20g ulp=%.20g\n", d, u0);
fflush(stdout); ecnt++;
}
}
//
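// Simple and correctly rounded functions follow: most must match the MPFR
// reference exactly or to within 0.5 ULP.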
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_abs(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xfabs(d), frx);
if (u0 != 0) {
printf("Pure C fabs arg=%.20g ulp=%.20g\n", d, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_set_d(fry, d2, GMP_RNDN);
mpfr_copysign(frx, frx, fry, GMP_RNDN);
double u0 = countULPdp(t = xcopysign(d, d2), frx);
if (u0 != 0 && !isnan(d2)) {
printf("Pure C copysign arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
printf("correct = %g, test = %g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_set_d(fry, d2, GMP_RNDN);
mpfr_max(frx, frx, fry, GMP_RNDN);
double u0 = countULPdp(t = xfmax(d, d2), frx);
if (u0 != 0) {
printf("Pure C fmax arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_set_d(fry, d2, GMP_RNDN);
mpfr_min(frx, frx, fry, GMP_RNDN);
double u0 = countULPdp(t = xfmin(d, d2), frx);
if (u0 != 0) {
printf("Pure C fmin arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_set_d(fry, d2, GMP_RNDN);
mpfr_dim(frx, frx, fry, GMP_RNDN);
double u0 = countULPdp(t = xfdim(d, d2), frx);
if (u0 > 0.5) {
printf("Pure C fdim arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_trunc(frx, frx);
double u0 = countULPdp(t = xtrunc(d), frx);
if (u0 != 0) {
printf("Pure C trunc arg=%.20g ulp=%.20g\n", d, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_floor(frx, frx);
double u0 = countULPdp(t = xfloor(d), frx);
if (u0 != 0) {
printf("Pure C floor arg=%.20g ulp=%.20g\n", d, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_ceil(frx, frx);
double u0 = countULPdp(t = xceil(d), frx);
if (u0 != 0) {
printf("Pure C ceil arg=%.20g ulp=%.20g\n", d, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_round(frx, frx);
double u0 = countULPdp(t = xround(d), frx);
if (u0 != 0) {
printf("Pure C round arg=%.24g ulp=%.20g\n", d, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_rint(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xrint(d), frx);
if (u0 != 0) {
printf("Pure C rint arg=%.24g ulp=%.20g\n", d, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_set_d(fry, d2, GMP_RNDN);
mpfr_set_d(frz, d3, GMP_RNDN);
mpfr_fma(frx, frx, fry, frz, GMP_RNDN);
double u0 = countULP2dp(t = xfma(d, d2, d3), frx);
double c = mpfr_get_d(frx, GMP_RNDN);
if ((-1e+303 < c && c < 1e+303 && u0 > 0.5) ||
!(u0 <= 0.5 || isinf(t))) {
printf("Pure C fma arg=%.20g, %.20g, %.20g ulp=%.20g\n", d, d2, d3, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_sqrt(frx, frx, GMP_RNDN);
double u0 = countULPdp(t = xsqrt_u05(d), frx);
if (u0 > 0.50001) {
printf("Pure C sqrt_u05 arg=%.20g ulp=%.20g\n", d, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_set_d(fry, d2, GMP_RNDN);
mpfr_hypot(frx, frx, fry, GMP_RNDN);
double u0 = countULP2dp(t = xhypot_u05(d, d2), frx);
if (u0 > 0.5) {
printf("Pure C hypot arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_set_d(fry, d2, GMP_RNDN);
mpfr_hypot(frx, frx, fry, GMP_RNDN);
double u0 = countULP2dp(t = xhypot_u35(d, d2), frx);
double c = mpfr_get_d(frx, GMP_RNDN);
if ((-1e+308 < c && c < 1e+308 && u0 > 3.5) ||
!(u0 <= 3.5 || isinf(t))) {
printf("Pure C hypot arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
t = xnextafter(d, d2);
double c = nextafter(d, d2);
if (!(isnan(t) && isnan(c)) && t != c) {
printf("Pure C nextafter arg=%.20g, %.20g\n", d, d2);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_set_exp(frx, 0);
double u0 = countULPdp(t = xfrfrexp(d), frx);
if (d != 0 && isnumber(d) && u0 != 0) {
printf("Pure C frfrexp arg=%.20g ulp=%.20g\n", d, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
int cexp = mpfr_get_exp(frx);
int texp = xexpfrexp(d);
if (d != 0 && isnumber(d) && cexp != texp) {
printf("Pure C expfrexp arg=%.20g\n", d);
printf("correct = %d, test = %d\n", cexp, texp);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_set_d(fry, d2, GMP_RNDN);
mpfr_fmod(frx, frx, fry, GMP_RNDN);
double u0 = countULPdp(t = xfmod(d, d2), frx);
if (fabsl((long double)d / d2) < 1e+300 && u0 > 0.5) {
printf("Pure C fmod arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_set_d(fry, d2, GMP_RNDN);
mpfr_remainder(frx, frx, fry, GMP_RNDN);
double u0 = countULPdp(t = xremainder(d, d2), frx);
if (fabsl((long double)d / d2) < 1e+300 && u0 > 0.5) {
printf("Pure C remainder arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
int exp = (random() & 8191) - 4096;
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_set_exp(frx, mpfr_get_exp(frx) + exp);
double u0 = countULPdp(t = xldexp(d, exp), frx);
if (u0 > 0.5) {
printf("Pure C ldexp arg=%.20g %d ulp=%.20g\n", d, exp, u0);
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_modf(fry, frz, frx, GMP_RNDN);
Sleef_double2 t2 = xmodf(d);
double u0 = countULPdp(t2.x, frz);
double u1 = countULPdp(t2.y, fry);
if (u0 != 0 || u1 != 0) {
printf("Pure C modf arg=%.20g ulp=%.20g %.20g\n", d, u0, u1);
printf("correct = %.20g, %.20g\n", mpfr_get_d(frz, GMP_RNDN), mpfr_get_d(fry, GMP_RNDN));
printf("test = %.20g, %.20g\n", t2.x, t2.y);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
int s;
mpfr_lgamma(frx, &s, frx, GMP_RNDN);
double u0 = countULPdp(t = xlgamma_u1(d), frx);
if (((d < 0 && fabsl(t - mpfr_get_ld(frx, GMP_RNDN)) > 1e-15 && u0 > 1) || (0 <= d && d < 2e+305 && u0 > 1) || (2e+305 <= d && !(u0 <= 1 || isinf(t))))) {
printf("Pure C xlgamma_u1 arg=%.20g ulp=%.20g\n", d, u0);
printf("Correct = %.20Lg, test = %.20g\n", mpfr_get_ld(frx, GMP_RNDN), t);
printf("Diff = %.20Lg\n", fabsl(t - mpfr_get_ld(frx, GMP_RNDN)));
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_gamma(frx, frx, GMP_RNDN);
double u0 = countULP2dp(t = xtgamma_u1(d), frx);
if (u0 > 1.0) {
printf("Pure C xtgamma_u1 arg=%.20g ulp=%.20g\n", d, u0);
printf("Correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
printf("Diff = %.20Lg\n", fabsl(t - mpfr_get_ld(frx, GMP_RNDN)));
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_erfc(frx, frx, GMP_RNDN);
static double ebz = 9.8813129168249308835e-324; // nextafter(nextafter(0, 1), 1);
double u0 = countULP2dp(t = xerfc_u15(d), frx);
if ((d > 26.2 && u0 > 2.5 && !(mpfr_get_d(frx, GMP_RNDN) == 0 && t <= ebz)) || (d <= 26.2 && u0 > 1.5)) {
printf("Pure C xerfc_u15 arg=%.20g ulp=%.20g\n", d, u0);
printf("Correct = %.20Lg, test = %.20g\n", mpfr_get_ld(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
{
mpfr_set_d(frx, d, GMP_RNDN);
mpfr_erf(frx, frx, GMP_RNDN);
double u0 = countULP2dp(t = xerf_u1(d), frx);
if (u0 > 0.75) {
printf("Pure C xerf_u1 arg=%.20g ulp=%.20g\n", d, u0);
printf("Correct = %.20Lg, test = %.20g\n", mpfr_get_ld(frx, GMP_RNDN), t);
fflush(stdout); ecnt++;
}
}
}
mpfr_clears(frw, frx, fry, frz, NULL);
exit(0);
}


@@ -0,0 +1,241 @@
// Copyright Naoki Shibata and contributors 2010 - 2024.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <mpfr.h>
#include <time.h>
#include <float.h>
#include <limits.h>
#include <math.h>
#include "misc.h"
#ifdef ENABLE_SYS_getrandom
#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/random.h>
#endif
#include "sleef.h"
#include "testerutil.h"
#define DORENAME
#include "rename.h"
#define DENORMAL_LDBL_MIN (3.6451995318824746025284059336194e-4951L)
#define XLDBL_MIN (3.3621031431120935062626778173218e-4932L)
#ifndef M_PIl
#define M_PIl 3.141592653589793238462643383279502884L
#endif
#ifndef M_PI_4l
#define M_PI_4l .785398163397448309615660845819875721049292L
#endif
#define POSITIVE_INFINITY INFINITY
#define NEGATIVE_INFINITY (-INFINITY)
int isnumberl(long double x) { return x != SLEEF_INFINITYl && x != -SLEEF_INFINITYl && x == x; }
int isPlusZerol(long double x) { return x == 0 && copysignl(1, x) == 1; }
int isMinusZerol(long double x) { return x == 0 && copysignl(1, x) == -1; }
mpfr_t fra, frb, frd;
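
// Measure the error of test value d against the 256-bit MPFR reference c,
// in units in the last place (ulp) of the 64-bit extended significand.
// Sentinel values (10000, 10001) flag a wrong zero and a NaN mismatch, so
// such failures always exceed any tolerance used by the checks below.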
double countULP(long double d, mpfr_t c) {
  long double c2 = mpfr_get_ld(c, GMP_RNDN);
  if (c2 == 0 && d != 0) return 10000;
  //if (isPlusZerol(c2) && !isPlusZerol(d)) return 10003;
  //if (isMinusZerol(c2) && !isMinusZerol(d)) return 10004;
  if (isnanl(c2) && isnanl(d)) return 0;
  if (isnanl(c2) || isnanl(d)) return 10001;
  if (c2 == POSITIVE_INFINITY && d == POSITIVE_INFINITY) return 0;
  if (c2 == NEGATIVE_INFINITY && d == NEGATIVE_INFINITY) return 0;
  if (!isnumberl(c2) && !isnumberl(d)) return 0;

  int e;
  frexpl(mpfr_get_ld(c, GMP_RNDN), &e);
  // One ulp of the reference value is 2^(e-64) for a 64-bit significand,
  // clamped below at the smallest subnormal.
  mpfr_set_ld(frb, fmaxl(ldexpl(1.0, e-64), DENORMAL_LDBL_MIN), GMP_RNDN);

  // |d - c| / ulp, evaluated in 256-bit MPFR arithmetic.
  mpfr_set_ld(frd, d, GMP_RNDN);
  mpfr_sub(fra, frd, c, GMP_RNDN);
  mpfr_div(fra, fra, frb, GMP_RNDN);
  double u = fabs(mpfr_get_d(fra, GMP_RNDN));

  return u;
}
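
// Same as countULP, but the ulp unit is clamped at LDBL_MIN instead of the
// smallest subnormal, which loosens the tolerance for subnormal references.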
double countULP2(long double d, mpfr_t c) {
  long double c2 = mpfr_get_ld(c, GMP_RNDN);
  if (c2 == 0 && d != 0) return 10000;
  //if (isPlusZerol(c2) && !isPlusZerol(d)) return 10003;
  //if (isMinusZerol(c2) && !isMinusZerol(d)) return 10004;
  if (isnanl(c2) && isnanl(d)) return 0;
  if (isnanl(c2) || isnanl(d)) return 10001;
  if (c2 == POSITIVE_INFINITY && d == POSITIVE_INFINITY) return 0;
  if (c2 == NEGATIVE_INFINITY && d == NEGATIVE_INFINITY) return 0;
  if (!isnumberl(c2) && !isnumberl(d)) return 0;

  int e;
  frexpl(mpfr_get_ld(c, GMP_RNDN), &e);
  mpfr_set_ld(frb, fmaxl(ldexpl(1.0, e-64), LDBL_MIN), GMP_RNDN);

  mpfr_set_ld(frd, d, GMP_RNDN);
  mpfr_sub(fra, frd, c, GMP_RNDN);
  mpfr_div(fra, fra, frb, GMP_RNDN);
  double u = fabs(mpfr_get_d(fra, GMP_RNDN));

  return u;
}
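
// Random test inputs are drawn as raw 128-bit patterns reinterpreted as
// long double, so all exponent ranges, subnormals and special values are hit.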
typedef union {
  long double d;
  __int128 u128;
} conv_t;

long double rnd() {
  conv_t c;
  switch(random() & 15) {
  case 0: return INFINITY;
  case 1: return -INFINITY;
  }
#ifdef ENABLE_SYS_getrandom
  syscall(SYS_getrandom, &c.u128, sizeof(c.u128), 0);
#else
  c.u128 = random() | ((__int128)random() << 31) | ((__int128)random() << (31*2)) | ((__int128)random() << (31*3)) | ((__int128)random() << (31*4));
#endif
  return c.d;
}
long double rnd_fr() {
  conv_t c;
  do {
#ifdef ENABLE_SYS_getrandom
    syscall(SYS_getrandom, &c.u128, sizeof(c.u128), 0);
#else
    c.u128 = random() | ((__int128)random() << 31) | ((__int128)random() << (31*2)) | ((__int128)random() << (31*3)) | ((__int128)random() << (31*4));
#endif
  } while(!isnumberl(c.d));
  return c.d;
}

long double rnd_zo() {
  conv_t c;
  do {
#ifdef ENABLE_SYS_getrandom
    syscall(SYS_getrandom, &c.u128, sizeof(c.u128), 0);
#else
    c.u128 = random() | ((__int128)random() << 31) | ((__int128)random() << (31*2)) | ((__int128)random() << (31*3)) | ((__int128)random() << (31*4));
#endif
  } while(!isnumberl(c.d) || c.d < -1 || 1 < c.d);
  return c.d;
}
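
// Reference sin(pi * d) and cos(pi * d), computed in MPFR's working
// precision so the pi multiplication itself introduces no rounding error.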
void sinpifr(mpfr_t ret, long double d) {
  mpfr_t frpi, frd;
  mpfr_inits(frpi, frd, NULL);

  mpfr_const_pi(frpi, GMP_RNDN);
  mpfr_set_d(frd, 1.0, GMP_RNDN);
  mpfr_mul(frpi, frpi, frd, GMP_RNDN);  // no-op multiplication by 1.0, retained from the original code
  mpfr_set_ld(frd, d, GMP_RNDN);
  mpfr_mul(frd, frpi, frd, GMP_RNDN);
  mpfr_sin(ret, frd, GMP_RNDN);

  mpfr_clears(frpi, frd, NULL);
}

void cospifr(mpfr_t ret, long double d) {
  mpfr_t frpi, frd;
  mpfr_inits(frpi, frd, NULL);

  mpfr_const_pi(frpi, GMP_RNDN);
  mpfr_set_d(frd, 1.0, GMP_RNDN);
  mpfr_mul(frpi, frpi, frd, GMP_RNDN);
  mpfr_set_ld(frd, d, GMP_RNDN);
  mpfr_mul(frd, frpi, frd, GMP_RNDN);
  mpfr_cos(ret, frd, GMP_RNDN);

  mpfr_clears(frpi, frd, NULL);
}
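
// Fuzz driver: runs until 1000 discrepancies have been logged. Inputs mix
// fully random bit patterns with near-multiples of pi/4 whose bit patterns
// are perturbed by up to ~128 ulps, to stress the argument-reduction paths.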
int main(int argc, char **argv)
{
  mpfr_t frx;
  mpfr_set_default_prec(256);
  mpfr_inits(fra, frb, frd, frx, NULL);

  conv_t cd;
  long double d, t;
  int cnt, ecnt = 0;

  srandom(time(NULL));

  for(cnt = 0;ecnt < 1000;cnt++) {
    switch(cnt & 7) {
    case 0:
      d = rnd();
      break;
    case 1:
      cd.d = rint((2 * (double)random() / RAND_MAX - 1) * 1e+10) * M_PI_4;
      cd.u128 += (random() & 0xff) - 0x7f;
      d = cd.d;
      break;
    default:
      d = rnd_fr();
      break;
    }
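
    // Evaluate both accuracy variants once per input: in SLEEF's naming,
    // _u05 targets 0.5-ulp and _u35 targets 3.5-ulp maximum error; the
    // checks below bound them at 0.505 and 1.5 ulp for |d| <= 1e+9.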
    Sleef_longdouble2 sc = xsincospil_u05(d);
    Sleef_longdouble2 sc2 = xsincospil_u35(d);

    {
      const double rangemax2 = 1e+9;

      sinpifr(frx, d);

      double u0 = countULP2(t = sc.x, frx);

      if (u0 != 0 && ((fabsl(d) <= rangemax2 && u0 > 0.505) || fabsl(t) > 1 || !isnumberl(t))) {
        printf("Pure C sincospil_u05 sin arg=%.30Lg ulp=%.20g\n", d, u0);
        fflush(stdout); ecnt++;
      }

      double u1 = countULP2(t = sc2.x, frx);

      if (u1 != 0 && ((fabsl(d) <= rangemax2 && u1 > 1.5) || fabsl(t) > 1 || !isnumberl(t))) {
        printf("Pure C sincospil_u35 sin arg=%.30Lg ulp=%.20g\n", d, u1);
        fflush(stdout); ecnt++;
      }
    }
    {
      const double rangemax2 = 1e+9;

      cospifr(frx, d);

      double u0 = countULP2(t = sc.y, frx);

      if (u0 != 0 && ((fabsl(d) <= rangemax2 && u0 > 0.505) || fabsl(t) > 1 || !isnumberl(t))) {
        printf("Pure C sincospil_u05 cos arg=%.30Lg ulp=%.20g\n", d, u0);
        fflush(stdout); ecnt++;
      }

      double u1 = countULP2(t = sc2.y, frx);

      if (u1 != 0 && ((fabsl(d) <= rangemax2 && u1 > 1.5) || fabsl(t) > 1 || !isnumberl(t))) {
        printf("Pure C sincospil_u35 cos arg=%.30Lg ulp=%.20g\n", d, u1);
        fflush(stdout); ecnt++;
      }
    }
  }
}

Some files were not shown because too many files have changed in this diff.